File size: 7,684 Bytes
b767f00
 
 
 
 
4cdda95
 
ee9b192
b767f00
 
4cdda95
e186fb5
 
5a7f50a
b767f00
e186fb5
 
 
 
 
 
 
be8111b
24d7bd7
 
 
 
 
 
 
 
 
 
e186fb5
 
 
 
 
4cdda95
e186fb5
4cdda95
 
 
 
 
 
 
 
e186fb5
 
 
 
 
 
 
 
 
 
5a7f50a
 
e186fb5
4cdda95
e186fb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4cdda95
e186fb5
 
 
 
4cdda95
 
 
e186fb5
4cdda95
4e2ac67
 
 
 
b767f00
 
 
 
 
 
 
 
4cdda95
e186fb5
b767f00
ee9b192
b767f00
 
 
4cdda95
 
570c67b
b767f00
4cdda95
b767f00
 
 
 
 
 
 
b2d3878
 
b767f00
 
 
39b4480
9c9b15b
b767f00
 
 
 
 
 
 
4cdda95
b767f00
 
5a7f50a
b767f00
 
 
 
 
5a7f50a
b767f00
 
 
4cdda95
 
b767f00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4cdda95
 
 
b767f00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3543580
b2d3878
 
b767f00
 
 
 
 
 
4cdda95
 
 
 
 
b767f00
b2d3878
b767f00
4cdda95
b767f00
 
 
b2d3878
4cdda95
b767f00
 
 
 
 
 
 
4cdda95
 
b767f00
4cdda95
 
 
 
 
 
 
 
 
 
 
 
 
b767f00
 
 
4cdda95
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
try:
    import torch
    import pandas as pd
    import streamlit as st
    import re
    from transformers import BertTokenizer
    from model import IndoBERTBiLSTM
    from stqdm import stqdm
except Exception as e:
    print(e)
    
# Inference configuration.
# MAX_SEQ_LEN: default ``max_seq`` used by ``preprocess`` as the tokenizer's
#   ``max_length``.
# MODELS_PATH: identifier handed to ``from_pretrained`` for both the tokenizer
#   and the model.
# LABELS: label name -> class index; ``get_key`` is used to map a predicted
#   index back to its name.
MAX_SEQ_LEN = 128
MODELS_PATH = "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review"
LABELS = {'Not Useful': 0, 'Useful': 1}

def get_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

# Resolve the compute device once at module load and record whether it is CUDA.
device = get_device()
USE_CUDA = device.type == 'cuda'

# Get the Keys
def get_key(val, my_dict):
    """Return the first key of ``my_dict`` whose value equals ``val``.

    Entries are examined in the dictionary's iteration order. ``None`` is
    returned when no value matches.
    """
    matching_keys = (key for key, value in my_dict.items() if val == value)
    return next(matching_keys, None)

def load_tokenizer(model_path):
    """Load and return the ``BertTokenizer`` for ``model_path``."""
    return BertTokenizer.from_pretrained(model_path)

def remove_special_characters(text):
    """Lower-case ``text`` and reduce it to ASCII letters separated by spaces.

    Every character that is not an ASCII letter, digit or whitespace is
    replaced by a space, digits are then replaced by spaces as well, and
    finally each run of whitespace is collapsed into one space. The result is
    not stripped, so a single leading or trailing space may remain.
    """
    substitutions = (
        (r'[^a-zA-Z0-9\s]', ' '),  # remove special characters
        (r'[0-9]', ' '),           # remove digits
        (r"\s+", " "),             # squeeze whitespace runs to one space
    )

    cleaned = text.lower()  # case folding
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def preprocess(text, tokenizer, max_seq=MAX_SEQ_LEN):
    """Tokenize ``text`` into fixed-length PyTorch tensors.

    Args:
        text: The (already cleaned) text to encode.
        tokenizer: Tokenizer object exposing ``encode_plus``.
        max_seq: Target sequence length; shorter inputs are padded up to it
            and longer inputs are truncated down to it.

    Returns:
        The value returned by ``tokenizer.encode_plus``; callers read its
        ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    # ``pad_to_max_length=True`` is deprecated in Hugging Face tokenizers;
    # ``padding='max_length'`` is the documented replacement. Truncation is
    # requested explicitly instead of relying on the implicit fallback that
    # is triggered (with a warning) when only ``max_length`` is given.
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_seq,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    
def load_model():
    """Load and return the ``IndoBERTBiLSTM`` model identified by ``MODELS_PATH``."""
    return IndoBERTBiLSTM.from_pretrained(MODELS_PATH)

def classify_single(text, model, tokenizer, device):
    """Classify one text and return the predicted class index.

    Args:
        text: Raw input text. It is cleaned with ``remove_special_characters``
            and encoded with ``preprocess`` before inference.
        model: Torch module called as ``model(input_ids, attention_mask)``.
            Its output is presumably a tensor of class scores with the
            classes on the last axis (``argmax`` is taken over ``dim=-1``) -
            confirm against the model definition.
        tokenizer: Tokenizer forwarded to ``preprocess``.
        device: ``torch.device`` on which inference runs.

    Returns:
        The index of the highest-scoring class as a Python number
        (``Tensor.item()`` of the argmax).
    """
    # Put the model on exactly the device the inputs are sent to.
    # ``model.cuda()`` targets the *current* CUDA device, which is not
    # necessarily ``device``.
    model.to(device)
    # Inference only: make sure layers such as dropout are in eval mode.
    model.eval()

    # Token IDs and attention mask for the cleaned sentence.
    encoding = preprocess(remove_special_characters(text), tokenizer)
    test_ids = encoding['input_ids']
    test_attention_mask = encoding['attention_mask']

    # Forward pass without gradient tracking.
    with torch.no_grad():
        outputs = model(test_ids.to(device),
                        test_attention_mask.to(device))
    print("output ", outputs)
    result = torch.argmax(outputs, dim=-1)
    print("output ", result)
    return result.item()

def classify_multiple(data, model, tokenizer, device):
    """Classify every entry of ``data`` and return the predicted label names.

    Args:
        data: Object exposing ``tolist()`` that yields the texts to classify
            (the app passes a pandas Series converted to ``str``).
        model: Torch module called as ``model(input_ids, attention_mask)``.
        tokenizer: Tokenizer forwarded to ``preprocess``.
        device: ``torch.device`` on which inference runs.

    Returns:
        A list with one entry per input row, in input order: the ``LABELS``
        key whose value equals the predicted class index (``None`` if the
        index is not present in ``LABELS``).
    """
    # Put the model on exactly the device the inputs are sent to.
    # ``model.cuda()`` targets the *current* CUDA device, which is not
    # necessarily ``device``.
    model.to(device)
    # Inference only: make sure layers such as dropout are in eval mode.
    model.eval()

    # Clean and tokenize every row up front so the progress bar below tracks
    # the forward passes only.
    encodings = [
        preprocess(remove_special_characters(row), tokenizer)
        for row in data.tolist()
    ]

    result_list = []
    with torch.no_grad():
        # Rows are run through the model one at a time.
        for encoding in stqdm(encodings):
            outputs = model(encoding['input_ids'].to(device),
                            encoding['attention_mask'].to(device))
            result = torch.argmax(outputs, dim=-1)
            result_list.append(get_key(result.item(), LABELS))

    return result_list

# Choices offered by the sidebar input-type radio. Index 0 selects the
# single-text view, index 1 the CSV upload view.
tab_labels = ["Single Input", "Multiple Input"]
class App:
    """Streamlit page for classifying app reviews as useful or not useful.

    The page offers two input modes chosen from the sidebar: a text area for
    a single review, or a CSV upload from which one column is classified row
    by row. Results are produced when the "Process" button is pressed.
    """

    def __init__(self):
        # File extensions accepted by the uploader.
        self.fileTypes = ["csv"]
        self.default_tab_selected = tab_labels[0]
        # Text typed into the single-input text area.
        self.input_text = None
        # Uploaded CSV as a DataFrame, and the selected column (as strings)
        # that will be classified.
        self.csv_input = None
        self.csv_process = None

    def run(self):
        """App Review Classifier: render the page and handle the Process button."""
        self.init_session_state()  # Initialize session state
        # Streamlit re-executes the script on every interaction. Wrapping the
        # loaders in ``st.cache_resource`` keeps one tokenizer/model instance
        # alive across reruns instead of reloading them each time.
        tokenizer = st.cache_resource(load_tokenizer)(MODELS_PATH)
        model = st.cache_resource(load_model)()
        html_temp = """
        <div style="background-color:blue;padding:10px">
        <h1 style="color:white;text-align:center;">Klasifikasi Ulasan Aplikasi yang Berguna</h1>
        </div>
        """
        st.markdown(html_temp, unsafe_allow_html=True)
        st.markdown("")
        self.render_tabs()
        st.divider()
        self.render_process_button(model, tokenizer, device)

    def init_session_state(self):
        """Seed ``st.session_state.tab_selected`` with the first tab if unset."""
        if "tab_selected" not in st.session_state:
            st.session_state.tab_selected = tab_labels[0]

    def render_tabs(self):
        """Render the sidebar mode selector and the matching input widgets.

        The chosen mode is stored in ``st.session_state.tab_selected`` so
        ``render_process_button`` knows which input to classify.
        """
        tab_selected = st.sidebar.radio("Select Input Type", tab_labels)
        if USE_CUDA:
            st.sidebar.markdown(footer, unsafe_allow_html=True)

        if tab_selected == tab_labels[0]:
            self.render_single_input()
        elif tab_selected == tab_labels[1]:
            self.render_multiple_input()

        st.session_state.tab_selected = tab_selected

    def render_single_input(self):
        """Render the text area and keep its content in ``self.input_text``."""
        self.input_text = st.text_area("Enter Text Here", placeholder="Type Here")

    def render_multiple_input(self):
        """Render the CSV uploader, a preview and the column selector.

        ``self.csv_input`` and ``self.csv_process`` are only set once a file
        has been uploaded and a real column (not the placeholder entry) has
        been chosen; otherwise the method returns early and they stay ``None``.
        """
        st.markdown("Upload file")
        file = st.file_uploader("To ensure a smooth process, please use a maximum of 500 rows of data in the CSV file.", 
                                type=self.fileTypes)

        if not file:
            st.info("Please upload a file of type: " + ", ".join(self.fileTypes))
            return

        data = pd.read_csv(file)

        # Preview of the first rows of the uploaded file.
        placeholder = st.empty()
        placeholder.dataframe(data.head(10))

        # The first radio entry is a placeholder meaning "nothing selected".
        header_list = data.columns.tolist()
        header_list.insert(0, "---------- select column -------------")
        ques = st.radio("Select column to process", header_list, index=0)

        if header_list.index(ques) == 0:
            st.warning("Please select a column to process")
            return

        df_process = data[ques].astype(str)
        self.csv_input = data
        self.csv_process = df_process

    def render_process_button(self, model, tokenizer, device):
        """Render the "Process" button and classify the current input on click.

        Args:
            model: Classifier forwarded to ``classify_single`` /
                ``classify_multiple``.
            tokenizer: Tokenizer forwarded to the same functions.
            device: Device forwarded to the same functions.
        """
        if st.button("Process"):
            if st.session_state.tab_selected == tab_labels[0]:
                input_text = self.input_text
                if input_text:
                    classification = classify_single(input_text, model, tokenizer, device)
                    classification_label = get_key(classification, LABELS)
                    st.write("Classification result:", classification_label)
                else:
                    st.warning('Please enter text to process', icon="⚠️")
            elif st.session_state.tab_selected == tab_labels[1]:
                df_process = self.csv_process
                if df_process is not None:
                    classification = classify_multiple(df_process, model, tokenizer, device)

                    st.divider()
                    st.write("Classification Result")
                    # The predictions are added as a new column on the
                    # uploaded DataFrame, which is then previewed and offered
                    # for download as CSV.
                    input_file = self.csv_input
                    input_file["classification_result"] = classification
                    st.dataframe(input_file.head(10))
                    st.download_button(
                        label="Download Result",
                        data=input_file.to_csv().encode("utf-8"),
                        file_name="classification_result.csv",
                        mime="text/csv",
                    )
                else:
                    st.warning('Please upload a file to process', icon="⚠️")
    
# HTML/CSS snippet that ``App.render_tabs`` writes into the sidebar when
# ``USE_CUDA`` is true; it shows a fixed "CUDA enabled" note at the bottom.
# NOTE(review): ``left: 10;`` has no unit, which is not a valid CSS length, so
# browsers will likely ignore that declaration - confirm whether ``10px`` was
# intended.
footer="""<style>
.footer {
position: fixed;
left: 10;
bottom: 0;
width: 100%;
color: #ffa9365e;
}
</style>
<div class="footer">
<p>CUDA enabled</p>
</div>
"""

# Entry point: build the app and render the page when this file is executed
# as the main script.
if __name__ == "__main__":
    app = App()
    app.run()