kadabengaran committed
Commit 4cdda95 · 1 Parent(s): be8111b
Files changed (3)
  1. app/main.py +52 -55
  2. app/model.py +2 -3
  3. requirements.txt +1 -4
app/main.py CHANGED
@@ -5,33 +5,11 @@ try:
  import streamlit as st
  import re
  import streamlit as st
- from transformers import BertTokenizer, AutoConfig
- from model import IndoBERTBiLSTM, IndoBERTModel
  except Exception as e:
  print(e)
-
- STYLE = """
- <style>
- img {
- max-width: 100%;
- }
- </style>
- """
-
- footer="""<style>
- .footer {
- position: fixed;
- left: 0;
- bottom: 0;
- width: 100%;
- color: #e7e7e7;
- text-align: center;
- }
- </style>
- <div class="footer">
- <p>CUDA enabled</p>
- </div>
- """
  # Config
  MAX_SEQ_LEN = 128
  MODELS_PATH = "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review"
@@ -59,10 +37,16 @@ def load_tokenizer(model_path):
  return tokenizer

  def remove_special_characters(text):
- text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
- text = re.sub(r"\s+", " ", text) # replace multiple whitespace characters with a single space
- text = re.sub(r'[0-9]', ' ', text) #remove number
  text = text.lower()
  return text

  def preprocess(text, tokenizer, max_seq=MAX_SEQ_LEN):
@@ -76,7 +60,7 @@ def load_model():
  model = IndoBERTBiLSTM.from_pretrained(MODELS_PATH)
  return model

- def predict_single(text, model, tokenizer, device):

  if device.type == 'cuda':
  model.cuda()
@@ -95,16 +79,16 @@ def predict_single(text, model, tokenizer, device):
  test_ids = torch.cat(test_ids, dim=0)
  test_attention_mask = torch.cat(test_attention_mask, dim=0)

- # Forward pass, calculate logit predictions
  with torch.no_grad():
  outputs = model(test_ids.to(device),
  test_attention_mask.to(device))
  print("output ", outputs)
- predictions = torch.argmax(outputs, dim=-1)
- print("output ", predictions)
- return predictions.item()

- def predict_multiple(data, model, tokenizer, device):

  if device.type == 'cuda':
  model.cuda()
@@ -112,29 +96,26 @@ def predict_multiple(data, model, tokenizer, device):
  input_ids = []
  attention_masks = []
  for row in data.tolist():
- # Apply remove_special_characters function to title column
  text = remove_special_characters(row)
  text = preprocess(text, tokenizer)
  input_ids.append(text['input_ids'])
  attention_masks.append(text['attention_mask'])

- predictions = []

  with torch.no_grad():
  for i in range(len(input_ids)):
  test_ids = input_ids[i]
  test_attention_mask = attention_masks[i]
  outputs = model(test_ids.to(device), test_attention_mask.to(device))
- prediction = torch.argmax(outputs, dim= -1)
- prediction_label = get_key(prediction.item(), LABELS)
- predictions.append(prediction_label)

- return predictions

  tab_labels = ["Single Input", "Multiple Input"]
  class App:
-
- print("Loading All")
  def __init__(self):
  self.fileTypes = ["csv"]
  self.default_tab_selected = tab_labels[0]
@@ -153,6 +134,7 @@ class App:
  </div>
  """
  st.markdown(html_temp, unsafe_allow_html=True)
  self.render_tabs()
  st.divider()
  self.render_process_button(model, tokenizer, device)
@@ -165,7 +147,8 @@ class App:
  def render_tabs(self):
  tab_selected = st.session_state.get('tab_selected', self.default_tab_selected)
  tab_selected = st.sidebar.radio("Select Input Type", tab_labels)
- # tab1, tab2 = st.tabs(tab_labels)

  if tab_selected == tab_labels[0]:
  self.render_single_input()
@@ -181,9 +164,9 @@ class App:
  """
  Upload File
  """
- st.markdown(STYLE, unsafe_allow_html=True)
- file = st.file_uploader("Upload file", type=self.fileTypes)
- # add "for a smooth process, a maximum of 1000 rows of CSV data"

  if not file:
  st.info("Please upload a file of type: " + ", ".join(self.fileTypes))
@@ -194,7 +177,6 @@ class App:
  placeholder = st.empty()
  placeholder.dataframe(data.head(10))

-
  header_list = data.columns.tolist()
  header_list.insert(0, "---------- select column -------------")
  ques = st.radio("Select column to process", header_list, index=0)
@@ -212,18 +194,20 @@ class App:
  if st.session_state.tab_selected == tab_labels[0]:
  input_text = self.input_text
  if input_text:
- prediction = predict_single(input_text, model, tokenizer, device)
- prediction_label = get_key(prediction, LABELS)
- st.write("Prediction:", prediction_label)
  elif st.session_state.tab_selected == tab_labels[1]:
  df_process = self.csv_process
  if df_process is not None:
- prediction = predict_multiple(df_process, model, tokenizer, device)

  st.divider()
  st.write("Classification Result")
  input_file = self.csv_input
- input_file["classification_result"] = prediction
  st.dataframe(input_file.head(10))
  st.download_button(
  label="Download Result",
@@ -231,10 +215,23 @@ class App:
  file_name="classification_result.csv",
  mime="text/csv",
  )

  if __name__ == "__main__":
  app = App()
- app.run()
- if USE_CUDA:
- st.markdown(footer,unsafe_allow_html=True)
 
  import streamlit as st
  import re
  import streamlit as st
+ from transformers import BertTokenizer
+ from model import IndoBERTBiLSTM
  except Exception as e:
  print(e)
+
  # Config
  MAX_SEQ_LEN = 128
  MODELS_PATH = "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review"
 
  return tokenizer

  def remove_special_characters(text):
+ # case folding
  text = text.lower()
+
+ # remove special characters
+ text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
+ text = re.sub(r'[0-9]', ' ', text)
+
+ # replace multiple whitespace characters with a single space
+ text = re.sub(r"\s+", " ", text)
+
  return text

  def preprocess(text, tokenizer, max_seq=MAX_SEQ_LEN):
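A quick sanity check of the new cleaning order (case folding first, then stripping non-alphanumerics and digits, then collapsing whitespace). This mirrors the function as committed above; the sample review string is made up:

import re

def remove_special_characters(text):
    # same steps and order as the committed function
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    text = re.sub(r'[0-9]', ' ', text)
    text = re.sub(r"\s+", " ", text)
    return text

print(remove_special_characters("Aplikasi BAGUS!!! Versi 2.0 sering crash :("))
# -> "aplikasi bagus versi sering crash " (note that a trailing space can remain)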
 
  model = IndoBERTBiLSTM.from_pretrained(MODELS_PATH)
  return model

+ def classify_single(text, model, tokenizer, device):

  if device.type == 'cuda':
  model.cuda()
 
  test_ids = torch.cat(test_ids, dim=0)
  test_attention_mask = torch.cat(test_attention_mask, dim=0)

+ # Forward pass, calculate logit
  with torch.no_grad():
  outputs = model(test_ids.to(device),
  test_attention_mask.to(device))
  print("output ", outputs)
+ result = torch.argmax(outputs, dim=-1)
+ print("output ", result)
+ return result.item()

+ def classify_multiple(data, model, tokenizer, device):

  if device.type == 'cuda':
  model.cuda()
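classify_single returns the raw argmax index; the caller maps it back to a label via get_key(value, LABELS). Both get_key and LABELS are defined elsewhere in the app and are not part of this diff, so the snippet below is only a sketch of how that reverse lookup presumably works, with hypothetical label names and indices:

LABELS = {"Not Useful": 0, "Useful": 1}  # hypothetical mapping; the real one lives elsewhere in app/main.py

def get_key(value, labels):
    # reverse lookup: return the label whose class index equals `value`
    for key, idx in labels.items():
        if idx == value:
            return key

print(get_key(1, LABELS))  # -> "Useful"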
 
  input_ids = []
  attention_masks = []
  for row in data.tolist():
  text = remove_special_characters(row)
  text = preprocess(text, tokenizer)
  input_ids.append(text['input_ids'])
  attention_masks.append(text['attention_mask'])

+ result_list = []

  with torch.no_grad():
  for i in range(len(input_ids)):
  test_ids = input_ids[i]
  test_attention_mask = attention_masks[i]
  outputs = model(test_ids.to(device), test_attention_mask.to(device))
+ result = torch.argmax(outputs, dim=-1)
+ result_label = get_key(result.item(), LABELS)
+ result_list.append(result_label)

+ return result_list

  tab_labels = ["Single Input", "Multiple Input"]
  class App:

  def __init__(self):
  self.fileTypes = ["csv"]
  self.default_tab_selected = tab_labels[0]
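classify_multiple iterates over data.tolist(), so it expects a list-convertible column such as a pandas Series (the CSV column selected in the UI). A hedged usage sketch with made-up rows; the actual call is left commented out because it needs the loaded model, tokenizer, and device:

import pandas as pd

reviews = pd.Series([
    "Aplikasi sangat membantu untuk belajar",   # made-up sample rows
    "Sering error setelah update terakhir",
])
# labels = classify_multiple(reviews, model, tokenizer, device)
# expected result: one label string per row, e.g. ["Useful", "Not Useful"]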
 
  </div>
  """
  st.markdown(html_temp, unsafe_allow_html=True)
+ st.markdown("")
  self.render_tabs()
  st.divider()
  self.render_process_button(model, tokenizer, device)
 
  def render_tabs(self):
  tab_selected = st.session_state.get('tab_selected', self.default_tab_selected)
  tab_selected = st.sidebar.radio("Select Input Type", tab_labels)
+ if USE_CUDA:
+ st.sidebar.markdown(footer,unsafe_allow_html=True)

  if tab_selected == tab_labels[0]:
  self.render_single_input()
 
  """
  Upload File
  """
+ st.markdown("Upload file")
+ file = st.file_uploader("To ensure a smooth process, please use a maximum of 500 rows of data in the CSV file.",
+ type=self.fileTypes)

  if not file:
  st.info("Please upload a file of type: " + ", ".join(self.fileTypes))
 
  placeholder = st.empty()
  placeholder.dataframe(data.head(10))

  header_list = data.columns.tolist()
  header_list.insert(0, "---------- select column -------------")
  ques = st.radio("Select column to process", header_list, index=0)
 
  if st.session_state.tab_selected == tab_labels[0]:
  input_text = self.input_text
  if input_text:
+ classification = classify_single(input_text, model, tokenizer, device)
+ classification_label = get_key(classification, LABELS)
+ st.write("Classification result:", classification_label)
+ else:
+ st.warning('Please enter text to process', icon="⚠️")
  elif st.session_state.tab_selected == tab_labels[1]:
  df_process = self.csv_process
  if df_process is not None:
+ classification = classify_multiple(df_process, model, tokenizer, device)

  st.divider()
  st.write("Classification Result")
  input_file = self.csv_input
+ input_file["classification_result"] = classification
  st.dataframe(input_file.head(10))
  st.download_button(
  label="Download Result",
 
  file_name="classification_result.csv",
  mime="text/csv",
  )
+ else:
+ st.warning('Please upload a file to process', icon="⚠️")

+ footer="""<style>
+ .footer {
+ position: fixed;
+ left: 10;
+ bottom: 0;
+ width: 100%;
+ color: #ffa9365e;
+ }
+ </style>
+ <div class="footer">
+ <p>CUDA enabled</p>
+ </div>
+ """

  if __name__ == "__main__":
  app = App()
+ app.run()
 
 
app/model.py CHANGED
@@ -13,10 +13,9 @@ device = get_device()
  if device.type == 'cuda':
  USE_CUDA = True

- bert_path = 'indobenchmark/indobert-base-p2'
  HIDDEN_DIM = 768
  OUTPUT_DIM = 2 # 2 if Binary Classification
- N_LAYERS = 1 # 2
  BIDIRECTIONAL = True
  DROPOUT = 0.2 # 0.2
@@ -29,7 +28,7 @@ class IndoBERTBiLSTM(PreTrainedModel):
  self.hidden_dim = HIDDEN_DIM
  self.bidirectional = BIDIRECTIONAL

- self.bert = BertModel.from_pretrained(bert_path)
  self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size,
  hidden_size=self.hidden_dim,
  num_layers=self.n_layers,
 
  if device.type == 'cuda':
  USE_CUDA = True

+ base_bert = 'indobenchmark/indobert-base-p2'
  HIDDEN_DIM = 768
  OUTPUT_DIM = 2 # 2 if Binary Classification
  BIDIRECTIONAL = True
  DROPOUT = 0.2 # 0.2

  self.hidden_dim = HIDDEN_DIM
  self.bidirectional = BIDIRECTIONAL

+ self.bert = BertModel.from_pretrained(base_bert)
  self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size,
  hidden_size=self.hidden_dim,
  num_layers=self.n_layers,
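For context on the unchanged nn.LSTM lines above: with HIDDEN_DIM = 768 and BIDIRECTIONAL = True, the BiLSTM doubles the feature dimension of the IndoBERT output. A standalone shape check with a stand-in tensor instead of real BERT activations; num_layers=1 and batch_first=True are assumptions for the sketch, not values read from this commit:

import torch
import torch.nn as nn

HIDDEN_DIM = 768
lstm = nn.LSTM(input_size=768, hidden_size=HIDDEN_DIM, num_layers=1,
               bidirectional=True, batch_first=True)

fake_bert_output = torch.randn(2, 128, 768)  # (batch, seq_len, bert_hidden_size)
seq_out, _ = lstm(fake_bert_output)
print(seq_out.shape)  # torch.Size([2, 128, 1536]) -> 2 * HIDDEN_DIM because bidirectional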
requirements.txt CHANGED
@@ -1,6 +1,3 @@
  streamlit
  torch
- torchvision
- transformers
- tokenizers
- pickleshare
 
  streamlit
  torch
+ transformers