kadabengaran committed
Commit 4cdda95 · 1 Parent(s): be8111b
Files changed (3)
  1. app/main.py +52 -55
  2. app/model.py +2 -3
  3. requirements.txt +1 -4
app/main.py CHANGED
@@ -5,33 +5,11 @@ try:
  import streamlit as st
  import re
  import streamlit as st
- from transformers import BertTokenizer, AutoConfig
- from model import IndoBERTBiLSTM, IndoBERTModel
  except Exception as e:
  print(e)
-
- STYLE = """
- <style>
- img {
- max-width: 100%;
- }
- </style>
- """
-
- footer="""<style>
- .footer {
- position: fixed;
- left: 0;
- bottom: 0;
- width: 100%;
- color: #e7e7e7;
- text-align: center;
- }
- </style>
- <div class="footer">
- <p>CUDA enabled</p>
- </div>
- """
  # Config
  MAX_SEQ_LEN = 128
  MODELS_PATH = "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review"
@@ -59,10 +37,16 @@ def load_tokenizer(model_path):
  return tokenizer

  def remove_special_characters(text):
- text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
- text = re.sub(r"\s+", " ", text) # replace multiple whitespace characters with a single space
- text = re.sub(r'[0-9]', ' ', text) #remove number
  text = text.lower()
  return text

  def preprocess(text, tokenizer, max_seq=MAX_SEQ_LEN):
@@ -76,7 +60,7 @@ def load_model():
  model = IndoBERTBiLSTM.from_pretrained(MODELS_PATH)
  return model

- def predict_single(text, model, tokenizer, device):

  if device.type == 'cuda':
  model.cuda()
@@ -95,16 +79,16 @@ def predict_single(text, model, tokenizer, device):
  test_ids = torch.cat(test_ids, dim=0)
  test_attention_mask = torch.cat(test_attention_mask, dim=0)

- # Forward pass, calculate logit predictions
  with torch.no_grad():
  outputs = model(test_ids.to(device),
  test_attention_mask.to(device))
  print("output ", outputs)
- predictions = torch.argmax(outputs, dim=-1)
- print("output ", predictions)
- return predictions.item()

- def predict_multiple(data, model, tokenizer, device):

  if device.type == 'cuda':
  model.cuda()
@@ -112,29 +96,26 @@ def predict_multiple(data, model, tokenizer, device):
  input_ids = []
  attention_masks = []
  for row in data.tolist():
- # Apply remove_special_characters function to title column
  text = remove_special_characters(row)
  text = preprocess(text, tokenizer)
  input_ids.append(text['input_ids'])
  attention_masks.append(text['attention_mask'])

- predictions = []

  with torch.no_grad():
  for i in range(len(input_ids)):
  test_ids = input_ids[i]
  test_attention_mask = attention_masks[i]
  outputs = model(test_ids.to(device), test_attention_mask.to(device))
- prediction = torch.argmax(outputs, dim= -1)
- prediction_label = get_key(prediction.item(), LABELS)
- predictions.append(prediction_label)

- return predictions

  tab_labels = ["Single Input", "Multiple Input"]
  class App:
-
- print("Loading All")
  def __init__(self):
  self.fileTypes = ["csv"]
  self.default_tab_selected = tab_labels[0]
@@ -153,6 +134,7 @@ class App:
  </div>
  """
  st.markdown(html_temp, unsafe_allow_html=True)
  self.render_tabs()
  st.divider()
  self.render_process_button(model, tokenizer, device)
@@ -165,7 +147,8 @@ class App:
  def render_tabs(self):
  tab_selected = st.session_state.get('tab_selected', self.default_tab_selected)
  tab_selected = st.sidebar.radio("Select Input Type", tab_labels)
- # tab1, tab2 = st.tabs(tab_labels)

  if tab_selected == tab_labels[0]:
  self.render_single_input()
@@ -181,9 +164,9 @@ class App:
  """
  Upload File
  """
- st.markdown(STYLE, unsafe_allow_html=True)
- file = st.file_uploader("Upload file", type=self.fileTypes)
- # add "for a smooth process, a maximum of 1000 rows of CSV data"

  if not file:
  st.info("Please upload a file of type: " + ", ".join(self.fileTypes))
@@ -194,7 +177,6 @@ class App:
  placeholder = st.empty()
  placeholder.dataframe(data.head(10))

-
  header_list = data.columns.tolist()
  header_list.insert(0, "---------- select column -------------")
  ques = st.radio("Select column to process", header_list, index=0)
@@ -212,18 +194,20 @@ class App:
  if st.session_state.tab_selected == tab_labels[0]:
  input_text = self.input_text
  if input_text:
- prediction = predict_single(input_text, model, tokenizer, device)
- prediction_label = get_key(prediction, LABELS)
- st.write("Prediction:", prediction_label)
  elif st.session_state.tab_selected == tab_labels[1]:
  df_process = self.csv_process
  if df_process is not None:
- prediction = predict_multiple(df_process, model, tokenizer, device)

  st.divider()
  st.write("Classification Result")
  input_file = self.csv_input
- input_file["classification_result"] = prediction
  st.dataframe(input_file.head(10))
  st.download_button(
  label="Download Result",
@@ -231,10 +215,23 @@ class App:
  file_name="classification_result.csv",
  mime="text/csv",
  )

  if __name__ == "__main__":
  app = App()
- app.run()
- if USE_CUDA:
- st.markdown(footer,unsafe_allow_html=True)
 
  import streamlit as st
  import re
  import streamlit as st
+ from transformers import BertTokenizer
+ from model import IndoBERTBiLSTM
  except Exception as e:
  print(e)
+
  # Config
  MAX_SEQ_LEN = 128
  MODELS_PATH = "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review"
 
  return tokenizer

  def remove_special_characters(text):
+ # case folding
  text = text.lower()
+
+ # remove special characters
+ text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
+ text = re.sub(r'[0-9]', ' ', text)
+
+ # replace multiple whitespace characters with a single space
+ text = re.sub(r"\s+", " ", text)
+
  return text

  def preprocess(text, tokenizer, max_seq=MAX_SEQ_LEN):
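A quick sanity check of the new cleaning order (case folding first, then stripping non-alphanumerics and digits, then collapsing whitespace). This mirrors the function as committed above; the sample review string is made up:

import re

def remove_special_characters(text):
    # same steps and order as the committed function
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    text = re.sub(r'[0-9]', ' ', text)
    text = re.sub(r"\s+", " ", text)
    return text

print(remove_special_characters("Aplikasi BAGUS!!! Versi 2.0 sering crash :("))
# -> "aplikasi bagus versi sering crash " (note that a trailing space can remain)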
 
  model = IndoBERTBiLSTM.from_pretrained(MODELS_PATH)
  return model

+ def classify_single(text, model, tokenizer, device):

  if device.type == 'cuda':
  model.cuda()
 
  test_ids = torch.cat(test_ids, dim=0)
  test_attention_mask = torch.cat(test_attention_mask, dim=0)

+ # Forward pass, calculate logit
  with torch.no_grad():
  outputs = model(test_ids.to(device),
  test_attention_mask.to(device))
  print("output ", outputs)
+ result = torch.argmax(outputs, dim=-1)
+ print("output ", result)
+ return result.item()

+ def classify_multiple(data, model, tokenizer, device):

  if device.type == 'cuda':
  model.cuda()
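classify_single returns the raw argmax index; the caller maps it back to a label via get_key(value, LABELS). Both get_key and LABELS are defined elsewhere in the app and are not part of this diff, so the snippet below is only a sketch of how that reverse lookup presumably works, with hypothetical label names and indices:

LABELS = {"Not Useful": 0, "Useful": 1}  # hypothetical mapping; the real one lives elsewhere in app/main.py

def get_key(value, labels):
    # reverse lookup: return the label whose class index equals `value`
    for key, idx in labels.items():
        if idx == value:
            return key

print(get_key(1, LABELS))  # -> "Useful"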
 
  input_ids = []
  attention_masks = []
  for row in data.tolist():
  text = remove_special_characters(row)
  text = preprocess(text, tokenizer)
  input_ids.append(text['input_ids'])
  attention_masks.append(text['attention_mask'])

+ result_list = []

  with torch.no_grad():
  for i in range(len(input_ids)):
  test_ids = input_ids[i]
  test_attention_mask = attention_masks[i]
  outputs = model(test_ids.to(device), test_attention_mask.to(device))
+ result = torch.argmax(outputs, dim=-1)
+ result_label = get_key(result.item(), LABELS)
+ result_list.append(result_label)

+ return result_list

  tab_labels = ["Single Input", "Multiple Input"]
  class App:

  def __init__(self):
  self.fileTypes = ["csv"]
  self.default_tab_selected = tab_labels[0]
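classify_multiple iterates over data.tolist(), so it expects a list-convertible column such as a pandas Series (the CSV column selected in the UI). A hedged usage sketch with made-up rows; the actual call is left commented out because it needs the loaded model, tokenizer, and device:

import pandas as pd

reviews = pd.Series([
    "Aplikasi sangat membantu untuk belajar",   # made-up sample rows
    "Sering error setelah update terakhir",
])
# labels = classify_multiple(reviews, model, tokenizer, device)
# expected result: one label string per row, e.g. ["Useful", "Not Useful"]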
 
  </div>
  """
  st.markdown(html_temp, unsafe_allow_html=True)
+ st.markdown("")
  self.render_tabs()
  st.divider()
  self.render_process_button(model, tokenizer, device)
 
  def render_tabs(self):
  tab_selected = st.session_state.get('tab_selected', self.default_tab_selected)
  tab_selected = st.sidebar.radio("Select Input Type", tab_labels)
+ if USE_CUDA:
+ st.sidebar.markdown(footer,unsafe_allow_html=True)

  if tab_selected == tab_labels[0]:
  self.render_single_input()
 
  """
  Upload File
  """
+ st.markdown("Upload file")
+ file = st.file_uploader("To ensure a smooth process, please use a maximum of 500 rows of data in the CSV file.",
+ type=self.fileTypes)

  if not file:
  st.info("Please upload a file of type: " + ", ".join(self.fileTypes))
 
  placeholder = st.empty()
  placeholder.dataframe(data.head(10))

  header_list = data.columns.tolist()
  header_list.insert(0, "---------- select column -------------")
  ques = st.radio("Select column to process", header_list, index=0)
 
  if st.session_state.tab_selected == tab_labels[0]:
  input_text = self.input_text
  if input_text:
+ classification = classify_single(input_text, model, tokenizer, device)
+ classification_label = get_key(classification, LABELS)
+ st.write("Classification result:", classification_label)
+ else:
+ st.warning('Please enter text to process', icon="⚠️")
  elif st.session_state.tab_selected == tab_labels[1]:
  df_process = self.csv_process
  if df_process is not None:
+ classification = classify_multiple(df_process, model, tokenizer, device)

  st.divider()
  st.write("Classification Result")
  input_file = self.csv_input
+ input_file["classification_result"] = classification
  st.dataframe(input_file.head(10))
  st.download_button(
  label="Download Result",
 
  file_name="classification_result.csv",
  mime="text/csv",
  )
+ else:
+ st.warning('Please upload a file to process', icon="⚠️")

+ footer="""<style>
+ .footer {
+ position: fixed;
+ left: 10;
+ bottom: 0;
+ width: 100%;
+ color: #ffa9365e;
+ }
+ </style>
+ <div class="footer">
+ <p>CUDA enabled</p>
+ </div>
+ """

  if __name__ == "__main__":
  app = App()
+ app.run()
 
 
app/model.py CHANGED
@@ -13,10 +13,9 @@ device = get_device()
  if device.type == 'cuda':
  USE_CUDA = True

- bert_path = 'indobenchmark/indobert-base-p2'
  HIDDEN_DIM = 768
  OUTPUT_DIM = 2 # 2 if Binary Classification
- N_LAYERS = 1 # 2
  BIDIRECTIONAL = True
  DROPOUT = 0.2 # 0.2
@@ -29,7 +28,7 @@ class IndoBERTBiLSTM(PreTrainedModel):
  self.hidden_dim = HIDDEN_DIM
  self.bidirectional = BIDIRECTIONAL

- self.bert = BertModel.from_pretrained(bert_path)
  self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size,
  hidden_size=self.hidden_dim,
  num_layers=self.n_layers,
 
  if device.type == 'cuda':
  USE_CUDA = True

+ base_bert = 'indobenchmark/indobert-base-p2'
  HIDDEN_DIM = 768
  OUTPUT_DIM = 2 # 2 if Binary Classification
  BIDIRECTIONAL = True
  DROPOUT = 0.2 # 0.2

  self.hidden_dim = HIDDEN_DIM
  self.bidirectional = BIDIRECTIONAL

+ self.bert = BertModel.from_pretrained(base_bert)
  self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size,
  hidden_size=self.hidden_dim,
  num_layers=self.n_layers,
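For context on the unchanged nn.LSTM lines above: with HIDDEN_DIM = 768 and BIDIRECTIONAL = True, the BiLSTM doubles the feature dimension of the IndoBERT output. A standalone shape check with a stand-in tensor instead of real BERT activations; num_layers=1 and batch_first=True are assumptions for the sketch, not values read from this commit:

import torch
import torch.nn as nn

HIDDEN_DIM = 768
lstm = nn.LSTM(input_size=768, hidden_size=HIDDEN_DIM, num_layers=1,
               bidirectional=True, batch_first=True)

fake_bert_output = torch.randn(2, 128, 768)  # (batch, seq_len, bert_hidden_size)
seq_out, _ = lstm(fake_bert_output)
print(seq_out.shape)  # torch.Size([2, 128, 1536]) -> 2 * HIDDEN_DIM because bidirectional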
requirements.txt CHANGED
@@ -1,6 +1,3 @@
  streamlit
  torch
- torchvision
- transformers
- tokenizers
- pickleshare
 
  streamlit
  torch
+ transformers