Spaces:

kadabengaran
/

useful-review-classification

Runtime error

kadabengaran committed on May 29, 2023

Commit

b767f00

1 Parent(s): e186fb5

Squashed commit of the following:

commit 8e43136c9db5455c3248a2d95aa18a4e7a25bd39
Author: Ardhy Satrio <[email protected]>
Date: Mon May 29 11:15:13 2023 +0800

multi input update

Files changed (1) hide show

app/main.py +168 -56

app/main.py CHANGED Viewed

@@ -1,15 +1,33 @@
-import torch
-import re
-import streamlit as st
-from transformers import BertTokenizer, BertModel
-from model import IndoBERTBiLSTM, IndoBERTModel
 # Config
 MAX_SEQ_LEN = 128
-bert_path = 'indolem/indobert-base-uncased'
-MODELS_PATH = ["kadabengaran/IndoBERT-Useful-App-Review",
-               "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review"]
             #    "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review"]
 HIDDEN_DIM = 768
 OUTPUT_DIM = 2 # 2 if Binary
@@ -37,15 +55,9 @@ def load_tokenizer(model_path):
 def remove_special_characters(text):
-    # menghapus karakter khusus kecuali tanda baca seperti titik, koma, dan tanda tanya
-    # text = re.sub(r"[^a-zA-Z0-9.,!?]+", " ", text)
     text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
-    # text = re.sub(r"'\s+|\s+'", " ", text)  # replace apostrophe with space if it's surrounded by whitespace
     text = re.sub(r"\s+", " ", text)  # replace multiple whitespace characters with a single space
     text = re.sub(r'[0-9]', ' ', text) #remove number
     text = text.lower()
     return text
@@ -61,21 +73,19 @@ def load_model():
     bert = BertModel.from_pretrained(bert_path)
 	# Load the model
-    model1 = IndoBERTBiLSTM.from_pretrained(MODELS_PATH[0],
                                      bert,
                                      HIDDEN_DIM,
                                      OUTPUT_DIM,
                                      N_LAYERS, BIDIRECTIONAL,
                                      DROPOUT)
-    model2 = IndoBERTModel.from_pretrained(MODELS_PATH[1],
                                      bert,
                                      OUTPUT_DIM)
-    return model1, model2
-def predict(text, model, tokenizer, device):
-    # model = torch.load(model_path, map_location=device)
     if device.type == 'cuda':
         model.cuda()
@@ -102,38 +112,140 @@ def predict(text, model, tokenizer, device):
     print("output ", predictions)
     return predictions.item()
-def main():
-    """App Review Classifier"""
-    # st.title("Klasifikasi Ulasan APlikasi")
-    # st.subheader("ML App with Streamlit")
-    html_temp = """
-	<div style="background-color:blue;padding:10px">
-	<h1 style="color:white;text-align:center;">Klasifikasi Ulasan Aplikasi yang Berguna</h1>
-	</div>
-	"""
-    st.markdown(html_temp, unsafe_allow_html=True)
-    # st.info("Prediction with ML")
-    input_text = st.text_area("Enter Text Here", placeholder="Type Here")
-    all_ml_models = ["IndoBERT", "IndoBERT-BiLSTM"]
-    model_choice = st.selectbox("Select Model", all_ml_models)
-    tokenizer = load_tokenizer(bert_path)
-    device = get_device()
-    model1, model2 = load_model()
-    prediction = 0
-    prediction_labels = {'Not Useful': 0, 'Useful': 1}
-    if st.button("Classify"):
-        st.text("Original Text:\n{}".format(input_text))
-        if model_choice == 'IndoBERT':
-            prediction = predict(input_text, model1, tokenizer, device)
-        elif model_choice == 'IndoBERT-BiLSTM':
-            prediction = predict(input_text, model2, tokenizer, device)
-        final_result = get_key(prediction, prediction_labels)
-        st.success("Review Categorized as:: {}".format(final_result))
-    # st.sidebar.subheader("About")
-if __name__ == '__main__':
-    main()

+# Imports are intentionally not wrapped in try/except: names such as re, st, pd
+# and torch are used unconditionally further down, so a missing dependency must
+# fail here with its real traceback instead of surfacing later as a NameError.
+import re
+import pandas as pd
+import streamlit as st
+import torch
+from transformers import BertTokenizer, BertModel
+from model import IndoBERTBiLSTM, IndoBERTModel
+# CSS passed to st.markdown on the file-upload view; caps image width at the container width.
+STYLE = """
+<style>
+img {
+    max-width: 100%;
+}
+</style>
+"""
 # Config
 MAX_SEQ_LEN = 128
+# Checkpoints are read from local directories. Hub ids used previously:
+#   bert_path   = 'indolem/indobert-base-uncased'
+#   MODELS_PATH = ["kadabengaran/IndoBERT-Useful-App-Review",
+#                  "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review"]
+bert_path = "./local/base-indobert"
+# Index 0 is loaded as IndoBERTBiLSTM and index 1 as IndoBERTModel.
+MODELS_PATH = ["./local/indobert1", "./local/indobert2"]
+# Display names offered in the model selector, in the same order as MODELS_PATH.
+MODELS_NAME = ["IndoBERT-BiLSTM", "IndoBERT"]
+# Label name -> class index; predicted indices are mapped back to names via get_key.
+LABELS = {"Not Useful": 0, "Useful": 1}
 HIDDEN_DIM = 768
 OUTPUT_DIM = 2 # 2 if Binary
 def remove_special_characters(text):
     """Reduce a review to lowercase ASCII letters separated by single spaces.
     Every character that is not an ASCII letter, digit or whitespace is deleted,
     each digit is replaced by a space, runs of whitespace are collapsed to one
     space, and the result is lower-cased. Leading/trailing whitespace is
     collapsed to a single space but not stripped.
     Args:
         text (str): raw review text.
     Returns:
         str: the cleaned text.
     """
     text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
     # Digits become spaces (not nothing) so words on either side stay separate.
     text = re.sub(r'[0-9]', ' ', text)
     # Collapse whitespace last: doing it before the digit replacement left runs
     # of spaces behind wherever a number had been.
     text = re.sub(r"\s+", " ", text)
     return text.lower()
     bert = BertModel.from_pretrained(bert_path)
 	# Load the model
+    model_combined = IndoBERTBiLSTM.from_pretrained(MODELS_PATH[0],
                                      bert,
                                      HIDDEN_DIM,
                                      OUTPUT_DIM,
                                      N_LAYERS, BIDIRECTIONAL,
                                      DROPOUT)
+    model_base = IndoBERTModel.from_pretrained(MODELS_PATH[1],
                                      bert,
                                      OUTPUT_DIM)
+    return model_combined, model_base
+def predict_single(text, model, tokenizer, device):
     if device.type == 'cuda':
         model.cuda()
     print("output ", predictions)
     return predictions.item()
+def predict_multiple(data, model, tokenizer, device):
+    """Classify every entry of a column and return one label name per entry.
+    Args:
+        data: column of review texts; must provide ``tolist()`` (the app passes a
+            DataFrame column).
+        model: classifier called as ``model(input_ids, attention_mask)``.
+        tokenizer: tokenizer forwarded to ``preprocess``.
+        device (torch.device): device on which inference runs.
+    Returns:
+        list: label names (keys of ``LABELS``), in the same order as ``data``.
+    """
+    # Keep the model on the same device as its inputs, as predict_single does for CUDA.
+    model.to(device)
+    # Inference mode, so dropout layers do not perturb the predictions.
+    model.eval()
+    encoded_rows = []
+    for row in data.tolist():
+        # Cells of a non-text column (numbers, NaN for empty cells) are not str and
+        # would make the regex cleaning raise TypeError; classify their text form.
+        text = remove_special_characters(row if isinstance(row, str) else str(row))
+        encoded_rows.append(preprocess(text, tokenizer))
+    predictions = []
+    with torch.no_grad():
+        for encoded in encoded_rows:
+            outputs = model(encoded['input_ids'].to(device),
+                            encoded['attention_mask'].to(device))
+            prediction = torch.argmax(outputs, dim=-1)
+            predictions.append(get_key(prediction.item(), LABELS))
+    return predictions
+tab_labels = ["Single Input", "Multiple Input"]
+class App:
+    """Streamlit page that classifies app reviews as 'Useful' or 'Not Useful'.
+    A single review can be typed in, or a CSV file uploaded and one of its
+    columns classified row by row.
+    """
+    # Runs once, when the class body is executed (not per instance).
+    print("Loading All")
+    def __init__(self):
+        self.fileTypes = ["csv"]
+        self.default_tab_selected = tab_labels[0]
+        self.input_text = None
+        self.input_file = None
+        # Must exist before any column is chosen: render_process_button reads it,
+        # and render_multiple_input returns early without assigning it.
+        self.process_file = None
+    def run(self):
+        """Render the page: header, input widgets, model selector and Process button."""
+        self.init_session_state()
+        tokenizer = load_tokenizer(bert_path)
+        device = get_device()
+        model_combined, model_base = load_model()
+        html_temp = """
+        <div style="background-color:blue;padding:10px">
+        <h1 style="color:white;text-align:center;">Klasifikasi Ulasan Aplikasi yang Berguna</h1>
+        </div>
+        """
+        st.markdown(html_temp, unsafe_allow_html=True)
+        self.render_tabs()
+        st.divider()
+        model_choice = self.render_model_selection()
+        # Display names and models are paired by position in MODELS_NAME.
+        models_by_name = dict(zip(MODELS_NAME, (model_combined, model_base)))
+        model = models_by_name.get(model_choice)
+        if model is not None:
+            self.render_process_button(model, tokenizer, device)
+    def init_session_state(self):
+        """Ensure ``st.session_state.tab_selected`` exists, defaulting to the first tab."""
+        if "tab_selected" not in st.session_state:
+            st.session_state.tab_selected = self.default_tab_selected
+    def render_model_selection(self):
+        """Show the model selector and return the chosen display name."""
+        return st.selectbox("Select Model", MODELS_NAME)
+    def render_tabs(self):
+        """Show the input-type switch in the sidebar and render the matching input widgets."""
+        tab_selected = st.sidebar.radio("Select Input Type", tab_labels)
+        if tab_selected == tab_labels[0]:
+            self.render_single_input()
+        elif tab_selected == tab_labels[1]:
+            self.render_multiple_input()
+        # Stored so render_process_button knows which input to classify.
+        st.session_state.tab_selected = tab_selected
+    def render_single_input(self):
+        """Show the free-text box and keep its content in ``self.input_text``."""
+        self.input_text = st.text_area("Enter Text Here", placeholder="Type Here")
+    def render_multiple_input(self):
+        """Render the CSV upload widgets and remember the column chosen for classification.
+        ``self.input_file`` (whole table) and ``self.process_file`` (selected column)
+        are assigned only once a file is uploaded and a column is picked.
+        """
+        st.markdown(STYLE, unsafe_allow_html=True)
+        file = st.file_uploader("Upload file", type=self.fileTypes)
+        if not file:
+            st.info("Please upload a file of type: " + ", ".join(self.fileTypes))
+            return
+        data = pd.read_csv(file)
+        placeholder = st.empty()
+        placeholder.dataframe(data.head(10))
+        # The first radio entry is a dummy, so no real column is pre-selected.
+        header_list = data.columns.tolist()
+        header_list.insert(0, "---------- select column -------------")
+        ques = st.radio("Select column to process", header_list, index=0)
+        if header_list.index(ques) == 0:
+            st.warning("Please select a column to process")
+            return
+        self.input_file = data
+        self.process_file = data[ques]
+    def render_process_button(self, model, tokenizer, device):
+        """Show the Process button and, when it is clicked, classify the current input.
+        Args:
+            model: classifier selected by the user.
+            tokenizer: tokenizer passed through to the predict functions.
+            device: device passed through to the predict functions.
+        """
+        if not st.button("Process"):
+            return
+        if st.session_state.tab_selected == tab_labels[0]:
+            input_text = self.input_text
+            if input_text:
+                prediction = predict_single(input_text, model, tokenizer, device)
+                prediction_label = get_key(prediction, LABELS)
+                st.write("Prediction:", prediction_label)
+        elif st.session_state.tab_selected == tab_labels[1]:
+            df_process = self.process_file
+            # None means no file was uploaded or no column was chosen yet.
+            if df_process is not None:
+                prediction = predict_multiple(df_process, model, tokenizer, device)
+                st.divider()
+                st.write("Classification Result")
+                input_file = self.input_file
+                input_file["classification_result"] = prediction
+                st.dataframe(input_file.head(10))
+                st.download_button(
+                    label="Download Result",
+                    data=input_file.to_csv().encode("utf-8"),
+                    file_name="classification_result.csv",
+                    mime="text/csv",
+                )
+if __name__ == "__main__":
+    # Build the page object and render it when this file is executed as a script.
+    App().run()