Spaces:

kadabengaran
/

useful-review-classification

Runtime error

File size: 7,684 Bytes

b767f00
 
 
 
 
4cdda95
 
ee9b192
b767f00
 
4cdda95
e186fb5
 
5a7f50a
b767f00
e186fb5
 
 
 
 
 
 
be8111b
24d7bd7
 
 
 
 
 
 
 
 
 
e186fb5
 
 
 
 
4cdda95
e186fb5
4cdda95
 
 
 
 
 
 
 
e186fb5
 
 
 
 
 
 
 
 
 
5a7f50a
 
e186fb5
4cdda95
e186fb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4cdda95
e186fb5
 
 
 
4cdda95
 
 
e186fb5
4cdda95
4e2ac67
 
 
 
b767f00
 
 
 
 
 
 
 
4cdda95
e186fb5
b767f00
ee9b192
b767f00
 
 
4cdda95
 
570c67b
b767f00
4cdda95
b767f00
 
 
 
 
 
 
b2d3878
 
b767f00
 
 
39b4480
9c9b15b
b767f00
 
 
 
 
 
 
4cdda95
b767f00
 
5a7f50a
b767f00
 
 
 
 
5a7f50a
b767f00
 
 
4cdda95
 
b767f00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4cdda95
 
 
b767f00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3543580
b2d3878
 
b767f00
 
 
 
 
 
4cdda95
 
 
 
 
b767f00
b2d3878
b767f00
4cdda95
b767f00
 
 
b2d3878
4cdda95
b767f00
 
 
 
 
 
 
4cdda95
 
b767f00
4cdda95
 
 
 
 
 
 
 
 
 
 
 
 
b767f00
 
 
4cdda95

try:
    import torch
    import pandas as pd
    import streamlit as st
    import re
    from transformers import BertTokenizer
    from model import IndoBERTBiLSTM
    from stqdm import stqdm
except Exception as e:
    print(e)
    
# Config
# Sequence length handed to the tokenizer as `max_length`; it is the default
# value of `max_seq` in preprocess().
MAX_SEQ_LEN = 128
# Identifier passed to `from_pretrained` for both the tokenizer and the model.
MODELS_PATH = "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review"
# Display label -> class index. get_key() does the reverse lookup on the
# argmax of the model output.
LABELS = {'Not Useful': 0, 'Useful': 1}

def get_device():
    """Return the torch device used for inference.

    The result is the ``cuda`` device when ``torch.cuda.is_available()``
    reports True, and the ``cpu`` device otherwise.
    """
    backend = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(backend)

# Resolve the inference device once at import time and record whether it is
# a GPU; USE_CUDA gates the "CUDA enabled" footer shown in the sidebar.
device = get_device()
USE_CUDA = device.type == 'cuda'

def get_key(val, my_dict):
    """Return the first key of ``my_dict`` whose value equals ``val``.

    Entries are scanned in the dictionary's iteration order. ``None`` is
    returned when no value matches.
    """
    matching_keys = (key for key, value in my_dict.items() if val == value)
    return next(matching_keys, None)

def load_tokenizer(model_path):
    """Load the BERT tokenizer identified by ``model_path``.

    ``model_path`` is forwarded unchanged to
    ``BertTokenizer.from_pretrained``.
    """
    return BertTokenizer.from_pretrained(model_path)

def remove_special_characters(text):
    """Normalise raw review text before it is tokenised.

    The text is lower-cased, every character that is not an ASCII letter,
    digit or whitespace is replaced by a space, every digit is replaced by a
    space, and finally each run of whitespace is collapsed into one space.
    Leading and trailing spaces are not stripped.
    """
    # Case folding.
    cleaned = text.lower()

    # Blank out special characters first, then digits.
    for pattern in (r'[^a-zA-Z0-9\s]', r'[0-9]'):
        cleaned = re.sub(pattern, ' ', cleaned)

    # Collapse consecutive whitespace into a single space.
    return re.sub(r"\s+", " ", cleaned)

def preprocess(text, tokenizer, max_seq=MAX_SEQ_LEN):
    """Encode ``text`` into fixed-length model inputs.

    Args:
        text: The sentence to encode.
        tokenizer: Tokenizer object exposing ``encode_plus``.
        max_seq: Length, in tokens, that the encoding is padded or truncated
            to.

    Returns:
        The ``encode_plus`` result, requested with an attention mask and
        PyTorch tensors (``return_tensors='pt'``).
    """
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_seq,
        # `pad_to_max_length=True` is deprecated; `padding='max_length'` is
        # its documented replacement.
        padding='max_length',
        # Ask for truncation explicitly rather than relying on the
        # tokenizer's implicit fallback when only `max_length` is given.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    
def load_model():
    """Load the IndoBERT-BiLSTM classifier identified by ``MODELS_PATH``."""
    return IndoBERTBiLSTM.from_pretrained(MODELS_PATH)

def classify_single(text, model, tokenizer, device):
    """Classify one review and return the predicted class index.

    Args:
        text: Raw review text. It is cleaned with
            ``remove_special_characters`` and encoded with ``preprocess``.
        model: Classifier invoked as ``model(input_ids, attention_mask)``.
        tokenizer: Tokenizer forwarded to ``preprocess``.
        device: ``torch.device`` that the model and its inputs are moved to.

    Returns:
        int: Index of the highest-scoring class. Callers map it back to a
        label name with ``get_key(index, LABELS)``.
    """
    # Put the model on the same device as its inputs and force evaluation
    # mode so layers such as dropout cannot make predictions non-deterministic.
    model.to(device)
    model.eval()

    encoding = preprocess(remove_special_characters(text), tokenizer)

    # Forward pass only; no gradients are needed for inference.
    with torch.no_grad():
        outputs = model(encoding['input_ids'].to(device),
                        encoding['attention_mask'].to(device))

    return torch.argmax(outputs, dim=-1).item()

def classify_multiple(data, model, tokenizer, device):
    """Classify every entry of ``data`` and return the predicted labels.

    Args:
        data: Object exposing ``tolist()`` that yields one review string per
            row (the app passes a pandas Series converted to ``str``).
        model: Classifier invoked as ``model(input_ids, attention_mask)``.
        tokenizer: Tokenizer forwarded to ``preprocess``.
        device: ``torch.device`` that the model and its inputs are moved to.

    Returns:
        list: One label name per row, in input order, looked up in
        ``LABELS`` with ``get_key``.
    """
    # Put the model on the same device as its inputs and force evaluation
    # mode so layers such as dropout cannot make predictions non-deterministic.
    model.to(device)
    model.eval()

    result_list = []
    with torch.no_grad():
        # Rows are cleaned, encoded and classified one at a time inside the
        # stqdm-wrapped loop, so no intermediate lists of tensors are kept.
        for row in stqdm(data.tolist()):
            encoding = preprocess(remove_special_characters(row), tokenizer)
            outputs = model(encoding['input_ids'].to(device),
                            encoding['attention_mask'].to(device))
            predicted = torch.argmax(outputs, dim=-1).item()
            result_list.append(get_key(predicted, LABELS))

    return result_list

# Sidebar choices: index 0 selects the text box, index 1 the CSV upload.
tab_labels = ["Single Input", "Multiple Input"]


class App:
    """Streamlit page that runs the review classifier.

    The sidebar switches between a single text box and a CSV upload; the
    "Process" button classifies whichever input is currently active.
    """

    def __init__(self):
        self.fileTypes = ["csv"]  # extensions accepted by the file uploader
        self.default_tab_selected = tab_labels[0]
        self.input_text = None  # text entered in single-input mode
        self.csv_input = None  # uploaded DataFrame (multiple-input mode)
        self.csv_process = None  # selected column of csv_input, as strings

    def run(self):
        """Load the model, draw the inputs and wire up the Process button."""
        self.init_session_state()  # Initialize session state
        # Streamlit re-executes the script on every widget interaction.
        # st.cache_resource keeps one tokenizer/model alive across those
        # reruns instead of loading them again each time.
        tokenizer = st.cache_resource(load_tokenizer)(MODELS_PATH)
        model = st.cache_resource(load_model)()
        # NOTE(review): the string below is not this method's docstring
        # because it is not the first statement. It is kept in place
        # unchanged; confirm whether it is meant to be displayed or is a
        # misplaced docstring.
        """App Review Classifier"""
        html_temp = """
        <div style="background-color:blue;padding:10px">
        <h1 style="color:white;text-align:center;">Klasifikasi Ulasan Aplikasi yang Berguna</h1>
        </div>
        """
        st.markdown(html_temp, unsafe_allow_html=True)
        st.markdown("")
        self.render_tabs()
        st.divider()
        self.render_process_button(model, tokenizer, device)

    def init_session_state(self):
        """Seed ``st.session_state.tab_selected`` with the first tab."""
        if "tab_selected" not in st.session_state:
            st.session_state.tab_selected = tab_labels[0]

    def render_tabs(self):
        """Draw the sidebar selector and the input widget it selects."""
        tab_selected = st.sidebar.radio("Select Input Type", tab_labels)
        if USE_CUDA:
            st.sidebar.markdown(footer, unsafe_allow_html=True)

        if tab_selected == tab_labels[0]:
            self.render_single_input()
        elif tab_selected == tab_labels[1]:
            self.render_multiple_input()

        # Remembered so render_process_button knows which input to read.
        st.session_state.tab_selected = tab_selected

    def render_single_input(self):
        """Draw the text area and store its content in ``self.input_text``."""
        self.input_text = st.text_area("Enter Text Here", placeholder="Type Here")

    def render_multiple_input(self):
        """Draw the CSV uploader and column picker.

        ``self.csv_input`` and ``self.csv_process`` are only set once a file
        has been uploaded and a real column has been chosen.
        """
        st.markdown("Upload file")
        file = st.file_uploader("To ensure a smooth process, please use a maximum of 500 rows of data in the CSV file.", 
                                type=self.fileTypes)

        if not file:
            st.info("Please upload a file of type: " + ", ".join(self.fileTypes))
            return

        data = pd.read_csv(file)

        # Preview of the first rows of the uploaded file.
        placeholder = st.empty()
        placeholder.dataframe(data.head(10))

        # The first radio entry is a placeholder meaning "nothing chosen yet".
        header_list = data.columns.tolist()
        header_list.insert(0, "---------- select column -------------")
        ques = st.radio("Select column to process", header_list, index=0)

        if header_list.index(ques) == 0:
            st.warning("Please select a column to process")
            return

        df_process = data[ques].astype(str)
        self.csv_input = data
        self.csv_process = df_process

    def render_process_button(self, model, tokenizer, device):
        """Draw the Process button and show results when it is clicked.

        Args:
            model: Classifier passed through to the classify helpers.
            tokenizer: Tokenizer passed through to the classify helpers.
            device: Device passed through to the classify helpers.
        """
        if st.button("Process"):
            if st.session_state.tab_selected == tab_labels[0]:
                input_text = self.input_text
                # Whitespace-only text carries nothing to classify, so it is
                # treated the same as an empty box.
                if input_text and input_text.strip():
                    classification = classify_single(input_text, model, tokenizer, device)
                    classification_label = get_key(classification, LABELS)
                    st.write("Classification result:", classification_label)
                else:
                    st.warning('Please enter text to process', icon="⚠️")
            elif st.session_state.tab_selected == tab_labels[1]:
                df_process = self.csv_process
                if df_process is not None:
                    classification = classify_multiple(df_process, model, tokenizer, device)

                    st.divider()
                    st.write("Classification Result")
                    # The predictions are appended to the uploaded table as a
                    # new column before preview and download.
                    input_file = self.csv_input
                    input_file["classification_result"] = classification
                    st.dataframe(input_file.head(10))
                    st.download_button(
                        label="Download Result",
                        data=input_file.to_csv().encode("utf-8"),
                        file_name="classification_result.csv",
                        mime="text/csv",
                    )
                else:
                    st.warning('Please upload a file to process', icon="⚠️")
    
# HTML/CSS snippet that App.render_tabs writes to the sidebar when USE_CUDA is
# true; it shows a "CUDA enabled" note in a fixed-position footer.
# NOTE(review): `left: 10;` has no unit, so browsers presumably ignore that
# declaration - confirm the intended offset before changing it.
footer="""<style>
.footer {
position: fixed;
left: 10;
bottom: 0;
width: 100%;
color: #ffa9365e;
}
</style>
<div class="footer">
<p>CUDA enabled</p>
</div>
"""

# Entry point: build the App and render the page when this file is executed
# as the main script.
if __name__ == "__main__":
    app = App()
    app.run()