Commit 5a7f50a
Parent(s): fe8f25f
update to single model
Files changed:
- app/main.py +8 -42
- app/model.py +37 -49
app/main.py CHANGED
@@ -5,7 +5,7 @@ try:
     import streamlit as st
     import re
     import streamlit as st
-    from transformers import BertTokenizer,
+    from transformers import BertTokenizer, AutoConfig
     from model import IndoBERTBiLSTM, IndoBERTModel
 except Exception as e:
     print(e)
@@ -19,19 +19,8 @@ img {
 """
 # Config
 MAX_SEQ_LEN = 128
-
-bert_path = 'indobenchmark/indobert-base-p2'
-MODELS_PATH = ["kadabengaran/IndoBERT-BiLSTM-Useful-App-Review",
-               "kadabengaran/IndoBERT-Useful-App-Review"]
-
-MODELS_NAME = ["IndoBERT-BiLSTM", "IndoBERT"]
+MODELS_PATH = "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review"
 LABELS = {'Not Useful': 0, 'Useful': 1}
-
-HIDDEN_DIM = 768
-OUTPUT_DIM = 2 # 2 if Binary
-N_LAYERS = 2
-BIDIRECTIONAL = True
-DROPOUT = 0.2
 
 # Get the Keys
 def get_key(val, my_dict):
@@ -39,19 +28,16 @@ def get_key(val, my_dict):
         if val == value:
             return key
 
-
 def get_device():
     if torch.cuda.is_available():
         return torch.device('cuda')
     else:
         return torch.device('cpu')
 
-
 def load_tokenizer(model_path):
     tokenizer = BertTokenizer.from_pretrained(model_path)
     return tokenizer
 
-
 def remove_special_characters(text):
     text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
     text = re.sub(r"\s+", " ", text)  # replace multiple whitespace characters with a single space
@@ -59,7 +45,6 @@ def remove_special_characters(text):
     text = text.lower()
     return text
 
-
def preprocess(text, tokenizer, max_seq=MAX_SEQ_LEN):
     return tokenizer.encode_plus(text, add_special_tokens=True, max_length=max_seq,
                                  pad_to_max_length=True,
@@ -68,19 +53,8 @@ def preprocess(text, tokenizer, max_seq=MAX_SEQ_LEN):
                                  )
 
 def load_model():
-
-
-    # Load the model
-    model_combined = IndoBERTBiLSTM.from_pretrained(MODELS_PATH[0],
-                                                    bert,
-                                                    HIDDEN_DIM,
-                                                    OUTPUT_DIM,
-                                                    N_LAYERS, BIDIRECTIONAL,
-                                                    DROPOUT)
-    model_base = IndoBERTModel.from_pretrained(MODELS_PATH[1],
-                                               bert,
-                                               OUTPUT_DIM)
-    return model_combined, model_base
+    model = IndoBERTBiLSTM.from_pretrained(MODELS_PATH)
+    return model
 
 def predict_single(text, model, tokenizer, device):
 
@@ -158,22 +132,14 @@ class App:
         st.markdown(html_temp, unsafe_allow_html=True)
         self.render_tabs()
         st.divider()
-
-
-        if model_choice == MODELS_NAME[0]:
-            model = model_combined
-        elif model_choice == MODELS_NAME[1]:
-            model = model_base
-        self.render_process_button(model, tokenizer, device)
+        model = model_combined
+        self.render_process_button(model, tokenizer, device)
 
     def init_session_state(self):
         if "tab_selected" not in st.session_state:
             st.session_state.tab_selected = tab_labels[0]
 
-
-        model_choice = st.selectbox("Select Model", MODELS_NAME)
-        return model_choice
-
+
     def render_tabs(self):
         tab_selected = st.session_state.get('tab_selected', self.default_tab_selected)
         tab_selected = st.sidebar.radio("Select Input Type", tab_labels)
@@ -195,7 +161,7 @@ class App:
         """
         st.markdown(STYLE, unsafe_allow_html=True)
         file = st.file_uploader("Upload file", type=self.fileTypes)
-
+        # add "for smooth processing, a maximum of 1000 rows of CSV data"
 
         if not file:
             st.info("Please upload a file of type: " + ", ".join(self.fileTypes))
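Taken together, this hunk collapses the two-model selector into a single IndoBERT-BiLSTM path. One thing worth flagging: main.py still imports IndoBERTModel from model.py even though this commit deletes that class, so the surrounding try/except will swallow the resulting ImportError at startup. Below is a minimal sketch of the new single-model inference flow, assuming the tokenizer is loaded from the indobenchmark base checkpoint (the diff does not show which path load_tokenizer receives) and that prediction reduces the logits with an argmax; both are illustrative guesses, not code from this commit.

```python
import torch
from transformers import BertTokenizer
from model import IndoBERTBiLSTM  # IndoBERTModel no longer exists after this commit

MAX_SEQ_LEN = 128
MODELS_PATH = "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review"
LABELS = {'Not Useful': 0, 'Useful': 1}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Assumption: tokenizer comes from the base IndoBERT checkpoint, as in the old config.
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p2')
model = IndoBERTBiLSTM.from_pretrained(MODELS_PATH).to(device)
model.eval()

text = "aplikasinya sangat membantu"  # hypothetical review text
encoded = tokenizer.encode_plus(text, add_special_tokens=True,
                                max_length=MAX_SEQ_LEN,
                                pad_to_max_length=True,
                                return_tensors='pt')
with torch.no_grad():
    logits = model(encoded['input_ids'].to(device),
                   encoded['attention_mask'].to(device))

# Illustrative guess at predict_single's reduction step: pick the larger logit.
label = logits.argmax(dim=-1).item()  # 0 = 'Not Useful', 1 = 'Useful'
```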
app/model.py CHANGED
@@ -1,28 +1,40 @@
 import torch.nn as nn
-from transformers import BertConfig, PreTrainedModel
+import torch
+from transformers import BertModel, BertConfig, PreTrainedModel
+
+def get_device():
+    if torch.cuda.is_available():
+        return torch.device('cuda')
+    else:
+        return torch.device('cpu')
 
 USE_CUDA = False
+device = get_device()
+if device.type == 'cuda':
+    USE_CUDA = True
+
+bert_path = 'indobenchmark/indobert-base-p2'
+HIDDEN_DIM = 768
+OUTPUT_DIM = 2 # 2 if Binary Classification
+N_LAYERS = 1 # 2
+BIDIRECTIONAL = True
+DROPOUT = 0.2 # 0.2
 
 class IndoBERTBiLSTM(PreTrainedModel):
     config_class = BertConfig
-    def __init__(self, bert_config, bert_pretrained, hidden_dim, num_classes, n_layers, bidirectional, dropout):
+    def __init__(self, bert_config):
         super().__init__(bert_config)
-        self.output_dim = num_classes
-        self.n_layers = n_layers
-        self.hidden_dim = hidden_dim
-        self.bidirectional = bidirectional
+        self.output_dim = OUTPUT_DIM
+        self.hidden_dim = HIDDEN_DIM
+        self.bidirectional = BIDIRECTIONAL
 
-        self.bert = bert_pretrained
+        self.bert = BertModel.from_pretrained(bert_path)
         self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size,
-                            hidden_size=hidden_dim,
-                            num_layers=n_layers,
-                            bidirectional=bidirectional,
+                            hidden_size=self.hidden_dim,
+                            bidirectional=self.bidirectional,
                             batch_first=True)
-        self.dropout = nn.Dropout(dropout)
-        self.global_pooling = nn.AdaptiveMaxPool1d(1)
-        self.hidden_layer = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, hidden_dim * 2 if bidirectional else hidden_dim)
-        self.output_layer = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, num_classes)
-        self.relu = nn.ReLU()
+        self.dropout = nn.Dropout(DROPOUT)
+        self.output_layer = nn.Linear(self.hidden_dim * 2 if self.bidirectional else self.hidden_dim, self.output_dim)
 
     def forward(self, input_ids, attention_mask):
 
@@ -31,35 +43,26 @@ class IndoBERTBiLSTM(PreTrainedModel):
         output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
         sequence_output = output.last_hidden_state
 
-        # apply dropout
-        sequence_output = self.dropout(sequence_output)
-        # print('output size of the bert:', last_hidden_state.size())
-
         lstm_output, (hidden_last, cn_last) = self.lstm(sequence_output, hidden)
-
-
-
-        # global pooling
-        lstm_output = lstm_output.permute(0, 2, 1)
-        pooled_output = self.global_pooling(lstm_output).squeeze()
+        hidden_last_L=hidden_last[-2]
+        hidden_last_R=hidden_last[-1]
+        hidden_last_out=torch.cat([hidden_last_L,hidden_last_R],dim=-1) #[16, 1536]
 
-        # hidden layer
-        hidden_layer_output = self.hidden_layer(pooled_output)
-        hidden_layer_output = self.relu(hidden_layer_output)
+        # apply dropout
+        out = self.dropout(hidden_last_out)
 
         # output layer
-        logits = self.output_layer(hidden_layer_output)
-        # logits = nn.Softmax(dim=1)(logits)
+        logits = self.output_layer(out)
 
         return logits
 
     def init_hidden(self, batch_size):
         weight = next(self.parameters()).data
-
+
         number = 1
         if self.bidirectional:
             number = 2
-
+
         if (USE_CUDA):
             hidden = (weight.new(self.n_layers*number, batch_size, self.hidden_dim).zero_().float().cuda(),
                       weight.new(self.n_layers*number, batch_size, self.hidden_dim).zero_().float().cuda()
@@ -68,21 +71,6 @@ class IndoBERTBiLSTM(PreTrainedModel):
             hidden = (weight.new(self.n_layers*number, batch_size, self.hidden_dim).zero_().float(),
                       weight.new(self.n_layers*number, batch_size, self.hidden_dim).zero_().float()
                       )
-
-        return hidden
-
-
-class IndoBERTModel(PreTrainedModel):
-    config_class = BertConfig
-    def __init__(self, bert_config, bert_pretrained, num_classes):
-        super().__init__(bert_config)
-        self.bert = bert_pretrained
-        self.dropout = nn.Dropout(0.1)
-        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)
 
-    def forward(self, input_ids, attention_mask):
-        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
-        pooled_output = outputs.pooler_output
-        pooled_output = self.dropout(pooled_output)
-        logits = self.fc(pooled_output)
-        return logits
+        return hidden
+
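For completeness, a minimal sketch of exercising the revised IndoBERTBiLSTM directly, assuming the Hub checkpoint carries a standard BertConfig. Note that init_hidden still reads self.n_layers, which the new __init__ no longer assigns; the sketch restores that attribute by hand as an assumption, since the first lines of forward (where the hidden state is presumably built) fall outside this diff.

```python
import torch
from model import IndoBERTBiLSTM, N_LAYERS

model = IndoBERTBiLSTM.from_pretrained(
    "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review")
model.n_layers = N_LAYERS  # assumption: restore the attribute init_hidden expects
model.eval()

# Dummy batch: token ids padded/truncated to the app's MAX_SEQ_LEN of 128.
input_ids = torch.randint(0, model.bert.config.vocab_size, (2, 128))
attention_mask = torch.ones_like(input_ids)

# With N_LAYERS = 1 and BIDIRECTIONAL = True, each tensor is [2, 2, 768].
hidden = model.init_hidden(batch_size=2)

with torch.no_grad():
    logits = model(input_ids, attention_mask)  # expected shape: [2, OUTPUT_DIM]
```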
|