kadabengaran committed
Commit 5a7f50a · Parent: fe8f25f

update to single model

Files changed (2):
  1. app/main.py +8 -42
  2. app/model.py +37 -49
app/main.py CHANGED
@@ -5,7 +5,7 @@ try:
     import streamlit as st
     import re
     import streamlit as st
-    from transformers import BertTokenizer, BertModel
+    from transformers import BertTokenizer, AutoConfig
     from model import IndoBERTBiLSTM, IndoBERTModel
 except Exception as e:
     print(e)
@@ -19,19 +19,8 @@ img {
 """
 # Config
 MAX_SEQ_LEN = 128
-# bert_path = 'indolem/indobert-base-uncased'
-bert_path = 'indobenchmark/indobert-base-p2'
-MODELS_PATH = ["kadabengaran/IndoBERT-BiLSTM-Useful-App-Review",
-               "kadabengaran/IndoBERT-Useful-App-Review"]
-
-MODELS_NAME = ["IndoBERT-BiLSTM", "IndoBERT"]
+MODELS_PATH = "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review"
 LABELS = {'Not Useful': 0, 'Useful': 1}
-
-HIDDEN_DIM = 768
-OUTPUT_DIM = 2  # 2 if Binary
-N_LAYERS = 2
-BIDIRECTIONAL = True
-DROPOUT = 0.2
 
 # Get the Keys
 def get_key(val, my_dict):
@@ -39,19 +28,16 @@ def get_key(val, my_dict):
         if val == value:
             return key
 
-
 def get_device():
     if torch.cuda.is_available():
         return torch.device('cuda')
     else:
         return torch.device('cpu')
 
-
 def load_tokenizer(model_path):
     tokenizer = BertTokenizer.from_pretrained(model_path)
     return tokenizer
 
-
 def remove_special_characters(text):
     text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
     text = re.sub(r"\s+", " ", text)  # replace multiple whitespace characters with a single space
@@ -59,7 +45,6 @@ def remove_special_characters(text):
     text = text.lower()
     return text
 
-
 def preprocess(text, tokenizer, max_seq=MAX_SEQ_LEN):
     return tokenizer.encode_plus(text, add_special_tokens=True, max_length=max_seq,
                                  pad_to_max_length=True,
@@ -68,19 +53,8 @@ def preprocess(text, tokenizer, max_seq=MAX_SEQ_LEN):
                                  )
 
 def load_model():
-    bert = BertModel.from_pretrained(bert_path)
-
-    # Load the model
-    model_combined = IndoBERTBiLSTM.from_pretrained(MODELS_PATH[0],
-                                                    bert,
-                                                    HIDDEN_DIM,
-                                                    OUTPUT_DIM,
-                                                    N_LAYERS, BIDIRECTIONAL,
-                                                    DROPOUT)
-    model_base = IndoBERTModel.from_pretrained(MODELS_PATH[1],
-                                               bert,
-                                               OUTPUT_DIM)
-    return model_combined, model_base
+    model = IndoBERTBiLSTM.from_pretrained(MODELS_PATH)
+    return model
 
 def predict_single(text, model, tokenizer, device):
 
@@ -158,22 +132,14 @@ class App:
         st.markdown(html_temp, unsafe_allow_html=True)
         self.render_tabs()
         st.divider()
-        model_choice = self.render_model_selection()
-        if model_choice:
-            if model_choice == MODELS_NAME[0]:
-                model = model_combined
-            elif model_choice == MODELS_NAME[1]:
-                model = model_base
-            self.render_process_button(model, tokenizer, device)
+        model = model_combined
+        self.render_process_button(model, tokenizer, device)
 
     def init_session_state(self):
         if "tab_selected" not in st.session_state:
             st.session_state.tab_selected = tab_labels[0]
 
-    def render_model_selection(self):
-        model_choice = st.selectbox("Select Model", MODELS_NAME)
-        return model_choice
-
+
     def render_tabs(self):
         tab_selected = st.session_state.get('tab_selected', self.default_tab_selected)
         tab_selected = st.sidebar.radio("Select Input Type", tab_labels)
@@ -195,7 +161,7 @@ class App:
         """
         st.markdown(STYLE, unsafe_allow_html=True)
         file = st.file_uploader("Upload file", type=self.fileTypes)
-
+        # add "for a smooth process, a CSV of at most 1,000 rows of data"
 
         if not file:
             st.info("Please upload a file of type: " + ", ".join(self.fileTypes))
app/model.py CHANGED
@@ -1,28 +1,40 @@
1
  import torch.nn as nn
2
- from transformers import PreTrainedModel, BertConfig
 
 
 
 
 
 
 
3
 
4
  USE_CUDA = False
 
 
 
 
 
 
 
 
 
 
5
 
6
  class IndoBERTBiLSTM(PreTrainedModel):
7
  config_class = BertConfig
8
- def __init__(self, bert_config, bert_pretrained_path, hidden_dim, num_classes, n_layers, bidirectional, dropout):
9
  super().__init__(bert_config)
10
- self.output_dim = num_classes
11
- self.n_layers = n_layers
12
- self.hidden_dim = hidden_dim
13
- self.bidirectional = bidirectional
14
 
15
- self.bert = bert_pretrained_path
16
  self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size,
17
- hidden_size=hidden_dim,
18
- num_layers=n_layers,
19
- bidirectional=bidirectional,
20
  batch_first=True)
21
- self.dropout = nn.Dropout(dropout)
22
- self.global_pooling = nn.AdaptiveAvgPool1d(1)
23
- self.hidden_layer = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, hidden_dim * 2 if bidirectional else hidden_dim)
24
- self.output_layer = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, num_classes)
25
- self.relu = nn.ReLU()
26
 
27
  def forward(self, input_ids, attention_mask):
28
 
@@ -31,35 +43,26 @@ class IndoBERTBiLSTM(PreTrainedModel):
31
  output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
32
  sequence_output = output.last_hidden_state
33
 
34
- # apply dropout
35
- sequence_output = self.dropout(sequence_output)
36
- # print('output size of the bert:', last_hidden_state.size())
37
-
38
  lstm_output, (hidden_last, cn_last) = self.lstm(sequence_output, hidden)
39
- # print('output size of the LSTM:', lstm_output.size())
40
- lstm_output = self.dropout(lstm_output)
41
-
42
- # global pooling
43
- lstm_output = lstm_output.permute(0, 2, 1)
44
- pooled_output = self.global_pooling(lstm_output).squeeze()
45
 
46
- # pass through hidden layer
47
- hidden_layer_output = self.hidden_layer(pooled_output)
48
- hidden_layer_output = self.relu(hidden_layer_output)
49
 
50
  # output layer
51
- logits = self.output_layer(hidden_layer_output)
52
- # logits = nn.Softmax(dim=1)(logits)
53
 
54
  return logits
55
 
56
  def init_hidden(self, batch_size):
57
  weight = next(self.parameters()).data
58
-
59
  number = 1
60
  if self.bidirectional:
61
  number = 2
62
-
63
  if (USE_CUDA):
64
  hidden = (weight.new(self.n_layers*number, batch_size, self.hidden_dim).zero_().float().cuda(),
65
  weight.new(self.n_layers*number, batch_size, self.hidden_dim).zero_().float().cuda()
@@ -68,21 +71,6 @@ class IndoBERTBiLSTM(PreTrainedModel):
68
  hidden = (weight.new(self.n_layers*number, batch_size, self.hidden_dim).zero_().float(),
69
  weight.new(self.n_layers*number, batch_size, self.hidden_dim).zero_().float()
70
  )
71
-
72
- return hidden
73
-
74
-
75
- class IndoBERTModel(PreTrainedModel):
76
- config_class = BertConfig
77
- def __init__(self, bert_config, bert_pretrained, num_classes):
78
- super().__init__(bert_config)
79
- self.bert = bert_pretrained
80
- self.dropout = nn.Dropout(0.1)
81
- self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)
82
 
83
- def forward(self, input_ids, attention_mask):
84
- outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
85
- pooled_output = outputs.pooler_output
86
- pooled_output = self.dropout(pooled_output)
87
- logits = self.fc(pooled_output)
88
- return logits
 
1
  import torch.nn as nn
2
+ import torch
3
+ from transformers import BertModel, BertConfig, PreTrainedModel
4
+
5
+ def get_device():
6
+ if torch.cuda.is_available():
7
+ return torch.device('cuda')
8
+ else:
9
+ return torch.device('cpu')
10
 
11
  USE_CUDA = False
12
+ device = get_device()
13
+ if device.type == 'cuda':
14
+ USE_CUDA = True
15
+
16
+ bert_path = 'indobenchmark/indobert-base-p2'
17
+ HIDDEN_DIM = 768
18
+ OUTPUT_DIM = 2 # 2 if Binary Classification
19
+ N_LAYERS = 1 # 2
20
+ BIDIRECTIONAL = True
21
+ DROPOUT = 0.2 # 0.2
22
 
23
  class IndoBERTBiLSTM(PreTrainedModel):
24
  config_class = BertConfig
25
+ def __init__(self, bert_config):
26
  super().__init__(bert_config)
27
+ self.output_dim = OUTPUT_DIM
28
+ self.hidden_dim = HIDDEN_DIM
29
+ self.bidirectional = BIDIRECTIONAL
 
30
 
31
+ self.bert = BertModel.from_pretrained(bert_path)
32
  self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size,
33
+ hidden_size=self.hidden_dim,
34
+ bidirectional=self.bidirectional,
 
35
  batch_first=True)
36
+ self.dropout = nn.Dropout(DROPOUT)
37
+ self.output_layer = nn.Linear(self.hidden_dim * 2 if self.bidirectional else self.hidden_dim, self.output_dim)
 
 
 
38
 
39
  def forward(self, input_ids, attention_mask):
40
 
 
43
  output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
44
  sequence_output = output.last_hidden_state
45
 
 
 
 
 
46
  lstm_output, (hidden_last, cn_last) = self.lstm(sequence_output, hidden)
47
+ hidden_last_L=hidden_last[-2]
48
+ hidden_last_R=hidden_last[-1]
49
+ hidden_last_out=torch.cat([hidden_last_L,hidden_last_R],dim=-1) #[16, 1536]
 
 
 
50
 
51
+ # apply dropout
52
+ out = self.dropout(hidden_last_out)
 
53
 
54
  # output layer
55
+ logits = self.output_layer(out)
 
56
 
57
  return logits
58
 
59
  def init_hidden(self, batch_size):
60
  weight = next(self.parameters()).data
61
+
62
  number = 1
63
  if self.bidirectional:
64
  number = 2
65
+
66
  if (USE_CUDA):
67
  hidden = (weight.new(self.n_layers*number, batch_size, self.hidden_dim).zero_().float().cuda(),
68
  weight.new(self.n_layers*number, batch_size, self.hidden_dim).zero_().float().cuda()
 
71
  hidden = (weight.new(self.n_layers*number, batch_size, self.hidden_dim).zero_().float(),
72
  weight.new(self.n_layers*number, batch_size, self.hidden_dim).zero_().float()
73
  )
 
 
 
 
 
 
 
 
 
 
 
74
 
75
+ return hidden
76
+
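The rewritten forward pass classifies from the concatenated final hidden states of the bidirectional LSTM instead of average-pooling the whole output sequence through an extra hidden layer. A self-contained sketch of that hidden_last[-2] / hidden_last[-1] concatenation with toy sizes, not the model's real dimensions (the commit uses HIDDEN_DIM = 768, which gives the [16, 1536] shape noted in the diff for a batch of 16):

import torch
import torch.nn as nn

batch, seq_len, in_dim, hidden_dim = 16, 128, 32, 8  # toy stand-ins

lstm = nn.LSTM(input_size=in_dim, hidden_size=hidden_dim,
               bidirectional=True, batch_first=True)
x = torch.randn(batch, seq_len, in_dim)

out, (h_n, c_n) = lstm(x)
# h_n has shape (num_layers * num_directions, batch, hidden_dim) = (2, 16, 8).
# For a single-layer bidirectional LSTM, h_n[-2] is the forward direction's
# final state and h_n[-1] is the backward direction's.
h_fwd, h_bwd = h_n[-2], h_n[-1]
features = torch.cat([h_fwd, h_bwd], dim=-1)  # (batch, 2 * hidden_dim)
print(features.shape)  # torch.Size([16, 16]); [16, 1536] at hidden_dim=768

Concatenating the two directions' final states is a common, cheaper alternative to pooling every timestep, since each direction's last state already summarizes the full sequence from its end.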