rdsarjito committed
Commit f391e9e · 1 Parent(s): 554b605
app.py CHANGED
@@ -1,15 +1,22 @@
-# app.py
 import streamlit as st
 import torch
 import torch.nn as nn
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import re
 import numpy as np
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import requests
+from bs4 import BeautifulSoup
 
+# Set page configuration
+st.set_page_config(page_title="Aplikasi Deteksi Alergen", page_icon="🍲", layout="wide")
+
 # Target labels
 target_columns = ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']
 
-# Clean text
+# Device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Text cleaning
 def clean_text(text):
     text = text.replace('--', ' ')
     text = re.sub(r"http\S+", "", text)
@@ -19,67 +26,159 @@ def clean_text(text):
     text = text.strip().lower()
     return text
 
-# Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained("tokenizer_dir")
-max_length = 128
-
-# Define model architecture
+# Multilabel BERT model
 class MultilabelBertClassifier(nn.Module):
     def __init__(self, model_name, num_labels):
         super(MultilabelBertClassifier, self).__init__()
         self.bert = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
         self.bert.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
 
     def forward(self, input_ids, attention_mask):
         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
         return outputs.logits
 
 # Load model
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model = torch.load("model/alergen_model_full.pt", map_location=device)
-
-# If the model is wrapped in DataParallel, unwrap the original model
-if hasattr(model, "module"):
-    model = model.module
-
-model.to(device)
-model.eval()
-
-# Prediction function
-def predict_alergens(text):
-    cleaned = clean_text(text)
-    inputs = tokenizer.encode_plus(
-        cleaned,
+@st.cache_resource
+def load_model():
+    tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p2')
+    model = MultilabelBertClassifier('indobenchmark/indobert-base-p1', len(target_columns))
+    try:
+        state_dict = torch.load('model/alergen_model.pt', map_location=device)
+        if 'model_state_dict' in state_dict:
+            model_state_dict = state_dict['model_state_dict']
+        else:
+            model_state_dict = state_dict
+        # Strip the 'module.' prefix that DataParallel training leaves on parameter names
+        new_state_dict = {k[7:] if k.startswith('module.') else k: v for k, v in model_state_dict.items()}
+        model.load_state_dict(new_state_dict, strict=False)
+        st.success("Model berhasil dimuat!")
+    except Exception as e:
+        st.error(f"Error loading model: {str(e)}")
+        st.info("Menggunakan model tanpa pre-trained weights.")
+    model.to(device)
+    model.eval()
+    return tokenizer, model
+
+def predict_alergens(ingredients_text, tokenizer, model, threshold=0.5, max_length=128):
+    cleaned_text = clean_text(ingredients_text)
+    encoding = tokenizer.encode_plus(
+        cleaned_text,
         add_special_tokens=True,
         max_length=max_length,
         truncation=True,
         return_tensors='pt',
         padding='max_length'
     )
-    input_ids = inputs['input_ids'].to(device)
-    attention_mask = inputs['attention_mask'].to(device)
-
+    input_ids = encoding['input_ids'].to(device)
+    attention_mask = encoding['attention_mask'].to(device)
+
     with torch.no_grad():
-        logits = model(input_ids=input_ids, attention_mask=attention_mask)
-        probs = torch.sigmoid(logits)
-        preds = (probs > 0.5).float().cpu().numpy()[0]
-
-    return {target: bool(preds[i]) for i, target in enumerate(target_columns)}
-
-# Streamlit UI
-st.title("Deteksi Alergen dari Resep Masakan 🧪🍲")
-
-recipe_input = st.text_area("Masukkan bahan-bahan resep di sini:", height=200)
-
-if st.button("Deteksi Alergen"):
-    if recipe_input.strip() == "":
-        st.warning("Silakan masukkan teks resep terlebih dahulu.")
-    else:
-        with st.spinner("Menganalisis..."):
-            result = predict_alergens(recipe_input)
-        st.subheader("Hasil Prediksi Alergen:")
-        for allergen, is_present in result.items():
-            if is_present:
-                st.error(f"⚠️ {allergen}")
+        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+        probs = torch.sigmoid(outputs).cpu().numpy()[0]  # sigmoid output (0-1)
+
+    results = []
+    for i, label in enumerate(target_columns):
+        present = probs[i] > threshold
+        percent = float(probs[i]) * 100
+        results.append({
+            'label': label,
+            'present': present,
+            'probability': percent
+        })
+    return results
+
+# Scrape Cookpad
+def scrape_ingredients_from_url(url):
+    try:
+        headers = {"User-Agent": "Mozilla/5.0"}
+        response = requests.get(url, headers=headers)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        ingredients_div = soup.find('div', id='ingredients')
+        if not ingredients_div:
+            return None
+        items = ingredients_div.find_all(['li', 'span'])
+        ingredients = [item.get_text(strip=True) for item in items if item.get_text(strip=True)]
+        return '\n'.join(ingredients)
+    except Exception as e:
+        st.error(f"Gagal mengambil data dari URL: {e}")
+        return None
+
+# Main App
+def main():
+    st.title("Aplikasi Deteksi Alergen dalam Resep")
+    st.markdown("""
+    Aplikasi ini memprediksi alergen yang terkandung dalam resep makanan berdasarkan bahan-bahan.
+    """)
+
+    with st.spinner("Memuat model..."):
+        tokenizer, model = load_model()
+
+    col1, col2 = st.columns([3, 2])
+
+    with col1:
+        st.subheader("Masukkan URL Resep dari Cookpad")
+        url = st.text_input("Contoh: https://cookpad.com/id/resep/24678703-gulai-telur-tahu-dan-kacang-panjang")
+
+        threshold = st.slider(
+            "Atur Threshold Deteksi Alergen",
+            min_value=0.1,
+            max_value=0.9,
+            value=0.5,
+            step=0.05,
+            help="Semakin rendah threshold, semakin sensitif model terhadap kemungkinan adanya alergen."
+        )
+
+        if st.button("Deteksi Alergen", type="primary"):
+            if url:
+                with st.spinner("Mengambil bahan resep dari URL..."):
+                    ingredients = scrape_ingredients_from_url(url)
+
+                if ingredients:
+                    st.text_area("Daftar Bahan", ingredients, height=200)
+                    with st.spinner("Menganalisis bahan..."):
+                        alergens = predict_alergens(ingredients, tokenizer, model, threshold=threshold)
+
+                    with col2:
+                        st.subheader("Hasil Deteksi")
+                        emoji_map = {
+                            'susu': '🥛',
+                            'kacang': '🥜',
+                            'telur': '🥚',
+                            'makanan_laut': '🦐',
+                            'gandum': '🌾'
+                        }
+
+                        detected = []
+                        for result in alergens:
+                            label = result['label']
+                            name = label.replace('_', ' ').title()
+                            prob = result['probability']
+                            present = result['present']
+                            emoji = emoji_map.get(label, '')
+
+                            if present:
+                                st.error(f"{emoji} {name}: Terdeteksi ⚠️ ({prob:.2f}%)")
+                                detected.append(name)
+                            else:
+                                st.success(f"{emoji} {name}: Tidak Terdeteksi ✓ ({prob:.2f}%)")
+
+                        if detected:
+                            st.warning(f"Resep ini mengandung alergen: {', '.join(detected)}")
+                        else:
+                            st.success("Resep ini tidak mengandung alergen yang terdeteksi.")
+                else:
+                    st.warning("Gagal mengambil bahan dari halaman Cookpad. Pastikan URL valid.")
             else:
-                st.success(f"Bebas dari {allergen}")
+                st.warning("Silakan masukkan URL resep terlebih dahulu.")
+
+    with st.expander("Tentang Aplikasi"):
+        st.markdown("""
+        Aplikasi ini menggunakan model IndoBERT untuk deteksi 5 jenis alergen dari bahan resep:
+        - Susu 🥛
+        - Kacang 🥜
+        - Telur 🥚
+        - Makanan Laut 🦐
+        - Gandum 🌾
+        """)
+
+if __name__ == "__main__":
+    main()
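For reference, a minimal sketch of exercising the new prediction path outside the Streamlit UI. Assumptions: the checkpoint exists at model/alergen_model.pt as written by save_model.py below, and a matched indobert-base-p1 tokenizer/model pair is intended (note that load_model() above pairs the indobert-base-p2 tokenizer with indobert-base-p1 weights, which may be unintentional); the sample ingredient string is illustrative only.

    import torch
    import torch.nn as nn
    from transformers import AutoTokenizer, AutoModelForSequenceClassification

    target_columns = ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    class MultilabelBertClassifier(nn.Module):
        # Same architecture as in app.py / save_model.py
        def __init__(self, model_name, num_labels):
            super().__init__()
            self.bert = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
            self.bert.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

        def forward(self, input_ids, attention_mask):
            return self.bert(input_ids=input_ids, attention_mask=attention_mask).logits

    # Assumption: matched p1 tokenizer and p1 weights
    tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
    model = MultilabelBertClassifier('indobenchmark/indobert-base-p1', len(target_columns))

    ckpt = torch.load('model/alergen_model.pt', map_location=device)
    state = ckpt.get('model_state_dict', ckpt)
    # Strip the 'module.' prefix left over from DataParallel training
    state = {k[7:] if k.startswith('module.') else k: v for k, v in state.items()}
    model.load_state_dict(state, strict=False)
    model.to(device).eval()

    # Illustrative ingredient list (not from the commit)
    enc = tokenizer("2 butir telur, 200 ml susu cair, 100 g tepung terigu",
                    truncation=True, max_length=128, padding='max_length', return_tensors='pt')
    with torch.no_grad():
        probs = torch.sigmoid(model(enc['input_ids'].to(device), enc['attention_mask'].to(device)))[0]
    print({label: round(float(p), 3) for label, p in zip(target_columns, probs)})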
model/{alergen_model_full.pt → alergen_model.pt} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a7b5bbb0945b811482c8bb868a13bd655572de100833a50fd516efc0e52b7c17
-size 497911105
+oid sha256:28df831b272894c11265ef5f4cf1ac2a2ca89e765b26bff928f34c388ff015d5
+size 497868974
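The Git LFS pointer above records the new checkpoint's content hash and byte size. After a `git lfs pull`, the local file can be checked against it with a short sketch like this (expected values are taken from the pointer):

    import hashlib, os

    path = 'model/alergen_model.pt'
    print(os.path.getsize(path))  # pointer records size 497868974

    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            h.update(chunk)
    print(h.hexdigest())  # pointer records oid sha256:28df831b272894c11265ef5f4cf1ac2a2ca89e765b26bff928f34c388ff015d5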
requirements.txt CHANGED
@@ -1,4 +1,6 @@
1
- streamlit==1.30.0
2
  torch==2.0.1
3
  transformers==4.36.2
4
- numpy==1.25.2
 
 
 
1
+ streamlit==1.31.0
2
  torch==2.0.1
3
  transformers==4.36.2
4
+ numpy==1.25.2
5
+ scikit-learn==1.3.0
6
+ tqdm==4.66.1
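Note: the updated app.py also imports requests and bs4 (BeautifulSoup), which this commit does not add here; a deployment will likely need two further entries along these lines (left unpinned, since the commit records no versions):

    requests
    beautifulsoup4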
save_model.py ADDED
@@ -0,0 +1,51 @@
+import os
+import torch
+import torch.nn as nn
+from transformers import AutoModelForSequenceClassification
+
+# Define target columns
+target_columns = ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']
+
+# Define model for multilabel classification
+class MultilabelBertClassifier(nn.Module):
+    def __init__(self, model_name, num_labels):
+        super(MultilabelBertClassifier, self).__init__()
+        self.bert = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
+        # Replace the classification head with our own for multilabel
+        self.bert.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
+
+    def forward(self, input_ids, attention_mask):
+        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+        return outputs.logits
+
+# Set device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Using device: {device}")
+
+# Initialize model
+model = MultilabelBertClassifier('indobenchmark/indobert-base-p1', len(target_columns))
+
+# Load the best model for evaluation
+print("Loading model from best_alergen_model.pt...")
+state_dict = torch.load('best_alergen_model.pt', map_location=device)
+
+# If the model was trained with DataParallel, we need to remove the 'module.' prefix
+new_state_dict = {}
+for k, v in state_dict.items():
+    name = k[7:] if k.startswith('module.') else k
+    new_state_dict[name] = v
+
+model.load_state_dict(new_state_dict)
+model.to(device)
+
+# Create model directory
+os.makedirs('model', exist_ok=True)
+
+# Save model
+print("Saving model to model/alergen_model.pt...")
+torch.save({
+    'model_state_dict': model.state_dict(),
+    'target_columns': target_columns,
+}, 'model/alergen_model.pt')
+
+print("Done!")
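To sanity-check the artifact this script writes, the checkpoint can be reloaded and its keys inspected (a sketch; the expected keys follow from the torch.save call above):

    import torch

    ckpt = torch.load('model/alergen_model.pt', map_location='cpu')
    print(sorted(ckpt.keys()))     # ['model_state_dict', 'target_columns']
    print(ckpt['target_columns'])  # ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']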
tokenizer_dir/special_tokens_map.json DELETED
@@ -1,7 +0,0 @@
-{
-  "cls_token": "[CLS]",
-  "mask_token": "[MASK]",
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "unk_token": "[UNK]"
-}
tokenizer_dir/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
tokenizer_dir/tokenizer_config.json DELETED
@@ -1,58 +0,0 @@
-{
-  "added_tokens_decoder": {
-    "0": {
-      "content": "[PAD]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "1": {
-      "content": "[UNK]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "2": {
-      "content": "[CLS]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "3": {
-      "content": "[SEP]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "4": {
-      "content": "[MASK]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "clean_up_tokenization_spaces": true,
-  "cls_token": "[CLS]",
-  "do_basic_tokenize": true,
-  "do_lower_case": true,
-  "extra_special_tokens": {},
-  "mask_token": "[MASK]",
-  "model_max_length": 1000000000000000019884624838656,
-  "never_split": null,
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "strip_accents": null,
-  "tokenize_chinese_chars": true,
-  "tokenizer_class": "BertTokenizer",
-  "unk_token": "[UNK]"
-}
tokenizer_dir/vocab.txt DELETED
The diff for this file is too large to render. See raw diff