rdsarjito committed
Commit f391e9e · 1 parent: 554b605

Files changed:
- app.py +148 -49
- model/{alergen_model_full.pt → alergen_model.pt} +2 -2
- requirements.txt +4 -2
- save_model.py +51 -0
- tokenizer_dir/special_tokens_map.json +0 -7
- tokenizer_dir/tokenizer.json +0 -0
- tokenizer_dir/tokenizer_config.json +0 -58
- tokenizer_dir/vocab.txt +0 -0
app.py
CHANGED
@@ -1,15 +1,22 @@
-# app.py
 import streamlit as st
 import torch
 import torch.nn as nn
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import re
 import numpy as np
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import requests
+from bs4 import BeautifulSoup
+
+# Set page configuration
+st.set_page_config(page_title="Aplikasi Deteksi Alergen", page_icon="🍲", layout="wide")
 
-# Target
+# Target label
 target_columns = ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']
 
-#
+# Device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Text cleaning
 def clean_text(text):
     text = text.replace('--', ' ')
     text = re.sub(r"http\S+", "", text)
@@ -19,67 +26,159 @@ def clean_text(text):
     text = text.strip().lower()
     return text
 
-#
-tokenizer = AutoTokenizer.from_pretrained("tokenizer_dir")
-max_length = 128
-
-# Define model architecture
+# Multilabel BERT model
 class MultilabelBertClassifier(nn.Module):
     def __init__(self, model_name, num_labels):
         super(MultilabelBertClassifier, self).__init__()
         self.bert = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
         self.bert.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
-
+
     def forward(self, input_ids, attention_mask):
         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
         return outputs.logits
 
 # Load model
+@st.cache_resource
+def load_model():
+    tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p2')
+    model = MultilabelBertClassifier('indobenchmark/indobert-base-p1', len(target_columns))
+    try:
+        state_dict = torch.load('model/alergen_model.pt', map_location=device)
+        if 'model_state_dict' in state_dict:
+            model_state_dict = state_dict['model_state_dict']
+        else:
+            model_state_dict = state_dict
+        new_state_dict = {k[7:] if k.startswith('module.') else k: v for k, v in model_state_dict.items()}
+        model.load_state_dict(new_state_dict, strict=False)
+        st.success("Model berhasil dimuat!")
+    except Exception as e:
+        st.error(f"Error loading model: {str(e)}")
+        st.info("Menggunakan model tanpa pre-trained weights.")
+    model.to(device)
+    model.eval()
+    return tokenizer, model
 
-model.
-def predict_alergens(text):
-    cleaned = clean_text(text)
-    inputs = tokenizer.encode_plus(
-        cleaned,
+def predict_alergens(ingredients_text, tokenizer, model, threshold=0.5, max_length=128):
+    cleaned_text = clean_text(ingredients_text)
+    encoding = tokenizer.encode_plus(
+        cleaned_text,
         add_special_tokens=True,
         max_length=max_length,
         truncation=True,
         return_tensors='pt',
         padding='max_length'
     )
-    input_ids =
-    attention_mask =
-
+    input_ids = encoding['input_ids'].to(device)
+    attention_mask = encoding['attention_mask'].to(device)
+
     with torch.no_grad():
-        probs = torch.sigmoid(
+        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+        probs = torch.sigmoid(outputs).cpu().numpy()[0]  # hasil sigmoid (0-1)
+
+    results = []
+    for i, label in enumerate(target_columns):
+        present = probs[i] > threshold
+        percent = float(probs[i]) * 100
+        results.append({
+            'label': label,
+            'present': present,
+            'probability': percent
+        })
+    return results
+
+# Scrape Cookpad
+def scrape_ingredients_from_url(url):
+    try:
+        headers = {"User-Agent": "Mozilla/5.0"}
+        response = requests.get(url, headers=headers)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        ingredients_div = soup.find('div', id='ingredients')
+        if not ingredients_div:
+            return None
+        items = ingredients_div.find_all(['li', 'span'])
+        ingredients = [item.get_text(strip=True) for item in items if item.get_text(strip=True)]
+        return '\n'.join(ingredients)
+    except Exception as e:
+        st.error(f"Gagal mengambil data dari URL: {e}")
+        return None
+
+# Main App
+def main():
+    st.title("Aplikasi Deteksi Alergen dalam Resep")
+    st.markdown("""
+    Aplikasi ini memprediksi alergen yang terkandung dalam resep makanan berdasarkan bahan-bahan.
+    """)
+
+    with st.spinner("Memuat model..."):
+        tokenizer, model = load_model()
+
+    col1, col2 = st.columns([3, 2])
+
+    with col1:
+        st.subheader("Masukkan URL Resep dari Cookpad")
+        url = st.text_input("Contoh: https://cookpad.com/id/resep/24678703-gulai-telur-tahu-dan-kacang-panjang")
+
+        threshold = st.slider(
+            "Atur Threshold Deteksi Alergen",
+            min_value=0.1,
+            max_value=0.9,
+            value=0.5,
+            step=0.05,
+            help="Semakin rendah threshold, semakin sensitif model terhadap kemungkinan adanya alergen."
+        )
+
+        if st.button("Deteksi Alergen", type="primary"):
+            if url:
+                with st.spinner("Mengambil bahan resep dari URL..."):
+                    ingredients = scrape_ingredients_from_url(url)
+
+                if ingredients:
+                    st.text_area("Daftar Bahan", ingredients, height=200)
+                    with st.spinner("Menganalisis bahan..."):
+                        alergens = predict_alergens(ingredients, tokenizer, model, threshold=threshold)
+
+                    with col2:
+                        st.subheader("Hasil Deteksi")
+                        emoji_map = {
+                            'susu': '🥛',
+                            'kacang': '🥜',
+                            'telur': '🥚',
+                            'makanan_laut': '🦐',
+                            'gandum': '🌾'
+                        }
+
+                        detected = []
+                        for result in alergens:
+                            label = result['label']
+                            name = label.replace('_', ' ').title()
+                            prob = result['probability']
+                            present = result['present']
+                            emoji = emoji_map.get(label, '')
+
+                            if present:
+                                st.error(f"{emoji} {name}: Terdeteksi ⚠️ ({prob:.2f}%)")
+                                detected.append(name)
+                            else:
+                                st.success(f"{emoji} {name}: Tidak Terdeteksi ✓ ({prob:.2f}%)")
+
+                        if detected:
+                            st.warning(f"Resep ini mengandung alergen: {', '.join(detected)}")
+                        else:
+                            st.success("Resep ini tidak mengandung alergen yang terdeteksi.")
+                else:
+                    st.warning("Gagal mengambil bahan dari halaman Cookpad. Pastikan URL valid.")
             else:
-st.
+                st.warning("Silakan masukkan URL resep terlebih dahulu.")
+
+    with st.expander("Tentang Aplikasi"):
+        st.markdown("""
+        Aplikasi ini menggunakan model IndoBERT untuk deteksi 5 jenis alergen dari bahan resep:
+        - Susu 🥛
+        - Kacang 🥜
+        - Telur 🥚
+        - Makanan Laut 🦐
+        - Gandum 🌾
+        """)
+
+if __name__ == "__main__":
+    main()
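The key behavioral change in app.py is that predict_alergens() now applies a user-adjustable threshold to the sigmoid outputs instead of a fixed cutoff. A minimal standalone sketch of that thresholding step (not part of the commit; the logits below are hypothetical) shows why a lower slider value flags more allergens:

# Standalone sketch, not part of the commit: how predict_alergens() turns
# sigmoid outputs into per-allergen flags at different thresholds.
# The logits are hypothetical; a real run would come from the IndoBERT model above.
import torch

target_columns = ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']
logits = torch.tensor([2.1, -1.3, 0.2, -3.0, 0.8])  # hypothetical model outputs
probs = torch.sigmoid(logits)                        # each value in (0, 1)

for threshold in (0.3, 0.5, 0.7):
    flagged = [label for label, p in zip(target_columns, probs) if p.item() > threshold]
    print(f"threshold={threshold}: {flagged}")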
model/{alergen_model_full.pt → alergen_model.pt}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:28df831b272894c11265ef5f4cf1ac2a2ca89e765b26bff928f34c388ff015d5
+size 497868974
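The .pt file is tracked with Git LFS, so a fresh clone contains only the pointer shown above until `git lfs pull` fetches the roughly 498 MB weights. A quick local check (a sketch, not part of the commit; it assumes the repository is checked out) that the real file is present:

# Sketch, not part of the commit: verify model/alergen_model.pt is the actual
# weights file (size taken from the LFS pointer above), not the small text pointer.
import os

path = "model/alergen_model.pt"
size = os.path.getsize(path)
print(f"{path}: {size} bytes")
if size < 10_000:
    print("This looks like a Git LFS pointer; run `git lfs pull` to fetch the weights.")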
requirements.txt
CHANGED
@@ -1,4 +1,6 @@
-streamlit==1.
+streamlit==1.31.0
 torch==2.0.1
 transformers==4.36.2
-numpy==1.25.2
+numpy==1.25.2
+scikit-learn==1.3.0
+tqdm==4.66.1
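Note that the new app.py also imports requests and bs4 (BeautifulSoup) for the Cookpad scraping, but neither package is pinned here; requests may already be pulled in transitively by streamlit or transformers, while beautifulsoup4 usually is not. A small import check (a sketch, not part of the commit) that fails fast on a missing dependency:

# Sketch, not part of the commit: confirm every module the new app.py imports
# is importable in the current environment, including requests and bs4.
import importlib

for module in ("streamlit", "torch", "transformers", "numpy", "requests", "bs4"):
    try:
        importlib.import_module(module)
        print(f"ok: {module}")
    except ImportError as exc:
        print(f"missing: {module} ({exc})")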
save_model.py
ADDED
@@ -0,0 +1,51 @@
+import os
+import torch
+import torch.nn as nn
+from transformers import AutoModelForSequenceClassification
+
+# Define target columns
+target_columns = ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']
+
+# Define model for multilabel classification
+class MultilabelBertClassifier(nn.Module):
+    def __init__(self, model_name, num_labels):
+        super(MultilabelBertClassifier, self).__init__()
+        self.bert = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
+        # Replace the classification head with our own for multilabel
+        self.bert.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
+
+    def forward(self, input_ids, attention_mask):
+        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+        return outputs.logits
+
+# Set device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Using device: {device}")
+
+# Initialize model
+model = MultilabelBertClassifier('indobenchmark/indobert-base-p1', len(target_columns))
+
+# Load the best model for evaluation
+print("Loading model from best_alergen_model.pt...")
+state_dict = torch.load('best_alergen_model.pt', map_location=device)
+
+# If the model was trained with DataParallel, we need to remove the 'module.' prefix
+new_state_dict = {}
+for k, v in state_dict.items():
+    name = k[7:] if k.startswith('module.') else k
+    new_state_dict[name] = v
+
+model.load_state_dict(new_state_dict)
+model.to(device)
+
+# Create model directory
+os.makedirs('model', exist_ok=True)
+
+# Save model
+print("Saving model to model/alergen_model.pt...")
+torch.save({
+    'model_state_dict': model.state_dict(),
+    'target_columns': target_columns,
+}, 'model/alergen_model.pt')
+
+print("Done!")
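save_model.py repackages best_alergen_model.pt into the checkpoint layout that load_model() in app.py expects. A sketch of inspecting the resulting file (assumed usage, not part of the commit):

# Sketch, not part of the commit: inspect the checkpoint written by save_model.py.
# Expected keys follow the torch.save() call above: 'model_state_dict' and 'target_columns'.
import torch

ckpt = torch.load('model/alergen_model.pt', map_location='cpu')
print(list(ckpt.keys()))              # expected: ['model_state_dict', 'target_columns']
print(ckpt['target_columns'])         # expected: ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']
print(len(ckpt['model_state_dict']))  # number of parameter tensors in the state dict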
tokenizer_dir/special_tokens_map.json
DELETED
@@ -1,7 +0,0 @@
-{
-  "cls_token": "[CLS]",
-  "mask_token": "[MASK]",
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "unk_token": "[UNK]"
-}
tokenizer_dir/tokenizer.json
DELETED
The diff for this file is too large to render.
See raw diff
tokenizer_dir/tokenizer_config.json
DELETED
@@ -1,58 +0,0 @@
-{
-  "added_tokens_decoder": {
-    "0": {
-      "content": "[PAD]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "1": {
-      "content": "[UNK]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "2": {
-      "content": "[CLS]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "3": {
-      "content": "[SEP]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "4": {
-      "content": "[MASK]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "clean_up_tokenization_spaces": true,
-  "cls_token": "[CLS]",
-  "do_basic_tokenize": true,
-  "do_lower_case": true,
-  "extra_special_tokens": {},
-  "mask_token": "[MASK]",
-  "model_max_length": 1000000000000000019884624838656,
-  "never_split": null,
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "strip_accents": null,
-  "tokenize_chinese_chars": true,
-  "tokenizer_class": "BertTokenizer",
-  "unk_token": "[UNK]"
-}
tokenizer_dir/vocab.txt
DELETED
The diff for this file is too large to render.
See raw diff