# rdsarjito
# 8 commit
# f391e9e
import streamlit as st
import torch
import torch.nn as nn
import re
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import requests
from bs4 import BeautifulSoup
# Page setup: browser-tab title, icon, and wide layout.
st.set_page_config(
    page_title="Aplikasi Deteksi Alergen",
    page_icon="🍲",
    layout="wide",
)

# Allergen labels; predict_alergens pairs index i of the model output with
# entry i of this list.
target_columns = ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Text cleaning
def clean_text(text):
    """Normalize raw ingredient text before it is tokenized.

    Steps, in order: replace ``--`` with a space, drop ``http...`` URLs,
    turn newlines into spaces, replace every character that is not an ASCII
    letter, a digit, or whitespace with a space, collapse runs of spaces,
    then strip and lowercase. Tabs and other non-newline whitespace inside
    the text are left as they are.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, lowercased string.
    """
    text = text.replace('--', ' ')
    text = re.sub(r"http\S+", "", text)
    text = text.replace('\n', ' ')
    # Raw string: "\s" inside a plain literal is an invalid escape sequence
    # and triggers a SyntaxWarning on recent Python versions.
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    text = re.sub(r" {2,}", " ", text)
    return text.strip().lower()
# Multilabel BERT model
class MultilabelBertClassifier(nn.Module):
    """Sequence-classification transformer with a freshly created linear head.

    The wrapped model is stored as ``self.bert``; ``forward`` returns its raw
    logits, with no activation applied here.
    """

    def __init__(self, model_name, num_labels):
        """Load ``model_name`` and attach a ``num_labels``-way linear classifier.

        Args:
            model_name: Identifier passed to
                ``AutoModelForSequenceClassification.from_pretrained``.
            num_labels: Number of output logits.
        """
        super().__init__()
        backbone = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=num_labels
        )
        # Swap the head built by from_pretrained for a new nn.Linear layer.
        backbone.classifier = nn.Linear(backbone.config.hidden_size, num_labels)
        self.bert = backbone

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return its ``logits`` attribute."""
        return self.bert(input_ids=input_ids, attention_mask=attention_mask).logits
# Load model
@st.cache_resource
def load_model():
    """Build the tokenizer and classifier, loading fine-tuned weights if possible.

    Fine-tuned weights are read from ``model/alergen_model.pt``. If anything
    in that step raises, the error is shown in the UI and the model is
    returned as constructed, without the checkpoint applied.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been moved to
        ``device`` and switched to eval mode.
    """
    # NOTE(review): the tokenizer is loaded from "-p2" while the model uses
    # "-p1"; confirm which one the fine-tuned checkpoint was trained with.
    tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p2')
    model = MultilabelBertClassifier('indobenchmark/indobert-base-p1', len(target_columns))
    try:
        # NOTE(security): torch.load unpickles the file; only load
        # checkpoints that come from a trusted source.
        checkpoint = torch.load('model/alergen_model.pt', map_location=device)
        # Accept either a bare state dict or one nested under 'model_state_dict'.
        if 'model_state_dict' in checkpoint:
            weights = checkpoint['model_state_dict']
        else:
            weights = checkpoint
        # Drop a leading 'module.' from key names (presumably left by a
        # DataParallel-style wrapper at training time).
        prefix = 'module.'
        weights = {
            (key[len(prefix):] if key.startswith(prefix) else key): value
            for key, value in weights.items()
        }
        load_result = model.load_state_dict(weights, strict=False)
        st.success("Model berhasil dimuat!")
        # strict=False skips mismatched keys silently; report them so a
        # checkpoint that does not fit the model is not mistaken for a clean load.
        if load_result.missing_keys or load_result.unexpected_keys:
            st.warning(
                "Bobot model tidak sepenuhnya cocok dengan arsitektur: "
                f"{len(load_result.missing_keys)} kunci hilang, "
                f"{len(load_result.unexpected_keys)} kunci tidak dikenal."
            )
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        st.info("Menggunakan model tanpa pre-trained weights.")
    model.to(device)
    model.eval()
    return tokenizer, model
def predict_alergens(ingredients_text, tokenizer, model, threshold=0.5, max_length=128):
    """Score ingredient text against every allergen label.

    Args:
        ingredients_text: Raw ingredient text; it is passed through
            ``clean_text`` before tokenization.
        tokenizer: Tokenizer callable whose result provides ``input_ids``
            and ``attention_mask`` tensors.
        model: Callable returning logits for the batch, one per entry of
            ``target_columns``.
        threshold: A label counts as present when its probability is
            strictly greater than this value.
        max_length: Token length the input is truncated and padded to.

    Returns:
        A list with one dict per label, in ``target_columns`` order, holding
        ``label`` (str), ``present`` (bool) and ``probability`` (float, in
        percent).
    """
    cleaned_text = clean_text(ingredients_text)
    # Call the tokenizer directly; encode_plus is deprecated in favour of __call__.
    encoding = tokenizer(
        cleaned_text,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
    # Sigmoid maps each logit to 0-1 independently; [0] picks the single sample.
    probs = torch.sigmoid(logits).cpu().numpy()[0]
    return [
        {
            'label': label,
            # bool()/float() convert numpy scalars to plain Python values.
            'present': bool(probs[i] > threshold),
            'probability': float(probs[i]) * 100,
        }
        for i, label in enumerate(target_columns)
    ]
# Scrape Cookpad
def scrape_ingredients_from_url(url, timeout=10):
    """Download a recipe page and extract its ingredient list as text.

    Args:
        url: Page URL to fetch.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The non-empty texts of the ``li``/``span`` elements inside
        ``<div id="ingredients">``, joined by newlines (an empty string if
        there are none). ``None`` when that div is absent or when the
        request or parsing raises; in the latter case the error is also
        shown in the UI.
    """
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        # NOTE(security): url is user-supplied and fetched from this process;
        # consider restricting it to the expected host.
        # A timeout keeps a stalled server from blocking the app indefinitely.
        response = requests.get(url, headers=headers, timeout=timeout)
        # Fail on HTTP error statuses instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        ingredients_div = soup.find('div', id='ingredients')
        if ingredients_div is None:
            return None
        # NOTE(review): both 'li' and 'span' are collected, so a span nested
        # in an li may contribute its text twice - confirm this is intended.
        texts = (
            item.get_text(strip=True)
            for item in ingredients_div.find_all(['li', 'span'])
        )
        return '\n'.join(text for text in texts if text)
    except Exception as e:
        st.error(f"Gagal mengambil data dari URL: {e}")
        return None
# Main App
# Emoji shown next to each allergen label in the results column.
_ALLERGEN_EMOJI = {
    'susu': '🥛',
    'kacang': '🥜',
    'telur': '🥚',
    'makanan_laut': '🦐',
    'gandum': '🌾',
}


def _render_results(alergens):
    """Write one status line per allergen, then an overall summary."""
    st.subheader("Hasil Deteksi")
    detected = []
    for result in alergens:
        label = result['label']
        name = label.replace('_', ' ').title()
        prob = result['probability']
        emoji = _ALLERGEN_EMOJI.get(label, '')
        if result['present']:
            st.error(f"{emoji} {name}: Terdeteksi ⚠️ ({prob:.2f}%)")
            detected.append(name)
        else:
            st.success(f"{emoji} {name}: Tidak Terdeteksi ✓ ({prob:.2f}%)")
    if detected:
        st.warning(f"Resep ini mengandung alergen: {', '.join(detected)}")
    else:
        st.success("Resep ini tidak mengandung alergen yang terdeteksi.")


def main():
    """Render the page: URL input, threshold slider, and detection results.

    When the button is pressed with a URL, the ingredients are scraped,
    shown in a text area in the left column, scored by the model, and the
    per-allergen results are written to the right column.
    """
    st.title("Aplikasi Deteksi Alergen dalam Resep")
    st.markdown("""
    Aplikasi ini memprediksi alergen yang terkandung dalam resep makanan berdasarkan bahan-bahan.
    """)
    with st.spinner("Memuat model..."):
        tokenizer, model = load_model()
    col1, col2 = st.columns([3, 2])
    with col1:
        st.subheader("Masukkan URL Resep dari Cookpad")
        url = st.text_input("Contoh: https://cookpad.com/id/resep/24678703-gulai-telur-tahu-dan-kacang-panjang")
        threshold = st.slider(
            "Atur Threshold Deteksi Alergen",
            min_value=0.1,
            max_value=0.9,
            value=0.5,
            step=0.05,
            help="Semakin rendah threshold, semakin sensitif model terhadap kemungkinan adanya alergen."
        )
        if st.button("Deteksi Alergen", type="primary"):
            if url:
                with st.spinner("Mengambil bahan resep dari URL..."):
                    ingredients = scrape_ingredients_from_url(url)
                if ingredients:
                    st.text_area("Daftar Bahan", ingredients, height=200)
                    with st.spinner("Menganalisis bahan..."):
                        alergens = predict_alergens(ingredients, tokenizer, model, threshold=threshold)
                    # Results go to the right-hand column.
                    with col2:
                        _render_results(alergens)
                else:
                    st.warning("Gagal mengambil bahan dari halaman Cookpad. Pastikan URL valid.")
            else:
                st.warning("Silakan masukkan URL resep terlebih dahulu.")
    # NOTE(review): the file's indentation was lost; the expander is placed
    # at page level here - confirm it was not meant to sit inside col1.
    with st.expander("Tentang Aplikasi"):
        st.markdown("""
        Aplikasi ini menggunakan model IndoBERT untuk deteksi 5 jenis alergen dari bahan resep:
        - Susu 🥛
        - Kacang 🥜
        - Telur 🥚
        - Makanan Laut 🦐
        - Gandum 🌾
        """)


if __name__ == "__main__":
    main()