rdsarjito committed
Commit f391e9e · 1 parent: 554b605

Files changed:
- app.py +148 -49
- model/{alergen_model_full.pt → alergen_model.pt} +2 -2
- requirements.txt +4 -2
- save_model.py +51 -0
- tokenizer_dir/special_tokens_map.json +0 -7
- tokenizer_dir/tokenizer.json +0 -0
- tokenizer_dir/tokenizer_config.json +0 -58
- tokenizer_dir/vocab.txt +0 -0
app.py
CHANGED
@@ -1,15 +1,22 @@
-# app.py
 import streamlit as st
 import torch
 import torch.nn as nn
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import re
 import numpy as np
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import requests
+from bs4 import BeautifulSoup
+
+# Set page configuration
+st.set_page_config(page_title="Aplikasi Deteksi Alergen", page_icon="🍲", layout="wide")
 
-# Target
+# Target label
 target_columns = ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']
 
-#
+# Device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Text cleaning
 def clean_text(text):
     text = text.replace('--', ' ')
     text = re.sub(r"http\S+", "", text)
@@ -19,67 +26,159 @@ def clean_text(text):
     text = text.strip().lower()
     return text
 
-#
-tokenizer = AutoTokenizer.from_pretrained("tokenizer_dir")
-max_length = 128
-
-# Define model architecture
+# Multilabel BERT model
 class MultilabelBertClassifier(nn.Module):
     def __init__(self, model_name, num_labels):
         super(MultilabelBertClassifier, self).__init__()
         self.bert = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
         self.bert.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
-
+
     def forward(self, input_ids, attention_mask):
         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
         return outputs.logits
 
 # Load model
+@st.cache_resource
+def load_model():
+    tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p2')
+    model = MultilabelBertClassifier('indobenchmark/indobert-base-p1', len(target_columns))
+    try:
+        state_dict = torch.load('model/alergen_model.pt', map_location=device)
+        if 'model_state_dict' in state_dict:
+            model_state_dict = state_dict['model_state_dict']
+        else:
+            model_state_dict = state_dict
+        new_state_dict = {k[7:] if k.startswith('module.') else k: v for k, v in model_state_dict.items()}
+        model.load_state_dict(new_state_dict, strict=False)
+        st.success("Model berhasil dimuat!")
+    except Exception as e:
+        st.error(f"Error loading model: {str(e)}")
+        st.info("Menggunakan model tanpa pre-trained weights.")
+    model.to(device)
+    model.eval()
+    return tokenizer, model
 
-model.
-def predict_alergens(text):
-    cleaned = clean_text(text)
-    inputs = tokenizer.encode_plus(
-        cleaned,
+def predict_alergens(ingredients_text, tokenizer, model, threshold=0.5, max_length=128):
+    cleaned_text = clean_text(ingredients_text)
+    encoding = tokenizer.encode_plus(
+        cleaned_text,
         add_special_tokens=True,
         max_length=max_length,
         truncation=True,
         return_tensors='pt',
         padding='max_length'
     )
-    input_ids =
-    attention_mask =
-
+    input_ids = encoding['input_ids'].to(device)
+    attention_mask = encoding['attention_mask'].to(device)
+
     with torch.no_grad():
-        probs = torch.sigmoid(
+        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+        probs = torch.sigmoid(outputs).cpu().numpy()[0]  # hasil sigmoid (0-1)
+
+    results = []
+    for i, label in enumerate(target_columns):
+        present = probs[i] > threshold
+        percent = float(probs[i]) * 100
+        results.append({
+            'label': label,
+            'present': present,
+            'probability': percent
+        })
+    return results
+
+# Scrape Cookpad
+def scrape_ingredients_from_url(url):
+    try:
+        headers = {"User-Agent": "Mozilla/5.0"}
+        response = requests.get(url, headers=headers)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        ingredients_div = soup.find('div', id='ingredients')
+        if not ingredients_div:
+            return None
+        items = ingredients_div.find_all(['li', 'span'])
+        ingredients = [item.get_text(strip=True) for item in items if item.get_text(strip=True)]
+        return '\n'.join(ingredients)
+    except Exception as e:
+        st.error(f"Gagal mengambil data dari URL: {e}")
+        return None
+
+# Main App
+def main():
+    st.title("Aplikasi Deteksi Alergen dalam Resep")
+    st.markdown("""
+    Aplikasi ini memprediksi alergen yang terkandung dalam resep makanan berdasarkan bahan-bahan.
+    """)
+
+    with st.spinner("Memuat model..."):
+        tokenizer, model = load_model()
+
+    col1, col2 = st.columns([3, 2])
+
+    with col1:
+        st.subheader("Masukkan URL Resep dari Cookpad")
+        url = st.text_input("Contoh: https://cookpad.com/id/resep/24678703-gulai-telur-tahu-dan-kacang-panjang")
+
+        threshold = st.slider(
+            "Atur Threshold Deteksi Alergen",
+            min_value=0.1,
+            max_value=0.9,
+            value=0.5,
+            step=0.05,
+            help="Semakin rendah threshold, semakin sensitif model terhadap kemungkinan adanya alergen."
+        )
+
+        if st.button("Deteksi Alergen", type="primary"):
+            if url:
+                with st.spinner("Mengambil bahan resep dari URL..."):
+                    ingredients = scrape_ingredients_from_url(url)
+
+                if ingredients:
+                    st.text_area("Daftar Bahan", ingredients, height=200)
+                    with st.spinner("Menganalisis bahan..."):
+                        alergens = predict_alergens(ingredients, tokenizer, model, threshold=threshold)
+
+                    with col2:
+                        st.subheader("Hasil Deteksi")
+                        emoji_map = {
+                            'susu': '🥛',
+                            'kacang': '🥜',
+                            'telur': '🥚',
+                            'makanan_laut': '🦐',
+                            'gandum': '🌾'
+                        }
+
+                        detected = []
+                        for result in alergens:
+                            label = result['label']
+                            name = label.replace('_', ' ').title()
+                            prob = result['probability']
+                            present = result['present']
+                            emoji = emoji_map.get(label, '')
+
+                            if present:
+                                st.error(f"{emoji} {name}: Terdeteksi ⚠️ ({prob:.2f}%)")
+                                detected.append(name)
+                            else:
+                                st.success(f"{emoji} {name}: Tidak Terdeteksi ✓ ({prob:.2f}%)")
+
+                        if detected:
+                            st.warning(f"Resep ini mengandung alergen: {', '.join(detected)}")
+                        else:
+                            st.success("Resep ini tidak mengandung alergen yang terdeteksi.")
+                else:
+                    st.warning("Gagal mengambil bahan dari halaman Cookpad. Pastikan URL valid.")
             else:
-st.
+                st.warning("Silakan masukkan URL resep terlebih dahulu.")
+
+    with st.expander("Tentang Aplikasi"):
+        st.markdown("""
+        Aplikasi ini menggunakan model IndoBERT untuk deteksi 5 jenis alergen dari bahan resep:
+        - Susu 🥛
+        - Kacang 🥜
+        - Telur 🥚
+        - Makanan Laut 🦐
+        - Gandum 🌾
+        """)
+
+if __name__ == "__main__":
+    main()
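The key behavioral change in app.py is that predict_alergens() now applies a user-adjustable threshold to the sigmoid outputs instead of a fixed cutoff. A minimal standalone sketch of that thresholding step (not part of the commit; the logits below are hypothetical) shows why a lower slider value flags more allergens:

# Standalone sketch, not part of the commit: how predict_alergens() turns
# sigmoid outputs into per-allergen flags at different thresholds.
# The logits are hypothetical; a real run would come from the IndoBERT model above.
import torch

target_columns = ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']
logits = torch.tensor([2.1, -1.3, 0.2, -3.0, 0.8])  # hypothetical model outputs
probs = torch.sigmoid(logits)                        # each value in (0, 1)

for threshold in (0.3, 0.5, 0.7):
    flagged = [label for label, p in zip(target_columns, probs) if p.item() > threshold]
    print(f"threshold={threshold}: {flagged}")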
model/{alergen_model_full.pt → alergen_model.pt}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:28df831b272894c11265ef5f4cf1ac2a2ca89e765b26bff928f34c388ff015d5
+size 497868974
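The .pt file is tracked with Git LFS, so a fresh clone contains only the pointer shown above until `git lfs pull` fetches the roughly 498 MB weights. A quick local check (a sketch, not part of the commit; it assumes the repository is checked out) that the real file is present:

# Sketch, not part of the commit: verify model/alergen_model.pt is the actual
# weights file (size taken from the LFS pointer above), not the small text pointer.
import os

path = "model/alergen_model.pt"
size = os.path.getsize(path)
print(f"{path}: {size} bytes")
if size < 10_000:
    print("This looks like a Git LFS pointer; run `git lfs pull` to fetch the weights.")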
requirements.txt
CHANGED
@@ -1,4 +1,6 @@
-streamlit==1.
+streamlit==1.31.0
 torch==2.0.1
 transformers==4.36.2
-numpy==1.25.2
+numpy==1.25.2
+scikit-learn==1.3.0
+tqdm==4.66.1
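Note that the new app.py also imports requests and bs4 (BeautifulSoup) for the Cookpad scraping, but neither package is pinned here; requests may already be pulled in transitively by streamlit or transformers, while beautifulsoup4 usually is not. A small import check (a sketch, not part of the commit) that fails fast on a missing dependency:

# Sketch, not part of the commit: confirm every module the new app.py imports
# is importable in the current environment, including requests and bs4.
import importlib

for module in ("streamlit", "torch", "transformers", "numpy", "requests", "bs4"):
    try:
        importlib.import_module(module)
        print(f"ok: {module}")
    except ImportError as exc:
        print(f"missing: {module} ({exc})")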
save_model.py
ADDED
@@ -0,0 +1,51 @@
+import os
+import torch
+import torch.nn as nn
+from transformers import AutoModelForSequenceClassification
+
+# Define target columns
+target_columns = ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']
+
+# Define model for multilabel classification
+class MultilabelBertClassifier(nn.Module):
+    def __init__(self, model_name, num_labels):
+        super(MultilabelBertClassifier, self).__init__()
+        self.bert = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
+        # Replace the classification head with our own for multilabel
+        self.bert.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
+
+    def forward(self, input_ids, attention_mask):
+        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+        return outputs.logits
+
+# Set device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Using device: {device}")
+
+# Initialize model
+model = MultilabelBertClassifier('indobenchmark/indobert-base-p1', len(target_columns))
+
+# Load the best model for evaluation
+print("Loading model from best_alergen_model.pt...")
+state_dict = torch.load('best_alergen_model.pt', map_location=device)
+
+# If the model was trained with DataParallel, we need to remove the 'module.' prefix
+new_state_dict = {}
+for k, v in state_dict.items():
+    name = k[7:] if k.startswith('module.') else k
+    new_state_dict[name] = v
+
+model.load_state_dict(new_state_dict)
+model.to(device)
+
+# Create model directory
+os.makedirs('model', exist_ok=True)
+
+# Save model
+print("Saving model to model/alergen_model.pt...")
+torch.save({
+    'model_state_dict': model.state_dict(),
+    'target_columns': target_columns,
+}, 'model/alergen_model.pt')
+
+print("Done!")
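save_model.py repackages best_alergen_model.pt into the checkpoint layout that load_model() in app.py expects. A sketch of inspecting the resulting file (assumed usage, not part of the commit):

# Sketch, not part of the commit: inspect the checkpoint written by save_model.py.
# Expected keys follow the torch.save() call above: 'model_state_dict' and 'target_columns'.
import torch

ckpt = torch.load('model/alergen_model.pt', map_location='cpu')
print(list(ckpt.keys()))              # expected: ['model_state_dict', 'target_columns']
print(ckpt['target_columns'])         # expected: ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']
print(len(ckpt['model_state_dict']))  # number of parameter tensors in the state dict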
tokenizer_dir/special_tokens_map.json
DELETED
@@ -1,7 +0,0 @@
-{
-  "cls_token": "[CLS]",
-  "mask_token": "[MASK]",
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "unk_token": "[UNK]"
-}
tokenizer_dir/tokenizer.json
DELETED
The diff for this file is too large to render.
See raw diff
tokenizer_dir/tokenizer_config.json
DELETED
@@ -1,58 +0,0 @@
-{
-  "added_tokens_decoder": {
-    "0": {
-      "content": "[PAD]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "1": {
-      "content": "[UNK]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "2": {
-      "content": "[CLS]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "3": {
-      "content": "[SEP]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "4": {
-      "content": "[MASK]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "clean_up_tokenization_spaces": true,
-  "cls_token": "[CLS]",
-  "do_basic_tokenize": true,
-  "do_lower_case": true,
-  "extra_special_tokens": {},
-  "mask_token": "[MASK]",
-  "model_max_length": 1000000000000000019884624838656,
-  "never_split": null,
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "strip_accents": null,
-  "tokenize_chinese_chars": true,
-  "tokenizer_class": "BertTokenizer",
-  "unk_token": "[UNK]"
-}
tokenizer_dir/vocab.txt
DELETED
The diff for this file is too large to render.
See raw diff