Spaces:

ramadn
/

allergen_detector_bert

Running

App Files Files Community

rdsarjito committed on 28 days ago

Commit

314c91a

1 Parent(s): e88e274

4 commit

Browse files

Files changed (3) hide show

app.py +204 -40
model_loader.py +58 -0
requirements.txt +9 -9

app.py CHANGED Viewed

@@ -1,38 +1,84 @@
 import streamlit as st
 import torch
 import torch.nn as nn
-import numpy as np
-import pandas as pd
 import re
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
-# Load tokenizer dan model
-MODEL_PATH = 'model/alergen_model.pt'
-MODEL_NAME = 'indobenchmark/indobert-base-p1'
-TARGET_COLUMNS = ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']
-MAX_LEN = 128
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 class MultilabelBertClassifier(nn.Module):
     def __init__(self, model_name, num_labels):
         super(MultilabelBertClassifier, self).__init__()
         self.bert = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
         self.bert.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
     def forward(self, input_ids, attention_mask):
         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
         return outputs.logits
-model = MultilabelBertClassifier(MODEL_NAME, len(TARGET_COLUMNS))
-model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
-model.to(device)
-model.eval()
-# Fungsi preprocessing
 def clean_text(text):
     text = text.replace('--', ' ')
     text = re.sub(r"http\S+", "", text)
     text = re.sub('\n', ' ', text)
     text = re.sub("[^a-zA-Z0-9\s]", " ", text)
@@ -41,38 +87,156 @@ def clean_text(text):
     text = text.lower()
     return text
-# Fungsi prediksi
-def predict(text):
-    cleaned = clean_text(text)
     encoding = tokenizer.encode_plus(
-        cleaned,
         add_special_tokens=True,
-        max_length=MAX_LEN,
         return_tensors='pt',
-        padding='max_length',
-        truncation=True
     )
     input_ids = encoding['input_ids'].to(device)
     attention_mask = encoding['attention_mask'].to(device)
     with torch.no_grad():
-        logits = model(input_ids=input_ids, attention_mask=attention_mask)
-        probs = torch.sigmoid(logits).cpu().numpy().flatten()
-        results = {TARGET_COLUMNS[i]: float(probs[i]) for i in range(len(TARGET_COLUMNS))}
-        return results
-# STREAMLIT UI
-st.title("🔍 Deteksi Alergen dari Bahan Makanan")
-st.markdown("Masukkan daftar bahan makanan, dan sistem akan memprediksi kemungkinan alergen.")
-user_input = st.text_area("🧾 Bahan makanan (contoh: 2 butir telur, 1 gelas susu, kacang tanah...)")
-if st.button("Prediksi Alergen"):
-    if user_input.strip() == "":
-        st.warning("Silakan masukkan bahan makanan terlebih dahulu.")
-    else:
-        with st.spinner("Memproses..."):
-            predictions = predict(user_input)
-            st.subheader("📊 Hasil Prediksi:")
-            for allergen, score in predictions.items():
-                st.write(f"- **{allergen}**: {'✅ Terdeteksi' if score > 0.5 else '❌ Tidak terdeteksi'} (Probabilitas: {score:.2f})")

 import streamlit as st
 import torch
 import torch.nn as nn
 import re
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import numpy as np
+import os
+# Set page configuration
+st.set_page_config(
+    page_title="Allergen Detector",
+    page_icon="🍽️",
+    layout="wide"
+)
+# Define styling
+st.markdown("""
+<style>
+    .main-header {
+        font-size: 2.5rem;
+        color: #1E88E5;
+        text-align: center;
+    }
+    .sub-header {
+        font-size: 1.5rem;
+        color: #424242;
+        margin-bottom: 1rem;
+    }
+    .result-positive {
+        font-size: 1.2rem;
+        color: #D32F2F;
+        font-weight: bold;
+    }
+    .result-negative {
+        font-size: 1.2rem;
+        color: #388E3C;
+        font-weight: bold;
+    }
+    .footer {
+        text-align: center;
+        color: #616161;
+        margin-top: 2rem;
+    }
+</style>
+""", unsafe_allow_html=True)
+# App title and description
+st.markdown("<h1 class='main-header'>Allergen Detector</h1>", unsafe_allow_html=True)
+st.markdown("<p class='sub-header'>Detect common allergens in your recipe ingredients</p>", unsafe_allow_html=True)
+# Set device
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Target columns (allergen types)
+target_columns = ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']
+allergen_display_names = {
+    'susu': 'Milk (Susu)',
+    'kacang': 'Nuts (Kacang)',
+    'telur': 'Eggs (Telur)',
+    'makanan_laut': 'Seafood (Makanan Laut)',
+    'gandum': 'Wheat (Gandum)'
+}
+# Define model for multilabel classification
 class MultilabelBertClassifier(nn.Module):
+    """Pretrained sequence-classification model whose forward pass returns raw logits.
+
+    Args:
+        model_name: Identifier passed to ``AutoModelForSequenceClassification.from_pretrained``.
+        num_labels: Number of output labels (size of the final linear layer).
+    """
     def __init__(self, model_name, num_labels):
         super(MultilabelBertClassifier, self).__init__()
         self.bert = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
+        # Replace the classification head with our own for multilabel
         self.bert.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
     def forward(self, input_ids, attention_mask):
+        # Return the logits only; callers apply the sigmoid themselves.
         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
         return outputs.logits
+# Clean text function
+@st.cache_data
 def clean_text(text):
+    """Normalize raw ingredient text before tokenization.
+
+    Replaces "--" with a space, removes URLs, turns newlines and every
+    character that is not an ASCII letter, digit or whitespace into a space,
+    and lowercases the result.
+    """
+    # Convert dashes to spaces for better tokenization
     text = text.replace('--', ' ')
+    # Basic cleaning
     text = re.sub(r"http\S+", "", text)
     text = re.sub('\n', ' ', text)
+    # NOTE(review): the pattern below is a non-raw string containing "\s";
+    # a raw string (r"...") would avoid Python's invalid-escape warning.
     text = re.sub("[^a-zA-Z0-9\s]", " ", text)
     text = text.lower()
     return text
+# Function to load model
+@st.cache_resource
+def load_model():
+    """Load the fine-tuned allergen classifier together with its tokenizer.
+
+    The result is cached by Streamlit, so the weights are read only once.
+
+    Returns:
+        tuple: ``(model, tokenizer)`` on success. ``(None, None)`` when the
+        weights file is missing or anything fails while loading; in both
+        cases an error is shown in the UI.
+    """
+    # Tokenizer and encoder must come from the same pretrained checkpoint.
+    base_model_name = 'indobenchmark/indobert-base-p1'
+    model_path = "model/alergen_model.pt"
+    try:
+        if not os.path.exists(model_path):
+            # Without the trained weights the classification head is randomly
+            # initialized, so refuse to serve predictions at all.
+            st.error(f"Model file not found. Please make sure '{model_path}' exists.")
+            return None, None
+        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
+        model = MultilabelBertClassifier(base_model_name, len(target_columns))
+        checkpoint = torch.load(model_path, map_location=device)
+        # Accept both a training checkpoint ({'model_state_dict': ...}) and a
+        # bare state dict saved directly with torch.save(model.state_dict()).
+        if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
+            state_dict = checkpoint['model_state_dict']
+        else:
+            state_dict = checkpoint
+        model.load_state_dict(state_dict)
+        model.to(device)
+        model.eval()
+        return model, tokenizer
+    except Exception as e:
+        # UI boundary: report the failure instead of crashing the app.
+        st.error(f"Error loading model: {str(e)}")
+        return None, None
+# Function to predict allergens
+def predict_allergens(model, tokenizer, ingredients_text, max_length=128):
+    """Predict which allergens are present in an ingredient list.
+
+    Args:
+        model: Classifier called with ``input_ids`` and ``attention_mask``;
+            its output is treated as logits, one per entry of ``target_columns``.
+        tokenizer: Tokenizer providing ``encode_plus``.
+        ingredients_text (str): Raw ingredient text; it is cleaned with
+            ``clean_text`` before tokenization.
+        max_length (int): Length the encoded text is padded/truncated to.
+
+    Returns:
+        dict: ``{}`` when ``model`` or ``tokenizer`` is ``None``; otherwise
+        ``{'binary': {label: bool}, 'probabilities': {label: float}}`` where a
+        label is flagged when its probability exceeds 0.5.
+    """
+    # Explicit None checks: truthiness of a tokenizer/module is not a
+    # reliable "was it loaded" signal.
+    if model is None or tokenizer is None:
+        return {}
+    cleaned_text = clean_text(ingredients_text)
+    encoding = tokenizer.encode_plus(
+        cleaned_text,
+        add_special_tokens=True,
+        max_length=max_length,
+        truncation=True,
+        return_tensors='pt',
+        padding='max_length'
+    )
+    input_ids = encoding['input_ids'].to(device)
+    attention_mask = encoding['attention_mask'].to(device)
+    with torch.no_grad():
+        logits = model(input_ids=input_ids, attention_mask=attention_mask)
+        # Sigmoid scores each label independently; [0] selects the only
+        # sequence in the batch.
+        probabilities = torch.sigmoid(logits).cpu().numpy()[0]
+    result = {
+        'binary': {},
+        'probabilities': {}
+    }
+    for target, probability in zip(target_columns, probabilities):
+        probability = float(probability)
+        result['binary'][target] = probability > 0.5
+        result['probabilities'][target] = probability
+    return result
+# Main app
+def main():
+    """Render the Streamlit page: input area, analysis results and app info.
+
+    Loads the cached model, reads the ingredient text (or the built-in sample
+    recipe), runs ``predict_allergens`` when the button is pressed and shows
+    detected allergens, per-label confidence bars and a summary.
+    """
+    # Load model and tokenizer
+    model, tokenizer = load_model()
+    # Input area
+    st.markdown("### Enter Recipe Ingredients")
+    ingredients = st.text_area(
+        "Paste your recipe ingredients here:",
+        height=200,
+        placeholder="Example: 1 bungkus Lontong homemade, 2 butir Telur ayam, 2 kotak kecil Tahu coklat..."
+    )
+    # Sample recipe option (overrides whatever was typed above)
+    use_sample = st.checkbox("Use sample recipe")
+    if use_sample:
+        sample_recipe = "1 bungkus Lontong homemade 2 butir Telur ayam 2 kotak kecil Tahu coklat 4 butir kecil Kentang 2 buah Tomat merah 1 buah Ketimun lalap 4 lembar Selada keriting 2 lembar Kol putih 2 porsi Saus kacang homemade 4 buah Kerupuk udang goreng Secukupnya emping goreng 2 sdt Bawang goreng Secukupnya Kecap manis (bila suka)"
+        ingredients = sample_recipe
+        st.text_area("Sample recipe:", value=sample_recipe, height=150, disabled=True)
+    # Analyze button
+    analyze_button = st.button("Analyze Ingredients")
+    has_ingredients = bool(ingredients and ingredients.strip())
+    # Results section
+    if analyze_button and not has_ingredients:
+        # Give feedback instead of silently ignoring the click.
+        st.warning("Please enter some ingredients to analyze.")
+    elif analyze_button:
+        with st.spinner("Analyzing ingredients..."):
+            # Make prediction; an empty dict means the model is unavailable
+            results = predict_allergens(model, tokenizer, ingredients)
+            if results:
+                # Display names of every label flagged as present
+                detected = [allergen_display_names[a] for a, p in results['binary'].items() if p]
+                st.markdown("### Analysis Results")
+                # Display results in columns
+                col1, col2 = st.columns(2)
+                with col1:
+                    st.markdown("#### Detected Allergens:")
+                    if detected:
+                        for display_name in detected:
+                            st.markdown(f"<p class='result-positive'>✓ {display_name}</p>", unsafe_allow_html=True)
+                    else:
+                        st.markdown("<p class='result-negative'>No allergens detected</p>", unsafe_allow_html=True)
+                with col2:
+                    st.markdown("#### Confidence Scores:")
+                    for allergen, probability in results['probabilities'].items():
+                        # Create a progress bar for each allergen
+                        st.write(f"{allergen_display_names[allergen]}")
+                        st.progress(probability)
+                        st.write(f"{probability:.2%}")
+                        st.write("")
+                # Display a summary
+                st.markdown("### Summary")
+                if detected:
+                    st.warning(f"This recipe contains the following allergens: {', '.join(detected)}")
+                else:
+                    st.success("This recipe appears to be free from the common allergens we can detect.")
+                st.info("Note: This analysis is based on an AI model and may not be 100% accurate. Always verify allergen information from trusted sources if you have dietary restrictions.")
+    # Information section
+    with st.expander("About This App"):
+        st.write("""
+        This allergen detector uses a fine-tuned IndoBERT model to identify common allergens in recipe ingredients.
+        The model can detect the following allergens:
+        - Milk (Susu)
+        - Nuts (Kacang)
+        - Eggs (Telur)
+        - Seafood (Makanan Laut)
+        - Wheat (Gandum)
+        The accuracy of detection depends on how clearly the ingredients are described. The model has been trained on Indonesian recipe data.
+        """)
+    # Footer
+    st.markdown("<p class='footer'>Developed with ❤️ using Streamlit and PyTorch</p>", unsafe_allow_html=True)
+if __name__ == "__main__":
+    main()

model_loader.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import torch
+import torch.nn as nn
+from transformers import AutoModelForSequenceClassification
+# Define target columns
+target_columns = ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']
+# Define model class - same as in your original code
+class MultilabelBertClassifier(nn.Module):
+    """Pretrained sequence classifier that emits raw logits, one per label.
+
+    Args:
+        model_name: Identifier passed to ``AutoModelForSequenceClassification.from_pretrained``.
+        num_labels: Number of output labels.
+    """
+
+    def __init__(self, model_name, num_labels):
+        super().__init__()
+        backbone = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
+        # Swap in a fresh linear head sized for the multilabel task.
+        backbone.classifier = nn.Linear(backbone.config.hidden_size, num_labels)
+        self.bert = backbone
+
+    def forward(self, input_ids, attention_mask):
+        """Return the logits for the given token ids and attention mask."""
+        return self.bert(input_ids=input_ids, attention_mask=attention_mask).logits
+# Function to load the saved model
+def load_saved_model(model_path, device='cpu'):
+    """
+    Load the saved allergen detection model
+    Args:
+        model_path (str): Path to the saved model file
+        device (str): Device to load the model onto ('cpu' or 'cuda')
+    Returns:
+        model: The loaded model in evaluation mode, or None if loading
+        failed (the error is printed).
+    """
+    try:
+        # Create model instance
+        model = MultilabelBertClassifier('indobenchmark/indobert-base-p1', len(target_columns))
+        # Load saved weights
+        checkpoint = torch.load(model_path, map_location=device)
+        # Accept both a training checkpoint ({'model_state_dict': ...}) and a
+        # bare state dict.
+        if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
+            state_dict = checkpoint['model_state_dict']
+        else:
+            state_dict = checkpoint
+        # Weights saved through DataParallel carry a 'module.' key prefix.
+        # Check every key with startswith (not a substring test on the first
+        # key) so an empty dict or an unusual key order cannot break loading.
+        prefix = 'module.'
+        if any(key.startswith(prefix) for key in state_dict):
+            state_dict = {
+                (key[len(prefix):] if key.startswith(prefix) else key): value
+                for key, value in state_dict.items()
+            }
+        model.load_state_dict(state_dict)
+        # Move model to device and set to evaluation mode
+        model.to(device)
+        model.eval()
+        return model
+    except Exception as e:
+        # Best-effort loader: report the failure and signal it with None.
+        print(f"Error loading model: {str(e)}")
+        return None

requirements.txt CHANGED Viewed

@@ -1,9 +1,9 @@
-streamlit
-pandas
-numpy
-torch
-transformers
-scikit-learn
-tqdm
-matplotlib
-sentencepiece

+streamlit==1.30.0
+torch==2.0.1
+transformers==4.35.2
+numpy==1.24.3
+pandas==2.0.3
+scikit-learn==1.3.0
+regex==2023.8.8
+tqdm==4.66.1
+matplotlib==3.7.2