Spaces: ramadn/allergen_detector_bert

Running

App Files Community

rdsarjito committed 17 days ago

Commit

554b605

1 Parent(s): b1b9a76

7 commit

Browse files

Files changed (7) hide show

app.py +51 -216
model/{alergen_model.pt → alergen_model_full.pt} +2 -2
requirements.txt +4 -5
tokenizer_dir/special_tokens_map.json +7 -0
tokenizer_dir/tokenizer.json +0 -0
tokenizer_dir/tokenizer_config.json +58 -0
tokenizer_dir/vocab.txt +0 -0

app.py CHANGED Viewed

@@ -1,250 +1,85 @@
 import streamlit as st
 import torch
 import torch.nn as nn
 import re
-from transformers import AutoTokenizer
-import os
 import numpy as np
-# Set page config
-st.set_page_config(
-    page_title="Allergen Detection App",
-    page_icon="🍲",
-    layout="wide"
-)
-# Set device
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# Define target columns (allergens)
 target_columns = ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']
-# Clean text function
 def clean_text(text):
-    # Convert dashes to spaces for better tokenization
     text = text.replace('--', ' ')
-    # Basic cleaning
     text = re.sub(r"http\S+", "", text)
     text = re.sub('\n', ' ', text)
     text = re.sub("[^a-zA-Z0-9\s]", " ", text)
     text = re.sub(" {2,}", " ", text)
-    text = text.strip()
-    text = text.lower()
     return text
-# Define model for multilabel classification
 class MultilabelBertClassifier(nn.Module):
     def __init__(self, model_name, num_labels):
         super(MultilabelBertClassifier, self).__init__()
-        # Replace with a simpler initialization for inference only
-        from transformers import AutoConfig, AutoModel
-        self.config = AutoConfig.from_pretrained(model_name)
-        self.bert = AutoModel.from_pretrained(model_name)
-        self.classifier = nn.Linear(self.config.hidden_size, num_labels)
     def forward(self, input_ids, attention_mask):
         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
-        pooled_output = outputs.last_hidden_state[:, 0, :]  # Use [CLS] token
-        return self.classifier(pooled_output)
-# Function to remove 'module.' prefix from state dict keys
-def remove_module_prefix(state_dict):
-    new_state_dict = {}
-    for key, value in state_dict.items():
-        if key.startswith('module.'):
-            new_key = key[7:]  # Remove 'module.' prefix
-        else:
-            new_key = key
-        new_state_dict[new_key] = value
-    return new_state_dict
-# Load model function
-@st.cache_resource
-def load_model():
-    # Load tokenizer
-    tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p2')
-    # Initialize model
-    model = MultilabelBertClassifier('indobenchmark/indobert-base-p1', len(target_columns))
-    # Check if model exists
-    model_path = "model/alergen_model.pt"
-    if os.path.exists(model_path):
-        try:
-            # Load model weights
-            checkpoint = torch.load(model_path, map_location=device)
-            # Check if state_dict is directly in checkpoint or under 'model_state_dict' key
-            if 'model_state_dict' in checkpoint:
-                state_dict = checkpoint['model_state_dict']
-            else:
-                state_dict = checkpoint
-            # Remove 'module.' prefix if it exists
-            state_dict = remove_module_prefix(state_dict)
-            # Load the processed state dict
-            model.load_state_dict(state_dict)
-            model.to(device)
-            model.eval()
-            return model, tokenizer
-        except Exception as e:
-            st.error(f"Error loading model: {str(e)}")
-            return None, tokenizer
-    else:
-        st.error("Model file not found. Please upload the model file.")
-        return None, tokenizer
-# Function to predict allergens
-def predict_allergens(model, tokenizer, ingredients_text, max_length=128):
-    if not model:
-        return {}
-    # Clean the text
-    cleaned_text = clean_text(ingredients_text)
-    # Tokenize
-    encoding = tokenizer.encode_plus(
-        cleaned_text,
         add_special_tokens=True,
         max_length=max_length,
         truncation=True,
         return_tensors='pt',
         padding='max_length'
     )
-    input_ids = encoding['input_ids'].to(device)
-    attention_mask = encoding['attention_mask'].to(device)
     with torch.no_grad():
-        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
-        predictions = torch.sigmoid(outputs)
-        predictions = (predictions > 0.5).float().cpu().numpy()[0]
-    result = {}
-    for i, target in enumerate(target_columns):
-        result[target] = bool(predictions[i])
-    return result
-# UI components
-def main():
-    st.title("🍲 Allergen Detection in Indonesian Recipes")
-    st.write("This app predicts common allergens in your recipe based on ingredients.")
-    # Create directory for model if it doesn't exist
-    os.makedirs("model", exist_ok=True)
-    # Sidebar for model upload
-    with st.sidebar:
-        st.header("Model Settings")
-        uploaded_model = st.file_uploader("Upload model file (alergen_model.pt)", type=["pt"])
-        if uploaded_model:
-            # Save uploaded model
-            with open("model/alergen_model.pt", "wb") as f:
-                f.write(uploaded_model.getbuffer())
-            st.success("Model uploaded successfully!")
-        st.markdown("---")
-        st.write("Allergen Categories:")
-        for allergen in target_columns:
-            if allergen == 'susu':
-                st.write("- Susu (Milk)")
-            elif allergen == 'kacang':
-                st.write("- Kacang (Nuts)")
-            elif allergen == 'telur':
-                st.write("- Telur (Eggs)")
-            elif allergen == 'makanan_laut':
-                st.write("- Makanan Laut (Seafood)")
-            elif allergen == 'gandum':
-                st.write("- Gandum (Wheat/Gluten)")
-    # Load model
-    model, tokenizer = load_model()
-    # Input area
-    st.header("Recipe Ingredients")
-    # Example button
-    if st.button("Load Example"):
-        example_text = "1 bungkus Lontong homemade 2 butir Telur ayam 2 kotak kecil Tahu coklat 4 butir kecil Kentang 2 buah Tomat merah 1 buah Ketimun lalap 4 lembar Selada keriting 2 lembar Kol putih 2 porsi Saus kacang homemade 4 buah Kerupuk udang goreng Secukupnya emping goreng 2 sdt Bawang goreng Secukupnya Kecap manis (bila suka)"
-        st.session_state.ingredients = example_text
-    # Text input
-    ingredients_text = st.text_area(
-        "Enter recipe ingredients (in Indonesian):",
-        height=150,
-        key="ingredients"
-    )
-    # Predict button
-    if st.button("Detect Allergens"):
-        if ingredients_text.strip() == "":
-            st.warning("Please enter ingredients first.")
-        elif model is None:
-            st.error("Please upload the model file first.")
-        else:
-            with st.spinner("Analyzing ingredients..."):
-                # Make prediction
-                allergens = predict_allergens(model, tokenizer, ingredients_text)
-                # Display results
-                st.header("Results")
-                # Create columns for results
-                col1, col2 = st.columns(2)
-                with col1:
-                    st.subheader("Detected Allergens:")
-                    has_allergens = False
-                    for allergen, present in allergens.items():
-                        if present:
-                            has_allergens = True
-                            if allergen == 'susu':
-                                st.warning("🥛 Susu (Milk)")
-                            elif allergen == 'kacang':
-                                st.warning("🥜 Kacang (Nuts)")
-                            elif allergen == 'telur':
-                                st.warning("🥚 Telur (Eggs)")
-                            elif allergen == 'makanan_laut':
-                                st.warning("🦐 Makanan Laut (Seafood)")
-                            elif allergen == 'gandum':
-                                st.warning("🌾 Gandum (Wheat/Gluten)")
-                    if not has_allergens:
-                        st.success("✅ No allergens detected!")
-                with col2:
-                    st.subheader("All Categories:")
-                    for allergen, present in allergens.items():
-                        if allergen == 'susu':
-                            st.write("🥛 Susu (Milk): " + ("Detected ⚠️" if present else "Not detected ✓"))
-                        elif allergen == 'kacang':
-                            st.write("🥜 Kacang (Nuts): " + ("Detected ⚠️" if present else "Not detected ✓"))
-                        elif allergen == 'telur':
-                            st.write("🥚 Telur (Eggs): " + ("Detected ⚠️" if present else "Not detected ✓"))
-                        elif allergen == 'makanan_laut':
-                            st.write("🦐 Makanan Laut (Seafood): " + ("Detected ⚠️" if present else "Not detected ✓"))
-                        elif allergen == 'gandum':
-                            st.write("🌾 Gandum (Wheat/Gluten): " + ("Detected ⚠️" if present else "Not detected ✓"))
-                # Show cleaned text
-                with st.expander("Processed Text"):
-                    st.code(clean_text(ingredients_text))
-    # Instructions and information
-    with st.expander("How to Use"):
-        st.write("""
-        1. First, upload the trained model file (`alergen_model.pt`) using the sidebar uploader
-        2. Enter your recipe ingredients in the text box (in Indonesian)
-        3. Click the "Detect Allergens" button to analyze the recipe
-        4. View the results showing which allergens are present in your recipe
-        The model detects five common allergen categories: milk, nuts, eggs, seafood, and wheat/gluten.
-        """)
-if __name__ == "__main__":
-    main()

+# app.py
 import streamlit as st
 import torch
 import torch.nn as nn
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import re
 import numpy as np
+# Target labels
 target_columns = ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']
+# Clean text
 def clean_text(text):
     text = text.replace('--', ' ')
     text = re.sub(r"http\S+", "", text)
     text = re.sub('\n', ' ', text)
     text = re.sub("[^a-zA-Z0-9\s]", " ", text)
     text = re.sub(" {2,}", " ", text)
+    text = text.strip().lower()
     return text
+# Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained("tokenizer_dir")
+max_length = 128
+# Define model architecture
 class MultilabelBertClassifier(nn.Module):
     def __init__(self, model_name, num_labels):
         super(MultilabelBertClassifier, self).__init__()
+        self.bert = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
+        self.bert.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
     def forward(self, input_ids, attention_mask):
         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+        return outputs.logits
+# Load model
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = torch.load("model/alergen_model_full.pt", map_location=device)
+# Jika model dibungkus DataParallel, kita ambil model asli
+if hasattr(model, "module"):
+    model = model.module
+model.to(device)
+model.eval()
+# Prediction function
+def predict_alergens(text):
+    cleaned = clean_text(text)
+    inputs = tokenizer.encode_plus(
+        cleaned,
         add_special_tokens=True,
         max_length=max_length,
         truncation=True,
         return_tensors='pt',
         padding='max_length'
     )
+    input_ids = inputs['input_ids'].to(device)
+    attention_mask = inputs['attention_mask'].to(device)
     with torch.no_grad():
+        logits = model(input_ids=input_ids, attention_mask=attention_mask)
+        probs = torch.sigmoid(logits)
+        preds = (probs > 0.5).float().cpu().numpy()[0]
+    return {target: bool(preds[i]) for i, target in enumerate(target_columns)}
+# Streamlit UI
+st.title("Deteksi Alergen dari Resep Masakan 🧪🍲")
+recipe_input = st.text_area("Masukkan bahan-bahan resep di sini:", height=200)
+if st.button("Deteksi Alergen"):
+    if recipe_input.strip() == "":
+        st.warning("Silakan masukkan teks resep terlebih dahulu.")
+    else:
+        with st.spinner("Menganalisis..."):
+            result = predict_alergens(recipe_input)
+        st.subheader("Hasil Prediksi Alergen:")
+        for allergen, is_present in result.items():
+            if is_present:
+                st.error(f"⚠️ {allergen}")
+            else:
+                st.success(f"✅ Bebas dari {allergen}")

model/{alergen_model.pt → alergen_model_full.pt} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:28df831b272894c11265ef5f4cf1ac2a2ca89e765b26bff928f34c388ff015d5
-size 497868974

 version https://git-lfs.github.com/spec/v1
+oid sha256:a7b5bbb0945b811482c8bb868a13bd655572de100833a50fd516efc0e52b7c17
+size 497911105

requirements.txt CHANGED Viewed

@@ -1,5 +1,4 @@
-streamlit>=1.25.0
-torch>=2.0.0
-transformers>=4.30.0
-numpy>=1.22.0
-protobuf>=3.20.0

+streamlit==1.30.0
+torch==2.0.1
+transformers==4.36.2
+numpy==1.25.2

tokenizer_dir/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer_dir/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_dir/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

tokenizer_dir/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff