Spaces:

dejanseo
/

bulgarian-search-query-intent-classifier

Running

App Files Files Community

dejanseo committed on Mar 18

Commit

d30503a

verified ·

1 Parent(s): c5ee7d7

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -152

app.py CHANGED Viewed

@@ -1,158 +1,79 @@
-import os
-import json
-import torch
 import streamlit as st
-import pandas as pd
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
-# ==============================
-# ⚙️ CONFIGURABLE PARAMETERS
-# ==============================
-MODEL_PATH = "dejanseo/bulgarian-search-query-intent-alpha"  # HF model repository
-LABEL_MAP_PATH = "label_map.json"  # Ensure this file is in the same directory as app.py
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# ==============================
-# 📌 Load Model and Tokenizer
-# ==============================
 @st.cache_resource
-def load_inference_resources():
-    # Load the label mapping from local file
-    with open(LABEL_MAP_PATH, "r") as f:
-        label_map = json.load(f)
-    # Convert ID keys from string to int for id_to_label mapping
-    id_to_label = {int(k): v for k, v in label_map["id_to_label"].items()}
-    # Load the tokenizer and model from Hugging Face
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
-    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
-    model.to(DEVICE)
-    model.eval()  # Set model to evaluation mode
-    return model, tokenizer, label_map["label_to_id"], id_to_label
-# ==============================
-# 📌 Inference Function
-# ==============================
-def predict_intent(query, model, tokenizer, id_to_label):
-    """
-    Predict the intent of a Bulgarian search query.
-    """
-    # Tokenize input text
-    inputs = tokenizer(
-        query,
-        padding="max_length",
-        truncation=True,
-        max_length=128,
-        return_tensors="pt"
-    )
-    # Move inputs to the same device as the model
-    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
-    # Inference without gradient tracking
-    with torch.no_grad():
-        outputs = model(**inputs)
-    # Compute probabilities with softmax
-    probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)[0]
-    # Identify the predicted class and confidence
-    predicted_class_id = torch.argmax(probabilities).item()
-    predicted_intent = id_to_label[predicted_class_id]
-    confidence = probabilities[predicted_class_id].item()
-    # Build a dictionary with all intent scores
-    all_intents = {id_to_label[i]: prob.item() for i, prob in enumerate(probabilities)}
-    sorted_intents = sorted(all_intents.items(), key=lambda x: x[1], reverse=True)
-    return {
-        "query": query,
-        "predicted_intent": predicted_intent,
-        "confidence": confidence,
-        "all_scores": sorted_intents
-    }
-# ==============================
-# 🌟 Streamlit UI for Inference
-# ==============================
-def inference_ui():
-    st.title("🔍 Bulgarian Search Intent Classification")
-    try:
-        # Load resources
-        model, tokenizer, label_to_id, id_to_label = load_inference_resources()
-        st.success(f"✅ Model loaded successfully! Found {len(id_to_label)} intent classes.")
-        # Show available intents
-        with st.expander("Available Intent Classes"):
-            st.write(", ".join(id_to_label.values()))
-        # Single query inference
-        query = st.text_input("Enter a Bulgarian search query:", "Как да направя резервация за ресторант?")
-        if st.button("Predict Intent"):
-            with st.spinner("Analyzing query..."):
-                prediction = predict_intent(query, model, tokenizer, id_to_label)
-            st.subheader("Prediction Results")
-            st.metric(
-                label="Predicted Intent",
-                value=prediction["predicted_intent"],
-                delta=f"{prediction['confidence']*100:.2f}% confidence"
-            )
-            st.subheader("Intent Probabilities")
-            df_probs = pd.DataFrame(prediction["all_scores"], columns=["Intent", "Probability"])
-            df_top5 = df_probs.head(5)
-            st.bar_chart(df_top5.set_index("Intent"))
-            with st.expander("View All Intent Probabilities"):
-                st.dataframe(df_probs)
-        # Batch inference section
-        st.subheader("Batch Inference")
-        uploaded_file = st.file_uploader("Upload a CSV/Excel file with queries", type=["csv", "xlsx", "parquet"])
-        if uploaded_file is not None:
-            if uploaded_file.name.endswith(".csv"):
-                df = pd.read_csv(uploaded_file)
-            elif uploaded_file.name.endswith(".xlsx"):
-                df = pd.read_excel(uploaded_file)
-            elif uploaded_file.name.endswith(".parquet"):
-                df = pd.read_parquet(uploaded_file)
-            query_column = "query" if "query" in df.columns else st.selectbox("Select the column containing queries:", df.columns)
-            if query_column and st.button("Run Batch Inference"):
-                progress_bar = st.progress(0)
-                results = []
-                for i, row in enumerate(df[query_column]):
-                    progress_bar.progress((i + 1) / len(df))
-                    prediction = predict_intent(row, model, tokenizer, id_to_label)
-                    results.append({
-                        "query": row,
-                        "predicted_intent": prediction["predicted_intent"],
-                        "confidence": prediction["confidence"]
-                    })
-                results_df = pd.DataFrame(results)
-                st.subheader("Batch Inference Results")
-                st.dataframe(results_df)
-                csv = results_df.to_csv(index=False)
-                st.download_button(
-                    label="Download Results as CSV",
-                    data=csv,
-                    file_name="batch_inference_results.csv",
-                    mime="text/csv"
-                )
-    except Exception as e:
-        st.error(f"❌ Error loading model: {str(e)}")
-        st.error("Please ensure the model and label map files are available.")
-if __name__ == "__main__":
-    inference_ui()

 import streamlit as st
+import torch
+import torch.nn.functional as F
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+# Load model and tokenizer from Hugging Face Hub
 @st.cache_resource
+def load_model_and_tokenizer():
+    model_name = "dejanseo/bulgarian-search-query-intent"
+    model = AutoModelForSequenceClassification.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    return model, tokenizer
+# Load resources
+model, tokenizer = load_model_and_tokenizer()
+st.title("Класификация на намерения за търсене (Български)")
+st.write(
+    "Въведете една или повече заявки (всеки на нов ред) или качете `.txt` файл, в който "
+    "всяка заявка е на отделен ред без допълнителни параметри."
+)
+# Display author info
+st.markdown(
+    "### Моделът е създаден от [DEJAN AI](https://dejan.ai)"
+)
+# Text area for entering queries
+queries_input = st.text_area("Въведете вашите заявки (по една на ред):")
+# Upload of a `.txt` file
+uploaded_file = st.file_uploader(
+    "Качете `.txt` файл с заявки (всеки ред съдържа една заявка)", type=["txt"]
+)
+# Collect the queries from the text area and/or the uploaded file
+queries = []
+if queries_input.strip():
+    queries.extend([line.strip() for line in queries_input.splitlines() if line.strip()])
+if uploaded_file is not None:
+    file_content = uploaded_file.read().decode("utf-8")
+    queries.extend([line.strip() for line in file_content.splitlines() if line.strip()])
+if st.button("Класифицирай"):
+    if queries:
+        # Tokenize in batch
+        inputs = tokenizer(
+            queries,
+            return_tensors="pt",
+            truncation=True,
+            padding=True,
+            max_length=256
+        )
+        # Run inference
+        with torch.no_grad():
+            outputs = model(**inputs)
+        logits = outputs.logits
+        predictions = logits.argmax(dim=-1).tolist()
+        probabilities = F.softmax(logits, dim=-1)
+        confidence_scores = probabilities.max(dim=-1).values.tolist()
+        # Use the label mapping available on the model config
+        id2label = model.config.id2label
+        results = []
+        for query, pred, conf in zip(queries, predictions, confidence_scores):
+            predicted_label = id2label.get(str(pred), id2label.get(pred, "Неизвестно"))
+            results.append({
+                "Заявка": query,
+                "Предсказано намерение": predicted_label,
+                "Доверие": f"{conf:.2f}"
+            })
+        st.write("### Резултати:")
+        st.table(results)
+    else:
+        st.warning("Моля, въведете поне една заявка, преди да класифицирате.")