"""Streamlit app: multilingual RAG-style NLI explorer.

Retrieves the dataset premise most similar to a user-supplied hypothesis
(multilingual sentence embeddings + semantic search), then classifies the
premise/hypothesis pair with an XNLI cross-encoder.
"""

import os
import zipfile

import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline

# Constants
ZIP_FILE = "xnli-multilingual-nli-dataset.zip"
CSV_FILE = "en_test.csv"
EXTRACT_FOLDER = "extracted_data"
SAMPLE_SIZE = 500  # max number of premise rows to index
RANDOM_SEED = 42   # fixed seed so the cached sample is reproducible across runs


@st.cache_data
def extract_and_load() -> pd.DataFrame:
    """Extract the dataset ZIP (once) and return a reproducible sample.

    Returns a DataFrame with columns ['premise', 'hypothesis', 'label'].
    Rows missing any of those columns are dropped; at most SAMPLE_SIZE rows
    are kept (fewer if the CSV is small, instead of raising ValueError).
    """
    if not os.path.exists(EXTRACT_FOLDER):
        with zipfile.ZipFile(ZIP_FILE, "r") as zip_ref:
            zip_ref.extractall(EXTRACT_FOLDER)
    csv_path = os.path.join(EXTRACT_FOLDER, CSV_FILE)
    df = pd.read_csv(csv_path)
    # Only require the columns we actually use -- a blanket dropna() would
    # discard rows with NaN in unrelated columns.
    df = df.dropna(subset=["premise", "hypothesis", "label"])
    n = min(SAMPLE_SIZE, len(df))
    # random_state keeps the corpus stable between cache invalidations.
    return df.sample(n=n, random_state=RANDOM_SEED)[["premise", "hypothesis", "label"]]


@st.cache_resource
def load_models():
    """Load the NLI cross-encoder and the multilingual sentence embedder.

    cache_resource (not cache_data) because models are unserializable and
    should be shared across sessions/reruns.
    """
    nli = pipeline("text-classification", model="joeddav/xlm-roberta-large-xnli")
    emb = SentenceTransformer("sentence-transformers/distiluse-base-multilingual-cased-v2")
    return nli, emb


@st.cache_resource
def get_premise_embeddings(premises: tuple):
    """Embed the premise corpus once and reuse it for every query.

    Takes a tuple (hashable) so Streamlit can key the cache on the corpus.
    Previously this ran inside the query handler, re-encoding all premises
    on every keystroke-triggered rerun.
    """
    _, emb = load_models()
    return emb.encode(list(premises), convert_to_tensor=True)


df = extract_and_load()
nli_model, embedder = load_models()

# UI
st.title("🌐 Multilingual RAG-style NLI Explorer")
st.markdown("Enter a sentence in **any language**, and the app will find a related statement from the dataset and infer their relationship.")

user_input = st.text_input("Enter your **hypothesis** (your own sentence):")

if user_input:
    with st.spinner("Finding most relevant premise..."):
        premise_embeddings = get_premise_embeddings(tuple(df["premise"].tolist()))
        user_embedding = embedder.encode(user_input, convert_to_tensor=True)
        # semantic_search returns positional corpus ids, which line up with
        # df.iloc because we encoded df['premise'] in row order.
        top_hit = util.semantic_search(user_embedding, premise_embeddings, top_k=1)[0][0]
        selected_premise = df.iloc[top_hit["corpus_id"]]["premise"]

    st.subheader("🔍 Most Relevant Premise:")
    st.write(selected_premise)

    # Run NLI classification.
    # BUGFIX: the model is a cross-encoder trained on sentence PAIRS; pass
    # text/text_pair so the tokenizer inserts the separator token, instead of
    # concatenating the two sentences into one string.
    result = nli_model({"text": selected_premise, "text_pair": user_input})
    if isinstance(result, list):  # some transformers versions wrap in a list
        result = result[0]

    st.subheader("🧠 Predicted Relationship:")
    st.write(f"**{result['label']}** (confidence: {result['score']:.2f})")