|
import streamlit as st |
|
import pandas as pd |
|
import zipfile |
|
import os |
|
from sentence_transformers import SentenceTransformer, util |
|
from transformers import pipeline |
|
|
|
|
|
ZIP_FILE = "xnli-multilingual-nli-dataset.zip" |
|
CSV_FILE = "en_test.csv" |
|
EXTRACT_FOLDER = "extracted_data" |
|
|
|
|
|
@st.cache_data |
|
def extract_and_load(): |
|
if not os.path.exists(EXTRACT_FOLDER): |
|
with zipfile.ZipFile(ZIP_FILE, "r") as zip_ref: |
|
zip_ref.extractall(EXTRACT_FOLDER) |
|
csv_path = os.path.join(EXTRACT_FOLDER, CSV_FILE) |
|
df = pd.read_csv(csv_path).dropna().sample(500) |
|
return df[['premise', 'hypothesis', 'label']] |
|
|
|
df = extract_and_load() |
|
|
|
|
|
nli_model = pipeline("text-classification", model="joeddav/xlm-roberta-large-xnli") |
|
embedder = SentenceTransformer("sentence-transformers/distiluse-base-multilingual-cased-v2") |
|
|
|
|
|
st.title("π Multilingual RAG-style NLI Explorer") |
|
st.markdown("Enter a sentence in **any language**, and the app will find a related statement from the dataset and infer their relationship.") |
|
|
|
user_input = st.text_input("Enter your **hypothesis** (your own sentence):") |
|
|
|
if user_input: |
|
with st.spinner("Finding most relevant premise..."): |
|
premise_embeddings = embedder.encode(df['premise'].tolist(), convert_to_tensor=True) |
|
user_embedding = embedder.encode(user_input, convert_to_tensor=True) |
|
|
|
top_hit = util.semantic_search(user_embedding, premise_embeddings, top_k=1)[0][0] |
|
match_idx = top_hit['corpus_id'] |
|
selected_premise = df.iloc[match_idx]['premise'] |
|
|
|
st.subheader("π Most Relevant Premise:") |
|
st.write(selected_premise) |
|
|
|
|
|
full_input = f"{selected_premise} </s> {user_input}" |
|
result = nli_model(full_input)[0] |
|
|
|
st.subheader("π§ Predicted Relationship:") |
|
st.write(f"**{result['label']}** (confidence: {result['score']:.2f})") |
|
|