import streamlit as st
import pandas as pd
import numpy as np
import json
import os
import time
import zipfile
from sentence_transformers import SentenceTransformer, util
from loguru import logger
# ================== CONFIGURATION ==================
st.set_page_config(page_title="Problem Deduplication Explorer", layout="wide")

# Load a pre-trained model for embeddings (cached so Streamlit reruns reuse one instance)
MODEL_NAME = "all-MiniLM-L6-v2"

@st.cache_resource
def load_model():
    return SentenceTransformer(MODEL_NAME)

model = load_model()
# Load the preloaded dataset (cached so the zip is only extracted once per session)
@st.cache_data
def load_data():
    file_path = "data/merged_dataset.csv.zip"
    with zipfile.ZipFile(file_path, "r") as zip_ref:
        zip_ref.printdir()  # log the archive contents
        zip_ref.extractall("data/extracted")
    return pd.read_csv("data/extracted/merged_dataset.csv")

df = load_data()
display_columns = ["uuid", "problem", "source", "question_type", "problem_type"]
df = df[display_columns]
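# Note: as a simpler alternative (an untested sketch, assuming the archive contains
# a single CSV named merged_dataset.csv), pandas can read the zipped file directly
# and skip the extraction step:
#
#     df = pd.read_csv("data/merged_dataset.csv.zip", compression="zip")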
# ================== FUNCTION DEFINITIONS ==================
def compute_embeddings(problems):
    """Compute sentence embeddings."""
    return model.encode(problems, normalize_embeddings=True)
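# Because encode() is called with normalize_embeddings=True, all vectors have unit
# length, so cosine similarity reduces to a plain dot product. The matrix built by
# util.cos_sim below is therefore equivalent to this sketch:
#
#     similarity_matrix = embeddings @ embeddings.T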
def find_similar_problems(df, similarity_threshold=0.9):
    """Find similar problems using cosine similarity, optimized for speed with clean UI updates."""
    status_box = st.empty()
    status_box.info("Computing problem embeddings...")
    start_time = time.time()
    embeddings = compute_embeddings(df["problem"].tolist())

    status_box.info("Computing cosine similarity matrix...")
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()

    status_box.info("Filtering similar problems...")
    num_problems = len(df)
    # Only the upper triangle (k=1) is needed: it skips self-comparisons and
    # counts each unordered pair exactly once.
    upper_triangle_indices = np.triu_indices(num_problems, k=1)
    i_indices, j_indices = upper_triangle_indices
    similarity_scores = similarity_matrix[i_indices, j_indices]

    mask = similarity_scores > similarity_threshold
    filtered_i = i_indices[mask]
    filtered_j = j_indices[mask]
    filtered_scores = similarity_scores[mask]

    pairs = [
        (df.iloc[i]["uuid"], df.iloc[j]["uuid"], float(score))
        for i, j, score in zip(filtered_i, filtered_j, filtered_scores)
    ]
    sorted_pairs = sorted(pairs, key=lambda x: x[2], reverse=True)

    status_box.empty()
    st.success(
        f"Analysis complete! Found {len(sorted_pairs)} similar problem pairs "
        f"in {time.time() - start_time:.2f}s"
    )
    return sorted_pairs
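# Scaling note: the full similarity matrix above needs O(N^2) memory. For larger
# corpora, sentence-transformers offers a chunked alternative (a sketch only, not
# what this app uses):
#
#     hits = util.paraphrase_mining(model, df["problem"].tolist(), top_k=20)
#     # hits is a list of [score, i, j] triples indexing into df["problem"]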
def analyze_clusters(df, similarity_threshold=0.9):
    """Analyze duplicate problem clusters."""
    pairs = find_similar_problems(df, similarity_threshold)
    detailed_analysis = []
    for base_uuid, comp_uuid, score in pairs:
        base_row = df[df["uuid"] == base_uuid].iloc[0]
        comp_row = df[df["uuid"] == comp_uuid].iloc[0]
        column_differences = {}
        for col in df.columns:
            if col != "uuid":
                base_val = base_row[col]
                comp_val = comp_row[col]
                column_differences[col] = {
                    "base": base_val,
                    "comparison": comp_val,
                    "match": bool(base_val == comp_val),
                }
        detailed_analysis.append({
            "base_uuid": base_uuid,
            "comp_uuid": comp_uuid,
            "similarity_score": score,
            "column_differences": column_differences,
        })
    return detailed_analysis
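# Each element of the returned list is a dict of the following shape
# (illustrative values only):
#
#     {
#         "base_uuid": "...",
#         "comp_uuid": "...",
#         "similarity_score": 0.93,
#         "column_differences": {
#             "problem": {"base": "...", "comparison": "...", "match": False},
#             ...
#         },
#     }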
# ================== STREAMLIT UI ==================
st.title("Problem Deduplication Explorer")

st.sidebar.header("Settings")
similarity_threshold = st.sidebar.slider(
    "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
)

# Display the first 5 rows of the dataset
st.subheader("Explore the Dataset")
st.dataframe(df.head(5))
if st.sidebar.button("Run Deduplication Analysis"):
    with st.spinner("Analyzing..."):
        results = analyze_clusters(df, similarity_threshold)
    st.success("Analysis Complete!")

    st.subheader("Duplicate Problem Pairs")

    # Filtering options
    sources = df["source"].unique().tolist()
    question_types = df["question_type"].unique().tolist()
    selected_source = st.sidebar.selectbox("Filter by Source", [None] + sources)
    selected_qtype = st.sidebar.selectbox("Filter by Question Type", [None] + question_types)

    if selected_source:
        results = [
            r for r in results
            if df[df["uuid"] == r["base_uuid"]]["source"].values[0] == selected_source
        ]
    if selected_qtype:
        results = [
            r for r in results
            if df[df["uuid"] == r["base_uuid"]]["question_type"].values[0] == selected_qtype
        ]

    # Display the top 5 pairs initially
    num_display = 5
    shown_results = results[:num_display]

    for entry in shown_results:
        base_problem = df[df["uuid"] == entry["base_uuid"]]["problem"].values[0]
        similar_problem = df[df["uuid"] == entry["comp_uuid"]]["problem"].values[0]
        st.markdown(f"### Problem: {base_problem}")
        st.write(f"**Similar to:** {similar_problem}")
        st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
        with st.expander("Show Column Differences"):
            st.json(entry["column_differences"])
        st.markdown("---")

    if len(results) > num_display:
        if st.button("Show More Results"):
            extra_results = results[num_display:num_display * 2]
            for entry in extra_results:
                base_problem = df[df["uuid"] == entry["base_uuid"]]["problem"].values[0]
                similar_problem = df[df["uuid"] == entry["comp_uuid"]]["problem"].values[0]
                st.markdown(f"### Problem: {base_problem}")
                st.write(f"**Similar to:** {similar_problem}")
                st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
                with st.expander("Show Column Differences"):
                    st.json(entry["column_differences"])
                st.markdown("---")
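# To run the app locally (assuming this file is saved as app.py and the dataset
# zip is available under data/):
#
#     pip install streamlit pandas numpy sentence-transformers loguru
#     streamlit run app.py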