import streamlit as st
import pandas as pd
import numpy as np
import time
import zipfile
from sentence_transformers import SentenceTransformer, util
# ================== CONFIGURATION ==================
st.set_page_config(page_title="Problem Deduplication Explorer", layout="wide")
MODEL_NAME = "all-MiniLM-L6-v2"

# Load a pre-trained model for embeddings; cache_resource keeps a single
# instance alive across Streamlit reruns instead of reloading it each time
@st.cache_resource
def load_model():
    return SentenceTransformer(MODEL_NAME)

model = load_model()
# Load preloaded dataset
@st.cache_data
def load_data():
file_path = "data/merged_dataset.csv.zip"
    with zipfile.ZipFile(file_path, "r") as zip_ref:
        zip_ref.extractall("data/extracted")
    df = pd.read_csv("data/extracted/merged_dataset.csv")
return df
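
# Alternative (a sketch, not what this app does): pandas can read a
# single-member zip archive directly, skipping the extraction step:
#   df = pd.read_csv("data/merged_dataset.csv.zip", compression="zip")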
df = load_data()
# Keep only the columns the app actually displays
display_columns = ["uuid", "problem", "source", "question_type", "problem_type"]
df = df[display_columns]
# ================== FUNCTION DEFINITIONS ==================
def compute_embeddings(problems):
"""Compute sentence embeddings."""
return model.encode(problems, normalize_embeddings=True)
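
# Note: with normalize_embeddings=True the vectors have unit length, so the
# cosine-similarity matrix computed below is just a matrix product, e.g.:
#   sims = embeddings @ embeddings.T  # same values as util.cos_sim(...)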
def find_similar_problems(df, similarity_threshold=0.9):
"""Find similar problems using cosine similarity, optimized for speed with clean UI updates."""
status_box = st.empty()
status_box.info("π Computing problem embeddings...")
start_time = time.time()
embeddings = compute_embeddings(df['problem'].tolist())
status_box.info("π Computing cosine similarity matrix...")
similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
status_box.info("π Filtering similar problems...")
num_problems = len(df)
upper_triangle_indices = np.triu_indices(num_problems, k=1)
i_indices, j_indices = upper_triangle_indices
similarity_scores = similarity_matrix[i_indices, j_indices]
mask = similarity_scores > similarity_threshold
filtered_i = i_indices[mask]
filtered_j = j_indices[mask]
filtered_scores = similarity_scores[mask]
pairs = [
(df.iloc[i]["uuid"], df.iloc[j]["uuid"], float(score))
for i, j, score in zip(filtered_i, filtered_j, filtered_scores)
]
sorted_pairs = sorted(pairs, key=lambda x: x[2], reverse=True)
status_box.empty()
st.success(f"β
Analysis complete! Found {len(sorted_pairs)} similar problems in {time.time() - start_time:.2f}s", icon="π")
return sorted_pairs
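
# For much larger datasets the dense N x N similarity matrix above becomes
# the memory bottleneck. A sketch of an alternative using the chunked
# paraphrase mining built into sentence-transformers (same model, bounded
# memory; thresholds and pair limits would need tuning):
#
#   def find_similar_problems_chunked(df, similarity_threshold=0.9):
#       hits = util.paraphrase_mining(model, df["problem"].tolist())
#       return [(df.iloc[i]["uuid"], df.iloc[j]["uuid"], float(score))
#               for score, i, j in hits if score > similarity_threshold]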
def analyze_clusters(df, similarity_threshold=0.9):
"""Analyze duplicate problem clusters."""
pairs = find_similar_problems(df, similarity_threshold)
detailed_analysis = []
for base_uuid, comp_uuid, score in pairs:
base_row = df[df["uuid"] == base_uuid].iloc[0]
comp_row = df[df["uuid"] == comp_uuid].iloc[0]
column_differences = {}
for col in df.columns:
if col != "uuid":
base_val = base_row[col]
comp_val = comp_row[col]
column_differences[col] = {
'base': base_val,
'comparison': comp_val,
'match': bool(base_val == comp_val)
}
detailed_analysis.append({
'base_uuid': base_uuid,
'comp_uuid': comp_uuid,
'similarity_score': score,
'column_differences': column_differences,
})
return detailed_analysis
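
# Note: the df[df["uuid"] == ...] lookups above rescan the whole frame for
# every pair; for large result sets a uuid-indexed view is much faster:
#   by_uuid = df.set_index("uuid")
#   base_row = by_uuid.loc[base_uuid]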
# ================== STREAMLIT UI ==================
st.title("π Problem Deduplication Explorer")
st.sidebar.header("Settings")
similarity_threshold = st.sidebar.slider(
"Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
)
# Display first 5 rows of dataset
st.subheader("π Explore the Dataset")
st.dataframe(df.head(5))
# Run the analysis once and keep the results in session_state so the filter
# widgets and the "Show More Results" button below survive Streamlit reruns
# (inside a bare `if st.button(...)` block they would vanish on the next rerun)
if st.sidebar.button("Run Deduplication Analysis"):
    with st.spinner("Analyzing..."):
        st.session_state["results"] = analyze_clusters(df, similarity_threshold)
    st.success("Analysis Complete!")

if "results" in st.session_state:
    results = st.session_state["results"]
    st.subheader("Duplicate Problem Pairs")

    # Filtering options
    sources = df["source"].unique().tolist()
    question_types = df["question_type"].unique().tolist()
    selected_source = st.sidebar.selectbox("Filter by Source", [None] + sources)
    selected_qtype = st.sidebar.selectbox("Filter by Question Type", [None] + question_types)

    if selected_source:
        results = [r for r in results if df[df["uuid"] == r["base_uuid"]]["source"].values[0] == selected_source]
    if selected_qtype:
        results = [r for r in results if df[df["uuid"] == r["base_uuid"]]["question_type"].values[0] == selected_qtype]

    # Display 5 results at a time; the count is tracked across reruns
    num_display = st.session_state.get("num_display", 5)
    for entry in results[:num_display]:
        base_problem = df[df["uuid"] == entry["base_uuid"]]["problem"].values[0]
        similar_problem = df[df["uuid"] == entry["comp_uuid"]]["problem"].values[0]
        st.markdown(f"### Problem: {base_problem}")
        st.write(f"**Similar to:** {similar_problem}")
        st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
        with st.expander("Show Column Differences"):
            st.json(entry["column_differences"])
        st.markdown("---")

    if len(results) > num_display and st.button("Show More Results"):
        st.session_state["num_display"] = num_display + 5
        st.rerun()