File size: 3,874 Bytes
532392b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99b887a
 
 
532392b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import streamlit as st
import pandas as pd
import json
import os
from sentence_transformers import SentenceTransformer, util
from openai import OpenAI
from loguru import logger

# ================== CONFIGURATION ==================
# Module-level Streamlit page setup; must run before any other st.* call.
st.set_page_config(page_title="Problem Deduplication Explorer", layout="wide")

# Load a pre-trained model for embeddings.
# Loaded once at import time; downloads weights on first run if not cached.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

# Load preloaded dataset
@st.cache_data
def load_data():
    """Load the merged problem dataset from the bundled zip-compressed CSV.

    Returns:
        pd.DataFrame: the full dataset; cached by Streamlit across reruns.
    """
    file_path = "data/merged_dataset.csv.zip"
    # pandas infers zip compression from the extension. The original code
    # opened the .zip with gzip.open (wrong format for a zip archive, and
    # gzip was never imported) and also never returned the DataFrame,
    # leaving the module-level `df` set to None.
    return pd.read_csv(file_path)

# Global dataset used by every UI section below; loaded once per session.
df = load_data()

# ================== FUNCTION DEFINITIONS ==================
def compute_embeddings(problems):
    """Encode problem texts into L2-normalized sentence embeddings.

    Args:
        problems: list of problem strings to embed.

    Returns:
        Embedding matrix from the module-level SentenceTransformer; rows are
        unit-normalized so dot products equal cosine similarities.
    """
    encoded = model.encode(problems, normalize_embeddings=True)
    return encoded

def find_similar_problems(df, similarity_threshold=0.9):
    """Group near-duplicate problems by cosine similarity of embeddings.

    Args:
        df: DataFrame with at least 'uuid' and 'problem' columns.
        similarity_threshold: pairs scoring strictly above this are grouped.

    Returns:
        dict mapping a base uuid to a list of (other_uuid, score) tuples for
        every *later* row exceeding the threshold (upper-triangle scan, so
        each pair appears once). Scores are plain Python floats so the result
        survives json.dumps downstream.
    """
    embeddings = compute_embeddings(df["problem"].tolist())
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()

    # Materialize uuids positionally: the original `df["uuid"][i]` is
    # label-based indexing and raises/mismatches when the DataFrame index
    # is not the default RangeIndex (e.g. after filtering).
    uuids = df["uuid"].tolist()
    n = len(uuids)

    clusters = {}
    for i in range(n):
        similar_items = [
            # float() converts numpy.float32, which json.dumps cannot encode.
            (uuids[j], float(similarity_matrix[i][j]))
            for j in range(i + 1, n)
            if similarity_matrix[i][j] > similarity_threshold
        ]
        if similar_items:
            clusters[uuids[i]] = similar_items

    return clusters

def analyze_clusters(df, similarity_threshold=0.9):
    """Build a per-cluster, column-by-column comparison of duplicate problems.

    Args:
        df: DataFrame with a 'uuid' column plus arbitrary data columns.
        similarity_threshold: forwarded to find_similar_problems.

    Returns:
        dict mapping base uuid -> list of dicts, one per similar problem,
        each with 'uuid', 'similarity_score', and 'column_differences'
        (per-column base/comparison values and a JSON-safe bool 'match').
    """
    clusters = find_similar_problems(df, similarity_threshold)
    detailed_analysis = {}

    # Columns to compare; 'uuid' is the join key, not data.
    data_columns = [col for col in df.columns if col != "uuid"]

    for key, values in clusters.items():
        # .iloc[0] picks the first match if uuids are ever duplicated.
        base_row = df[df["uuid"] == key].iloc[0]
        cluster_details = []

        for val, score in values:
            comparison_row = df[df["uuid"] == val].iloc[0]

            column_differences = {
                col: {
                    'base': base_row[col],
                    'comparison': comparison_row[col],
                    # bool() converts numpy.bool_ (produced when comparing
                    # numeric columns), which json.dumps cannot encode.
                    'match': bool(base_row[col] == comparison_row[col]),
                }
                for col in data_columns
            }

            cluster_details.append({
                'uuid': val,
                'similarity_score': score,
                'column_differences': column_differences,
            })

        detailed_analysis[key] = cluster_details

    return detailed_analysis

# ================== STREAMLIT UI ==================
st.title("πŸ” Problem Deduplication Explorer")

# Sidebar control: cosine-similarity cutoff used by the analysis below.
st.sidebar.header("Settings")
similarity_threshold = st.sidebar.slider(
    "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
)

# Run the (potentially slow) clustering only on explicit user request.
if st.sidebar.button("Run Deduplication Analysis"):
    with st.spinner("Analyzing..."):
        results = analyze_clusters(df, similarity_threshold)

    st.success("Analysis Complete!")

    # Render each cluster: the base problem followed by its near-duplicates.
    st.subheader("πŸ“Š Duplicate Problem Clusters")
    for base_uuid, cluster in results.items():
        base_problem = df[df["uuid"] == base_uuid]["problem"].values[0]
        st.markdown(f"### Problem: {base_problem}")

        for entry in cluster:
            similar_problem = df[df["uuid"] == entry["uuid"]]["problem"].values[0]
            st.write(f"**Similar to:** {similar_problem}")
            st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")

            with st.expander("Show Column Differences"):
                st.json(entry["column_differences"])

            st.markdown("---")

    # Export results. default=str guards against numpy/pandas scalars in
    # column_differences, which json cannot encode natively and would
    # otherwise raise TypeError here.
    st.sidebar.download_button(
        label="Download Results as JSON",
        data=json.dumps(results, indent=2, default=str),
        file_name="deduplication_results.json",
        mime="application/json"
    )

# ================== DATAFRAME DISPLAY ==================
# Always-visible raw dataset browser, independent of the analysis button.
st.subheader("πŸ“„ Explore the Dataset")
st.dataframe(df)