import time

import numpy as np
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer, util

# ================== CONFIGURATION ==================
st.set_page_config(page_title="Problem Deduplication Explorer", layout="wide")

# Load the pre-trained embedding model once per session, so it is not reloaded on every rerun
MODEL_NAME = "all-MiniLM-L6-v2"

@st.cache_resource
def load_model():
    return SentenceTransformer(MODEL_NAME)

model = load_model()

# Load the bundled dataset; cached so the zip is only read once per session
@st.cache_data
def load_data():
    file_path = "data/merged_dataset.csv.zip"
    # pandas reads a single-file zip archive directly, so no manual extraction is needed
    df = pd.read_csv(file_path, compression="zip")
    return df

df = load_data()

# Keep only the columns surfaced in the UI
display_columns = ["uuid", "problem", "source", "question_type", "problem_type"]
df = df[display_columns]

# ================== FUNCTION DEFINITIONS ==================
def compute_embeddings(problems):
    """Compute unit-normalized sentence embeddings; cosine similarity then reduces to a dot product."""
    return model.encode(problems, normalize_embeddings=True)


def find_similar_problems(df, similarity_threshold=0.9):
    """Find similar problems using cosine similarity, optimized for speed with clean UI updates."""
    
    status_box = st.empty() 
    status_box.info("🔄 Computing problem embeddings...")
    start_time = time.time()
    embeddings = compute_embeddings(df['problem'].tolist())
    
    status_box.info("🔄 Computing cosine similarity matrix...")
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
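    # Dense N x N similarity matrix; memory grows quadratically with the number of problems.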

    status_box.info("🔄 Filtering similar problems...")
    num_problems = len(df)
    upper_triangle_indices = np.triu_indices(num_problems, k=1)
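    # Only the upper triangle is needed (k=1 skips the diagonal): pair (i, j) duplicates (j, i).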
    
    i_indices, j_indices = upper_triangle_indices
    similarity_scores = similarity_matrix[i_indices, j_indices]

    mask = similarity_scores > similarity_threshold
    filtered_i = i_indices[mask]
    filtered_j = j_indices[mask]
    filtered_scores = similarity_scores[mask]

    pairs = [
        (df.iloc[i]["uuid"], df.iloc[j]["uuid"], float(score))
        for i, j, score in zip(filtered_i, filtered_j, filtered_scores)
    ]
    
    sorted_pairs = sorted(pairs, key=lambda x: x[2], reverse=True)

    status_box.empty()
    st.success(f"✅ Analysis complete! Found {len(sorted_pairs)} similar pairs in {time.time() - start_time:.2f}s", icon="🎉")

    return sorted_pairs
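
# Note: the dense matrix above is the simplest approach; for larger corpora,
# sentence-transformers' util.paraphrase_mining computes similarities in chunks
# without materializing all N x N scores. A minimal sketch (top_k is an arbitrary choice):
#
#     pairs = util.paraphrase_mining(model, df["problem"].tolist(), top_k=20)
#     # -> [[score, i, j], ...] sorted by descending score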

def analyze_clusters(df, similarity_threshold=0.9):
    """Analyze duplicate problem clusters."""
    pairs = find_similar_problems(df, similarity_threshold)
    detailed_analysis = []
    for base_uuid, comp_uuid, score in pairs:
        base_row = df[df["uuid"] == base_uuid].iloc[0]
        comp_row = df[df["uuid"] == comp_uuid].iloc[0]
        
        column_differences = {}
        for col in df.columns:
            if col != "uuid":
                base_val = base_row[col]
                comp_val = comp_row[col]
                column_differences[col] = {
                    'base': base_val,
                    'comparison': comp_val,
                    'match': bool(base_val == comp_val)  # plain bool (not numpy.bool_) keeps st.json serializable
                }
        detailed_analysis.append({
            'base_uuid': base_uuid,
            'comp_uuid': comp_uuid,
            'similarity_score': score,
            'column_differences': column_differences,
        })
    return detailed_analysis
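
# Hypothetical extension (not wired into the UI below): merge the pairwise matches
# into duplicate clusters with a small union-find, since near-duplicates often come
# in groups rather than isolated pairs. Sketch only.
def group_pairs_into_clusters(pairs):
    """Merge (base_uuid, comp_uuid, score) pairs into clusters of connected uuids."""
    parent = {}

    def find(x):
        parent.setdefault(x, x)
        while parent[x] != x:
            parent[x] = parent[parent[x]]  # path halving
            x = parent[x]
        return x

    for base_uuid, comp_uuid, _score in pairs:
        parent[find(base_uuid)] = find(comp_uuid)  # union the two components

    clusters = {}
    for uuid in parent:
        clusters.setdefault(find(uuid), []).append(uuid)
    return [members for members in clusters.values() if len(members) > 1]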

# ================== STREAMLIT UI ==================
st.title("🔍 Problem Deduplication Explorer")

st.sidebar.header("Settings")
similarity_threshold = st.sidebar.slider(
    "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
)
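# Pairs scoring above this cosine-similarity threshold are reported as potential duplicates.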

# Display first 5 rows of dataset
st.subheader("📄 Explore the Dataset")
st.dataframe(df.head(5))

if st.sidebar.button("Run Deduplication Analysis"):
    with st.spinner("Analyzing..."):
        # Persist results in session state so the filter widgets and the
        # "Show More" button below survive Streamlit's script reruns.
        st.session_state["results"] = analyze_clusters(df, similarity_threshold)
        st.session_state["num_display"] = 5

if "results" in st.session_state:
    results = st.session_state["results"]

    st.subheader("📊 Duplicate Problem Pairs")

    # Filtering options
    sources = df["source"].unique().tolist()
    question_types = df["question_type"].unique().tolist()

    selected_source = st.sidebar.selectbox("Filter by Source", [None] + sources)
    selected_qtype = st.sidebar.selectbox("Filter by Question Type", [None] + question_types)

    if selected_source:
        results = [r for r in results if df[df["uuid"] == r["base_uuid"]]["source"].values[0] == selected_source]
    if selected_qtype:
        results = [r for r in results if df[df["uuid"] == r["base_uuid"]]["question_type"].values[0] == selected_qtype]

    def render_pair(entry):
        """Render one duplicate pair with its similarity score and column-level diff."""
        base_problem = df[df["uuid"] == entry["base_uuid"]]["problem"].values[0]
        similar_problem = df[df["uuid"] == entry["comp_uuid"]]["problem"].values[0]

        st.markdown(f"### Problem: {base_problem}")
        st.write(f"**Similar to:** {similar_problem}")
        st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
        with st.expander("Show Column Differences"):
            st.json(entry["column_differences"])
        st.markdown("---")

    # Show results in batches of 5, growing the batch on demand.
    num_display = st.session_state.get("num_display", 5)
    for entry in results[:num_display]:
        render_pair(entry)

    if len(results) > num_display:
        if st.button("Show More Results"):
            st.session_state["num_display"] = num_display + 5
            st.rerun()
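
# To launch the app locally (assuming this file is saved as app.py):
#   streamlit run app.py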