import streamlit as st
import pandas as pd
import numpy as np
import time
import zipfile
from sentence_transformers import SentenceTransformer, util

# ================== CONFIGURATION ==================
st.set_page_config(
    page_title="Problem Deduplication Explorer",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Initialize session state so analysis results and pagination survive Streamlit reruns
if 'page_number' not in st.session_state:
    st.session_state.page_number = 0
if 'analysis_results' not in st.session_state:
    st.session_state.analysis_results = None
if 'filtered_results' not in st.session_state:
    st.session_state.filtered_results = None

# Load a pre-trained model for embeddings with HF caching
@st.cache_resource
def load_model():
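    """Load the sentence-transformers model once per process (st.cache_resource)."""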
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    try:
        return SentenceTransformer(model_name, cache_folder="/tmp/sentence_transformers")
    except Exception as e:
        st.error(f"Error loading model: {e}")
        return None

model = load_model()

# Load the bundled dataset (cached across reruns)
@st.cache_data
def load_data():
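    """Read the bundled zipped CSV and keep only the columns the UI needs."""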
    try:
        file_path = "data/merged_dataset.csv.zip"
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            with zip_ref.open(zip_ref.namelist()[0]) as file:
                df = pd.read_csv(file)
        return df[["uuid", "problem", "source", "question_type", "problem_type"]]
    except Exception as e:
        st.error(f"Error loading dataset: {e}")
        return pd.DataFrame(columns=["uuid", "problem", "source", "question_type", "problem_type"])

# Cache embeddings computation with error handling
@st.cache_data
def compute_embeddings(problems):
    """Compute and cache sentence embeddings."""
    try:
        return model.encode(problems, normalize_embeddings=True)
    except Exception as e:
        st.error(f"Error computing embeddings: {e}")
        return np.array([])

def find_similar_problems(df, similarity_threshold=0.9, progress_bar=None):
    """Find similar problems using cosine similarity, optimized for speed."""
    if df.empty:
        return []
        
    embeddings = compute_embeddings(df['problem'].tolist())
    if embeddings.size == 0:
        return []
        
    if progress_bar:
        progress_bar.progress(0.33, "Computing similarity matrix...")

    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
    if progress_bar:
        progress_bar.progress(0.66, "Finding similar pairs...")

    num_problems = len(df)
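    # Score each unordered pair exactly once: the upper triangle (k=1) skips
    # the diagonal (self-similarity) and the mirrored lower triangle.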
    upper_triangle_indices = np.triu_indices(num_problems, k=1)
    similarity_scores = similarity_matrix[upper_triangle_indices]
    
    mask = similarity_scores > similarity_threshold
    filtered_indices = np.where(mask)[0]
    
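    # Map the flat upper-triangle indices back to row positions to recover both UUIDs.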
    pairs = [
        (df.iloc[upper_triangle_indices[0][i]]["uuid"],
         df.iloc[upper_triangle_indices[1][i]]["uuid"],
         float(similarity_scores[i]))
        for i in filtered_indices
    ]

    if progress_bar:
        progress_bar.progress(1.0, "Analysis complete!")
        time.sleep(0.5)
        progress_bar.empty()

    return sorted(pairs, key=lambda x: x[2], reverse=True)

@st.cache_data
def analyze_clusters(_df, pairs):
    """Analyze duplicate problem clusters with caching."""
    if not pairs or _df.empty:
        return []
        
    detailed_analysis = []
    for base_uuid, comp_uuid, score in pairs:
        base_row = _df[_df["uuid"] == base_uuid].iloc[0]
        comp_row = _df[_df["uuid"] == comp_uuid].iloc[0]
        
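        # Compare every non-ID column; cast to bool so numpy bool_ values serialize cleanly in st.json.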
        column_differences = {
            col: {
                'base': base_row[col],
                'comparison': comp_row[col],
                'match': bool(base_row[col] == comp_row[col])
            }
            for col in _df.columns if col != "uuid"
        }
        
        detailed_analysis.append({
            'base_uuid': base_uuid,
            'comp_uuid': comp_uuid,
            'similarity_score': score,
            'column_differences': column_differences,
        })
    return detailed_analysis

def apply_filters(results, df, selected_source, selected_qtype):
    """Apply filters to results."""
    filtered = results.copy()
    if selected_source:
        filtered = [r for r in filtered if df[df["uuid"] == r["base_uuid"]]["source"].values[0] == selected_source]
    if selected_qtype:
        filtered = [r for r in filtered if df[df["uuid"] == r["base_uuid"]]["question_type"].values[0] == selected_qtype]
    return filtered

def main():
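    """Streamlit entry point: sidebar settings, analysis trigger, filtered and paginated results."""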
    st.title("πŸ” Problem Deduplication Explorer")
    
    if model is None:
        st.error("Failed to load the model. Please try again later.")
        return
    
    # Sidebar configuration
    with st.sidebar:
        st.header("Settings")
        similarity_threshold = st.slider(
            "Similarity Threshold",
            min_value=0.5,
            max_value=1.0,
            value=0.9,
            step=0.01,
            help="Higher values mean more similar problems"
        )
        
        items_per_page = st.select_slider(
            "Items per page",
            options=[5, 10, 20, 50],
            value=10,
            help="Number of results to show per page"
        )

    # Load and display dataset
    df = load_data()
    
    if df.empty:
        st.error("Failed to load the dataset. Please check if the data file exists in the correct location.")
        return
    
    with st.expander("πŸ“„ Dataset Preview", expanded=False):
        st.dataframe(
            df.head(),
            use_container_width=True,
            hide_index=True
        )

    # Analysis section: results are computed once per session and persist in session state across reruns
    if st.sidebar.button("Run Deduplication Analysis", type="primary") or st.session_state.analysis_results is not None:
        if st.session_state.analysis_results is None:
            progress_bar = st.progress(0, "Starting analysis...")
            pairs = find_similar_problems(df, similarity_threshold, progress_bar)
            st.session_state.analysis_results = analyze_clusters(df, pairs)
        
        results = st.session_state.analysis_results
        
        if not results:
            st.warning("No similar problems found with the current threshold.")
            return
            
        # Filtering options
        sources = sorted(df["source"].unique().tolist())
        question_types = sorted(df["question_type"].unique().tolist())
        
        col1, col2 = st.columns(2)
        with col1:
            selected_source = st.selectbox("Filter by Source", [None] + sources)
        with col2:
            selected_qtype = st.selectbox("Filter by Question Type", [None] + question_types)
        
        # Apply filters and store in session state
        filtered_results = apply_filters(results, df, selected_source, selected_qtype)
        st.session_state.filtered_results = filtered_results
        
        if not filtered_results:
            st.warning("No results found with the current filters.")
            return
            
        # Pagination
        total_pages = (len(filtered_results) - 1) // items_per_page
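        # total_pages is the highest zero-based page index; clamp in case filters shrank the list.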
        st.session_state.page_number = min(st.session_state.page_number, total_pages)
        
        # Handle both buttons before writing the label so it reflects this rerun's page.
        col1, col2, col3 = st.columns([1, 3, 1])
        with col1:
            if st.button("← Previous", disabled=st.session_state.page_number <= 0):
                st.session_state.page_number -= 1
        with col3:
            if st.button("Next →", disabled=st.session_state.page_number >= total_pages):
                st.session_state.page_number += 1
        with col2:
            st.write(f"Page {st.session_state.page_number + 1} of {total_pages + 1}")
        
        # Display results
        start_idx = st.session_state.page_number * items_per_page
        end_idx = start_idx + items_per_page
        page_results = filtered_results[start_idx:end_idx]
        
        for entry in page_results:
            with st.container():
                col1, col2 = st.columns([1, 1])
                
                with col1:
                    st.markdown("### Original Problem")
                    st.info(df[df["uuid"] == entry["base_uuid"]]["problem"].values[0])
                    
                with col2:
                    st.markdown("### Similar Problem")
                    st.info(df[df["uuid"] == entry["comp_uuid"]]["problem"].values[0])
                
                st.metric("Similarity Score", f"{entry['similarity_score']:.4f}")
                
                with st.expander("Show Details"):
                    st.json(entry["column_differences"])
                st.markdown("---")

if __name__ == "__main__":
    main()