import time
import zipfile

import numpy as np
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer, util

# ================== CONFIGURATION ==================
st.set_page_config(
    page_title="Problem Deduplication Explorer",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Initialize session state
if 'page_number' not in st.session_state:
    st.session_state.page_number = 0
if 'analysis_results' not in st.session_state:
    st.session_state.analysis_results = None
if 'filtered_results' not in st.session_state:
    st.session_state.filtered_results = None


# Load a pre-trained model for embeddings with HF caching
@st.cache_resource
def load_model():
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    try:
        return SentenceTransformer(model_name, cache_folder="/tmp/sentence_transformers")
    except Exception as e:
        st.error(f"Error loading model: {e}")
        return None


model = load_model()


# Load preloaded dataset
@st.cache_data
def load_data():
    try:
        file_path = "data/merged_dataset.csv.zip"
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            with zip_ref.open(zip_ref.namelist()[0]) as file:
                df = pd.read_csv(file)
        return df[["uuid", "problem", "source", "question_type", "problem_type"]]
    except Exception as e:
        st.error(f"Error loading dataset: {e}")
        return pd.DataFrame(columns=["uuid", "problem", "source", "question_type", "problem_type"])


# Cache embeddings computation with error handling
@st.cache_data
def compute_embeddings(problems):
    """Compute and cache sentence embeddings."""
    try:
        return model.encode(problems, normalize_embeddings=True)
    except Exception as e:
        st.error(f"Error computing embeddings: {e}")
        return np.array([])


def find_similar_problems(df, similarity_threshold=0.9, progress_bar=None):
    """Find similar problems using cosine similarity, optimized for speed."""
    if df.empty:
        return []

    embeddings = compute_embeddings(df['problem'].tolist())
    if embeddings.size == 0:
        return []

    if progress_bar:
        progress_bar.progress(0.33, "Computing similarity matrix...")

    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()

    if progress_bar:
        progress_bar.progress(0.66, "Finding similar pairs...")

    # Only scan the upper triangle so each pair is compared exactly once.
    num_problems = len(df)
    upper_triangle_indices = np.triu_indices(num_problems, k=1)
    similarity_scores = similarity_matrix[upper_triangle_indices]

    mask = similarity_scores > similarity_threshold
    filtered_indices = np.where(mask)[0]

    pairs = [
        (df.iloc[upper_triangle_indices[0][i]]["uuid"],
         df.iloc[upper_triangle_indices[1][i]]["uuid"],
         float(similarity_scores[i]))
        for i in filtered_indices
    ]

    if progress_bar:
        progress_bar.progress(1.0, "Analysis complete!")
        time.sleep(0.5)
        progress_bar.empty()

    return sorted(pairs, key=lambda x: x[2], reverse=True)


@st.cache_data
def analyze_clusters(_df, pairs):
    """Analyze duplicate problem clusters with caching."""
    if not pairs or _df.empty:
        return []

    detailed_analysis = []
    for base_uuid, comp_uuid, score in pairs:
        base_row = _df[_df["uuid"] == base_uuid].iloc[0]
        comp_row = _df[_df["uuid"] == comp_uuid].iloc[0]

        column_differences = {
            col: {
                'base': base_row[col],
                'comparison': comp_row[col],
                'match': bool(base_row[col] == comp_row[col])
            }
            for col in _df.columns if col != "uuid"
        }

        detailed_analysis.append({
            'base_uuid': base_uuid,
            'comp_uuid': comp_uuid,
            'similarity_score': score,
            'column_differences': column_differences,
        })

    return detailed_analysis


def apply_filters(results, df, selected_source, selected_qtype):
    """Apply filters to results."""
    filtered = results.copy()
    if selected_source:
        filtered = [r for r in filtered
                    if df[df["uuid"] == r["base_uuid"]]["source"].values[0] == selected_source]
    if selected_qtype:
        filtered = [r for r in filtered
                    if df[df["uuid"] == r["base_uuid"]]["question_type"].values[0] == selected_qtype]
    return filtered


def main():
    st.title("🔍 Problem Deduplication Explorer")

    if model is None:
        st.error("Failed to load the model. Please try again later.")
        return

    # Sidebar configuration
    with st.sidebar:
        st.header("Settings")
        similarity_threshold = st.slider(
            "Similarity Threshold",
            min_value=0.5,
            max_value=1.0,
            value=0.9,
            step=0.01,
            help="Higher values mean more similar problems"
        )
        items_per_page = st.select_slider(
            "Items per page",
            options=[5, 10, 20, 50],
            value=10,
            help="Number of results to show per page"
        )

    # Load and display dataset
    df = load_data()
    if df.empty:
        st.error("Failed to load the dataset. Please check if the data file exists in the correct location.")
        return

    with st.expander("📄 Dataset Preview", expanded=False):
        st.dataframe(
            df.head(),
            use_container_width=True,
            hide_index=True
        )

    # Analysis section
    run_analysis = st.sidebar.button("Run Deduplication Analysis", type="primary")
    if run_analysis or st.session_state.analysis_results is not None:
        # Recompute when the button is clicked so a changed threshold takes
        # effect; otherwise reuse the results stored in session state.
        if run_analysis or st.session_state.analysis_results is None:
            progress_bar = st.progress(0, "Starting analysis...")
            pairs = find_similar_problems(df, similarity_threshold, progress_bar)
            st.session_state.analysis_results = analyze_clusters(df, pairs)

        results = st.session_state.analysis_results
        if not results:
            st.warning("No similar problems found with the current threshold.")
            return

        # Filtering options
        sources = sorted(df["source"].unique().tolist())
        question_types = sorted(df["question_type"].unique().tolist())

        col1, col2 = st.columns(2)
        with col1:
            selected_source = st.selectbox("Filter by Source", [None] + sources)
        with col2:
            selected_qtype = st.selectbox("Filter by Question Type", [None] + question_types)

        # Apply filters and store in session state
        filtered_results = apply_filters(results, df, selected_source, selected_qtype)
        st.session_state.filtered_results = filtered_results

        if not filtered_results:
            st.warning("No results found with the current filters.")
            return

        # Pagination
        total_pages = (len(filtered_results) - 1) // items_per_page
        st.session_state.page_number = min(st.session_state.page_number, total_pages)

        col1, col2, col3 = st.columns([1, 3, 1])
        with col1:
            if st.button("← Previous", disabled=st.session_state.page_number <= 0):
                st.session_state.page_number -= 1
        with col2:
            st.write(f"Page {st.session_state.page_number + 1} of {total_pages + 1}")
        with col3:
            if st.button("Next →", disabled=st.session_state.page_number >= total_pages):
                st.session_state.page_number += 1

        # Display results
        start_idx = st.session_state.page_number * items_per_page
        end_idx = start_idx + items_per_page
        page_results = filtered_results[start_idx:end_idx]

        for entry in page_results:
            with st.container():
                col1, col2 = st.columns([1, 1])
                with col1:
                    st.markdown("### Original Problem")
                    st.info(df[df["uuid"] == entry["base_uuid"]]["problem"].values[0])
                with col2:
                    st.markdown("### Similar Problem")
                    st.info(df[df["uuid"] == entry["comp_uuid"]]["problem"].values[0])

                st.metric("Similarity Score", f"{entry['similarity_score']:.4f}")
                with st.expander("Show Details"):
                    st.json(entry["column_differences"])
                st.markdown("---")


if __name__ == "__main__":
    main()
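# A minimal sketch of how to launch the app locally, assuming this file is
# saved as app.py (hypothetical name) and that streamlit, pandas, numpy, and
# sentence-transformers are installed in the environment:
#
#     streamlit run app.py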