import json

import pandas as pd
import streamlit as st
from loguru import logger
from openai import OpenAI
from sentence_transformers import SentenceTransformer, util

# ================== CONFIGURATION ==================
st.set_page_config(page_title="Problem Deduplication Explorer", layout="wide")

# Pre-trained model used to embed problem statements.
MODEL_NAME = "all-MiniLM-L6-v2"


@st.cache_resource
def load_model() -> SentenceTransformer:
    """Load and cache the sentence-embedding model.

    Cached with ``st.cache_resource`` so the (expensive) model is built
    once per process instead of on every Streamlit rerun.
    """
    return SentenceTransformer(MODEL_NAME)


model = load_model()


@st.cache_data
def load_data() -> pd.DataFrame:
    """Load the preloaded dataset.

    The file is a zip-compressed CSV; pandas infers the compression from
    the ``.zip`` suffix, so no manual decompression is needed.  (The
    original used ``gzip.open`` — wrong format, and ``gzip`` was never
    imported — and forgot to return the DataFrame.)
    """
    file_path = "data/merged_dataset.csv.zip"
    return pd.read_csv(file_path)


df = load_data()

# ================== FUNCTION DEFINITIONS ==================


def compute_embeddings(problems):
    """Compute L2-normalized sentence embeddings for a list of texts."""
    return model.encode(problems, normalize_embeddings=True)


def find_similar_problems(df, similarity_threshold=0.9):
    """Find similar problems using cosine similarity.

    Returns a dict mapping a problem's uuid to a list of
    ``(other_uuid, similarity_score)`` pairs whose cosine similarity
    exceeds ``similarity_threshold``.  Only pairs with j > i are
    emitted, so each pair is reported at most once.
    """
    embeddings = compute_embeddings(df["problem"].tolist())
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()

    # Positional access: df["uuid"][i] is label-based and would break on
    # a non-default index (e.g. after filtering the DataFrame).
    uuids = df["uuid"].tolist()

    clusters = {}
    for i in range(len(uuids)):
        similar_items = [
            # float() so the scores are JSON-serializable later
            # (numpy.float32 is not accepted by json.dumps).
            (uuids[j], float(similarity_matrix[i][j]))
            for j in range(i + 1, len(uuids))
            if similarity_matrix[i][j] > similarity_threshold
        ]
        if similar_items:
            clusters[uuids[i]] = similar_items
    return clusters


def analyze_clusters(df, similarity_threshold=0.9):
    """Analyze duplicate problem clusters.

    For each base problem with near-duplicates, records the similarity
    score and a per-column comparison against each duplicate row.
    """
    clusters = find_similar_problems(df, similarity_threshold)
    detailed_analysis = {}
    for key, values in clusters.items():
        base_row = df[df["uuid"] == key].iloc[0]
        cluster_details = []
        for val, score in values:
            comparison_row = df[df["uuid"] == val].iloc[0]
            column_differences = {}
            for col in df.columns:
                if col != "uuid":
                    column_differences[col] = {
                        'base': base_row[col],
                        'comparison': comparison_row[col],
                        # bool() so the flag is JSON-serializable
                        # (pandas comparisons yield numpy.bool_).
                        'match': bool(base_row[col] == comparison_row[col]),
                    }
            cluster_details.append({
                'uuid': val,
                'similarity_score': score,
                'column_differences': column_differences,
            })
        detailed_analysis[key] = cluster_details
    return detailed_analysis


# ================== STREAMLIT UI ==================
st.title("🔍 Problem Deduplication Explorer")
st.sidebar.header("Settings")

similarity_threshold = st.sidebar.slider(
    "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
)

if st.sidebar.button("Run Deduplication Analysis"):
    with st.spinner("Analyzing..."):
        results = analyze_clusters(df, similarity_threshold)
    st.success("Analysis Complete!")

    st.subheader("📊 Duplicate Problem Clusters")
    for base_uuid, cluster in results.items():
        base_problem = df[df["uuid"] == base_uuid]["problem"].values[0]
        st.markdown(f"### Problem: {base_problem}")
        for entry in cluster:
            similar_problem = df[df["uuid"] == entry["uuid"]]["problem"].values[0]
            st.write(f"**Similar to:** {similar_problem}")
            st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
            with st.expander("Show Column Differences"):
                st.json(entry["column_differences"])
        st.markdown("---")

    # Export results.  default=str covers any remaining non-native cell
    # values (e.g. numpy integers, timestamps) in the column diffs.
    st.sidebar.download_button(
        label="Download Results as JSON",
        data=json.dumps(results, indent=2, default=str),
        file_name="deduplication_results.json",
        mime="application/json",
    )

# ================== DATAFRAME DISPLAY ==================
st.subheader("📄 Explore the Dataset")
st.dataframe(df)