File size: 3,874 Bytes
532392b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99b887a
 
 
532392b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import streamlit as st
import pandas as pd
import json
import os
from sentence_transformers import SentenceTransformer, util
from openai import OpenAI
from loguru import logger

# ================== CONFIGURATION ==================
# Module-level Streamlit page setup; must run before any other st.* call.
st.set_page_config(page_title="Problem Deduplication Explorer", layout="wide")

# Load a pre-trained model for embeddings.
# Loaded once at import time; downloads weights on first run if not cached.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

# Load preloaded dataset
@st.cache_data
def load_data():
    """Load the merged problem dataset from the bundled zip-compressed CSV.

    Returns:
        pd.DataFrame: the full dataset; cached by Streamlit across reruns.
    """
    file_path = "data/merged_dataset.csv.zip"
    # pandas infers zip compression from the extension. The original code
    # opened the .zip with gzip.open (wrong format for a zip archive, and
    # gzip was never imported) and also never returned the DataFrame,
    # leaving the module-level `df` set to None.
    return pd.read_csv(file_path)

# Global dataset used by every UI section below; loaded once per session.
df = load_data()

# ================== FUNCTION DEFINITIONS ==================
def compute_embeddings(problems):
    """Encode problem texts into L2-normalized sentence embeddings.

    Args:
        problems: list of problem strings to embed.

    Returns:
        Embedding matrix from the module-level SentenceTransformer; rows are
        unit-normalized so dot products equal cosine similarities.
    """
    encoded = model.encode(problems, normalize_embeddings=True)
    return encoded

def find_similar_problems(df, similarity_threshold=0.9):
    """Group near-duplicate problems by cosine similarity of embeddings.

    Args:
        df: DataFrame with at least 'uuid' and 'problem' columns.
        similarity_threshold: pairs scoring strictly above this are grouped.

    Returns:
        dict mapping a base uuid to a list of (other_uuid, score) tuples for
        every *later* row exceeding the threshold (upper-triangle scan, so
        each pair appears once). Scores are plain Python floats so the result
        survives json.dumps downstream.
    """
    embeddings = compute_embeddings(df["problem"].tolist())
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()

    # Materialize uuids positionally: the original `df["uuid"][i]` is
    # label-based indexing and raises/mismatches when the DataFrame index
    # is not the default RangeIndex (e.g. after filtering).
    uuids = df["uuid"].tolist()
    n = len(uuids)

    clusters = {}
    for i in range(n):
        similar_items = [
            # float() converts numpy.float32, which json.dumps cannot encode.
            (uuids[j], float(similarity_matrix[i][j]))
            for j in range(i + 1, n)
            if similarity_matrix[i][j] > similarity_threshold
        ]
        if similar_items:
            clusters[uuids[i]] = similar_items

    return clusters

def analyze_clusters(df, similarity_threshold=0.9):
    """Build a per-cluster, column-by-column comparison of duplicate problems.

    Args:
        df: DataFrame with a 'uuid' column plus arbitrary data columns.
        similarity_threshold: forwarded to find_similar_problems.

    Returns:
        dict mapping base uuid -> list of dicts, one per similar problem,
        each with 'uuid', 'similarity_score', and 'column_differences'
        (per-column base/comparison values and a JSON-safe bool 'match').
    """
    clusters = find_similar_problems(df, similarity_threshold)
    detailed_analysis = {}

    # Columns to compare; 'uuid' is the join key, not data.
    data_columns = [col for col in df.columns if col != "uuid"]

    for key, values in clusters.items():
        # .iloc[0] picks the first match if uuids are ever duplicated.
        base_row = df[df["uuid"] == key].iloc[0]
        cluster_details = []

        for val, score in values:
            comparison_row = df[df["uuid"] == val].iloc[0]

            column_differences = {
                col: {
                    'base': base_row[col],
                    'comparison': comparison_row[col],
                    # bool() converts numpy.bool_ (produced when comparing
                    # numeric columns), which json.dumps cannot encode.
                    'match': bool(base_row[col] == comparison_row[col]),
                }
                for col in data_columns
            }

            cluster_details.append({
                'uuid': val,
                'similarity_score': score,
                'column_differences': column_differences,
            })

        detailed_analysis[key] = cluster_details

    return detailed_analysis

# ================== STREAMLIT UI ==================
st.title("πŸ” Problem Deduplication Explorer")

# Sidebar control: cosine-similarity cutoff used by the analysis below.
st.sidebar.header("Settings")
similarity_threshold = st.sidebar.slider(
    "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
)

# Run the (potentially slow) clustering only on explicit user request.
if st.sidebar.button("Run Deduplication Analysis"):
    with st.spinner("Analyzing..."):
        results = analyze_clusters(df, similarity_threshold)

    st.success("Analysis Complete!")

    # Render each cluster: the base problem followed by its near-duplicates.
    st.subheader("πŸ“Š Duplicate Problem Clusters")
    for base_uuid, cluster in results.items():
        base_problem = df[df["uuid"] == base_uuid]["problem"].values[0]
        st.markdown(f"### Problem: {base_problem}")

        for entry in cluster:
            similar_problem = df[df["uuid"] == entry["uuid"]]["problem"].values[0]
            st.write(f"**Similar to:** {similar_problem}")
            st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")

            with st.expander("Show Column Differences"):
                st.json(entry["column_differences"])

            st.markdown("---")

    # Export results. default=str guards against numpy/pandas scalars in
    # column_differences, which json cannot encode natively and would
    # otherwise raise TypeError here.
    st.sidebar.download_button(
        label="Download Results as JSON",
        data=json.dumps(results, indent=2, default=str),
        file_name="deduplication_results.json",
        mime="application/json"
    )

# ================== DATAFRAME DISPLAY ==================
# Always-visible raw dataset browser, independent of the analysis button.
st.subheader("πŸ“„ Explore the Dataset")
st.dataframe(df)