import streamlit as st
import pandas as pd
import json
import zipfile
from sentence_transformers import SentenceTransformer, util

# ================== CONFIGURATION ==================
st.set_page_config(page_title="Problem Deduplication Explorer", layout="wide")

# Load a pre-trained model for embeddings. st.cache_resource keeps the model
# in memory across Streamlit reruns instead of reloading it on every
# widget interaction.
MODEL_NAME = "all-MiniLM-L6-v2"

@st.cache_resource
def load_model():
    return SentenceTransformer(MODEL_NAME)

model = load_model()

# Load the preloaded dataset. Extraction happens once; st.cache_data caches
# the resulting DataFrame across reruns. (For a single-file archive,
# pd.read_csv can also read the zip directly.)
@st.cache_data
def load_data():
    file_path = "data/merged_dataset.csv.zip"
    with zipfile.ZipFile(file_path, "r") as zip_ref:
        zip_ref.extractall("data/extracted")
    return pd.read_csv("data/extracted/merged_dataset.csv")

df = load_data()
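
# Minimal sanity check (an addition, not part of the analysis itself): the
# code below assumes the bundled CSV ships with 'uuid' and 'problem' columns,
# so fail fast with a clear message if a different file lands in data/.
REQUIRED_COLUMNS = {"uuid", "problem"}
missing = REQUIRED_COLUMNS - set(df.columns)
if missing:
    st.error(f"Dataset is missing required columns: {sorted(missing)}")
    st.stop()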

# ================== FUNCTION DEFINITIONS ==================
def compute_embeddings(problems):
    """Compute unit-normalized sentence embeddings, so the dot product of
    two embeddings equals their cosine similarity."""
    return model.encode(problems, normalize_embeddings=True)

def find_similar_problems(df, similarity_threshold=0.9):
    """Find similar problems using cosine similarity."""
    embeddings = compute_embeddings(df["problem"].tolist())
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
    uuids = df["uuid"].tolist()  # positional lookups, independent of the index
    clusters = {}
    for i in range(len(uuids)):
        similar_items = [
            (uuids[j], float(similarity_matrix[i][j]))  # float32 -> native float
            for j in range(i + 1, len(uuids))
            if similarity_matrix[i][j] > similarity_threshold
        ]
        if similar_items:
            clusters[uuids[i]] = similar_items
    return clusters
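
# A memory-friendlier alternative (a sketch, not wired into the UI below):
# util.paraphrase_mining compares embeddings in chunks rather than building
# the full n x n similarity matrix, which matters as the dataset grows.
# Reshaping its [score, i, j] pairs into the same {uuid: [(uuid, score), ...]}
# structure as find_similar_problems is our own addition.
def find_similar_problems_mined(df, similarity_threshold=0.9):
    pairs = util.paraphrase_mining(model, df["problem"].tolist())
    uuids = df["uuid"].tolist()
    clusters = {}
    for score, i, j in pairs:
        if score > similarity_threshold:
            a, b = sorted((i, j))
            clusters.setdefault(uuids[a], []).append((uuids[b], float(score)))
    return clusters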

def analyze_clusters(df, similarity_threshold=0.9):
    """Analyze duplicate problem clusters."""
    clusters = find_similar_problems(df, similarity_threshold)
    detailed_analysis = {}
    for key, values in clusters.items():
        base_row = df[df["uuid"] == key].iloc[0]
        cluster_details = []
        for val, score in values:
            comparison_row = df[df["uuid"] == val].iloc[0]
            
            column_differences = {}
            for col in df.columns:
                if col != "uuid":
                    base_val = base_row[col]
                    comp_val = comparison_row[col]
                    # Convert numpy types to native Python types
                    if hasattr(base_val, 'item'):
                        base_val = base_val.item()
                    if hasattr(comp_val, 'item'):
                        comp_val = comp_val.item()
                    column_differences[col] = {
                        'base': base_val,
                        'comparison': comp_val,
                        'match': bool(base_val == comp_val)  # Convert numpy bool to Python bool
                    }
            cluster_details.append({
                'uuid': val,
                'similarity_score': float(score),  # Convert float32 to float
                'column_differences': column_differences,
            })
        detailed_analysis[key] = cluster_details
    return detailed_analysis
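
# Performance note (a sketch, not wired in): the per-row df[df["uuid"] == ...]
# scans above cost O(n) each. For larger datasets the usual fix is a one-time
# index so lookups become hash-based (assumes uuids are unique):
#
#     by_uuid = df.set_index("uuid")
#     base_row = by_uuid.loc[key]   # replaces df[df["uuid"] == key].iloc[0]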

# ================== STREAMLIT UI ==================
st.title("🔍 Problem Deduplication Explorer")

st.sidebar.header("Settings")
similarity_threshold = st.sidebar.slider(
    "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
)

# Persist results in st.session_state so they survive the rerun triggered by
# other widgets (clicking the download button would otherwise clear the
# displayed results, since the analysis button returns False on the next run).
if st.sidebar.button("Run Deduplication Analysis"):
    with st.spinner("Analyzing..."):
        st.session_state["dedup_results"] = analyze_clusters(df, similarity_threshold)
    st.success("Analysis Complete!")

if "dedup_results" in st.session_state:
    results = st.session_state["dedup_results"]

    st.subheader("📊 Duplicate Problem Clusters")
    for base_uuid, cluster in results.items():
        base_problem = df[df["uuid"] == base_uuid]["problem"].values[0]
        st.markdown(f"### Problem: {base_problem}")
        for entry in cluster:
            similar_problem = df[df["uuid"] == entry["uuid"]]["problem"].values[0]
            st.write(f"**Similar to:** {similar_problem}")
            st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
            with st.expander("Show Column Differences"):
                st.json(entry["column_differences"])
            st.markdown("---")

    # Export results
    st.sidebar.download_button(
        label="Download Results as JSON",
        data=json.dumps(results, indent=2),
        file_name="deduplication_results.json",
        mime="application/json",
    )

# ================== DATAFRAME DISPLAY ==================
st.subheader("📄 Explore the Dataset")
st.dataframe(df)