Spaces:
Sleeping
Sleeping
serialization issue
Browse files
app.py
CHANGED
@@ -3,7 +3,6 @@ import pandas as pd
|
|
3 |
import json
|
4 |
import os
|
5 |
from sentence_transformers import SentenceTransformer, util
|
6 |
-
from openai import OpenAI
|
7 |
from loguru import logger
|
8 |
|
9 |
# ================== CONFIGURATION ==================
|
def find_similar_problems(df, similarity_threshold=0.9):
    """Find clusters of near-duplicate problems using cosine similarity.

    Args:
        df: DataFrame with at least 'uuid' and 'problem' columns; rows are
            accessed positionally via df["uuid"][i], so a 0..n-1 RangeIndex
            is assumed — TODO confirm against the loader.
        similarity_threshold: pairs scoring strictly above this value are
            treated as duplicates.

    Returns:
        Dict mapping a base uuid to a list of (other_uuid, score) tuples for
        every LATER row whose problem text exceeds the threshold. Scores are
        native Python floats so the structure is JSON-serializable.
    """
    embeddings = compute_embeddings(df['problem'].tolist())
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()

    clusters = {}
    for i in range(len(df)):
        current_uuid = df["uuid"][i]
        # Only scan j > i so every pair is reported exactly once.
        similar_items = [
            # float(...): the matrix holds numpy.float32, which json.dumps /
            # st.json cannot serialize — cast to a native Python float.
            (df["uuid"][j], float(similarity_matrix[i][j]))
            for j in range(i + 1, len(df))
            if similarity_matrix[i][j] > similarity_threshold
        ]

        if similar_items:
            clusters[current_uuid] = similar_items

    return clusters
49 |
def _to_native(value):
    """Unwrap a numpy scalar to its native Python equivalent (no-op otherwise)."""
    # numpy scalars (float32, int64, bool_, ...) all expose .item().
    return value.item() if hasattr(value, 'item') else value


def analyze_clusters(df, similarity_threshold=0.9):
    """Analyze duplicate problem clusters.

    For every cluster found by find_similar_problems, compare the base row
    against each similar row column-by-column and record both values plus a
    match flag.

    Args:
        df: DataFrame with a 'uuid' column identifying rows.
        similarity_threshold: forwarded to find_similar_problems.

    Returns:
        Dict mapping each base uuid to a list of per-duplicate dicts with
        keys 'uuid', 'similarity_score', and 'column_differences'. All
        values are native Python types so the result survives JSON
        serialization (e.g. st.json downstream).
    """
    clusters = find_similar_problems(df, similarity_threshold)
    detailed_analysis = {}

    for key, values in clusters.items():
        base_row = df[df["uuid"] == key].iloc[0]
        cluster_details = []

        for val, score in values:
            comparison_row = df[df["uuid"] == val].iloc[0]

            column_differences = {}
            for col in df.columns:
                if col != "uuid":
                    # Convert numpy scalars up front so both the stored
                    # values and the comparison below are JSON-safe.
                    base_val = _to_native(base_row[col])
                    comp_val = _to_native(comparison_row[col])
                    column_differences[col] = {
                        'base': base_val,
                        'comparison': comp_val,
                        # bool(...): equality on numpy-derived values can
                        # yield np.bool_, which json cannot serialize.
                        'match': bool(base_val == comp_val),
                    }

            cluster_details.append({
                'uuid': val,
                'similarity_score': float(score),  # np.float32 -> float
                'column_differences': column_differences,
            })

        detailed_analysis[key] = cluster_details

    return detailed_analysis
79 |
|
80 |
# ================== STREAMLIT UI ==================
|
@@ -88,22 +86,18 @@ similarity_threshold = st.sidebar.slider(
|
|
88 |
# ---- On-demand deduplication run: analyze, then render each cluster ----
if st.sidebar.button("Run Deduplication Analysis"):
    with st.spinner("Analyzing..."):
        results = analyze_clusters(df, similarity_threshold)

    st.success("Analysis Complete!")

    st.subheader("📊 Duplicate Problem Clusters")
    for anchor_uuid, matches in results.items():
        # Look up the anchor problem's text for the section heading.
        anchor_text = df[df["uuid"] == anchor_uuid]["problem"].values[0]
        st.markdown(f"### Problem: {anchor_text}")

        for match in matches:
            match_text = df[df["uuid"] == match["uuid"]]["problem"].values[0]
            st.write(f"**Similar to:** {match_text}")
            st.write(f"**Similarity Score:** {match['similarity_score']:.4f}")

            # Column-by-column diff is verbose; keep it collapsed by default.
            with st.expander("Show Column Differences"):
                st.json(match["column_differences"])

            st.markdown("---")
108 |
|
109 |
# Export results
|
|
|
3 |
import json
|
4 |
import os
|
5 |
from sentence_transformers import SentenceTransformer, util
|
|
|
6 |
from loguru import logger
|
7 |
|
8 |
# ================== CONFIGURATION ==================
|
|
|
def find_similar_problems(df, similarity_threshold=0.9):
    """Find similar problems using cosine similarity.

    Returns a dict keyed by uuid; each value lists (uuid, score) pairs for
    all LATER rows whose similarity exceeds the threshold. Scores are cast
    to native float so the structure is JSON-serializable.
    """
    embeddings = compute_embeddings(df['problem'].tolist())
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()

    clusters = {}
    n_rows = len(df)
    for i in range(n_rows):
        current_uuid = df["uuid"][i]
        similar_items = []
        # Upper triangle only: each pair is considered once.
        for j in range(i + 1, n_rows):
            score = similarity_matrix[i][j]
            if score > similarity_threshold:
                # Convert float32 to float for serialization.
                similar_items.append((df["uuid"][j], float(score)))
        if similar_items:
            clusters[current_uuid] = similar_items
    return clusters
44 |
|
45 |
def analyze_clusters(df, similarity_threshold=0.9):
    """Analyze duplicate problem clusters.

    Compares the base row of every cluster against each similar row,
    column by column, recording both values and whether they match. All
    values are coerced to native Python types so the result serializes
    cleanly (e.g. via st.json).
    """
    clusters = find_similar_problems(df, similarity_threshold)
    detailed_analysis = {}

    for base_uuid, matches in clusters.items():
        base_row = df[df["uuid"] == base_uuid].iloc[0]
        details = []

        for other_uuid, score in matches:
            other_row = df[df["uuid"] == other_uuid].iloc[0]

            diffs = {}
            for col in df.columns:
                if col == "uuid":
                    continue  # identifier column carries no diff signal
                lhs = base_row[col]
                rhs = other_row[col]
                # numpy scalars expose .item(); unwrap to native types.
                if hasattr(lhs, 'item'):
                    lhs = lhs.item()
                if hasattr(rhs, 'item'):
                    rhs = rhs.item()
                diffs[col] = {
                    'base': lhs,
                    'comparison': rhs,
                    'match': bool(lhs == rhs),  # np.bool_ -> bool
                }

            details.append({
                'uuid': other_uuid,
                'similarity_score': float(score),  # float32 -> float
                'column_differences': diffs,
            })

        detailed_analysis[base_uuid] = details

    return detailed_analysis
77 |
|
78 |
# ================== STREAMLIT UI ==================
|
|
|
86 |
# ---- Deduplication trigger + cluster report ----
if st.sidebar.button("Run Deduplication Analysis"):
    with st.spinner("Analyzing..."):
        results = analyze_clusters(df, similarity_threshold)

    st.success("Analysis Complete!")

    st.subheader("📊 Duplicate Problem Clusters")
    for cluster_uuid, cluster_items in results.items():
        # Heading: the text of the cluster's base problem.
        heading_text = df[df["uuid"] == cluster_uuid]["problem"].values[0]
        st.markdown(f"### Problem: {heading_text}")

        for item in cluster_items:
            item_text = df[df["uuid"] == item["uuid"]]["problem"].values[0]
            st.write(f"**Similar to:** {item_text}")
            st.write(f"**Similarity Score:** {item['similarity_score']:.4f}")

            # Collapse the detailed per-column comparison by default.
            with st.expander("Show Column Differences"):
                st.json(item["column_differences"])

            st.markdown("---")
102 |
|
103 |
# Export results
|