dsleo committed on
Commit
c4bc190
·
1 Parent(s): ff6e183

serialization issue

Browse files
Files changed (1) hide show
  1. app.py +13 -19
app.py CHANGED
@@ -3,7 +3,6 @@ import pandas as pd
3
  import json
4
  import os
5
  from sentence_transformers import SentenceTransformer, util
6
- from openai import OpenAI
7
  from loguru import logger
8
 
9
  # ================== CONFIGURATION ==================
@@ -31,50 +30,49 @@ def find_similar_problems(df, similarity_threshold=0.9):
31
  """Find similar problems using cosine similarity."""
32
  embeddings = compute_embeddings(df['problem'].tolist())
33
  similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
34
-
35
  clusters = {}
36
  for i in range(len(df)):
37
  current_uuid = df["uuid"][i]
38
  similar_items = [
39
- (df["uuid"][j], similarity_matrix[i][j])
40
  for j in range(i + 1, len(df))
41
  if similarity_matrix[i][j] > similarity_threshold
42
  ]
43
-
44
  if similar_items:
45
  clusters[current_uuid] = similar_items
46
-
47
  return clusters
48
 
49
  def analyze_clusters(df, similarity_threshold=0.9):
50
  """Analyze duplicate problem clusters."""
51
  clusters = find_similar_problems(df, similarity_threshold)
52
  detailed_analysis = {}
53
-
54
  for key, values in clusters.items():
55
  base_row = df[df["uuid"] == key].iloc[0]
56
  cluster_details = []
57
-
58
  for val, score in values:
59
  comparison_row = df[df["uuid"] == val].iloc[0]
60
 
61
  column_differences = {}
62
  for col in df.columns:
63
  if col != "uuid":
 
 
 
 
 
 
 
64
  column_differences[col] = {
65
- 'base': base_row[col],
66
- 'comparison': comparison_row[col],
67
- 'match': base_row[col] == comparison_row[col]
68
  }
69
-
70
  cluster_details.append({
71
  'uuid': val,
72
- 'similarity_score': score,
73
  'column_differences': column_differences,
74
  })
75
-
76
  detailed_analysis[key] = cluster_details
77
-
78
  return detailed_analysis
79
 
80
  # ================== STREAMLIT UI ==================
@@ -88,22 +86,18 @@ similarity_threshold = st.sidebar.slider(
88
  if st.sidebar.button("Run Deduplication Analysis"):
89
  with st.spinner("Analyzing..."):
90
  results = analyze_clusters(df, similarity_threshold)
91
-
92
  st.success("Analysis Complete!")
93
-
94
  st.subheader("📊 Duplicate Problem Clusters")
95
  for base_uuid, cluster in results.items():
96
  base_problem = df[df["uuid"] == base_uuid]["problem"].values[0]
97
  st.markdown(f"### Problem: {base_problem}")
98
-
99
  for entry in cluster:
100
  similar_problem = df[df["uuid"] == entry["uuid"]]["problem"].values[0]
101
  st.write(f"**Similar to:** {similar_problem}")
102
  st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
103
-
104
  with st.expander("Show Column Differences"):
105
  st.json(entry["column_differences"])
106
-
107
  st.markdown("---")
108
 
109
  # Export results
 
3
  import json
4
  import os
5
  from sentence_transformers import SentenceTransformer, util
 
6
  from loguru import logger
7
 
8
  # ================== CONFIGURATION ==================
 
30
  """Find similar problems using cosine similarity."""
31
  embeddings = compute_embeddings(df['problem'].tolist())
32
  similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
 
33
  clusters = {}
34
  for i in range(len(df)):
35
  current_uuid = df["uuid"][i]
36
  similar_items = [
37
+ (df["uuid"][j], float(similarity_matrix[i][j])) # Convert float32 to float
38
  for j in range(i + 1, len(df))
39
  if similarity_matrix[i][j] > similarity_threshold
40
  ]
 
41
  if similar_items:
42
  clusters[current_uuid] = similar_items
 
43
  return clusters
44
 
45
  def analyze_clusters(df, similarity_threshold=0.9):
46
  """Analyze duplicate problem clusters."""
47
  clusters = find_similar_problems(df, similarity_threshold)
48
  detailed_analysis = {}
 
49
  for key, values in clusters.items():
50
  base_row = df[df["uuid"] == key].iloc[0]
51
  cluster_details = []
 
52
  for val, score in values:
53
  comparison_row = df[df["uuid"] == val].iloc[0]
54
 
55
  column_differences = {}
56
  for col in df.columns:
57
  if col != "uuid":
58
+ base_val = base_row[col]
59
+ comp_val = comparison_row[col]
60
+ # Convert numpy types to native Python types
61
+ if hasattr(base_val, 'item'):
62
+ base_val = base_val.item()
63
+ if hasattr(comp_val, 'item'):
64
+ comp_val = comp_val.item()
65
  column_differences[col] = {
66
+ 'base': base_val,
67
+ 'comparison': comp_val,
68
+ 'match': bool(base_val == comp_val) # Convert numpy bool to Python bool
69
  }
 
70
  cluster_details.append({
71
  'uuid': val,
72
+ 'similarity_score': float(score), # Convert float32 to float
73
  'column_differences': column_differences,
74
  })
 
75
  detailed_analysis[key] = cluster_details
 
76
  return detailed_analysis
77
 
78
  # ================== STREAMLIT UI ==================
 
86
  if st.sidebar.button("Run Deduplication Analysis"):
87
  with st.spinner("Analyzing..."):
88
  results = analyze_clusters(df, similarity_threshold)
 
89
  st.success("Analysis Complete!")
90
+
91
  st.subheader("📊 Duplicate Problem Clusters")
92
  for base_uuid, cluster in results.items():
93
  base_problem = df[df["uuid"] == base_uuid]["problem"].values[0]
94
  st.markdown(f"### Problem: {base_problem}")
 
95
  for entry in cluster:
96
  similar_problem = df[df["uuid"] == entry["uuid"]]["problem"].values[0]
97
  st.write(f"**Similar to:** {similar_problem}")
98
  st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
 
99
  with st.expander("Show Column Differences"):
100
  st.json(entry["column_differences"])
 
101
  st.markdown("---")
102
 
103
  # Export results