dsleo committed on
Commit
3e140a6
·
verified ·
1 Parent(s): e13f19b
Files changed (1) hide show
  1. app.py +80 -60
app.py CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
2
  import pandas as pd
3
  import json
4
  import os
5
- import zipfile
6
  from sentence_transformers import SentenceTransformer, util
7
  from loguru import logger
8
 
@@ -17,14 +17,15 @@ model = SentenceTransformer(MODEL_NAME)
17
  @st.cache_data
18
  def load_data():
19
  file_path = "data/merged_dataset.csv.zip"
20
- with zipfile.ZipFile(file_path, 'r') as zip_ref:
21
- zip_ref.printdir()
22
- zip_ref.extractall("data/extracted")
23
- df = pd.read_csv("data/extracted/merged_dataset.csv")
24
  return df
25
 
26
  df = load_data()
27
 
 
 
 
28
  # ================== FUNCTION DEFINITIONS ==================
29
  def compute_embeddings(problems):
30
  """Compute sentence embeddings."""
@@ -34,49 +35,38 @@ def find_similar_problems(df, similarity_threshold=0.9):
34
  """Find similar problems using cosine similarity."""
35
  embeddings = compute_embeddings(df['problem'].tolist())
36
  similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
37
- clusters = {}
38
  for i in range(len(df)):
39
- current_uuid = df["uuid"][i]
40
- similar_items = [
41
- (df["uuid"][j], float(similarity_matrix[i][j])) # Convert float32 to float
42
- for j in range(i + 1, len(df))
43
- if similarity_matrix[i][j] > similarity_threshold
44
- ]
45
- if similar_items:
46
- clusters[current_uuid] = similar_items
47
- return clusters
48
 
49
  def analyze_clusters(df, similarity_threshold=0.9):
50
  """Analyze duplicate problem clusters."""
51
- clusters = find_similar_problems(df, similarity_threshold)
52
- detailed_analysis = {}
53
- for key, values in clusters.items():
54
- base_row = df[df["uuid"] == key].iloc[0]
55
- cluster_details = []
56
- for val, score in values:
57
- comparison_row = df[df["uuid"] == val].iloc[0]
58
-
59
- column_differences = {}
60
- for col in df.columns:
61
- if col != "uuid":
62
- base_val = base_row[col]
63
- comp_val = comparison_row[col]
64
- # Convert numpy types to native Python types
65
- if hasattr(base_val, 'item'):
66
- base_val = base_val.item()
67
- if hasattr(comp_val, 'item'):
68
- comp_val = comp_val.item()
69
- column_differences[col] = {
70
- 'base': base_val,
71
- 'comparison': comp_val,
72
- 'match': bool(base_val == comp_val) # Convert numpy bool to Python bool
73
- }
74
- cluster_details.append({
75
- 'uuid': val,
76
- 'similarity_score': float(score), # Convert float32 to float
77
- 'column_differences': column_differences,
78
- })
79
- detailed_analysis[key] = cluster_details
80
  return detailed_analysis
81
 
82
  # ================== STREAMLIT UI ==================
@@ -87,23 +77,58 @@ similarity_threshold = st.sidebar.slider(
87
  "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
88
  )
89
 
 
 
 
 
90
  if st.sidebar.button("Run Deduplication Analysis"):
91
  with st.spinner("Analyzing..."):
92
  results = analyze_clusters(df, similarity_threshold)
93
  st.success("Analysis Complete!")
94
 
95
- st.subheader("📊 Duplicate Problem Clusters")
96
- for base_uuid, cluster in results.items():
97
- base_problem = df[df["uuid"] == base_uuid]["problem"].values[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  st.markdown(f"### Problem: {base_problem}")
99
- for entry in cluster:
100
- similar_problem = df[df["uuid"] == entry["uuid"]]["problem"].values[0]
101
- st.write(f"**Similar to:** {similar_problem}")
102
- st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
103
- with st.expander("Show Column Differences"):
104
- st.json(entry["column_differences"])
105
- st.markdown("---")
106
-
 
 
 
 
 
 
 
 
 
 
 
 
107
  # Export results
108
  st.sidebar.download_button(
109
  label="Download Results as JSON",
@@ -111,8 +136,3 @@ if st.sidebar.button("Run Deduplication Analysis"):
111
  file_name="deduplication_results.json",
112
  mime="application/json"
113
  )
114
-
115
- # ================== DATAFRAME DISPLAY ==================
116
- st.subheader("📄 Explore the Dataset")
117
- st.dataframe(df)
118
-
 
2
  import pandas as pd
3
  import json
4
  import os
5
+ import gzip
6
  from sentence_transformers import SentenceTransformer, util
7
  from loguru import logger
8
 
 
17
  @st.cache_data
18
  def load_data():
19
  file_path = "data/merged_dataset.csv.zip"
20
+ with gzip.open(file_path, "rt") as f:
21
+ df = pd.read_csv(f)
 
 
22
  return df
23
 
24
  df = load_data()
25
 
26
+ display_columns = ["problem", "source", "question_type", "problem_type"]
27
+ df_filtered = df[display_columns]
28
+
29
  # ================== FUNCTION DEFINITIONS ==================
30
  def compute_embeddings(problems):
31
  """Compute sentence embeddings."""
 
35
  """Find similar problems using cosine similarity."""
36
  embeddings = compute_embeddings(df['problem'].tolist())
37
  similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
38
+ pairs = []
39
  for i in range(len(df)):
40
+ for j in range(i + 1, len(df)):
41
+ score = similarity_matrix[i][j]
42
+ if score > similarity_threshold:
43
+ pairs.append((df.iloc[i]["uuid"], df.iloc[j]["uuid"], float(score)))
44
+ return sorted(pairs, key=lambda x: x[2], reverse=True) # Sort by similarity score
 
 
 
 
45
 
46
  def analyze_clusters(df, similarity_threshold=0.9):
47
  """Analyze duplicate problem clusters."""
48
+ pairs = find_similar_problems(df, similarity_threshold)
49
+ detailed_analysis = []
50
+ for base_uuid, comp_uuid, score in pairs:
51
+ base_row = df[df["uuid"] == base_uuid].iloc[0]
52
+ comp_row = df[df["uuid"] == comp_uuid].iloc[0]
53
+
54
+ column_differences = {}
55
+ for col in df.columns:
56
+ if col != "uuid":
57
+ base_val = base_row[col]
58
+ comp_val = comp_row[col]
59
+ column_differences[col] = {
60
+ 'base': base_val,
61
+ 'comparison': comp_val,
62
+ 'match': bool(base_val == comp_val)
63
+ }
64
+ detailed_analysis.append({
65
+ 'base_uuid': base_uuid,
66
+ 'comp_uuid': comp_uuid,
67
+ 'similarity_score': score,
68
+ 'column_differences': column_differences,
69
+ })
 
 
 
 
 
 
 
70
  return detailed_analysis
71
 
72
  # ================== STREAMLIT UI ==================
 
77
  "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
78
  )
79
 
80
+ # Display first 5 rows of dataset
81
+ st.subheader("📄 Explore the Dataset")
82
+ st.dataframe(df_filtered.head(5))
83
+
84
  if st.sidebar.button("Run Deduplication Analysis"):
85
  with st.spinner("Analyzing..."):
86
  results = analyze_clusters(df, similarity_threshold)
87
  st.success("Analysis Complete!")
88
 
89
+ st.subheader("📊 Duplicate Problem Pairs")
90
+
91
+ # Filtering options
92
+ sources = df["source"].unique().tolist()
93
+ question_types = df["question_type"].unique().tolist()
94
+
95
+ selected_source = st.sidebar.selectbox("Filter by Source", [None] + sources)
96
+ selected_qtype = st.sidebar.selectbox("Filter by Question Type", [None] + question_types)
97
+
98
+ if selected_source:
99
+ results = [r for r in results if df[df["uuid"] == r["base_uuid"]]["source"].values[0] == selected_source]
100
+ if selected_qtype:
101
+ results = [r for r in results if df[df["uuid"] == r["base_uuid"]]["question_type"].values[0] == selected_qtype]
102
+
103
+ # Display top 5 initially
104
+ num_display = 5
105
+ shown_results = results[:num_display]
106
+
107
+ for entry in shown_results:
108
+ base_problem = df[df["uuid"] == entry["base_uuid"]]["problem"].values[0]
109
+ similar_problem = df[df["uuid"] == entry["comp_uuid"]]["problem"].values[0]
110
+
111
  st.markdown(f"### Problem: {base_problem}")
112
+ st.write(f"**Similar to:** {similar_problem}")
113
+ st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
114
+ with st.expander("Show Column Differences"):
115
+ st.json(entry["column_differences"])
116
+ st.markdown("---")
117
+
118
+ if len(results) > num_display:
119
+ if st.button("Show More Results"):
120
+ extra_results = results[num_display:num_display * 2]
121
+ for entry in extra_results:
122
+ base_problem = df[df["uuid"] == entry["base_uuid"]]["problem"].values[0]
123
+ similar_problem = df[df["uuid"] == entry["comp_uuid"]]["problem"].values[0]
124
+
125
+ st.markdown(f"### Problem: {base_problem}")
126
+ st.write(f"**Similar to:** {similar_problem}")
127
+ st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
128
+ with st.expander("Show Column Differences"):
129
+ st.json(entry["column_differences"])
130
+ st.markdown("---")
131
+
132
  # Export results
133
  st.sidebar.download_button(
134
  label="Download Results as JSON",
 
136
  file_name="deduplication_results.json",
137
  mime="application/json"
138
  )