Spaces:

dsleo
/

math-dedup

Sleeping

App Files Files Community

dsleo committed on Feb 6

Commit

3e140a6

verified ·

1 Parent(s): e13f19b

better

Browse files

Files changed (1) hide show

app.py +80 -60

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import streamlit as st
 import pandas as pd
 import json
 import os
-import zipfile
 from sentence_transformers import SentenceTransformer, util
 from loguru import logger
@@ -17,14 +17,15 @@ model = SentenceTransformer(MODEL_NAME)
 @st.cache_data
 def load_data():
     file_path = "data/merged_dataset.csv.zip"
-    with zipfile.ZipFile(file_path, 'r') as zip_ref:
-        zip_ref.printdir()
-        zip_ref.extractall("data/extracted")
-        df = pd.read_csv("data/extracted/merged_dataset.csv")
     return df
 df = load_data()
 # ================== FUNCTION DEFINITIONS ==================
 def compute_embeddings(problems):
     """Compute sentence embeddings."""
@@ -34,49 +35,38 @@ def find_similar_problems(df, similarity_threshold=0.9):
     """Find similar problems using cosine similarity."""
     embeddings = compute_embeddings(df['problem'].tolist())
     similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
-    clusters = {}
     for i in range(len(df)):
-        current_uuid = df["uuid"][i]
-        similar_items = [
-            (df["uuid"][j], float(similarity_matrix[i][j]))  # Convert float32 to float
-            for j in range(i + 1, len(df))
-            if similarity_matrix[i][j] > similarity_threshold
-        ]
-        if similar_items:
-            clusters[current_uuid] = similar_items
-    return clusters
 def analyze_clusters(df, similarity_threshold=0.9):
     """Analyze duplicate problem clusters."""
-    clusters = find_similar_problems(df, similarity_threshold)
-    detailed_analysis = {}
-    for key, values in clusters.items():
-        base_row = df[df["uuid"] == key].iloc[0]
-        cluster_details = []
-        for val, score in values:
-            comparison_row = df[df["uuid"] == val].iloc[0]
-            column_differences = {}
-            for col in df.columns:
-                if col != "uuid":
-                    base_val = base_row[col]
-                    comp_val = comparison_row[col]
-                    # Convert numpy types to native Python types
-                    if hasattr(base_val, 'item'):
-                        base_val = base_val.item()
-                    if hasattr(comp_val, 'item'):
-                        comp_val = comp_val.item()
-                    column_differences[col] = {
-                        'base': base_val,
-                        'comparison': comp_val,
-                        'match': bool(base_val == comp_val)  # Convert numpy bool to Python bool
-                    }
-            cluster_details.append({
-                'uuid': val,
-                'similarity_score': float(score),  # Convert float32 to float
-                'column_differences': column_differences,
-            })
-        detailed_analysis[key] = cluster_details
     return detailed_analysis
 # ================== STREAMLIT UI ==================
@@ -87,23 +77,58 @@ similarity_threshold = st.sidebar.slider(
     "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
 )
 if st.sidebar.button("Run Deduplication Analysis"):
     with st.spinner("Analyzing..."):
         results = analyze_clusters(df, similarity_threshold)
     st.success("Analysis Complete!")
-    st.subheader("📊 Duplicate Problem Clusters")
-    for base_uuid, cluster in results.items():
-        base_problem = df[df["uuid"] == base_uuid]["problem"].values[0]
         st.markdown(f"### Problem: {base_problem}")
-        for entry in cluster:
-            similar_problem = df[df["uuid"] == entry["uuid"]]["problem"].values[0]
-            st.write(f"**Similar to:** {similar_problem}")
-            st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
-            with st.expander("Show Column Differences"):
-                st.json(entry["column_differences"])
-            st.markdown("---")
     # Export results
     st.sidebar.download_button(
         label="Download Results as JSON",
@@ -111,8 +136,3 @@ if st.sidebar.button("Run Deduplication Analysis"):
         file_name="deduplication_results.json",
         mime="application/json"
     )
-# ================== DATAFRAME DISPLAY ==================
-st.subheader("📄 Explore the Dataset")
-st.dataframe(df)

 import pandas as pd
 import json
 import os
+import gzip
 from sentence_transformers import SentenceTransformer, util
 from loguru import logger
 @st.cache_data
 def load_data():
     file_path = "data/merged_dataset.csv.zip"
+    with gzip.open(file_path, "rt") as f:
+        df = pd.read_csv(f)
     return df
 df = load_data()
+display_columns = ["problem", "source", "question_type", "problem_type"]
+df_filtered = df[display_columns]
 # ================== FUNCTION DEFINITIONS ==================
 def compute_embeddings(problems):
     """Compute sentence embeddings."""
     """Find similar problems using cosine similarity."""
     embeddings = compute_embeddings(df['problem'].tolist())
     similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
+    pairs = []
     for i in range(len(df)):
+        for j in range(i + 1, len(df)):
+            score = similarity_matrix[i][j]
+            if score > similarity_threshold:
+                pairs.append((df.iloc[i]["uuid"], df.iloc[j]["uuid"], float(score)))
+    return sorted(pairs, key=lambda x: x[2], reverse=True)  # Sort by similarity score
 def analyze_clusters(df, similarity_threshold=0.9):
     """Analyze duplicate problem clusters."""
+    pairs = find_similar_problems(df, similarity_threshold)
+    detailed_analysis = []
+    for base_uuid, comp_uuid, score in pairs:
+        base_row = df[df["uuid"] == base_uuid].iloc[0]
+        comp_row = df[df["uuid"] == comp_uuid].iloc[0]
+        column_differences = {}
+        for col in df.columns:
+            if col != "uuid":
+                base_val = base_row[col]
+                comp_val = comp_row[col]
+                column_differences[col] = {
+                    'base': base_val,
+                    'comparison': comp_val,
+                    'match': bool(base_val == comp_val)
+                }
+        detailed_analysis.append({
+            'base_uuid': base_uuid,
+            'comp_uuid': comp_uuid,
+            'similarity_score': score,
+            'column_differences': column_differences,
+        })
     return detailed_analysis
 # ================== STREAMLIT UI ==================
     "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
 )
+# Display first 5 rows of dataset
+st.subheader("📄 Explore the Dataset")
+st.dataframe(df_filtered.head(5))
 if st.sidebar.button("Run Deduplication Analysis"):
     with st.spinner("Analyzing..."):
         results = analyze_clusters(df, similarity_threshold)
     st.success("Analysis Complete!")
+    st.subheader("📊 Duplicate Problem Pairs")
+    # Filtering options
+    sources = df["source"].unique().tolist()
+    question_types = df["question_type"].unique().tolist()
+    selected_source = st.sidebar.selectbox("Filter by Source", [None] + sources)
+    selected_qtype = st.sidebar.selectbox("Filter by Question Type", [None] + question_types)
+    if selected_source:
+        results = [r for r in results if df[df["uuid"] == r["base_uuid"]]["source"].values[0] == selected_source]
+    if selected_qtype:
+        results = [r for r in results if df[df["uuid"] == r["base_uuid"]]["question_type"].values[0] == selected_qtype]
+    # Display top 5 initially
+    num_display = 5
+    shown_results = results[:num_display]
+    for entry in shown_results:
+        base_problem = df[df["uuid"] == entry["base_uuid"]]["problem"].values[0]
+        similar_problem = df[df["uuid"] == entry["comp_uuid"]]["problem"].values[0]
         st.markdown(f"### Problem: {base_problem}")
+        st.write(f"**Similar to:** {similar_problem}")
+        st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
+        with st.expander("Show Column Differences"):
+            st.json(entry["column_differences"])
+        st.markdown("---")
+    if len(results) > num_display:
+        if st.button("Show More Results"):
+            extra_results = results[num_display:num_display * 2]
+            for entry in extra_results:
+                base_problem = df[df["uuid"] == entry["base_uuid"]]["problem"].values[0]
+                similar_problem = df[df["uuid"] == entry["comp_uuid"]]["problem"].values[0]
+                st.markdown(f"### Problem: {base_problem}")
+                st.write(f"**Similar to:** {similar_problem}")
+                st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
+                with st.expander("Show Column Differences"):
+                    st.json(entry["column_differences"])
+                st.markdown("---")
     # Export results
     st.sidebar.download_button(
         label="Download Results as JSON",
         file_name="deduplication_results.json",
         mime="application/json"
     )