Spaces:

dsleo
/

math-dedup

Sleeping

App Files Files Community

dsleo committed on Feb 6

Commit

532392b

1 Parent(s): 9bdd6f6

first stab at app

Browse files

Files changed (2) hide show

app.py +134 -0
requirements.txt +5 -0

app.py ADDED Viewed

	@@ -0,0 +1,134 @@

+import streamlit as st
+import pandas as pd
+import json
+import os
+from sentence_transformers import SentenceTransformer, util
+from openai import OpenAI
+from loguru import logger
+# ================== CONFIGURATION ==================
+st.set_page_config(page_title="Problem Deduplication Explorer", layout="wide")
+# Load a pre-trained model for embeddings
+MODEL_NAME = "all-MiniLM-L6-v2"
+model = SentenceTransformer(MODEL_NAME)
+# Load preloaded dataset
+@st.cache_data
+def load_data():
+    data = [
+        {
+            "uuid": "350d6834-3231-5d23-89e9-c7dc0f3fde0b",
+            "problem": "A function $f$ has the property that $f(3x-1)=x^2+x+1$ for all real numbers $x$. What is $f(5)$?",
+            "source": "aops-wiki",
+            "question_type": "MCQ",
+            "problem_type": "Algebra"
+        },
+        {
+            "uuid": "b67e9cf9-8b3a-5a34-a118-4ce2aeb2c3d8",
+            "problem": "A function $f$ has the property that $f(3x-1)=x^2+x+1$ for all real numbers $x$. What is $f(5)$?",
+            "source": "MATH-train",
+            "question_type": "math-word-problem",
+            "problem_type": "Algebra"
+        },
+    ]
+    return pd.DataFrame(data)
+df = load_data()
+# ================== FUNCTION DEFINITIONS ==================
+def compute_embeddings(problems):
+    """Compute sentence embeddings."""
+    return model.encode(problems, normalize_embeddings=True)
+def find_similar_problems(df, similarity_threshold=0.9):
+    """Find similar problems using cosine similarity."""
+    embeddings = compute_embeddings(df['problem'].tolist())
+    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
+    clusters = {}
+    for i in range(len(df)):
+        current_uuid = df["uuid"][i]
+        similar_items = [
+            (df["uuid"][j], similarity_matrix[i][j])
+            for j in range(i + 1, len(df))
+            if similarity_matrix[i][j] > similarity_threshold
+        ]
+        if similar_items:
+            clusters[current_uuid] = similar_items
+    return clusters
+def analyze_clusters(df, similarity_threshold=0.9):
+    """Analyze duplicate problem clusters."""
+    clusters = find_similar_problems(df, similarity_threshold)
+    detailed_analysis = {}
+    for key, values in clusters.items():
+        base_row = df[df["uuid"] == key].iloc[0]
+        cluster_details = []
+        for val, score in values:
+            comparison_row = df[df["uuid"] == val].iloc[0]
+            column_differences = {}
+            for col in df.columns:
+                if col != "uuid":
+                    column_differences[col] = {
+                        'base': base_row[col],
+                        'comparison': comparison_row[col],
+                        'match': base_row[col] == comparison_row[col]
+                    }
+            cluster_details.append({
+                'uuid': val,
+                'similarity_score': score,
+                'column_differences': column_differences,
+            })
+        detailed_analysis[key] = cluster_details
+    return detailed_analysis
+# ================== STREAMLIT UI ==================
+st.title("🔍 Problem Deduplication Explorer")
+st.sidebar.header("Settings")
+similarity_threshold = st.sidebar.slider(
+    "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
+)
+if st.sidebar.button("Run Deduplication Analysis"):
+    with st.spinner("Analyzing..."):
+        results = analyze_clusters(df, similarity_threshold)
+    st.success("Analysis Complete!")
+    st.subheader("📊 Duplicate Problem Clusters")
+    for base_uuid, cluster in results.items():
+        base_problem = df[df["uuid"] == base_uuid]["problem"].values[0]
+        st.markdown(f"### Problem: {base_problem}")
+        for entry in cluster:
+            similar_problem = df[df["uuid"] == entry["uuid"]]["problem"].values[0]
+            st.write(f"**Similar to:** {similar_problem}")
+            st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
+            with st.expander("Show Column Differences"):
+                st.json(entry["column_differences"])
+            st.markdown("---")
+    # Export results
+    st.sidebar.download_button(
+        label="Download Results as JSON",
+        data=json.dumps(results, indent=2),
+        file_name="deduplication_results.json",
+        mime="application/json"
+    )
+# ================== DATAFRAME DISPLAY ==================
+st.subheader("📄 Explore the Dataset")
+st.dataframe(df)

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+streamlit
+pandas
+sentence-transformers
+openai
+loguru