dsleo committed on
Commit
532392b
·
1 Parent(s): 9bdd6f6

first stab at app

Browse files
Files changed (2) hide show
  1. app.py +134 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import json
4
+ import os
5
+ from sentence_transformers import SentenceTransformer, util
6
+ from openai import OpenAI
7
+ from loguru import logger
8
+
9
# ================== CONFIGURATION ==================
# Page title and wide layout for the whole app.
st.set_page_config(page_title="Problem Deduplication Explorer", layout="wide")

# Name of the pre-trained sentence-transformers model used for embeddings.
MODEL_NAME = "all-MiniLM-L6-v2"


@st.cache_resource
def _load_model(name):
    """Instantiate the SentenceTransformer called *name* once and reuse it.

    Streamlit re-executes this script on every widget interaction; caching
    the model as a resource avoids rebuilding it on each rerun.
    """
    return SentenceTransformer(name)


# Module-level handle used by compute_embeddings.
model = _load_model(MODEL_NAME)
15
+
16
+ # Load preloaded dataset
17
@st.cache_data
def load_data():
    """Return the built-in sample problems as a pandas DataFrame.

    Each row describes one problem with the columns ``uuid``, ``problem``,
    ``source``, ``question_type`` and ``problem_type``.
    """
    columns = ("uuid", "problem", "source", "question_type", "problem_type")
    rows = (
        (
            "350d6834-3231-5d23-89e9-c7dc0f3fde0b",
            "A function $f$ has the property that $f(3x-1)=x^2+x+1$ for all real numbers $x$. What is $f(5)$?",
            "aops-wiki",
            "MCQ",
            "Algebra",
        ),
        (
            "b67e9cf9-8b3a-5a34-a118-4ce2aeb2c3d8",
            "A function $f$ has the property that $f(3x-1)=x^2+x+1$ for all real numbers $x$. What is $f(5)$?",
            "MATH-train",
            "math-word-problem",
            "Algebra",
        ),
    )
    # Rebuild one dict per row so the frame is constructed from records.
    records = [dict(zip(columns, row)) for row in rows]
    return pd.DataFrame(records)


df = load_data()
38
+
39
+ # ================== FUNCTION DEFINITIONS ==================
40
def compute_embeddings(problems):
    """Encode *problems* with the module-level sentence-embedding model.

    The encoder is called with ``normalize_embeddings=True`` and its result
    is returned unchanged.
    """
    encode_options = {"normalize_embeddings": True}
    return model.encode(problems, **encode_options)
43
+
44
def find_similar_problems(df, similarity_threshold=0.9):
    """Find pairs of problems whose embedding cosine similarity is high.

    Args:
        df: DataFrame with at least the columns ``"uuid"`` and ``"problem"``.
        similarity_threshold: A pair is reported only when its similarity is
            strictly greater than this value.

    Returns:
        Dict mapping a problem's uuid to a list of ``(other_uuid, score)``
        tuples for the *later* rows that are similar to it. Problems with no
        match are omitted. Scores are plain Python floats so the result can
        be JSON-serialised.
    """
    # Work on positional lists: df["uuid"][i] is a label lookup and fails
    # when the frame's index is not 0..n-1 (e.g. after filtering).
    uuids = df["uuid"].tolist()
    row_count = len(uuids)
    if row_count < 2:
        # Nothing to compare; skip the embedding call entirely.
        return {}

    embeddings = compute_embeddings(df["problem"].tolist())
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()

    clusters = {}
    for i, current_uuid in enumerate(uuids):
        # Only look at j > i so each pair is reported once.
        similar_items = [
            (uuids[j], float(similarity_matrix[i][j]))
            for j in range(i + 1, row_count)
            if similarity_matrix[i][j] > similarity_threshold
        ]
        if similar_items:
            clusters[current_uuid] = similar_items

    return clusters
62
+
63
def _to_builtin(value):
    """Return zero-dimensional NumPy-style scalars as plain Python objects."""
    if getattr(value, "ndim", None) == 0 and callable(getattr(value, "item", None)):
        return value.item()
    return value


def analyze_clusters(df, similarity_threshold=0.9):
    """Describe every duplicate cluster column by column.

    Args:
        df: DataFrame with a ``"uuid"`` column plus the columns to compare.
        similarity_threshold: Forwarded to ``find_similar_problems``.

    Returns:
        Dict mapping each base uuid to a list of dicts, one per similar
        problem, with the keys ``'uuid'``, ``'similarity_score'`` and
        ``'column_differences'``. The latter maps every non-uuid column to
        its ``'base'`` value, ``'comparison'`` value and a ``'match'`` flag.
        NumPy scalars are converted to built-in types so the structure can be
        passed to ``json.dumps``.
    """
    clusters = find_similar_problems(df, similarity_threshold)

    # Index rows by uuid once instead of scanning the frame for every lookup.
    # Keeping the first occurrence mirrors df[df["uuid"] == key].iloc[0].
    rows_by_uuid = df.drop_duplicates(subset="uuid").set_index("uuid")
    compared_columns = [col for col in df.columns if col != "uuid"]

    detailed_analysis = {}
    for key, values in clusters.items():
        base_row = rows_by_uuid.loc[key]
        cluster_details = []

        for val, score in values:
            comparison_row = rows_by_uuid.loc[val]
            column_differences = {
                col: {
                    'base': _to_builtin(base_row[col]),
                    'comparison': _to_builtin(comparison_row[col]),
                    'match': _to_builtin(base_row[col] == comparison_row[col]),
                }
                for col in compared_columns
            }
            cluster_details.append({
                'uuid': val,
                'similarity_score': float(score),
                'column_differences': column_differences,
            })

        detailed_analysis[key] = cluster_details

    return detailed_analysis
93
+
94
# ================== STREAMLIT UI ==================
st.title("🔍 Problem Deduplication Explorer")

st.sidebar.header("Settings")
similarity_threshold = st.sidebar.slider(
    "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
)


def _json_default(value):
    """Fallback for json.dumps: unwrap NumPy-style scalars, reject the rest."""
    if getattr(value, "ndim", None) == 0 and callable(getattr(value, "item", None)):
        return value.item()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


if st.sidebar.button("Run Deduplication Analysis"):
    with st.spinner("Analyzing..."):
        # Keep the results in session state: a button is only True during the
        # run triggered by its click, so clicking the download button below
        # would otherwise rerun the script and drop the results.
        st.session_state["dedup_results"] = analyze_clusters(
            df, similarity_threshold
        )

    st.success("Analysis Complete!")

if "dedup_results" in st.session_state:
    results = st.session_state["dedup_results"]

    # One uuid -> problem mapping instead of filtering the frame per lookup;
    # the first occurrence wins, like df[df["uuid"] == u]["problem"].values[0].
    problem_by_uuid = (
        df.drop_duplicates(subset="uuid").set_index("uuid")["problem"].to_dict()
    )

    st.subheader("📊 Duplicate Problem Clusters")
    if not results:
        st.info("No duplicate clusters found at this threshold.")

    for base_uuid, cluster in results.items():
        st.markdown(f"### Problem: {problem_by_uuid[base_uuid]}")

        for entry in cluster:
            st.write(f"**Similar to:** {problem_by_uuid[entry['uuid']]}")
            st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")

            with st.expander("Show Column Differences"):
                st.json(entry["column_differences"])

            st.markdown("---")

    # Export results; the default hook keeps NumPy scalars from breaking dumps.
    st.sidebar.download_button(
        label="Download Results as JSON",
        data=json.dumps(results, indent=2, default=_json_default),
        file_name="deduplication_results.json",
        mime="application/json"
    )

# ================== DATAFRAME DISPLAY ==================
st.subheader("📄 Explore the Dataset")
st.dataframe(df)
134
+
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ sentence-transformers
4
+ openai
5
+ loguru