dsleo committed on
Commit
f83d20c
·
verified ·
1 Parent(s): c6f32b1

optim + logging

Browse files
Files changed (1) hide show
  1. app.py +36 -10
app.py CHANGED
@@ -1,7 +1,9 @@
1
  import streamlit as st
2
  import pandas as pd
 
3
  import json
4
  import os
 
5
  import zipfile
6
  from sentence_transformers import SentenceTransformer, util
7
  from loguru import logger
@@ -26,24 +28,48 @@ def load_data():
26
df = load_data()

# Columns surfaced in the dataset preview table.
display_columns = ["uuid", "problem", "source", "question_type", "problem_type"]
df_filtered = df.loc[:, display_columns]
30
 
31
  # ================== FUNCTION DEFINITIONS ==================
32
def compute_embeddings(problems):
    """Encode problem texts into L2-normalized sentence embeddings.

    Relies on the module-level `model` (a SentenceTransformer).
    """
    encoded = model.encode(problems, normalize_embeddings=True)
    return encoded
35
 
 
36
def find_similar_problems(df, similarity_threshold=0.9):
    """Find pairs of near-duplicate problems using cosine similarity.

    Args:
        df: DataFrame with at least 'uuid' and 'problem' columns.
        similarity_threshold: pairs scoring strictly above this value are kept.

    Returns:
        List of (uuid_a, uuid_b, score) tuples sorted by score, descending.
    """
    embeddings = compute_embeddings(df['problem'].tolist())
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
    # Hoist uuids out of the O(n^2) loop: repeated df.iloc[i]["uuid"] lookups
    # are pandas-level positional indexing and far costlier than list access.
    uuids = df["uuid"].tolist()
    n = len(uuids)
    pairs = []
    for i in range(n):
        row = similarity_matrix[i]  # hoist row lookup out of inner loop
        for j in range(i + 1, n):
            score = row[j]
            if score > similarity_threshold:
                pairs.append((uuids[i], uuids[j], float(score)))
    return sorted(pairs, key=lambda x: x[2], reverse=True)  # Sort by similarity score
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  def analyze_clusters(df, similarity_threshold=0.9):
49
  """Analyze duplicate problem clusters."""
@@ -81,7 +107,7 @@ similarity_threshold = st.sidebar.slider(
81
 
82
  # Display first 5 rows of dataset
83
  st.subheader("πŸ“„ Explore the Dataset")
84
- st.dataframe(df_filtered.head(5))
85
 
86
  if st.sidebar.button("Run Deduplication Analysis"):
87
  with st.spinner("Analyzing..."):
 
1
  import streamlit as st
2
  import pandas as pd
3
+ import numpy as np
4
  import json
5
  import os
6
+ import time
7
  import zipfile
8
  from sentence_transformers import SentenceTransformer, util
9
  from loguru import logger
 
28
df = load_data()

# Keep only the columns the app displays and analyzes.
display_columns = ["uuid", "problem", "source", "question_type", "problem_type"]
df = df.loc[:, display_columns]
32
 
33
  # ================== FUNCTION DEFINITIONS ==================
34
def compute_embeddings(problems):
    """Return normalized sentence embeddings for a list of problem strings."""
    return model.encode(
        problems,
        normalize_embeddings=True,
    )
37
 
38
+
39
def find_similar_problems(df, similarity_threshold=0.9):
    """Find pairs of near-duplicate problems using cosine similarity.

    Args:
        df: DataFrame with at least 'uuid' and 'problem' columns.
        similarity_threshold: pairs scoring strictly above this value are kept.

    Returns:
        List of (uuid_a, uuid_b, score) tuples sorted by score, descending.
    """
    st.status("πŸ”„ Computing problem embeddings...")
    start_time = time.time()
    embeddings = compute_embeddings(df['problem'].tolist())
    st.success("βœ… Embeddings computed!", icon="βœ…")

    st.status("πŸ”„ Computing cosine similarity matrix...")
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
    st.success("βœ… Similarity matrix computed!", icon="βœ…")

    st.status("πŸ”„ Filtering similar problems...")
    # Vectorized upper-triangle scan: every unordered pair (i < j) scored once.
    i_indices, j_indices = np.triu_indices(len(df), k=1)
    similarity_scores = similarity_matrix[i_indices, j_indices]

    mask = similarity_scores > similarity_threshold
    filtered_i = i_indices[mask]
    filtered_j = j_indices[mask]
    filtered_scores = similarity_scores[mask]

    # Hoist uuids into a plain list once: per-pair df.iloc[i]["uuid"] lookups
    # are pandas-level positional indexing and dominate when many pairs match.
    uuids = df["uuid"].tolist()
    pairs = [
        (uuids[i], uuids[j], float(score))
        for i, j, score in zip(filtered_i, filtered_j, filtered_scores)
    ]

    sorted_pairs = sorted(pairs, key=lambda x: x[2], reverse=True)

    st.success(f"βœ… Analysis complete! Found {len(sorted_pairs)} similar problems in {time.time() - start_time:.2f}s", icon="πŸŽ‰")

    return sorted_pairs
73
 
74
  def analyze_clusters(df, similarity_threshold=0.9):
75
  """Analyze duplicate problem clusters."""
 
107
 
108
  # Display first 5 rows of dataset
109
  st.subheader("πŸ“„ Explore the Dataset")
110
+ st.dataframe(df.head(5))
111
 
112
  if st.sidebar.button("Run Deduplication Analysis"):
113
  with st.spinner("Analyzing..."):