dsleo committed on
Commit
640a2eb
·
verified ·
1 Parent(s): 12311e1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -15
app.py CHANGED
@@ -36,43 +36,39 @@ def compute_embeddings(problems):
36
  return model.encode(problems, normalize_embeddings=True)
37
 
38
  def find_similar_problems(df, similarity_threshold=0.9):
39
- """Find similar problems using cosine similarity, optimized with clear UI updates."""
40
 
41
- status_msgs = []
42
-
43
- msg = st.status("πŸ”„ Computing problem embeddings...")
44
- status_msgs.append(msg)
45
  start_time = time.time()
46
  embeddings = compute_embeddings(df['problem'].tolist())
47
-
48
- msg = st.status("πŸ”„ Computing cosine similarity matrix...")
49
- status_msgs.append(msg)
50
  similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
 
51
 
52
- msg = st.status("πŸ”„ Filtering similar problems...")
53
- status_msgs.append(msg)
54
-
55
  num_problems = len(df)
56
  upper_triangle_indices = np.triu_indices(num_problems, k=1)
57
 
 
58
  i_indices, j_indices = upper_triangle_indices
59
  similarity_scores = similarity_matrix[i_indices, j_indices]
60
 
 
61
  mask = similarity_scores > similarity_threshold
62
  filtered_i = i_indices[mask]
63
  filtered_j = j_indices[mask]
64
  filtered_scores = similarity_scores[mask]
65
 
 
66
  pairs = [
67
  (df.iloc[i]["uuid"], df.iloc[j]["uuid"], float(score))
68
  for i, j, score in zip(filtered_i, filtered_j, filtered_scores)
69
  ]
70
 
71
  sorted_pairs = sorted(pairs, key=lambda x: x[2], reverse=True)
72
-
73
- for msg in status_msgs:
74
- msg.empty()
75
-
76
  st.success(f"βœ… Analysis complete! Found {len(sorted_pairs)} similar problems in {time.time() - start_time:.2f}s", icon="πŸŽ‰")
77
 
78
  return sorted_pairs
 
36
  return model.encode(problems, normalize_embeddings=True)
37
 
38
  def find_similar_problems(df, similarity_threshold=0.9):
39
+ """Find similar problems using cosine similarity, optimized for speed."""
40
 
41
+ st.status("πŸ”„ Computing problem embeddings...")
 
 
 
42
  start_time = time.time()
43
  embeddings = compute_embeddings(df['problem'].tolist())
44
+ st.success("βœ… Embeddings computed!", icon="βœ…")
45
+
46
+ st.status("πŸ”„ Computing cosine similarity matrix...")
47
  similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
48
+ st.success("βœ… Similarity matrix computed!", icon="βœ…")
49
 
50
+ # Use numpy.triu_indices to efficiently get upper-triangle indices (excluding diagonal)
 
 
51
  num_problems = len(df)
52
  upper_triangle_indices = np.triu_indices(num_problems, k=1)
53
 
54
+ st.status("πŸ”„ Filtering similar problems...")
55
  i_indices, j_indices = upper_triangle_indices
56
  similarity_scores = similarity_matrix[i_indices, j_indices]
57
 
58
+ # Filter based on threshold
59
  mask = similarity_scores > similarity_threshold
60
  filtered_i = i_indices[mask]
61
  filtered_j = j_indices[mask]
62
  filtered_scores = similarity_scores[mask]
63
 
64
+ # Convert results into a sorted list of tuples
65
  pairs = [
66
  (df.iloc[i]["uuid"], df.iloc[j]["uuid"], float(score))
67
  for i, j, score in zip(filtered_i, filtered_j, filtered_scores)
68
  ]
69
 
70
  sorted_pairs = sorted(pairs, key=lambda x: x[2], reverse=True)
71
+
 
 
 
72
  st.success(f"βœ… Analysis complete! Found {len(sorted_pairs)} similar problems in {time.time() - start_time:.2f}s", icon="πŸŽ‰")
73
 
74
  return sorted_pairs