de-Rodrigo committed on
Commit b79fb2d
1 Parent(s): 0598719

Include Selector for Distance Method

Files changed (1): app.py (+108 -36)

app.py CHANGED
@@ -6,7 +6,8 @@ from bokeh.models import ColumnDataSource, DataTable, TableColumn, CustomJS, Sel
 from bokeh.layouts import column
 from bokeh.palettes import Reds9, Blues9, Oranges9, Purples9, Greys9, BuGn9, Greens9
 from sklearn.decomposition import PCA
-from sklearn.manifold import TSNE
+from sklearn.manifold import TSNE, trustworthiness
+from sklearn.metrics import pairwise_distances
 import io
 import ot
 from sklearn.linear_model import LinearRegression
@@ -37,7 +38,7 @@ def config_style():
     st.markdown('<h1 class="main-title">Merit Embeddings 🎓📄🏆</h1>', unsafe_allow_html=True)
 
 # =============================================================================
-# Data loading, plot generation, and distance computation functions (unchanged)
+# Data loading and processing functions (mostly unchanged)
 # =============================================================================
 
 def load_embeddings(model, version):
@@ -106,36 +107,90 @@ def split_versions(df_combined, reduced):
     unique_subsets = {"real": unique_real, "synthetic": unique_synth}
     return df_dict, unique_subsets
 
-def compute_wasserstein_distances_synthetic_individual(synthetic_df: pd.DataFrame, df_real: pd.DataFrame, real_labels: list) -> pd.DataFrame:
+# =============================================================================
+# Functions to compute distances between clusters with the selected metric
+# (Wasserstein, Euclidean, or KL)
+# =============================================================================
+
+def compute_cluster_distance(synthetic_points, real_points, metric="wasserstein", bins=20):
+    if metric.lower() == "wasserstein":
+        n = synthetic_points.shape[0]
+        m = real_points.shape[0]
+        weights = np.ones(n) / n
+        weights_real = np.ones(m) / m
+        M = ot.dist(synthetic_points, real_points, metric='euclidean')
+        return ot.emd2(weights, weights_real, M)
+    elif metric.lower() == "euclidean":
+        center_syn = np.mean(synthetic_points, axis=0)
+        center_real = np.mean(real_points, axis=0)
+        return np.linalg.norm(center_syn - center_real)
+    elif metric.lower() == "kl":
+        all_points = np.vstack([synthetic_points, real_points])
+        x_min, y_min = np.min(all_points, axis=0)
+        x_max, y_max = np.max(all_points, axis=0)
+        x_bins = np.linspace(x_min, x_max, bins+1)
+        y_bins = np.linspace(y_min, y_max, bins+1)
+        H_syn, _, _ = np.histogram2d(synthetic_points[:,0], synthetic_points[:,1], bins=[x_bins, y_bins])
+        H_real, _, _ = np.histogram2d(real_points[:,0], real_points[:,1], bins=[x_bins, y_bins])
+        eps = 1e-10
+        P = H_syn + eps
+        Q = H_real + eps
+        P = P / P.sum()
+        Q = Q / Q.sum()
+        kl = np.sum(P * np.log(P / Q))
+        return kl
+    else:
+        raise ValueError("Unknown metric. Use 'wasserstein', 'euclidean' or 'kl'.")
+
+def compute_cluster_distances_synthetic_individual(synthetic_df: pd.DataFrame, df_real: pd.DataFrame, real_labels: list, metric="wasserstein", bins=20) -> pd.DataFrame:
     distances = {}
     groups = synthetic_df.groupby(['source', 'label'])
     for (source, label), group in groups:
         key = f"{label} ({source})"
         data = group[['x', 'y']].values
-        n = data.shape[0]
-        weights = np.ones(n) / n
         distances[key] = {}
         for real_label in real_labels:
             real_data = df_real[df_real['label'] == real_label][['x','y']].values
-            m = real_data.shape[0]
-            weights_real = np.ones(m) / m
-            M = ot.dist(data, real_data, metric='euclidean')
-            distances[key][real_label] = ot.emd2(weights, weights_real, M)
-
+            d = compute_cluster_distance(data, real_data, metric=metric, bins=bins)
+            distances[key][real_label] = d
     for source, group in synthetic_df.groupby('source'):
         key = f"Global ({source})"
         data = group[['x','y']].values
-        n = data.shape[0]
-        weights = np.ones(n) / n
         distances[key] = {}
         for real_label in real_labels:
             real_data = df_real[df_real['label'] == real_label][['x','y']].values
-            m = real_data.shape[0]
-            weights_real = np.ones(m) / m
-            M = ot.dist(data, real_data, metric='euclidean')
-            distances[key][real_label] = ot.emd2(weights, weights_real, M)
+            d = compute_cluster_distance(data, real_data, metric=metric, bins=bins)
+            distances[key][real_label] = d
     return pd.DataFrame(distances).T
 
+# =============================================================================
+# Continuity: measures how well the original neighborhood is preserved in the embedding
+# =============================================================================
+
+def compute_continuity(X, X_embedded, n_neighbors=5):
+    n = X.shape[0]
+    D_high = pairwise_distances(X, metric='euclidean')
+    D_low = pairwise_distances(X_embedded, metric='euclidean')
+    indices_high = np.argsort(D_high, axis=1)
+    indices_low = np.argsort(D_low, axis=1)
+    k_high = indices_high[:, 1:n_neighbors+1]
+    k_low = indices_low[:, 1:n_neighbors+1]
+    total = 0.0
+    for i in range(n):
+        set_high = set(k_high[i])
+        set_low = set(k_low[i])
+        missing = set_high - set_low
+        for j in missing:
+            rank = np.where(indices_low[i] == j)[0][0]
+            total += (rank - n_neighbors)
+    norm = 2.0 / (n * n_neighbors * (2*n - 3*n_neighbors - 1))
+    continuity_value = 1 - norm * total
+    return continuity_value
+
+# =============================================================================
+# Visualization functions (unchanged)
+# =============================================================================
+
 def create_table(df_distances):
     df_table = df_distances.copy()
     df_table.reset_index(inplace=True)
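The three branches of compute_cluster_distance are not interchangeable: the Wasserstein branch returns the exact earth mover's cost between the two point clouds (ot.emd2 with uniform weights), the Euclidean branch compares only the cluster centroids, and the KL branch compares binned 2-D histograms and is asymmetric in its arguments. A minimal sketch of exercising the new helper outside the app, on hypothetical toy clusters (assumes compute_cluster_distance from this commit is in scope):

    import numpy as np

    rng = np.random.default_rng(0)
    syn = rng.normal(loc=0.0, scale=1.0, size=(100, 2))   # toy "synthetic" cluster
    real = rng.normal(loc=2.0, scale=1.0, size=(120, 2))  # toy "real" cluster, shifted

    for metric in ("wasserstein", "euclidean", "kl"):
        d = compute_cluster_distance(syn, real, metric=metric, bins=20)
        print(f"{metric}: {d:.4f}")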
@@ -300,10 +355,12 @@ def calculate_cluster_centers(df, labels):
     return centers
 
 # =============================================================================
-# Centralized pipeline function: reduction, distances, and global regression
+# Central pipeline: reduction, distance computation, and global regression.
+# The distance_metric parameter is added.
+# In addition, when t-SNE is used, trustworthiness and continuity are computed.
 # =============================================================================
 
-def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method="t-SNE"):
+def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method="t-SNE", distance_metric="wasserstein"):
     if reduction_method == "PCA":
         reducer = PCA(n_components=2)
     else:
@@ -313,17 +370,26 @@ def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, r
 
     reduced = reducer.fit_transform(df_combined[embedding_cols].values)
 
-    # If PCA is used, capture the explained variance
+    # For PCA, capture the explained variance ratio
     explained_variance = None
     if reduction_method == "PCA":
         explained_variance = reducer.explained_variance_ratio_
 
+    # For t-SNE, compute trustworthiness and continuity
+    trust = None
+    cont = None
+    if reduction_method == "t-SNE":
+        X = df_combined[embedding_cols].values
+        trust = trustworthiness(X, reduced, n_neighbors=5)
+        cont = compute_continuity(X, reduced, n_neighbors=5)
+
     dfs_reduced, unique_subsets = split_versions(df_combined, reduced)
 
-    df_distances = compute_wasserstein_distances_synthetic_individual(
+    df_distances = compute_cluster_distances_synthetic_individual(
         dfs_reduced["synthetic"],
         dfs_reduced["real"],
-        unique_subsets["real"]
+        unique_subsets["real"],
+        metric=distance_metric
     )
 
     global_distances = {}
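Continuity is the mirror image of trustworthiness: it penalizes original-space neighbors that get separated in the embedding, ranking them by embedded-space distance, with the same 2 / (n * k * (2n - 3k - 1)) normalization. Swapping the two spaces therefore turns one measure into the other, which gives a cheap sanity check of compute_continuity against sklearn (a sketch on synthetic data, assuming both functions are in scope):

    import numpy as np
    from sklearn.manifold import trustworthiness

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 32))                    # stand-in high-dimensional embeddings
    Y = X[:, :2] + 0.01 * rng.normal(size=(200, 2))   # stand-in 2-D projection

    # continuity(X -> Y) should match trustworthiness with the spaces swapped,
    # up to ties in the pairwise distances:
    print(compute_continuity(X, Y, n_neighbors=5))
    print(trustworthiness(Y, X, n_neighbors=5))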
@@ -349,7 +415,7 @@ def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, r
     intercept = model_global.intercept_
 
     scatter_fig = figure(width=600, height=600, tools="pan,wheel_zoom,reset,save",
-                         title="Scatter Plot: Wasserstein vs F1")
+                         title="Scatter Plot: Distance vs F1")
     source_colors = {
         "es-digital-paragraph-degradation-seq": "blue",
         "es-digital-line-degradation-seq": "green",
@@ -369,10 +435,10 @@ def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, r
                             fill_color=source_colors.get(source, "gray"),
                             line_color=source_colors.get(source, "gray"),
                             legend_label=source)
-    scatter_fig.xaxis.axis_label = "Wasserstein Distance (Global, per School)"
+    scatter_fig.xaxis.axis_label = "Distance (Global, per School)"
     scatter_fig.yaxis.axis_label = "F1 Score"
     scatter_fig.legend.location = "top_right"
-    hover_tool = HoverTool(tooltips=[("Wass. Distance", "@x"), ("f1", "@y"), ("Subset", "@Fuente")])
+    hover_tool = HoverTool(tooltips=[("Distance", "@x"), ("F1", "@y"), ("Subset", "@Fuente")])
     scatter_fig.add_tools(hover_tool)
 
     x_line = np.linspace(all_x_arr.min(), all_x_arr.max(), 100)
@@ -387,15 +453,16 @@ def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, r
         "dfs_reduced": dfs_reduced,
         "unique_subsets": unique_subsets,
         "df_distances": df_distances,
-        "explained_variance": explained_variance  # Explained variance is included (PCA only)
+        "explained_variance": explained_variance,  # PCA only
+        "trustworthiness": trust,                  # t-SNE only
+        "continuity": cont                         # t-SNE only
     }
 
-
 # =============================================================================
-# Optimization function (grid search) for TSNE, using the same pipeline
+# TSNE parameter optimization (the distance metric is propagated as well)
 # =============================================================================
 
-def optimize_tsne_params(df_combined, embedding_cols, df_f1):
+def optimize_tsne_params(df_combined, embedding_cols, df_f1, distance_metric):
     perplexity_range = np.linspace(30, 50, 10)
     learning_rate_range = np.linspace(200, 1000, 20)
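For scale: a full optimization run evaluates every pair in the grids above, i.e. 10 perplexities x 20 learning rates = 200 t-SNE fits, with the regression R² as the selection criterion, so the cost of a single compute_global_regression call dominates the runtime:

    import numpy as np

    perplexity_range = np.linspace(30, 50, 10)
    learning_rate_range = np.linspace(200, 1000, 20)
    print(len(perplexity_range) * len(learning_rate_range))  # 200 evaluations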
 
@@ -412,7 +479,7 @@ def optimize_tsne_params(df_combined, embedding_cols, df_f1):
         progress_text.text(f"Evaluating: Perplexity={p:.2f}, Learning Rate={lr:.2f} (Step {step}/{total_steps})")
 
         tsne_params = {"perplexity": p, "learning_rate": lr}
-        result = compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method="t-SNE")
+        result = compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method="t-SNE", distance_metric=distance_metric)
         r2_temp = result["R2"]
         st.write(f"Parameters: Perplexity={p:.2f}, Learning Rate={lr:.2f} -> R²={r2_temp:.4f}")
 
@@ -424,11 +491,11 @@
     return best_params, best_R2
 
 # =============================================================================
-# Main run_model function integrating optimization, version selector, and manual execution
+# Main run_model function: includes the version selector, reduction method,
+# distance metric and, when t-SNE is used, shows trustworthiness and continuity.
 # =============================================================================
 
 def run_model(model_name):
-    # Select the model version
     version = st.selectbox("Select Model Version:", options=["vanilla", "finetuned_real"], key=f"version_{model_name}")
 
     embeddings = load_embeddings(model_name, version)
@@ -446,11 +513,15 @@
     st.markdown('<h6 class="sub-title">Select Dimensionality Reduction Method</h6>', unsafe_allow_html=True)
     reduction_method = st.selectbox("", options=["t-SNE", "PCA"], key=f"reduction_{model_name}")
 
+    distance_metric = st.selectbox("Select Distance Metric:",
+                                   options=["Wasserstein", "Euclidean", "KL"],
+                                   key=f"distance_metric_{model_name}")
+
     tsne_params = {}
     if reduction_method == "t-SNE":
         if st.button("Optimize TSNE parameters", key=f"optimize_tsne_{model_name}"):
             st.info("Running optimization, this can take a while...")
-            best_params, best_R2 = optimize_tsne_params(df_combined, embedding_cols, df_f1)
+            best_params, best_R2 = optimize_tsne_params(df_combined, embedding_cols, df_f1, distance_metric.lower())
             st.success(f"Best parameters: Perplexity = {best_params[0]:.2f}, Learning Rate = {best_params[1]:.2f} with R² = {best_R2:.4f}")
             tsne_params = {"perplexity": best_params[0], "learning_rate": best_params[1]}
         else:
@@ -473,9 +544,8 @@
                 key=f"learning_rate_{model_name}"
             )
             tsne_params = {"perplexity": perplexity_val, "learning_rate": learning_rate_val}
-    # If PCA is selected, tsne_params is unused.
 
-    result = compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method=reduction_method)
+    result = compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method=reduction_method, distance_metric=distance_metric.lower())
 
     reg_metrics = pd.DataFrame({
         "Slope": [result["slope"]],
@@ -484,7 +554,6 @@
     })
     st.table(reg_metrics)
 
-    # If PCA was used, show the explained variance
     if reduction_method == "PCA" and result["explained_variance"] is not None:
         st.subheader("Explained Variance Ratio")
         variance_df = pd.DataFrame({
@@ -492,6 +561,10 @@
             "Explained Variance": result["explained_variance"]
         })
         st.table(variance_df)
+    elif reduction_method == "t-SNE":
+        st.subheader("t-SNE Quality Metrics")
+        st.write(f"Trustworthiness: {result['trustworthiness']:.4f}")
+        st.write(f"Continuity: {result['continuity']:.4f}")
 
     data_table, df_table, source_table = create_table(result["df_distances"])
     real_subset_names = list(df_table.columns[1:])
@@ -554,7 +627,6 @@
         key=f"download_button_excel_{model_name}"
     )
 
-
 def main():
     config_style()
    tabs = st.tabs(["Donut", "Idefics2"])
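Taken together, the selector just threads one extra keyword through the existing pipeline. A sketch of the call the UI now issues (df_combined, embedding_cols, and df_f1 being whatever load_embeddings and the F1 table produce inside app.py; the parameter values are placeholders):

    tsne_params = {"perplexity": 35.0, "learning_rate": 400.0}
    result = compute_global_regression(
        df_combined, embedding_cols, tsne_params, df_f1,
        reduction_method="t-SNE",
        distance_metric="kl",          # or "wasserstein" / "euclidean"
    )
    print(result["R2"], result["trustworthiness"], result["continuity"])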
 