de-Rodrigo committed on
Commit
c300990
1 Parent(s): 3cd6e58

Show PCA for Target Dataset (Real)

Browse files
Files changed (1) hide show
  1. app.py +144 -30
app.py CHANGED
@@ -14,7 +14,7 @@ from sklearn.linear_model import LinearRegression
14
 
15
  N_COMPONENTS = 2
16
  TSNE_NEIGHBOURS = 150
17
- WEIGHT_FACTOR = 0.25
18
 
19
  TOOLTIPS = """
20
  <div>
@@ -76,17 +76,17 @@ def load_embeddings(model, version, embedding_prefix, weight_factor):
76
  "pretrained": df_pretratrained}
77
 
78
  elif model == "Idefics2":
79
- df_real = pd.read_csv(f"data/donut/{version}/{embedding_prefix}/de_Rodrigo_merit_secret_britanico_{weight_factor}embeddings.csv")
80
- df_par = pd.read_csv(f"data/donut/{version}/{embedding_prefix}/de_Rodrigo_merit_es-digital-paragraph-degradation-seq_{weight_factor}embeddings.csv")
81
- df_line = pd.read_csv(f"data/donut/{version}/{embedding_prefix}/de_Rodrigo_merit_es-digital-line-degradation-seq_{weight_factor}embeddings.csv")
82
- df_seq = pd.read_csv(f"data/donut/{version}/{embedding_prefix}/de_Rodrigo_merit_es-digital-seq_{weight_factor}embeddings.csv")
83
- df_rot = pd.read_csv(f"data/donut/{version}/{embedding_prefix}/de_Rodrigo_merit_es-digital-rotation-degradation-seq_{weight_factor}embeddings.csv")
84
- df_zoom = pd.read_csv(f"data/donut/{version}/{embedding_prefix}/de_Rodrigo_merit_es-digital-zoom-degradation-seq_{weight_factor}embeddings.csv")
85
- df_render = pd.read_csv(f"data/donut/{version}/{embedding_prefix}/de_Rodrigo_merit_es-render-seq_{weight_factor}embeddings.csv")
86
 
87
  # Cargar ambos subconjuntos pretrained y combinarlos
88
- df_pretratrained_PDFA = pd.read_csv(f"data/donut/{version}/{embedding_prefix}/de_Rodrigo_merit_aux_PDFA_{weight_factor}embeddings.csv")
89
- df_pretratrained_IDL = pd.read_csv(f"data/donut/{version}/{embedding_prefix}/de_Rodrigo_merit_aux_IDL_{weight_factor}embeddings.csv")
90
  df_pretratrained = pd.concat([df_pretratrained_PDFA, df_pretratrained_IDL], ignore_index=True)
91
 
92
  # Asignar etiquetas de versión
@@ -116,9 +116,6 @@ def load_embeddings(model, version, embedding_prefix, weight_factor):
116
  st.error("Modelo no reconocido")
117
  return None
118
 
119
-
120
-
121
-
122
  def split_versions(df_combined, reduced):
123
  # Asignar las coordenadas si la reducción es 2D
124
  if reduced.shape[1] == 2:
@@ -138,7 +135,6 @@ def split_versions(df_combined, reduced):
138
  unique_subsets = {"real": unique_real, "synthetic": unique_synth, "pretrained": unique_pretrained}
139
  return df_dict, unique_subsets
140
 
141
-
142
  def get_embedding_from_df(df):
143
  # Retorna el embedding completo (4 dimensiones en este caso) guardado en la columna 'embedding'
144
  if 'embedding' in df.columns:
@@ -282,7 +278,6 @@ def create_figure(dfs, unique_subsets, color_maps, model_name):
282
  fig.legend.visible = show_legend
283
  return fig, real_renderers, synthetic_renderers, pretrained_renderers
284
 
285
-
286
  def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping, group_label):
287
  renderers = {}
288
  for label in selected_labels:
@@ -392,7 +387,6 @@ def get_color_maps(unique_subsets):
392
 
393
  return color_map
394
 
395
-
396
  def calculate_cluster_centers(df, labels):
397
  centers = {}
398
  for label in labels:
@@ -541,13 +535,10 @@ def run_model(model_name):
541
  # Selector para el método de cómputo del embedding
542
  embedding_computation = st.selectbox("驴C贸mo se computa el embedding?", options=["weighted", "averaged"], key=f"embedding_method_{model_name}")
543
  # Se asigna el prefijo correspondiente
544
- # prefijo_embedding = "weighted_" if embedding_computation == "weighted" else "averaged_"
545
 
546
  if embedding_computation == "weighted":
547
- # prefijo_embedding = "weighted_"
548
  weight_factor = f"{WEIGHT_FACTOR}_"
549
  else:
550
- # prefijo_embedding = "averaged_"
551
  weight_factor = ""
552
 
553
  embeddings = load_embeddings(model_name, version, embedding_computation, weight_factor)
@@ -555,7 +546,7 @@ def run_model(model_name):
555
  return
556
 
557
  # Nuevo selector para incluir o excluir el dataset pretrained
558
- include_pretrained = st.checkbox("Incluir dataset pretrained", value=True)
559
  if not include_pretrained:
560
  # Removemos la entrada pretrained del diccionario, si existe.
561
  embeddings.pop("pretrained", None)
@@ -572,10 +563,10 @@ def run_model(model_name):
572
  return
573
 
574
  st.markdown('<h6 class="sub-title">Select Dimensionality Reduction Method</h6>', unsafe_allow_html=True)
575
- reduction_method = st.selectbox("", options=["t-SNE", "PCA"], key=f"reduction_{model_name}")
576
 
577
  distance_metric = st.selectbox("Select Distance Metric:",
578
- options=["Wasserstein", "Euclidean", "KL"],
579
  key=f"distance_metric_{model_name}")
580
 
581
  tsne_params = {}
@@ -628,13 +619,12 @@ def run_model(model_name):
628
  st.write(f"Trustworthiness: {result['trustworthiness']:.4f}")
629
  st.write(f"Continuity: {result['continuity']:.4f}")
630
 
631
- # Si se usó PCA, se muestran los plots de loadings con Bokeh (con hover para ver la etiqueta)
632
  if reduction_method == "PCA" and result.get("pca_model") is not None:
633
  pca_model = result["pca_model"]
634
  components = pca_model.components_ # Shape: (n_components, n_features)
635
 
636
- st.subheader("Pesos de las Componentes Principales (Loadings)")
637
- # Se crea un plot de barras por cada componente
638
  for i, comp in enumerate(components):
639
  source = ColumnDataSource(data=dict(
640
  dimensions=embedding_cols,
@@ -642,11 +632,17 @@ def run_model(model_name):
642
  ))
643
  p = figure(x_range=embedding_cols, title=f"Componente Principal {i+1}",
644
  plot_height=400, plot_width=600,
645
- toolbar_location=None, tools="")
 
 
 
 
 
 
 
 
646
  p.vbar(x='dimensions', top='weight', width=0.8, source=source)
647
- # Ocultar etiquetas del eje x para un aspecto más limpio
648
  p.xaxis.major_label_text_font_size = '0pt'
649
- # Agregar HoverTool para mostrar la dimensión y su peso
650
  hover = HoverTool(tooltips=[("Dimensi贸n", "@dimensions"), ("Peso", "@weight")])
651
  p.add_tools(hover)
652
  p.xaxis.axis_label = "Dimensiones originales"
@@ -709,7 +705,7 @@ def run_model(model_name):
709
  layout = column(result["scatter_fig"], column(real_select, reset_button, data_table))
710
 
711
  st.bokeh_chart(layout, use_container_width=True)
712
-
713
  buffer = io.BytesIO()
714
  df_table.to_excel(buffer, index=False)
715
  buffer.seek(0)
@@ -722,6 +718,124 @@ def run_model(model_name):
722
  key=f"download_button_excel_{model_name}"
723
  )
724
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
725
  def main():
726
  config_style()
727
  tabs = st.tabs(["Donut", "Idefics2"])
@@ -733,4 +847,4 @@ def main():
733
  run_model("Idefics2")
734
 
735
  if __name__ == "__main__":
736
- main()
 
14
 
15
  N_COMPONENTS = 2
16
  TSNE_NEIGHBOURS = 150
17
+ WEIGHT_FACTOR = 0.1
18
 
19
  TOOLTIPS = """
20
  <div>
 
76
  "pretrained": df_pretratrained}
77
 
78
  elif model == "Idefics2":
79
+ df_real = pd.read_csv(f"data/idefics2/{version}/{embedding_prefix}/de_Rodrigo_merit_secret_britanico_{weight_factor}embeddings.csv")
80
+ df_par = pd.read_csv(f"data/idefics2/{version}/{embedding_prefix}/de_Rodrigo_merit_es-digital-paragraph-degradation-seq_{weight_factor}embeddings.csv")
81
+ df_line = pd.read_csv(f"data/idefics2/{version}/{embedding_prefix}/de_Rodrigo_merit_es-digital-line-degradation-seq_{weight_factor}embeddings.csv")
82
+ df_seq = pd.read_csv(f"data/idefics2/{version}/{embedding_prefix}/de_Rodrigo_merit_es-digital-seq_{weight_factor}embeddings.csv")
83
+ df_rot = pd.read_csv(f"data/idefics2/{version}/{embedding_prefix}/de_Rodrigo_merit_es-digital-rotation-degradation-seq_{weight_factor}embeddings.csv")
84
+ df_zoom = pd.read_csv(f"data/idefics2/{version}/{embedding_prefix}/de_Rodrigo_merit_es-digital-zoom-degradation-seq_{weight_factor}embeddings.csv")
85
+ df_render = pd.read_csv(f"data/idefics2/{version}/{embedding_prefix}/de_Rodrigo_merit_es-render-seq_{weight_factor}embeddings.csv")
86
 
87
  # Cargar ambos subconjuntos pretrained y combinarlos
88
+ df_pretratrained_PDFA = pd.read_csv(f"data/idefics2/{version}/{embedding_prefix}/de_Rodrigo_merit_aux_PDFA_{weight_factor}embeddings.csv")
89
+ df_pretratrained_IDL = pd.read_csv(f"data/idefics2/{version}/{embedding_prefix}/de_Rodrigo_merit_aux_IDL_{weight_factor}embeddings.csv")
90
  df_pretratrained = pd.concat([df_pretratrained_PDFA, df_pretratrained_IDL], ignore_index=True)
91
 
92
  # Asignar etiquetas de versión
 
116
  st.error("Modelo no reconocido")
117
  return None
118
 
 
 
 
119
  def split_versions(df_combined, reduced):
120
  # Asignar las coordenadas si la reducción es 2D
121
  if reduced.shape[1] == 2:
 
135
  unique_subsets = {"real": unique_real, "synthetic": unique_synth, "pretrained": unique_pretrained}
136
  return df_dict, unique_subsets
137
 
 
138
  def get_embedding_from_df(df):
139
  # Retorna el embedding completo (4 dimensiones en este caso) guardado en la columna 'embedding'
140
  if 'embedding' in df.columns:
 
278
  fig.legend.visible = show_legend
279
  return fig, real_renderers, synthetic_renderers, pretrained_renderers
280
 
 
281
  def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping, group_label):
282
  renderers = {}
283
  for label in selected_labels:
 
387
 
388
  return color_map
389
 
 
390
  def calculate_cluster_centers(df, labels):
391
  centers = {}
392
  for label in labels:
 
535
  # Selector para el método de cómputo del embedding
536
  embedding_computation = st.selectbox("驴C贸mo se computa el embedding?", options=["weighted", "averaged"], key=f"embedding_method_{model_name}")
537
  # Se asigna el prefijo correspondiente
 
538
 
539
  if embedding_computation == "weighted":
 
540
  weight_factor = f"{WEIGHT_FACTOR}_"
541
  else:
 
542
  weight_factor = ""
543
 
544
  embeddings = load_embeddings(model_name, version, embedding_computation, weight_factor)
 
546
  return
547
 
548
  # Nuevo selector para incluir o excluir el dataset pretrained
549
+ include_pretrained = st.checkbox("Incluir dataset pretrained", value=True, key=f"legend_{model_name}_pretrained")
550
  if not include_pretrained:
551
  # Removemos la entrada pretrained del diccionario, si existe.
552
  embeddings.pop("pretrained", None)
 
563
  return
564
 
565
  st.markdown('<h6 class="sub-title">Select Dimensionality Reduction Method</h6>', unsafe_allow_html=True)
566
+ reduction_method = st.selectbox("", options=["PCA", "t-SNE"], key=f"reduction_{model_name}")
567
 
568
  distance_metric = st.selectbox("Select Distance Metric:",
569
+ options=["Euclidean", "Wasserstein", "KL"],
570
  key=f"distance_metric_{model_name}")
571
 
572
  tsne_params = {}
 
619
  st.write(f"Trustworthiness: {result['trustworthiness']:.4f}")
620
  st.write(f"Continuity: {result['continuity']:.4f}")
621
 
622
+ # Mostrar los plots de loadings si se usó PCA (para el conjunto combinado)
623
  if reduction_method == "PCA" and result.get("pca_model") is not None:
624
  pca_model = result["pca_model"]
625
  components = pca_model.components_ # Shape: (n_components, n_features)
626
 
627
+ st.subheader("Pesos de las Componentes Principales (Loadings) - Conjunto Combinado")
 
628
  for i, comp in enumerate(components):
629
  source = ColumnDataSource(data=dict(
630
  dimensions=embedding_cols,
 
632
  ))
633
  p = figure(x_range=embedding_cols, title=f"Componente Principal {i+1}",
634
  plot_height=400, plot_width=600,
635
+ toolbar_location="above",
636
+ tools="pan,wheel_zoom,reset,save,hover",
637
+ active_scroll="wheel_zoom")
638
+
639
+ # Establecer fondo blanco
640
+ p.background_fill_color = "white"
641
+ # Mostrar solo grilla horizontal
642
+ p.xgrid.grid_line_color = None
643
+ p.ygrid.grid_line_color = "gray"
644
  p.vbar(x='dimensions', top='weight', width=0.8, source=source)
 
645
  p.xaxis.major_label_text_font_size = '0pt'
 
646
  hover = HoverTool(tooltips=[("Dimensi贸n", "@dimensions"), ("Peso", "@weight")])
647
  p.add_tools(hover)
648
  p.xaxis.axis_label = "Dimensiones originales"
 
705
  layout = column(result["scatter_fig"], column(real_select, reset_button, data_table))
706
 
707
  st.bokeh_chart(layout, use_container_width=True)
708
+
709
  buffer = io.BytesIO()
710
  df_table.to_excel(buffer, index=False)
711
  buffer.seek(0)
 
718
  key=f"download_button_excel_{model_name}"
719
  )
720
 
721
+ # Nuevo bloque: PCA solo para df_real
722
+ if reduction_method == "PCA":
723
+ st.markdown("## PCA - Solo Muestras Reales")
724
+ # Extraemos únicamente las muestras reales
725
+ df_real_only = embeddings["real"].copy()
726
+ pca_real = PCA(n_components=N_COMPONENTS)
727
+ reduced_real = pca_real.fit_transform(df_real_only[embedding_cols].values)
728
+ df_real_only['embedding'] = list(reduced_real)
729
+ if reduced_real.shape[1] == 2:
730
+ df_real_only['x'] = reduced_real[:, 0]
731
+ df_real_only['y'] = reduced_real[:, 1]
732
+ explained_variance_real = pca_real.explained_variance_ratio_
733
+ unique_labels_real = sorted(df_real_only['label'].unique().tolist())
734
+
735
+ # Definir mapeo de colores usando la paleta Reds9
736
+ num_labels = len(unique_labels_real)
737
+ if num_labels <= 9:
738
+ red_palette = Reds9[:num_labels]
739
+ else:
740
+ red_palette = (Reds9 * ((num_labels // 9) + 1))[:num_labels]
741
+ real_color_mapping = {label: red_palette[i] for i, label in enumerate(unique_labels_real)}
742
+
743
+ st.subheader("PCA - Real: Explained Variance Ratio")
744
+ component_names_real = [f"PC{i+1}" for i in range(len(explained_variance_real))]
745
+ variance_df_real = pd.DataFrame({
746
+ "Component": component_names_real,
747
+ "Explained Variance": explained_variance_real
748
+ })
749
+ st.table(variance_df_real)
750
+
751
+ # Agregar scatter plot para visualizar el PCA real
752
+ st.subheader("PCA - Real: Scatter Plot")
753
+ fig_real = figure(
754
+ title="PCA - Solo Real: Scatter Plot",
755
+ plot_width=600,
756
+ plot_height=600,
757
+ tools="pan,wheel_zoom,reset,save,hover",
758
+ active_scroll="wheel_zoom",
759
+ background_fill_color="white"
760
+ )
761
+ # Mostrar solo grid horizontal
762
+ fig_real.xgrid.grid_line_color = None
763
+ fig_real.ygrid.grid_line_color = "gray"
764
+
765
+ # Dibujar los puntos por cada etiqueta
766
+ for label in unique_labels_real:
767
+ subset = df_real_only[df_real_only['label'] == label]
768
+ source_scatter = ColumnDataSource(data={
769
+ 'x': subset['x'],
770
+ 'y': subset['y'],
771
+ 'label': subset['label']
772
+ })
773
+ fig_real.circle('x', 'y', size=10,
774
+ fill_color=real_color_mapping[label],
775
+ line_color=real_color_mapping[label],
776
+ legend_label=label,
777
+ source=source_scatter)
778
+
779
+ # Calcular el centroide de todos los puntos
780
+ center_x = df_real_only['x'].mean()
781
+ center_y = df_real_only['y'].mean()
782
+
783
+ # Calcular el radio como la máxima distancia desde el centroide
784
+ distances = np.sqrt((df_real_only['x'] - center_x)**2 + (df_real_only['y'] - center_y)**2)
785
+ radius = distances.max()
786
+
787
+ # Dibujar el centroide
788
+ fig_real.circle(x=center_x, y=center_y, size=15,
789
+ fill_color="black", line_color="black", legend_label="Centroide")
790
+
791
+ # Dibujar la circunferencia (con línea discontinua)
792
+ fig_real.circle(x=center_x, y=center_y, radius=radius,
793
+ fill_color=None, line_color="black", line_dash="dashed", legend_label="Circunferencia")
794
+
795
+ fig_real.xaxis.axis_label = "PC1"
796
+ fig_real.yaxis.axis_label = "PC2"
797
+
798
+ hover_scatter = fig_real.select_one(HoverTool)
799
+ hover_scatter.tooltips = [("Label", "@label"), ("PC1", "@x"), ("PC2", "@y")]
800
+
801
+ fig_real.legend.location = "top_right"
802
+ st.bokeh_chart(fig_real)
803
+
804
+ # Mostrar el valor del radio debajo del gráfico
805
+ st.write(f"El radio de la circunferencia es: {int(radius)}")
806
+
807
+ # Mostrar los plots de loadings (Component Loadings)
808
+ st.subheader("PCA - Real: Component Loadings")
809
+ st.markdown("### Pesos de las Componentes Principales (Loadings) - Conjunto Combinado")
810
+ for i, comp in enumerate(pca_real.components_):
811
+ source = ColumnDataSource(data=dict(
812
+ dimensions=embedding_cols,
813
+ weight=comp
814
+ ))
815
+ p = figure(
816
+ x_range=embedding_cols,
817
+ title=f"Componente Principal {i+1}",
818
+ plot_height=400,
819
+ plot_width=600,
820
+ toolbar_location="above",
821
+ tools="pan,wheel_zoom,reset,save,hover",
822
+ active_scroll="wheel_zoom"
823
+ )
824
+ # Fondo blanco y solo grid horizontal
825
+ p.background_fill_color = "white"
826
+ p.xgrid.grid_line_color = None
827
+ p.ygrid.grid_line_color = "gray"
828
+ p.vbar(x='dimensions', top='weight', width=0.8, source=source,
829
+ fill_color="#2b83ba", line_color="#2b83ba")
830
+ # No se muestran etiquetas en el eje horizontal
831
+ p.xaxis.axis_label = "Dimensiones Originales"
832
+ p.xaxis.major_label_text_font_size = '0pt'
833
+ # Configurar el HoverTool
834
+ hover = p.select_one(HoverTool)
835
+ hover.tooltips = [("Dimensi贸n", "@dimensions"), ("Peso", "@weight")]
836
+ st.bokeh_chart(p)
837
+
838
+
839
  def main():
840
  config_style()
841
  tabs = st.tabs(["Donut", "Idefics2"])
 
847
  run_model("Idefics2")
848
 
849
  if __name__ == "__main__":
850
+ main()