Spaces:
Running
Running
Commit
·
c300990
1
Parent(s):
3cd6e58
Show PCA for Target Dataset (Real)
Browse files
app.py
CHANGED
@@ -14,7 +14,7 @@ from sklearn.linear_model import LinearRegression
|
|
14 |
|
15 |
N_COMPONENTS = 2
|
16 |
TSNE_NEIGHBOURS = 150
|
17 |
-
WEIGHT_FACTOR = 0.
|
18 |
|
19 |
TOOLTIPS = """
|
20 |
<div>
|
@@ -76,17 +76,17 @@ def load_embeddings(model, version, embedding_prefix, weight_factor):
|
|
76 |
"pretrained": df_pretratrained}
|
77 |
|
78 |
elif model == "Idefics2":
|
79 |
-
df_real = pd.read_csv(f"data/
|
80 |
-
df_par = pd.read_csv(f"data/
|
81 |
-
df_line = pd.read_csv(f"data/
|
82 |
-
df_seq = pd.read_csv(f"data/
|
83 |
-
df_rot = pd.read_csv(f"data/
|
84 |
-
df_zoom = pd.read_csv(f"data/
|
85 |
-
df_render = pd.read_csv(f"data/
|
86 |
|
87 |
# Cargar ambos subconjuntos pretrained y combinarlos
|
88 |
-
df_pretratrained_PDFA = pd.read_csv(f"data/
|
89 |
-
df_pretratrained_IDL = pd.read_csv(f"data/
|
90 |
df_pretratrained = pd.concat([df_pretratrained_PDFA, df_pretratrained_IDL], ignore_index=True)
|
91 |
|
92 |
# Asignar etiquetas de versión
|
@@ -116,9 +116,6 @@ def load_embeddings(model, version, embedding_prefix, weight_factor):
|
|
116 |
st.error("Modelo no reconocido")
|
117 |
return None
|
118 |
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
def split_versions(df_combined, reduced):
|
123 |
# Asignar las coordenadas si la reducción es 2D
|
124 |
if reduced.shape[1] == 2:
|
@@ -138,7 +135,6 @@ def split_versions(df_combined, reduced):
|
|
138 |
unique_subsets = {"real": unique_real, "synthetic": unique_synth, "pretrained": unique_pretrained}
|
139 |
return df_dict, unique_subsets
|
140 |
|
141 |
-
|
142 |
def get_embedding_from_df(df):
|
143 |
# Retorna el embedding completo (4 dimensiones en este caso) guardado en la columna 'embedding'
|
144 |
if 'embedding' in df.columns:
|
@@ -282,7 +278,6 @@ def create_figure(dfs, unique_subsets, color_maps, model_name):
|
|
282 |
fig.legend.visible = show_legend
|
283 |
return fig, real_renderers, synthetic_renderers, pretrained_renderers
|
284 |
|
285 |
-
|
286 |
def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping, group_label):
|
287 |
renderers = {}
|
288 |
for label in selected_labels:
|
@@ -392,7 +387,6 @@ def get_color_maps(unique_subsets):
|
|
392 |
|
393 |
return color_map
|
394 |
|
395 |
-
|
396 |
def calculate_cluster_centers(df, labels):
|
397 |
centers = {}
|
398 |
for label in labels:
|
@@ -541,13 +535,10 @@ def run_model(model_name):
|
|
541 |
# Selector para el método de cómputo del embedding
|
542 |
embedding_computation = st.selectbox("驴C贸mo se computa el embedding?", options=["weighted", "averaged"], key=f"embedding_method_{model_name}")
|
543 |
# Se asigna el prefijo correspondiente
|
544 |
-
# prefijo_embedding = "weighted_" if embedding_computation == "weighted" else "averaged_"
|
545 |
|
546 |
if embedding_computation == "weighted":
|
547 |
-
# prefijo_embedding = "weighted_"
|
548 |
weight_factor = f"{WEIGHT_FACTOR}_"
|
549 |
else:
|
550 |
-
# prefijo_embedding = "averaged_"
|
551 |
weight_factor = ""
|
552 |
|
553 |
embeddings = load_embeddings(model_name, version, embedding_computation, weight_factor)
|
@@ -555,7 +546,7 @@ def run_model(model_name):
|
|
555 |
return
|
556 |
|
557 |
# Nuevo selector para incluir o excluir el dataset pretrained
|
558 |
-
include_pretrained = st.checkbox("Incluir dataset pretrained", value=True)
|
559 |
if not include_pretrained:
|
560 |
# Removemos la entrada pretrained del diccionario, si existe.
|
561 |
embeddings.pop("pretrained", None)
|
@@ -572,10 +563,10 @@ def run_model(model_name):
|
|
572 |
return
|
573 |
|
574 |
st.markdown('<h6 class="sub-title">Select Dimensionality Reduction Method</h6>', unsafe_allow_html=True)
|
575 |
-
reduction_method = st.selectbox("", options=["
|
576 |
|
577 |
distance_metric = st.selectbox("Select Distance Metric:",
|
578 |
-
options=["
|
579 |
key=f"distance_metric_{model_name}")
|
580 |
|
581 |
tsne_params = {}
|
@@ -628,13 +619,12 @@ def run_model(model_name):
|
|
628 |
st.write(f"Trustworthiness: {result['trustworthiness']:.4f}")
|
629 |
st.write(f"Continuity: {result['continuity']:.4f}")
|
630 |
|
631 |
-
#
|
632 |
if reduction_method == "PCA" and result.get("pca_model") is not None:
|
633 |
pca_model = result["pca_model"]
|
634 |
components = pca_model.components_ # Shape: (n_components, n_features)
|
635 |
|
636 |
-
st.subheader("Pesos de las Componentes Principales (Loadings)")
|
637 |
-
# Se crea un plot de barras por cada componente
|
638 |
for i, comp in enumerate(components):
|
639 |
source = ColumnDataSource(data=dict(
|
640 |
dimensions=embedding_cols,
|
@@ -642,11 +632,17 @@ def run_model(model_name):
|
|
642 |
))
|
643 |
p = figure(x_range=embedding_cols, title=f"Componente Principal {i+1}",
|
644 |
plot_height=400, plot_width=600,
|
645 |
-
toolbar_location=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
646 |
p.vbar(x='dimensions', top='weight', width=0.8, source=source)
|
647 |
-
# Ocultar etiquetas del eje x para un aspecto más limpio
|
648 |
p.xaxis.major_label_text_font_size = '0pt'
|
649 |
-
# Agregar HoverTool para mostrar la dimensión y su peso
|
650 |
hover = HoverTool(tooltips=[("Dimensi贸n", "@dimensions"), ("Peso", "@weight")])
|
651 |
p.add_tools(hover)
|
652 |
p.xaxis.axis_label = "Dimensiones originales"
|
@@ -709,7 +705,7 @@ def run_model(model_name):
|
|
709 |
layout = column(result["scatter_fig"], column(real_select, reset_button, data_table))
|
710 |
|
711 |
st.bokeh_chart(layout, use_container_width=True)
|
712 |
-
|
713 |
buffer = io.BytesIO()
|
714 |
df_table.to_excel(buffer, index=False)
|
715 |
buffer.seek(0)
|
@@ -722,6 +718,124 @@ def run_model(model_name):
|
|
722 |
key=f"download_button_excel_{model_name}"
|
723 |
)
|
724 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
725 |
def main():
|
726 |
config_style()
|
727 |
tabs = st.tabs(["Donut", "Idefics2"])
|
@@ -733,4 +847,4 @@ def main():
|
|
733 |
run_model("Idefics2")
|
734 |
|
735 |
if __name__ == "__main__":
|
736 |
-
main()
|
|
|
14 |
|
15 |
N_COMPONENTS = 2
|
16 |
TSNE_NEIGHBOURS = 150
|
17 |
+
WEIGHT_FACTOR = 0.1
|
18 |
|
19 |
TOOLTIPS = """
|
20 |
<div>
|
|
|
76 |
"pretrained": df_pretratrained}
|
77 |
|
78 |
elif model == "Idefics2":
|
79 |
+
df_real = pd.read_csv(f"data/idefics2/{version}/{embedding_prefix}/de_Rodrigo_merit_secret_britanico_{weight_factor}embeddings.csv")
|
80 |
+
df_par = pd.read_csv(f"data/idefics2/{version}/{embedding_prefix}/de_Rodrigo_merit_es-digital-paragraph-degradation-seq_{weight_factor}embeddings.csv")
|
81 |
+
df_line = pd.read_csv(f"data/idefics2/{version}/{embedding_prefix}/de_Rodrigo_merit_es-digital-line-degradation-seq_{weight_factor}embeddings.csv")
|
82 |
+
df_seq = pd.read_csv(f"data/idefics2/{version}/{embedding_prefix}/de_Rodrigo_merit_es-digital-seq_{weight_factor}embeddings.csv")
|
83 |
+
df_rot = pd.read_csv(f"data/idefics2/{version}/{embedding_prefix}/de_Rodrigo_merit_es-digital-rotation-degradation-seq_{weight_factor}embeddings.csv")
|
84 |
+
df_zoom = pd.read_csv(f"data/idefics2/{version}/{embedding_prefix}/de_Rodrigo_merit_es-digital-zoom-degradation-seq_{weight_factor}embeddings.csv")
|
85 |
+
df_render = pd.read_csv(f"data/idefics2/{version}/{embedding_prefix}/de_Rodrigo_merit_es-render-seq_{weight_factor}embeddings.csv")
|
86 |
|
87 |
# Cargar ambos subconjuntos pretrained y combinarlos
|
88 |
+
df_pretratrained_PDFA = pd.read_csv(f"data/idefics2/{version}/{embedding_prefix}/de_Rodrigo_merit_aux_PDFA_{weight_factor}embeddings.csv")
|
89 |
+
df_pretratrained_IDL = pd.read_csv(f"data/idefics2/{version}/{embedding_prefix}/de_Rodrigo_merit_aux_IDL_{weight_factor}embeddings.csv")
|
90 |
df_pretratrained = pd.concat([df_pretratrained_PDFA, df_pretratrained_IDL], ignore_index=True)
|
91 |
|
92 |
# Asignar etiquetas de versión
|
|
|
116 |
st.error("Modelo no reconocido")
|
117 |
return None
|
118 |
|
|
|
|
|
|
|
119 |
def split_versions(df_combined, reduced):
|
120 |
# Asignar las coordenadas si la reducción es 2D
|
121 |
if reduced.shape[1] == 2:
|
|
|
135 |
unique_subsets = {"real": unique_real, "synthetic": unique_synth, "pretrained": unique_pretrained}
|
136 |
return df_dict, unique_subsets
|
137 |
|
|
|
138 |
def get_embedding_from_df(df):
|
139 |
# Retorna el embedding completo (4 dimensiones en este caso) guardado en la columna 'embedding'
|
140 |
if 'embedding' in df.columns:
|
|
|
278 |
fig.legend.visible = show_legend
|
279 |
return fig, real_renderers, synthetic_renderers, pretrained_renderers
|
280 |
|
|
|
281 |
def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping, group_label):
|
282 |
renderers = {}
|
283 |
for label in selected_labels:
|
|
|
387 |
|
388 |
return color_map
|
389 |
|
|
|
390 |
def calculate_cluster_centers(df, labels):
|
391 |
centers = {}
|
392 |
for label in labels:
|
|
|
535 |
# Selector para el método de cómputo del embedding
|
536 |
embedding_computation = st.selectbox("驴C贸mo se computa el embedding?", options=["weighted", "averaged"], key=f"embedding_method_{model_name}")
|
537 |
# Se asigna el prefijo correspondiente
|
|
|
538 |
|
539 |
if embedding_computation == "weighted":
|
|
|
540 |
weight_factor = f"{WEIGHT_FACTOR}_"
|
541 |
else:
|
|
|
542 |
weight_factor = ""
|
543 |
|
544 |
embeddings = load_embeddings(model_name, version, embedding_computation, weight_factor)
|
|
|
546 |
return
|
547 |
|
548 |
# Nuevo selector para incluir o excluir el dataset pretrained
|
549 |
+
include_pretrained = st.checkbox("Incluir dataset pretrained", value=True, key=f"legend_{model_name}_pretrained")
|
550 |
if not include_pretrained:
|
551 |
# Removemos la entrada pretrained del diccionario, si existe.
|
552 |
embeddings.pop("pretrained", None)
|
|
|
563 |
return
|
564 |
|
565 |
st.markdown('<h6 class="sub-title">Select Dimensionality Reduction Method</h6>', unsafe_allow_html=True)
|
566 |
+
reduction_method = st.selectbox("", options=["PCA", "t-SNE"], key=f"reduction_{model_name}")
|
567 |
|
568 |
distance_metric = st.selectbox("Select Distance Metric:",
|
569 |
+
options=["Euclidean", "Wasserstein", "KL"],
|
570 |
key=f"distance_metric_{model_name}")
|
571 |
|
572 |
tsne_params = {}
|
|
|
619 |
st.write(f"Trustworthiness: {result['trustworthiness']:.4f}")
|
620 |
st.write(f"Continuity: {result['continuity']:.4f}")
|
621 |
|
622 |
+
# Mostrar los plots de loadings si se usó PCA (para el conjunto combinado)
|
623 |
if reduction_method == "PCA" and result.get("pca_model") is not None:
|
624 |
pca_model = result["pca_model"]
|
625 |
components = pca_model.components_ # Shape: (n_components, n_features)
|
626 |
|
627 |
+
st.subheader("Pesos de las Componentes Principales (Loadings) - Conjunto Combinado")
|
|
|
628 |
for i, comp in enumerate(components):
|
629 |
source = ColumnDataSource(data=dict(
|
630 |
dimensions=embedding_cols,
|
|
|
632 |
))
|
633 |
p = figure(x_range=embedding_cols, title=f"Componente Principal {i+1}",
|
634 |
plot_height=400, plot_width=600,
|
635 |
+
toolbar_location="above",
|
636 |
+
tools="pan,wheel_zoom,reset,save,hover",
|
637 |
+
active_scroll="wheel_zoom")
|
638 |
+
|
639 |
+
# Establecer fondo blanco
|
640 |
+
p.background_fill_color = "white"
|
641 |
+
# Mostrar solo grilla horizontal
|
642 |
+
p.xgrid.grid_line_color = None
|
643 |
+
p.ygrid.grid_line_color = "gray"
|
644 |
p.vbar(x='dimensions', top='weight', width=0.8, source=source)
|
|
|
645 |
p.xaxis.major_label_text_font_size = '0pt'
|
|
|
646 |
hover = HoverTool(tooltips=[("Dimensi贸n", "@dimensions"), ("Peso", "@weight")])
|
647 |
p.add_tools(hover)
|
648 |
p.xaxis.axis_label = "Dimensiones originales"
|
|
|
705 |
layout = column(result["scatter_fig"], column(real_select, reset_button, data_table))
|
706 |
|
707 |
st.bokeh_chart(layout, use_container_width=True)
|
708 |
+
|
709 |
buffer = io.BytesIO()
|
710 |
df_table.to_excel(buffer, index=False)
|
711 |
buffer.seek(0)
|
|
|
718 |
key=f"download_button_excel_{model_name}"
|
719 |
)
|
720 |
|
721 |
+
# Nuevo bloque: PCA solo para df_real
|
722 |
+
if reduction_method == "PCA":
|
723 |
+
st.markdown("## PCA - Solo Muestras Reales")
|
724 |
+
# Extraemos únicamente las muestras reales
|
725 |
+
df_real_only = embeddings["real"].copy()
|
726 |
+
pca_real = PCA(n_components=N_COMPONENTS)
|
727 |
+
reduced_real = pca_real.fit_transform(df_real_only[embedding_cols].values)
|
728 |
+
df_real_only['embedding'] = list(reduced_real)
|
729 |
+
if reduced_real.shape[1] == 2:
|
730 |
+
df_real_only['x'] = reduced_real[:, 0]
|
731 |
+
df_real_only['y'] = reduced_real[:, 1]
|
732 |
+
explained_variance_real = pca_real.explained_variance_ratio_
|
733 |
+
unique_labels_real = sorted(df_real_only['label'].unique().tolist())
|
734 |
+
|
735 |
+
# Definir mapeo de colores usando la paleta Reds9
|
736 |
+
num_labels = len(unique_labels_real)
|
737 |
+
if num_labels <= 9:
|
738 |
+
red_palette = Reds9[:num_labels]
|
739 |
+
else:
|
740 |
+
red_palette = (Reds9 * ((num_labels // 9) + 1))[:num_labels]
|
741 |
+
real_color_mapping = {label: red_palette[i] for i, label in enumerate(unique_labels_real)}
|
742 |
+
|
743 |
+
st.subheader("PCA - Real: Explained Variance Ratio")
|
744 |
+
component_names_real = [f"PC{i+1}" for i in range(len(explained_variance_real))]
|
745 |
+
variance_df_real = pd.DataFrame({
|
746 |
+
"Component": component_names_real,
|
747 |
+
"Explained Variance": explained_variance_real
|
748 |
+
})
|
749 |
+
st.table(variance_df_real)
|
750 |
+
|
751 |
+
# Agregar scatter plot para visualizar el PCA real
|
752 |
+
st.subheader("PCA - Real: Scatter Plot")
|
753 |
+
fig_real = figure(
|
754 |
+
title="PCA - Solo Real: Scatter Plot",
|
755 |
+
plot_width=600,
|
756 |
+
plot_height=600,
|
757 |
+
tools="pan,wheel_zoom,reset,save,hover",
|
758 |
+
active_scroll="wheel_zoom",
|
759 |
+
background_fill_color="white"
|
760 |
+
)
|
761 |
+
# Mostrar solo grid horizontal
|
762 |
+
fig_real.xgrid.grid_line_color = None
|
763 |
+
fig_real.ygrid.grid_line_color = "gray"
|
764 |
+
|
765 |
+
# Dibujar los puntos por cada etiqueta
|
766 |
+
for label in unique_labels_real:
|
767 |
+
subset = df_real_only[df_real_only['label'] == label]
|
768 |
+
source_scatter = ColumnDataSource(data={
|
769 |
+
'x': subset['x'],
|
770 |
+
'y': subset['y'],
|
771 |
+
'label': subset['label']
|
772 |
+
})
|
773 |
+
fig_real.circle('x', 'y', size=10,
|
774 |
+
fill_color=real_color_mapping[label],
|
775 |
+
line_color=real_color_mapping[label],
|
776 |
+
legend_label=label,
|
777 |
+
source=source_scatter)
|
778 |
+
|
779 |
+
# Calcular el centroide de todos los puntos
|
780 |
+
center_x = df_real_only['x'].mean()
|
781 |
+
center_y = df_real_only['y'].mean()
|
782 |
+
|
783 |
+
# Calcular el radio como la máxima distancia desde el centroide
|
784 |
+
distances = np.sqrt((df_real_only['x'] - center_x)**2 + (df_real_only['y'] - center_y)**2)
|
785 |
+
radius = distances.max()
|
786 |
+
|
787 |
+
# Dibujar el centroide
|
788 |
+
fig_real.circle(x=center_x, y=center_y, size=15,
|
789 |
+
fill_color="black", line_color="black", legend_label="Centroide")
|
790 |
+
|
791 |
+
# Dibujar la circunferencia (con línea discontinua)
|
792 |
+
fig_real.circle(x=center_x, y=center_y, radius=radius,
|
793 |
+
fill_color=None, line_color="black", line_dash="dashed", legend_label="Circunferencia")
|
794 |
+
|
795 |
+
fig_real.xaxis.axis_label = "PC1"
|
796 |
+
fig_real.yaxis.axis_label = "PC2"
|
797 |
+
|
798 |
+
hover_scatter = fig_real.select_one(HoverTool)
|
799 |
+
hover_scatter.tooltips = [("Label", "@label"), ("PC1", "@x"), ("PC2", "@y")]
|
800 |
+
|
801 |
+
fig_real.legend.location = "top_right"
|
802 |
+
st.bokeh_chart(fig_real)
|
803 |
+
|
804 |
+
# Mostrar el valor del radio debajo del gráfico
|
805 |
+
st.write(f"El radio de la circunferencia es: {int(radius)}")
|
806 |
+
|
807 |
+
# Mostrar los plots de loadings (Component Loadings)
|
808 |
+
st.subheader("PCA - Real: Component Loadings")
|
809 |
+
st.markdown("### Pesos de las Componentes Principales (Loadings) - Conjunto Combinado")
|
810 |
+
for i, comp in enumerate(pca_real.components_):
|
811 |
+
source = ColumnDataSource(data=dict(
|
812 |
+
dimensions=embedding_cols,
|
813 |
+
weight=comp
|
814 |
+
))
|
815 |
+
p = figure(
|
816 |
+
x_range=embedding_cols,
|
817 |
+
title=f"Componente Principal {i+1}",
|
818 |
+
plot_height=400,
|
819 |
+
plot_width=600,
|
820 |
+
toolbar_location="above",
|
821 |
+
tools="pan,wheel_zoom,reset,save,hover",
|
822 |
+
active_scroll="wheel_zoom"
|
823 |
+
)
|
824 |
+
# Fondo blanco y solo grid horizontal
|
825 |
+
p.background_fill_color = "white"
|
826 |
+
p.xgrid.grid_line_color = None
|
827 |
+
p.ygrid.grid_line_color = "gray"
|
828 |
+
p.vbar(x='dimensions', top='weight', width=0.8, source=source,
|
829 |
+
fill_color="#2b83ba", line_color="#2b83ba")
|
830 |
+
# No se muestran etiquetas en el eje horizontal
|
831 |
+
p.xaxis.axis_label = "Dimensiones Originales"
|
832 |
+
p.xaxis.major_label_text_font_size = '0pt'
|
833 |
+
# Configurar el HoverTool
|
834 |
+
hover = p.select_one(HoverTool)
|
835 |
+
hover.tooltips = [("Dimensi贸n", "@dimensions"), ("Peso", "@weight")]
|
836 |
+
st.bokeh_chart(p)
|
837 |
+
|
838 |
+
|
839 |
def main():
|
840 |
config_style()
|
841 |
tabs = st.tabs(["Donut", "Idefics2"])
|
|
|
847 |
run_model("Idefics2")
|
848 |
|
849 |
if __name__ == "__main__":
|
850 |
+
main()
|