Spaces:
Running
Running
Commit
·
eff2e30
1
Parent(s):
d5b8742
Replicate PCA Considering Only Real Samples
Browse files
app.py
CHANGED
@@ -14,7 +14,7 @@ from sklearn.linear_model import LinearRegression
|
|
14 |
|
15 |
N_COMPONENTS = 2
|
16 |
TSNE_NEIGHBOURS = 150
|
17 |
-
WEIGHT_FACTOR = 0.
|
18 |
|
19 |
TOOLTIPS = """
|
20 |
<div>
|
@@ -747,62 +747,6 @@ def run_model(model_name):
|
|
747 |
"Explained Variance": explained_variance_real
|
748 |
})
|
749 |
st.table(variance_df_real)
|
750 |
-
|
751 |
-
# Agregar scatter plot para visualizar el PCA real
|
752 |
-
st.subheader("PCA - Real: Scatter Plot")
|
753 |
-
fig_real = figure(
|
754 |
-
title="PCA - Solo Real: Scatter Plot",
|
755 |
-
plot_width=600,
|
756 |
-
plot_height=600,
|
757 |
-
tools="pan,wheel_zoom,reset,save,hover",
|
758 |
-
active_scroll="wheel_zoom",
|
759 |
-
background_fill_color="white"
|
760 |
-
)
|
761 |
-
# Mostrar solo grid horizontal
|
762 |
-
fig_real.xgrid.grid_line_color = None
|
763 |
-
fig_real.ygrid.grid_line_color = "gray"
|
764 |
-
|
765 |
-
# Dibujar los puntos por cada etiqueta
|
766 |
-
for label in unique_labels_real:
|
767 |
-
subset = df_real_only[df_real_only['label'] == label]
|
768 |
-
source_scatter = ColumnDataSource(data={
|
769 |
-
'x': subset['x'],
|
770 |
-
'y': subset['y'],
|
771 |
-
'label': subset['label']
|
772 |
-
})
|
773 |
-
fig_real.circle('x', 'y', size=10,
|
774 |
-
fill_color=real_color_mapping[label],
|
775 |
-
line_color=real_color_mapping[label],
|
776 |
-
legend_label=label,
|
777 |
-
source=source_scatter)
|
778 |
-
|
779 |
-
# Calcular el centroide de todos los puntos
|
780 |
-
center_x = df_real_only['x'].mean()
|
781 |
-
center_y = df_real_only['y'].mean()
|
782 |
-
|
783 |
-
# Calcular el radio como la máxima distancia desde el centroide
|
784 |
-
distances = np.sqrt((df_real_only['x'] - center_x)**2 + (df_real_only['y'] - center_y)**2)
|
785 |
-
radius = distances.max()
|
786 |
-
|
787 |
-
# Dibujar el centroide
|
788 |
-
fig_real.circle(x=center_x, y=center_y, size=15,
|
789 |
-
fill_color="black", line_color="black", legend_label="Centroide")
|
790 |
-
|
791 |
-
# Dibujar la circunferencia (con línea discontinua)
|
792 |
-
fig_real.circle(x=center_x, y=center_y, radius=radius,
|
793 |
-
fill_color=None, line_color="black", line_dash="dashed", legend_label="Circunferencia")
|
794 |
-
|
795 |
-
fig_real.xaxis.axis_label = "PC1"
|
796 |
-
fig_real.yaxis.axis_label = "PC2"
|
797 |
-
|
798 |
-
hover_scatter = fig_real.select_one(HoverTool)
|
799 |
-
hover_scatter.tooltips = [("Label", "@label"), ("PC1", "@x"), ("PC2", "@y")]
|
800 |
-
|
801 |
-
fig_real.legend.location = "top_right"
|
802 |
-
st.bokeh_chart(fig_real)
|
803 |
-
|
804 |
-
# Mostrar el valor del radio debajo del gráfico
|
805 |
-
st.write(f"El radio de la circunferencia es: {int(radius)}")
|
806 |
|
807 |
# Mostrar los plots de loadings (Component Loadings)
|
808 |
st.subheader("PCA - Real: Component Loadings")
|
@@ -834,6 +778,250 @@ def run_model(model_name):
|
|
834 |
hover = p.select_one(HoverTool)
|
835 |
hover.tooltips = [("Dimensi贸n", "@dimensions"), ("Peso", "@weight")]
|
836 |
st.bokeh_chart(p)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
837 |
|
838 |
|
839 |
def main():
|
|
|
14 |
|
15 |
N_COMPONENTS = 2
|
16 |
TSNE_NEIGHBOURS = 150
|
17 |
+
WEIGHT_FACTOR = 0.05
|
18 |
|
19 |
TOOLTIPS = """
|
20 |
<div>
|
|
|
747 |
"Explained Variance": explained_variance_real
|
748 |
})
|
749 |
st.table(variance_df_real)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
750 |
|
751 |
# Mostrar los plots de loadings (Component Loadings)
|
752 |
st.subheader("PCA - Real: Component Loadings")
|
|
|
778 |
hover = p.select_one(HoverTool)
|
779 |
hover.tooltips = [("Dimensi贸n", "@dimensions"), ("Peso", "@weight")]
|
780 |
st.bokeh_chart(p)
|
781 |
+
|
782 |
+
# Segundo PCA: Proyección de todos los subconjuntos usando los loadings calculados con df_real_only
|
783 |
+
st.subheader("PCA - Todos los subconjuntos proyectados (usando loadings de df_real)")
|
784 |
+
|
785 |
+
# Crear un diccionario para almacenar las proyecciones usando el PCA calculado con las muestras reales (pca_real)
|
786 |
+
df_all = {}
|
787 |
+
|
788 |
+
# Proyectar las muestras reales
|
789 |
+
df_real_proj = embeddings["real"].copy()
|
790 |
+
proj_real = pca_real.transform(df_real_proj[embedding_cols].values)
|
791 |
+
df_real_proj['pc1'] = proj_real[:, 0]
|
792 |
+
df_real_proj['pc2'] = proj_real[:, 1]
|
793 |
+
df_all["real"] = df_real_proj
|
794 |
+
|
795 |
+
# Proyectar el subconjunto synthetic, si existe
|
796 |
+
if "synthetic" in embeddings:
|
797 |
+
df_synth_proj = embeddings["synthetic"].copy()
|
798 |
+
proj_synth = pca_real.transform(df_synth_proj[embedding_cols].values)
|
799 |
+
df_synth_proj['pc1'] = proj_synth[:, 0]
|
800 |
+
df_synth_proj['pc2'] = proj_synth[:, 1]
|
801 |
+
df_all["synthetic"] = df_synth_proj
|
802 |
+
|
803 |
+
# Proyectar el subconjunto pretrained, si existe
|
804 |
+
if "pretrained" in embeddings:
|
805 |
+
df_pretr_proj = embeddings["pretrained"].copy()
|
806 |
+
proj_pretr = pca_real.transform(df_pretr_proj[embedding_cols].values)
|
807 |
+
df_pretr_proj['pc1'] = proj_pretr[:, 0]
|
808 |
+
df_pretr_proj['pc2'] = proj_pretr[:, 1]
|
809 |
+
df_all["pretrained"] = df_pretr_proj
|
810 |
+
|
811 |
+
# Para utilizar las mismas funciones de plot (create_figure, add_dataset_to_fig, add_synthetic_dataset_to_fig),
|
812 |
+
# renombramos las columnas 'pc1' y 'pc2' a 'x' y 'y' en cada dataframe
|
813 |
+
for key in df_all:
|
814 |
+
df_all[key]["x"] = df_all[key]["pc1"]
|
815 |
+
df_all[key]["y"] = df_all[key]["pc2"]
|
816 |
+
|
817 |
+
# Construir los subconjuntos únicos con la granularidad deseada:
|
818 |
+
# - Para "real" y "pretrained": agrupamos por label.
|
819 |
+
# - Para "synthetic": agrupamos por la columna "source" (cada source tendrá sus labels).
|
820 |
+
unique_subsets = {}
|
821 |
+
# Real:
|
822 |
+
unique_subsets["real"] = sorted(df_all["real"]['label'].unique().tolist())
|
823 |
+
# Synthetic:
|
824 |
+
if "synthetic" in df_all:
|
825 |
+
unique_synth = {}
|
826 |
+
for source in df_all["synthetic"]["source"].unique():
|
827 |
+
unique_synth[source] = sorted(df_all["synthetic"][df_all["synthetic"]["source"] == source]['label'].unique().tolist())
|
828 |
+
unique_subsets["synthetic"] = unique_synth
|
829 |
+
else:
|
830 |
+
unique_subsets["synthetic"] = {}
|
831 |
+
# Pretrained:
|
832 |
+
if "pretrained" in df_all:
|
833 |
+
unique_subsets["pretrained"] = sorted(df_all["pretrained"]['label'].unique().tolist())
|
834 |
+
else:
|
835 |
+
unique_subsets["pretrained"] = []
|
836 |
+
|
837 |
+
# Obtener los mapeos de colores utilizando la función ya definida
|
838 |
+
color_maps = get_color_maps(unique_subsets)
|
839 |
+
|
840 |
+
# Definir un mapeo de marcadores para los subconjuntos synthetic (granularidad por source)
|
841 |
+
marker_mapping = {
|
842 |
+
"es-digital-paragraph-degradation-seq": "x",
|
843 |
+
"es-digital-line-degradation-seq": "cross",
|
844 |
+
"es-digital-seq": "triangle",
|
845 |
+
"es-digital-rotation-degradation-seq": "diamond",
|
846 |
+
"es-digital-zoom-degradation-seq": "asterisk",
|
847 |
+
"es-render-seq": "inverted_triangle"
|
848 |
+
}
|
849 |
+
|
850 |
+
# Ahora, crear la figura utilizando las funciones existentes para mantener la granularidad:
|
851 |
+
# Se plotean las muestras reales, synthetic (por source) y pretrained con sus respectivos marcadores y colores.
|
852 |
+
fig_all = figure(
|
853 |
+
title="PCA - Todos los subconjuntos proyectados",
|
854 |
+
plot_width=600,
|
855 |
+
plot_height=600,
|
856 |
+
tools="pan,wheel_zoom,reset,save,hover",
|
857 |
+
active_scroll="wheel_zoom",
|
858 |
+
background_fill_color="white"
|
859 |
+
)
|
860 |
+
# Solo grid horizontal
|
861 |
+
fig_all.xgrid.grid_line_color = None
|
862 |
+
fig_all.ygrid.grid_line_color = "gray"
|
863 |
+
|
864 |
+
# Ploteamos los puntos de las muestras reales (agrupados por label)
|
865 |
+
for label in unique_subsets["real"]:
|
866 |
+
subset = df_all["real"][df_all["real"]['label'] == label]
|
867 |
+
source = ColumnDataSource(data={
|
868 |
+
'x': subset['x'],
|
869 |
+
'y': subset['y'],
|
870 |
+
'label': subset['label']
|
871 |
+
})
|
872 |
+
# Usamos 'circle' para las reales
|
873 |
+
fig_all.circle('x', 'y', size=10,
|
874 |
+
fill_color=color_maps["real"][label],
|
875 |
+
line_color=color_maps["real"][label],
|
876 |
+
legend_label=f"Real: {label}",
|
877 |
+
source=source)
|
878 |
+
|
879 |
+
# Ploteamos los puntos de synthetic, diferenciando cada source con su marcador
|
880 |
+
if unique_subsets["synthetic"]:
|
881 |
+
for source_name, labels in unique_subsets["synthetic"].items():
|
882 |
+
df_source = df_all["synthetic"][df_all["synthetic"]["source"] == source_name]
|
883 |
+
marker = marker_mapping.get(source_name, "square")
|
884 |
+
# Para cada label en ese source, usamos la función auxiliar
|
885 |
+
renderers = add_synthetic_dataset_to_fig(fig_all, df_source, labels,
|
886 |
+
marker=marker,
|
887 |
+
color_mapping=color_maps["synthetic"][source_name],
|
888 |
+
group_label=source_name)
|
889 |
+
# Ploteamos los puntos de pretrained (agrupados por label)
|
890 |
+
if unique_subsets["pretrained"]:
|
891 |
+
for label in unique_subsets["pretrained"]:
|
892 |
+
subset = df_all["pretrained"][df_all["pretrained"]['label'] == label]
|
893 |
+
source = ColumnDataSource(data={
|
894 |
+
'x': subset['x'],
|
895 |
+
'y': subset['y'],
|
896 |
+
'label': subset['label']
|
897 |
+
})
|
898 |
+
# Usamos 'triangle' para pretrained (por ejemplo)
|
899 |
+
fig_all.triangle('x', 'y', size=10,
|
900 |
+
fill_color=color_maps["pretrained"][label],
|
901 |
+
line_color=color_maps["pretrained"][label],
|
902 |
+
legend_label=f"Pretrained: {label}",
|
903 |
+
source=source)
|
904 |
+
|
905 |
+
# Calcular el centroide y el radio (usando solo las muestras reales)
|
906 |
+
center_x = df_all["real"]['x'].mean()
|
907 |
+
center_y = df_all["real"]['y'].mean()
|
908 |
+
distances = np.sqrt((df_all["real"]['x'] - center_x)**2 + (df_all["real"]['y'] - center_y)**2)
|
909 |
+
radius = distances.max()
|
910 |
+
|
911 |
+
# Dibujar el centroide y la circunferencia en el plot
|
912 |
+
fig_all.circle(x=center_x, y=center_y, size=15,
|
913 |
+
fill_color="black", line_color="black", legend_label="Centroide")
|
914 |
+
fig_all.circle(x=center_x, y=center_y, radius=radius,
|
915 |
+
fill_color=None, line_color="black", line_dash="dashed", legend_label="Circunferencia")
|
916 |
+
|
917 |
+
fig_all.xaxis.axis_label = "PC1"
|
918 |
+
fig_all.yaxis.axis_label = "PC2"
|
919 |
+
hover_all = fig_all.select_one(HoverTool)
|
920 |
+
hover_all.tooltips = [("Label", "@label"), ("PC1", "@x"), ("PC2", "@y")]
|
921 |
+
|
922 |
+
# Agregar checkbox para mostrar u ocultar la leyenda, igual que en el primer PCA
|
923 |
+
show_legend_second = st.checkbox("Show Legend", value=False, key=f"legend_second_{model_name}")
|
924 |
+
fig_all.legend.visible = show_legend_second
|
925 |
+
fig_all.legend.location = "top_right"
|
926 |
+
|
927 |
+
st.bokeh_chart(fig_all)
|
928 |
+
|
929 |
+
# Mostrar el valor del radio debajo del gráfico
|
930 |
+
st.write(f"El radio de la circunferencia (calculado a partir de las muestras reales) es: {radius:.4f}")
|
931 |
+
|
932 |
+
|
933 |
+
# --- Cálculo de distancias y scatter plot de Distance vs F1 para el nuevo PCA ---
|
934 |
+
|
935 |
+
# Se calcula la distancia de cada subset synthetic a cada subset real usando los datos proyectados (df_all)
|
936 |
+
# Se utiliza la función compute_cluster_distances_synthetic_individual ya definida
|
937 |
+
real_labels_new = sorted(df_all["real"]['label'].unique().tolist())
|
938 |
+
df_distances_new = compute_cluster_distances_synthetic_individual(
|
939 |
+
df_all["synthetic"],
|
940 |
+
df_all["real"],
|
941 |
+
real_labels_new,
|
942 |
+
metric="wasserstein", # Puedes cambiar la métrica según lo requieras
|
943 |
+
bins=20
|
944 |
+
)
|
945 |
+
|
946 |
+
# Extraer las distancias globales (por cada source) del dataframe obtenido,
|
947 |
+
# buscando filas cuyo índice comience con "Global" (formato "Global (source)")
|
948 |
+
global_distances_new = {}
|
949 |
+
for idx in df_distances_new.index:
|
950 |
+
if idx.startswith("Global"):
|
951 |
+
source_name = idx.split("(")[1].rstrip(")")
|
952 |
+
global_distances_new[source_name] = df_distances_new.loc[idx].values
|
953 |
+
|
954 |
+
# Ahora, relacionar estas distancias con los valores de F1 (ya cargados en df_f1)
|
955 |
+
all_x_new = []
|
956 |
+
all_y_new = []
|
957 |
+
for source in df_f1.columns:
|
958 |
+
if source in global_distances_new:
|
959 |
+
x_vals = global_distances_new[source]
|
960 |
+
y_vals = df_f1[source].values
|
961 |
+
all_x_new.extend(x_vals)
|
962 |
+
all_y_new.extend(y_vals)
|
963 |
+
all_x_arr_new = np.array(all_x_new).reshape(-1, 1)
|
964 |
+
all_y_arr_new = np.array(all_y_new)
|
965 |
+
|
966 |
+
# Realizar la regresión lineal global sobre estos datos
|
967 |
+
model_global_new = LinearRegression().fit(all_x_arr_new, all_y_arr_new)
|
968 |
+
r2_new = model_global_new.score(all_x_arr_new, all_y_arr_new)
|
969 |
+
slope_new = model_global_new.coef_[0]
|
970 |
+
intercept_new = model_global_new.intercept_
|
971 |
+
|
972 |
+
# Crear el scatter plot
|
973 |
+
scatter_fig_new = figure(
|
974 |
+
width=600,
|
975 |
+
height=600,
|
976 |
+
tools="pan,wheel_zoom,reset,save,hover",
|
977 |
+
active_scroll="wheel_zoom",
|
978 |
+
title="Scatter Plot: Distance vs F1 (Nueva PCA)",
|
979 |
+
background_fill_color="white"
|
980 |
+
)
|
981 |
+
# Configurar únicamente grid horizontal
|
982 |
+
scatter_fig_new.xgrid.grid_line_color = None
|
983 |
+
scatter_fig_new.ygrid.grid_line_color = "gray"
|
984 |
+
|
985 |
+
# Mantenemos el mismo código de colores que en el otro scatter plot
|
986 |
+
source_colors = {
|
987 |
+
"es-digital-paragraph-degradation-seq": "blue",
|
988 |
+
"es-digital-line-degradation-seq": "green",
|
989 |
+
"es-digital-seq": "red",
|
990 |
+
"es-digital-zoom-degradation-seq": "orange",
|
991 |
+
"es-digital-rotation-degradation-seq": "purple",
|
992 |
+
"es-digital-rotation-zoom-degradation-seq": "brown",
|
993 |
+
"es-render-seq": "cyan"
|
994 |
+
}
|
995 |
+
|
996 |
+
# Dibujar cada conjunto: para cada source (por ejemplo, es-render-seq, etc.)
|
997 |
+
for source in df_f1.columns:
|
998 |
+
if source in global_distances_new:
|
999 |
+
x_vals = global_distances_new[source]
|
1000 |
+
y_vals = df_f1[source].values
|
1001 |
+
data = {"x": x_vals, "y": y_vals, "Fuente": [source]*len(x_vals)}
|
1002 |
+
cds = ColumnDataSource(data=data)
|
1003 |
+
scatter_fig_new.circle(
|
1004 |
+
'x', 'y', size=8, alpha=0.7, source=cds,
|
1005 |
+
fill_color=source_colors.get(source, "gray"),
|
1006 |
+
line_color=source_colors.get(source, "gray"),
|
1007 |
+
legend_label=source
|
1008 |
+
)
|
1009 |
+
|
1010 |
+
scatter_fig_new.xaxis.axis_label = "Distance (Global, por Colegio) - Nueva PCA"
|
1011 |
+
scatter_fig_new.yaxis.axis_label = "F1 Score"
|
1012 |
+
scatter_fig_new.legend.location = "top_right"
|
1013 |
+
|
1014 |
+
hover_tool_new = scatter_fig_new.select_one(HoverTool)
|
1015 |
+
hover_tool_new.tooltips = [("Distance", "@x"), ("F1", "@y"), ("Subset", "@Fuente")]
|
1016 |
+
|
1017 |
+
# Dibujar la línea de regresión global
|
1018 |
+
x_line_new = np.linspace(all_x_arr_new.min(), all_x_arr_new.max(), 100)
|
1019 |
+
y_line_new = model_global_new.predict(x_line_new.reshape(-1,1))
|
1020 |
+
scatter_fig_new.line(x_line_new, y_line_new, line_width=2, line_color="black", legend_label="Global Regression")
|
1021 |
+
|
1022 |
+
st.bokeh_chart(scatter_fig_new)
|
1023 |
+
|
1024 |
+
st.write(f"Regresi贸n global (Nueva PCA): R虏 = {r2_new:.4f}, Slope = {slope_new:.4f}, Intercept = {intercept_new:.4f}")
|
1025 |
|
1026 |
|
1027 |
def main():
|