Spaces:

de-Rodrigo
/

Embeddings

Running

App Files Files Community

de-Rodrigo committed on Mar 24

Commit

eff2e30

1 Parent(s): d5b8742

Replicate PCA Considering Only Real Samples

Browse files

Files changed (1) hide show

app.py +245 -57

app.py CHANGED Viewed

@@ -14,7 +14,7 @@ from sklearn.linear_model import LinearRegression
 N_COMPONENTS = 2
 TSNE_NEIGHBOURS = 150
-WEIGHT_FACTOR = 0.1
 TOOLTIPS = """
 <div>
@@ -747,62 +747,6 @@ def run_model(model_name):
             "Explained Variance": explained_variance_real
         })
         st.table(variance_df_real)
-        # Agregar scatter plot para visualizar el PCA real
-        st.subheader("PCA - Real: Scatter Plot")
-        fig_real = figure(
-            title="PCA - Solo Real: Scatter Plot",
-            plot_width=600,
-            plot_height=600,
-            tools="pan,wheel_zoom,reset,save,hover",
-            active_scroll="wheel_zoom",
-            background_fill_color="white"
-        )
-        # Mostrar solo grid horizontal
-        fig_real.xgrid.grid_line_color = None
-        fig_real.ygrid.grid_line_color = "gray"
-        # Dibujar los puntos por cada etiqueta
-        for label in unique_labels_real:
-            subset = df_real_only[df_real_only['label'] == label]
-            source_scatter = ColumnDataSource(data={
-                'x': subset['x'],
-                'y': subset['y'],
-                'label': subset['label']
-            })
-            fig_real.circle('x', 'y', size=10,
-                            fill_color=real_color_mapping[label],
-                            line_color=real_color_mapping[label],
-                            legend_label=label,
-                            source=source_scatter)
-        # Calcular el centroide de todos los puntos
-        center_x = df_real_only['x'].mean()
-        center_y = df_real_only['y'].mean()
-        # Calcular el radio como la máxima distancia desde el centroide
-        distances = np.sqrt((df_real_only['x'] - center_x)**2 + (df_real_only['y'] - center_y)**2)
-        radius = distances.max()
-        # Dibujar el centroide
-        fig_real.circle(x=center_x, y=center_y, size=15,
-                        fill_color="black", line_color="black", legend_label="Centroide")
-        # Dibujar la circunferencia (con línea discontinua)
-        fig_real.circle(x=center_x, y=center_y, radius=radius,
-                        fill_color=None, line_color="black", line_dash="dashed", legend_label="Circunferencia")
-        fig_real.xaxis.axis_label = "PC1"
-        fig_real.yaxis.axis_label = "PC2"
-        hover_scatter = fig_real.select_one(HoverTool)
-        hover_scatter.tooltips = [("Label", "@label"), ("PC1", "@x"), ("PC2", "@y")]
-        fig_real.legend.location = "top_right"
-        st.bokeh_chart(fig_real)
-        # Mostrar el valor del radio debajo del gráfico
-        st.write(f"El radio de la circunferencia es: {int(radius)}")
         # Mostrar los plots de loadings (Component Loadings)
         st.subheader("PCA - Real: Component Loadings")
@@ -834,6 +778,250 @@ def run_model(model_name):
             hover = p.select_one(HoverTool)
             hover.tooltips = [("Dimensión", "@dimensions"), ("Peso", "@weight")]
             st.bokeh_chart(p)
 def main():

 N_COMPONENTS = 2
 TSNE_NEIGHBOURS = 150
+WEIGHT_FACTOR = 0.05
 TOOLTIPS = """
 <div>
             "Explained Variance": explained_variance_real
         })
         st.table(variance_df_real)
         # Mostrar los plots de loadings (Component Loadings)
         st.subheader("PCA - Real: Component Loadings")
             hover = p.select_one(HoverTool)
             hover.tooltips = [("Dimensión", "@dimensions"), ("Peso", "@weight")]
             st.bokeh_chart(p)
+        # Segundo PCA: Proyección de todos los subconjuntos usando los loadings calculados con df_real_only
+        st.subheader("PCA - Todos los subconjuntos proyectados (usando loadings de df_real)")
+        # Crear un diccionario para almacenar las proyecciones usando el PCA calculado con las muestras reales (pca_real)
+        df_all = {}
+        # Proyectar las muestras reales
+        df_real_proj = embeddings["real"].copy()
+        proj_real = pca_real.transform(df_real_proj[embedding_cols].values)
+        df_real_proj['pc1'] = proj_real[:, 0]
+        df_real_proj['pc2'] = proj_real[:, 1]
+        df_all["real"] = df_real_proj
+        # Proyectar el subconjunto synthetic, si existe
+        if "synthetic" in embeddings:
+            df_synth_proj = embeddings["synthetic"].copy()
+            proj_synth = pca_real.transform(df_synth_proj[embedding_cols].values)
+            df_synth_proj['pc1'] = proj_synth[:, 0]
+            df_synth_proj['pc2'] = proj_synth[:, 1]
+            df_all["synthetic"] = df_synth_proj
+        # Proyectar el subconjunto pretrained, si existe
+        if "pretrained" in embeddings:
+            df_pretr_proj = embeddings["pretrained"].copy()
+            proj_pretr = pca_real.transform(df_pretr_proj[embedding_cols].values)
+            df_pretr_proj['pc1'] = proj_pretr[:, 0]
+            df_pretr_proj['pc2'] = proj_pretr[:, 1]
+            df_all["pretrained"] = df_pretr_proj
+        # Para utilizar las mismas funciones de plot (create_figure, add_dataset_to_fig, add_synthetic_dataset_to_fig),
+        # renombramos las columnas 'pc1' y 'pc2' a 'x' y 'y' en cada dataframe
+        for key in df_all:
+            df_all[key]["x"] = df_all[key]["pc1"]
+            df_all[key]["y"] = df_all[key]["pc2"]
+        # Construir los subconjuntos únicos con la granularidad deseada:
+        # - Para "real" y "pretrained": agrupamos por label.
+        # - Para "synthetic": agrupamos por la columna "source" (cada source tendrá sus labels).
+        unique_subsets = {}
+        # Real:
+        unique_subsets["real"] = sorted(df_all["real"]['label'].unique().tolist())
+        # Synthetic:
+        if "synthetic" in df_all:
+            unique_synth = {}
+            for source in df_all["synthetic"]["source"].unique():
+                unique_synth[source] = sorted(df_all["synthetic"][df_all["synthetic"]["source"] == source]['label'].unique().tolist())
+            unique_subsets["synthetic"] = unique_synth
+        else:
+            unique_subsets["synthetic"] = {}
+        # Pretrained:
+        if "pretrained" in df_all:
+            unique_subsets["pretrained"] = sorted(df_all["pretrained"]['label'].unique().tolist())
+        else:
+            unique_subsets["pretrained"] = []
+        # Obtener los mapeos de colores utilizando la función ya definida
+        color_maps = get_color_maps(unique_subsets)
+        # Definir un mapeo de marcadores para los subconjuntos synthetic (granularidad por source)
+        marker_mapping = {
+            "es-digital-paragraph-degradation-seq": "x",
+            "es-digital-line-degradation-seq": "cross",
+            "es-digital-seq": "triangle",
+            "es-digital-rotation-degradation-seq": "diamond",
+            "es-digital-zoom-degradation-seq": "asterisk",
+            "es-render-seq": "inverted_triangle"
+        }
+        # Ahora, crear la figura utilizando las funciones existentes para mantener la granularidad:
+        # Se plotean las muestras reales, synthetic (por source) y pretrained con sus respectivos marcadores y colores.
+        fig_all = figure(
+            title="PCA - Todos los subconjuntos proyectados",
+            plot_width=600,
+            plot_height=600,
+            tools="pan,wheel_zoom,reset,save,hover",
+            active_scroll="wheel_zoom",
+            background_fill_color="white"
+        )
+        # Solo grid horizontal
+        fig_all.xgrid.grid_line_color = None
+        fig_all.ygrid.grid_line_color = "gray"
+        # Ploteamos los puntos de las muestras reales (agrupados por label)
+        for label in unique_subsets["real"]:
+            subset = df_all["real"][df_all["real"]['label'] == label]
+            source = ColumnDataSource(data={
+                'x': subset['x'],
+                'y': subset['y'],
+                'label': subset['label']
+            })
+            # Usamos 'circle' para las reales
+            fig_all.circle('x', 'y', size=10,
+                        fill_color=color_maps["real"][label],
+                        line_color=color_maps["real"][label],
+                        legend_label=f"Real: {label}",
+                        source=source)
+        # Ploteamos los puntos de synthetic, diferenciando cada source con su marcador
+        if unique_subsets["synthetic"]:
+            for source_name, labels in unique_subsets["synthetic"].items():
+                df_source = df_all["synthetic"][df_all["synthetic"]["source"] == source_name]
+                marker = marker_mapping.get(source_name, "square")
+                # Para cada label en ese source, usamos la función auxiliar
+                renderers = add_synthetic_dataset_to_fig(fig_all, df_source, labels,
+                                                        marker=marker,
+                                                        color_mapping=color_maps["synthetic"][source_name],
+                                                        group_label=source_name)
+        # Ploteamos los puntos de pretrained (agrupados por label)
+        if unique_subsets["pretrained"]:
+            for label in unique_subsets["pretrained"]:
+                subset = df_all["pretrained"][df_all["pretrained"]['label'] == label]
+                source = ColumnDataSource(data={
+                    'x': subset['x'],
+                    'y': subset['y'],
+                    'label': subset['label']
+                })
+                # Usamos 'triangle' para pretrained (por ejemplo)
+                fig_all.triangle('x', 'y', size=10,
+                                fill_color=color_maps["pretrained"][label],
+                                line_color=color_maps["pretrained"][label],
+                                legend_label=f"Pretrained: {label}",
+                                source=source)
+        # Calcular el centroide y el radio (usando solo las muestras reales)
+        center_x = df_all["real"]['x'].mean()
+        center_y = df_all["real"]['y'].mean()
+        distances = np.sqrt((df_all["real"]['x'] - center_x)**2 + (df_all["real"]['y'] - center_y)**2)
+        radius = distances.max()
+        # Dibujar el centroide y la circunferencia en el plot
+        fig_all.circle(x=center_x, y=center_y, size=15,
+                    fill_color="black", line_color="black", legend_label="Centroide")
+        fig_all.circle(x=center_x, y=center_y, radius=radius,
+                    fill_color=None, line_color="black", line_dash="dashed", legend_label="Circunferencia")
+        fig_all.xaxis.axis_label = "PC1"
+        fig_all.yaxis.axis_label = "PC2"
+        hover_all = fig_all.select_one(HoverTool)
+        hover_all.tooltips = [("Label", "@label"), ("PC1", "@x"), ("PC2", "@y")]
+        # Agregar checkbox para mostrar u ocultar la leyenda, igual que en el primer PCA
+        show_legend_second = st.checkbox("Show Legend", value=False, key=f"legend_second_{model_name}")
+        fig_all.legend.visible = show_legend_second
+        fig_all.legend.location = "top_right"
+        st.bokeh_chart(fig_all)
+        # Mostrar el valor del radio debajo del gráfico
+        st.write(f"El radio de la circunferencia (calculado a partir de las muestras reales) es: {radius:.4f}")
+        # --- Cálculo de distancias y scatter plot de Distance vs F1 para el nuevo PCA ---
+        # Se calcula la distancia de cada subset synthetic a cada subset real usando los datos proyectados (df_all)
+        # Se utiliza la función compute_cluster_distances_synthetic_individual ya definida
+        real_labels_new = sorted(df_all["real"]['label'].unique().tolist())
+        df_distances_new = compute_cluster_distances_synthetic_individual(
+            df_all["synthetic"],
+            df_all["real"],
+            real_labels_new,
+            metric="wasserstein",  # Puedes cambiar la métrica según lo requieras
+            bins=20
+        )
+        # Extraer las distancias globales (por cada source) del dataframe obtenido,
+        # buscando filas cuyo índice comience con "Global" (formato "Global (source)")
+        global_distances_new = {}
+        for idx in df_distances_new.index:
+            if idx.startswith("Global"):
+                source_name = idx.split("(")[1].rstrip(")")
+                global_distances_new[source_name] = df_distances_new.loc[idx].values
+        # Ahora, relacionar estas distancias con los valores de F1 (ya cargados en df_f1)
+        all_x_new = []
+        all_y_new = []
+        for source in df_f1.columns:
+            if source in global_distances_new:
+                x_vals = global_distances_new[source]
+                y_vals = df_f1[source].values
+                all_x_new.extend(x_vals)
+                all_y_new.extend(y_vals)
+        all_x_arr_new = np.array(all_x_new).reshape(-1, 1)
+        all_y_arr_new = np.array(all_y_new)
+        # Realizar la regresión lineal global sobre estos datos
+        model_global_new = LinearRegression().fit(all_x_arr_new, all_y_arr_new)
+        r2_new = model_global_new.score(all_x_arr_new, all_y_arr_new)
+        slope_new = model_global_new.coef_[0]
+        intercept_new = model_global_new.intercept_
+        # Crear el scatter plot
+        scatter_fig_new = figure(
+            width=600,
+            height=600,
+            tools="pan,wheel_zoom,reset,save,hover",
+            active_scroll="wheel_zoom",
+            title="Scatter Plot: Distance vs F1 (Nueva PCA)",
+            background_fill_color="white"
+        )
+        # Configurar únicamente grid horizontal
+        scatter_fig_new.xgrid.grid_line_color = None
+        scatter_fig_new.ygrid.grid_line_color = "gray"
+        # Mantenemos el mismo código de colores que en el otro scatter plot
+        source_colors = {
+            "es-digital-paragraph-degradation-seq": "blue",
+            "es-digital-line-degradation-seq": "green",
+            "es-digital-seq": "red",
+            "es-digital-zoom-degradation-seq": "orange",
+            "es-digital-rotation-degradation-seq": "purple",
+            "es-digital-rotation-zoom-degradation-seq": "brown",
+            "es-render-seq": "cyan"
+        }
+        # Dibujar cada conjunto: para cada source (por ejemplo, es-render-seq, etc.)
+        for source in df_f1.columns:
+            if source in global_distances_new:
+                x_vals = global_distances_new[source]
+                y_vals = df_f1[source].values
+                data = {"x": x_vals, "y": y_vals, "Fuente": [source]*len(x_vals)}
+                cds = ColumnDataSource(data=data)
+                scatter_fig_new.circle(
+                    'x', 'y', size=8, alpha=0.7, source=cds,
+                    fill_color=source_colors.get(source, "gray"),
+                    line_color=source_colors.get(source, "gray"),
+                    legend_label=source
+                )
+        scatter_fig_new.xaxis.axis_label = "Distance (Global, por Colegio) - Nueva PCA"
+        scatter_fig_new.yaxis.axis_label = "F1 Score"
+        scatter_fig_new.legend.location = "top_right"
+        hover_tool_new = scatter_fig_new.select_one(HoverTool)
+        hover_tool_new.tooltips = [("Distance", "@x"), ("F1", "@y"), ("Subset", "@Fuente")]
+        # Dibujar la línea de regresión global
+        x_line_new = np.linspace(all_x_arr_new.min(), all_x_arr_new.max(), 100)
+        y_line_new = model_global_new.predict(x_line_new.reshape(-1,1))
+        scatter_fig_new.line(x_line_new, y_line_new, line_width=2, line_color="black", legend_label="Global Regression")
+        st.bokeh_chart(scatter_fig_new)
+        st.write(f"Regresión global (Nueva PCA): R² = {r2_new:.4f}, Slope = {slope_new:.4f}, Intercept = {intercept_new:.4f}")
 def main():