Spaces:

latticetower
/

keyword-embeddings-space

Sleeping

App Files Files

latticetower committed on Feb 19

Commit

af49af1

1 Parent(s): b40aac1

cleanup

Browse files

Files changed (2) hide show

app.py +73 -81
mpl_data_plotter.py +3 -5

app.py CHANGED Viewed

@@ -11,29 +11,9 @@ from mpl_data_plotter import MatplotlibDataPlotter
 def convert_int64_to_int32(df):
     for col in df.columns:
         if df[col].dtype == 'int64':
-            print(col)
             df[col] = df[col].astype('int32')
     return df
-print(f"Loading domains data...")
-single_df = pd.read_csv(SINGLE_DOMAINS_FILE, compression='gzip')
-single_df.rename(columns={'bgc_class': 'biosyn_class'}, inplace=True)
-single_df['biosyn_class_index'] = single_df.biosyn_class.apply(lambda x: BIOSYN_CLASS_NAMES.index(x))
-single_df = convert_int64_to_int32(single_df)
-pair_df = pd.read_csv(PAIR_DOMAINS_FILE, compression='gzip')
-pair_df.rename(columns={'bgc_class': 'biosyn_class'}, inplace=True)
-pair_df['biosyn_class_index'] = pair_df.biosyn_class.apply(lambda x: BIOSYN_CLASS_NAMES.index(x))
-pair_df = convert_int64_to_int32(pair_df)
-num_domains_in_region_df = single_df.groupby('cds_region_id', as_index=False).agg({'as_domain_id': 'count'}).rename(
-    columns={'as_domain_id': 'num_domains'})
-unique_domain_lengths = num_domains_in_region_df.num_domains.unique()
-print(f"Initializing data plotter...")
-data_plotter = MatplotlibDataPlotter(single_df, pair_df, num_domains_in_region_df)
 def create_color_legend(class_to_color):
     # Create HTML for the color legend
@@ -86,66 +66,78 @@ def update_all_plots(frequency, split_name):
     return data_plotter.plot_single_domains(frequency, split_name), data_plotter.plot_pair_domains(frequency, split_name)
-print(f"Defining blocks...")
-# Create Gradio interface
-with gr.Blocks(title="BGC Keyword Plotter") as demo:
-    gr.Markdown("## BGC Keyword Plotter")
-    gr.Markdown("Select the model name and minimal number of domains in Antismash-db subset.")
-    color_legend = create_color_legend(BIOSYN_CLASS_HEX_COLORS)
-    with gr.Row():
-        frequency_slider = gr.Slider(
-            minimum=int(unique_domain_lengths.min()),
-            maximum=int(unique_domain_lengths.max()),
-            step=1,
-            value=int(unique_domain_lengths.min()),
-            label="Min number of domains"
         )
-        model_selector = gr.Radio(
-            choices=["stratified"] + BIOSYN_CLASS_NAMES,
-            value="stratified",
-            label="Model name"
         )
-    with gr.Row():
-        with gr.Column():
-            single_domains_plot = gr.Plot(
-                label="Single domains",
-                container=True,
-                elem_id="single_domains_plot"
-            )
-        # gr.HTML("""
-        # <style>
-        #     #single_domains_plot {
-        #         height: 100% !important;
-        #         width: 100% !important;
-        #     }
-        # </style>
-        # """)
-        with gr.Column():
-            pair_domains_plot = gr.Plot(label="Pair domains")
-        # with gr.Column():
-        #     combined_plot = gr.Plot(label="Combined Wave")
-    frequency_slider.release(
-        fn=update_all_plots,
-        inputs=[frequency_slider, model_selector],
-        outputs=[single_domains_plot, pair_domains_plot]#, cosine_plot]
-    )
-    demo.load(
-        fn=update_all_plots,
-        inputs=[frequency_slider, model_selector],
-        outputs=[single_domains_plot, pair_domains_plot]
-    )
-    model_selector.input(
-        fn=update_all_plots,
-        inputs=[frequency_slider, model_selector],
-        outputs=[single_domains_plot, pair_domains_plot]
-    )
-print(f"Launching!...")
-demo.launch()
-# demo.load(filter_map, [min_price, max_price, boroughs], map)

 def convert_int64_to_int32(df):
     """Downcast int64 columns of *df* to int32 in place and return *df*.

     Only columns whose dtype compares equal to ``'int64'`` are considered.
     A column is converted only when all of its values fit into the int32
     range; otherwise it is left as int64, because a plain
     ``astype('int32')`` would silently wrap out-of-range values and
     corrupt the data.

     Args:
         df: DataFrame to shrink. It is modified in place.

     Returns:
         The same DataFrame object that was passed in.
     """
     int32_min, int32_max = -2**31, 2**31 - 1
     for col in df.columns:
         series = df[col]
         if series.dtype == 'int64':
             # An empty column has no values that could overflow, so it is
             # always safe to downcast (min()/max() would be NaN there).
             fits = series.empty or (
                 series.min() >= int32_min and series.max() <= int32_max
             )
             if fits:
                 df[col] = series.astype('int32')
     return df
 def create_color_legend(class_to_color):
     # Create HTML for the color legend
     return data_plotter.plot_single_domains(frequency, split_name), data_plotter.plot_pair_domains(frequency, split_name)
+if __name__ == "__main__":
+    print(f"Loading domains data...")
+    single_df = pd.read_csv(SINGLE_DOMAINS_FILE, compression='gzip')
+    single_df.rename(columns={'bgc_class': 'biosyn_class'}, inplace=True)
+    single_df['biosyn_class_index'] = single_df.biosyn_class.apply(lambda x: BIOSYN_CLASS_NAMES.index(x))
+    single_df = convert_int64_to_int32(single_df)
+    pair_df = pd.read_csv(PAIR_DOMAINS_FILE, compression='gzip')
+    pair_df.rename(columns={'bgc_class': 'biosyn_class'}, inplace=True)
+    pair_df['biosyn_class_index'] = pair_df.biosyn_class.apply(lambda x: BIOSYN_CLASS_NAMES.index(x))
+    pair_df = convert_int64_to_int32(pair_df)
+    num_domains_in_region_df = single_df.groupby('cds_region_id', as_index=False).agg({'as_domain_id': 'count'}).rename(
+        columns={'as_domain_id': 'num_domains'})
+    unique_domain_lengths = num_domains_in_region_df.num_domains.unique()
+    print(f"Initializing data plotter...")
+    data_plotter = MatplotlibDataPlotter(single_df, pair_df, num_domains_in_region_df)
+    print(f"Defining blocks...")
+    # Create Gradio interface
+    with gr.Blocks(title="BGC Keyword Plotter") as demo:
+        gr.Markdown("## BGC Keyword Plotter")
+        gr.Markdown("Select the model name and minimal number of domains in Antismash-db subset.")
+        color_legend = create_color_legend(BIOSYN_CLASS_HEX_COLORS)
+        with gr.Row():
+            frequency_slider = gr.Slider(
+                minimum=int(unique_domain_lengths.min()),
+                maximum=int(unique_domain_lengths.max()),
+                step=1,
+                value=int(unique_domain_lengths.min()),
+                label="Min number of domains"
+            )
+            model_selector = gr.Radio(
+                choices=["stratified"] + BIOSYN_CLASS_NAMES,
+                value="stratified",
+                label="Model name"
+            )
+        with gr.Row():
+            with gr.Column():
+                single_domains_plot = gr.Plot(
+                    label="Single domains",
+                    container=True,
+                    elem_id="single_domains_plot"
+                )
+            with gr.Column():
+                pair_domains_plot = gr.Plot(label="Pair domains")
+        frequency_slider.release(
+            fn=update_all_plots,
+            inputs=[frequency_slider, model_selector],
+            outputs=[single_domains_plot, pair_domains_plot]#, cosine_plot]
         )
+        demo.load(
+            fn=update_all_plots,
+            inputs=[frequency_slider, model_selector],
+            outputs=[single_domains_plot, pair_domains_plot]
         )
+        model_selector.input(
+            fn=update_all_plots,
+            inputs=[frequency_slider, model_selector],
+            outputs=[single_domains_plot, pair_domains_plot]
+        )
+    print(f"Launching!...")
+    demo.launch()
+    # demo.load(filter_map, [min_price, max_price, boroughs], map)

mpl_data_plotter.py CHANGED Viewed

@@ -17,7 +17,7 @@ class MatplotlibDataPlotter:
         self.single_domains_fig = plt.figure(figsize=(5, 10))
         self.pair_domains_fig = plt.figure(figsize=(5, 10))
-    def plot_single_domains(self, num_domains, split_name):
         selected_region_ids = self.num_domains_in_region_df.loc[
             self.num_domains_in_region_df.num_domains >= num_domains,
             'cds_region_id'].values
@@ -39,7 +39,6 @@ class MatplotlibDataPlotter:
         top_n=5
         bin_width=1
         hue_group_offset=0.5
-        # hue_order=BIOSYN_CLASS_NAMES
         width=0.9
         fig = self.single_domains_fig
@@ -62,7 +61,7 @@ class MatplotlibDataPlotter:
         fig.tight_layout()
         return fig
-    def plot_pair_domains(self, num_domains, split_name):
         selected_region_ids = self.num_domains_in_region_df.loc[
             self.num_domains_in_region_df.num_domains >= num_domains,
             'cds_region_id'].values
@@ -72,9 +71,8 @@ class MatplotlibDataPlotter:
         biosyn_counts_pairs = pair_df_subset[['cds_region_id', 'biosyn_class']].drop_duplicates().groupby("biosyn_class", as_index=False).count()
         hue2count_pairs = dict(biosyn_counts_pairs.values)
-        # split_name = 'stratified'
         column_name = f'cosine_similarity_{split_name}'
-        # pair_df_subset = pair_df.loc[pair_df.dom_location_len >= num_domains]
         selected_keyword_index = pair_df_subset.groupby('cds_region_id').agg(
             {column_name: 'idxmax'}
         ).values.flatten()

         self.single_domains_fig = plt.figure(figsize=(5, 10))
         self.pair_domains_fig = plt.figure(figsize=(5, 10))
+    def plot_single_domains(self, num_domains, split_name="stratified"):
         selected_region_ids = self.num_domains_in_region_df.loc[
             self.num_domains_in_region_df.num_domains >= num_domains,
             'cds_region_id'].values
         top_n=5
         bin_width=1
         hue_group_offset=0.5
         width=0.9
         fig = self.single_domains_fig
         fig.tight_layout()
         return fig
+    def plot_pair_domains(self, num_domains, split_name="stratified"):
         selected_region_ids = self.num_domains_in_region_df.loc[
             self.num_domains_in_region_df.num_domains >= num_domains,
             'cds_region_id'].values
         biosyn_counts_pairs = pair_df_subset[['cds_region_id', 'biosyn_class']].drop_duplicates().groupby("biosyn_class", as_index=False).count()
         hue2count_pairs = dict(biosyn_counts_pairs.values)
         column_name = f'cosine_similarity_{split_name}'
         selected_keyword_index = pair_df_subset.groupby('cds_region_id').agg(
             {column_name: 'idxmax'}
         ).values.flatten()