Spaces:

hiyata
/

HostClassifier

Running

App Files Files Community

hiyata committed on Feb 27

Commit

9cb16e9

verified ·

1 Parent(s): 9308c12

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -11

app.py CHANGED Viewed

@@ -224,6 +224,28 @@ def compute_gc_content(sequence):
 # 7. MAIN ANALYSIS STEP (Gradio Step 1)
 ###############################################################################
 def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
     if fasta_text.strip():
         text = fasta_text.strip()
@@ -232,13 +254,13 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
             with open(file_obj, 'r') as f:
                 text = f.read()
         except Exception as e:
-            return (f"Error reading file: {str(e)}", None, None, None, None, None)
     else:
-        return ("Please provide a FASTA sequence.", None, None, None, None, None)
     sequences = parse_fasta(text)
     if not sequences:
-        return ("No valid FASTA sequences found.", None, None, None, None, None)
     header, seq = sequences[0]
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -249,7 +271,7 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
         model.load_state_dict(state_dict)
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
-        return (f"Error loading model/scaler: {str(e)}", None, None, None, None, None)
     freq_vector = sequence_to_kmer_vector(seq)
     scaled_vector = scaler.transform(freq_vector.reshape(1, -1))
@@ -284,11 +306,13 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
     heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide SHAP")
     heatmap_img = fig_to_image(heatmap_fig)
-    # You might want to provide a CSV or other data for the 6th return item
-    # Here, we'll simply return None for the file download:
     state_dict_out = {"seq": seq, "shap_means": shap_means}
-    return (results_text, bar_img, heatmap_img, state_dict_out, header, None)
 ###############################################################################
 # 8. SUBREGION ANALYSIS (Gradio Step 2)
@@ -963,9 +987,22 @@ def prepare_csv_download(data, filename="analysis_results.csv"):
         return output.getvalue().encode(), filename
     else:
         raise ValueError("Unsupported data type for CSV download")
 ###############################################################################
-# 13. BUILD GRADIO INTERFACE
 ###############################################################################
 css = """
@@ -993,6 +1030,10 @@ with gr.Blocks(css=css) as iface:
             with gr.Column(scale=1):
                 file_input = gr.File(label="Upload FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
                 text_input = gr.Textbox(label="Or paste FASTA sequence", placeholder=">sequence_name\nACGTACGT...", lines=5)
                 top_k = gr.Slider(minimum=5, maximum=30, value=10, step=1, label="Number of top k-mers to display")
                 win_size = gr.Slider(minimum=100, maximum=5000, value=500, step=100, label="Window size for 'most pushing' subregions")
                 analyze_btn = gr.Button("Analyze Sequence", variant="primary")
@@ -1000,14 +1041,25 @@ with gr.Blocks(css=css) as iface:
                 results_box = gr.Textbox(label="Classification Results", lines=12, interactive=False)
                 kmer_img = gr.Image(label="Top k-mer SHAP")
                 genome_img = gr.Image(label="Genome-wide SHAP Heatmap (Blue=neg, White=0, Red=pos)")
-                download_results = gr.File(label="Download Results", visible=False, elem_classes="download-button")
         seq_state = gr.State()
         header_state = gr.State()
         analyze_btn.click(
             analyze_sequence,
             inputs=[file_input, top_k, text_input, win_size],
-            outputs=[results_box, kmer_img, genome_img, seq_state, header_state, download_results]
         )
     with gr.Tab("2) Subregion Exploration"):
@@ -1114,8 +1166,9 @@ with gr.Blocks(css=css) as iface:
       - Statistical summary of differences
     - **Data Export**:
       - Download results as CSV files
       - Save analysis outputs for further processing
     """)
 if __name__ == "__main__":
-    iface.launch()

 # 7. MAIN ANALYSIS STEP (Gradio Step 1)
 ###############################################################################
def create_kmer_shap_csv(kmers, shap_values):
    """Write per-k-mer SHAP values to a temporary CSV file and return its path.

    The CSV has two columns, ``kmer`` and ``shap_value``, with rows ordered by
    absolute SHAP value (most influential first). Rows with equal magnitude
    keep their input order, so the output is deterministic.

    Args:
        kmers: Sequence of k-mer labels, one per SHAP value.
        shap_values: Numeric SHAP values aligned with ``kmers``. Nested input
            (e.g. a ``(1, n)`` array) is flattened before use.

    Returns:
        Path of the newly created CSV file inside the system temp directory.

    Raises:
        ValueError: If ``kmers`` and the flattened ``shap_values`` differ in
            length (raised by pandas while building the DataFrame).
    """
    # Flatten so a (1, n) array is accepted as well as a plain 1-D sequence.
    values = np.asarray(shap_values).ravel()
    kmer_df = pd.DataFrame({
        'kmer': list(kmers),
        'shap_value': values,
    })
    # Rank by magnitude; a stable sort keeps ties in their original order.
    kmer_df = kmer_df.sort_values(
        'shap_value', key=np.abs, ascending=False, kind='stable'
    )
    # mkstemp creates the file atomically under a guaranteed-unique name,
    # unlike a hand-built random suffix that can collide or race.
    fd, temp_path = tempfile.mkstemp(prefix="kmer_shap_values_", suffix=".csv")
    os.close(fd)  # pandas reopens the file by path below
    kmer_df.to_csv(temp_path, index=False)
    return temp_path
 def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
     if fasta_text.strip():
         text = fasta_text.strip()
             with open(file_obj, 'r') as f:
                 text = f.read()
         except Exception as e:
+            return (f"Error reading file: {str(e)}", None, None, None, None, None, None)
     else:
+        return ("Please provide a FASTA sequence.", None, None, None, None, None, None)
     sequences = parse_fasta(text)
     if not sequences:
+        return ("No valid FASTA sequences found.", None, None, None, None, None, None)
     header, seq = sequences[0]
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         model.load_state_dict(state_dict)
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
+        return (f"Error loading model/scaler: {str(e)}", None, None, None, None, None, None)
     freq_vector = sequence_to_kmer_vector(seq)
     scaled_vector = scaler.transform(freq_vector.reshape(1, -1))
     heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide SHAP")
     heatmap_img = fig_to_image(heatmap_fig)
+    # Create CSV with k-mer SHAP values and return the file path
+    kmer_shap_csv = create_kmer_shap_csv(kmers, shap_values)
+    # State dictionary for subregion analysis
     state_dict_out = {"seq": seq, "shap_means": shap_means}
+    return (results_text, bar_img, heatmap_img, state_dict_out, header, None, kmer_shap_csv)
 ###############################################################################
 # 8. SUBREGION ANALYSIS (Gradio Step 2)
         return output.getvalue().encode(), filename
     else:
         raise ValueError("Unsupported data type for CSV download")
+###############################################################################
+# 13. EXAMPLE FASTA LOADER
+###############################################################################
def load_example_fasta(path='example.fasta'):
    """Return the text of the example FASTA file, or a placeholder on failure.

    Args:
        path: Location of the FASTA file to read. Defaults to
            ``'example.fasta'``, resolved against the current working
            directory.

    Returns:
        The file contents as a string. If the file cannot be opened or
        decoded, a short placeholder FASTA record is returned with a note
        describing the error, so the caller always receives text to display.
    """
    try:
        # Explicit encoding keeps the result independent of the host locale.
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
    except (OSError, UnicodeDecodeError) as e:
        # Best effort: report the problem in the returned text, don't raise.
        return f">example_sequence\nACGTACGT...\n\n(Note: Could not load example.fasta: {str(e)})"
 ###############################################################################
+# 14. BUILD GRADIO INTERFACE
 ###############################################################################
 css = """
             with gr.Column(scale=1):
                 file_input = gr.File(label="Upload FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
                 text_input = gr.Textbox(label="Or paste FASTA sequence", placeholder=">sequence_name\nACGTACGT...", lines=5)
+                with gr.Row():
+                    example_btn = gr.Button("Load Example FASTA", variant="secondary")
                 top_k = gr.Slider(minimum=5, maximum=30, value=10, step=1, label="Number of top k-mers to display")
                 win_size = gr.Slider(minimum=100, maximum=5000, value=500, step=100, label="Window size for 'most pushing' subregions")
                 analyze_btn = gr.Button("Analyze Sequence", variant="primary")
                 results_box = gr.Textbox(label="Classification Results", lines=12, interactive=False)
                 kmer_img = gr.Image(label="Top k-mer SHAP")
                 genome_img = gr.Image(label="Genome-wide SHAP Heatmap (Blue=neg, White=0, Red=pos)")
+                with gr.Row():
+                    download_kmer_shap = gr.File(label="Download k-mer SHAP Values (CSV)", visible=True)
+                    download_results = gr.File(label="Download Results", visible=False, elem_classes="download-button")
         seq_state = gr.State()
         header_state = gr.State()
+        # Event handlers
+        example_btn.click(
+            load_example_fasta,
+            inputs=[],
+            outputs=[text_input]
+        )
         analyze_btn.click(
             analyze_sequence,
             inputs=[file_input, top_k, text_input, win_size],
+            outputs=[results_box, kmer_img, genome_img, seq_state, header_state, download_results, download_kmer_shap]
         )
     with gr.Tab("2) Subregion Exploration"):
       - Statistical summary of differences
     - **Data Export**:
       - Download results as CSV files
+      - Download k-mer SHAP values
       - Save analysis outputs for further processing
     """)
 if __name__ == "__main__":
+    iface.launch()