Spaces:

hiyata
/

HostClassifier

Running

App Files Community

hiyata committed on Jan 14

Commit

d01c414

verified ·

1 Parent(s): 1b5b7bf

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -35

app.py CHANGED Viewed

@@ -17,8 +17,6 @@ import pandas as pd
 import tempfile
 import os
 from typing import List, Dict, Tuple, Optional, Any
-import io
-from io import BytesIO
 import seaborn as sns
 ###############################################################################
@@ -55,7 +53,8 @@ def parse_fasta(text):
     current_sequence = []
     for line in text.strip().split('\n'):
         line = line.strip()
-        if not line: continue
         if line.startswith('>'):
             if current_header:
                 sequences.append((current_header, ''.join(current_sequence)))
@@ -128,7 +127,8 @@ def compute_positionwise_scores(sequence, shap_values, k=4):
 def find_extreme_subregion(shap_means, window_size=500, mode="max"):
     n = len(shap_means)
-    if n == 0: return (0, 0, 0.0)
     if window_size >= n:
         return (0, n, float(np.mean(shap_means)))
     csum = np.zeros(n + 1, dtype=np.float32)
@@ -140,9 +140,11 @@ def find_extreme_subregion(shap_means, window_size=500, mode="max"):
         wsum = csum[start + window_size] - csum[start]
         wavg = wsum / window_size
         if mode == "max" and wavg > best_avg:
-            best_avg = wavg; best_start = start
         elif mode == "min" and wavg < best_avg:
-            best_avg = wavg; best_start = start
     return (best_start, best_start + window_size, float(best_avg))
 ###############################################################################
@@ -201,9 +203,9 @@ def create_importance_bar_plot(shap_values, kmers, top_k=10):
     plt.tight_layout()
     return fig
-def plot_shap_histogram(shap_array, title="SHAP Distribution in Region"):
     fig, ax = plt.subplots(figsize=(6, 4))
-    ax.hist(shap_array, bins=30, color='gray', edgecolor='black')
     ax.axvline(0, color='red', linestyle='--', label='0.0')
     ax.set_xlabel("SHAP Value")
     ax.set_ylabel("Count")
@@ -213,7 +215,8 @@ def plot_shap_histogram(shap_array, title="SHAP Distribution in Region"):
     return fig
 def compute_gc_content(sequence):
-    if not sequence: return 0
     gc_count = sequence.count('G') + sequence.count('C')
     return (gc_count / len(sequence)) * 100.0
@@ -229,23 +232,24 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
             with open(file_obj, 'r') as f:
                 text = f.read()
         except Exception as e:
-            return (f"Error reading file: {str(e)}", None, None, None, None)
     else:
-        return ("Please provide a FASTA sequence.", None, None, None, None)
     sequences = parse_fasta(text)
     if not sequences:
-        return ("No valid FASTA sequences found.", None, None, None, None)
     header, seq = sequences[0]
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     try:
-        state_dict = torch.load('model.pt', map_location=device, weights_only=True)
         model = VirusClassifier(256).to(device)
         model.load_state_dict(state_dict)
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
-        return (f"Error loading model/scaler: {str(e)}", None, None, None, None)
     freq_vector = sequence_to_kmer_vector(seq)
     scaled_vector = scaler.transform(freq_vector.reshape(1, -1))
@@ -280,9 +284,11 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
     heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide SHAP")
     heatmap_img = fig_to_image(heatmap_fig)
     state_dict_out = {"seq": seq, "shap_means": shap_means}
-    return (results_text, bar_img, heatmap_img, state_dict_out, header)
 ###############################################################################
 # 8. SUBREGION ANALYSIS (Gradio Step 2)
@@ -290,7 +296,7 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
 def analyze_subregion(state, header, region_start, region_end):
     if not state or "seq" not in state or "shap_means" not in state:
-        return ("No sequence data found. Please run Step 1 first.", None, None)
     seq = state["seq"]
     shap_means = state["shap_means"]
     region_start = int(region_start)
@@ -298,7 +304,7 @@ def analyze_subregion(state, header, region_start, region_end):
     region_start = max(0, min(region_start, len(seq)))
     region_end = max(0, min(region_end, len(seq)))
     if region_end <= region_start:
-        return ("Invalid region range. End must be > Start.", None, None)
     region_seq = seq[region_start:region_end]
     region_shap = shap_means[region_start:region_end]
     gc_percent = compute_gc_content(region_seq)
@@ -324,7 +330,9 @@ def analyze_subregion(state, header, region_start, region_end):
     heatmap_img = fig_to_image(heatmap_fig)
     hist_fig = plot_shap_histogram(region_shap, title="SHAP Distribution in Subregion")
     hist_img = fig_to_image(hist_fig)
-    return (region_info, heatmap_img, hist_img)
 ###############################################################################
 # 9. COMPARISON ANALYSIS FUNCTIONS
@@ -476,12 +484,12 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
         # Analyze first sequence
         res1 = analyze_sequence(file1, top_kmers=10, fasta_text=fasta1, window_size=500)
         if isinstance(res1[0], str) and "Error" in res1[0]:
-            return (f"Error in sequence 1: {res1[0]}", None, None)
         # Analyze second sequence
         res2 = analyze_sequence(file2, top_kmers=10, fasta_text=fasta2, window_size=500)
         if isinstance(res2[0], str) and "Error" in res2[0]:
-            return (f"Error in sequence 2: {res2[0]}", None, None)
         # Extract SHAP values and sequence info
         shap1 = res1[3]["shap_means"]
@@ -561,11 +569,12 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
         )
         hist_img = fig_to_image(hist_fig)
-        return comparison_text, heatmap_img, hist_img
     except Exception as e:
         error_msg = f"Error during sequence comparison: {str(e)}"
-        return error_msg, None, None
 ###############################################################################
 # 11. GENE FEATURE ANALYSIS
@@ -753,13 +762,11 @@ def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_leng
         # Prepare gene name label
         label = str(gene.get('gene_name','?'))
-        # If getsize() or textsize() is missing, use getmask(...).size as fallback
-        # But if your Pillow version supports font.getsize, you can do:
-        # label_width, label_height = font.getsize(label)
         label_mask = font.getmask(label)
         label_width, label_height = label_mask.size
-        # Alternate label positions above/below line
         if idx % 2 == 0:
             text_y = line_y - track_height - 15
         else:
@@ -821,12 +828,10 @@ def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_leng
     return img
 def analyze_gene_features(sequence_file: str,
-                        features_file: str,
-                        fasta_text: str = "",
-                        features_text: str = "") -> Tuple[str, Optional[str], Optional[Image.Image]]:
     """Analyze SHAP values for each gene feature"""
     # First analyze whole sequence
     sequence_results = analyze_sequence(sequence_file, top_kmers=10, fasta_text=fasta_text)
@@ -980,7 +985,7 @@ with gr.Blocks(css=css) as iface:
     **Step 3**: Analyze gene features and their contributions.
     **Step 4**: Compare sequences and analyze differences.
-    **Color Scale**: Negative SHAP = Blue, Zero = White, Positive = Red.
     """)
     with gr.Tab("1) Full-Sequence Analysis"):
@@ -998,6 +1003,7 @@ with gr.Blocks(css=css) as iface:
                 download_results = gr.File(label="Download Results", visible=False, elem_classes="download-button")
         seq_state = gr.State()
         header_state = gr.State()
         analyze_btn.click(
             analyze_sequence,
             inputs=[file_input, top_k, text_input, win_size],
@@ -1019,6 +1025,7 @@ with gr.Blocks(css=css) as iface:
             subregion_img = gr.Image(label="Subregion SHAP Heatmap (B-W-R)")
             subregion_hist_img = gr.Image(label="SHAP Distribution (Histogram)")
         download_subregion = gr.File(label="Download Subregion Analysis", visible=False, elem_classes="download-button")
         region_btn.click(
             analyze_subregion,
             inputs=[seq_state, header_state, region_start, region_end],
@@ -1065,8 +1072,8 @@ with gr.Blocks(css=css) as iface:
         The sequences will be normalized to the same length for comparison.
         **Color Scale**:
-        - Red: Sequence 2 is more human-like in this region
-        - Blue: Sequence 1 is more human-like in this region
         - White: No substantial difference
         """)
         with gr.Row():
@@ -1082,6 +1089,7 @@ with gr.Blocks(css=css) as iface:
             diff_heatmap = gr.Image(label="SHAP Difference Heatmap")
             diff_hist = gr.Image(label="Distribution of SHAP Differences")
         download_comparison = gr.File(label="Download Comparison Results", visible=False, elem_classes="download-button")
         compare_btn.click(
             analyze_sequence_comparison,
             inputs=[file_input1, file_input2, text_input1, text_input2],
@@ -1110,4 +1118,4 @@ with gr.Blocks(css=css) as iface:
     """)
 if __name__ == "__main__":
-    iface.launch()

 import tempfile
 import os
 from typing import List, Dict, Tuple, Optional, Any
 import seaborn as sns
 ###############################################################################
     current_sequence = []
     for line in text.strip().split('\n'):
         line = line.strip()
+        if not line:
+            continue
         if line.startswith('>'):
             if current_header:
                 sequences.append((current_header, ''.join(current_sequence)))
 def find_extreme_subregion(shap_means, window_size=500, mode="max"):
     n = len(shap_means)
+    if n == 0:
+        return (0, 0, 0.0)
     if window_size >= n:
         return (0, n, float(np.mean(shap_means)))
     csum = np.zeros(n + 1, dtype=np.float32)
         wsum = csum[start + window_size] - csum[start]
         wavg = wsum / window_size
         if mode == "max" and wavg > best_avg:
+            best_avg = wavg
+            best_start = start
         elif mode == "min" and wavg < best_avg:
+            best_avg = wavg
+            best_start = start
     return (best_start, best_start + window_size, float(best_avg))
 ###############################################################################
     plt.tight_layout()
     return fig
+def plot_shap_histogram(shap_array, title="SHAP Distribution in Region", num_bins=30):
     fig, ax = plt.subplots(figsize=(6, 4))
+    ax.hist(shap_array, bins=num_bins, color='gray', edgecolor='black')
     ax.axvline(0, color='red', linestyle='--', label='0.0')
     ax.set_xlabel("SHAP Value")
     ax.set_ylabel("Count")
     return fig
 def compute_gc_content(sequence):
+    if not sequence:
+        return 0
     gc_count = sequence.count('G') + sequence.count('C')
     return (gc_count / len(sequence)) * 100.0
             with open(file_obj, 'r') as f:
                 text = f.read()
         except Exception as e:
+            return (f"Error reading file: {str(e)}", None, None, None, None, None)
     else:
+        return ("Please provide a FASTA sequence.", None, None, None, None, None)
     sequences = parse_fasta(text)
     if not sequences:
+        return ("No valid FASTA sequences found.", None, None, None, None, None)
     header, seq = sequences[0]
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     try:
+        # IMPORTANT: adjust how you load your model as needed
+        state_dict = torch.load('model.pt', map_location=device)
         model = VirusClassifier(256).to(device)
         model.load_state_dict(state_dict)
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
+        return (f"Error loading model/scaler: {str(e)}", None, None, None, None, None)
     freq_vector = sequence_to_kmer_vector(seq)
     scaled_vector = scaler.transform(freq_vector.reshape(1, -1))
     heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide SHAP")
     heatmap_img = fig_to_image(heatmap_fig)
+    # You might want to provide a CSV or other data for the 6th return item
+    # Here, we'll simply return None for the file download:
     state_dict_out = {"seq": seq, "shap_means": shap_means}
+    return (results_text, bar_img, heatmap_img, state_dict_out, header, None)
 ###############################################################################
 # 8. SUBREGION ANALYSIS (Gradio Step 2)
 def analyze_subregion(state, header, region_start, region_end):
     if not state or "seq" not in state or "shap_means" not in state:
+        return ("No sequence data found. Please run Step 1 first.", None, None, None)
     seq = state["seq"]
     shap_means = state["shap_means"]
     region_start = int(region_start)
     region_start = max(0, min(region_start, len(seq)))
     region_end = max(0, min(region_end, len(seq)))
     if region_end <= region_start:
+        return ("Invalid region range. End must be > Start.", None, None, None)
     region_seq = seq[region_start:region_end]
     region_shap = shap_means[region_start:region_end]
     gc_percent = compute_gc_content(region_seq)
     heatmap_img = fig_to_image(heatmap_fig)
     hist_fig = plot_shap_histogram(region_shap, title="SHAP Distribution in Subregion")
     hist_img = fig_to_image(hist_fig)
+    # For demonstration, returning None for the file download as well
+    return (region_info, heatmap_img, hist_img, None)
 ###############################################################################
 # 9. COMPARISON ANALYSIS FUNCTIONS
         # Analyze first sequence
         res1 = analyze_sequence(file1, top_kmers=10, fasta_text=fasta1, window_size=500)
         if isinstance(res1[0], str) and "Error" in res1[0]:
+            return (f"Error in sequence 1: {res1[0]}", None, None, None)
         # Analyze second sequence
         res2 = analyze_sequence(file2, top_kmers=10, fasta_text=fasta2, window_size=500)
         if isinstance(res2[0], str) and "Error" in res2[0]:
+            return (f"Error in sequence 2: {res2[0]}", None, None, None)
         # Extract SHAP values and sequence info
         shap1 = res1[3]["shap_means"]
         )
         hist_img = fig_to_image(hist_fig)
+        # Return 4 outputs (text, image, image, and a file or None for the last)
+        return (comparison_text, heatmap_img, hist_img, None)
     except Exception as e:
         error_msg = f"Error during sequence comparison: {str(e)}"
+        return (error_msg, None, None, None)
 ###############################################################################
 # 11. GENE FEATURE ANALYSIS
         # Prepare gene name label
         label = str(gene.get('gene_name','?'))
+        # Fallback for label size
         label_mask = font.getmask(label)
         label_width, label_height = label_mask.size
+        # Alternate label positions
         if idx % 2 == 0:
             text_y = line_y - track_height - 15
         else:
     return img
 def analyze_gene_features(sequence_file: str,
+                          features_file: str,
+                          fasta_text: str = "",
+                          features_text: str = "") -> Tuple[str, Optional[str], Optional[Image.Image]]:
     """Analyze SHAP values for each gene feature"""
     # First analyze whole sequence
     sequence_results = analyze_sequence(sequence_file, top_kmers=10, fasta_text=fasta_text)
     **Step 3**: Analyze gene features and their contributions.
     **Step 4**: Compare sequences and analyze differences.
+    **Color Scale**: Negative SHAP = Blue, Zero = White, Positive SHAP = Red.
     """)
     with gr.Tab("1) Full-Sequence Analysis"):
                 download_results = gr.File(label="Download Results", visible=False, elem_classes="download-button")
         seq_state = gr.State()
         header_state = gr.State()
         analyze_btn.click(
             analyze_sequence,
             inputs=[file_input, top_k, text_input, win_size],
             subregion_img = gr.Image(label="Subregion SHAP Heatmap (B-W-R)")
             subregion_hist_img = gr.Image(label="SHAP Distribution (Histogram)")
         download_subregion = gr.File(label="Download Subregion Analysis", visible=False, elem_classes="download-button")
         region_btn.click(
             analyze_subregion,
             inputs=[seq_state, header_state, region_start, region_end],
         The sequences will be normalized to the same length for comparison.
         **Color Scale**:
+        - Red: Sequence 2 more human-like
+        - Blue: Sequence 1 more human-like
         - White: No substantial difference
         """)
         with gr.Row():
             diff_heatmap = gr.Image(label="SHAP Difference Heatmap")
             diff_hist = gr.Image(label="Distribution of SHAP Differences")
         download_comparison = gr.File(label="Download Comparison Results", visible=False, elem_classes="download-button")
         compare_btn.click(
             analyze_sequence_comparison,
             inputs=[file_input1, file_input2, text_input1, text_input2],
     """)
 if __name__ == "__main__":
+    iface.launch()