Spaces:

hiyata
/

HostClassifier

Running

App Files Community

hiyata committed on Jan 14

Commit

9308c12

verified ·

1 Parent(s): 18efb8a

Update app.py

Browse files

Files changed (1) hide show

app.py +285 -331

app.py CHANGED Viewed

@@ -67,9 +67,6 @@ def parse_fasta(text):
     return sequences
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
-    """
-    Convert a sequence into a frequency vector of all possible 4-mer combinations.
-    """
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     vec = np.zeros(len(kmers), dtype=np.float32)
@@ -87,15 +84,11 @@ def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
 ###############################################################################
 def calculate_shap_values(model, x_tensor):
-    """
-    A simple ablation-based SHAP approximation. Zero out each position
-    and measure the impact on the 'human' probability.
-    """
     model.eval()
     with torch.no_grad():
         baseline_output = model(x_tensor)
         baseline_probs = torch.softmax(baseline_output, dim=1)
-        baseline_prob = baseline_probs[0, 1].item()  # Probability for 'human'
         shap_values = []
         x_zeroed = x_tensor.clone()
         for i in range(x_tensor.shape[1]):
@@ -113,9 +106,6 @@ def calculate_shap_values(model, x_tensor):
 ###############################################################################
 def compute_positionwise_scores(sequence, shap_values, k=4):
-    """
-    Distribute each k-mer's SHAP contribution across its k underlying positions.
-    """
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     seq_len = len(sequence)
@@ -136,9 +126,6 @@ def compute_positionwise_scores(sequence, shap_values, k=4):
 ###############################################################################
 def find_extreme_subregion(shap_means, window_size=500, mode="max"):
-    """
-    Use a sliding window to find the subregion with the highest (or lowest) average SHAP.
-    """
     n = len(shap_means)
     if n == 0:
         return (0, 0, 0.0)
@@ -165,9 +152,6 @@ def find_extreme_subregion(shap_means, window_size=500, mode="max"):
 ###############################################################################
 def fig_to_image(fig):
-    """
-    Render a Matplotlib figure to a PIL Image.
-    """
     buf = io.BytesIO()
     fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
     buf.seek(0)
@@ -176,16 +160,10 @@ def fig_to_image(fig):
     return img
 def get_zero_centered_cmap():
-    """
-    Create a symmetrical (blue-white-red) colormap around zero.
-    """
     colors = [(0.0, 'blue'), (0.5, 'white'), (1.0, 'red')]
     return mcolors.LinearSegmentedColormap.from_list("blue_white_red", colors)
 def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, end=None):
-    """
-    Plot an inline heatmap for the chosen region (or entire genome if start/end not provided).
-    """
     if start is not None and end is not None:
         local_shap = shap_means[start:end]
         subtitle = f" (positions {start}-{end})"
@@ -211,9 +189,6 @@ def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, e
     return fig
 def create_importance_bar_plot(shap_values, kmers, top_k=10):
-    """
-    Show bar chart of top k-mers by absolute SHAP value.
-    """
     plt.rcParams.update({'font.size': 10})
     fig = plt.figure(figsize=(10, 5))
     indices = np.argsort(np.abs(shap_values))[-top_k:]
@@ -229,9 +204,6 @@ def create_importance_bar_plot(shap_values, kmers, top_k=10):
     return fig
 def plot_shap_histogram(shap_array, title="SHAP Distribution in Region", num_bins=30):
-    """
-    Plot a histogram of SHAP values in some region.
-    """
     fig, ax = plt.subplots(figsize=(6, 4))
     ax.hist(shap_array, bins=num_bins, color='gray', edgecolor='black')
     ax.axvline(0, color='red', linestyle='--', label='0.0')
@@ -243,11 +215,8 @@ def plot_shap_histogram(shap_array, title="SHAP Distribution in Region", num_bin
     return fig
 def compute_gc_content(sequence):
-    """
-    Compute GC content (%) for a given sequence.
-    """
     if not sequence:
-        return 0.0
     gc_count = sequence.count('G') + sequence.count('C')
     return (gc_count / len(sequence)) * 100.0
@@ -256,11 +225,6 @@ def compute_gc_content(sequence):
 ###############################################################################
 def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
-    """
-    Perform the main classification, SHAP analysis, and extreme subregion detection
-    for a single sequence.
-    """
-    # 1) Read input
     if fasta_text.strip():
         text = fasta_text.strip()
     elif file_obj is not None:
@@ -272,15 +236,14 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
     else:
         return ("Please provide a FASTA sequence.", None, None, None, None, None)
-    # 2) Parse FASTA
     sequences = parse_fasta(text)
     if not sequences:
         return ("No valid FASTA sequences found.", None, None, None, None, None)
     header, seq = sequences[0]
-    # 3) Load model, scaler, and run inference
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     try:
         state_dict = torch.load('model.pt', map_location=device)
         model = VirusClassifier(256).to(device)
         model.load_state_dict(state_dict)
@@ -297,12 +260,10 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
     classification = "Human" if prob_human > 0.5 else "Non-human"
     confidence = max(prob_human, prob_nonhuman)
-    # 4) Per-base SHAP & subregion detection
     shap_means = compute_positionwise_scores(seq, shap_values, k=4)
     max_start, max_end, max_avg = find_extreme_subregion(shap_means, window_size, mode="max")
     min_start, min_end, min_avg = find_extreme_subregion(shap_means, window_size, mode="min")
-    # 5) Prepare result text
     results_text = (
         f"Sequence: {header}\n"
         f"Length: {len(seq):,} bases\n"
@@ -316,7 +277,6 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
         f"Start: {min_start}, End: {min_end}, Avg SHAP: {min_avg:.4f}"
     )
-    # 6) Create bar & heatmap figures
     kmers = [''.join(p) for p in product("ACGT", repeat=4)]
     bar_fig = create_importance_bar_plot(shap_values, kmers, top_kmers)
     bar_img = fig_to_image(bar_fig)
@@ -324,10 +284,10 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
     heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide SHAP")
     heatmap_img = fig_to_image(heatmap_fig)
-    # 7) Build the "state" dictionary so we can do subregion analysis
     state_dict_out = {"seq": seq, "shap_means": shap_means}
-    # Return 6 items to match your Gradio output
     return (results_text, bar_img, heatmap_img, state_dict_out, header, None)
 ###############################################################################
@@ -335,9 +295,6 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
 ###############################################################################
 def analyze_subregion(state, header, region_start, region_end):
-    """
-    Examine a subregion’s SHAP distribution, GC content, etc.
-    """
     if not state or "seq" not in state or "shap_means" not in state:
         return ("No sequence data found. Please run Step 1 first.", None, None, None)
     seq = state["seq"]
@@ -348,22 +305,18 @@ def analyze_subregion(state, header, region_start, region_end):
     region_end = max(0, min(region_end, len(seq)))
     if region_end <= region_start:
         return ("Invalid region range. End must be > Start.", None, None, None)
     region_seq = seq[region_start:region_end]
     region_shap = shap_means[region_start:region_end]
     gc_percent = compute_gc_content(region_seq)
     avg_shap = float(np.mean(region_shap))
     positive_fraction = np.mean(region_shap > 0)
     negative_fraction = np.mean(region_shap < 0)
     if avg_shap > 0.05:
         region_classification = "Likely pushing toward human"
     elif avg_shap < -0.05:
         region_classification = "Likely pushing toward non-human"
     else:
         region_classification = "Near neutral (no strong push)"
     region_info = (
         f"Analyzing subregion of {header} from {region_start} to {region_end}\n"
         f"Region length: {len(region_seq)} bases\n"
@@ -373,29 +326,30 @@ def analyze_subregion(state, header, region_start, region_end):
         f"Fraction with SHAP < 0 (toward non-human): {negative_fraction:.2f}\n"
         f"Subregion interpretation: {region_classification}\n"
     )
     heatmap_fig = plot_linear_heatmap(shap_means, title="Subregion SHAP", start=region_start, end=region_end)
     heatmap_img = fig_to_image(heatmap_fig)
     hist_fig = plot_shap_histogram(region_shap, title="SHAP Distribution in Subregion")
     hist_img = fig_to_image(hist_fig)
-    # Return 4 items to match your Gradio output
     return (region_info, heatmap_img, hist_img, None)
 ###############################################################################
-# 9. COMPARISON ANALYSIS FUNCTIONS (Step 4)
 ###############################################################################
 def compute_shap_difference(shap1_norm, shap2_norm):
-    """
-    Compute the SHAP difference (Seq2 - Seq1).
-    """
     return shap2_norm - shap1_norm
 def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
     """
-    Plot a 1D heatmap of differences using relative positions 0-100%.
     """
     heatmap_data = shap_diff.reshape(1, -1)
     extent = max(abs(np.min(shap_diff)), abs(np.max(shap_diff)))
@@ -424,7 +378,7 @@ def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
 def plot_shap_histogram(shap_array, title="SHAP Distribution", num_bins=30):
     """
-    Plot a histogram of SHAP values with optional # of bins.
     """
     fig, ax = plt.subplots(figsize=(6, 4))
     ax.hist(shap_array, bins=num_bins, color='gray', edgecolor='black', alpha=0.7)
@@ -438,16 +392,18 @@ def plot_shap_histogram(shap_array, title="SHAP Distribution", num_bins=30):
 def calculate_adaptive_parameters(len1, len2):
     """
-    Choose smoothing & interpolation parameters automatically based on length difference.
     """
     length_diff = abs(len1 - len2)
     max_length = max(len1, len2)
     min_length = min(len1, len2)
     length_ratio = min_length / max_length
-    # Base number of points
     base_points = min(2000, max(500, max_length // 100))
     if length_diff < 500:
         resolution_factor = 2.0
         num_points = min(3000, base_points * 2)
@@ -465,22 +421,29 @@ def calculate_adaptive_parameters(len1, len2):
         num_points = max(500, base_points // 2)
         smooth_window = max(100, length_diff // 500)
     smooth_window = int(smooth_window * (1 + (1 - length_ratio)))
     return int(num_points), int(smooth_window), resolution_factor
 def sliding_window_smooth(values, window_size=50):
     """
-    A custom smoothing approach, including exponential decay at edges.
     """
     if window_size < 3:
         return values
     window = np.ones(window_size)
     decay = np.exp(-np.linspace(0, 3, window_size // 2))
     window[:window_size // 2] = decay
     window[-(window_size // 2):] = decay[::-1]
     window = window / window.sum()
     smoothed = np.convolve(values, window, mode='valid')
     pad_size = len(values) - len(smoothed)
     pad_left = pad_size // 2
     pad_right = pad_size - pad_left
@@ -494,13 +457,16 @@ def sliding_window_smooth(values, window_size=50):
 def normalize_shap_lengths(shap1, shap2):
     """
-    Smooth, interpolate, and return arrays of the same length for direct comparison.
     """
     num_points, smooth_window, _ = calculate_adaptive_parameters(len(shap1), len(shap2))
     shap1_smooth = sliding_window_smooth(shap1, smooth_window)
     shap2_smooth = sliding_window_smooth(shap2, smooth_window)
     x1 = np.linspace(0, 1, len(shap1_smooth))
     x2 = np.linspace(0, 1, len(shap2_smooth))
     x_norm = np.linspace(0, 1, num_points)
@@ -512,8 +478,7 @@ def normalize_shap_lengths(shap1, shap2):
 def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
     """
-    Compare two sequences using the previously defined analysis pipeline
-    and produce difference visualizations & stats.
     """
     try:
         # Analyze first sequence
@@ -526,23 +491,26 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
         if isinstance(res2[0], str) and "Error" in res2[0]:
             return (f"Error in sequence 2: {res2[0]}", None, None, None)
         shap1 = res1[3]["shap_means"]
         shap2 = res2[3]["shap_means"]
         len1, len2 = len(shap1), len(shap2)
         length_diff = abs(len1 - len2)
         length_ratio = min(len1, len2) / max(len1, len2)
-        # Normalize both to the same length
         shap1_norm, shap2_norm, smooth_window = normalize_shap_lengths(shap1, shap2)
         shap_diff = compute_shap_difference(shap1_norm, shap2_norm)
-        # Compute stats
         base_threshold = 0.05
         adaptive_threshold = base_threshold * (1 + (1 - length_ratio))
         if length_diff > 50000:
             adaptive_threshold *= 1.5
         avg_diff = np.mean(shap_diff)
         std_diff = np.std(shap_diff)
         max_diff = np.max(shap_diff)
@@ -550,7 +518,7 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
         substantial_diffs = np.abs(shap_diff) > adaptive_threshold
         frac_different = np.mean(substantial_diffs)
-        # Extract classification from text
         try:
             classification1 = res1[0].split('Classification: ')[1].split('\n')[0].strip()
             classification2 = res2[0].split('Classification: ')[1].split('\n')[0].strip()
@@ -558,6 +526,7 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
             classification1 = "Unknown"
             classification2 = "Unknown"
         comparison_text = (
             "Sequence Comparison Results:\n"
             f"Sequence 1: {res1[4]}\n"
@@ -584,12 +553,14 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
             "- White regions: Similar between sequences"
         )
         heatmap_fig = plot_comparative_heatmap(
             shap_diff,
             title=f"SHAP Difference Heatmap (window: {smooth_window})"
         )
         heatmap_img = fig_to_image(heatmap_fig)
         num_bins = max(20, min(50, int(np.sqrt(len(shap_diff)))))
         hist_fig = plot_shap_histogram(
             shap_diff,
@@ -598,62 +569,31 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
         )
         hist_img = fig_to_image(hist_fig)
         return (comparison_text, heatmap_img, hist_img, None)
     except Exception as e:
         error_msg = f"Error during sequence comparison: {str(e)}"
         return (error_msg, None, None, None)
-###############################################################################
-# 10. ADDITIONAL / ADVANCED VISUALIZATIONS & STATISTICS
-###############################################################################
-def n50_length(sequence):
-    """
-    Calculate the N50 for a single continuous sequence (for demonstration).
-    For a single sequence, N50 is typically the length if it's just one piece,
-    but let's do a simplistic example.
-    """
-    # If you had contigs, you'd do a sorted list, cumulative sums, etc.
-    # We'll do a trivial approach here:
-    return len(sequence)  # Because we have only one contiguous region
-def sequence_complexity(sequence):
-    """
-    Compute a simple measure of 'sequence complexity'.
-    Here, we define complexity as the Shannon entropy over the nucleotides.
-    """
-    from math import log2
-    length = len(sequence)
-    if length == 0:
-        return 0.0
-    freq = {}
-    for base in sequence:
-        freq[base] = freq.get(base, 0) + 1
-    complexity = 0.0
-    for base, count in freq.items():
-        p = count / length
-        complexity -= p * log2(p)
-    return complexity
-def advanced_gene_statistics(gene_shap: np.ndarray, gene_seq: str) -> Dict[str, float]:
-    """
-    Additional stats: N50, complexity, etc.
-    """
-    stats = {}
-    stats['n50'] = len(gene_seq)  # trivial for a single gene region
-    stats['entropy'] = sequence_complexity(gene_seq)
-    stats['avg_shap'] = float(np.mean(gene_shap))
-    stats['max_shap'] = float(np.max(gene_shap)) if len(gene_shap) else 0.0
-    stats['min_shap'] = float(np.min(gene_shap)) if len(gene_shap) else 0.0
-    return stats
 ###############################################################################
 # 11. GENE FEATURE ANALYSIS
 ###############################################################################
 def parse_gene_features(text: str) -> List[Dict[str, Any]]:
-    """Parse gene features from text file in a FASTA-like format."""
     genes = []
     current_header = None
     current_sequence = []
@@ -662,6 +602,7 @@ def parse_gene_features(text: str) -> List[Dict[str, Any]]:
         line = line.strip()
         if not line:
             continue
         if line.startswith('>'):
             if current_header:
                 genes.append({
@@ -673,29 +614,36 @@ def parse_gene_features(text: str) -> List[Dict[str, Any]]:
             current_sequence = []
         else:
             current_sequence.append(line.upper())
     if current_header:
         genes.append({
             'header': current_header,
             'sequence': ''.join(current_sequence),
             'metadata': parse_gene_metadata(current_header)
         })
     return genes
 def parse_gene_metadata(header: str) -> Dict[str, str]:
-    """Extract metadata from gene header line."""
     metadata = {}
     parts = header.split()
     for part in parts:
         if '[' in part and ']' in part:
             key_value = part[1:-1].split('=', 1)
             if len(key_value) == 2:
                 metadata[key_value[0]] = key_value[1]
     return metadata
 def parse_location(location_str: str) -> Tuple[Optional[int], Optional[int]]:
-    """Parse gene location string, handling forward and complement strands."""
     try:
         clean_loc = location_str.replace('complement(', '').replace(')', '')
         if '..' in clean_loc:
             start, end = map(int, clean_loc.split('..'))
             return start, end
@@ -706,41 +654,48 @@ def parse_location(location_str: str) -> Tuple[Optional[int], Optional[int]]:
         return None, None
 def compute_gene_statistics(gene_shap: np.ndarray) -> Dict[str, float]:
-    """Basic statistical measures for gene SHAP values."""
     return {
-        'avg_shap': float(np.mean(gene_shap)) if len(gene_shap) else 0.0,
-        'median_shap': float(np.median(gene_shap)) if len(gene_shap) else 0.0,
-        'std_shap': float(np.std(gene_shap)) if len(gene_shap) else 0.0,
-        'max_shap': float(np.max(gene_shap)) if len(gene_shap) else 0.0,
-        'min_shap': float(np.min(gene_shap)) if len(gene_shap) else 0.0,
-        'pos_fraction': float(np.mean(gene_shap > 0)) if len(gene_shap) else 0.0
     }
 def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_length: int) -> Image.Image:
     """
-    A quick PIL-based diagram to show genes along the genome.
-    Color intensity = magnitude of SHAP. Red/Blue = sign of SHAP.
     """
     if not gene_results or genome_length <= 0:
         img = Image.new('RGB', (800, 100), color='white')
         draw = ImageDraw.Draw(img)
         draw.text((10, 40), "Error: Invalid input data", fill='black')
         return img
     for gene in gene_results:
         gene['start'] = max(0, int(gene['start']))
         gene['end'] = min(genome_length, int(gene['end']))
         if gene['start'] >= gene['end']:
-            print(f"Warning: Invalid coordinates for gene {gene.get('gene_name','?')}")
     width = 1500
     height = 600
     margin = 50
     track_height = 40
     img = Image.new('RGB', (width, height), 'white')
     draw = ImageDraw.Draw(img)
     try:
         font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 12)
         title_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 16)
@@ -748,16 +703,24 @@ def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_leng
         font = ImageFont.load_default()
         title_font = ImageFont.load_default()
-    draw.text((margin, margin // 2), "Genome SHAP Analysis (Simple)", fill='black', font=title_font or font)
     line_y = height // 2
     draw.line([(int(margin), int(line_y)), (int(width - margin), int(line_y))], fill='black', width=2)
     scale = float(width - 2 * margin) / float(genome_length)
-    # Scale markers
     num_ticks = 10
-    step = max(1, genome_length // num_ticks)
     for i in range(0, genome_length + 1, step):
         x_coord = margin + i * scale
         draw.line([
@@ -766,33 +729,50 @@ def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_leng
         ], fill='black', width=1)
         draw.text((int(x_coord - 20), int(line_y + 10)), f"{i:,}", fill='black', font=font)
     sorted_genes = sorted(gene_results, key=lambda x: abs(x['avg_shap']))
     for idx, gene in enumerate(sorted_genes):
         start_x = margin + int(gene['start'] * scale)
         end_x   = margin + int(gene['end'] * scale)
         avg_shap = gene['avg_shap']
         intensity = int(abs(avg_shap) * 500)
-        intensity = max(50, min(255, intensity))
         if avg_shap > 0:
-            color = (255, 255 - intensity, 255 - intensity)  # Redish
         else:
-            color = (255 - intensity, 255 - intensity, 255)  # Blueish
         draw.rectangle([
             (int(start_x), int(line_y - track_height // 2)),
             (int(end_x),   int(line_y + track_height // 2))
         ], fill=color, outline='black')
         label = str(gene.get('gene_name','?'))
         label_mask = font.getmask(label)
         label_width, label_height = label_mask.size
         if idx % 2 == 0:
             text_y = line_y - track_height - 15
         else:
             text_y = line_y + track_height + 5
         gene_width = end_x - start_x
         if gene_width > label_width:
             text_x = start_x + (gene_width - label_width) // 2
@@ -804,113 +784,64 @@ def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_leng
             rotated_img = txt_img.rotate(90, expand=True)
             img.paste(rotated_img, (int(start_x), int(text_y)), rotated_img)
-    return img
-def create_advanced_genome_diagram(gene_results: List[Dict[str, Any]],
-                                   genome_length: int,
-                                   shap_means: np.ndarray,
-                                   diagram_title: str = "Advanced Genome Diagram") -> Image.Image:
-    """
-    An advanced genome diagram using Biopython's GenomeDiagram.
-    We'll create tracks for genes and a 'SHAP line plot' track.
-    """
-    if not gene_results or genome_length <= 0 or len(shap_means) == 0:
-        # Fallback if data is invalid
-        img = Image.new('RGB', (800, 100), color='white')
-        d = ImageDraw.Draw(img)
-        d.text((10, 40), "Error: Not enough data for advanced diagram", fill='black')
-        return img
-    diagram = GenomeDiagram.Diagram(diagram_title)
-    gene_track = diagram.new_track(1, name="Genes", greytrack=False, height=0.5)
-    gene_set = gene_track.new_set()
-    # Add each gene as a feature
-    for gene in gene_results:
-        start = max(0, int(gene['start']))
-        end = min(genome_length, int(gene['end']))
-        avg_shap = gene['avg_shap']
-        # Color scale: negative = blue, positive = red
-        intensity = abs(avg_shap) * 500
-        intensity = max(50, min(255, intensity))
-        if avg_shap >= 0:
-            color_hex = colors.Color(1.0, 1.0 - intensity/255.0, 1.0 - intensity/255.0)
-        else:
-            color_hex = colors.Color(1.0 - intensity/255.0, 1.0 - intensity/255.0, 1.0)
-        feature = SeqFeature(FeatureLocation(start, end), strand=1)
-        gene_set.add_feature(
-            feature,
-            color=color_hex,
-            label=True,
-            name=str(gene.get('gene_name','?')),
-            label_size=8,
-            label_color=colors.black
-        )
-    # Add a track for the SHAP line
-    shap_track = diagram.new_track(2, name="SHAP Score", greytrack=False, height=0.3)
-    shap_set = shap_track.new_set("graph")
-    # We'll plot the entire shap_means array.
-    # X coords = [0..genome_length], Y coords = shap_means
-    # We'll keep negative values below baseline, positive above.
-    # Normalizing for visualization
-    max_abs = max(abs(shap_means.min()), abs(shap_means.max()))
-    if max_abs == 0:
-        scaled_shap = [0]*len(shap_means)
-    else:
-        scaled_shap = (shap_means / max_abs * 50).tolist()  # scale to +/- 50
-    shap_set.add_graph(
-        data=scaled_shap,
-        name="shap_line",
-        style="line",
-        color=colors.darkgreen,
-        altcolor=colors.red,
-        linewidth=1
-    )
-    # Draw to a temporary file
-    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmpf:
-        diagram.draw(format="linear", pagesize='A3', fragments=1, start=0, end=genome_length)
-        diagram.write(tmpf.name, "PDF")
-        # Convert PDF to a PIL image (requires poppler or similar).
-        # If you do not have poppler, you can skip PDF -> image or use Cairo.
-        try:
-            import pdf2image
-            pages = pdf2image.convert_from_path(tmpf.name, dpi=100)
-            img = pages[0] if pages else Image.new('RGB', (800, 100), color='white')
-        except ImportError:
-            img = Image.new('RGB', (800, 100), color='white')
-            d = ImageDraw.Draw(img)
-            d.text((10, 40), "pdf2image not installed, can't show advanced diagram as image.", fill='black')
-    # Cleanup
-    os.remove(tmpf.name)
     return img
 def analyze_gene_features(sequence_file: str,
                           features_file: str,
                           fasta_text: str = "",
-                          features_text: str = "",
-                          diagram_mode: str = "advanced"
-                          ) -> Tuple[str, Optional[str], Optional[Image.Image]]:
-    """
-    Analyze each gene in the features file, compute gene-level SHAP stats,
-    produce tabular output, and create an optional genome diagram.
-    """
-    # 1) Analyze the entire sequence with the top-level function
     sequence_results = analyze_sequence(sequence_file, top_kmers=10, fasta_text=fasta_text)
     if isinstance(sequence_results[0], str) and "Error" in sequence_results[0]:
         return f"Error in sequence analysis: {sequence_results[0]}", None, None
-    seq = sequence_results[3]["seq"]
     shap_means = sequence_results[3]["shap_means"]
-    genome_length = len(seq)
-    # 2) Read gene features
     try:
         if features_text.strip():
             genes = parse_gene_features(features_text)
@@ -919,100 +850,98 @@ def analyze_gene_features(sequence_file: str,
                 genes = parse_gene_features(f.read())
     except Exception as e:
         return f"Error reading features file: {str(e)}", None, None
     gene_results = []
     for gene in genes:
-        location = gene['metadata'].get('location', '')
-        if not location:
-            continue
-        start, end = parse_location(location)
-        if start is None or end is None or start >= end or end > genome_length:
             continue
-        gene_shap = shap_means[start:end]
-        basic_stats = compute_gene_statistics(gene_shap)
-        # Additional stats
-        gene_seq = seq[start:end]
-        adv_stats = advanced_gene_statistics(gene_shap, gene_seq)
-        # Merge basic + advanced stats
-        all_stats = {**basic_stats, **adv_stats}
-        classification = 'Human' if basic_stats['avg_shap'] > 0 else 'Non-human'
-        locus_tag = gene['metadata'].get('locus_tag', '')
-        gene_name = gene['metadata'].get('gene', 'Unknown')
-        gene_dict = {
-            'gene_name': gene_name,
-            'location': location,
-            'start': start,
-            'end': end,
-            'locus_tag': locus_tag,
-            'avg_shap': all_stats['avg_shap'],
-            'median_shap': basic_stats['median_shap'],
-            'std_shap': basic_stats['std_shap'],
-            'max_shap': basic_stats['max_shap'],
-            'min_shap': basic_stats['min_shap'],
-            'pos_fraction': basic_stats['pos_fraction'],
-            'n50': all_stats['n50'],
-            'entropy': all_stats['entropy'],
-            'classification': classification,
-            'confidence': abs(all_stats['avg_shap'])
-        }
-        gene_results.append(gene_dict)
     if not gene_results:
         return "No valid genes could be processed", None, None
-    # 3) Summaries
     sorted_genes = sorted(gene_results, key=lambda x: abs(x['avg_shap']), reverse=True)
     results_text = "Gene Analysis Results:\n\n"
     results_text += f"Total genes analyzed: {len(gene_results)}\n"
-    num_human = sum(1 for g in gene_results if g['classification'] == 'Human')
-    results_text += f"Human-like genes: {num_human}\n"
-    results_text += f"Non-human-like genes: {len(gene_results) - num_human}\n\n"
-    results_text += "Top 10 most distinctive genes (by avg SHAP magnitude):\n"
     for gene in sorted_genes[:10]:
         results_text += (
             f"Gene: {gene['gene_name']}\n"
             f"Location: {gene['location']}\n"
             f"Classification: {gene['classification']} "
             f"(confidence: {gene['confidence']:.4f})\n"
-            f"Average SHAP: {gene['avg_shap']:.4f}\n"
-            f"N50: {gene['n50']}, Entropy: {gene['entropy']:.3f}\n\n"
         )
-    # 4) Make CSV
-    csv_content = "gene_name,location,start,end,locus_tag,avg_shap,median_shap,std_shap,"
-    csv_content += "max_shap,min_shap,pos_fraction,n50,entropy,classification,confidence\n"
-    for g in gene_results:
         csv_content += (
-            f"{g['gene_name']},{g['location']},{g['start']},{g['end']},{g['locus_tag']},"
-            f"{g['avg_shap']:.4f},{g['median_shap']:.4f},{g['std_shap']:.4f},"
-            f"{g['max_shap']:.4f},{g['min_shap']:.4f},{g['pos_fraction']:.4f},"
-            f"{g['n50']},{g['entropy']:.4f},{g['classification']},{g['confidence']:.4f}\n"
         )
     try:
         temp_dir = tempfile.gettempdir()
         temp_path = os.path.join(temp_dir, f"gene_analysis_{os.urandom(4).hex()}.csv")
         with open(temp_path, 'w') as f:
             f.write(csv_content)
     except Exception as e:
         print(f"Error saving CSV: {str(e)}")
         temp_path = None
-    # 5) Create diagram
     try:
-        if diagram_mode == "advanced":
-            diagram_img = create_advanced_genome_diagram(gene_results, genome_length, shap_means)
-        else:
-            diagram_img = create_simple_genome_diagram(gene_results, genome_length)
     except Exception as e:
         print(f"Error creating visualization: {str(e)}")
         diagram_img = Image.new('RGB', (800, 100), color='white')
         draw = ImageDraw.Draw(diagram_img)
         draw.text((10, 40), f"Error creating visualization: {str(e)}", fill='black')
     return results_text, temp_path, diagram_img
 ###############################################################################
@@ -1020,14 +949,13 @@ def analyze_gene_features(sequence_file: str,
 ###############################################################################
 def prepare_csv_download(data, filename="analysis_results.csv"):
-    """
-    Convert data to CSV for Gradio download button.
-    """
     if isinstance(data, str):
         return data.encode(), filename
     elif isinstance(data, (list, dict)):
         import csv
         from io import StringIO
         output = StringIO()
         writer = csv.DictWriter(output, fieldnames=data[0].keys())
         writer.writeheader()
@@ -1051,22 +979,22 @@ css = """
 with gr.Blocks(css=css) as iface:
     gr.Markdown("""
-    # Virus Host Classifier + Extended Genome Visualization
-    **Step 1**: Predict overall viral sequence origin (human vs non-human) and identify extreme subregions.
-    **Step 2**: Explore subregions (local SHAP, GC content, histogram).
-    **Step 3**: Analyze gene features (per-gene SHAP, advanced stats, improved diagrams).
-    **Step 4**: Compare sequences for SHAP differences.
-    **Color Scale**: Negative SHAP = Blue, 0 = White, Positive = Red.
     """)
     with gr.Tab("1) Full-Sequence Analysis"):
         with gr.Row():
             with gr.Column(scale=1):
                 file_input = gr.File(label="Upload FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
-                text_input = gr.Textbox(label="Or paste FASTA", placeholder=">name\nACGT...", lines=5)
                 top_k = gr.Slider(minimum=5, maximum=30, value=10, step=1, label="Number of top k-mers to display")
-                win_size = gr.Slider(minimum=100, maximum=5000, value=500, step=100, label="Subregion Window Size")
                 analyze_btn = gr.Button("Analyze Sequence", variant="primary")
             with gr.Column(scale=2):
                 results_box = gr.Textbox(label="Classification Results", lines=12, interactive=False)
@@ -1085,7 +1013,8 @@ with gr.Blocks(css=css) as iface:
     with gr.Tab("2) Subregion Exploration"):
         gr.Markdown("""
         **Subregion Analysis**
-        View SHAP signals, GC content, etc. for a specific region.
         """)
         with gr.Row():
             region_start = gr.Number(label="Region Start", value=0)
@@ -1095,7 +1024,7 @@ with gr.Blocks(css=css) as iface:
         with gr.Row():
             subregion_img = gr.Image(label="Subregion SHAP Heatmap (B-W-R)")
             subregion_hist_img = gr.Image(label="SHAP Distribution (Histogram)")
-        download_subregion = gr.File(label="Download Subregion", visible=False, elem_classes="download-button")
         region_btn.click(
             analyze_subregion,
@@ -1106,48 +1035,60 @@ with gr.Blocks(css=css) as iface:
     with gr.Tab("3) Gene Features Analysis"):
         gr.Markdown("""
         **Analyze Gene Features**
-        - Upload a FASTA file and a gene features file.
-        - See per-gene SHAP, classification, N50, entropy, etc.
-        - Choose a diagram mode (simple or advanced).
         """)
         with gr.Row():
             with gr.Column(scale=1):
-                gene_fasta_file = gr.File(label="FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
-                gene_fasta_text = gr.Textbox(label="Or paste FASTA sequence", lines=5)
             with gr.Column(scale=1):
-                features_file = gr.File(label="Gene features file", file_types=[".txt"], type="filepath")
-                features_text = gr.Textbox(label="Or paste gene features", lines=5)
-                diagram_mode = gr.Radio(choices=["simple", "advanced"], value="advanced", label="Diagram Mode")
         analyze_genes_btn = gr.Button("Analyze Gene Features", variant="primary")
         gene_results = gr.Textbox(label="Gene Analysis Results", lines=12, interactive=False)
-        gene_diagram = gr.Image(label="Genome Diagram")
         download_gene_results = gr.File(label="Download Gene Analysis (CSV)", visible=True)
         analyze_genes_btn.click(
             analyze_gene_features,
-            inputs=[gene_fasta_file, features_file, gene_fasta_text, features_text, diagram_mode],
             outputs=[gene_results, download_gene_results, gene_diagram]
         )
     with gr.Tab("4) Comparative Analysis"):
         gr.Markdown("""
         **Compare Two Sequences**
-        - Upload or paste two FASTA sequences.
-        - We'll compare SHAP patterns (normalized for different lengths).
         """)
         with gr.Row():
             with gr.Column(scale=1):
-                file_input1 = gr.File(label="1st FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
-                text_input1 = gr.Textbox(label="Or paste 1st FASTA", lines=5)
             with gr.Column(scale=1):
-                file_input2 = gr.File(label="2nd FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
-                text_input2 = gr.Textbox(label="Or paste 2nd FASTA", lines=5)
         compare_btn = gr.Button("Compare Sequences", variant="primary")
         comparison_text = gr.Textbox(label="Comparison Results", lines=12, interactive=False)
         with gr.Row():
             diff_heatmap = gr.Image(label="SHAP Difference Heatmap")
             diff_hist = gr.Image(label="Distribution of SHAP Differences")
-        download_comparison = gr.File(label="Download Comparison", visible=False, elem_classes="download-button")
         compare_btn.click(
             analyze_sequence_comparison,
@@ -1156,12 +1097,25 @@ with gr.Blocks(css=css) as iface:
         )
     gr.Markdown("""
-    ### Notes & Features
-    - **Advanced Genome Diagram** uses Biopython’s `GenomeDiagram` (requires `pdf2image` if you want it as an image).
-    - **Additional Stats**: N50, Shannon entropy, etc.
-    - **Auto-scaling** for comparative analysis with adaptive smoothing.
-    - **Data Export**: Download CSV of analysis results.
     """)
 if __name__ == "__main__":
     iface.launch()

     return sequences
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     vec = np.zeros(len(kmers), dtype=np.float32)
 ###############################################################################
 def calculate_shap_values(model, x_tensor):
     model.eval()
     with torch.no_grad():
         baseline_output = model(x_tensor)
         baseline_probs = torch.softmax(baseline_output, dim=1)
+        baseline_prob = baseline_probs[0, 1].item()  # Prob of 'human'
         shap_values = []
         x_zeroed = x_tensor.clone()
         for i in range(x_tensor.shape[1]):
 ###############################################################################
 def compute_positionwise_scores(sequence, shap_values, k=4):
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     seq_len = len(sequence)
 ###############################################################################
 def find_extreme_subregion(shap_means, window_size=500, mode="max"):
     n = len(shap_means)
     if n == 0:
         return (0, 0, 0.0)
 ###############################################################################
 def fig_to_image(fig):
     buf = io.BytesIO()
     fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
     buf.seek(0)
     return img
 def get_zero_centered_cmap():
     """Build the blue-white-red colormap used for signed SHAP values.

     Blue is anchored at 0.0, white at the midpoint (0.5) and red at 1.0 of
     the colormap range.
     """
     anchors = [
         (0.0, 'blue'),
         (0.5, 'white'),
         (1.0, 'red'),
     ]
     cmap = mcolors.LinearSegmentedColormap.from_list("blue_white_red", anchors)
     return cmap
 def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, end=None):
     if start is not None and end is not None:
         local_shap = shap_means[start:end]
         subtitle = f" (positions {start}-{end})"
     return fig
 def create_importance_bar_plot(shap_values, kmers, top_k=10):
     plt.rcParams.update({'font.size': 10})
     fig = plt.figure(figsize=(10, 5))
     indices = np.argsort(np.abs(shap_values))[-top_k:]
     return fig
 def plot_shap_histogram(shap_array, title="SHAP Distribution in Region", num_bins=30):
     fig, ax = plt.subplots(figsize=(6, 4))
     ax.hist(shap_array, bins=num_bins, color='gray', edgecolor='black')
     ax.axvline(0, color='red', linestyle='--', label='0.0')
     return fig
 def compute_gc_content(sequence):
     """Return the GC content of ``sequence`` as a percentage (0.0-100.0).

     Counting is case-insensitive, so lowercase bases are included in the
     G/C count. An empty (or ``None``) sequence yields ``0.0``.
     """
     if not sequence:
         # Return a float here too so both branches share one return type.
         return 0.0
     normalized = sequence.upper()
     gc_count = normalized.count('G') + normalized.count('C')
     return (gc_count / len(sequence)) * 100.0
 ###############################################################################
 def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
     if fasta_text.strip():
         text = fasta_text.strip()
     elif file_obj is not None:
     else:
         return ("Please provide a FASTA sequence.", None, None, None, None, None)
     sequences = parse_fasta(text)
     if not sequences:
         return ("No valid FASTA sequences found.", None, None, None, None, None)
     header, seq = sequences[0]
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     try:
+        # IMPORTANT: adjust how you load your model as needed
         state_dict = torch.load('model.pt', map_location=device)
         model = VirusClassifier(256).to(device)
         model.load_state_dict(state_dict)
     classification = "Human" if prob_human > 0.5 else "Non-human"
     confidence = max(prob_human, prob_nonhuman)
     shap_means = compute_positionwise_scores(seq, shap_values, k=4)
     max_start, max_end, max_avg = find_extreme_subregion(shap_means, window_size, mode="max")
     min_start, min_end, min_avg = find_extreme_subregion(shap_means, window_size, mode="min")
     results_text = (
         f"Sequence: {header}\n"
         f"Length: {len(seq):,} bases\n"
         f"Start: {min_start}, End: {min_end}, Avg SHAP: {min_avg:.4f}"
     )
     kmers = [''.join(p) for p in product("ACGT", repeat=4)]
     bar_fig = create_importance_bar_plot(shap_values, kmers, top_kmers)
     bar_img = fig_to_image(bar_fig)
     heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide SHAP")
     heatmap_img = fig_to_image(heatmap_fig)
+    # You might want to provide a CSV or other data for the 6th return item
+    # Here, we'll simply return None for the file download:
     state_dict_out = {"seq": seq, "shap_means": shap_means}
     return (results_text, bar_img, heatmap_img, state_dict_out, header, None)
 ###############################################################################
 ###############################################################################
 def analyze_subregion(state, header, region_start, region_end):
     if not state or "seq" not in state or "shap_means" not in state:
         return ("No sequence data found. Please run Step 1 first.", None, None, None)
     seq = state["seq"]
     region_end = max(0, min(region_end, len(seq)))
     if region_end <= region_start:
         return ("Invalid region range. End must be > Start.", None, None, None)
     region_seq = seq[region_start:region_end]
     region_shap = shap_means[region_start:region_end]
     gc_percent = compute_gc_content(region_seq)
     avg_shap = float(np.mean(region_shap))
     positive_fraction = np.mean(region_shap > 0)
     negative_fraction = np.mean(region_shap < 0)
     if avg_shap > 0.05:
         region_classification = "Likely pushing toward human"
     elif avg_shap < -0.05:
         region_classification = "Likely pushing toward non-human"
     else:
         region_classification = "Near neutral (no strong push)"
     region_info = (
         f"Analyzing subregion of {header} from {region_start} to {region_end}\n"
         f"Region length: {len(region_seq)} bases\n"
         f"Fraction with SHAP < 0 (toward non-human): {negative_fraction:.2f}\n"
         f"Subregion interpretation: {region_classification}\n"
     )
     heatmap_fig = plot_linear_heatmap(shap_means, title="Subregion SHAP", start=region_start, end=region_end)
     heatmap_img = fig_to_image(heatmap_fig)
     hist_fig = plot_shap_histogram(region_shap, title="SHAP Distribution in Subregion")
     hist_img = fig_to_image(hist_fig)
+    # For demonstration, returning None for the file download as well
     return (region_info, heatmap_img, hist_img, None)
 ###############################################################################
+# 9. COMPARISON ANALYSIS FUNCTIONS
 ###############################################################################
def get_zero_centered_cmap():
    """Create the blue-white-red colormap used for SHAP difference plots.

    Blue is anchored at 0.0, white at the midpoint (0.5) and red at 1.0 of
    the colormap range.
    """
    colors = [(0.0, 'blue'), (0.5, 'white'), (1.0, 'red')]
    return mcolors.LinearSegmentedColormap.from_list("blue_white_red", colors)
 def compute_shap_difference(shap1_norm, shap2_norm):
     """Return the SHAP difference ``shap2_norm - shap1_norm``.

     The subtraction is applied directly to the arguments, so positive
     results mark positions where the second input is larger than the first.
     """
     return shap2_norm - shap1_norm
 def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
     """
+    Plot heatmap using relative positions (0-100%)
     """
     heatmap_data = shap_diff.reshape(1, -1)
     extent = max(abs(np.min(shap_diff)), abs(np.max(shap_diff)))
 def plot_shap_histogram(shap_array, title="SHAP Distribution", num_bins=30):
     """
+    Plot histogram of SHAP values with configurable number of bins
     """
     fig, ax = plt.subplots(figsize=(6, 4))
     ax.hist(shap_array, bins=num_bins, color='gray', edgecolor='black', alpha=0.7)
 def calculate_adaptive_parameters(len1, len2):
     """
+    Calculate adaptive parameters based on sequence lengths and their difference.
+    Returns: (num_points, smooth_window, resolution_factor)
     """
     length_diff = abs(len1 - len2)
     max_length = max(len1, len2)
     min_length = min(len1, len2)
     length_ratio = min_length / max_length
+    # Base number of points scales with sequence length
     base_points = min(2000, max(500, max_length // 100))
+    # Adjust parameters based on sequence properties
     if length_diff < 500:
         resolution_factor = 2.0
         num_points = min(3000, base_points * 2)
         num_points = max(500, base_points // 2)
         smooth_window = max(100, length_diff // 500)
+    # Adjust window size based on length ratio
     smooth_window = int(smooth_window * (1 + (1 - length_ratio)))
     return int(num_points), int(smooth_window), resolution_factor
 def sliding_window_smooth(values, window_size=50):
     """
+    Apply sliding window smoothing with edge handling
     """
     if window_size < 3:
         return values
+    # Create window with exponential decay at edges
     window = np.ones(window_size)
     decay = np.exp(-np.linspace(0, 3, window_size // 2))
     window[:window_size // 2] = decay
     window[-(window_size // 2):] = decay[::-1]
     window = window / window.sum()
+    # Apply convolution
     smoothed = np.convolve(values, window, mode='valid')
+    # Handle edges
     pad_size = len(values) - len(smoothed)
     pad_left = pad_size // 2
     pad_right = pad_size - pad_left
 def normalize_shap_lengths(shap1, shap2):
     """
+    Normalize and smooth SHAP values with dynamic adaptation
     """
+    # Calculate adaptive parameters
     num_points, smooth_window, _ = calculate_adaptive_parameters(len(shap1), len(shap2))
+    # Apply initial smoothing
     shap1_smooth = sliding_window_smooth(shap1, smooth_window)
     shap2_smooth = sliding_window_smooth(shap2, smooth_window)
+    # Create relative positions and interpolate
     x1 = np.linspace(0, 1, len(shap1_smooth))
     x2 = np.linspace(0, 1, len(shap2_smooth))
     x_norm = np.linspace(0, 1, num_points)
 def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
     """
+    Compare two sequences with adaptive parameters and visualization
     """
     try:
         # Analyze first sequence
         if isinstance(res2[0], str) and "Error" in res2[0]:
             return (f"Error in sequence 2: {res2[0]}", None, None, None)
+        # Extract SHAP values and sequence info
         shap1 = res1[3]["shap_means"]
         shap2 = res2[3]["shap_means"]
+        # Calculate sequence properties
         len1, len2 = len(shap1), len(shap2)
         length_diff = abs(len1 - len2)
         length_ratio = min(len1, len2) / max(len1, len2)
+        # Normalize and compare sequences
         shap1_norm, shap2_norm, smooth_window = normalize_shap_lengths(shap1, shap2)
         shap_diff = compute_shap_difference(shap1_norm, shap2_norm)
+        # Calculate adaptive threshold and statistics
         base_threshold = 0.05
         adaptive_threshold = base_threshold * (1 + (1 - length_ratio))
         if length_diff > 50000:
             adaptive_threshold *= 1.5
+        # Calculate comparison statistics
         avg_diff = np.mean(shap_diff)
         std_diff = np.std(shap_diff)
         max_diff = np.max(shap_diff)
         substantial_diffs = np.abs(shap_diff) > adaptive_threshold
         frac_different = np.mean(substantial_diffs)
+        # Extract classifications
         try:
             classification1 = res1[0].split('Classification: ')[1].split('\n')[0].strip()
             classification2 = res2[0].split('Classification: ')[1].split('\n')[0].strip()
             classification1 = "Unknown"
             classification2 = "Unknown"
+        # Format output text
         comparison_text = (
             "Sequence Comparison Results:\n"
             f"Sequence 1: {res1[4]}\n"
             "- White regions: Similar between sequences"
         )
+        # Generate visualizations
         heatmap_fig = plot_comparative_heatmap(
             shap_diff,
             title=f"SHAP Difference Heatmap (window: {smooth_window})"
         )
         heatmap_img = fig_to_image(heatmap_fig)
+        # Create histogram with adaptive bins
         num_bins = max(20, min(50, int(np.sqrt(len(shap_diff)))))
         hist_fig = plot_shap_histogram(
             shap_diff,
         )
         hist_img = fig_to_image(hist_fig)
+        # Return 4 outputs (text, image, image, and a file or None for the last)
         return (comparison_text, heatmap_img, hist_img, None)
     except Exception as e:
         error_msg = f"Error during sequence comparison: {str(e)}"
         return (error_msg, None, None, None)
 ###############################################################################
 # 11. GENE FEATURE ANALYSIS
 ###############################################################################
+import io
+from io import BytesIO
+from PIL import Image, ImageDraw, ImageFont
+import numpy as np
+import pandas as pd
+import tempfile
+import os
+from typing import List, Dict, Tuple, Optional, Any
+import matplotlib.pyplot as plt
+from matplotlib.colors import LinearSegmentedColormap
+import seaborn as sns
 def parse_gene_features(text: str) -> List[Dict[str, Any]]:
+    """Parse gene features from text file in FASTA-like format"""
     genes = []
     current_header = None
     current_sequence = []
         line = line.strip()
         if not line:
             continue
         if line.startswith('>'):
             if current_header:
                 genes.append({
             current_sequence = []
         else:
             current_sequence.append(line.upper())
     if current_header:
         genes.append({
             'header': current_header,
             'sequence': ''.join(current_sequence),
             'metadata': parse_gene_metadata(current_header)
         })
     return genes
 def parse_gene_metadata(header: str) -> Dict[str, str]:
     """Extract ``[key=value]`` metadata pairs from a gene header line.

     Args:
         header: Header text such as
             ``>id [gene=S] [protein=surface glycoprotein] [location=1..10]``.

     Returns:
         Mapping of each bracketed key to its value. Bracket groups without
         an ``=`` are ignored; when a key repeats, the last value wins.
     """
     # Local import: the file's top-level import list is not fully visible
     # from this block, so `re` is brought into scope here.
     import re

     # Match whole bracket groups rather than whitespace-split tokens so that
     # values containing spaces (e.g. protein names) are kept instead of
     # being silently dropped.
     pairs = re.findall(r'\[([^\[\]=]+)=([^\[\]]*)\]', header)
     return {key.strip(): value.strip() for key, value in pairs}
 def parse_location(location_str: str) -> Tuple[Optional[int], Optional[int]]:
+    """Parse gene location string, handling both forward and complement strands"""
     try:
+        # Remove 'complement(' and ')' if present
         clean_loc = location_str.replace('complement(', '').replace(')', '')
+        # Split on '..' and convert to integers
         if '..' in clean_loc:
             start, end = map(int, clean_loc.split('..'))
             return start, end
         return None, None
 def compute_gene_statistics(gene_shap: np.ndarray) -> Dict[str, float]:
     """Summarize the per-position SHAP values of one gene.

     Args:
         gene_shap: Per-position SHAP values; any numeric array-like.

     Returns:
         Dict with ``avg_shap``, ``median_shap``, ``std_shap``, ``max_shap``,
         ``min_shap`` and ``pos_fraction`` (share of values > 0), all floats.

     Raises:
         ValueError: If ``gene_shap`` is empty.
     """
     # Coerce once so plain lists work with the `> 0` comparison below.
     values = np.asarray(gene_shap, dtype=float)
     if values.size == 0:
         # np.max/np.min would raise ValueError on empty input anyway, but
         # only after np.mean/np.std emit warnings; fail early and clearly.
         raise ValueError("Cannot compute gene statistics for an empty SHAP array")
     return {
         'avg_shap': float(np.mean(values)),
         'median_shap': float(np.median(values)),
         'std_shap': float(np.std(values)),
         'max_shap': float(np.max(values)),
         'min_shap': float(np.min(values)),
         'pos_fraction': float(np.mean(values > 0)),
     }
 def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_length: int) -> Image.Image:
     """
+    Create a simple genome diagram using PIL, forcing a minimum color intensity
+    so that small SHAP values don't appear white.
     """
+    from PIL import Image, ImageDraw, ImageFont
+    # Validate inputs
     if not gene_results or genome_length <= 0:
         img = Image.new('RGB', (800, 100), color='white')
         draw = ImageDraw.Draw(img)
         draw.text((10, 40), "Error: Invalid input data", fill='black')
         return img
+    # Ensure all gene coordinates are valid integers
     for gene in gene_results:
         gene['start'] = max(0, int(gene['start']))
         gene['end'] = min(genome_length, int(gene['end']))
         if gene['start'] >= gene['end']:
+            print(f"Warning: Invalid coordinates for gene {gene.get('gene_name','?')}: {gene['start']}-{gene['end']}")
+    # Image dimensions
     width = 1500
     height = 600
     margin = 50
     track_height = 40
+    # Create image with white background
     img = Image.new('RGB', (width, height), 'white')
     draw = ImageDraw.Draw(img)
+    # Try to load font, fall back to default if unavailable
     try:
         font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 12)
         title_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 16)
         font = ImageFont.load_default()
         title_font = ImageFont.load_default()
+    # Draw title
+    draw.text((margin, margin // 2), "Genome SHAP Analysis", fill='black', font=title_font or font)
+    # Draw genome line
     line_y = height // 2
     draw.line([(int(margin), int(line_y)), (int(width - margin), int(line_y))], fill='black', width=2)
+    # Calculate scale factor
     scale = float(width - 2 * margin) / float(genome_length)
+    # Determine a reasonable step for scale markers
     num_ticks = 10
+    if genome_length < num_ticks:
+        step = 1
+    else:
+        step = genome_length // num_ticks
+    # Draw scale markers
     for i in range(0, genome_length + 1, step):
         x_coord = margin + i * scale
         draw.line([
         ], fill='black', width=1)
         draw.text((int(x_coord - 20), int(line_y + 10)), f"{i:,}", fill='black', font=font)
+    # Sort genes by absolute SHAP value for drawing
     sorted_genes = sorted(gene_results, key=lambda x: abs(x['avg_shap']))
+    # Draw genes
     for idx, gene in enumerate(sorted_genes):
+        # Calculate position and ensure integers
         start_x = margin + int(gene['start'] * scale)
         end_x   = margin + int(gene['end'] * scale)
+        # Calculate color based on SHAP value
         avg_shap = gene['avg_shap']
+        # Convert shap -> color intensity (0 to 255)
+        # Then clamp to a minimum intensity so it never ends up plain white
         intensity = int(abs(avg_shap) * 500)
+        intensity = max(50, min(255, intensity))  # clamp between 50 and 255
         if avg_shap > 0:
+            # Red-ish for positive
+            color = (255, 255 - intensity, 255 - intensity)
         else:
+            # Blue-ish for negative or zero
+            color = (255 - intensity, 255 - intensity, 255)
+        # Draw gene rectangle
         draw.rectangle([
             (int(start_x), int(line_y - track_height // 2)),
             (int(end_x),   int(line_y + track_height // 2))
         ], fill=color, outline='black')
+        # Prepare gene name label
         label = str(gene.get('gene_name','?'))
+        # Fallback for label size
         label_mask = font.getmask(label)
         label_width, label_height = label_mask.size
+        # Alternate label positions
         if idx % 2 == 0:
             text_y = line_y - track_height - 15
         else:
             text_y = line_y + track_height + 5
+        # Decide whether to rotate text based on space
         gene_width = end_x - start_x
         if gene_width > label_width:
             text_x = start_x + (gene_width - label_width) // 2
             rotated_img = txt_img.rotate(90, expand=True)
             img.paste(rotated_img, (int(start_x), int(text_y)), rotated_img)
+    # Draw legend
+    legend_x = margin
+    legend_y = height - margin
+    draw.text((int(legend_x), int(legend_y - 60)), "SHAP Values:", fill='black', font=font)
+    # Draw legend boxes
+    box_width = 20
+    box_height = 20
+    spacing = 15
+    # Strong human-like
+    draw.rectangle([
+        (int(legend_x), int(legend_y - 45)),
+        (int(legend_x + box_width), int(legend_y - 45 + box_height))
+    ], fill=(255, 0, 0), outline='black')
+    draw.text((int(legend_x + box_width + spacing), int(legend_y - 45)),
+              "Strong human-like signal", fill='black', font=font)
+    # Weak human-like
+    draw.rectangle([
+        (int(legend_x), int(legend_y - 20)),
+        (int(legend_x + box_width), int(legend_y - 20 + box_height))
+    ], fill=(255, 200, 200), outline='black')
+    draw.text((int(legend_x + box_width + spacing), int(legend_y - 20)),
+              "Weak human-like signal", fill='black', font=font)
+    # Weak non-human-like
+    draw.rectangle([
+        (int(legend_x + 250), int(legend_y - 45)),
+        (int(legend_x + 250 + box_width), int(legend_y - 45 + box_height))
+    ], fill=(200, 200, 255), outline='black')
+    draw.text((int(legend_x + 250 + box_width + spacing), int(legend_y - 45)),
+              "Weak non-human-like signal", fill='black', font=font)
+    # Strong non-human-like
+    draw.rectangle([
+        (int(legend_x + 250), int(legend_y - 20)),
+        (int(legend_x + 250 + box_width), int(legend_y - 20 + box_height))
+    ], fill=(0, 0, 255), outline='black')
+    draw.text((int(legend_x + 250 + box_width + spacing), int(legend_y - 20)),
+              "Strong non-human-like signal", fill='black', font=font)
     return img
 def analyze_gene_features(sequence_file: str,
                           features_file: str,
                           fasta_text: str = "",
+                          features_text: str = "") -> Tuple[str, Optional[str], Optional[Image.Image]]:
+    """Analyze SHAP values for each gene feature"""
+    # First analyze whole sequence
     sequence_results = analyze_sequence(sequence_file, top_kmers=10, fasta_text=fasta_text)
     if isinstance(sequence_results[0], str) and "Error" in sequence_results[0]:
         return f"Error in sequence analysis: {sequence_results[0]}", None, None
+    # Get SHAP values
     shap_means = sequence_results[3]["shap_means"]
+    # Parse gene features
     try:
         if features_text.strip():
             genes = parse_gene_features(features_text)
                 genes = parse_gene_features(f.read())
     except Exception as e:
         return f"Error reading features file: {str(e)}", None, None
+    # Analyze each gene
     gene_results = []
     for gene in genes:
+        try:
+            location = gene['metadata'].get('location', '')
+            if not location:
+                continue
+            start, end = parse_location(location)
+            if start is None or end is None:
+                continue
+            # Get SHAP values for this region
+            gene_shap = shap_means[start:end]
+            stats = compute_gene_statistics(gene_shap)
+            gene_results.append({
+                'gene_name': gene['metadata'].get('gene', 'Unknown'),
+                'location': location,
+                'start': start,
+                'end': end,
+                'locus_tag': gene['metadata'].get('locus_tag', ''),
+                'avg_shap': stats['avg_shap'],
+                'median_shap': stats['median_shap'],
+                'std_shap': stats['std_shap'],
+                'max_shap': stats['max_shap'],
+                'min_shap': stats['min_shap'],
+                'pos_fraction': stats['pos_fraction'],
+                'classification': 'Human' if stats['avg_shap'] > 0 else 'Non-human',
+                'confidence': abs(stats['avg_shap'])
+            })
+        except Exception as e:
+            print(f"Error processing gene {gene['metadata'].get('gene', 'Unknown')}: {str(e)}")
             continue
     if not gene_results:
         return "No valid genes could be processed", None, None
+    # Sort genes by absolute SHAP value
     sorted_genes = sorted(gene_results, key=lambda x: abs(x['avg_shap']), reverse=True)
+    # Create results text
     results_text = "Gene Analysis Results:\n\n"
     results_text += f"Total genes analyzed: {len(gene_results)}\n"
+    results_text += f"Human-like genes: {sum(1 for g in gene_results if g['classification'] == 'Human')}\n"
+    results_text += f"Non-human-like genes: {sum(1 for g in gene_results if g['classification'] == 'Non-human')}\n\n"
+    results_text += "Top 10 most distinctive genes:\n"
     for gene in sorted_genes[:10]:
         results_text += (
             f"Gene: {gene['gene_name']}\n"
             f"Location: {gene['location']}\n"
             f"Classification: {gene['classification']} "
             f"(confidence: {gene['confidence']:.4f})\n"
+            f"Average SHAP: {gene['avg_shap']:.4f}\n\n"
         )
+    # Create CSV content
+    csv_content = "gene_name,location,avg_shap,median_shap,std_shap,max_shap,min_shap,"
+    csv_content += "pos_fraction,classification,confidence,locus_tag\n"
+    for gene in gene_results:
         csv_content += (
+            f"{gene['gene_name']},{gene['location']},{gene['avg_shap']:.4f},"
+            f"{gene['median_shap']:.4f},{gene['std_shap']:.4f},{gene['max_shap']:.4f},"
+            f"{gene['min_shap']:.4f},{gene['pos_fraction']:.4f},{gene['classification']},"
+            f"{gene['confidence']:.4f},{gene['locus_tag']}\n"
         )
+    # Save CSV to temp file
     try:
         temp_dir = tempfile.gettempdir()
         temp_path = os.path.join(temp_dir, f"gene_analysis_{os.urandom(4).hex()}.csv")
         with open(temp_path, 'w') as f:
             f.write(csv_content)
     except Exception as e:
         print(f"Error saving CSV: {str(e)}")
         temp_path = None
+    # Create visualization
     try:
+        diagram_img = create_simple_genome_diagram(gene_results, len(shap_means))
     except Exception as e:
         print(f"Error creating visualization: {str(e)}")
+        # Create error image
         diagram_img = Image.new('RGB', (800, 100), color='white')
         draw = ImageDraw.Draw(diagram_img)
         draw.text((10, 40), f"Error creating visualization: {str(e)}", fill='black')
     return results_text, temp_path, diagram_img
 ###############################################################################
 ###############################################################################
 def prepare_csv_download(data, filename="analysis_results.csv"):
+    """Prepare CSV data for download"""
     if isinstance(data, str):
         return data.encode(), filename
     elif isinstance(data, (list, dict)):
         import csv
         from io import StringIO
         output = StringIO()
         writer = csv.DictWriter(output, fieldnames=data[0].keys())
         writer.writeheader()
 # Gradio UI definition: an intro Markdown panel, four analysis tabs, and a
 # closing Markdown feature summary, all registered on the `iface` Blocks app.
 # NOTE(review): this is a diff excerpt; some widget definitions and event
 # wiring referenced below sit in lines that are not visible here.
 with gr.Blocks(css=css) as iface:
     gr.Markdown("""
+    # Virus Host Classifier
+    **Step 1**: Predict overall viral sequence origin (human vs non-human) and identify extreme regions.
+    **Step 2**: Explore subregions to see local SHAP signals, distribution, GC content, etc.
+    **Step 3**: Analyze gene features and their contributions.
+    **Step 4**: Compare sequences and analyze differences.
+    **Color Scale**: Negative SHAP = Blue, Zero = White, Positive SHAP = Red.
     """)
     # Tab 1: FASTA input (file upload or pasted text), k-mer / window-size
     # sliders, and a text box for the classification results.
     # NOTE(review): the click handler for analyze_btn is not visible here.
     with gr.Tab("1) Full-Sequence Analysis"):
         with gr.Row():
             with gr.Column(scale=1):
                 file_input = gr.File(label="Upload FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
+                text_input = gr.Textbox(label="Or paste FASTA sequence", placeholder=">sequence_name\nACGTACGT...", lines=5)
                 top_k = gr.Slider(minimum=5, maximum=30, value=10, step=1, label="Number of top k-mers to display")
+                win_size = gr.Slider(minimum=100, maximum=5000, value=500, step=100, label="Window size for 'most pushing' subregions")
                 analyze_btn = gr.Button("Analyze Sequence", variant="primary")
             with gr.Column(scale=2):
                 results_box = gr.Textbox(label="Classification Results", lines=12, interactive=False)
     # Tab 2: region selection plus heatmap / histogram outputs.
     # NOTE(review): region_btn is used below but defined in lines not shown.
     with gr.Tab("2) Subregion Exploration"):
         gr.Markdown("""
         **Subregion Analysis**
+        Select start/end positions to view local SHAP signals, distribution, GC content, etc.
+        The heatmap uses the same Blue-White-Red scale.
         """)
         with gr.Row():
             region_start = gr.Number(label="Region Start", value=0)
         with gr.Row():
             subregion_img = gr.Image(label="Subregion SHAP Heatmap (B-W-R)")
             subregion_hist_img = gr.Image(label="SHAP Distribution (Histogram)")
         # The download widget is created hidden (visible=False).
+        download_subregion = gr.File(label="Download Subregion Analysis", visible=False, elem_classes="download-button")
         region_btn.click(
             analyze_subregion,
     # NOTE(review): the remaining arguments of the click() call above are
     # elided in this view.
     # Tab 3: genome FASTA plus gene-feature inputs (each as upload or pasted
     # text); results come back as text, a CSV file and a diagram image.
     with gr.Tab("3) Gene Features Analysis"):
         gr.Markdown("""
         **Analyze Gene Features**
+        Upload a FASTA file and corresponding gene features file to analyze SHAP values per gene.
+        Gene features should be in the format:
+        ```
+        >gene_name [gene=X] [locus_tag=Y] [location=start..end] or [location=complement(start..end)]
+        SEQUENCE
+        ```
+        The genome viewer will show genes color-coded by their contribution:
+        - Red: Genes pushing toward human origin
+        - Blue: Genes pushing toward non-human origin
+        - Color intensity indicates strength of signal
         """)
         with gr.Row():
             with gr.Column(scale=1):
+                gene_fasta_file = gr.File(label="Upload FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
+                gene_fasta_text = gr.Textbox(label="Or paste FASTA sequence", placeholder=">sequence_name\nACGTACGT...", lines=5)
             with gr.Column(scale=1):
+                features_file = gr.File(label="Upload gene features file", file_types=[".txt"], type="filepath")
+                features_text = gr.Textbox(label="Or paste gene features", placeholder=">gene_1 [gene=U12]...\nACGT...", lines=5)
         analyze_genes_btn = gr.Button("Analyze Gene Features", variant="primary")
         gene_results = gr.Textbox(label="Gene Analysis Results", lines=12, interactive=False)
+        gene_diagram = gr.Image(label="Genome Diagram with Gene Features")
         download_gene_results = gr.File(label="Download Gene Analysis (CSV)", visible=True)
         # Input order here is (fasta file, features file, fasta text, features
         # text); the three outputs are text, CSV download and diagram image.
         analyze_genes_btn.click(
             analyze_gene_features,
+            inputs=[gene_fasta_file, features_file, gene_fasta_text, features_text],
             outputs=[gene_results, download_gene_results, gene_diagram]
         )
     # Tab 4: two FASTA inputs (upload or pasted text) compared side by side.
     with gr.Tab("4) Comparative Analysis"):
         gr.Markdown("""
         **Compare Two Sequences**
+        Upload or paste two FASTA sequences to compare their SHAP patterns.
+        The sequences will be normalized to the same length for comparison.
+        **Color Scale**:
+        - Red: Sequence 2 more human-like
+        - Blue: Sequence 1 more human-like
+        - White: No substantial difference
         """)
         with gr.Row():
             with gr.Column(scale=1):
+                file_input1 = gr.File(label="Upload first FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
+                text_input1 = gr.Textbox(label="Or paste first FASTA sequence", placeholder=">sequence1\nACGTACGT...", lines=5)
             with gr.Column(scale=1):
+                file_input2 = gr.File(label="Upload second FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
+                text_input2 = gr.Textbox(label="Or paste second FASTA sequence", placeholder=">sequence2\nACGTACGT...", lines=5)
         compare_btn = gr.Button("Compare Sequences", variant="primary")
         comparison_text = gr.Textbox(label="Comparison Results", lines=12, interactive=False)
         with gr.Row():
             diff_heatmap = gr.Image(label="SHAP Difference Heatmap")
             diff_hist = gr.Image(label="Distribution of SHAP Differences")
         # The download widget is created hidden (visible=False).
+        download_comparison = gr.File(label="Download Comparison Results", visible=False, elem_classes="download-button")
         # NOTE(review): the inputs/outputs arguments of this click() call are
         # elided in this view.
         compare_btn.click(
             analyze_sequence_comparison,
         )
     # Closing feature summary rendered below the tabs.
     gr.Markdown("""
+    ### Interface Features
+    - **Overall Classification** (human vs non-human) using k-mer frequencies
+    - **SHAP Analysis** shows which k-mers push classification toward or away from human
+    - **White-Centered SHAP Gradient**:
+      - Negative (blue), 0 (white), Positive (red)
+      - Symmetrical color range around 0
+    - **Identify Subregions** with strongest push for human or non-human
+    - **Gene Feature Analysis**:
+      - Analyze individual genes' contributions
+      - Interactive genome viewer
+      - Gene-level statistics and classification
+    - **Sequence Comparison**:
+      - Compare two sequences to identify regions of difference
+      - Normalized comparison to handle different lengths
+      - Statistical summary of differences
+    - **Data Export**:
+      - Download results as CSV files
+      - Save analysis outputs for further processing
     """)
 # Launch the Gradio app only when this file is executed as a script.
 if __name__ == "__main__":
     iface.launch()