Spaces:

hiyata
/

HostClassifier

Running

File size: 47,143 Bytes

5263bd3
 
f5ea8d6
5263bd3
f1d4be6
5263bd3
4a7c026
bcf9134
 
9a5c352
 
dbad921
50ef7f7
 
 
dbad921
 
 
 
7b2a54f
a6886ca
962ae70
de0719b
962ae70
 
5263bd3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
de0719b
5263bd3
 
b5edb58
962ae70
de0719b
962ae70
 
de0719b
870813f
 
 
f1d4be6
870813f
d01c414
 
870813f
 
 
 
 
 
de0719b
870813f
 
 
 
a6886ca
18efb8a
 
 
a6886ca
 
 
 
 
de0719b
a6886ca
 
 
77621ec
de0719b
6be7ede
962ae70
de0719b
962ae70
 
de0719b
18efb8a
 
 
 
f1d4be6
 
ef80028
7e92f7c
18efb8a
ef80028
7e92f7c
ef80028
de0719b
962ae70
7e92f7c
 
455bf4d
77621ec
 
ef80028
a6886ca
de0719b
 
 
 
 
18efb8a
 
 
962ae70
 
 
 
 
 
 
 
de0719b
77621ec
 
962ae70
 
 
 
de0719b
 
 
 
 
18efb8a
 
 
de0719b
d01c414
 
de0719b
77621ec
de0719b
 
 
 
 
 
 
 
77621ec
d01c414
 
77621ec
d01c414
 
de0719b
552aec4
 
de0719b
d76e76a
 
de0719b
18efb8a
 
 
de0719b
 
 
 
 
 
 
 
18efb8a
 
 
77621ec
 
de0719b
 
18efb8a
 
 
de0719b
 
 
 
 
 
 
 
 
 
 
 
77621ec
 
 
 
 
 
de0719b
2e254a9
 
77621ec
962ae70
 
de0719b
18efb8a
 
 
de0719b
 
 
 
 
 
 
 
 
 
 
 
6d0235b
 
d01c414
18efb8a
 
 
de0719b
d01c414
de0719b
 
 
 
 
 
f1d4be6
 
de0719b
18efb8a
 
 
d01c414
18efb8a
de0719b
 
 
962ae70
f5ea8d6
962ae70
 
f5ea8d6
18efb8a
 
 
 
 
f5ea8d6
 
 
 
 
 
 
d01c414
f5ea8d6
d01c414
f5ea8d6
18efb8a
f5ea8d6
 
d01c414
f5ea8d6
 
18efb8a
f5ea8d6
82425ee
d01c414
f5ea8d6
 
 
82425ee
d01c414
f5ea8d6
 
 
 
 
 
 
 
 
 
18efb8a
f5ea8d6
77621ec
 
f5ea8d6
18efb8a
f5ea8d6
 
 
 
 
 
 
 
 
 
 
 
 
18efb8a
f5ea8d6
 
 
 
 
 
 
18efb8a
77621ec
f5ea8d6
18efb8a
d01c414
455bf4d
 
f5ea8d6
455bf4d
56468ea
de0719b
18efb8a
 
 
de0719b
d01c414
de0719b
 
 
 
 
 
 
d01c414
18efb8a
de0719b
 
18efb8a
de0719b
 
 
 
18efb8a
de0719b
 
 
 
 
 
18efb8a
de0719b
 
 
 
 
 
 
 
 
18efb8a
77621ec
de0719b
18efb8a
de0719b
 
18efb8a
 
d01c414
de0719b
77621ec
18efb8a
77621ec
 
e502db5
18efb8a
 
 
e502db5
 
1869cbd
37ce441
18efb8a
37ce441
e502db5
1869cbd
8ef755b
1869cbd
8ef755b
1869cbd
1b8562c
1869cbd
e502db5
1869cbd
 
e502db5
1869cbd
1b8562c
1869cbd
 
 
8ef755b
 
1869cbd
 
 
e502db5
 
88b80ae
 
 
18efb8a
88b80ae
 
 
 
 
 
 
 
 
 
 
6c4adfb
87c2305
18efb8a
87c2305
6c4adfb
 
88b80ae
 
87c2305
18efb8a
6c4adfb
87c2305
6c4adfb
88b80ae
6c4adfb
88b80ae
6c4adfb
 
 
 
 
 
 
 
 
 
 
 
87c2305
6c4adfb
 
77621ec
6c4adfb
87c2305
18efb8a
87c2305
6c4adfb
 
88b80ae
6c4adfb
 
 
 
87c2305
6c4adfb
 
 
 
87c2305
6c4adfb
 
88b80ae
 
6c4adfb
 
 
88b80ae
6c4adfb
18efb8a
6c4adfb
 
 
 
 
 
 
 
 
 
 
 
 
 
77621ec
 
87c2305
18efb8a
 
87c2305
5a41c75
88b80ae
 
 
d01c414
88b80ae
 
 
 
d01c414
88b80ae
 
 
 
 
 
 
18efb8a
 
88b80ae
 
 
18efb8a
88b80ae
 
 
 
 
 
 
 
 
 
 
 
18efb8a
88b80ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d01c414
88b80ae
 
 
d01c414
2fd86ff
 
18efb8a
2fd86ff
 
18efb8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c54683
dbad921
18efb8a
2fd86ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbad921
18efb8a
2fd86ff
 
 
 
 
 
 
 
 
dbad921
18efb8a
dbad921
 
 
 
 
 
 
 
 
 
 
 
18efb8a
dbad921
18efb8a
 
 
 
 
 
dbad921
 
1b5b7bf
 
18efb8a
 
1b5b7bf
7d672a0
1b5b7bf
 
7d672a0
 
18efb8a
ae32958
7d672a0
1b5b7bf
7d672a0
18efb8a
1b5b7bf
 
 
7d672a0
 
1b5b7bf
 
 
 
ae32958
7d672a0
 
 
 
 
1b5b7bf
18efb8a
1b5b7bf
7d672a0
1b5b7bf
 
 
 
18efb8a
7d672a0
18efb8a
7d672a0
 
1b5b7bf
 
 
 
 
 
7d672a0
 
 
 
 
1b5b7bf
18efb8a
7d672a0
 
18efb8a
7d672a0
18efb8a
1b5b7bf
7d672a0
1b5b7bf
 
 
 
 
7d672a0
 
1b5b7bf
7d672a0
 
 
 
1b5b7bf
7d672a0
 
 
1b5b7bf
7d672a0
 
 
 
 
 
1b5b7bf
18efb8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b5b7bf
18efb8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7d672a0
 
0c54683
d01c414
 
18efb8a
 
 
 
 
 
 
 
0c54683
 
 
 
18efb8a
0c54683
18efb8a
 
 
0c54683
 
 
 
 
 
 
 
18efb8a
0c54683
 
18efb8a
 
0c54683
18efb8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c54683
 
18efb8a
 
0c54683
 
 
18efb8a
 
 
0c54683
18efb8a
0c54683
 
 
 
 
 
18efb8a
 
0c54683
18efb8a
 
 
 
 
0c54683
18efb8a
 
 
 
0c54683
 
 
 
 
 
 
 
 
18efb8a
 
0c54683
18efb8a
 
 
 
0c54683
 
 
 
 
18efb8a
0c54683
ae32958
2fd86ff
 
 
 
 
18efb8a
 
 
2fd86ff
 
 
 
 
 
 
 
 
 
 
 
88b80ae
de0719b
2fd86ff
de0719b
 
 
 
 
 
2fd86ff
 
 
de0719b
 
 
 
18efb8a
 
 
 
 
 
 
de0719b
 
 
 
 
77621ec
18efb8a
77621ec
18efb8a
de0719b
 
77621ec
de0719b
 
2fd86ff
de0719b
 
d01c414
de0719b
 
f5ea8d6
2fd86ff
de0719b
77621ec
de0719b
56468ea
de0719b
18efb8a
56468ea
de0719b
 
 
 
77621ec
de0719b
 
 
18efb8a
d01c414
de0719b
 
 
2fd86ff
 
 
 
 
 
18efb8a
 
 
2fd86ff
 
 
18efb8a
 
2fd86ff
18efb8a
 
 
2fd86ff
 
18efb8a
dbad921
2fd86ff
 
 
18efb8a
2fd86ff
de0719b
82425ee
2fd86ff
77621ec
 
18efb8a
 
77621ec
 
 
18efb8a
 
77621ec
18efb8a
 
77621ec
 
 
 
 
18efb8a
d01c414
77621ec
 
 
2fd86ff
77621ec
 
de0719b
18efb8a
 
 
 
 
de0719b
18efb8a
723da6d
d01c414

import gradio as gr
import torch
import joblib
import numpy as np
from itertools import product
import torch.nn as nn
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors 
from matplotlib.colors import LinearSegmentedColormap
import io 
from io import BytesIO # Import io then BytesIO 
from PIL import Image, ImageDraw, ImageFont
from Bio.Graphics import GenomeDiagram
from Bio.SeqFeature import SeqFeature, FeatureLocation
from reportlab.lib import colors
import pandas as pd
import tempfile
import os
from typing import List, Dict, Tuple, Optional, Any
import seaborn as sns

###############################################################################
# 1. MODEL DEFINITION
###############################################################################

class VirusClassifier(nn.Module):
    """
    Small fully connected classifier that maps a feature vector to two logits.

    The layers are held in one ``nn.Sequential`` stored as ``self.network``:
    Linear -> GELU -> BatchNorm1d -> Dropout(0.3), twice (64 then 32 units),
    followed by Linear(32, 32) -> GELU -> Linear(32, 2).
    """

    def __init__(self, input_shape: int):
        """Build the layer stack for feature vectors of length ``input_shape``."""
        super().__init__()
        # The position of each layer inside the Sequential determines its
        # parameter names (network.0.*, network.4.*, ...), so the order here
        # must stay fixed for saved weights to load.
        layer_stack = [
            nn.Linear(input_shape, 64),
            nn.GELU(),
            nn.BatchNorm1d(64),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.GELU(),
            nn.BatchNorm1d(32),
            nn.Dropout(0.3),
            nn.Linear(32, 32),
            nn.GELU(),
            nn.Linear(32, 2),
        ]
        self.network = nn.Sequential(*layer_stack)

    def forward(self, x):
        """Return the raw (pre-softmax) two-class logits for ``x``."""
        logits = self.network(x)
        return logits

###############################################################################
# 2. FASTA PARSING & K-MER FEATURE ENGINEERING
###############################################################################

def parse_fasta(text):
    """
    Parse FASTA-formatted text into a list of ``(header, sequence)`` tuples.

    Blank lines are ignored, surrounding whitespace (including ``\\r``) is
    stripped from every line, and sequence lines are upper-cased and
    concatenated. Sequence lines that appear before the first ``>`` header
    are discarded. A record whose header line is just ``>`` is kept with an
    empty-string header.

    Returns an empty list when the text contains no header line.
    """
    records = []
    header = None
    chunks = []
    for raw_line in text.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('>'):
            # Compare against None rather than truthiness: an empty header
            # ("" after dropping '>') is still a real record to flush.
            if header is not None:
                records.append((header, ''.join(chunks)))
            header = line[1:]
            chunks = []
        else:
            chunks.append(line.upper())
    if header is not None:
        records.append((header, ''.join(chunks)))
    return records

def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
    """
    Build a normalized k-mer frequency vector for ``sequence``.

    The vector has one float32 slot per k-mer over the alphabet "ACGT"
    (4**k slots, in ``itertools.product`` order). Windows containing any
    other character are not counted, but they still contribute to the
    denominator, which is the total number of length-k windows.
    Sequences shorter than ``k`` yield an all-zero vector.
    """
    all_kmers = [''.join(combo) for combo in product("ACGT", repeat=k)]
    slot_of = dict(zip(all_kmers, range(len(all_kmers))))
    counts = np.zeros(len(all_kmers), dtype=np.float32)

    window_count = len(sequence) - k + 1
    for offset in range(window_count):
        slot = slot_of.get(sequence[offset:offset + k])
        if slot is not None:
            counts[slot] += 1

    if window_count > 0:
        counts /= window_count
    return counts

###############################################################################
# 3. SHAP-VALUE (ABLATION) CALCULATION
###############################################################################

def calculate_shap_values(model, x_tensor):
    """
    Ablation-style approximation of per-feature SHAP values.

    The model is put in eval mode and run without gradients. Each feature of
    row 0 of ``x_tensor`` is set to zero in turn (on a clone, so the input is
    left untouched) and the change in the softmax probability of class
    index 1 (the 'human' class) is recorded as ``baseline - ablated``.

    Returns ``(values, baseline_prob)`` where ``values`` is a NumPy array
    with one entry per feature column.
    """
    model.eval()

    def human_probability(batch):
        # Softmax over the class dimension; column 1 is the 'human' class.
        return torch.softmax(model(batch), dim=1)[0, 1].item()

    contributions = []
    with torch.no_grad():
        baseline_prob = human_probability(x_tensor)
        scratch = x_tensor.clone()
        for feature_idx in range(x_tensor.shape[1]):
            saved = scratch[0, feature_idx].item()
            scratch[0, feature_idx] = 0.0
            contributions.append(baseline_prob - human_probability(scratch))
            # Restore the feature so only one is ablated at a time.
            scratch[0, feature_idx] = saved
    return np.array(contributions), baseline_prob

###############################################################################
# 4. PER-BASE SHAP AGGREGATION
###############################################################################

def compute_positionwise_scores(sequence, shap_values, k=4):
    """
    Project k-mer level SHAP values back onto individual sequence positions.

    Every length-k window whose k-mer consists only of A/C/G/T adds its SHAP
    value to each of the k positions it covers. The result is, per position,
    the mean over all windows that covered it; positions covered by no
    recognized k-mer get 0.0.
    """
    slot_of = {
        ''.join(combo): idx
        for idx, combo in enumerate(product("ACGT", repeat=k))
    }
    n_bases = len(sequence)
    totals = np.zeros(n_bases, dtype=np.float32)
    hits = np.zeros(n_bases, dtype=np.float32)

    for left in range(n_bases - k + 1):
        right = left + k
        slot = slot_of.get(sequence[left:right])
        if slot is None:
            continue
        totals[left:right] += shap_values[slot]
        hits[left:right] += 1

    # Uncovered positions would divide by zero; np.where replaces them with 0.0.
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.where(hits > 0, totals / hits, 0.0)

###############################################################################
# 5. FIND EXTREME SHAP REGIONS
###############################################################################

def find_extreme_subregion(shap_means, window_size=500, mode="max"):
    """
    Find the fixed-width window with the highest or lowest mean SHAP value.

    Parameters:
        shap_means: 1-D sequence of per-position SHAP values.
        window_size: window width in positions. Coerced to ``int`` and
            clamped to at least 1, so float or non-positive values do not
            break indexing or divide by zero.
        mode: "max" selects the highest-mean window, "min" the lowest. Any
            other value returns the first window, as before.

    Returns:
        ``(start, end, average)`` with ``end`` exclusive. An empty input
        gives ``(0, 0, 0.0)``; a window at least as long as the input gives
        the whole range and its mean. Ties resolve to the earliest window.
    """
    n = len(shap_means)
    if n == 0:
        return (0, 0, 0.0)
    window_size = max(1, int(window_size))
    if window_size >= n:
        return (0, n, float(np.mean(shap_means)))

    # Accumulate in float64: a float32 prefix sum loses low-order bits as it
    # grows, and window sums are differences of two large prefix values.
    csum = np.zeros(n + 1, dtype=np.float64)
    csum[1:] = np.cumsum(shap_means, dtype=np.float64)
    window_avgs = (csum[window_size:] - csum[:-window_size]) / window_size

    if mode == "max":
        best_start = int(np.argmax(window_avgs))
    elif mode == "min":
        best_start = int(np.argmin(window_avgs))
    else:
        best_start = 0
    return (best_start, best_start + window_size, float(window_avgs[best_start]))

###############################################################################
# 6. PLOTTING / UTILITIES
###############################################################################

def fig_to_image(fig):
    """
    Render a Matplotlib figure to a PIL Image and close the figure.

    The figure is saved as a 150-dpi PNG with a tight bounding box into an
    in-memory buffer, which is then opened with PIL. The figure is closed
    whether or not rendering succeeds, so a failed ``savefig`` does not
    leave the figure registered with pyplot.
    """
    buf = io.BytesIO()
    try:
        fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
        buf.seek(0)
        # The buffer is deliberately left open: the image is opened from it.
        return Image.open(buf)
    finally:
        plt.close(fig)

def get_zero_centered_cmap():
    """
    Build a blue -> white -> red colormap with white at the midpoint.

    Intended to be paired with symmetric ``vmin``/``vmax`` so that zero maps
    to white.
    """
    anchor_points = [(0.0, 'blue'), (0.5, 'white'), (1.0, 'red')]
    cmap = mcolors.LinearSegmentedColormap.from_list("blue_white_red", anchor_points)
    return cmap

def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, end=None):
    """
    Draw a one-row heatmap of per-position SHAP values.

    When both ``start`` and ``end`` are given, only ``shap_means[start:end]``
    is plotted and the range is appended to the title; otherwise the whole
    array is shown. The color scale is symmetric around zero. An empty
    selection is drawn as a single zero-valued cell.

    Returns the Matplotlib figure (not closed).
    """
    windowed = start is not None and end is not None
    region = shap_means[start:end] if windowed else shap_means
    suffix = f" (positions {start}-{end})" if windowed else ""

    if len(region) == 0:
        region = np.array([0.0])

    strip = region.reshape(1, -1)
    lowest, highest = np.min(region), np.max(region)
    # Symmetric limits keep zero at the white center of the colormap.
    bound = max(abs(lowest), abs(highest))
    zero_centered = get_zero_centered_cmap()

    fig, ax = plt.subplots(figsize=(12, 1.8))
    image = ax.imshow(strip, aspect='auto', cmap=zero_centered, vmin=-bound, vmax=bound)

    colorbar = plt.colorbar(image, orientation='horizontal', pad=0.25, aspect=40, shrink=0.8)
    colorbar.ax.tick_params(labelsize=8)
    colorbar.set_label('SHAP Contribution', fontsize=9, labelpad=5)

    ax.set_yticks([])
    ax.set_xlabel('Position in Sequence', fontsize=10)
    ax.set_title(f"{title}{suffix}", pad=10)
    plt.subplots_adjust(bottom=0.25, left=0.05, right=0.95)
    return fig

def create_importance_bar_plot(shap_values, kmers, top_k=10):
    """
    Horizontal bar chart of the ``top_k`` k-mers with the largest |SHAP|.

    Negative values are drawn in light blue, all others in light red.
    Note that this updates the global Matplotlib font size to 10.

    Returns the Matplotlib figure (not closed).
    """
    plt.rcParams.update({'font.size': 10})
    fig = plt.figure(figsize=(10, 5))

    # argsort is ascending, so the strongest k-mers are the last top_k entries.
    ranked = np.argsort(np.abs(shap_values))
    chosen = ranked[-top_k:]
    bar_values = shap_values[chosen]
    labels = [kmers[idx] for idx in chosen]

    bar_colors = []
    for value in bar_values:
        bar_colors.append('#99ccff' if value < 0 else '#ff9999')

    rows = range(len(bar_values))
    plt.barh(rows, bar_values, color=bar_colors)
    plt.yticks(rows, labels)
    plt.xlabel('SHAP Value (impact on model output)')
    plt.title(f'Top {top_k} Most Influential k-mers')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    return fig

def plot_shap_histogram(shap_array, title="SHAP Distribution in Region", num_bins=30):
    """
    Histogram of SHAP values with a dashed red reference line at zero.

    NOTE: this module defines ``plot_shap_histogram`` a second time further
    down; once the module has been imported, that later definition is the
    one bound to the name.

    Returns the Matplotlib figure (not closed).
    """
    fig, axis = plt.subplots(figsize=(6, 4))
    axis.hist(shap_array, bins=num_bins, color='gray', edgecolor='black')
    axis.axvline(0, color='red', linestyle='--', label='0.0')
    axis.set(xlabel="SHAP Value", ylabel="Count", title=title)
    axis.legend()
    plt.tight_layout()
    return fig

def compute_gc_content(sequence):
    """
    Compute GC content (%) for a given sequence.

    Counting is case-insensitive, so soft-masked (lower-case) bases are
    included. Returns 0.0 for an empty or missing sequence. The denominator
    is the full sequence length, including any non-ACGT characters.
    """
    if not sequence:
        return 0.0
    normalized = sequence.upper()
    gc_count = normalized.count('G') + normalized.count('C')
    return (gc_count / len(sequence)) * 100.0

###############################################################################
# 7. MAIN ANALYSIS STEP (Gradio Step 1)
###############################################################################

def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
    """
    Classify one FASTA sequence and explain the prediction.

    Input comes from ``fasta_text`` when it is non-blank, otherwise from the
    file path in ``file_obj``. Only the first record in the FASTA is used.
    The function loads 'model.pt' and 'scaler.pkl' from the working
    directory, computes ablation-based SHAP values on the scaled 4-mer
    vector, spreads them over sequence positions, and locates the most
    human-pushing and most non-human-pushing windows of ``window_size``.

    Returns a 6-tuple:
        (results_text, bar_image, heatmap_image, state, header, None)
    where ``state`` is ``{"seq": ..., "shap_means": ...}``. On any failure
    the first element is a message and the remaining five are None.
    """
    def failure(message):
        # Every early exit uses the same six-slot shape as the success tuple.
        return (message, None, None, None, None, None)

    # 1) Read input: pasted text takes precedence over an uploaded file
    pasted = fasta_text.strip()
    if pasted:
        text = pasted
    elif file_obj is None:
        return failure("Please provide a FASTA sequence.")
    else:
        try:
            with open(file_obj, 'r') as handle:
                text = handle.read()
        except Exception as e:
            return failure(f"Error reading file: {str(e)}")

    # 2) Parse FASTA and keep the first record
    records = parse_fasta(text)
    if not records:
        return failure("No valid FASTA sequences found.")
    header, seq = records[0]

    # 3) Load model and scaler, then run inference
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    try:
        weights = torch.load('model.pt', map_location=device)
        model = VirusClassifier(256).to(device)
        model.load_state_dict(weights)
        scaler = joblib.load('scaler.pkl')
    except Exception as e:
        return failure(f"Error loading model/scaler: {str(e)}")

    kmer_row = sequence_to_kmer_vector(seq).reshape(1, -1)
    x_tensor = torch.FloatTensor(scaler.transform(kmer_row)).to(device)

    shap_values, prob_human = calculate_shap_values(model, x_tensor)
    prob_nonhuman = 1.0 - prob_human
    if prob_human > 0.5:
        classification = "Human"
    else:
        classification = "Non-human"
    confidence = max(prob_human, prob_nonhuman)

    # 4) Per-base SHAP and extreme windows
    shap_means = compute_positionwise_scores(seq, shap_values, k=4)
    max_start, max_end, max_avg = find_extreme_subregion(shap_means, window_size, mode="max")
    min_start, min_end, min_avg = find_extreme_subregion(shap_means, window_size, mode="min")

    # 5) Text report
    results_text = "".join([
        f"Sequence: {header}\n",
        f"Length: {len(seq):,} bases\n",
        f"Classification: {classification}\n",
        f"Confidence: {confidence:.3f}\n",
        f"(Human Probability: {prob_human:.3f}, Non-human Probability: {prob_nonhuman:.3f})\n\n",
        f"---\n",
        f"**Most Human-Pushing {window_size}-bp Subregion**:\n",
        f"Start: {max_start}, End: {max_end}, Avg SHAP: {max_avg:.4f}\n\n",
        f"**Most Non-Human–Pushing {window_size}-bp Subregion**:\n",
        f"Start: {min_start}, End: {min_end}, Avg SHAP: {min_avg:.4f}",
    ])

    # 6) Figures: top k-mers bar chart and genome-wide heatmap
    kmer_labels = [''.join(combo) for combo in product("ACGT", repeat=4)]
    bar_img = fig_to_image(create_importance_bar_plot(shap_values, kmer_labels, top_kmers))
    heatmap_img = fig_to_image(plot_linear_heatmap(shap_means, title="Genome-wide SHAP"))

    # 7) State handed to the subregion step
    state_dict_out = {"seq": seq, "shap_means": shap_means}

    # Six items, matching the Gradio outputs
    return (results_text, bar_img, heatmap_img, state_dict_out, header, None)

###############################################################################
# 8. SUBREGION ANALYSIS (Gradio Step 2)
###############################################################################

def analyze_subregion(state, header, region_start, region_end):
    """
    Summarize SHAP values and GC content for one slice of the sequence.

    ``state`` must hold the "seq" and "shap_means" entries produced by the
    main analysis step. Start and end are converted to int and clamped to
    ``[0, len(seq)]``; the end is exclusive.

    Returns a 4-tuple ``(info_text, heatmap_image, histogram_image, None)``.
    When the state is missing or the clamped range is empty, the first
    element is a message and the other three are None.
    """
    if not (state and "seq" in state and "shap_means" in state):
        return ("No sequence data found. Please run Step 1 first.", None, None, None)

    seq, shap_means = state["seq"], state["shap_means"]
    upper_bound = len(seq)
    region_start = max(0, min(int(region_start), upper_bound))
    region_end = max(0, min(int(region_end), upper_bound))
    if region_start >= region_end:
        return ("Invalid region range. End must be > Start.", None, None, None)

    region_seq = seq[region_start:region_end]
    region_shap = shap_means[region_start:region_end]

    gc_percent = compute_gc_content(region_seq)
    avg_shap = float(np.mean(region_shap))
    positive_fraction = np.mean(region_shap > 0)
    negative_fraction = np.mean(region_shap < 0)

    # Mean SHAP within +/-0.05 is reported as neutral.
    region_classification = (
        "Likely pushing toward human" if avg_shap > 0.05
        else "Likely pushing toward non-human" if avg_shap < -0.05
        else "Near neutral (no strong push)"
    )

    region_info = "".join([
        f"Analyzing subregion of {header} from {region_start} to {region_end}\n",
        f"Region length: {len(region_seq)} bases\n",
        f"GC content: {gc_percent:.2f}%\n",
        f"Average SHAP in region: {avg_shap:.4f}\n",
        f"Fraction with SHAP > 0 (toward human): {positive_fraction:.2f}\n",
        f"Fraction with SHAP < 0 (toward non-human): {negative_fraction:.2f}\n",
        f"Subregion interpretation: {region_classification}\n",
    ])

    heatmap_img = fig_to_image(
        plot_linear_heatmap(shap_means, title="Subregion SHAP", start=region_start, end=region_end)
    )
    hist_img = fig_to_image(
        plot_shap_histogram(region_shap, title="SHAP Distribution in Subregion")
    )

    # Four items, matching the Gradio outputs
    return (region_info, heatmap_img, hist_img, None)

###############################################################################
# 9. COMPARISON ANALYSIS FUNCTIONS (Step 4)
###############################################################################

def compute_shap_difference(shap1_norm, shap2_norm):
    """
    Return the element-wise difference ``shap2_norm - shap1_norm``.

    Positive entries mean sequence 2 has the larger value at that point.
    """
    delta = shap2_norm - shap1_norm
    return delta

def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
    """
    Draw a one-row heatmap of SHAP differences on a 0-100% position axis.

    The color scale is symmetric around zero and five evenly spaced ticks
    label the x-axis as percentages of the sequence length.

    Returns the Matplotlib figure (not closed).
    """
    strip = shap_diff.reshape(1, -1)
    bound = max(abs(np.min(shap_diff)), abs(np.max(shap_diff)))

    fig, ax = plt.subplots(figsize=(12, 1.8))
    zero_centered = get_zero_centered_cmap()
    image = ax.imshow(strip, aspect='auto', cmap=zero_centered, vmin=-bound, vmax=bound)

    # Relative-position ticks: 0%, 25%, 50%, 75%, 100%
    num_ticks = 5
    last_index = shap_diff.shape[0] - 1
    ax.set_xticks(np.linspace(0, last_index, num_ticks))
    ax.set_xticklabels([f"{int(x*100)}%" for x in np.linspace(0, 1, num_ticks)])

    colorbar = plt.colorbar(image, orientation='horizontal', pad=0.25, aspect=40, shrink=0.8)
    colorbar.ax.tick_params(labelsize=8)
    colorbar.set_label('SHAP Difference (Seq2 - Seq1)', fontsize=9, labelpad=5)

    ax.set_yticks([])
    ax.set_xlabel('Relative Position in Sequence', fontsize=10)
    ax.set_title(title, pad=10)
    plt.subplots_adjust(bottom=0.25, left=0.05, right=0.95)

    return fig

def plot_shap_histogram(shap_array, title="SHAP Distribution", num_bins=30):
    """
    Histogram of SHAP values (semi-transparent gray bars) with a dashed red
    reference line at zero.

    ``num_bins`` is passed straight to ``Axes.hist`` as ``bins``.
    Returns the Matplotlib figure (not closed).
    """
    fig, axis = plt.subplots(figsize=(6, 4))
    axis.hist(shap_array, bins=num_bins, color='gray', edgecolor='black', alpha=0.7)
    axis.axvline(0, color='red', linestyle='--', label='0.0')
    axis.set(xlabel="SHAP Value", ylabel="Count", title=title)
    axis.legend()
    plt.tight_layout()
    return fig

def calculate_adaptive_parameters(len1, len2):
    """
    Choose smoothing & interpolation parameters automatically based on length difference.

    Parameters:
        len1, len2: lengths of the two tracks being compared.

    Returns:
        ``(num_points, smooth_window, resolution_factor)`` where
        ``num_points`` is the number of interpolation points,
        ``smooth_window`` is the smoothing window width (widened as the
        lengths become more unequal), and ``resolution_factor`` is the
        multiplier associated with the selected tier.

    Two zero-length inputs are treated as a length ratio of 1.0 instead of
    raising ZeroDivisionError.
    """
    length_diff = abs(len1 - len2)
    max_length = max(len1, len2)
    min_length = min(len1, len2)
    # Guard the division: both lengths being 0 means "identical lengths".
    length_ratio = min_length / max_length if max_length > 0 else 1.0

    # Base number of points
    base_points = min(2000, max(500, max_length // 100))

    # Larger length differences get fewer points and a wider window.
    if length_diff < 500:
        resolution_factor = 2.0
        num_points = min(3000, base_points * 2)
        smooth_window = max(10, length_diff // 50)
    elif length_diff < 5000:
        resolution_factor = 1.5
        num_points = min(2000, base_points * 1.5)
        smooth_window = max(20, length_diff // 100)
    elif length_diff < 50000:
        resolution_factor = 1.0
        num_points = base_points
        smooth_window = max(50, length_diff // 200)
    else:
        resolution_factor = 0.75
        num_points = max(500, base_points // 2)
        smooth_window = max(100, length_diff // 500)

    # Scale the window by up to 2x as the shorter/longer ratio approaches 0.
    smooth_window = int(smooth_window * (1 + (1 - length_ratio)))
    return int(num_points), int(smooth_window), resolution_factor

def sliding_window_smooth(values, window_size=50):
    """
    Smooth ``values`` with a normalized, exponentially weighted window.

    The window weights are 1.0 at both outer ends and fall to ``exp(-3)``
    toward the middle (for odd sizes the single center weight stays 1.0);
    the weights are then normalized to sum to 1. The 'valid' part of the
    convolution fills the interior of the result and the unsmoothed input
    values are copied into the margins, so the output has the same length
    as the input.

    ``values`` is returned unchanged when ``window_size`` is below 3 or
    larger than the input, since there is no valid interior to smooth.
    """
    # NOTE(review): weights peak at the window edges rather than the center;
    # confirm this shape is intended before changing it.
    if window_size < 3 or window_size > len(values):
        return values

    half = window_size // 2
    window = np.ones(window_size)
    decay = np.exp(-np.linspace(0, 3, half))
    window[:half] = decay
    window[-half:] = decay[::-1]
    window = window / window.sum()

    smoothed = np.convolve(values, window, mode='valid')
    pad_size = len(values) - len(smoothed)
    pad_left = pad_size // 2
    pad_right = pad_size - pad_left

    # With 3 <= window_size <= len(values), both pads are at least 1, so the
    # negative-index slices below are well defined.
    result = np.zeros_like(values)
    result[pad_left:-pad_right] = smoothed
    result[:pad_left] = values[:pad_left]
    result[-pad_right:] = values[-pad_right:]

    return result

def normalize_shap_lengths(shap1, shap2):
    """
    Bring two SHAP tracks onto a common relative-position grid.

    Each track is smoothed with the adaptively chosen window and then
    linearly interpolated onto ``num_points`` evenly spaced positions in
    [0, 1].

    Returns ``(track1, track2, smooth_window)``.
    """
    num_points, smooth_window, _ = calculate_adaptive_parameters(len(shap1), len(shap2))
    shared_axis = np.linspace(0, 1, num_points)

    def resample(track):
        # Smooth first, then map the track's own 0-1 axis onto the shared one.
        smoothed = sliding_window_smooth(track, smooth_window)
        own_axis = np.linspace(0, 1, len(smoothed))
        return np.interp(shared_axis, own_axis, smoothed)

    shap1_interp = resample(shap1)
    shap2_interp = resample(shap2)
    return shap1_interp, shap2_interp, smooth_window

def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
    """
    Compare two sequences using the single-sequence analysis pipeline
    and produce difference visualizations & stats.

    Each input is analyzed with ``analyze_sequence`` (top 10 k-mers, 500-bp
    window). The per-base SHAP tracks are smoothed and resampled to a common
    length, and the difference (Seq2 - Seq1) is summarized and plotted.

    Returns a 4-tuple ``(text, heatmap_image, histogram_image, None)``. On
    failure the first element is an error message and the rest are None.
    """
    def failed(message):
        return (message, None, None, None)

    def classification_from(report):
        # Read the dedicated "Classification: " line instead of splitting the
        # whole report, so header text cannot be mistaken for it.
        marker = 'Classification: '
        for line in report.split('\n'):
            if line.startswith(marker):
                return line[len(marker):].strip()
        return "Unknown"

    try:
        # analyze_sequence signals failure by returning None in the state
        # slot (index 3). Testing that slot is reliable; searching the text
        # for "Error" misses some failure messages and misfires on headers
        # that happen to contain the word.
        res1 = analyze_sequence(file1, top_kmers=10, fasta_text=fasta1, window_size=500)
        if res1[3] is None:
            return failed(f"Error in sequence 1: {res1[0]}")

        res2 = analyze_sequence(file2, top_kmers=10, fasta_text=fasta2, window_size=500)
        if res2[3] is None:
            return failed(f"Error in sequence 2: {res2[0]}")

        shap1 = res1[3]["shap_means"]
        shap2 = res2[3]["shap_means"]

        len1, len2 = len(shap1), len(shap2)
        if len1 == 0 or len2 == 0:
            # Nothing to resample or compare; also avoids dividing by zero below.
            return failed(
                "Error during sequence comparison: both sequences must contain at least one base"
            )
        length_diff = abs(len1 - len2)
        length_ratio = min(len1, len2) / max(len1, len2)

        # Normalize both to the same length
        shap1_norm, shap2_norm, smooth_window = normalize_shap_lengths(shap1, shap2)
        shap_diff = compute_shap_difference(shap1_norm, shap2_norm)

        # Compute stats; the threshold grows with the length mismatch
        base_threshold = 0.05
        adaptive_threshold = base_threshold * (1 + (1 - length_ratio))
        if length_diff > 50000:
            adaptive_threshold *= 1.5

        avg_diff = np.mean(shap_diff)
        std_diff = np.std(shap_diff)
        max_diff = np.max(shap_diff)
        min_diff = np.min(shap_diff)
        substantial_diffs = np.abs(shap_diff) > adaptive_threshold
        frac_different = np.mean(substantial_diffs)

        classification1 = classification_from(res1[0])
        classification2 = classification_from(res2[0])

        comparison_text = (
            "Sequence Comparison Results:\n"
            f"Sequence 1: {res1[4]}\n"
            f"Length: {len1:,} bases\n"
            f"Classification: {classification1}\n\n"
            f"Sequence 2: {res2[4]}\n"
            f"Length: {len2:,} bases\n"
            f"Classification: {classification2}\n\n"
            "Comparison Parameters:\n"
            f"Length Difference: {length_diff:,} bases\n"
            f"Length Ratio: {length_ratio:.3f}\n"
            f"Smoothing Window: {smooth_window} points\n"
            f"Adaptive Threshold: {adaptive_threshold:.3f}\n\n"
            "Statistics:\n"
            f"Average SHAP difference: {avg_diff:.4f}\n"
            f"Standard deviation: {std_diff:.4f}\n"
            f"Max difference: {max_diff:.4f} (Seq2 more human-like)\n"
            f"Min difference: {min_diff:.4f} (Seq1 more human-like)\n"
            f"Fraction with substantial differences: {frac_different:.2%}\n\n"
            "Note: All parameters automatically adjusted based on sequence properties\n\n"
            "Interpretation:\n"
            "- Red regions: Sequence 2 more human-like\n"
            "- Blue regions: Sequence 1 more human-like\n"
            "- White regions: Similar between sequences"
        )

        heatmap_fig = plot_comparative_heatmap(
            shap_diff,
            title=f"SHAP Difference Heatmap (window: {smooth_window})"
        )
        heatmap_img = fig_to_image(heatmap_fig)

        # Roughly sqrt(n) bins, kept between 20 and 50
        num_bins = max(20, min(50, int(np.sqrt(len(shap_diff)))))
        hist_fig = plot_shap_histogram(
            shap_diff,
            title="Distribution of SHAP Differences",
            num_bins=num_bins
        )
        hist_img = fig_to_image(hist_fig)

        return (comparison_text, heatmap_img, hist_img, None)

    except Exception as e:
        # Top-level UI boundary: report the failure instead of raising.
        error_msg = f"Error during sequence comparison: {str(e)}"
        return failed(error_msg)

###############################################################################
# 10. ADDITIONAL / ADVANCED VISUALIZATIONS & STATISTICS
###############################################################################

def n50_length(sequence):
    """
    Return the N50 of a single contiguous sequence.

    With only one piece there is nothing to sort or accumulate, so the N50
    is simply the length of that piece. A multi-contig N50 would need the
    sorted contig lengths and their cumulative sum instead.
    """
    single_piece_length = len(sequence)
    return single_piece_length

def sequence_complexity(sequence):
    """
    Shannon entropy (in bits) of the symbol distribution in ``sequence``.

    Every distinct character counts as its own symbol. Returns 0.0 for an
    empty sequence.
    """
    from math import log2

    total = len(sequence)
    if total == 0:
        return 0.0

    # Tally symbols in first-seen order.
    tally = dict.fromkeys(sequence, 0)
    for symbol in sequence:
        tally[symbol] += 1

    entropy = 0.0
    for occurrences in tally.values():
        share = occurrences / total
        entropy -= share * log2(share)
    return entropy

def advanced_gene_statistics(gene_shap: np.ndarray, gene_seq: str) -> Dict[str, float]:
    """
    Additional per-gene statistics.

    Returns a dict with:
        'n50'      - length of ``gene_seq`` (trivial for a single region)
        'entropy'  - Shannon entropy of ``gene_seq`` via ``sequence_complexity``
        'avg_shap', 'max_shap', 'min_shap' - summary of ``gene_shap``

    All three SHAP summaries are 0.0 when ``gene_shap`` is empty, so the
    mean no longer comes back as NaN (with a NumPy warning) in that case.
    """
    has_values = len(gene_shap) > 0
    stats = {}
    stats['n50'] = len(gene_seq)  # trivial for a single gene region
    stats['entropy'] = sequence_complexity(gene_seq)
    stats['avg_shap'] = float(np.mean(gene_shap)) if has_values else 0.0
    stats['max_shap'] = float(np.max(gene_shap)) if has_values else 0.0
    stats['min_shap'] = float(np.min(gene_shap)) if has_values else 0.0
    return stats

###############################################################################
# 11. GENE FEATURE ANALYSIS
###############################################################################

def parse_gene_features(text: str) -> List[Dict[str, Any]]:
    """
    Parse gene features from text in a FASTA-like format.

    Each record becomes a dict with:
        'header'   - the header line without the leading '>'
        'sequence' - the upper-cased, concatenated sequence lines
        'metadata' - the result of ``parse_gene_metadata(header)``

    Blank lines are skipped and sequence lines before the first header are
    discarded. A record whose header line is just '>' is kept with an empty
    header.
    """
    genes = []
    current_header = None
    current_sequence = []

    def flush():
        # Emit the record collected so far, if a header has been seen.
        # "is not None" keeps records whose header is the empty string.
        if current_header is not None:
            genes.append({
                'header': current_header,
                'sequence': ''.join(current_sequence),
                'metadata': parse_gene_metadata(current_header)
            })

    for line in text.strip().split('\n'):
        line = line.strip()
        if not line:
            continue
        if line.startswith('>'):
            flush()
            current_header = line[1:]
            current_sequence = []
        else:
            current_sequence.append(line.upper())
    flush()
    return genes

def parse_gene_metadata(header: str) -> Dict[str, str]:
    """
    Extract ``[key=value]`` pairs from a gene header line.

    Pairs are matched as whole bracketed groups, so values containing
    spaces (for example ``[protein=surface glycoprotein]``) are kept.
    Whitespace around keys and values is stripped. Bracketed groups without
    an '=' are ignored. If a key repeats, the last value wins.
    """
    import re

    metadata = {}
    # key: anything except brackets and '='; value: anything except brackets.
    for key, value in re.findall(r'\[([^\[\]=]+)=([^\[\]]*)\]', header):
        metadata[key.strip()] = value.strip()
    return metadata

def parse_location(location_str: str) -> Tuple[Optional[int], Optional[int]]:
    """
    Parse a gene location string into ``(start, end)`` coordinates.

    Supported forms:
        "100..200"                      -> (100, 200)
        "complement(100..200)"          -> (100, 200)
        "join(266..13468,13468..21555)" -> (266, 21555)
        "<1..>200"                      -> (1, 200)

    For multi-part locations the smallest start and largest end are
    returned, i.e. the span enclosing all parts. Strand is not reported.
    Returns ``(None, None)`` when no ``start..end`` range is present.
    """
    import re

    if not isinstance(location_str, str):
        print(f"Error parsing location {location_str}: expected a string")
        return None, None

    # Each range is "<digits>..<digits>", optionally prefixed by the
    # partial-feature markers '<' or '>'.
    ranges = re.findall(r'[<>]?(\d+)\.\.[<>]?(\d+)', location_str)
    if not ranges:
        return None, None

    # NOTE(review): a feature that wraps the origin of a circular genome
    # would come back as a span covering nearly the whole sequence.
    start = min(int(lo) for lo, _ in ranges)
    end = max(int(hi) for _, hi in ranges)
    return start, end

def compute_gene_statistics(gene_shap: np.ndarray) -> Dict[str, float]:
    """Summarise the SHAP values of one gene.

    Returns the mean, median, standard deviation, maximum, minimum and the
    fraction of strictly positive values, each as a plain float. Every
    statistic is 0.0 when ``gene_shap`` is empty.
    """
    stat_names = ('avg_shap', 'median_shap', 'std_shap',
                  'max_shap', 'min_shap', 'pos_fraction')
    if not len(gene_shap):
        return dict.fromkeys(stat_names, 0.0)

    # The first five statistics are plain reductions; pair each with its name.
    reducers = (np.mean, np.median, np.std, np.max, np.min)
    stats = {name: float(reduce(gene_shap)) for name, reduce in zip(stat_names, reducers)}
    stats['pos_fraction'] = float(np.mean(gene_shap > 0))
    return stats

def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_length: int) -> Image.Image:
    """
    A quick PIL-based diagram to show genes along the genome.
    Color intensity = magnitude of SHAP. Red/Blue = sign of SHAP.

    Each gene is drawn as a box on a horizontal axis with tick marks. Gene
    coordinates are clamped to [0, genome_length] in local copies, so the
    caller's dicts are left untouched. Genes whose clamped range is empty
    are reported with a warning and not drawn.

    Args:
        gene_results: Dicts providing 'start', 'end' and 'avg_shap', and
            optionally 'gene_name' (used as the label, '?' when absent).
        genome_length: Length of the genome the coordinates refer to.

    Returns:
        A 1500x600 RGB image, or a small 800x100 error image when there are
        no genes or ``genome_length`` is not positive.
    """
    if not gene_results or genome_length <= 0:
        img = Image.new('RGB', (800, 100), color='white')
        draw = ImageDraw.Draw(img)
        draw.text((10, 40), "Error: Invalid input data", fill='black')
        return img

    # Clamp into copies: mutating the input would silently change data the
    # caller may still use, and an inverted range cannot be drawn as a box.
    drawable = []
    for gene in gene_results:
        start = max(0, int(gene['start']))
        end = min(genome_length, int(gene['end']))
        if start >= end:
            print(f"Warning: Invalid coordinates for gene {gene.get('gene_name','?')}")
            continue
        drawable.append({**gene, 'start': start, 'end': end})

    width = 1500
    height = 600
    margin = 50
    track_height = 40

    img = Image.new('RGB', (width, height), 'white')
    draw = ImageDraw.Draw(img)

    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 12)
        title_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 16)
    except (OSError, ImportError):
        # Font file missing/unreadable or FreeType support unavailable:
        # fall back to PIL's built-in bitmap font.
        font = ImageFont.load_default()
        title_font = ImageFont.load_default()

    draw.text((margin, margin // 2), "Genome SHAP Analysis (Simple)", fill='black', font=title_font)

    # Horizontal genome axis through the middle of the image.
    line_y = height // 2
    draw.line([(int(margin), int(line_y)), (int(width - margin), int(line_y))], fill='black', width=2)

    # Pixels per base.
    scale = float(width - 2 * margin) / float(genome_length)

    # Scale markers
    num_ticks = 10
    step = max(1, genome_length // num_ticks)
    for i in range(0, genome_length + 1, step):
        x_coord = margin + i * scale
        draw.line([
            (int(x_coord), int(line_y - 5)),
            (int(x_coord), int(line_y + 5))
        ], fill='black', width=1)
        draw.text((int(x_coord - 20), int(line_y + 10)), f"{i:,}", fill='black', font=font)

    # Draw weakest genes first so the strongest signals end up on top.
    sorted_genes = sorted(drawable, key=lambda x: abs(x['avg_shap']))
    for idx, gene in enumerate(sorted_genes):
        start_x = margin + int(gene['start'] * scale)
        end_x = margin + int(gene['end'] * scale)
        avg_shap = gene['avg_shap']
        intensity = int(abs(avg_shap) * 500)
        intensity = max(50, min(255, intensity))

        if avg_shap > 0:
            color = (255, 255 - intensity, 255 - intensity)  # Redish
        else:
            color = (255 - intensity, 255 - intensity, 255)  # Blueish

        draw.rectangle([
            (int(start_x), int(line_y - track_height // 2)),
            (int(end_x), int(line_y + track_height // 2))
        ], fill=color, outline='black')

        label = str(gene.get('gene_name', '?'))
        label_mask = font.getmask(label)
        label_width, label_height = label_mask.size

        # Alternate labels above / below the axis to reduce overlap.
        if idx % 2 == 0:
            text_y = line_y - track_height - 15
        else:
            text_y = line_y + track_height + 5

        gene_width = end_x - start_x
        if gene_width > label_width:
            # Label fits: centre it horizontally over the gene box.
            text_x = start_x + (gene_width - label_width) // 2
            draw.text((int(text_x), int(text_y)), label, fill='black', font=font)
        elif gene_width > 20:
            # Too narrow for horizontal text: render the label on a
            # transparent tile, rotate it 90 degrees and paste it in.
            txt_img = Image.new('RGBA', (label_width, label_height), (255, 255, 255, 0))
            txt_draw = ImageDraw.Draw(txt_img)
            txt_draw.text((0, 0), label, font=font, fill='black')
            rotated_img = txt_img.rotate(90, expand=True)
            img.paste(rotated_img, (int(start_x), int(text_y)), rotated_img)
        # Genes 20 px wide or narrower get no label.

    return img

def create_advanced_genome_diagram(gene_results: List[Dict[str, Any]], 
                                   genome_length: int, 
                                   shap_means: np.ndarray,
                                   diagram_title: str = "Advanced Genome Diagram") -> Image.Image:
    """
    An advanced genome diagram using Biopython's GenomeDiagram.
    We'll create tracks for genes and a 'SHAP line plot' track.

    The diagram is rendered to a temporary PDF and, when ``pdf2image`` can
    be imported, converted to a PIL image. The temporary file is always
    removed, including when rendering or conversion fails.

    Args:
        gene_results: Dicts providing 'start', 'end' and 'avg_shap', and
            optionally 'gene_name' (used as the label, '?' when absent).
        genome_length: Length of the genome; the drawn range is
            0..genome_length.
        shap_means: Per-position SHAP values plotted as a line graph.
        diagram_title: Name given to the GenomeDiagram.

    Returns:
        The first rendered page as an image, or a small 800x100 placeholder
        image when the inputs are empty or ``pdf2image`` is not installed.
    """
    if not gene_results or genome_length <= 0 or len(shap_means) == 0:
        # Fallback if data is invalid
        img = Image.new('RGB', (800, 100), color='white')
        d = ImageDraw.Draw(img)
        d.text((10, 40), "Error: Not enough data for advanced diagram", fill='black')
        return img

    diagram = GenomeDiagram.Diagram(diagram_title)
    gene_track = diagram.new_track(1, name="Genes", greytrack=False, height=0.5)
    gene_set = gene_track.new_set()

    # Add each gene as a feature
    for gene in gene_results:
        start = max(0, int(gene['start']))
        end = min(genome_length, int(gene['end']))
        if start >= end:
            # Nothing to draw once clamped to the genome.
            continue
        avg_shap = gene['avg_shap']
        # Color scale: positive = red, otherwise blue (same rule as the
        # simple diagram); stronger |SHAP| gives a more saturated colour.
        intensity = abs(avg_shap) * 500
        intensity = max(50, min(255, intensity))
        fade = 1.0 - intensity / 255.0
        if avg_shap > 0:
            gene_color = colors.Color(1.0, fade, fade)
        else:
            gene_color = colors.Color(fade, fade, 1.0)

        # Strand belongs to the location object; passing it to SeqFeature
        # directly is not accepted by newer Biopython releases.
        feature = SeqFeature(FeatureLocation(start, end, strand=1))
        gene_set.add_feature(
            feature,
            color=gene_color,
            label=True,
            name=str(gene.get('gene_name','?')),
            label_size=8,
            label_color=colors.black
        )

    # Add a track for the SHAP line
    shap_track = diagram.new_track(2, name="SHAP Score", greytrack=False, height=0.3)
    shap_set = shap_track.new_set("graph")

    # Normalizing for visualization: scale to +/- 50 so negative values sit
    # below the baseline and positive values above it.
    max_abs = max(abs(shap_means.min()), abs(shap_means.max()))
    if max_abs == 0:
        scaled_shap = [0] * len(shap_means)
    else:
        scaled_shap = (shap_means / max_abs * 50).tolist()

    # GraphSet.new_graph expects (position, value) pairs, not bare values.
    shap_set.new_graph(
        data=list(enumerate(scaled_shap)),
        name="shap_line",
        style="line",
        color=colors.darkgreen,
        altcolor=colors.red,
        linewidth=1
    )

    # Reserve a temp path and close the handle right away: only the name is
    # needed, and an open handle can block the write on some platforms.
    tmpf = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    tmpf.close()
    try:
        diagram.draw(format="linear", pagesize='A3', fragments=1, start=0, end=genome_length)
        diagram.write(tmpf.name, "PDF")

        # Convert PDF to a PIL image (requires poppler or similar).
        # If you do not have poppler, you can skip PDF -> image or use Cairo.
        try:
            import pdf2image
            pages = pdf2image.convert_from_path(tmpf.name, dpi=100)
            img = pages[0] if pages else Image.new('RGB', (800, 100), color='white')
        except ImportError:
            img = Image.new('RGB', (800, 100), color='white')
            d = ImageDraw.Draw(img)
            d.text((10, 40), "pdf2image not installed, can't show advanced diagram as image.", fill='black')
    finally:
        # Cleanup runs on every path so a failed render/convert cannot leak
        # the temp file; a failed removal must not mask the real error.
        try:
            os.remove(tmpf.name)
        except OSError:
            pass
    return img

def analyze_gene_features(sequence_file: str, 
                          features_file: str, 
                          fasta_text: str = "", 
                          features_text: str = "",
                          diagram_mode: str = "advanced"
                          ) -> Tuple[str, Optional[str], Optional[Image.Image]]:
    """
    Analyze each gene in the features file, compute gene-level SHAP stats, 
    produce tabular output, and create an optional genome diagram.

    Args:
        sequence_file: Path to the FASTA file (passed to ``analyze_sequence``).
        features_file: Path to the gene features file; read only when
            ``features_text`` is blank.
        fasta_text: Pasted FASTA text (passed to ``analyze_sequence``).
        features_text: Pasted gene features text; takes precedence over
            ``features_file`` when non-blank.
        diagram_mode: "advanced" selects the GenomeDiagram rendering, any
            other value the simple PIL rendering.

    Returns:
        ``(summary_text, csv_path, diagram_image)``. On failure the first
        element is an error message and the other two are None. ``csv_path``
        is None when the CSV could not be written.
    """
    # Function-scope import keeps this block self-contained.
    import csv

    # 1) Analyze the entire sequence with the top-level function
    sequence_results = analyze_sequence(sequence_file, top_kmers=10, fasta_text=fasta_text)
    if isinstance(sequence_results[0], str) and "Error" in sequence_results[0]:
        return f"Error in sequence analysis: {sequence_results[0]}", None, None

    seq = sequence_results[3]["seq"]
    shap_means = sequence_results[3]["shap_means"]
    genome_length = len(seq)

    # 2) Read gene features
    try:
        if features_text.strip():
            genes = parse_gene_features(features_text)
        else:
            with open(features_file, 'r') as f:
                genes = parse_gene_features(f.read())
    except Exception as e:
        return f"Error reading features file: {str(e)}", None, None

    gene_results = []
    for gene in genes:
        location = gene['metadata'].get('location', '')
        if not location:
            continue
        start, end = parse_location(location)
        if start is None or end is None or start >= end or end > genome_length:
            continue

        # Feature locations (start..end / complement(...)) are 1-based and
        # inclusive, so base `start` lives at index start - 1 while `end`
        # is already the correct exclusive slice bound.
        slice_start = max(start - 1, 0)
        gene_shap = shap_means[slice_start:end]
        gene_seq = seq[slice_start:end]

        basic_stats = compute_gene_statistics(gene_shap)
        adv_stats = advanced_gene_statistics(gene_shap, gene_seq)

        # Merge basic + advanced stats (advanced values win on key clashes)
        all_stats = {**basic_stats, **adv_stats}

        classification = 'Human' if basic_stats['avg_shap'] > 0 else 'Non-human'

        gene_results.append({
            'gene_name': gene['metadata'].get('gene', 'Unknown'),
            'location': location,
            'start': start,
            'end': end,
            'locus_tag': gene['metadata'].get('locus_tag', ''),
            'avg_shap': all_stats['avg_shap'],
            'median_shap': basic_stats['median_shap'],
            'std_shap': basic_stats['std_shap'],
            'max_shap': basic_stats['max_shap'],
            'min_shap': basic_stats['min_shap'],
            'pos_fraction': basic_stats['pos_fraction'],
            'n50': all_stats['n50'],
            'entropy': all_stats['entropy'],
            'classification': classification,
            'confidence': abs(all_stats['avg_shap'])
        })

    if not gene_results:
        return "No valid genes could be processed", None, None

    # 3) Summaries
    sorted_genes = sorted(gene_results, key=lambda x: abs(x['avg_shap']), reverse=True)
    num_human = sum(1 for g in gene_results if g['classification'] == 'Human')
    summary_parts = [
        "Gene Analysis Results:\n\n",
        f"Total genes analyzed: {len(gene_results)}\n",
        f"Human-like genes: {num_human}\n",
        f"Non-human-like genes: {len(gene_results) - num_human}\n\n",
        "Top 10 most distinctive genes (by avg SHAP magnitude):\n",
    ]
    for gene in sorted_genes[:10]:
        summary_parts.append(
            f"Gene: {gene['gene_name']}\n"
            f"Location: {gene['location']}\n"
            f"Classification: {gene['classification']} "
            f"(confidence: {gene['confidence']:.4f})\n"
            f"Average SHAP: {gene['avg_shap']:.4f}\n"
            f"N50: {gene['n50']}, Entropy: {gene['entropy']:.3f}\n\n"
        )
    results_text = "".join(summary_parts)

    # 4) Make CSV
    csv_columns = [
        'gene_name', 'location', 'start', 'end', 'locus_tag',
        'avg_shap', 'median_shap', 'std_shap', 'max_shap', 'min_shap',
        'pos_fraction', 'n50', 'entropy', 'classification', 'confidence',
    ]
    # Columns written with four decimal places; the rest are written as-is.
    four_decimal_columns = {
        'avg_shap', 'median_shap', 'std_shap', 'max_shap', 'min_shap',
        'pos_fraction', 'entropy', 'confidence',
    }
    csv_rows = [
        [f"{g[col]:.4f}" if col in four_decimal_columns else str(g[col]) for col in csv_columns]
        for g in gene_results
    ]
    try:
        temp_dir = tempfile.gettempdir()
        temp_path = os.path.join(temp_dir, f"gene_analysis_{os.urandom(4).hex()}.csv")
        with open(temp_path, 'w', newline='') as f:
            # csv.writer quotes fields containing commas or quotes (gene
            # names, locations), which plain string joining did not.
            writer = csv.writer(f, lineterminator='\n')
            writer.writerow(csv_columns)
            writer.writerows(csv_rows)
    except Exception as e:
        print(f"Error saving CSV: {str(e)}")
        temp_path = None

    # 5) Create diagram
    try:
        if diagram_mode == "advanced":
            diagram_img = create_advanced_genome_diagram(gene_results, genome_length, shap_means)
        else:
            diagram_img = create_simple_genome_diagram(gene_results, genome_length)
    except Exception as e:
        # Best effort: a failed diagram must not discard the text/CSV results.
        print(f"Error creating visualization: {str(e)}")
        diagram_img = Image.new('RGB', (800, 100), color='white')
        draw = ImageDraw.Draw(diagram_img)
        draw.text((10, 40), f"Error creating visualization: {str(e)}", fill='black')

    return results_text, temp_path, diagram_img

###############################################################################
# 12. DOWNLOAD FUNCTIONS
###############################################################################

def prepare_csv_download(data, filename="analysis_results.csv"):
    """
    Convert data to CSV for Gradio download button. 

    Args:
        data: Either ready-made CSV text (str), a list of row dicts, or a
            single row dict.
        filename: Name to associate with the download.

    Returns:
        ``(csv_bytes, filename)``. Row dicts are written with a header made
        of all keys in first-seen order; keys missing from a row are left
        blank. An empty list produces empty bytes.

    Raises:
        ValueError: If ``data`` is not a str, dict, or list of dicts.
    """
    if isinstance(data, str):
        return data.encode(), filename

    if isinstance(data, dict):
        # A lone mapping is treated as a one-row table.
        rows = [data]
    elif isinstance(data, list):
        rows = data
    else:
        raise ValueError("Unsupported data type for CSV download")

    if not all(isinstance(row, dict) for row in rows):
        raise ValueError("Unsupported data type for CSV download")
    if not rows:
        # No rows means no header can be derived; return an empty file.
        return b"", filename

    import csv
    from io import StringIO

    # Union of keys across rows, preserving first-seen order, so rows with
    # differing keys do not make DictWriter reject the data.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))
    output = StringIO()
    writer = csv.DictWriter(output, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)
    return output.getvalue().encode(), filename
        
###############################################################################
# 13. BUILD GRADIO INTERFACE
###############################################################################

# Custom stylesheet passed to gr.Blocks(css=...) below: sets the app-wide font
# and adds top spacing to components tagged with the "download-button" class.
css = """
.gradio-container {
    font-family: 'IBM Plex Sans', sans-serif;
}
.download-button {
    margin-top: 10px;
}
"""

# Top-level Gradio layout: an intro banner, four tabs (full-sequence analysis,
# subregion exploration, gene features, comparison) and a closing notes panel.
# Each tab declares its input/output components and binds one button to one
# analysis function.
with gr.Blocks(css=css) as iface:
    gr.Markdown("""
    # Virus Host Classifier + Extended Genome Visualization
    **Step 1**: Predict overall viral sequence origin (human vs non-human) and identify extreme subregions.  
    **Step 2**: Explore subregions (local SHAP, GC content, histogram).  
    **Step 3**: Analyze gene features (per-gene SHAP, advanced stats, improved diagrams).  
    **Step 4**: Compare sequences for SHAP differences.

    **Color Scale**: Negative SHAP = Blue, 0 = White, Positive = Red.
    """)
    
    # --- Tab 1: whole-sequence classification -------------------------------
    with gr.Tab("1) Full-Sequence Analysis"):
        with gr.Row():
            # Left column: inputs (file upload or pasted text, plus sliders).
            with gr.Column(scale=1):
                file_input = gr.File(label="Upload FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
                text_input = gr.Textbox(label="Or paste FASTA", placeholder=">name\nACGT...", lines=5)
                top_k = gr.Slider(minimum=5, maximum=30, value=10, step=1, label="Number of top k-mers to display")
                win_size = gr.Slider(minimum=100, maximum=5000, value=500, step=100, label="Subregion Window Size")
                analyze_btn = gr.Button("Analyze Sequence", variant="primary")
            # Right column: outputs (text summary, two plots, download).
            with gr.Column(scale=2):
                results_box = gr.Textbox(label="Classification Results", lines=12, interactive=False)
                kmer_img = gr.Image(label="Top k-mer SHAP")
                genome_img = gr.Image(label="Genome-wide SHAP Heatmap (Blue=neg, White=0, Red=pos)")
                download_results = gr.File(label="Download Results", visible=False, elem_classes="download-button")
        # State holders filled by analyze_sequence (4th and 5th outputs below)
        # and fed into analyze_subregion in Tab 2.
        seq_state = gr.State()
        header_state = gr.State()

        analyze_btn.click(
            analyze_sequence,
            inputs=[file_input, top_k, text_input, win_size],
            outputs=[results_box, kmer_img, genome_img, seq_state, header_state, download_results]
        )

    # --- Tab 2: inspect a region of the sequence analysed in Tab 1 -----------
    with gr.Tab("2) Subregion Exploration"):
        gr.Markdown("""
        **Subregion Analysis**  
        View SHAP signals, GC content, etc. for a specific region.
        """)
        with gr.Row():
            region_start = gr.Number(label="Region Start", value=0)
            region_end = gr.Number(label="Region End", value=500)
            region_btn = gr.Button("Analyze Subregion")
        subregion_info = gr.Textbox(label="Subregion Analysis", lines=7, interactive=False)
        with gr.Row():
            subregion_img = gr.Image(label="Subregion SHAP Heatmap (B-W-R)")
            subregion_hist_img = gr.Image(label="SHAP Distribution (Histogram)")
        download_subregion = gr.File(label="Download Subregion", visible=False, elem_classes="download-button")
        
        # Takes the stored state from Tab 1 plus the requested start/end.
        region_btn.click(
            analyze_subregion,
            inputs=[seq_state, header_state, region_start, region_end],
            outputs=[subregion_info, subregion_img, subregion_hist_img, download_subregion]
        )

    # --- Tab 3: per-gene analysis --------------------------------------------
    with gr.Tab("3) Gene Features Analysis"):
        gr.Markdown("""
        **Analyze Gene Features**  
        - Upload a FASTA file and a gene features file.  
        - See per-gene SHAP, classification, N50, entropy, etc.  
        - Choose a diagram mode (simple or advanced).
        """)
        with gr.Row():
            with gr.Column(scale=1):
                gene_fasta_file = gr.File(label="FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
                gene_fasta_text = gr.Textbox(label="Or paste FASTA sequence", lines=5)
            with gr.Column(scale=1):
                features_file = gr.File(label="Gene features file", file_types=[".txt"], type="filepath")
                features_text = gr.Textbox(label="Or paste gene features", lines=5)
                diagram_mode = gr.Radio(choices=["simple", "advanced"], value="advanced", label="Diagram Mode")
        analyze_genes_btn = gr.Button("Analyze Gene Features", variant="primary")
        gene_results = gr.Textbox(label="Gene Analysis Results", lines=12, interactive=False)
        gene_diagram = gr.Image(label="Genome Diagram")
        download_gene_results = gr.File(label="Download Gene Analysis (CSV)", visible=True)
        
        # Input order mirrors analyze_gene_features' positional parameters;
        # its (text, csv path, image) return maps onto the three outputs.
        analyze_genes_btn.click(
            analyze_gene_features,
            inputs=[gene_fasta_file, features_file, gene_fasta_text, features_text, diagram_mode],
            outputs=[gene_results, download_gene_results, gene_diagram]
        )
    
    # --- Tab 4: compare two sequences ----------------------------------------
    with gr.Tab("4) Comparative Analysis"):
        gr.Markdown("""
        **Compare Two Sequences**  
        - Upload or paste two FASTA sequences.  
        - We'll compare SHAP patterns (normalized for different lengths).  
        """)
        with gr.Row():
            with gr.Column(scale=1):
                file_input1 = gr.File(label="1st FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
                text_input1 = gr.Textbox(label="Or paste 1st FASTA", lines=5)
            with gr.Column(scale=1):
                file_input2 = gr.File(label="2nd FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
                text_input2 = gr.Textbox(label="Or paste 2nd FASTA", lines=5)
        compare_btn = gr.Button("Compare Sequences", variant="primary")
        comparison_text = gr.Textbox(label="Comparison Results", lines=12, interactive=False)
        with gr.Row():
            diff_heatmap = gr.Image(label="SHAP Difference Heatmap")
            diff_hist = gr.Image(label="Distribution of SHAP Differences")
        download_comparison = gr.File(label="Download Comparison", visible=False, elem_classes="download-button")

        # Both files are passed first, then both pasted texts.
        compare_btn.click(
            analyze_sequence_comparison,
            inputs=[file_input1, file_input2, text_input1, text_input2],
            outputs=[comparison_text, diff_heatmap, diff_hist, download_comparison]
        )
    
    # Closing notes shown below the tabs.
    gr.Markdown("""
    ### Notes & Features
    - **Advanced Genome Diagram** uses Biopython’s `GenomeDiagram` (requires `pdf2image` if you want it as an image).
    - **Additional Stats**: N50, Shannon entropy, etc.
    - **Auto-scaling** for comparative analysis with adaptive smoothing.
    - **Data Export**: Download CSV of analysis results.
    """)
    
# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()