VenusFactory / src /web /download_tab.py
2dogey's picture
Upload folder using huggingface_hub
8918ac7 verified
import gradio as gr
import os
import subprocess
import numpy as np
import plotly.graph_objects as go
from Bio.PDB import PDBParser
import io
import base64
from typing import Dict, Any, Tuple, List, Optional
def create_download_tab(constant: Dict[str, Any]) -> Dict[str, Any]:
    """
    Create the download tab with various options for downloading protein data.

    Builds a Gradio "Download" tab with sub-tabs for InterPro metadata, RCSB
    metadata, UniProt sequences, RCSB structures and AlphaFold2 structures.
    Each sub-tab's button runs a crawler script under ``src/crawler/`` in a
    subprocess; the two structure sub-tabs additionally plot the downloaded
    PDB file when a single ID was requested.

    Args:
        constant: Dictionary containing constant values for the application.
            It is not referenced anywhere in this function.

    Returns:
        Dictionary containing any state information (currently always empty).
    """
def run_download_script(script_name: str, **kwargs) -> str:
    """
    Run a crawler script from ``src/crawler/`` and report its outcome.

    Keyword arguments are translated into command-line arguments, in the
    order they were passed:

    * ``None`` is skipped.
    * ``True`` becomes a bare ``--key`` flag; ``False`` is skipped.
    * A value that already is the flag itself (``"--key"``, or the legacy
      ``"--merge"``) is appended once as a bare flag.
    * Anything else becomes ``--key str(value)``.

    Args:
        script_name: Path of the script, relative to ``src/crawler/``
        **kwargs: Arguments to pass to the script

    Returns:
        "Download completed successfully" followed by the script's stdout when
        it exits with code 0; otherwise an error message carrying its stderr
        (or its stdout when stderr is empty). If the process cannot be
        started, a "Failed to run download script" message is returned.
    """
    import sys

    # Run the script with the interpreter that runs this app, so it sees the
    # same environment; a bare "python" may be missing from PATH or be a
    # different installation.
    cmd = [sys.executable or "python", f"src/crawler/{script_name}"]

    for key, value in kwargs.items():
        if value is None:  # Skip None values
            continue
        flag = f"--{key}"
        if isinstance(value, bool):  # Handle boolean flags
            if value:
                cmd.append(flag)
        elif value == flag or value == "--merge":
            # The caller already spelled out the flag (e.g. unzip="--unzip");
            # emit it once instead of "--unzip --unzip".
            cmd.append(value)
        else:  # Handle regular arguments
            cmd.extend([flag, str(value)])

    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Failed to run download script: {str(e)}"

    if result.returncode == 0:
        return f"Download completed successfully\n{result.stdout}"
    # Some scripts report problems on stdout only; do not return an empty body.
    return f"Error during download:\n{result.stderr or result.stdout}"
# Function to visualize protein structure using Plotly
def visualize_protein_structure(pdb_file: str) -> "Tuple[str, Optional[go.Figure]]":
    """
    Visualize a protein structure from a PDB file using Plotly for interactive 3D visualization.

    The figure holds one line trace per chain (through that chain's alpha
    carbons), one marker trace with every alpha carbon ("Backbone") and one
    marker trace with every atom ("All Atoms", initially legend-only). Two
    buttons switch between the backbone-only and the all-atoms view.

    Args:
        pdb_file: Path to the PDB file

    Returns:
        Tuple containing status message and Plotly figure. The figure is None
        when the file does not exist or when parsing/plotting raises.
    """
    # Green is reserved for alpha carbons. It is deliberately kept out of the
    # element map: calcium's element symbol is also "CA" and must not pick up
    # the alpha-carbon colour.
    alpha_carbon_color = '#00FF00'
    default_atom_color = '#808080'
    element_colors = {
        'C': '#333333',  # Dark gray for carbon
        'N': '#3050F8',  # Blue for nitrogen
        'O': '#FF2010',  # Red for oxygen
        'S': '#FFFF30',  # Yellow for sulfur
        'P': '#FF8000',  # Orange for phosphorus
        'H': '#E0E0E0',  # Light gray for hydrogen
    }
    chain_colors = [
        '#1F77B4', '#FF7F0E', '#2CA02C', '#D62728', '#9467BD',
        '#8C564B', '#E377C2', '#7F7F7F', '#BCBD22', '#17BECF'
    ]

    def is_alpha_carbon(atom) -> bool:
        """Return True for a C-alpha atom, False for a calcium ion named "CA"."""
        return atom.get_name() == 'CA' and atom.element != 'CA'

    try:
        if not os.path.exists(pdb_file):
            return f"File not found: {pdb_file}", None

        # Parse the PDB file
        parser = PDBParser(QUIET=True)
        structure = parser.get_structure("protein", pdb_file)

        # Coordinates, hover text and colours for the "All Atoms" trace
        all_atoms_x, all_atoms_y, all_atoms_z = [], [], []
        all_atoms_text = []
        all_atoms_color = []

        # Alpha carbons of every chain, for the "Backbone" marker trace
        ca_x, ca_y, ca_z = [], [], []
        ca_text = []

        # chain id -> index into chain_colors (wraps around after 10 chains)
        chains = {}

        fig = go.Figure()

        # Number of traces that belong to the backbone view (chain lines plus
        # the marker trace); needed to build the button visibility arrays.
        backbone_trace_count = 0

        for model in structure:
            for chain in model:
                chain_id = chain.get_id()
                if chain_id not in chains:
                    chains[chain_id] = len(chains) % len(chain_colors)
                chain_color = chain_colors[chains[chain_id]]

                # Alpha carbons of this chain only, for its line trace
                chain_ca_x, chain_ca_y, chain_ca_z = [], [], []
                chain_ca_text = []

                for residue in chain:
                    res_name = residue.get_resname()
                    res_num = residue.get_id()[1]
                    residue_label = f"Chain {chain_id}, {res_name} {res_num}"

                    # Skip calcium ions here: connecting them would draw a
                    # bogus backbone segment from the chain to the ion.
                    if 'CA' in residue and is_alpha_carbon(residue['CA']):
                        ca = residue['CA'].get_coord()
                        chain_ca_x.append(ca[0])
                        chain_ca_y.append(ca[1])
                        chain_ca_z.append(ca[2])
                        chain_ca_text.append(residue_label)
                        ca_x.append(ca[0])
                        ca_y.append(ca[1])
                        ca_z.append(ca[2])
                        ca_text.append(residue_label)

                    for atom in residue:
                        coord = atom.get_coord()
                        all_atoms_x.append(coord[0])
                        all_atoms_y.append(coord[1])
                        all_atoms_z.append(coord[2])
                        all_atoms_text.append(f"{residue_label}, {atom.get_name()}")
                        if is_alpha_carbon(atom):
                            all_atoms_color.append(alpha_carbon_color)
                        else:
                            all_atoms_color.append(
                                element_colors.get(atom.element, default_atom_color)
                            )

                # Add this chain's CA atoms as a separate trace for better visualization
                if chain_ca_x:
                    fig.add_trace(go.Scatter3d(
                        x=chain_ca_x,
                        y=chain_ca_y,
                        z=chain_ca_z,
                        mode='lines',
                        name=f'Chain {chain_id}',
                        line=dict(color=chain_color, width=8),
                        text=chain_ca_text,
                        hoverinfo='text',
                        showlegend=True
                    ))
                    backbone_trace_count += 1

        # Add backbone trace (CA atoms as markers)
        fig.add_trace(go.Scatter3d(
            x=ca_x,
            y=ca_y,
            z=ca_z,
            mode='markers',
            name='Backbone',
            marker=dict(
                size=7,
                color=alpha_carbon_color,
                opacity=0.8,
                symbol='circle'
            ),
            text=ca_text,
            hoverinfo='text',
            showlegend=True
        ))
        backbone_trace_count += 1

        # Add all atoms as small markers
        fig.add_trace(go.Scatter3d(
            x=all_atoms_x,
            y=all_atoms_y,
            z=all_atoms_z,
            mode='markers',
            name='All Atoms',
            marker=dict(
                size=2.5,
                color=all_atoms_color,
                opacity=0.6
            ),
            text=all_atoms_text,
            hoverinfo='text',
            showlegend=True,
            visible='legendonly'  # Hide by default, can be toggled in legend
        ))

        # Set layout properties
        pdb_id = os.path.basename(pdb_file).split('.')[0]
        fig.update_layout(
            title=dict(
                text=f"Structure: {pdb_id}",
                font=dict(size=20, family="Arial, sans-serif")
            ),
            scene=dict(
                xaxis=dict(title='X (Å)', showbackground=False, showgrid=True, gridcolor='lightgray'),
                yaxis=dict(title='Y (Å)', showbackground=False, showgrid=True, gridcolor='lightgray'),
                zaxis=dict(title='Z (Å)', showbackground=False, showgrid=True, gridcolor='lightgray'),
                aspectmode='data',
                camera=dict(
                    eye=dict(x=1.5, y=1.5, z=1.5)
                )
            ),
            margin=dict(l=0, r=0, b=0, t=40),
            legend=dict(
                yanchor="top",
                y=0.99,
                xanchor="left",
                x=0.01,
                bgcolor="rgba(255, 255, 255, 0.8)",
                bordercolor="lightgray",
                borderwidth=1
            ),
            template="plotly_white",
            height=600,
            width=800
        )

        # The "All Atoms" trace is always the last one added, so it is the
        # final entry of each visibility array.
        backbone_only_visibility = [True] * backbone_trace_count + [False]
        all_atoms_visibility = [True] * (backbone_trace_count + 1)

        # Add buttons for different views
        fig.update_layout(
            updatemenus=[
                dict(
                    type="buttons",
                    direction="right",
                    buttons=[
                        dict(
                            args=[{"visible": backbone_only_visibility}],
                            label="Backbone Only",
                            method="update"
                        ),
                        dict(
                            args=[{"visible": all_atoms_visibility}],
                            label="All Atoms",
                            method="update"
                        )
                    ],
                    pad={"r": 10, "t": 10},
                    showactive=True,
                    x=0.1,
                    xanchor="left",
                    y=1.1,
                    yanchor="top",
                    bgcolor="rgba(255, 255, 255, 0.8)",
                    bordercolor="lightgray",
                    borderwidth=1
                ),
            ]
        )
        return f"Successfully visualized structure from {pdb_file}", fig
    except Exception as e:
        # UI boundary: log the traceback and hand a readable message to the caller.
        import traceback
        error_details = traceback.format_exc()
        print(f"Error visualizing structure: {str(e)}\n{error_details}")
        return f"Error visualizing structure: {str(e)}", None
# Create the main download tab
# Opens the top-level "Download" tab and renders its heading.
# NOTE(review): the sub-tabs defined after this heading presumably live inside
# this Tab/Row/Column nesting -- confirm the intended container layout.
with gr.Tab("Download"):
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Download Protein Data (See help for more details)")
# InterPro Metadata tab
# NOTE(review): the Row/Column nesting below is reconstructed -- confirm the
# intended container layout.
with gr.Tab("InterPro Metadata"):
    with gr.Row():
        # Input mode selector; its change event toggles the two textboxes below.
        interpro_method = gr.Radio(
            choices=["Single ID", "From JSON"],
            label="Download Method",
            value="Single ID"
        )
    with gr.Column():
        interpro_id = gr.Textbox(label="InterPro ID", value="IPR000001")
        # Hidden at start because the default method is "Single ID".
        interpro_json = gr.Textbox(label="InterPro JSON Path", value="download/interpro_domain/interpro_json.customization", visible=False)
        interpro_out = gr.Textbox(label="Output Directory", value="download/interpro_domain")
        # When checked, the handler passes "<out_dir>/failed.txt" as the error file.
        interpro_error = gr.Checkbox(label="Save error file", value=True)
        interpro_btn = gr.Button("Download InterPro Data")
        # Read-only box that receives the text returned by the download handler.
        interpro_output = gr.Textbox(label="Output", interactive=False)
def update_interpro_visibility(method):
    """Show the InterPro ID box for "Single ID" and the JSON path box for "From JSON"."""
    show_single_id = method == "Single ID"
    show_json_path = method == "From JSON"
    return {
        interpro_id: gr.update(visible=show_single_id),
        interpro_json: gr.update(visible=show_json_path),
    }

# Re-evaluate which input box is visible whenever the method changes.
interpro_method.change(fn=update_interpro_visibility, inputs=[interpro_method], outputs=[interpro_id, interpro_json])
# RCSB Metadata tab
# NOTE(review): the Row/Column nesting below is reconstructed -- confirm the
# intended container layout.
with gr.Tab("RCSB Metadata"):
    with gr.Row():
        # Input mode selector; its change event toggles the two textboxes below.
        rcsb_method = gr.Radio(
            choices=["Single ID", "From File"],
            label="Download Method",
            value="Single ID"
        )
    with gr.Column():
        rcsb_id = gr.Textbox(label="PDB ID", value="1a0j")
        # Hidden at start because the default method is "Single ID".
        rcsb_file = gr.Textbox(label="PDB List File", value="download/rcsb.txt", visible=False)
        rcsb_out = gr.Textbox(label="Output Directory", value="download/rcsb_metadata")
        # When checked, the handler passes "<out_dir>/failed.txt" as the error file.
        rcsb_error = gr.Checkbox(label="Save error file", value=True)
        rcsb_btn = gr.Button("Download RCSB Metadata")
        # Read-only box that receives the text returned by the download handler.
        rcsb_output = gr.Textbox(label="Output", interactive=False)
def update_rcsb_visibility(method):
    """Show the PDB ID box for "Single ID" and the list-file box for "From File"."""
    show_single_id = method == "Single ID"
    show_list_file = method == "From File"
    return {
        rcsb_id: gr.update(visible=show_single_id),
        rcsb_file: gr.update(visible=show_list_file),
    }

# Re-evaluate which input box is visible whenever the method changes.
rcsb_method.change(fn=update_rcsb_visibility, inputs=[rcsb_method], outputs=[rcsb_id, rcsb_file])
# UniProt Sequences tab
# NOTE(review): the Row/Column nesting below is reconstructed -- confirm the
# intended container layout.
with gr.Tab("UniProt Sequences"):
    with gr.Row():
        # Input mode selector; its change event toggles the two textboxes below.
        uniprot_method = gr.Radio(
            choices=["Single ID", "From File"],
            label="Download Method",
            value="Single ID"
        )
    with gr.Column():
        uniprot_id = gr.Textbox(label="UniProt ID", value="P00734")
        # Hidden at start because the default method is "Single ID".
        uniprot_file = gr.Textbox(label="UniProt ID List File", value="download/uniprot.txt", visible=False)
        uniprot_out = gr.Textbox(label="Output Directory", value="download/uniprot_sequences")
        # When checked, the handler forwards a "--merge" flag to the crawler script.
        uniprot_merge = gr.Checkbox(label="Merge into single FASTA", value=False)
        # When checked, the handler passes "<out_dir>/failed.txt" as the error file.
        uniprot_error = gr.Checkbox(label="Save error file", value=True)
        uniprot_btn = gr.Button("Download UniProt Sequences")
        # Read-only box that receives the text returned by the download handler.
        uniprot_output = gr.Textbox(label="Output", interactive=False)
def update_uniprot_visibility(method):
    """Show the UniProt ID box for "Single ID" and the list-file box for "From File"."""
    show_single_id = method == "Single ID"
    show_list_file = method == "From File"
    return {
        uniprot_id: gr.update(visible=show_single_id),
        uniprot_file: gr.update(visible=show_list_file),
    }

# Re-evaluate which input box is visible whenever the method changes.
uniprot_method.change(fn=update_uniprot_visibility, inputs=[uniprot_method], outputs=[uniprot_id, uniprot_file])
# RCSB Structures tab
# NOTE(review): the container nesting below is reconstructed from the
# "left column" / "right column" comments -- confirm the intended layout.
with gr.Tab("RCSB Structures"):
    with gr.Row():
        # Left column for inputs
        with gr.Column(scale=3):
            with gr.Group(): # Group for better visual separation
                # Input mode selector; its change event toggles struct_id / struct_file.
                struct_method = gr.Radio(
                    choices=["Single ID", "From File"],
                    label="Download Method",
                    value="Single ID"
                )
                # Input parameters section with consistent spacing
                with gr.Row():
                    struct_id = gr.Textbox(label="PDB ID", value="1a0j")
                with gr.Row():
                    # Hidden at start because the default method is "Single ID".
                    struct_file = gr.Textbox(label="PDB List File", value="download/rcsb.txt", visible=False)
                with gr.Row():
                    struct_out = gr.Textbox(label="Output Directory", value="download/rcsb_structures")
                with gr.Row():
                    # The download handler only attempts visualization for the "pdb" type.
                    struct_type = gr.Dropdown(
                        choices=["cif", "pdb", "pdb1", "xml", "sf", "mr", "mrstr"],
                        value="pdb",
                        label="Structure Type"
                    )
                with gr.Row():
                    with gr.Column(scale=1):
                        struct_unzip = gr.Checkbox(label="Unzip downloaded files", value=True)
                    with gr.Column(scale=1):
                        struct_error = gr.Checkbox(label="Save error file", value=True)
                with gr.Row():
                    struct_btn = gr.Button("Download RCSB Structures", size="lg")
            # Output section
            # Read-only boxes for the script output and the visualization status message.
            struct_output = gr.Textbox(label="Download Output", interactive=False, lines=4)
            struct_viz_status = gr.Textbox(label="Visualization Status", interactive=False)
        # Right column for visualization
        with gr.Column(scale=5):
            # Visualization section with full height
            struct_viz = gr.Plot(label="Structure Visualization", elem_id="struct_viz_plot")
def update_struct_visibility(method):
    """Show the PDB ID box for "Single ID" and the list-file box for "From File"."""
    show_single_id = method == "Single ID"
    show_list_file = method == "From File"
    return {
        struct_id: gr.update(visible=show_single_id),
        struct_file: gr.update(visible=show_list_file),
    }

# Re-evaluate which input box is visible whenever the method changes.
struct_method.change(fn=update_struct_visibility, inputs=[struct_method], outputs=[struct_id, struct_file])
# AlphaFold2 Structures tab
# NOTE(review): the container nesting below is reconstructed from the
# "left column" / "right column" comments -- confirm the intended layout.
with gr.Tab("AlphaFold2 Structures"):
    with gr.Row():
        # Left column for inputs
        with gr.Column(scale=3):
            with gr.Group(): # Group for better visual separation
                # Input mode selector; its change event toggles af_id / af_file.
                af_method = gr.Radio(
                    choices=["Single ID", "From File"],
                    label="Download Method",
                    value="Single ID"
                )
                # Input parameters section with consistent spacing
                with gr.Row():
                    af_id = gr.Textbox(label="UniProt ID", value="P00734")
                with gr.Row():
                    # Hidden at start because the default method is "Single ID".
                    af_file = gr.Textbox(label="UniProt ID List File", value="download/uniprot.txt", visible=False)
                with gr.Row():
                    af_out = gr.Textbox(label="Output Directory", value="download/alphafold2_structures")
                with gr.Row():
                    # Forwarded to the crawler script as --index_level.
                    af_index_level = gr.Number(label="Index Level", value=0, precision=0)
                with gr.Row():
                    af_error = gr.Checkbox(label="Save error file", value=True)
                with gr.Row():
                    af_btn = gr.Button("Download AlphaFold Structures", size="lg")
            # Output section
            # Read-only boxes for the script output and the visualization status message.
            af_output = gr.Textbox(label="Download Output", interactive=False, lines=4)
            af_viz_status = gr.Textbox(label="Visualization Status", interactive=False)
        # Right column for visualization
        with gr.Column(scale=5):
            # Visualization section with full height
            af_viz = gr.Plot(label="Structure Visualization", elem_id="af_viz_plot")
def update_af_visibility(method):
    """Show the UniProt ID box for "Single ID" and the list-file box for "From File"."""
    show_single_id = method == "Single ID"
    show_list_file = method == "From File"
    return {
        af_id: gr.update(visible=show_single_id),
        af_file: gr.update(visible=show_list_file),
    }

# Re-evaluate which input box is visible whenever the method changes.
af_method.change(fn=update_af_visibility, inputs=[af_method], outputs=[af_id, af_file])
# Handler functions for download buttons
def handle_interpro_download(method, id_val, json_val, out_dir, error):
    """Run the InterPro crawler for a single ID or for a JSON listing and return its output text."""
    failed_log = f"{out_dir}/failed.txt" if error else None
    # The two modes differ only in which source argument is forwarded.
    if method == "Single ID":
        source_arg = {"interpro_id": id_val}
    else:
        source_arg = {"interpro_json": json_val}
    return run_download_script(
        "metadata/download_interpro.py",
        **source_arg,
        out_dir=out_dir,
        error_file=failed_log,
    )

interpro_btn.click(
    fn=handle_interpro_download,
    inputs=[
        interpro_method,
        interpro_id,
        interpro_json,
        interpro_out,
        interpro_error,
    ],
    outputs=interpro_output,
)
def handle_rcsb_download(method, id_val, file_val, out_dir, error):
    """Run the RCSB metadata crawler for a single PDB ID or for an ID list file and return its output text."""
    failed_log = f"{out_dir}/failed.txt" if error else None
    # The two modes differ only in which source argument is forwarded.
    if method == "Single ID":
        source_arg = {"pdb_id": id_val}
    else:
        source_arg = {"pdb_id_file": file_val}
    return run_download_script(
        "metadata/download_rcsb.py",
        **source_arg,
        out_dir=out_dir,
        error_file=failed_log,
    )

rcsb_btn.click(
    fn=handle_rcsb_download,
    inputs=[
        rcsb_method,
        rcsb_id,
        rcsb_file,
        rcsb_out,
        rcsb_error,
    ],
    outputs=rcsb_output,
)
def handle_uniprot_download(method, id_val, file_val, out_dir, merge, error):
    """Run the UniProt sequence crawler for a single ID or for an ID list file and return its output text."""
    merge_flag = "--merge" if merge else None
    failed_log = f"{out_dir}/failed.txt" if error else None
    # The two modes differ only in which source argument is forwarded.
    if method == "Single ID":
        source_arg = {"uniprot_id": id_val}
    else:
        source_arg = {"file": file_val}
    return run_download_script(
        "sequence/download_uniprot_seq.py",
        **source_arg,
        out_dir=out_dir,
        merge=merge_flag,
        error_file=failed_log,
    )

uniprot_btn.click(
    fn=handle_uniprot_download,
    inputs=[
        uniprot_method,
        uniprot_id,
        uniprot_file,
        uniprot_out,
        uniprot_merge,
        uniprot_error,
    ],
    outputs=uniprot_output,
)
def handle_struct_download(method, id_val, file_val, out_dir, type_val, unzip, error):
    """
    Handle RCSB structure download and visualization

    Args:
        method: Download method (Single ID or From File)
        id_val: PDB ID for single download
        file_val: File path for batch download
        out_dir: Output directory
        type_val: Structure file type
        unzip: Whether to unzip downloaded files
        error: Whether to save error file

    Returns:
        Tuple containing download output, visualization status, and Plotly figure.
        The figure is None unless a single "pdb" file was downloaded and plotted.
    """
    script = "structure/download_rcsb.py"
    success_marker = "Download completed successfully"
    shared_args = dict(
        out_dir=out_dir,
        type=type_val,
        # Forwarded as a real boolean so the crawler receives "--unzip" exactly
        # once; the string "--unzip" used to be expanded to "--unzip --unzip".
        unzip=bool(unzip),
        error_file=f"{out_dir}/failed.txt" if error else None,
    )

    if method != "Single ID":
        download_output = run_download_script(script, pdb_id_file=file_val, **shared_args)
        # Do not announce completion when the script reported an error.
        if success_marker not in download_output:
            return download_output, "Download failed, cannot visualize", None
        return download_output, "Batch download completed, select a single ID to visualize", None

    # Tolerate a missing value and stray whitespace typed around the ID.
    pdb_id = (id_val or "").strip()
    if not pdb_id:
        return "No PDB ID provided", "Nothing to visualize", None

    download_output = run_download_script(script, pdb_id=pdb_id, **shared_args)
    if success_marker not in download_output:
        return download_output, "Download failed, cannot visualize", None

    # Visualize the downloaded structure (only plain PDB files can be plotted)
    pdb_file = f"{out_dir}/{pdb_id.lower()}.{type_val}"
    if type_val == "pdb" and os.path.exists(pdb_file):
        viz_status, viz_fig = visualize_protein_structure(pdb_file)
        return download_output, viz_status, viz_fig
    return download_output, f"Cannot visualize {type_val} format or file not found", None

struct_btn.click(
    fn=handle_struct_download,
    inputs=[struct_method, struct_id, struct_file, struct_out, struct_type, struct_unzip, struct_error],
    outputs=[struct_output, struct_viz_status, struct_viz]
)
def handle_af_download(method, id_val, file_val, out_dir, index_level, error):
    """
    Handle AlphaFold structure download and visualization

    Args:
        method: Download method (Single ID or From File)
        id_val: UniProt ID for single download
        file_val: File path for batch download
        out_dir: Output directory
        index_level: Index level for directory structure
        error: Whether to save error file

    Returns:
        Tuple containing download output, visualization status, and Plotly figure.
        The figure is None unless a single structure was downloaded and plotted.
    """
    import glob

    script = "structure/download_alphafold.py"
    success_marker = "Download completed successfully"
    shared_args = dict(
        out_dir=out_dir,
        index_level=index_level,
        error_file=f"{out_dir}/failed.txt" if error else None,
    )

    if method != "Single ID":
        download_output = run_download_script(script, uniprot_id_file=file_val, **shared_args)
        # Do not announce completion when the script reported an error.
        if success_marker not in download_output:
            return download_output, "Download failed, cannot visualize", None
        return download_output, "Batch download completed, select a single ID to visualize", None

    # Tolerate a missing value and stray whitespace typed around the ID.
    uniprot_id = (id_val or "").strip()
    if not uniprot_id:
        return "No UniProt ID provided", "Nothing to visualize", None

    download_output = run_download_script(script, uniprot_id=uniprot_id, **shared_args)
    if success_marker not in download_output:
        return download_output, "Download failed, cannot visualize", None

    # Try the file names used so far first ...
    possible_paths = [
        f"{out_dir}/AF-{uniprot_id}-F1-model_v4.pdb",
        f"{out_dir}/{uniprot_id}.pdb",
    ]
    # ... then any other model version of the same entry, so a version bump of
    # the AlphaFold DB file name does not silently disable the preview.
    versioned_pattern = os.path.join(
        glob.escape(out_dir), f"AF-{glob.escape(uniprot_id)}-F1-model_v*.pdb"
    )
    possible_paths.extend(sorted(glob.glob(versioned_pattern), reverse=True))

    for pdb_file in possible_paths:
        if os.path.exists(pdb_file):
            viz_status, viz_fig = visualize_protein_structure(pdb_file)
            return download_output, viz_status, viz_fig
    return download_output, "PDB file not found in expected locations", None

af_btn.click(
    fn=handle_af_download,
    inputs=[af_method, af_id, af_file, af_out, af_index_level, af_error],
    outputs=[af_output, af_viz_status, af_viz]
)
# Nothing built above is handed back to the caller; the result is always an empty dict.
return {}