import base64
import io
import os
import subprocess
import sys
import traceback
from typing import Any, Callable, Dict, List, Optional, Tuple

import gradio as gr
import numpy as np
import plotly.graph_objects as go
from Bio.PDB import PDBParser

# Text that _run_download_script puts at the start of a successful result.
# The structure handlers search for it to decide whether to visualize.
_SUCCESS_MARKER = "Download completed successfully"

# Radio choice shared by every sub-tab for the "one identifier" mode.
_SINGLE_ID = "Single ID"

# Marker colors for the "All Atoms" trace, keyed by element symbol.
# 'CA' is keyed by atom *name* and takes precedence over the element color.
_ATOM_COLORS = {
    'C': '#333333',   # Dark gray for carbon
    'N': '#3050F8',   # Blue for nitrogen
    'O': '#FF2010',   # Red for oxygen
    'S': '#FFFF30',   # Yellow for sulfur
    'P': '#FF8000',   # Orange for phosphorus
    'H': '#E0E0E0',   # Light gray for hydrogen
    'CA': '#00FF00',  # Green for alpha carbon
}
_DEFAULT_ATOM_COLOR = '#808080'

# Line colors for per-chain backbone traces; reused cyclically.
_CHAIN_COLORS = [
    '#1F77B4', '#FF7F0E', '#2CA02C', '#D62728', '#9467BD',
    '#8C564B', '#E377C2', '#7F7F7F', '#BCBD22', '#17BECF',
]


def _build_download_command(script_name: str, **kwargs: Any) -> List[str]:
    """
    Build the argv list used to launch a crawler script.

    Args:
        script_name: Script path relative to ``src/crawler/``
        **kwargs: Command-line options. ``None`` values are skipped, booleans
            become bare ``--name`` flags (only when true), a string equal to
            ``--name`` is also treated as a bare flag, and anything else is
            emitted as ``--name value``.

    Returns:
        The command as a list of strings
    """
    # Use the interpreter running this app so the script sees the same
    # environment; "python" on PATH may be a different one or missing.
    cmd = [sys.executable or "python", f"src/crawler/{script_name}"]
    for key, value in kwargs.items():
        if value is None:
            continue
        if isinstance(value, bool):
            if value:
                cmd.append(f"--{key}")
        elif value == f"--{key}":
            # Flag given by its literal spelling (e.g. merge="--merge");
            # emit it once instead of "--merge --merge".
            cmd.append(value)
        else:
            cmd.extend([f"--{key}", str(value)])
    return cmd


def _run_download_script(script_name: str, **kwargs: Any) -> str:
    """
    Run a download script with the specified arguments.

    Args:
        script_name: Name of the script to run (relative to ``src/crawler/``)
        **kwargs: Arguments to pass to the script (see _build_download_command)

    Returns:
        A status message followed by the script's output. Failures are
        reported in the returned string; this function does not raise.
    """
    cmd = _build_download_command(script_name, **kwargs)
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        return f"Failed to run download script: {str(e)}"

    if result.returncode == 0:
        return f"{_SUCCESS_MARKER}\n{result.stdout}"
    # Fall back to stdout when stderr is empty so the message is never blank.
    return f"Error during download:\n{result.stderr or result.stdout}"


def _error_file(out_dir: str, enabled: bool) -> Optional[str]:
    """Return the failed-ID log path inside out_dir, or None when disabled."""
    return f"{out_dir}/failed.txt" if enabled else None


def _visibility_updater(single_input: Any, batch_input: Any, batch_label: str) -> Callable[[str], Dict[Any, Any]]:
    """
    Create a Radio ``change`` callback that shows exactly one of two inputs.

    Args:
        single_input: Component shown when the method is "Single ID"
        batch_input: Component shown when the method equals batch_label
        batch_label: Radio choice that selects the batch input

    Returns:
        Callback mapping the selected method to per-component updates
    """
    def update_visibility(method: str) -> Dict[Any, Any]:
        return {
            single_input: gr.update(visible=(method == _SINGLE_ID)),
            batch_input: gr.update(visible=(method == batch_label)),
        }

    return update_visibility


def _visualize_protein_structure(pdb_file: str) -> Tuple[str, Optional[go.Figure]]:
    """
    Visualize a protein structure from a PDB file as an interactive 3D figure.

    The figure holds one line trace per chain (through its CA atoms), one
    marker trace with every CA atom, and one marker trace with all atoms that
    is hidden until enabled from the legend or the "All Atoms" button.

    Args:
        pdb_file: Path to the PDB file

    Returns:
        Tuple of a status message and the Plotly figure; the figure is None
        when the file is missing or cannot be processed.
    """
    try:
        if not os.path.exists(pdb_file):
            return f"File not found: {pdb_file}", None

        structure = PDBParser(QUIET=True).get_structure("protein", pdb_file)

        # Coordinates, hover text and colors for the "All Atoms" trace
        all_atoms_x, all_atoms_y, all_atoms_z = [], [], []
        all_atoms_text = []
        all_atoms_color = []

        # CA atoms of every chain, for the "Backbone" marker trace
        ca_x, ca_y, ca_z = [], [], []
        ca_text = []

        # Chain id -> index into _CHAIN_COLORS, assigned in order of appearance
        chain_color_index = {}

        fig = go.Figure()

        # Number of traces added before "All Atoms"; sizes the button
        # visibility arrays below.
        backbone_trace_count = 0

        for model in structure:
            for chain in model:
                chain_id = chain.get_id()
                if chain_id not in chain_color_index:
                    chain_color_index[chain_id] = len(chain_color_index) % len(_CHAIN_COLORS)
                chain_color = _CHAIN_COLORS[chain_color_index[chain_id]]

                # CA atoms of this chain only, for its own line trace
                chain_ca_x, chain_ca_y, chain_ca_z = [], [], []
                chain_ca_text = []

                for residue in chain:
                    res_name = residue.get_resname()
                    res_num = residue.get_id()[1]
                    residue_label = f"Chain {chain_id}, {res_name} {res_num}"

                    if 'CA' in residue:
                        ca = residue['CA'].get_coord()
                        chain_ca_x.append(ca[0])
                        chain_ca_y.append(ca[1])
                        chain_ca_z.append(ca[2])
                        chain_ca_text.append(residue_label)

                        ca_x.append(ca[0])
                        ca_y.append(ca[1])
                        ca_z.append(ca[2])
                        ca_text.append(residue_label)

                    for atom in residue:
                        coord = atom.get_coord()
                        all_atoms_x.append(coord[0])
                        all_atoms_y.append(coord[1])
                        all_atoms_z.append(coord[2])

                        atom_name = atom.get_name()
                        all_atoms_text.append(f"{residue_label}, {atom_name}")

                        # Atoms named CA get the highlight color; everything
                        # else is colored by element.
                        if atom_name == 'CA':
                            all_atoms_color.append(_ATOM_COLORS['CA'])
                        else:
                            all_atoms_color.append(_ATOM_COLORS.get(atom.element, _DEFAULT_ATOM_COLOR))

                # Chains without any CA atom get no line trace
                if chain_ca_x:
                    fig.add_trace(go.Scatter3d(
                        x=chain_ca_x,
                        y=chain_ca_y,
                        z=chain_ca_z,
                        mode='lines',
                        name=f'Chain {chain_id}',
                        line=dict(color=chain_color, width=8),
                        text=chain_ca_text,
                        hoverinfo='text',
                        showlegend=True
                    ))
                    backbone_trace_count += 1

        # CA atoms as markers
        fig.add_trace(go.Scatter3d(
            x=ca_x,
            y=ca_y,
            z=ca_z,
            mode='markers',
            name='Backbone',
            marker=dict(
                size=7,
                color='#00FF00',
                opacity=0.8,
                symbol='circle'
            ),
            text=ca_text,
            hoverinfo='text',
            showlegend=True
        ))
        backbone_trace_count += 1

        # All atoms as small markers
        fig.add_trace(go.Scatter3d(
            x=all_atoms_x,
            y=all_atoms_y,
            z=all_atoms_z,
            mode='markers',
            name='All Atoms',
            marker=dict(
                size=2.5,
                color=all_atoms_color,
                opacity=0.6
            ),
            text=all_atoms_text,
            hoverinfo='text',
            showlegend=True,
            visible='legendonly'  # Hidden by default, can be toggled in legend
        ))

        # The title uses the file name up to its first dot
        pdb_id = os.path.basename(pdb_file).split('.')[0]
        fig.update_layout(
            title=dict(
                text=f"Structure: {pdb_id}",
                font=dict(size=20, family="Arial, sans-serif")
            ),
            scene=dict(
                xaxis=dict(title='X (Å)', showbackground=False, showgrid=True, gridcolor='lightgray'),
                yaxis=dict(title='Y (Å)', showbackground=False, showgrid=True, gridcolor='lightgray'),
                zaxis=dict(title='Z (Å)', showbackground=False, showgrid=True, gridcolor='lightgray'),
                aspectmode='data',
                camera=dict(
                    eye=dict(x=1.5, y=1.5, z=1.5)
                )
            ),
            margin=dict(l=0, r=0, b=0, t=40),
            legend=dict(
                yanchor="top",
                y=0.99,
                xanchor="left",
                x=0.01,
                bgcolor="rgba(255, 255, 255, 0.8)",
                bordercolor="lightgray",
                borderwidth=1
            ),
            template="plotly_white",
            height=600,
            width=800
        )

        # Visibility arrays follow trace insertion order: backbone traces
        # first, the "All Atoms" trace last.
        backbone_only_visibility = [True] * backbone_trace_count + [False]
        all_atoms_visibility = [True] * (backbone_trace_count + 1)

        fig.update_layout(
            updatemenus=[
                dict(
                    type="buttons",
                    direction="right",
                    buttons=[
                        dict(
                            args=[{"visible": backbone_only_visibility}],
                            label="Backbone Only",
                            method="update"
                        ),
                        dict(
                            args=[{"visible": all_atoms_visibility}],
                            label="All Atoms",
                            method="update"
                        )
                    ],
                    pad={"r": 10, "t": 10},
                    showactive=True,
                    x=0.1,
                    xanchor="left",
                    y=1.1,
                    yanchor="top",
                    bgcolor="rgba(255, 255, 255, 0.8)",
                    bordercolor="lightgray",
                    borderwidth=1
                ),
            ]
        )

        return f"Successfully visualized structure from {pdb_file}", fig
    except Exception as e:
        # UI boundary: log the traceback and show the error in the status box
        # rather than letting the callback fail.
        error_details = traceback.format_exc()
        print(f"Error visualizing structure: {str(e)}\n{error_details}")
        return f"Error visualizing structure: {str(e)}", None


def _handle_interpro_download(method, id_val, json_val, out_dir, error):
    """Handle InterPro data download and return the script output."""
    source = {"interpro_id": id_val} if method == _SINGLE_ID else {"interpro_json": json_val}
    return _run_download_script(
        "metadata/download_interpro.py",
        **source,
        out_dir=out_dir,
        error_file=_error_file(out_dir, error)
    )


def _handle_rcsb_download(method, id_val, file_val, out_dir, error):
    """Handle RCSB metadata download and return the script output."""
    source = {"pdb_id": id_val} if method == _SINGLE_ID else {"pdb_id_file": file_val}
    return _run_download_script(
        "metadata/download_rcsb.py",
        **source,
        out_dir=out_dir,
        error_file=_error_file(out_dir, error)
    )


def _handle_uniprot_download(method, id_val, file_val, out_dir, merge, error):
    """Handle UniProt sequence download and return the script output."""
    source = {"uniprot_id": id_val} if method == _SINGLE_ID else {"file": file_val}
    return _run_download_script(
        "sequence/download_uniprot_seq.py",
        **source,
        out_dir=out_dir,
        merge=bool(merge),  # emitted as a bare --merge flag when checked
        error_file=_error_file(out_dir, error)
    )


def _batch_status(download_output: str) -> str:
    """Return the visualization status message for a batch download."""
    if _SUCCESS_MARKER in download_output:
        return "Batch download completed, select a single ID to visualize"
    return "Download failed, cannot visualize"


def _handle_struct_download(method, id_val, file_val, out_dir, type_val, unzip, error):
    """
    Handle RCSB structure download and visualization.

    Args:
        method: Download method (Single ID or From File)
        id_val: PDB ID for single download
        file_val: File path for batch download
        out_dir: Output directory
        type_val: Structure file type
        unzip: Whether to unzip downloaded files
        error: Whether to save error file

    Returns:
        Tuple containing download output, visualization status, and Plotly
        figure (None when nothing is visualized)
    """
    options = dict(
        out_dir=out_dir,
        type=type_val,
        # Passed as a bool so the command gets a single --unzip flag.
        unzip=bool(unzip),
        error_file=_error_file(out_dir, error),
    )

    if method != _SINGLE_ID:
        download_output = _run_download_script(
            "structure/download_rcsb.py", pdb_id_file=file_val, **options
        )
        return download_output, _batch_status(download_output), None

    # Stray whitespace would break the file lookup below.
    pdb_id = (id_val or "").strip()
    download_output = _run_download_script(
        "structure/download_rcsb.py", pdb_id=pdb_id, **options
    )
    if _SUCCESS_MARKER not in download_output:
        return download_output, "Download failed, cannot visualize", None

    # NOTE(review): assumes the script saves the file under the lower-cased
    # ID - confirm against structure/download_rcsb.py.
    pdb_file = f"{out_dir}/{pdb_id.lower()}.{type_val}"
    if type_val == "pdb" and os.path.exists(pdb_file):
        viz_status, viz_fig = _visualize_protein_structure(pdb_file)
        return download_output, viz_status, viz_fig
    return download_output, f"Cannot visualize {type_val} format or file not found", None


def _handle_af_download(method, id_val, file_val, out_dir, index_level, error):
    """
    Handle AlphaFold structure download and visualization.

    Args:
        method: Download method (Single ID or From File)
        id_val: UniProt ID for single download
        file_val: File path for batch download
        out_dir: Output directory
        index_level: Index level for directory structure
        error: Whether to save error file

    Returns:
        Tuple containing download output, visualization status, and Plotly
        figure (None when nothing is visualized)
    """
    options = dict(
        out_dir=out_dir,
        # Force an integer so the CLI never receives a value like "1.0".
        index_level=int(index_level) if index_level is not None else None,
        error_file=_error_file(out_dir, error),
    )

    if method != _SINGLE_ID:
        download_output = _run_download_script(
            "structure/download_alphafold.py", uniprot_id_file=file_val, **options
        )
        return download_output, _batch_status(download_output), None

    # Stray whitespace would break the file lookup below.
    uniprot_id = (id_val or "").strip()
    download_output = _run_download_script(
        "structure/download_alphafold.py", uniprot_id=uniprot_id, **options
    )
    if _SUCCESS_MARKER not in download_output:
        return download_output, "Download failed, cannot visualize", None

    # Only these two locations directly inside out_dir are checked.
    possible_paths = [
        f"{out_dir}/AF-{uniprot_id}-F1-model_v4.pdb",
        f"{out_dir}/{uniprot_id}.pdb",
    ]
    for pdb_file in possible_paths:
        if os.path.exists(pdb_file):
            viz_status, viz_fig = _visualize_protein_structure(pdb_file)
            return download_output, viz_status, viz_fig
    return download_output, "PDB file not found in expected locations", None


def create_download_tab(constant: Dict[str, Any]) -> Dict[str, Any]:
    """
    Create the download tab with various options for downloading protein data.

    Builds one sub-tab each for InterPro metadata, RCSB metadata, UniProt
    sequences, RCSB structures and AlphaFold2 structures, and wires each
    button to the matching crawler script under ``src/crawler/``. The two
    structure sub-tabs also plot the downloaded PDB file.

    Args:
        constant: Dictionary containing constant values for the application
            (not read by this function)

    Returns:
        Dictionary containing any state information (currently always empty)
    """
    with gr.Tab("Download"):
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Download Protein Data (See help for more details)")

                # InterPro Metadata tab
                with gr.Tab("InterPro Metadata"):
                    with gr.Row():
                        interpro_method = gr.Radio(
                            choices=["Single ID", "From JSON"],
                            label="Download Method",
                            value="Single ID"
                        )
                    with gr.Column():
                        interpro_id = gr.Textbox(label="InterPro ID", value="IPR000001")
                        interpro_json = gr.Textbox(label="InterPro JSON Path", value="download/interpro_domain/interpro_json.customization", visible=False)
                        interpro_out = gr.Textbox(label="Output Directory", value="download/interpro_domain")
                        interpro_error = gr.Checkbox(label="Save error file", value=True)
                        interpro_btn = gr.Button("Download InterPro Data")
                        interpro_output = gr.Textbox(label="Output", interactive=False)

                    interpro_method.change(
                        fn=_visibility_updater(interpro_id, interpro_json, "From JSON"),
                        inputs=[interpro_method],
                        outputs=[interpro_id, interpro_json]
                    )

                # RCSB Metadata tab
                with gr.Tab("RCSB Metadata"):
                    with gr.Row():
                        rcsb_method = gr.Radio(
                            choices=["Single ID", "From File"],
                            label="Download Method",
                            value="Single ID"
                        )
                    with gr.Column():
                        rcsb_id = gr.Textbox(label="PDB ID", value="1a0j")
                        rcsb_file = gr.Textbox(label="PDB List File", value="download/rcsb.txt", visible=False)
                        rcsb_out = gr.Textbox(label="Output Directory", value="download/rcsb_metadata")
                        rcsb_error = gr.Checkbox(label="Save error file", value=True)
                        rcsb_btn = gr.Button("Download RCSB Metadata")
                        rcsb_output = gr.Textbox(label="Output", interactive=False)

                    rcsb_method.change(
                        fn=_visibility_updater(rcsb_id, rcsb_file, "From File"),
                        inputs=[rcsb_method],
                        outputs=[rcsb_id, rcsb_file]
                    )

                # UniProt Sequences tab
                with gr.Tab("UniProt Sequences"):
                    with gr.Row():
                        uniprot_method = gr.Radio(
                            choices=["Single ID", "From File"],
                            label="Download Method",
                            value="Single ID"
                        )
                    with gr.Column():
                        uniprot_id = gr.Textbox(label="UniProt ID", value="P00734")
                        uniprot_file = gr.Textbox(label="UniProt ID List File", value="download/uniprot.txt", visible=False)
                        uniprot_out = gr.Textbox(label="Output Directory", value="download/uniprot_sequences")
                        uniprot_merge = gr.Checkbox(label="Merge into single FASTA", value=False)
                        uniprot_error = gr.Checkbox(label="Save error file", value=True)
                        uniprot_btn = gr.Button("Download UniProt Sequences")
                        uniprot_output = gr.Textbox(label="Output", interactive=False)

                    uniprot_method.change(
                        fn=_visibility_updater(uniprot_id, uniprot_file, "From File"),
                        inputs=[uniprot_method],
                        outputs=[uniprot_id, uniprot_file]
                    )

                # RCSB Structures tab
                with gr.Tab("RCSB Structures"):
                    with gr.Row():
                        # Left column for inputs
                        with gr.Column(scale=3):
                            with gr.Group():
                                struct_method = gr.Radio(
                                    choices=["Single ID", "From File"],
                                    label="Download Method",
                                    value="Single ID"
                                )
                                with gr.Row():
                                    struct_id = gr.Textbox(label="PDB ID", value="1a0j")
                                with gr.Row():
                                    struct_file = gr.Textbox(label="PDB List File", value="download/rcsb.txt", visible=False)
                                with gr.Row():
                                    struct_out = gr.Textbox(label="Output Directory", value="download/rcsb_structures")
                                with gr.Row():
                                    struct_type = gr.Dropdown(
                                        choices=["cif", "pdb", "pdb1", "xml", "sf", "mr", "mrstr"],
                                        value="pdb",
                                        label="Structure Type"
                                    )
                                with gr.Row():
                                    with gr.Column(scale=1):
                                        struct_unzip = gr.Checkbox(label="Unzip downloaded files", value=True)
                                    with gr.Column(scale=1):
                                        struct_error = gr.Checkbox(label="Save error file", value=True)
                                with gr.Row():
                                    struct_btn = gr.Button("Download RCSB Structures", size="lg")

                            # Output section
                            struct_output = gr.Textbox(label="Download Output", interactive=False, lines=4)
                            struct_viz_status = gr.Textbox(label="Visualization Status", interactive=False)

                        # Right column for visualization
                        with gr.Column(scale=5):
                            struct_viz = gr.Plot(label="Structure Visualization", elem_id="struct_viz_plot")

                    struct_method.change(
                        fn=_visibility_updater(struct_id, struct_file, "From File"),
                        inputs=[struct_method],
                        outputs=[struct_id, struct_file]
                    )

                # AlphaFold2 Structures tab
                with gr.Tab("AlphaFold2 Structures"):
                    with gr.Row():
                        # Left column for inputs
                        with gr.Column(scale=3):
                            with gr.Group():
                                af_method = gr.Radio(
                                    choices=["Single ID", "From File"],
                                    label="Download Method",
                                    value="Single ID"
                                )
                                with gr.Row():
                                    af_id = gr.Textbox(label="UniProt ID", value="P00734")
                                with gr.Row():
                                    af_file = gr.Textbox(label="UniProt ID List File", value="download/uniprot.txt", visible=False)
                                with gr.Row():
                                    af_out = gr.Textbox(label="Output Directory", value="download/alphafold2_structures")
                                with gr.Row():
                                    af_index_level = gr.Number(label="Index Level", value=0, precision=0)
                                with gr.Row():
                                    af_error = gr.Checkbox(label="Save error file", value=True)
                                with gr.Row():
                                    af_btn = gr.Button("Download AlphaFold Structures", size="lg")

                            # Output section
                            af_output = gr.Textbox(label="Download Output", interactive=False, lines=4)
                            af_viz_status = gr.Textbox(label="Visualization Status", interactive=False)

                        # Right column for visualization
                        with gr.Column(scale=5):
                            af_viz = gr.Plot(label="Structure Visualization", elem_id="af_viz_plot")

                    af_method.change(
                        fn=_visibility_updater(af_id, af_file, "From File"),
                        inputs=[af_method],
                        outputs=[af_id, af_file]
                    )

                # Wire the download buttons to their handlers
                interpro_btn.click(
                    fn=_handle_interpro_download,
                    inputs=[interpro_method, interpro_id, interpro_json, interpro_out, interpro_error],
                    outputs=interpro_output
                )

                rcsb_btn.click(
                    fn=_handle_rcsb_download,
                    inputs=[rcsb_method, rcsb_id, rcsb_file, rcsb_out, rcsb_error],
                    outputs=rcsb_output
                )

                uniprot_btn.click(
                    fn=_handle_uniprot_download,
                    inputs=[uniprot_method, uniprot_id, uniprot_file, uniprot_out, uniprot_merge, uniprot_error],
                    outputs=uniprot_output
                )

                struct_btn.click(
                    fn=_handle_struct_download,
                    inputs=[struct_method, struct_id, struct_file, struct_out, struct_type, struct_unzip, struct_error],
                    outputs=[struct_output, struct_viz_status, struct_viz]
                )

                af_btn.click(
                    fn=_handle_af_download,
                    inputs=[af_method, af_id, af_file, af_out, af_index_level, af_error],
                    outputs=[af_output, af_viz_status, af_viz]
                )

    return {}