Spaces:

2dogey
/

VenusFactory

Runtime error

App Files Files Community

VenusFactory / src /web /download_tab.py

2dogey

Upload folder using huggingface_hub

8918ac7 verified about 1 month ago

raw

history blame contribute delete

29.7 kB

	import gradio as gr
	import os
	import subprocess
	import numpy as np
	import plotly.graph_objects as go
	from Bio.PDB import PDBParser
	import io
	import base64
	from typing import Dict, Any, Tuple, List, Optional

	def create_download_tab(constant: Dict[str, Any]) -> Dict[str, Any]:
	"""
	Build the "Download" tab of the Gradio interface.

	The tab contains five sub-tabs (InterPro Metadata, RCSB Metadata,
	UniProt Sequences, RCSB Structures, AlphaFold2 Structures). Each one
	collects its inputs, runs a script under ``src/crawler`` in a
	subprocess when its button is clicked, and shows the script output.
	The two structure sub-tabs additionally plot a downloaded PDB file.

	Args:
	constant: Dictionary containing constant values for the application.
	It is accepted for interface compatibility; the body of this
	function does not read it.

	Returns:
	An empty dictionary (this tab exposes no state to the caller).
	"""
	def run_download_script(script_name: str, **kwargs) -> str:
	    """
	    Run a script from ``src/crawler`` in a subprocess and report the result.

	    Keyword arguments are translated into command-line options, in the
	    order they were passed:

	    * ``None`` values are skipped.
	    * ``True`` becomes the bare flag ``--key``; ``False`` is omitted.
	    * A value that is already written as a flag (``"--merge"``, or the
	      option's own name such as ``unzip="--unzip"``) is appended once.
	    * Anything else becomes ``--key str(value)``.

	    Args:
	        script_name: Path of the script relative to ``src/crawler``
	        **kwargs: Options to pass to the script

	    Returns:
	        "Download completed successfully" followed by the script's stdout
	        when it exits with code 0, "Error during download:" followed by
	        its stderr otherwise, or "Failed to run download script: ..."
	        when the process could not be started at all.
	    """
	    # Local import so this function does not depend on the module header.
	    import sys

	    # Use the interpreter that is running this app so the script sees the
	    # same environment; fall back to PATH lookup if it is unknown.
	    cmd = [sys.executable or "python", f"src/crawler/{script_name}"]
	    for k, v in kwargs.items():
	        if v is None:  # Skip None values
	            continue
	        if isinstance(v, bool):  # Handle boolean flags
	            if v:
	                cmd.append(f"--{k}")
	        elif v == "--merge" or v == f"--{k}":
	            # The caller already spelled the flag out; emit it once instead
	            # of producing "--key --key".
	            cmd.append(v)
	        else:  # Handle regular arguments
	            cmd.extend([f"--{k}", str(v)])

	    try:
	        result = subprocess.run(cmd, capture_output=True, text=True)
	    except Exception as e:
	        # Deliberately broad: any launch failure is shown in the UI
	        # instead of crashing the click handler.
	        return f"Failed to run download script: {str(e)}"

	    if result.returncode == 0:
	        return f"Download completed successfully\n{result.stdout}"
	    return f"Error during download:\n{result.stderr}"

	# Function to visualize protein structure using Plotly
	def visualize_protein_structure(pdb_file: str) -> Tuple[str, Optional[go.Figure]]:
	    """
	    Build an interactive 3D Plotly view of the structure in a PDB file.

	    The figure contains one line trace per chain through its alpha carbons,
	    one marker trace holding every alpha carbon ("Backbone"), and one marker
	    trace holding every atom ("All Atoms", initially shown only in the
	    legend). Two buttons switch between the backbone-only and all-atom view.

	    Args:
	        pdb_file: Path to the PDB file

	    Returns:
	        Tuple of (status message, figure). The figure is None when the file
	        does not exist, when no atoms could be read from it, or when any
	        error occurs while parsing or plotting. Errors are reported through
	        the status message (and printed with a traceback), never raised.
	    """
	    # Colors for the all-atom trace, keyed by element symbol.
	    element_colors = {
	        'C': '#333333',  # Dark gray for carbon
	        'N': '#3050F8',  # Blue for nitrogen
	        'O': '#FF2010',  # Red for oxygen
	        'S': '#FFFF30',  # Yellow for sulfur
	        'P': '#FF8000',  # Orange for phosphorus
	        'H': '#E0E0E0',  # Light gray for hydrogen
	    }
	    alpha_carbon_color = '#00FF00'  # Green for alpha carbon
	    default_atom_color = '#808080'  # Gray for every other element

	    # Palette cycled through for the per-chain backbone lines.
	    chain_colors = [
	        '#1F77B4', '#FF7F0E', '#2CA02C', '#D62728', '#9467BD',
	        '#8C564B', '#E377C2', '#7F7F7F', '#BCBD22', '#17BECF'
	    ]

	    def is_alpha_carbon(atom) -> bool:
	        # Calcium ions are also named "CA" in PDB files. Bio.PDB exposes the
	        # element as an upper-case symbol, so a "CA" atom whose element is
	        # "CA" is calcium and must not be drawn as part of the backbone.
	        return atom.get_name() == 'CA' and str(atom.element).strip().upper() != 'CA'

	    try:
	        if not os.path.exists(pdb_file):
	            return f"File not found: {pdb_file}", None

	        # Parse the PDB file
	        parser = PDBParser(QUIET=True)
	        structure = parser.get_structure("protein", pdb_file)

	        # Coordinates, hover text and colors for the all-atom trace
	        all_atoms_x, all_atoms_y, all_atoms_z = [], [], []
	        all_atoms_text = []
	        all_atoms_color = []

	        # Alpha carbons of every chain, for the "Backbone" marker trace
	        ca_x, ca_y, ca_z = [], [], []
	        ca_text = []

	        # Chain id -> index into chain_colors (assigned on first sight)
	        chains = {}

	        fig = go.Figure()

	        # Number of traces added before the all-atom trace; needed to build
	        # the visibility lists of the view buttons.
	        backbone_trace_count = 0

	        for model in structure:
	            for chain in model:
	                chain_id = chain.get_id()
	                if chain_id not in chains:
	                    chains[chain_id] = len(chains) % len(chain_colors)

	                chain_color = chain_colors[chains[chain_id]]

	                # Alpha carbons of this chain only, for its line trace
	                chain_ca_x, chain_ca_y, chain_ca_z = [], [], []
	                chain_ca_text = []

	                for residue in chain:
	                    res_id = residue.get_id()
	                    res_name = residue.get_resname()
	                    res_num = res_id[1]

	                    if 'CA' in residue and is_alpha_carbon(residue['CA']):
	                        ca = residue['CA'].get_coord()
	                        residue_label = f"Chain {chain_id}, {res_name} {res_num}"

	                        chain_ca_x.append(ca[0])
	                        chain_ca_y.append(ca[1])
	                        chain_ca_z.append(ca[2])
	                        chain_ca_text.append(residue_label)

	                        # Also add to global CA lists
	                        ca_x.append(ca[0])
	                        ca_y.append(ca[1])
	                        ca_z.append(ca[2])
	                        ca_text.append(residue_label)

	                    for atom in residue:
	                        coord = atom.get_coord()
	                        all_atoms_x.append(coord[0])
	                        all_atoms_y.append(coord[1])
	                        all_atoms_z.append(coord[2])

	                        atom_name = atom.get_name()
	                        all_atoms_text.append(f"Chain {chain_id}, {res_name} {res_num}, {atom_name}")

	                        if is_alpha_carbon(atom):
	                            all_atoms_color.append(alpha_carbon_color)
	                        else:
	                            all_atoms_color.append(element_colors.get(atom.element, default_atom_color))

	                # One line trace per chain keeps separate chains unconnected.
	                if chain_ca_x:
	                    fig.add_trace(go.Scatter3d(
	                        x=chain_ca_x,
	                        y=chain_ca_y,
	                        z=chain_ca_z,
	                        mode='lines',
	                        name=f'Chain {chain_id}',
	                        line=dict(color=chain_color, width=8),
	                        text=chain_ca_text,
	                        hoverinfo='text',
	                        showlegend=True
	                    ))
	                    backbone_trace_count += 1

	        # Nothing to draw: report it instead of returning an empty plot
	        # labelled as a success.
	        if not all_atoms_x:
	            return f"No atoms found in {pdb_file}", None

	        # Add backbone trace (CA atoms as markers)
	        fig.add_trace(go.Scatter3d(
	            x=ca_x,
	            y=ca_y,
	            z=ca_z,
	            mode='markers',
	            name='Backbone',
	            marker=dict(
	                size=7,
	                color=alpha_carbon_color,
	                opacity=0.8,
	                symbol='circle'
	            ),
	            text=ca_text,
	            hoverinfo='text',
	            showlegend=True
	        ))
	        backbone_trace_count += 1

	        # Add all atoms as small markers. This must stay the LAST trace: the
	        # visibility lists below rely on that position.
	        fig.add_trace(go.Scatter3d(
	            x=all_atoms_x,
	            y=all_atoms_y,
	            z=all_atoms_z,
	            mode='markers',
	            name='All Atoms',
	            marker=dict(
	                size=2.5,
	                color=all_atoms_color,
	                opacity=0.6
	            ),
	            text=all_atoms_text,
	            hoverinfo='text',
	            showlegend=True,
	            visible='legendonly'  # Hide by default, can be toggled in legend
	        ))

	        # Set layout properties; the title uses the file name up to its
	        # first dot.
	        pdb_id = os.path.basename(pdb_file).split('.')[0]
	        fig.update_layout(
	            title=dict(
	                text=f"Structure: {pdb_id}",
	                font=dict(size=20, family="Arial, sans-serif")
	            ),
	            scene=dict(
	                xaxis=dict(title='X (Å)', showbackground=False, showgrid=True, gridcolor='lightgray'),
	                yaxis=dict(title='Y (Å)', showbackground=False, showgrid=True, gridcolor='lightgray'),
	                zaxis=dict(title='Z (Å)', showbackground=False, showgrid=True, gridcolor='lightgray'),
	                aspectmode='data',
	                camera=dict(
	                    eye=dict(x=1.5, y=1.5, z=1.5)
	                )
	            ),
	            margin=dict(l=0, r=0, b=0, t=40),
	            legend=dict(
	                yanchor="top",
	                y=0.99,
	                xanchor="left",
	                x=0.01,
	                bgcolor="rgba(255, 255, 255, 0.8)",
	                bordercolor="lightgray",
	                borderwidth=1
	            ),
	            template="plotly_white",
	            height=600,
	            width=800
	        )

	        # "Backbone Only": every backbone trace visible, all-atom trace hidden
	        backbone_only_visibility = [True] * backbone_trace_count + [False]
	        # "All Atoms": every trace visible
	        all_atoms_visibility = [True] * (backbone_trace_count + 1)

	        # Add buttons for different views
	        fig.update_layout(
	            updatemenus=[
	                dict(
	                    type="buttons",
	                    direction="right",
	                    buttons=[
	                        dict(
	                            args=[{"visible": backbone_only_visibility}],
	                            label="Backbone Only",
	                            method="update"
	                        ),
	                        dict(
	                            args=[{"visible": all_atoms_visibility}],
	                            label="All Atoms",
	                            method="update"
	                        )
	                    ],
	                    pad={"r": 10, "t": 10},
	                    showactive=True,
	                    x=0.1,
	                    xanchor="left",
	                    y=1.1,
	                    yanchor="top",
	                    bgcolor="rgba(255, 255, 255, 0.8)",
	                    bordercolor="lightgray",
	                    borderwidth=1
	                ),
	            ]
	        )

	        return f"Successfully visualized structure from {pdb_file}", fig

	    except Exception as e:
	        # Deliberately broad: a malformed file must surface as a status
	        # message in the UI, not as an unhandled error in the click handler.
	        import traceback
	        error_details = traceback.format_exc()
	        print(f"Error visualizing structure: {str(e)}\n{error_details}")
	        return f"Error visualizing structure: {str(e)}", None

	# Create the main download tab. It opens with a heading; the sub-tabs for
	# the individual data sources follow.
	with gr.Tab("Download"):
	with gr.Row():
	with gr.Column():
	gr.Markdown("### Download Protein Data (See help for more details)")

	# InterPro Metadata tab
	# The radio selects between a single InterPro ID and a JSON path; its
	# change handler shows only the text box that matches the selection.
	with gr.Tab("InterPro Metadata"):
	with gr.Row():
	interpro_method = gr.Radio(
	choices=["Single ID", "From JSON"],
	label="Download Method",
	value="Single ID"
	)

	with gr.Column():
	interpro_id = gr.Textbox(label="InterPro ID", value="IPR000001")
	# Created hidden because the default method is "Single ID".
	interpro_json = gr.Textbox(label="InterPro JSON Path", value="download/interpro_domain/interpro_json.customization", visible=False)
	interpro_out = gr.Textbox(label="Output Directory", value="download/interpro_domain")
	# When ticked, the handler passes "<output dir>/failed.txt" as error file.
	interpro_error = gr.Checkbox(label="Save error file", value=True)
	interpro_btn = gr.Button("Download InterPro Data")
	# Read-only box that receives the text returned by the download handler.
	interpro_output = gr.Textbox(label="Output", interactive=False)

	def update_interpro_visibility(method):
	    """Show only the InterPro input box that belongs to the selected method."""
	    # Each input is paired with the radio value that makes it visible.
	    shown_for = {
	        interpro_id: "Single ID",
	        interpro_json: "From JSON",
	    }
	    return {
	        component: gr.update(visible=(method == trigger))
	        for component, trigger in shown_for.items()
	    }

	# Re-evaluate the visibility every time the radio selection changes.
	interpro_method.change(
	    fn=update_interpro_visibility,
	    inputs=[interpro_method],
	    outputs=[interpro_id, interpro_json],
	)

	# RCSB Metadata tab
	# The radio selects between a single PDB ID and a list file; its change
	# handler shows only the text box that matches the selection.
	with gr.Tab("RCSB Metadata"):
	with gr.Row():
	rcsb_method = gr.Radio(
	choices=["Single ID", "From File"],
	label="Download Method",
	value="Single ID"
	)

	with gr.Column():
	rcsb_id = gr.Textbox(label="PDB ID", value="1a0j")
	# Created hidden because the default method is "Single ID".
	rcsb_file = gr.Textbox(label="PDB List File", value="download/rcsb.txt", visible=False)
	rcsb_out = gr.Textbox(label="Output Directory", value="download/rcsb_metadata")
	# When ticked, the handler passes "<output dir>/failed.txt" as error file.
	rcsb_error = gr.Checkbox(label="Save error file", value=True)
	rcsb_btn = gr.Button("Download RCSB Metadata")
	# Read-only box that receives the text returned by the download handler.
	rcsb_output = gr.Textbox(label="Output", interactive=False)

	def update_rcsb_visibility(method):
	    """Show only the RCSB metadata input box that belongs to the selected method."""
	    # Each input is paired with the radio value that makes it visible.
	    shown_for = {
	        rcsb_id: "Single ID",
	        rcsb_file: "From File",
	    }
	    return {
	        component: gr.update(visible=(method == trigger))
	        for component, trigger in shown_for.items()
	    }

	# Re-evaluate the visibility every time the radio selection changes.
	rcsb_method.change(
	    fn=update_rcsb_visibility,
	    inputs=[rcsb_method],
	    outputs=[rcsb_id, rcsb_file],
	)

	# UniProt Sequences tab
	# The radio selects between a single UniProt ID and a list file; its change
	# handler shows only the text box that matches the selection.
	with gr.Tab("UniProt Sequences"):
	with gr.Row():
	uniprot_method = gr.Radio(
	choices=["Single ID", "From File"],
	label="Download Method",
	value="Single ID"
	)

	with gr.Column():
	uniprot_id = gr.Textbox(label="UniProt ID", value="P00734")
	# Created hidden because the default method is "Single ID".
	uniprot_file = gr.Textbox(label="UniProt ID List File", value="download/uniprot.txt", visible=False)
	uniprot_out = gr.Textbox(label="Output Directory", value="download/uniprot_sequences")
	# When ticked, the handler adds the "--merge" flag to the script call.
	uniprot_merge = gr.Checkbox(label="Merge into single FASTA", value=False)
	# When ticked, the handler passes "<output dir>/failed.txt" as error file.
	uniprot_error = gr.Checkbox(label="Save error file", value=True)
	uniprot_btn = gr.Button("Download UniProt Sequences")
	# Read-only box that receives the text returned by the download handler.
	uniprot_output = gr.Textbox(label="Output", interactive=False)

	def update_uniprot_visibility(method):
	    """Show only the UniProt input box that belongs to the selected method."""
	    # Each input is paired with the radio value that makes it visible.
	    shown_for = {
	        uniprot_id: "Single ID",
	        uniprot_file: "From File",
	    }
	    return {
	        component: gr.update(visible=(method == trigger))
	        for component, trigger in shown_for.items()
	    }

	# Re-evaluate the visibility every time the radio selection changes.
	uniprot_method.change(
	    fn=update_uniprot_visibility,
	    inputs=[uniprot_method],
	    outputs=[uniprot_id, uniprot_file],
	)

	# RCSB Structures tab
	# Inputs and status boxes sit in a narrower column (scale 3); the plot of
	# the downloaded structure sits in a wider column (scale 5).
	with gr.Tab("RCSB Structures"):
	with gr.Row():
	# Left column for inputs
	with gr.Column(scale=3):
	with gr.Group(): # Group for better visual separation
	struct_method = gr.Radio(
	choices=["Single ID", "From File"],
	label="Download Method",
	value="Single ID"
	)

	# Input parameters section with consistent spacing
	with gr.Row():
	struct_id = gr.Textbox(label="PDB ID", value="1a0j")

	# Created hidden because the default method is "Single ID".
	with gr.Row():
	struct_file = gr.Textbox(label="PDB List File", value="download/rcsb.txt", visible=False)

	with gr.Row():
	struct_out = gr.Textbox(label="Output Directory", value="download/rcsb_structures")

	# The download handler only attempts a visualization for type "pdb".
	with gr.Row():
	struct_type = gr.Dropdown(
	choices=["cif", "pdb", "pdb1", "xml", "sf", "mr", "mrstr"],
	value="pdb",
	label="Structure Type"
	)

	with gr.Row():
	with gr.Column(scale=1):
	struct_unzip = gr.Checkbox(label="Unzip downloaded files", value=True)
	with gr.Column(scale=1):
	struct_error = gr.Checkbox(label="Save error file", value=True)

	with gr.Row():
	struct_btn = gr.Button("Download RCSB Structures", size="lg")

	# Output section: script output and the status of the plotting step
	struct_output = gr.Textbox(label="Download Output", interactive=False, lines=4)
	struct_viz_status = gr.Textbox(label="Visualization Status", interactive=False)

	# Right column for visualization
	with gr.Column(scale=5):
	# Visualization section with full height
	struct_viz = gr.Plot(label="Structure Visualization", elem_id="struct_viz_plot")

	def update_struct_visibility(method):
	    """Show only the RCSB structure input box that belongs to the selected method."""
	    # Each input is paired with the radio value that makes it visible.
	    shown_for = {
	        struct_id: "Single ID",
	        struct_file: "From File",
	    }
	    return {
	        component: gr.update(visible=(method == trigger))
	        for component, trigger in shown_for.items()
	    }

	# Re-evaluate the visibility every time the radio selection changes.
	struct_method.change(
	    fn=update_struct_visibility,
	    inputs=[struct_method],
	    outputs=[struct_id, struct_file],
	)

	# AlphaFold2 Structures tab
	# Inputs and status boxes sit in a narrower column (scale 3); the plot of
	# the downloaded structure sits in a wider column (scale 5).
	with gr.Tab("AlphaFold2 Structures"):
	with gr.Row():
	# Left column for inputs
	with gr.Column(scale=3):
	with gr.Group(): # Group for better visual separation
	af_method = gr.Radio(
	choices=["Single ID", "From File"],
	label="Download Method",
	value="Single ID"
	)

	# Input parameters section with consistent spacing
	with gr.Row():
	af_id = gr.Textbox(label="UniProt ID", value="P00734")

	# Created hidden because the default method is "Single ID".
	with gr.Row():
	af_file = gr.Textbox(label="UniProt ID List File", value="download/uniprot.txt", visible=False)

	with gr.Row():
	af_out = gr.Textbox(label="Output Directory", value="download/alphafold2_structures")

	# Forwarded unchanged to the download script as --index_level;
	# precision=0 restricts the field to whole numbers.
	with gr.Row():
	af_index_level = gr.Number(label="Index Level", value=0, precision=0)

	with gr.Row():
	af_error = gr.Checkbox(label="Save error file", value=True)

	with gr.Row():
	af_btn = gr.Button("Download AlphaFold Structures", size="lg")

	# Output section: script output and the status of the plotting step
	af_output = gr.Textbox(label="Download Output", interactive=False, lines=4)
	af_viz_status = gr.Textbox(label="Visualization Status", interactive=False)

	# Right column for visualization
	with gr.Column(scale=5):
	# Visualization section with full height
	af_viz = gr.Plot(label="Structure Visualization", elem_id="af_viz_plot")

	def update_af_visibility(method):
	    """Show only the AlphaFold input box that belongs to the selected method."""
	    # Each input is paired with the radio value that makes it visible.
	    shown_for = {
	        af_id: "Single ID",
	        af_file: "From File",
	    }
	    return {
	        component: gr.update(visible=(method == trigger))
	        for component, trigger in shown_for.items()
	    }

	# Re-evaluate the visibility every time the radio selection changes.
	af_method.change(
	    fn=update_af_visibility,
	    inputs=[af_method],
	    outputs=[af_id, af_file],
	)

	# Handler functions for download buttons
	def handle_interpro_download(method, id_val, json_val, out_dir, error):
	    """Run the InterPro metadata script for a single ID or for a JSON path."""
	    # Exactly one source option is forwarded, chosen by the radio value.
	    # It is passed first so the command-line order stays the same.
	    if method == "Single ID":
	        source = {"interpro_id": id_val}
	    else:
	        source = {"interpro_json": json_val}

	    failed_log = f"{out_dir}/failed.txt" if error else None
	    return run_download_script(
	        "metadata/download_interpro.py",
	        **source,
	        out_dir=out_dir,
	        error_file=failed_log,
	    )

	# The handler's return value is shown in the read-only output box.
	interpro_btn.click(
	    fn=handle_interpro_download,
	    inputs=[interpro_method, interpro_id, interpro_json, interpro_out, interpro_error],
	    outputs=interpro_output,
	)

	def handle_rcsb_download(method, id_val, file_val, out_dir, error):
	    """Run the RCSB metadata script for a single PDB ID or for a list file."""
	    # Exactly one source option is forwarded, chosen by the radio value.
	    # It is passed first so the command-line order stays the same.
	    if method == "Single ID":
	        source = {"pdb_id": id_val}
	    else:
	        source = {"pdb_id_file": file_val}

	    failed_log = f"{out_dir}/failed.txt" if error else None
	    return run_download_script(
	        "metadata/download_rcsb.py",
	        **source,
	        out_dir=out_dir,
	        error_file=failed_log,
	    )

	# The handler's return value is shown in the read-only output box.
	rcsb_btn.click(
	    fn=handle_rcsb_download,
	    inputs=[rcsb_method, rcsb_id, rcsb_file, rcsb_out, rcsb_error],
	    outputs=rcsb_output,
	)

	def handle_uniprot_download(method, id_val, file_val, out_dir, merge, error):
	    """Run the UniProt sequence script for a single ID or for a list file."""
	    # Exactly one source option is forwarded, chosen by the radio value.
	    # It is passed first so the command-line order stays the same.
	    if method == "Single ID":
	        source = {"uniprot_id": id_val}
	    else:
	        source = {"file": file_val}

	    merge_flag = "--merge" if merge else None
	    failed_log = f"{out_dir}/failed.txt" if error else None
	    return run_download_script(
	        "sequence/download_uniprot_seq.py",
	        **source,
	        out_dir=out_dir,
	        merge=merge_flag,
	        error_file=failed_log,
	    )

	# The handler's return value is shown in the read-only output box.
	uniprot_btn.click(
	    fn=handle_uniprot_download,
	    inputs=[uniprot_method, uniprot_id, uniprot_file, uniprot_out, uniprot_merge, uniprot_error],
	    outputs=uniprot_output,
	)

	def handle_struct_download(method, id_val, file_val, out_dir, type_val, unzip, error):
	    """
	    Run the RCSB structure script and, for a single PDB file, plot the result.

	    Args:
	        method: Download method (Single ID or From File)
	        id_val: PDB ID for single download
	        file_val: File path for batch download
	        out_dir: Output directory
	        type_val: Structure file type
	        unzip: Whether to unzip downloaded files
	        error: Whether to save error file

	    Returns:
	        Tuple of (download output, visualization status, Plotly figure or
	        None). A figure is only produced for a successful single-ID
	        download of type "pdb" whose file exists in the output directory.
	    """
	    single_id = method == "Single ID"
	    # The source option goes first so the command-line order is unchanged.
	    source = {"pdb_id": id_val} if single_id else {"pdb_id_file": file_val}

	    download_output = run_download_script(
	        "structure/download_rcsb.py",
	        **source,
	        out_dir=out_dir,
	        type=type_val,
	        unzip="--unzip" if unzip else None,
	        error_file=f"{out_dir}/failed.txt" if error else None,
	    )

	    # Batch downloads are never visualized.
	    if not single_id:
	        return download_output, "Batch download completed, select a single ID to visualize", None

	    # Success is recognised by the prefix run_download_script puts on its output.
	    if "Download completed successfully" not in download_output:
	        return download_output, "Download failed, cannot visualize", None

	    # The file is looked up under the lower-cased ID.
	    pdb_file = f"{out_dir}/{id_val.lower()}.{type_val}"
	    if type_val == "pdb" and os.path.exists(pdb_file):
	        viz_status, viz_fig = visualize_protein_structure(pdb_file)
	        return download_output, viz_status, viz_fig

	    return download_output, f"Cannot visualize {type_val} format or file not found", None

	# Outputs map, in order, to the three values returned by the handler.
	struct_btn.click(
	    fn=handle_struct_download,
	    inputs=[struct_method, struct_id, struct_file, struct_out, struct_type, struct_unzip, struct_error],
	    outputs=[struct_output, struct_viz_status, struct_viz],
	)

	def handle_af_download(method, id_val, file_val, out_dir, index_level, error):
	    """
	    Run the AlphaFold structure script and, for a single ID, plot the result.

	    Args:
	        method: Download method (Single ID or From File)
	        id_val: UniProt ID for single download
	        file_val: File path for batch download
	        out_dir: Output directory
	        index_level: Index level for directory structure
	        error: Whether to save error file

	    Returns:
	        Tuple of (download output, visualization status, Plotly figure or
	        None). A figure is only produced for a successful single-ID
	        download whose PDB file can be located under the output directory.
	    """
	    # Local import so this handler does not depend on the module header.
	    import glob

	    error_file = f"{out_dir}/failed.txt" if error else None

	    if method != "Single ID":
	        download_output = run_download_script(
	            "structure/download_alphafold.py",
	            uniprot_id_file=file_val,
	            out_dir=out_dir,
	            index_level=index_level,
	            error_file=error_file,
	        )
	        return download_output, "Batch download completed, select a single ID to visualize", None

	    download_output = run_download_script(
	        "structure/download_alphafold.py",
	        uniprot_id=id_val,
	        out_dir=out_dir,
	        index_level=index_level,
	        error_file=error_file,
	    )

	    # Success is recognised by the prefix run_download_script puts on its output.
	    if "Download completed successfully" not in download_output:
	        return download_output, "Download failed, cannot visualize", None

	    # Preferred locations, checked first.
	    candidates = [
	        f"{out_dir}/AF-{id_val}-F1-model_v4.pdb",
	        f"{out_dir}/{id_val}.pdb",
	    ]
	    pdb_file = next((path for path in candidates if os.path.exists(path)), None)

	    if pdb_file is None:
	        # Fallback: the script may store the file under another model
	        # version, or (presumably, for index_level > 0) in a subdirectory
	        # of out_dir - TODO confirm against download_alphafold.py. Search
	        # the whole output tree; "**" also matches out_dir itself.
	        search_root = glob.escape(str(out_dir))
	        escaped_id = glob.escape(str(id_val))
	        for pattern in (f"AF-{escaped_id}-F1-model_v*.pdb", f"{escaped_id}.pdb"):
	            matches = glob.glob(os.path.join(search_root, "**", pattern), recursive=True)
	            if matches:
	                # Reverse lexicographic order: deterministic, and among
	                # single-digit versions the highest one wins.
	                pdb_file = sorted(matches, reverse=True)[0]
	                break

	    if pdb_file is None:
	        return download_output, "PDB file not found in expected locations", None

	    viz_status, viz_fig = visualize_protein_structure(pdb_file)
	    return download_output, viz_status, viz_fig

	# Outputs map, in order, to the three values returned by the handler.
	af_btn.click(
	    fn=handle_af_download,
	    inputs=[af_method, af_id, af_file, af_out, af_index_level, af_error],
	    outputs=[af_output, af_viz_status, af_viz],
	)

	return {}