Spaces:
Running
Running
import os | |
import time | |
import json | |
import numpy as np | |
import re | |
import sys | |
from Bio.PDB import PDBParser, MMCIFParser | |
sys.path.append(".") | |
# Get structural seqs from pdb file | |
def get_struc_seq(foldseek,
                  path,
                  chains: list = None,
                  process_id: int = 0,
                  plddt_mask: "bool | str" = "auto",
                  plddt_threshold: float = 70.,
                  foldseek_verbose: bool = False) -> dict:
    """
    Run foldseek on a structure file and return its amino-acid / structural sequences per chain.

    Args:
        foldseek: Binary executable file of foldseek
        path: Path to pdb file
        chains: Chains to be extracted from pdb file. If None, all chains will be extracted.
        process_id: Process ID for temporary files. This is used for parallel processing.
        plddt_mask: If True, mask regions with plddt < plddt_threshold. plddt scores are from the pdb file.
            If "auto" (default), masking is enabled only when the text of the file contains
            "alphafold" (case-insensitive).
        plddt_threshold: Threshold for plddt. If plddt is lower than this value, the structure will be masked.
        foldseek_verbose: If True, foldseek will print verbose messages.

    Returns:
        seq_dict: A dict of structural seqs. The keys are chain IDs. The values are tuples of
        (seq, struc_seq, combined_seq). Masked positions of struc_seq are replaced with "#".

    Raises:
        AssertionError: If `foldseek` or `path` does not exist.
        FileNotFoundError: If foldseek did not produce its output file.
    """
    # Function-scope import so this block does not depend on file-level import changes.
    import subprocess

    # Raised explicitly (rather than via `assert`) so the checks survive `python -O`;
    # the exception type is kept for callers that already catch AssertionError.
    if not os.path.exists(foldseek):
        raise AssertionError(f"Foldseek not found: {foldseek}")
    if not os.path.exists(path):
        raise AssertionError(f"PDB file not found: {path}")

    tmp_save_path = f"get_struc_seq_{process_id}_{time.time()}.tsv"

    # Argument list instead of a shell string: paths containing spaces or shell
    # metacharacters are passed to foldseek verbatim.
    cmd = [foldseek, "structureto3didescriptor"]
    if not foldseek_verbose:
        cmd += ["-v", "0"]
    cmd += ["--threads", "1", "--chain-name-mode", "1", path, tmp_save_path]

    name = os.path.basename(path)
    try:
        completed = subprocess.run(cmd)
        if not os.path.exists(tmp_save_path):
            raise FileNotFoundError(
                f"foldseek produced no output for {path} (exit code {completed.returncode})"
            )

        # Check whether the structure is predicted by AlphaFold2
        if plddt_mask == "auto":
            with open(path, "r") as r:
                plddt_mask = "alphafold" in r.read().lower()

        # Parse the plddt scores once; they do not depend on the foldseek output line.
        # Masking is best-effort: any failure is reported and the sequence is left unmasked.
        plddts = None
        if plddt_mask:
            try:
                plddts = extract_plddt(path)
            except Exception as e:
                print(f"Error: {e}")
                print(f"Failed to mask plddt for {name}")

        seq_dict = {}
        with open(tmp_save_path, "r") as r:
            for line in r:
                if not line.strip():
                    continue

                desc, seq, struc_seq = line.split("\t")[:3]

                # The descriptor's first token is "<file name>_<chain>" with --chain-name-mode 1.
                name_chain = desc.split(" ")[0]
                chain = name_chain.replace(name, "").split("_")[-1]

                # Skip unwanted or already-seen chains before doing any masking work.
                if chains is not None and chain not in chains:
                    continue
                if chain in seq_dict:
                    continue

                # Mask low plddt
                if plddts is not None:
                    if len(plddts) == len(struc_seq):
                        struc_seq = "".join(
                            "#" if score < plddt_threshold else token
                            for token, score in zip(struc_seq, plddts)
                        )
                    else:
                        print(f"Error: Length mismatch: {len(plddts)} != {len(struc_seq)}")
                        print(f"Failed to mask plddt for {name}")

                combined_seq = "".join(a + b.lower() for a, b in zip(seq, struc_seq))
                seq_dict[chain] = (seq, struc_seq, combined_seq)

        return seq_dict
    finally:
        # Always clean up foldseek's temporary outputs, even when parsing fails.
        for tmp_file in (tmp_save_path, tmp_save_path + ".dbtype"):
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
def extract_plddt(pdb_path: str, chain_id: str = "A") -> np.ndarray:
    """
    Extract per-residue plddt scores from a pdb or mmCIF file.

    The score of a residue is the mean of the B-factor values of its atoms
    (the B-factor field is treated as the plddt score).

    Args:
        pdb_path: Path to pdb file. Must end with '.pdb' or '.cif' (case-insensitive).
        chain_id: ID of the chain to read from the first model. Defaults to "A".

    Returns:
        plddts: plddt scores, one value per residue of the chain.

    Raises:
        ValueError: If the file extension is neither '.cif' nor '.pdb'.
    """
    # Initialize parser
    lowered_path = pdb_path.lower()
    if lowered_path.endswith(".cif"):
        parser = MMCIFParser()
    elif lowered_path.endswith(".pdb"):
        parser = PDBParser()
    else:
        raise ValueError("Invalid file format for plddt extraction. Must be '.cif' or '.pdb'.")

    structure = parser.get_structure('protein', pdb_path)
    # Only the first model of the structure is read.
    chain = structure[0][chain_id]

    # Extract plddt scores
    return np.array([
        np.mean([atom.get_bfactor() for atom in residue])
        for residue in chain
    ])
def transform_pdb_dir(foldseek: str, pdb_dir: str, seq_type: str, save_path: str):
    """
    Transform a directory of pdb files into a fasta file.

    Args:
        foldseek: Binary executable file of foldseek.
        pdb_dir: Directory of pdb files.
        seq_type: Type of sequence to be extracted. Must be "aa" or "foldseek"
        save_path: Path to save the fasta file.

    Raises:
        AssertionError: If `foldseek` does not exist or `seq_type` is invalid.
        FileNotFoundError: If foldseek did not produce its output file.
    """
    # Function-scope import so this block does not depend on file-level import changes.
    import subprocess

    # Raised explicitly (rather than via `assert`) so the checks survive `python -O`;
    # the exception type is kept for callers that already catch AssertionError.
    if not os.path.exists(foldseek):
        raise AssertionError(f"Foldseek not found: {foldseek}")
    if seq_type not in ("aa", "foldseek"):
        raise AssertionError("seq_type must be 'aa' or 'foldseek'!")

    tmp_save_path = f"get_struc_seq_{time.time()}.tsv"

    # Argument list instead of a shell string: paths containing spaces or shell
    # metacharacters are passed to foldseek verbatim.
    cmd = [foldseek, "structureto3didescriptor", "--chain-name-mode", "1", pdb_dir, tmp_save_path]

    try:
        completed = subprocess.run(cmd)
        if not os.path.exists(tmp_save_path):
            raise FileNotFoundError(
                f"foldseek produced no output for {pdb_dir} (exit code {completed.returncode})"
            )

        with open(tmp_save_path, "r") as r, open(save_path, "w") as w:
            for line in r:
                if not line.strip():
                    continue

                protein_id, aa_seq, foldseek_seq = line.strip().split("\t")[:3]
                # Structural tokens are written in lower case; amino acids are kept as-is.
                sequence = aa_seq if seq_type == "aa" else foldseek_seq.lower()
                w.write(f">{protein_id}\n{sequence}\n")
    finally:
        # Always clean up foldseek's temporary outputs, even when parsing fails.
        for tmp_file in (tmp_save_path, tmp_save_path + ".dbtype"):
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
if __name__ == '__main__':
    # Manual smoke test; the paths below are machine-specific examples.
    foldseek = "/sujin/bin/foldseek"
    # test_path = "/sujin/Datasets/PDB/all/6xtd.cif"
    test_path = "/sujin/Datasets/FLIP/meltome/af2_structures/A0A061ACX4.pdb"

    # get_struc_seq has no `plddt_path` parameter (passing it raises TypeError);
    # plddt scores are read from the structure file itself, so request masking directly.
    res = get_struc_seq(foldseek, test_path, plddt_mask=True, plddt_threshold=70.)
    print(res["A"][1].lower())