Spaces:
Sleeping
Sleeping
File size: 5,716 Bytes
5676c75 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
import json
import os
import re
import subprocess
import sys
import time
from typing import Optional, Union

import numpy as np
from Bio.PDB import PDBParser, MMCIFParser
sys.path.append(".")
# Get structural seqs from pdb file
def get_struc_seq(foldseek,
                  path,
                  chains: Optional[list] = None,
                  process_id: int = 0,
                  plddt_mask: Union[bool, str] = "auto",
                  plddt_threshold: float = 70.,
                  foldseek_verbose: bool = False) -> dict:
    """
    Run foldseek on one structure file and return its amino-acid and
    structural sequences per chain.

    Args:
        foldseek: Binary executable file of foldseek
        path: Path to pdb file
        chains: Chains to be extracted from pdb file. If None, all chains will be extracted.
        process_id: Process ID for temporary files. This is used for parallel processing.
        plddt_mask: If True, mask regions with plddt < plddt_threshold. plddt scores are from the pdb file.
            If "auto" (default), masking is enabled only when the file text contains "alphafold".
        plddt_threshold: Threshold for plddt. If plddt is lower than this value, the structure will be masked.
        foldseek_verbose: If True, foldseek will print verbose messages.

    Returns:
        seq_dict: A dict of structural seqs. The keys are chain IDs. The values are tuples of
        (seq, struc_seq, combined_seq). Only the first record of each chain is kept.

    Raises:
        AssertionError: If the foldseek binary or the structure file does not exist.
        FileNotFoundError: If foldseek did not produce its output file.
    """
    assert os.path.exists(foldseek), f"Foldseek not found: {foldseek}"
    assert os.path.exists(path), f"PDB file not found: {path}"

    tmp_save_path = f"get_struc_seq_{process_id}_{time.time()}.tsv"

    # Pass the command as an argument list (no shell) so that paths containing
    # spaces or shell metacharacters reach foldseek unchanged.
    cmd = [foldseek, "structureto3didescriptor"]
    if not foldseek_verbose:
        cmd += ["-v", "0"]
    cmd += ["--threads", "1", "--chain-name-mode", "1", path, tmp_save_path]

    try:
        completed = subprocess.run(cmd)
        if not os.path.exists(tmp_save_path):
            raise FileNotFoundError(
                f"Foldseek produced no output for {path} "
                f"(exit code {completed.returncode})"
            )

        # Check whether the structure is predicted by AlphaFold2
        if plddt_mask == "auto":
            with open(path, "r") as r:
                plddt_mask = "alphafold" in r.read().lower()

        name = os.path.basename(path)

        # The plddt scores depend only on the input file, so parse them once
        # instead of once per output line. Masking is best-effort: a failure is
        # reported and the sequences are returned unmasked.
        # NOTE: extract_plddt is called without a chain argument, so the same
        # per-residue scores are compared against every chain; chains whose
        # length differs are reported and left unmasked.
        plddts = None
        if plddt_mask:
            try:
                plddts = extract_plddt(path)
            except Exception as e:
                print(f"Error: {e}")
                print(f"Failed to mask plddt for {name}")

        seq_dict = {}
        with open(tmp_save_path, "r") as r:
            for line in r:
                fields = line.rstrip("\n").split("\t")
                if len(fields) < 3:
                    # Blank or truncated line: nothing to extract.
                    continue
                desc, seq, struc_seq = fields[:3]

                # The description starts with "<file name>_<chain>".
                name_chain = desc.split(" ")[0]
                chain = name_chain.replace(name, "").split("_")[-1]

                # Skip unrequested chains and repeated records before doing
                # any masking work.
                if chains is not None and chain not in chains:
                    continue
                if chain in seq_dict:
                    continue

                # Mask low plddt
                if plddts is not None:
                    try:
                        if len(plddts) != len(struc_seq):
                            raise ValueError(
                                f"Length mismatch: {len(plddts)} != {len(struc_seq)}"
                            )
                        # Mask regions with plddt < threshold
                        np_seq = np.array(list(struc_seq))
                        np_seq[plddts < plddt_threshold] = "#"
                        struc_seq = "".join(np_seq)
                    except Exception as e:
                        print(f"Error: {e}")
                        print(f"Failed to mask plddt for {name}")

                # Interleave each residue with its lower-cased structural token.
                combined_seq = "".join([a + b.lower() for a, b in zip(seq, struc_seq)])
                seq_dict[chain] = (seq, struc_seq, combined_seq)

        return seq_dict
    finally:
        # Always remove foldseek's temporary outputs, even when parsing fails.
        for tmp_file in (tmp_save_path, tmp_save_path + ".dbtype"):
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
def extract_plddt(pdb_path: str, chain_id: str = "A") -> np.ndarray:
    """
    Extract per-residue plddt scores from a pdb or mmCIF file.

    The B-factor of every atom is read as its plddt score, and the score of a
    residue is the mean over its atoms.

    Args:
        pdb_path: Path to the structure file. Must end with '.pdb' or '.cif'
            (case-insensitive).
        chain_id: ID of the chain to read from the first model. Defaults to "A".

    Returns:
        plddts: 1-D array with one mean score per residue of the chain.

    Raises:
        ValueError: If the file extension is neither '.cif' nor '.pdb'.
        KeyError: If the first model has no chain named ``chain_id``.
    """
    # Initialize parser
    lowered_path = pdb_path.lower()
    if lowered_path.endswith(".cif"):
        parser = MMCIFParser()
    elif lowered_path.endswith(".pdb"):
        parser = PDBParser()
    else:
        raise ValueError("Invalid file format for plddt extraction. Must be '.cif' or '.pdb'.")

    structure = parser.get_structure('protein', pdb_path)
    chain = structure[0][chain_id]

    # Extract plddt scores: average the atom B-factors of each residue
    return np.array([
        np.mean([atom.get_bfactor() for atom in residue])
        for residue in chain
    ])
def transform_pdb_dir(foldseek: str, pdb_dir: str, seq_type: str, save_path: str):
    """
    Transform a directory of pdb files into a fasta file.

    Args:
        foldseek: Binary executable file of foldseek.
        pdb_dir: Directory of pdb files.
        seq_type: Type of sequence to be extracted. Must be "aa" or "foldseek"
        save_path: Path to save the fasta file.

    Raises:
        AssertionError: If the foldseek binary does not exist or ``seq_type``
            is not one of the accepted values.
        FileNotFoundError: If foldseek did not produce its output file.
    """
    assert os.path.exists(foldseek), f"Foldseek not found: {foldseek}"
    assert seq_type in ["aa", "foldseek"], "seq_type must be 'aa' or 'foldseek'!"

    # Include the process ID so that concurrent runs do not share a temp file.
    tmp_save_path = f"get_struc_seq_{os.getpid()}_{time.time()}.tsv"

    # Argument list instead of a shell string: directory names with spaces or
    # shell metacharacters are passed to foldseek unchanged.
    cmd = [
        foldseek,
        "structureto3didescriptor",
        "--chain-name-mode", "1",
        pdb_dir,
        tmp_save_path,
    ]

    try:
        completed = subprocess.run(cmd)
        if not os.path.exists(tmp_save_path):
            raise FileNotFoundError(
                f"Foldseek produced no output for {pdb_dir} "
                f"(exit code {completed.returncode})"
            )

        with open(tmp_save_path, "r") as r, open(save_path, "w") as w:
            for line in r:
                fields = line.strip().split("\t")
                if len(fields) < 3:
                    # Blank or truncated line: nothing to write.
                    continue
                protein_id, aa_seq, foldseek_seq = fields[:3]
                # Structural tokens are written in lower case.
                record = aa_seq if seq_type == "aa" else foldseek_seq.lower()
                w.write(f">{protein_id}\n{record}\n")
    finally:
        # Always remove foldseek's temporary outputs, even when something fails.
        for tmp_file in (tmp_save_path, tmp_save_path + ".dbtype"):
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
if __name__ == '__main__':
    # Manual smoke test: print the lower-cased structural sequence of chain A
    # for one AlphaFold2 model.
    foldseek = "/sujin/bin/foldseek"
    # test_path = "/sujin/Datasets/PDB/all/6xtd.cif"
    test_path = "/sujin/Datasets/FLIP/meltome/af2_structures/A0A061ACX4.pdb"
    # get_struc_seq has no ``plddt_path`` parameter (passing one raises
    # TypeError); plddt scores are read from the structure file itself.
    res = get_struc_seq(foldseek, test_path, plddt_threshold=70.)
    print(res["A"][1].lower())
|