|
import os |
|
from argparse import FileType, ArgumentParser |
|
|
|
import numpy as np |
|
from Bio.PDB import PDBParser |
|
from Bio.Seq import Seq |
|
from Bio.SeqRecord import SeqRecord |
|
from tqdm import tqdm |
|
|
|
# Command-line interface of the script. Option names, types and defaults are
# the contract with callers and are unchanged; the help texts describe how
# each option is used further down in this file.
parser = ArgumentParser(
    description='Extract one amino-acid sequence per protein chain from the PDB files '
                'found under --data_dir and write them to a FASTA file.'
)
parser.add_argument(
    '--data_dir',
    type=str,
    default='data/PDBBind_processed',
    help='Directory with one sub-directory per complex; each sub-directory <name> is '
         'expected to contain <name>_protein_processed.pdb or <name>_protein.pdb '
         '(default: %(default)s).',
)
parser.add_argument(
    '--chain_cutoff',
    type=int,
    default=10,
    help='Chain cutoff setting. For values above 10 the script reads '
         '<name>_protein_obabel_reduce.pdb and falls back to <name>_protein.pdb '
         'when that file does not exist (default: %(default)s).',
)
parser.add_argument(
    '--out_file',
    type=str,
    default="data/pdbbind_sequences.fasta",
    help='Path of the FASTA file to write (default: %(default)s).',
)
# Parsed at import time: this file is a script, not an importable module.
args = parser.parse_args()
|
|
|
# Runtime configuration taken from the parsed command-line arguments.
cutoff = args.chain_cutoff
data_dir = args.data_dir

# os.listdir() returns directory entries in arbitrary order. Sorting them
# makes the processing order, and therefore the order of the records written
# to the FASTA file, reproducible across runs and machines.
names = sorted(os.listdir(data_dir))

# NOTE: this import sits mid-file in the original script; it is kept here so
# that SeqIO stays available to the code that writes the FASTA file.
from Bio import SeqIO

# A single parser instance is reused for every PDB file.
biopython_parser = PDBParser()
|
|
|
# Lookup table from three-letter residue names to the one-letter codes used
# when building the FASTA sequences. Residue names missing from this table
# are handled by the lookup site, not here.
# Note that MSE shares the letter 'M' with MET.
three_to_one = dict(
    ALA='A', ARG='R', ASN='N', ASP='D', CYS='C',
    GLN='Q', GLU='E', GLY='G', HIS='H', ILE='I',
    LEU='L', LYS='K', MET='M', MSE='M', PHE='F',
    PRO='P', PYL='O', SER='S', SEC='U', THR='T',
    TRP='W', TYR='Y', VAL='V',
    ASX='B', GLX='Z', XAA='X', XLE='J',
)
|
|
|
# Build one sequence string per protein chain of every complex in `names`
# and write all of them to `args.out_file` in FASTA format.
#
# For each complex directory <name> the receptor file is chosen as follows:
#   * <name>_protein_processed.pdb if it exists, otherwise <name>_protein.pdb;
#   * when `cutoff` > 10 that choice is overridden by
#     <name>_protein_obabel_reduce.pdb, falling back to <name>_protein.pdb
#     if the reduced file is missing.
# Only the first model of each structure is read. Record ids have the form
# '<name>_chain_<i>' where <i> is the chain's position within that model.

# A residue contributes a letter only if it has atoms named CA, N and C
# (presumably to keep amino-acid residues and drop hetero groups).
required_atom_names = {'CA', 'N', 'C'}

sequences = []
ids = []
for name in tqdm(names):
    # macOS folder metadata file, not a complex directory.
    if name == '.DS_Store':
        continue

    complex_dir = os.path.join(data_dir, name)
    processed_path = os.path.join(complex_dir, f'{name}_protein_processed.pdb')
    plain_path = os.path.join(complex_dir, f'{name}_protein.pdb')

    rec_path = processed_path if os.path.exists(processed_path) else plain_path
    if cutoff > 10:
        rec_path = os.path.join(complex_dir, f'{name}_protein_obabel_reduce.pdb')
        if not os.path.exists(rec_path):
            rec_path = plain_path

    # Index 0 selects the first model of the parsed structure.
    model = biopython_parser.get_structure('random_id', rec_path)[0]
    for i, chain in enumerate(model):
        letters = []
        for residue in chain:
            resname = residue.get_resname()
            if resname == 'HOH':
                continue  # water
            atom_names = {atom.name for atom in residue}
            if not required_atom_names <= atom_names:
                continue
            letter = three_to_one.get(resname)
            if letter is None:
                # Unknown residue name: keep the position, mark it with a dash.
                letter = '-'
                print("encountered unknown AA: ", resname, ' in the complex ', name, '. Replacing it with a dash - .')
            letters.append(letter)
        # A chain with no qualifying residue still yields an (empty) record.
        sequences.append(''.join(letters))
        ids.append(f'{name}_chain_{i}')

# An empty description keeps each FASTA header line down to just the record id.
records = [
    SeqRecord(Seq(seq), id=str(index), description='')
    for index, seq in zip(ids, sequences)
]
SeqIO.write(records, args.out_file, "fasta")
|
|
|
|
|
|