|
import os |
|
from argparse import FileType, ArgumentParser |
|
|
|
import numpy as np |
|
import pandas as pd |
|
from Bio.PDB import PDBParser |
|
from Bio.Seq import Seq |
|
from Bio.SeqRecord import SeqRecord |
|
from tqdm import tqdm |
|
from Bio import SeqIO |
|
|
|
|
|
|
|
# Three-letter residue names (standard amino acids plus a few non-standard
# and ambiguity codes) mapped to the one-letter codes written to the FASTA.
_THREE_TO_ONE = {
    "ALA": "A",
    "ARG": "R",
    "ASN": "N",
    "ASP": "D",
    "CYS": "C",
    "GLN": "Q",
    "GLU": "E",
    "GLY": "G",
    "HIS": "H",
    "ILE": "I",
    "LEU": "L",
    "LYS": "K",
    "MET": "M",
    "MSE": "M",
    "PHE": "F",
    "PRO": "P",
    "PYL": "O",
    "SER": "S",
    "SEC": "U",
    "THR": "T",
    "TRP": "W",
    "TYR": "Y",
    "VAL": "V",
    "ASX": "B",
    "GLX": "Z",
    "XAA": "X",
    "XLE": "J",
}

# A residue only contributes to the sequence when all of these atoms exist.
_BACKBONE_ATOM_NAMES = frozenset({"CA", "N", "C"})


def _chain_to_sequence(chain, file_path):
    """Return the one-letter sequence for the residues of ``chain``.

    Residues named ``HOH`` and residues that lack any of the ``CA``, ``N``
    or ``C`` atoms are skipped. A residue whose name is not in
    ``_THREE_TO_ONE`` contributes ``"-"`` and a message is printed;
    ``file_path`` is used only in that message.
    """
    letters = []
    for residue in chain:
        resname = residue.get_resname()
        if resname == "HOH":
            continue

        # Only the presence of the backbone atoms matters here; their
        # coordinates are never used.
        atom_names = {atom.name for atom in residue}
        if not _BACKBONE_ATOM_NAMES <= atom_names:
            continue

        letter = _THREE_TO_ONE.get(resname)
        if letter is None:
            letter = "-"
            print(
                "encountered unknown AA: ",
                resname,
                " in the complex ",
                file_path,
                ". Replacing it with a dash - .",
            )
        letters.append(letter)
    return "".join(letters)


def esm_embedding_prep(out_file, protein_path):
    """Write one FASTA record per chain of a PDB structure file.

    The file at ``protein_path`` is parsed with ``PDBParser`` and the chains
    of its model ``0`` are converted to one-letter sequences (see
    ``_chain_to_sequence`` for which residues are kept). Each record id is
    ``"<basename of protein_path>_chain_<i>"``, where ``i`` is the position
    of the chain in iteration order (not the chain's own identifier), and
    the record description is empty.

    Args:
        out_file: Output target handed to ``SeqIO.write`` for the FASTA data.
        protein_path: Path of the structure file to read.
    """
    parser = PDBParser()

    # Kept as a list so the loop shape (and the tqdm progress bar) stays the
    # same as for a multi-file run, even though only one path is processed.
    file_paths = [protein_path]
    ids = []
    sequences = []
    for file_path in tqdm(file_paths):
        model = parser.get_structure("random_id", file_path)[0]
        for i, chain in enumerate(model):
            sequences.append(_chain_to_sequence(chain, file_path))
            ids.append(f"{os.path.basename(file_path)}_chain_{i}")

    records = [
        SeqRecord(Seq(sequence), str(record_id), description="")
        for record_id, sequence in zip(ids, sequences)
    ]
    SeqIO.write(records, out_file, "fasta")
|
|