Spaces:
Runtime error
Runtime error
File size: 4,287 Bytes
8918ac7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
import os
import json
import argparse
import torch
import pandas as pd
from tqdm import tqdm
from esm.models.vqvae import StructureTokenEncoder
from get_esm3_structure_seq import get_esm3_structure_seq
from get_foldseek_structure_seq import get_foldseek_structure_seq
from get_secondary_structure_seq import get_secondary_structure_seq
from get_prosst_str_token import get_prosst_token
# Silence all Python warnings for the rest of the run.
import warnings
warnings.filterwarnings("ignore")
def ESM3_structure_encoder_v0(device: torch.device | str = "cpu"):
    """Build the ESM3 structure-token encoder and load its weights.

    Args:
        device: Device the encoder is moved to; also used as the
            ``map_location`` when the checkpoint is read.

    Returns:
        A ``StructureTokenEncoder`` in eval mode with the state dict from
        ``./src/data/weight/esm3_structure_encoder_v0.pth`` loaded.
    """
    encoder = StructureTokenEncoder(
        d_model=1024,
        n_heads=1,
        v_heads=128,
        n_layers=2,
        d_out=128,
        n_codes=4096,
    )
    encoder = encoder.to(device)
    encoder = encoder.eval()

    # The checkpoint path is relative to the current working directory.
    weights = torch.load(
        "./src/data/weight/esm3_structure_encoder_v0.pth",
        map_location=device,
    )
    encoder.load_state_dict(weights)
    return encoder
def _write_json_lines(path, records):
    """Write ``records`` to ``path`` as JSON Lines (one JSON document per line)."""
    with open(path, "w") as f:
        f.write("\n".join(json.dumps(r) for r in records))


# Script entry point: for every file in --pdb_dir, extract the secondary
# structure, ESM3, Foldseek and ProSST structure sequences, write each set to
# a JSON-lines file in --out_dir and, when --merge_into is 'csv', join them on
# the 'name' column into a single CSV.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--pdb_dir", type=str, default='dataset/sesadapter/DeepET/esmfold_pdb')
    # NOTE(review): --pdb_file is parsed but never read below; single-file
    # processing appears to be unimplemented in this script.
    parser.add_argument("--pdb_file", type=str, default=None)
    parser.add_argument("--out_dir", type=str, default='dataset/sesadapter/DeepET')
    parser.add_argument("--merge_into", type=str, default='csv', choices=['json', 'csv'])
    parser.add_argument("--save_intermediate", action='store_true')
    args = parser.parse_args()

    # Prefer the first GPU, but fall back to CPU instead of crashing when
    # CUDA is not available.
    use_cuda = torch.cuda.is_available()
    device = "cuda:0" if use_cuda else "cpu"
    esm3_encoder = ESM3_structure_encoder_v0(device)

    if args.pdb_dir is not None:
        # normpath strips a trailing separator so "foo/pdb/" still yields
        # "pdb" rather than an empty prefix for the output file names.
        dir_name = os.path.basename(os.path.normpath(args.pdb_dir))
        os.makedirs(args.out_dir, exist_ok=True)

        ss_json = os.path.join(args.out_dir, f"{dir_name}_ss.json")
        esm3_json = os.path.join(args.out_dir, f"{dir_name}_esm3.json")
        fs_json = os.path.join(args.out_dir, f"{dir_name}_fs.json")
        prosst_json = os.path.join(args.out_dir, f"{dir_name}_prosst.json")

        ss_results, esm3_results = [], []
        for pdb_file in tqdm(os.listdir(args.pdb_dir)):
            pdb_path = os.path.join(args.pdb_dir, pdb_file)
            ss_result, error = get_secondary_structure_seq(pdb_path)
            if error is not None:
                # Skip this file entirely so the ss and esm3 result lists
                # stay aligned.
                print(error)
                continue
            ss_results.append(ss_result)

            esm3_results.append(get_esm3_structure_seq(pdb_path, esm3_encoder, device))
            if use_cuda:
                # clear cuda cache
                torch.cuda.empty_cache()

        _write_json_lines(ss_json, ss_results)
        _write_json_lines(esm3_json, esm3_results)

        fs_results = get_foldseek_structure_seq(args.pdb_dir)
        _write_json_lines(fs_json, fs_results)

        prosst_tokens = get_prosst_token(args.pdb_dir)
        # Fixed: this file was previously opened in read mode and then written to.
        _write_json_lines(prosst_json, prosst_tokens)

        if args.merge_into == 'csv':
            # load the json line files
            ss_df = pd.read_json(ss_json, lines=True)
            esm3_df = pd.read_json(esm3_json, lines=True)
            fs_df = pd.read_json(fs_json, lines=True)
            # Fixed: was `os.path_join(...)`, which does not exist and also
            # clobbered the path variable needed for the cleanup below.
            prosst_df = pd.read_json(prosst_json, lines=True)

            # merge the four dataframes by the 'name' column
            df = pd.merge(ss_df, fs_df, on='name', how='inner')
            df = pd.merge(df, esm3_df, on='name', how='inner')
            df = pd.merge(df, prosst_df, on='name', how='inner')

            # sort by name
            df = df.sort_values(by='name')
            df.to_csv(os.path.join(args.out_dir, f"{dir_name}.csv"), index=False)

            if not args.save_intermediate:
                # remove intermediate files
                for path in (ss_json, esm3_json, fs_json, prosst_json):
                    os.remove(path)
|