File size: 4,287 Bytes
8918ac7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os
import json
import argparse
import torch
import pandas as pd
from tqdm import tqdm
from esm.models.vqvae import StructureTokenEncoder
from get_esm3_structure_seq import get_esm3_structure_seq
from get_foldseek_structure_seq import get_foldseek_structure_seq
from get_secondary_structure_seq import get_secondary_structure_seq
from get_prosst_str_token import get_prosst_token

# ignore the warning
import warnings
warnings.filterwarnings("ignore")

def ESM3_structure_encoder_v0(
    device: torch.device | str = "cpu",
    weights_path: str = "./src/data/weight/esm3_structure_encoder_v0.pth",
):
    """Build the ESM3 v0 structure-token encoder and load its pretrained weights.

    Args:
        device: Device the encoder is placed on (e.g. ``"cpu"``, ``"cuda:0"``).
        weights_path: Path to the saved state dict. Defaults to the repo-local
            checkpoint this pipeline ships with.

    Returns:
        A ``StructureTokenEncoder`` in eval mode on ``device``.
    """
    # Hyperparameters must match the released esm3 v0 structure-encoder
    # checkpoint, otherwise load_state_dict fails on shape mismatch.
    model = (
        StructureTokenEncoder(
            d_model=1024, n_heads=1, v_heads=128, n_layers=2, d_out=128, n_codes=4096
        )
        .to(device)
        .eval()
    )
    # map_location keeps the load working when the checkpoint was saved on a
    # different device than the one we are loading onto.
    state_dict = torch.load(weights_path, map_location=device)
    model.load_state_dict(state_dict)
    return model

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--pdb_dir", type=str, default='dataset/sesadapter/DeepET/esmfold_pdb')
    parser.add_argument("--pdb_file", type=str, default=None)
    parser.add_argument("--out_dir", type=str, default='dataset/sesadapter/DeepET')
    parser.add_argument("--merge_into", type=str, default='csv', choices=['json', 'csv'])
    parser.add_argument("--save_intermediate", action='store_true')
    args = parser.parse_args()

    # Fall back to CPU when no GPU is present so the script still runs
    # (original hard-coded "cuda:0" and crashed on CPU-only machines).
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    esm3_encoder = ESM3_structure_encoder_v0(device)

    if args.pdb_dir is not None:
        dir_name = os.path.basename(args.pdb_dir)
        pdb_files = os.listdir(args.pdb_dir)
        ss_results, esm3_results = [], []
        for pdb_file in tqdm(pdb_files):
            # Secondary-structure extraction can fail on malformed PDBs; skip
            # such files entirely so the two result lists stay aligned.
            ss_result, error = get_secondary_structure_seq(os.path.join(args.pdb_dir, pdb_file))
            if error is not None:
                print(error)
                continue
            ss_results.append(ss_result)
            esm3_result = get_esm3_structure_seq(os.path.join(args.pdb_dir, pdb_file), esm3_encoder, device)
            esm3_results.append(esm3_result)
            # clear cuda cache between structures to limit GPU memory growth
            torch.cuda.empty_cache()
        # Each feature type is written as JSON Lines: one JSON object per line.
        with open(os.path.join(args.out_dir, f"{dir_name}_ss.json"), "w") as f:
            f.write("\n".join([json.dumps(r) for r in ss_results]))
        with open(os.path.join(args.out_dir, f"{dir_name}_esm3.json"), "w") as f:
            f.write("\n".join([json.dumps(r) for r in esm3_results]))

        fs_results = get_foldseek_structure_seq(args.pdb_dir)
        with open(os.path.join(args.out_dir, f"{dir_name}_fs.json"), "w") as f:
            f.write("\n".join([json.dumps(r) for r in fs_results]))
        prosst_tokens = get_prosst_token(args.pdb_dir)
        # BUGFIX: this file was opened in read mode ("r") but then written to,
        # which raises io.UnsupportedOperation — open it for writing instead.
        with open(os.path.join(args.out_dir, f"{dir_name}_prosst.json"), "w") as f:
            f.write("\n".join([json.dumps(r) for r in prosst_tokens]))

        if args.merge_into == 'csv':
            # read json files and merge into a single csv keyed on 'name'
            ss_json = os.path.join(args.out_dir, f"{dir_name}_ss.json")
            esm3_json = os.path.join(args.out_dir, f"{dir_name}_esm3.json")
            fs_json = os.path.join(args.out_dir, f"{dir_name}_fs.json")
            prosst_json = os.path.join(args.out_dir, f"{dir_name}_prosst.json")
            # load json line files
            ss_df = pd.read_json(ss_json, lines=True)
            esm3_df = pd.read_json(esm3_json, lines=True)
            fs_df = pd.read_json(fs_json, lines=True)
            # BUGFIX: was `prosst_json = os.path_join(prosst_json, lines=True)`
            # — a typo that crashed (os.path_join does not exist) and clobbered
            # the path variable needed for the merge and the cleanup below.
            prosst_df = pd.read_json(prosst_json, lines=True)
            # merge the four dataframes by the shared 'name' column
            df = pd.merge(ss_df, fs_df, on='name', how='inner')
            df = pd.merge(df, esm3_df, on='name', how='inner')
            df = pd.merge(df, prosst_df, on='name', how='inner')
            # sort by name for a deterministic output order
            df = df.sort_values(by='name')
            df.to_csv(os.path.join(args.out_dir, f"{dir_name}.csv"), index=False)

            if not args.save_intermediate:
                # remove intermediate per-feature json files
                os.remove(ss_json)
                os.remove(esm3_json)
                os.remove(fs_json)
                os.remove(prosst_json)