dsk129 committed on
Commit
6822689
·
verified ·
1 Parent(s): ca5f6e7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -14
app.py CHANGED
@@ -5,35 +5,61 @@ import gradio as gr
5
  import matplotlib.pyplot as plt
6
  from transformers import AutoTokenizer, EsmModel
7
  from sklearn.metrics.pairwise import cosine_similarity
8
- from Bio.PDB import PDBParser, PDBIO
9
- from Bio.PDB.StructureBuilder import StructureBuilder
10
  import tempfile
11
  import os
12
 
13
- #----------------------------------------------------Analysis--------------------------------------------------------------------------------------------------------
14
  # Load ESM-1b model and tokenizer
15
  model = EsmModel.from_pretrained("facebook/esm1b_t33_650M_UR50S", output_hidden_states=True)
16
  tokenizer = AutoTokenizer.from_pretrained("facebook/esm1b_t33_650M_UR50S")
17
 
18
- def compute_residue_scores(seq):
 
19
  inputs = tokenizer(seq, return_tensors="pt")
20
  with torch.no_grad():
21
  outputs = model(**inputs)
22
- embedding = outputs.last_hidden_state[0] # shape (L+2, d)
23
 
24
  L = len(seq)
25
- embedding = embedding[1:L+1] # shape (L, d)
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  sim_matrix = cosine_similarity(embedding.detach().cpu().numpy())
28
- residue_scores = np.sum(sim_matrix, axis=1)
29
- norm_scores = 100 * (residue_scores - np.min(residue_scores)) / (np.max(residue_scores) - np.min(residue_scores))
30
 
 
 
 
 
 
 
 
 
31
  return norm_scores
32
 
 
33
  def inject_bfactors_into_pdb(pdb_file, scores):
34
  parser = PDBParser(QUIET=True)
35
  structure = parser.get_structure("prot", pdb_file.name)
36
-
37
  i = 0
38
  for model in structure:
39
  for chain in model:
@@ -43,16 +69,15 @@ def inject_bfactors_into_pdb(pdb_file, scores):
43
  for atom in residue:
44
  atom.bfactor = float(scores[i])
45
  i += 1
46
-
47
  out_path = tempfile.NamedTemporaryFile(delete=False, suffix=".pdb").name
48
  io = PDBIO()
49
  io.set_structure(structure)
50
  io.save(out_path)
51
-
52
  return out_path
53
 
 
54
  def process(seq, pdb_file):
55
- scores = compute_residue_scores(seq)
56
  pdb_with_scores = inject_bfactors_into_pdb(pdb_file, scores)
57
  return pdb_with_scores
58
 
@@ -63,8 +88,8 @@ demo = gr.Interface(
63
  gr.Textbox(label="Input Protein Sequence (1-letter code)"),
64
  gr.File(label="Upload PDB File", file_types=[".pdb"])
65
  ],
66
- outputs=gr.File(label="Modified PDB with Scores in B-factor Column"),
67
- title="ESM-1b Residue Scoring: B-factor Injection for Structural Visualization"
68
  )
69
 
70
  demo.launch()
 
5
  import matplotlib.pyplot as plt
6
  from transformers import AutoTokenizer, EsmModel
7
  from sklearn.metrics.pairwise import cosine_similarity
8
+ from Bio.PDB import PDBParser, PDBIO, DSSP
9
+ from Bio.PDB.Polypeptide import PPBuilder
10
  import tempfile
11
  import os
12
 
13
+ #-------------------------------------------------Analysis---------------------------------------------------------------------------------------------------------------
14
  # Load ESM-1b model and tokenizer
15
  model = EsmModel.from_pretrained("facebook/esm1b_t33_650M_UR50S", output_hidden_states=True)
16
  tokenizer = AutoTokenizer.from_pretrained("facebook/esm1b_t33_650M_UR50S")
17
 
18
# Compute per-residue cosine similarity scores (ASA-aware)
def compute_asa_filtered_scores(seq, pdb_path, buried_threshold=0.25):
    """Score each residue by embedding similarity to residues in its ASA bin.

    The sequence is embedded with the module-level ESM ``tokenizer`` and
    ``model``; DSSP is run on the first model of the structure at
    ``pdb_path`` to obtain relative solvent accessibility (rASA). Residues
    are split into "buried" (rASA <= ``buried_threshold``) and "exposed"
    (rASA > ``buried_threshold``). A residue's raw score is the sum of its
    cosine similarities to every residue in the same bin (itself included),
    and the raw scores are min-max scaled to the range 0-100.

    Args:
        seq: Protein sequence in one-letter code; must be non-empty.
        pdb_path: Path to the PDB file handed to ``PDBParser`` and ``DSSP``.
        buried_threshold: rASA cut-off separating buried from exposed.

    Returns:
        A NumPy array of ``len(seq)`` scores in [0, 100]. If every raw score
        is identical (for example a single residue) all scores are 0.

    Raises:
        ValueError: If ``seq`` is empty, or DSSP reports fewer residues than
            ``len(seq)`` so residues cannot be matched one-to-one.
    """
    if not seq:
        raise ValueError("seq must be a non-empty protein sequence")

    inputs = tokenizer(seq, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    L = len(seq)
    # Drop the leading special token and keep the next L positions
    # (removes CLS/EOS).
    # NOTE(review): no truncation is applied here; confirm the model's
    # maximum input length is acceptable for expected sequences.
    embedding = outputs.last_hidden_state[0][1:L + 1]

    # Parse structure and compute DSSP on its first model.
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("prot", pdb_path)
    model_struct = next(structure.get_models())
    dssp = DSSP(model_struct, pdb_path)

    # DSSP residues are matched to sequence positions purely by order.
    keys = list(dssp.keys())[:L]
    if len(keys) < L:
        raise ValueError(
            f"DSSP returned {len(keys)} residues but the sequence has {L}; "
            "cannot match accessibility values to sequence positions"
        )

    # Element 3 of a Biopython DSSP record is already the *relative* ASA
    # (absolute ASA divided by the residue's maximum accessibility), so it
    # must not be divided by the maximum again. Biopython stores a
    # non-numeric placeholder ("NA") for residues missing from its table;
    # those are treated as 0.0, i.e. buried.
    rasa = np.zeros(L)
    for pos, key in enumerate(keys):
        rel_asa = dssp[key][3]
        rasa[pos] = rel_asa if isinstance(rel_asa, (int, float)) else 0.0

    # Compute cosine similarity matrix between all residue embeddings.
    sim_matrix = cosine_similarity(embedding.detach().cpu().numpy())

    # Sum similarities only within each residue's own ASA bin: same_bin[i, j]
    # is True when residues i and j are both buried or both exposed.
    is_buried = rasa <= buried_threshold
    same_bin = is_buried[:, None] == is_buried[None, :]
    filtered_scores = np.where(same_bin, sim_matrix, 0.0).sum(axis=1)

    # Min-max normalise to 0-100, guarding the constant-score case that
    # would otherwise divide by zero.
    lowest = np.min(filtered_scores)
    span = np.max(filtered_scores) - lowest
    if span == 0:
        return np.zeros(L)
    return 100 * (filtered_scores - lowest) / span
58
 
59
+ # Inject scores into B-factor column
60
  def inject_bfactors_into_pdb(pdb_file, scores):
61
  parser = PDBParser(QUIET=True)
62
  structure = parser.get_structure("prot", pdb_file.name)
 
63
  i = 0
64
  for model in structure:
65
  for chain in model:
 
69
  for atom in residue:
70
  atom.bfactor = float(scores[i])
71
  i += 1
 
72
  out_path = tempfile.NamedTemporaryFile(delete=False, suffix=".pdb").name
73
  io = PDBIO()
74
  io.set_structure(structure)
75
  io.save(out_path)
 
76
  return out_path
77
 
78
# Combined Gradio interface
def process(seq, pdb_file):
    """Compute ASA-filtered scores for ``seq`` and write them into the PDB.

    Args:
        seq: Protein sequence in one-letter code as typed by the user.
            All whitespace (spaces, newlines) is removed and the sequence
            is upper-cased before scoring, so its length counts residues
            only.
        pdb_file: Uploaded file object; its ``.name`` attribute is used as
            the path of the PDB file.

    Returns:
        Whatever ``inject_bfactors_into_pdb`` returns for the uploaded file
        and the computed scores.

    Raises:
        ValueError: If no file was uploaded or the sequence is empty after
            whitespace removal.
    """
    if pdb_file is None:
        raise ValueError("Please upload a PDB file.")

    # Pasted sequences often contain line breaks or spaces; stripping them
    # keeps the sequence length aligned with the number of residues.
    cleaned_seq = "".join((seq or "").split()).upper()
    if not cleaned_seq:
        raise ValueError("Please enter a protein sequence.")

    scores = compute_asa_filtered_scores(cleaned_seq, pdb_file.name)
    return inject_bfactors_into_pdb(pdb_file, scores)
83
 
 
88
  gr.Textbox(label="Input Protein Sequence (1-letter code)"),
89
  gr.File(label="Upload PDB File", file_types=[".pdb"])
90
  ],
91
+ outputs=gr.File(label="Modified PDB with ASA-filtered Embedding Scores in B-factor Column"),
92
+ title="ESM-1b ASA-Aware Residue Scoring for Structural Visualization"
93
  )
94
 
95
  demo.launch()