dsk129 committed on
Commit
486ec1d
·
verified ·
1 Parent(s): 9013567

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -44
app.py CHANGED
@@ -3,9 +3,8 @@ import numpy as np
3
  import gradio as gr
4
  import matplotlib.pyplot as plt
5
  from transformers import AutoTokenizer, EsmModel
6
- from sklearn.metrics.pairwise import cosine_similarity
7
  from Bio.PDB import PDBParser, PDBIO
8
- import freesasa
9
  import tempfile
10
  import os
11
 
@@ -13,48 +12,25 @@ import os
13
  model = EsmModel.from_pretrained("facebook/esm1b_t33_650M_UR50S", output_hidden_states=True)
14
  tokenizer = AutoTokenizer.from_pretrained("facebook/esm1b_t33_650M_UR50S")
15
 
16
- # Compute per-residue cosine similarity scores (ASA-aware)
17
- def compute_asa_filtered_scores(seq, pdb_path):
18
  inputs = tokenizer(seq, return_tensors="pt")
19
  with torch.no_grad():
20
  outputs = model(**inputs)
21
- embedding = outputs.last_hidden_state[0]
22
 
23
  L = len(seq)
24
- embedding = embedding[1:L+1] # Remove CLS/EOS
25
 
26
- # Compute ASA using freesasa
27
- structure = freesasa.Structure(pdb_path)
28
- result = freesasa.calc(structure)
29
- rASA = []
30
- for i in range(L):
31
- try:
32
- res_id = structure.residueNumber(i)
33
- chain = structure.chainLabel(i)
34
- area = result.residueAreas()[chain][res_id]['total']
35
- # Estimate max ASA for normalization (simplified)
36
- max_acc = 200.0 # Conservative estimate for normalization
37
- rASA.append(area / max_acc)
38
- except:
39
- rASA.append(0.0)
40
- rASA = np.array(rASA)
41
 
42
- # Bin into buried (<= 0.25) and exposed (> 0.25)
43
- buried_idx = np.where(rASA <= 0.25)[0]
44
- exposed_idx = np.where(rASA > 0.25)[0]
45
-
46
- # Compute cosine similarity matrix
47
- sim_matrix = cosine_similarity(embedding.detach().cpu().numpy())
48
-
49
- # Sum similarities only within ASA bins
50
- filtered_scores = np.zeros(L)
51
- for i in range(L):
52
- group = buried_idx if i in buried_idx else exposed_idx
53
- filtered_scores[i] = np.sum(sim_matrix[i, group])
54
-
55
- # Normalize
56
- norm_scores = 100 * (filtered_scores - np.min(filtered_scores)) / (np.max(filtered_scores) - np.min(filtered_scores))
57
- return norm_scores
58
 
59
  # Inject scores into B-factor column
60
  def inject_bfactors_into_pdb(pdb_file, scores):
@@ -75,21 +51,23 @@ def inject_bfactors_into_pdb(pdb_file, scores):
75
  io.save(out_path)
76
  return out_path
77
 
78
- # Combined Gradio interface
79
- def process(seq, pdb_file):
80
- scores = compute_asa_filtered_scores(seq, pdb_file.name)
81
  pdb_with_scores = inject_bfactors_into_pdb(pdb_file, scores)
82
  return pdb_with_scores
83
 
84
- # Gradio Interface
85
  demo = gr.Interface(
86
  fn=process,
87
  inputs=[
88
  gr.Textbox(label="Input Protein Sequence (1-letter code)"),
89
- gr.File(label="Upload PDB File", file_types=[".pdb"])
 
90
  ],
91
- outputs=gr.File(label="Modified PDB with ASA-filtered Embedding Scores in B-factor Column"),
92
- title="ESM-1b ASA-Aware Residue Scoring for Structural Visualization"
93
  )
94
 
95
  demo.launch()
 
 
3
  import gradio as gr
4
  import matplotlib.pyplot as plt
5
  from transformers import AutoTokenizer, EsmModel
6
+ from sklearn.decomposition import PCA
7
  from Bio.PDB import PDBParser, PDBIO
 
8
  import tempfile
9
  import os
10
 
 
12
  model = EsmModel.from_pretrained("facebook/esm1b_t33_650M_UR50S", output_hidden_states=True)
13
  tokenizer = AutoTokenizer.from_pretrained("facebook/esm1b_t33_650M_UR50S")
14
 
15
+ # Compute scaled PCA values for a selected component
16
+ def compute_scaled_pca_scores(seq, component=0):
17
  inputs = tokenizer(seq, return_tensors="pt")
18
  with torch.no_grad():
19
  outputs = model(**inputs)
20
+ embedding = outputs.last_hidden_state[0] # shape (L+2, d)
21
 
22
  L = len(seq)
23
+ embedding = embedding[1:L+1] # remove CLS and EOS
24
 
25
+ # Run PCA
26
+ pca = PCA(n_components=component + 1)
27
+ pca_result = pca.fit_transform(embedding.detach().cpu().numpy())
28
+ selected_component = pca_result[:, component]
 
 
 
 
 
 
 
 
 
 
 
29
 
30
+ # Scale between 0 and 100 for B-factor compatibility
31
+ scaled = (selected_component - selected_component.min()) / (selected_component.max() - selected_component.min())
32
+ scaled *= 100
33
+ return scaled
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  # Inject scores into B-factor column
36
  def inject_bfactors_into_pdb(pdb_file, scores):
 
51
  io.save(out_path)
52
  return out_path
53
 
54
+ # Gradio interface logic
55
+ def process(seq, pdb_file, component):
56
+ scores = compute_scaled_pca_scores(seq, component)
57
  pdb_with_scores = inject_bfactors_into_pdb(pdb_file, scores)
58
  return pdb_with_scores
59
 
60
+ # Gradio UI
61
  demo = gr.Interface(
62
  fn=process,
63
  inputs=[
64
  gr.Textbox(label="Input Protein Sequence (1-letter code)"),
65
+ gr.File(label="Upload PDB File", file_types=[".pdb"]),
66
+ gr.Number(label="PCA Component (0 = first PC)", value=0, precision=0)
67
  ],
68
+ outputs=gr.File(label="Modified PDB with PCA Component in B-factor Column"),
69
+ title="ESM-1b PCA Component Projection for Structural Mapping"
70
  )
71
 
72
  demo.launch()
73
+