dsk129 committed on
Commit
14b630e
·
verified ·
1 Parent(s): 4c8a467

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -119
app.py CHANGED
@@ -1,140 +1,70 @@
1
- #-------------------------------------------------------libraries------------------------------------------------------------------------------------
2
  import torch
3
  import numpy as np
4
  import gradio as gr
5
  import matplotlib.pyplot as plt
6
  from transformers import AutoTokenizer, EsmModel
7
- from sklearn.decomposition import PCA
8
  from sklearn.metrics.pairwise import cosine_similarity
 
 
 
 
9
 
10
- #----------------------------------------------------Analysis------------------------------------------------------------------------------------
11
- #--load model and tokenizer
12
  model = EsmModel.from_pretrained("facebook/esm1b_t33_650M_UR50S", output_hidden_states=True)
13
  tokenizer = AutoTokenizer.from_pretrained("facebook/esm1b_t33_650M_UR50S")
14
 
15
- #--confirm proper installation
16
- import torch, transformers
17
- print("Torch version:", torch.__version__)
18
- print("Transformers version:", transformers.__version__)
19
-
20
- #import torch
21
- print("Torch NumPy test:", torch.ones(1).numpy())
22
-
23
- '''
24
- #--principal component plot
25
- def extract_and_plot(seq, layer=-1):
26
- #--preprocess sequence
27
  inputs = tokenizer(seq, return_tensors="pt")
28
-
29
- #--forward pass
30
  with torch.no_grad():
31
  outputs = model(**inputs)
32
- hidden_states = outputs.hidden_states #--> tuple: (layer0, ..., layer_final)
33
-
34
- #--select hidden state from specified layer
35
- if layer == 1:
36
- embedding = hidden_states[-1][0] #--> (seq_len, hidden_dim)
37
- else:
38
- embedding = hidden_states[layer][0]
39
-
40
- #--PCA
41
- pca = PCA(n_components=2)
42
- coords = pca.fit_transform(embedding.numpy())
43
 
44
- #--plot
45
- plt.figure(figsize=(6, 4))
46
- plt.scatter(coords[:, 0], coords[:, 1])
47
- plt.title(f"PCA of esm1b embeddings (layer {layer})")
48
- plt.xlabel("PCA1")
49
- plt.ylabel("PCA2")
50
- plt.tight_layout()
51
-
52
- return plt
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  demo = gr.Interface(
55
- fn=extract_and_plot,
56
  inputs=[
57
- gr.Textbox(label="Protein Sequence"),
58
- gr.Slider(minimum=0, maximum=33, step=1, value=33, label="Layer (-1 = final)")
59
  ],
60
- outputs=gr.Plot()
 
61
  )
62
 
63
  demo.launch()
64
- '''
65
-
66
- import torch
67
- import gradio as gr
68
- import matplotlib.pyplot as plt
69
- import numpy as np
70
- from sklearn.metrics.pairwise import cosine_similarity
71
- from transformers import AutoTokenizer, EsmModel
72
-
73
# Load model: the encoder and its tokenizer come from the same checkpoint,
# so the identifier is written once and shared by both calls.
ESM_CHECKPOINT = "facebook/esm1b_t33_650M_UR50S"
model = EsmModel.from_pretrained(ESM_CHECKPOINT, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(ESM_CHECKPOINT)
76
-
77
# Coarse hydrophobicity classes keyed by one-letter amino-acid code.
# Any character in neither set (for example G, P or C) is reported as "other".
nonpolar = set("AFLIVMYW")
polar = set("QERSDHKNT")

# Single lookup table built from the two sets above; avoids a nested
# conditional expression per residue.
_RESIDUE_CLASS = {
    **dict.fromkeys(nonpolar, "nonpolar"),
    **dict.fromkeys(polar, "polar"),
}


def classify_residues(seq):
    """Label each residue of *seq* as "nonpolar", "polar" or "other".

    Args:
        seq: Protein sequence in one-letter code. Letter case is ignored,
            so "a" and "A" receive the same label.

    Returns:
        A list with one label per character of *seq*, in order. An empty
        sequence yields an empty list.
    """
    return [_RESIDUE_CLASS.get(aa.upper(), "other") for aa in seq]
83
-
84
def compute_cosine_heatmap(seq):
    """Draw a residue-by-residue cosine-similarity heatmap for *seq*.

    The sequence is run through the module-level ``tokenizer`` and ``model``,
    the last hidden state is sliced down to one row per residue, and the
    pairwise cosine similarity of those rows is shown as an image. Tick
    labels carry the hydrophobicity class returned by ``classify_residues``
    and are coloured by class.

    Args:
        seq: Protein sequence in one-letter code. Whitespace (spaces, line
            breaks from a pasted sequence) is removed before processing.

    Returns:
        The matplotlib ``Figure`` holding the heatmap.

    Raises:
        ValueError: If *seq* contains no residues after whitespace removal.
    """
    # Build the figure without pyplot: figures created through pyplot stay
    # registered until explicitly closed, which leaks one figure per request
    # in a long-running app.
    from matplotlib.figure import Figure

    # Whitespace is not a residue; leaving it in would make len(seq) disagree
    # with the number of residue rows and misalign the slice below.
    seq = "".join(seq.split())
    if not seq:
        raise ValueError("Protein sequence is empty.")

    # Tokenize and embed (inference only, no gradients).
    inputs = tokenizer(seq, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state[0]

    # Keep positions 1..n_res, dropping index 0 and anything after the last
    # residue (the special start/end tokens, if present).
    n_res = len(seq)
    embedding = embedding[1:n_res + 1]

    # Cosine similarity matrix between every pair of residue embeddings.
    sim_matrix = cosine_similarity(embedding.detach().cpu().numpy())

    # Residue classification and the colour used for each class label.
    residue_classes = classify_residues(seq)
    class_colors = {
        "nonpolar": "magenta",
        "polar": "indigo",
        "other": "steelblue",
    }
    row_colors = [class_colors[c] for c in residue_classes]

    # Plot heatmap.
    fig = Figure(figsize=(8, 6))
    ax = fig.subplots()
    im = ax.imshow(sim_matrix, cmap="viridis")
    fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    ax.set_title("Residue–Residue Cosine Similarity")
    ax.set_xlabel("Residue Index")
    ax.set_ylabel("Residue Index")

    # Hide the frame and tick marks so only the coloured labels remain.
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_xticks(range(n_res))
    ax.set_yticks(range(n_res))
    ax.tick_params(length=0)

    # One class label per residue on both axes, coloured by class.
    ax.set_xticklabels(residue_classes, rotation=90, fontsize=6)
    ax.set_yticklabels(residue_classes, fontsize=6)
    for label, color in zip(ax.get_xticklabels(), row_colors):
        label.set_color(color)
    for label, color in zip(ax.get_yticklabels(), row_colors):
        label.set_color(color)

    fig.tight_layout()
    return fig
132
-
133
# Gradio UI: one text box for the sequence in, one plot out.
sequence_box = gr.Textbox(label="Input Protein Sequence (1-letter code)")
heatmap_plot = gr.Plot()

demo = gr.Interface(
    fn=compute_cosine_heatmap,
    inputs=sequence_box,
    outputs=heatmap_plot,
)

demo.launch()
 
1
+ #-------------------------------------------------libraries---------------------------------------------------------------------------------------------------------------
2
  import torch
3
  import numpy as np
4
  import gradio as gr
5
  import matplotlib.pyplot as plt
6
  from transformers import AutoTokenizer, EsmModel
 
7
  from sklearn.metrics.pairwise import cosine_similarity
8
+ from Bio.PDB import PDBParser, PDBIO
9
+ from Bio.PDB.StructureBuilder import StructureBuilder
10
+ import tempfile
11
+ import os
12
 
13
+ #----------------------------------------------------Analysis--------------------------------------------------------------------------------------------------------
14
# Load ESM-1b model and tokenizer. Both come from the same checkpoint, so the
# identifier is written once and shared by the two calls.
ESM_CHECKPOINT = "facebook/esm1b_t33_650M_UR50S"
model = EsmModel.from_pretrained(ESM_CHECKPOINT, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(ESM_CHECKPOINT)
17
 
18
def compute_residue_scores(seq):
    """Score every residue of *seq* from its embedding similarity to the rest.

    The sequence is embedded with the module-level ``tokenizer`` and
    ``model``. The pairwise cosine similarity of the per-residue rows of the
    last hidden state is computed, each row of that matrix is summed, and the
    sums are min-max scaled to the range 0-100.

    Args:
        seq: Protein sequence in one-letter code. Whitespace (spaces, line
            breaks from a pasted sequence) is removed before processing.

    Returns:
        A 1-D NumPy array with one score per residue, scaled to [0, 100].
        If all raw scores are equal (for example a single-residue sequence)
        an array of zeros is returned instead of NaN values.

    Raises:
        ValueError: If *seq* contains no residues after whitespace removal.
    """
    # Whitespace is not a residue; leaving it in would make len(seq) disagree
    # with the number of residue rows and misalign the slice below.
    seq = "".join(seq.split())
    if not seq:
        raise ValueError("Protein sequence is empty.")

    inputs = tokenizer(seq, return_tensors="pt")
    with torch.no_grad():  # inference only
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state[0]

    # Keep positions 1..n_res: drops index 0 and anything past the last
    # residue, leaving one row per residue.
    n_res = len(seq)
    embedding = embedding[1:n_res + 1]

    sim_matrix = cosine_similarity(embedding.detach().cpu().numpy())
    residue_scores = np.sum(sim_matrix, axis=1)

    lowest = np.min(residue_scores)
    span = np.max(residue_scores) - lowest
    if span == 0:
        # Min-max scaling is undefined here (0 / 0); returning zeros keeps
        # NaN out of whatever consumes the scores.
        return np.zeros_like(residue_scores)

    return 100 * (residue_scores - lowest) / span
32
+
33
def inject_bfactors_into_pdb(pdb_file, scores):
    """Write *scores* into the B-factor field of a PDB structure.

    Residues are visited in file order (every model, every chain) and the
    i-th residue visited receives ``scores[i]`` on all of its atoms. Residues
    beyond ``len(scores)`` keep their original B-factors. The result is saved
    to a new temporary ``.pdb`` file that is not deleted automatically.

    Args:
        pdb_file: The uploaded structure, either a filesystem path string or
            an object exposing the path as ``.name``.
        scores: Sequence of per-residue numeric scores.

    Returns:
        Path of the newly written PDB file.
    """
    # Accept both a plain path and a file-like wrapper carrying ``.name``.
    pdb_path = pdb_file if isinstance(pdb_file, str) else pdb_file.name

    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("prot", pdb_path)

    # zip() stops at the shorter of the two, so surplus residues are left
    # untouched and surplus scores are ignored.
    # NOTE(review): every residue record is counted, including any
    # hetero/water residues, and the index runs on across models — confirm
    # this matches how the input sequence is expected to align.
    for residue, score in zip(structure.get_residues(), scores):
        value = float(score)
        for atom in residue:
            atom.bfactor = value

    # Reserve a unique output path and close the handle straight away so it
    # is not leaked; PDBIO reopens the file by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdb") as tmp:
        out_path = tmp.name

    io = PDBIO()
    io.set_structure(structure)
    io.save(out_path)

    return out_path
53
+
54
def process(seq, pdb_file):
    """Gradio callback: score *seq* and write the scores into *pdb_file*.

    Args:
        seq: Protein sequence text from the input box.
        pdb_file: The uploaded PDB file as passed in by the file component,
            or ``None`` when nothing was uploaded.

    Returns:
        Path of the PDB file whose B-factor column holds the scores.

    Raises:
        gr.Error: If the sequence is blank or no PDB file was uploaded, so
            the message is shown to the user rather than a generic failure.
    """
    if not seq or not seq.strip():
        raise gr.Error("Please enter a protein sequence.")
    if pdb_file is None:
        raise gr.Error("Please upload a PDB file.")

    scores = compute_residue_scores(seq)
    return inject_bfactors_into_pdb(pdb_file, scores)
58
+
59
# Gradio Interface: sequence text and a PDB upload in, a downloadable PDB out.
sequence_input = gr.Textbox(label="Input Protein Sequence (1-letter code)")
structure_input = gr.File(label="Upload PDB File", file_types=[".pdb"])
scored_structure_output = gr.File(label="Modified PDB with Scores in B-factor Column")

demo = gr.Interface(
    fn=process,
    inputs=[sequence_input, structure_input],
    outputs=scored_structure_output,
    title="ESM-1b Residue Scoring: B-factor Injection for Structural Visualization",
)

demo.launch()