dsk129 committed on
Commit
aa3c5fe
·
verified ·
1 Parent(s): 486ec1d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -33
app.py CHANGED
@@ -12,50 +12,61 @@ import os
12
  model = EsmModel.from_pretrained("facebook/esm1b_t33_650M_UR50S", output_hidden_states=True)
13
  tokenizer = AutoTokenizer.from_pretrained("facebook/esm1b_t33_650M_UR50S")
14
 
15
# Compute scaled PCA values for a selected component
def compute_scaled_pca_scores(seq, component=0):
    """Return one PCA component of the per-residue embeddings, scaled to 0-100.

    The sequence is tokenized and run through the module-level ``model``;
    the rows of the last hidden state that follow the leading special token
    are decomposed with PCA, and the requested component is min-max scaled
    so it can be written into a PDB B-factor column.

    Args:
        seq: Protein sequence in 1-letter code. Whitespace (spaces, line
            breaks from pasting) is removed before tokenization so that
            ``len(seq)`` matches the number of residues.
        component: Zero-based index of the principal component to return.

    Returns:
        1-D NumPy array with one value per embedding row, in ``[0, 100]``.
        If the component is constant across residues, all values are 0
        instead of NaN.

    Raises:
        ValueError: If ``component + 1`` exceeds the number of components
            PCA can extract from the embedding matrix.
    """
    # gr.Number may deliver a float; PCA treats float n_components differently.
    component = int(component)

    # Drop whitespace so the slice below lines up with the residue tokens.
    seq = "".join(seq.split())

    inputs = tokenizer(seq, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state[0]  # rows: CLS, residues, EOS

    L = len(seq)
    embedding = embedding[1:L + 1]  # remove CLS and EOS
    features = embedding.detach().cpu().numpy()

    # Run PCA
    n_components = component + 1
    max_components = min(features.shape)
    if n_components > max_components:
        raise ValueError(
            f"PCA component {component} requested, but only "
            f"{max_components} component(s) can be computed for a sequence "
            f"of length {L}."
        )
    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(features)
    selected_component = pca_result[:, component]

    # Scale between 0 and 100 for B-factor compatibility; a constant
    # component has zero range and is left at 0 to avoid dividing by zero.
    lowest = selected_component.min()
    value_range = selected_component.max() - lowest
    scaled = selected_component - lowest
    if value_range > 0:
        scaled = scaled / value_range
        scaled *= 100
    return scaled
 
34
 
35
# Inject scores into B-factor column
def inject_bfactors_into_pdb(pdb_file, scores):
    """Write per-residue scores into the B-factor column of a PDB structure.

    Residues are visited in file order (all models, chains, residues) and
    the i-th residue receives ``scores[i]`` on every one of its atoms.
    Residues beyond ``len(scores)`` keep their original B-factors.

    Args:
        pdb_file: Uploaded file object exposing a ``.name`` path, or a plain
            path string.
        scores: Sequence of numeric per-residue values.

    Returns:
        Path of a newly created temporary ``.pdb`` file containing the
        modified structure. The file is not deleted automatically.
    """
    # Accept both file-like uploads (with .name) and plain path strings.
    pdb_path = getattr(pdb_file, "name", pdb_file)

    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("prot", pdb_path)

    # zip() stops at the shorter of the two, which replaces the manual
    # counter and bounds check.
    for residue, score in zip(structure.get_residues(), scores):
        for atom in residue:
            atom.bfactor = float(score)

    # Close the handle right away: only the reserved file name is needed,
    # and an open handle would leak (and block re-opening on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdb") as tmp:
        out_path = tmp.name

    writer = PDBIO()
    writer.set_structure(structure)
    writer.save(out_path)
    return out_path
 
 
 
 
 
53
 
54
  # Gradio interface logic
55
def process(seq, pdb_file, component):
    """Gradio callback: score the sequence and write the scores into the PDB.

    Computes the scaled PCA scores for ``seq`` and ``component`` and hands
    them, together with the uploaded ``pdb_file``, to the B-factor injector.
    Returns the path of the resulting PDB file.
    """
    return inject_bfactors_into_pdb(
        pdb_file,
        compute_scaled_pca_scores(seq, component),
    )
 
 
 
 
 
59
 
60
  # Gradio UI
61
  demo = gr.Interface(
@@ -63,11 +74,12 @@ demo = gr.Interface(
63
  inputs=[
64
  gr.Textbox(label="Input Protein Sequence (1-letter code)"),
65
  gr.File(label="Upload PDB File", file_types=[".pdb"]),
66
- gr.Number(label="PCA Component (0 = first PC)", value=0, precision=0)
67
  ],
68
- outputs=gr.File(label="Modified PDB with PCA Component in B-factor Column"),
69
- title="ESM-1b PCA Component Projection for Structural Mapping"
70
  )
71
 
72
  demo.launch()
73
 
 
 
12
  model = EsmModel.from_pretrained("facebook/esm1b_t33_650M_UR50S", output_hidden_states=True)
13
  tokenizer = AutoTokenizer.from_pretrained("facebook/esm1b_t33_650M_UR50S")
14
 
15
# Compute PCA and return scaled values for selected components
def compute_scaled_pca_scores(seq, components):
    """Return the selected PCA components of the embeddings, each scaled to 0-100.

    The sequence is tokenized and run through the module-level ``model``;
    the rows of the last hidden state that follow the leading special token
    are decomposed with a single PCA fit, and every requested component is
    min-max scaled independently so it can be written into a PDB B-factor
    column.

    Args:
        seq: Protein sequence in 1-letter code. Whitespace (spaces, line
            breaks from pasting) is removed before tokenization so that
            ``len(seq)`` matches the number of residues.
        components: Iterable of zero-based principal-component indices.

    Returns:
        List of 1-D NumPy arrays, one per entry of ``components`` and in
        the same order, each in ``[0, 100]``. An empty ``components`` gives
        an empty list. A component that is constant across residues is
        returned as all zeros instead of NaN.

    Raises:
        ValueError: If the largest requested index needs more components
            than PCA can extract from the embedding matrix.
    """
    components = list(components)
    if not components:
        # Nothing requested; also avoids max() failing on an empty list.
        return []

    # Drop whitespace so the slice below lines up with the residue tokens.
    seq = "".join(seq.split())

    inputs = tokenizer(seq, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state[0]

    L = len(seq)
    embedding = embedding[1:L + 1]  # remove CLS and EOS
    features = embedding.detach().cpu().numpy()

    # Fit once with enough components to cover the highest requested index.
    n_components = max(components) + 1
    max_components = min(features.shape)
    if n_components > max_components:
        raise ValueError(
            f"PCA component {max(components)} requested, but only "
            f"{max_components} component(s) can be computed for a sequence "
            f"of length {L}."
        )
    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(features)

    scaled_components = []
    for c in components:
        selected = pca_result[:, c]
        lowest = selected.min()
        value_range = selected.max() - lowest
        scaled = selected - lowest
        # A constant component has zero range; leave it at 0 rather than
        # dividing by zero and filling the B-factor column with NaN.
        if value_range > 0:
            scaled = scaled / value_range * 100
        scaled_components.append(scaled)

    return scaled_components
35
+
36
# Inject scores into B-factor column and save each PDB separately
def inject_bfactors_and_save(pdb_file, scores_list, component_indices):
    """Write each score vector into the B-factor column and save one PDB per vector.

    The structure is parsed once. For every ``(scores, idx)`` pair the
    residues are visited in file order (all models, chains, residues), the
    i-th residue receives ``scores[i]`` on every one of its atoms, and the
    structure is saved to its own temporary file. Residues beyond
    ``len(scores)`` are not touched by that pass.

    Args:
        pdb_file: Uploaded file object exposing a ``.name`` path, or a plain
            path string.
        scores_list: Sequence of per-residue score sequences.
        component_indices: Component index for each entry of
            ``scores_list``; used only to build the ``_PC{idx}.pdb`` file
            suffix.

    Returns:
        List of paths to the temporary PDB files, in input order. The files
        are not deleted automatically.
    """
    # Accept both file-like uploads (with .name) and plain path strings.
    pdb_path = getattr(pdb_file, "name", pdb_file)

    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("prot", pdb_path)
    output_paths = []

    for scores, idx in zip(scores_list, component_indices):
        # zip() stops at the shorter of the two, which replaces the manual
        # counter and bounds check.
        for residue, score in zip(structure.get_residues(), scores):
            for atom in residue:
                atom.bfactor = float(score)

        # Close the handle right away: only the reserved file name is
        # needed, and an open handle would leak (and block re-opening on
        # Windows).
        with tempfile.NamedTemporaryFile(delete=False, suffix=f"_PC{idx}.pdb") as tmp:
            out_path = tmp.name

        writer = PDBIO()
        writer.set_structure(structure)
        writer.save(out_path)
        output_paths.append(out_path)

    return output_paths
59
 
60
  # Gradio interface logic
61
def process(seq, pdb_file, component_string):
    """Gradio callback: build one B-factor-annotated PDB per requested component.

    Args:
        seq: Protein sequence text from the UI.
        pdb_file: Uploaded PDB file from the UI.
        component_string: Comma-separated component indices, e.g. ``"0,1,2"``.
            Entries that are not plain non-negative integers are ignored.

    Returns:
        List of paths to the generated PDB files, one per parsed component.

    Raises:
        gr.Error: If the sequence or PDB file is missing, if no valid
            component index could be parsed, or if the PCA step rejects the
            request (for example more components than the sequence allows).
            Raising instead of returning an error tuple keeps the return
            value compatible with the single file output.
    """
    if not seq or not seq.strip():
        raise gr.Error("Error: Please input a protein sequence.")
    if pdb_file is None:
        raise gr.Error("Error: Please upload a PDB file.")

    # isdecimal() only admits characters int() can parse, so no exception
    # handling is needed around the conversion.
    entries = (entry.strip() for entry in (component_string or "").split(","))
    components = [int(entry) for entry in entries if entry.isdecimal()]
    if not components:
        raise gr.Error("Error: Please input a comma-separated list of integers.")

    try:
        scores_list = compute_scaled_pca_scores(seq, components)
    except ValueError as err:
        # Surface PCA/shape problems in the UI instead of a bare traceback.
        raise gr.Error(str(err)) from err

    pdb_paths = inject_bfactors_and_save(pdb_file, scores_list, components)
    return pdb_paths
70
 
71
  # Gradio UI
72
  demo = gr.Interface(
 
74
  inputs=[
75
  gr.Textbox(label="Input Protein Sequence (1-letter code)"),
76
  gr.File(label="Upload PDB File", file_types=[".pdb"]),
77
+ gr.Textbox(label="Comma-separated PCA Components (e.g. 0,1,2)")
78
  ],
79
+ outputs=gr.File(label="Download PDBs with PCA Projections", file_types=[".pdb"], file_count="multiple"),
80
+ title="ESM-1b PCA Component Projection: Multi-PC Structural Mapping"
81
  )
82
 
83
  demo.launch()
84
 
85
+