Spaces:
Sleeping
Sleeping
Commit
·
eb181e2
1
Parent(s):
ea67cd9
Delete code/add_3Dalignment_alphafold.py
Browse files
code/add_3Dalignment_alphafold.py
DELETED
@@ -1,273 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
This code file produces alignments between the structure and the sequence for a given protein.
|
3 |
-
|
4 |
-
"""
|
5 |
-
|
6 |
-
import math
|
7 |
-
import glob
|
8 |
-
import numpy as np
|
9 |
-
from Bio import Align
|
10 |
-
import gzip
|
11 |
-
from pathlib import Path
|
12 |
-
from Bio.Align import substitution_matrices
|
13 |
-
from Bio.PDB.Polypeptide import *
|
14 |
-
aligner = Align.PairwiseAligner()
|
15 |
-
import requests
|
16 |
-
from Bio.PDB import PDBParser, PPBuilder
|
17 |
-
from io import StringIO
|
18 |
-
|
19 |
-
|
20 |
-
def convert_non_standard_amino_acids(sequence):
    """
    Replace non-standard or ambiguous one-letter amino acid codes.

    Each symbol of ``sequence`` that appears in the substitution table is
    swapped for its stand-in; every other symbol is kept unchanged. The stop
    symbol ``*`` is dropped. The result is always returned as a ``str``.
    """
    non_standard = ('B', 'Z', 'X', 'U', 'J', 'O', '*')
    # Asx -> Asp, Glx -> Glu, unknown -> Ala, Sec -> Cys, Leu/Ile -> Leu,
    # Pyl -> Lys; the empty string removes the stop symbol entirely.
    stand_ins = ('D', 'E', 'A', 'C', 'L', 'K', '')
    substitutions = dict(zip(non_standard, stand_ins))

    pieces = []
    for symbol in sequence:
        if symbol in substitutions:
            pieces.append(substitutions[symbol])
        else:
            pieces.append(symbol)
    return ''.join(pieces)
|
41 |
-
|
42 |
-
def distance(x1, y1, z1, x2, y2, z2):
    """Return the Euclidean distance between (x1, y1, z1) and (x2, y2, z2)."""
    squared_deltas = (
        math.pow(x2 - x1, 2),
        math.pow(y2 - y1, 2),
        math.pow(z2 - z1, 2),
    )
    # Summed left to right (x, then y, then z) before taking the root.
    return math.sqrt(sum(squared_deltas))
|
47 |
-
|
48 |
-
|
49 |
-
def find_distance(coordMut, coordAnnot):
    """
    Return the distance between two coordinate triples as a formatted string.

    Parameters:
        coordMut: indexable holding x, y, z at positions 0-2 (numbers or
            numeric strings).
        coordAnnot: indexable with the same layout.

    Returns:
        The distance formatted with two decimals (e.g. ``'5.00'``), or the
        string ``'nan'`` when either argument cannot be read as three numbers
        (a NaN float, the string ``'nan'``, ``None``, a short list, ...).

    The function deliberately does not reference ``np.NaN``: that alias was
    removed in NumPy 2.0, and a ``!=`` comparison against NaN is always true,
    so it never detected missing coordinates. Unusable input is detected by
    the conversion below instead, which yields the same ``'nan'`` result.
    """
    try:
        mut_x, mut_y, mut_z = (float(coordMut[axis]) for axis in range(3))
        ann_x, ann_y, ann_z = (float(coordAnnot[axis]) for axis in range(3))
        dist = distance(mut_x, mut_y, mut_z, ann_x, ann_y, ann_z)
    except (TypeError, ValueError, IndexError, KeyError, OverflowError):
        # Not subscriptable, not numeric, too short, or too large to square.
        return 'nan'
    return "%.2f" % dist
|
61 |
-
|
62 |
-
|
63 |
-
# Three-letter residue names mapped to one-letter codes. Besides the twenty
# standard amino acids this covers the ambiguous/rare IUPAC codes that
# convert_non_standard_amino_acids knows how to normalise afterwards.
_THREE_TO_ONE = {
    "ALA": "A",
    "ARG": "R",
    "ASN": "N",
    "ASP": "D",
    "CYS": "C",
    "GLN": "Q",
    "GLU": "E",
    "GLY": "G",
    "HIS": "H",
    "ILE": "I",
    "LEU": "L",
    "LYS": "K",
    "MET": "M",
    "PHE": "F",
    "PRO": "P",
    "SER": "S",
    "THR": "T",
    "TRP": "W",
    "TYR": "Y",
    "VAL": "V",
    "UNK": "X",
    # Asx (Asp or Asn) is 'B'. It used to be mapped to 'O', which is the
    # code for pyrrolysine.
    "ASX": "B",
    "GLX": "Z",
    "SEC": "U",
    "PYL": "O",
}


def threeToOne(variant):
    """
    Convert an upper-case three-letter residue name to its one-letter code.

    Names that are not in the lookup table are returned unchanged, so the
    caller receives the original value back for unrecognised residues.
    """
    return _THREE_TO_ONE.get(variant, variant)
|
109 |
-
|
110 |
-
|
111 |
-
def _locate_annotation(rows, annot):
    """Map sequence position ``annot`` onto the alignment rows.

    ``rows[0]`` is the row in which residues are counted and ``rows[2]`` the
    row of the structure-derived sequence.
    NOTE(review): assumes ``str(alignment)`` yields exactly these three lines
    (sequence / match line / sequence) - confirm for the installed Biopython.

    Returns ``(column, posAtom, realpdbStart)``: the 0-based alignment column
    of the requested residue, its 1-based index among the non-gap symbols of
    ``rows[2]``, and the number of leading gap symbols in ``rows[2]``.
    """
    gap_symbols = ('.', '-')
    sequence_row = rows[0]
    structure_row = rows[2]
    wanted = float(annot)

    # Leading gaps are only skipped up front when the row starts with '.'.
    startGap = 0
    if sequence_row.startswith('.'):
        for symbol in sequence_row:
            if symbol not in gap_symbols:
                break
            startGap += 1

    # Walk the row until the requested residue has been counted.
    countGap = startGap
    countResidue = 0
    for symbol in sequence_row[startGap:]:
        if symbol in gap_symbols:
            countGap += 1
        else:
            countResidue += 1
        if countResidue == wanted:
            break
    column = countResidue + countGap - 1

    # Gaps in the structure row before that column do not consume an atom.
    countGap_pdb = sum(1 for symbol in structure_row[0:column] if symbol in gap_symbols)
    posAtom = countResidue + countGap - countGap_pdb

    realpdbStart = 0
    for symbol in structure_row:
        if symbol not in gap_symbols:
            break
        realpdbStart += 1

    return column, posAtom, realpdbStart


def _resolve_residue(rows, annot, coordinates, residue_numbers):
    """Return ``(coordinate, posAtom, residue_number)`` for position ``annot``.

    Coordinate and residue number are both the string ``'nan'`` when the
    position lies beyond the structure row, is aligned to a ``'-'``, precedes
    the first aligned structure residue, or cannot be looked up in the lists.
    """
    column, posAtom, realpdbStart = _locate_annotation(rows, annot)
    structure_row = rows[2]
    missing = ('nan', posAtom, 'nan')

    if len(structure_row) <= column:
        return missing
    if structure_row[column] == '-':
        return missing
    if not float(annot) >= float(realpdbStart) + 1:
        return missing

    try:
        return coordinates[posAtom - 1], posAtom, residue_numbers[posAtom - 1]
    except (IndexError, KeyError, TypeError):
        return missing


def get_coords(annot, alignments, coords, resnums_for_sasa, mode):
    """
    Look up the structure coordinates that correspond to a sequence position.

    Parameters:
        annot: 1-based sequence position (number or numeric string); in
            mode 2 the string ``'nan'`` marks a missing position.
        alignments: in mode 1 an indexable ``(alignment_list, coordinates,
            residue_numbers)`` of which only the first alignment is used; in
            mode 2 a single alignment object.
        coords: coordinate list, used in mode 2 only.
        resnums_for_sasa: residue-number list, used in mode 2 only.
        mode: 1 or 2, selecting where the inputs are read from.

    Returns:
        ``(coordinate, posAtom, residue_number)``. Coordinate and residue
        number are ``'nan'`` when the position has no structure counterpart.
        Mode 2 returns three float NaNs when ``annot`` is ``'nan'`` or larger
        than 1400. ``None`` is returned for any other mode and for mode 1
        with an empty alignment list.

    Mode 1 previously left the residue number unassigned on the "no
    counterpart" paths and failed at the return statement; both modes now
    share one lookup helper and report ``'nan'`` there.
    """
    # Positions above this limit are not resolved in mode 2.
    # NOTE(review): presumably tied to the length of the structure models
    # handled in mode 2 - confirm with the caller.
    max_position = 1400

    if mode == 1:
        for alignment in alignments[0]:
            rows = str(alignment).strip().split('\n')
            # Only the first alignment is ever evaluated.
            return _resolve_residue(rows, annot, alignments[1], alignments[2])
        return None

    if mode == 2:
        if annot == 'nan' or int(annot) > max_position:
            return np.nan, np.nan, np.nan
        rows = str(alignments).strip().split('\n')
        return _resolve_residue(rows, annot, coords, resnums_for_sasa)

    return None
|
213 |
-
|
214 |
-
|
215 |
-
def _collect_ca_atoms(lines, chain_matches, resnum_end=26):
    """Extract sequence, coordinates and residue numbers from CA ATOM records.

    Records are read by the fixed column positions of the PDB format instead
    of whitespace splitting, so fields that touch each other are still parsed.

    ``chain_matches`` receives the chain-identifier column (an empty string
    for truncated lines) and decides whether the record is kept.
    ``resnum_end`` is the exclusive end column of the residue-number field;
    27 also includes the insertion-code column.

    Returns ``(sequence, coords, resnums)`` with one entry per kept record;
    coordinates are ``[x, y, z]`` lists of stripped strings.
    """
    residues = []
    coords = []
    resnums = []
    for line in lines:
        # The record name occupies the first six columns only; column 7
        # already belongs to the atom serial number.
        if line[0:6].strip() != 'ATOM' or line[13:15].strip() != 'CA':
            continue
        if not chain_matches(line[21:22]):
            continue
        residues.append(threeToOne(line[17:20].strip()))
        # x, y and z are 8 columns wide each, starting at 0-based index 30.
        coords.append([line[30:38].strip(), line[38:46].strip(), line[46:54].strip()])
        resnums.append(line[22:resnum_end].strip())
    return ''.join(residues), coords, resnums


def get_alignments_3D(identifier, model_num, pdb_path, pdbSequence, source, chain, pdbID, mode, path_3D_alignment,file_format = 'gzip'):
    """
    Align a protein sequence against the CA-atom sequence of a structure.

    Parameters:
        identifier, model_num, path_3D_alignment, file_format: accepted for
            interface compatibility; not used by this function.
        pdb_path: path of the structure file, read for the ``'SWISSMODEL'``
            and ``'MODBASE'`` sources.
        pdbSequence: one-letter sequence to align; non-standard codes are
            normalised with ``convert_non_standard_amino_acids`` first.
        source: ``'PDB'`` (file downloaded from files.rcsb.org by ``pdbID``),
            ``'SWISSMODEL'`` or ``'MODBASE'``.
        chain: chain identifier. Matched exactly for ``'PDB'`` and
            case-insensitively (blank chain column also accepted) for
            ``'SWISSMODEL'``; ignored for ``'MODBASE'``.
        pdbID: PDB identifier, used for the ``'PDB'`` source only.
        mode: only ``1`` is supported.

    Returns:
        ``(alignments, coords, resnums_for_sasa)``: the list of local
        alignments (BLOSUM62, gap open -11, gap extend -1) of ``pdbSequence``
        against the structure sequence, plus the coordinate triples and
        residue numbers of the CA atoms in file order.

    Raises:
        ValueError: if ``mode`` or ``source`` is not supported.
        requests.HTTPError: if the download for the ``'PDB'`` source fails.
    """
    pdbSequence = convert_non_standard_amino_acids(pdbSequence)

    if mode != 1:
        raise ValueError(f"Unsupported mode: {mode!r}")

    if source == 'PDB':
        pdb_url = f"https://files.rcsb.org/download/{pdbID}.pdb"
        # A timeout keeps a stalled download from blocking forever.
        response = requests.get(pdb_url, timeout=60)
        response.raise_for_status()
        atomSequence, coords, resnums_for_sasa = _collect_ca_atoms(
            response.text.splitlines(),
            lambda chain_id: chain_id == chain,
            # Keep the insertion code attached to the residue number, as the
            # former whitespace-split parsing did.
            resnum_end=27,
        )
    elif source == 'SWISSMODEL':
        with open(pdb_path, encoding="utf8") as f:
            atomSequence, coords, resnums_for_sasa = _collect_ca_atoms(
                f,
                lambda chain_id: chain_id.upper() == chain.upper() or chain_id == ' ',
            )
    elif source == 'MODBASE':
        with open(pdb_path, encoding="utf8") as f:
            atomSequence, coords, resnums_for_sasa = _collect_ca_atoms(
                f,
                lambda chain_id: True,
            )
    else:
        raise ValueError(f"Unsupported structure source: {source!r}")

    # The module-level aligner is reconfigured on every call.
    aligner.mode = 'local'
    aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
    aligner.open_gap_score = -11
    aligner.extend_gap_score = -1

    atomSequence = convert_non_standard_amino_acids(atomSequence)

    alignments = list(aligner.align(pdbSequence, atomSequence))

    return alignments, coords, resnums_for_sasa
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|