fatmacankara committed on
Commit
eb181e2
·
1 Parent(s): ea67cd9

Delete code/add_3Dalignment_alphafold.py

Browse files
Files changed (1) hide show
  1. code/add_3Dalignment_alphafold.py +0 -273
code/add_3Dalignment_alphafold.py DELETED
@@ -1,273 +0,0 @@
1
- """
2
- This code file produces alignments between the structure and the sequence for a given protein.
3
-
4
- """
5
-
6
- import math
7
- import glob
8
- import numpy as np
9
- from Bio import Align
10
- import gzip
11
- from pathlib import Path
12
- from Bio.Align import substitution_matrices
13
- from Bio.PDB.Polypeptide import *
14
- aligner = Align.PairwiseAligner()
15
- import requests
16
- from Bio.PDB import PDBParser, PPBuilder
17
- from io import StringIO
18
-
19
-
20
def convert_non_standard_amino_acids(sequence):
    """
    Replace non-standard or ambiguous one-letter amino acid codes.

    Each element of ``sequence`` that has an entry in the replacement table
    is swapped for that entry; every other element is kept as it is. The
    stop symbol ``*`` maps to the empty string, so it is removed. The pieces
    are joined into a single string, which is returned.
    """
    # Ambiguous / rare code -> closest standard residue.
    closest_standard = {
        'B': 'D',  # Asx -> aspartic acid
        'Z': 'E',  # Glx -> glutamic acid
        'X': 'A',  # unknown -> alanine placeholder
        'U': 'C',  # selenocysteine -> cysteine
        'J': 'L',  # Leu/Ile -> leucine
        'O': 'K',  # pyrrolysine -> lysine
        '*': '',   # stop symbol is dropped
    }
    return ''.join(
        closest_standard.get(residue, residue) for residue in sequence
    )
41
-
42
def distance(x1, y1, z1, x2, y2, z2):
    """Return the Euclidean distance between (x1, y1, z1) and (x2, y2, z2)."""
    delta_x = x2 - x1
    delta_y = y2 - y1
    delta_z = z2 - z1
    squared_sum = (
        math.pow(delta_x, 2) + math.pow(delta_y, 2) + math.pow(delta_z, 2)
    )
    return math.sqrt(squared_sum)
47
-
48
-
49
def find_distance(coordMut, coordAnnot):
    """
    Return the distance between two 3D coordinates as a formatted string.

    The first three items of ``coordMut`` and ``coordAnnot`` are converted
    with ``float`` and passed to ``distance``.

    Returns:
        The distance formatted with two decimals (``"%.2f"``), or the string
        ``'nan'`` when either argument cannot be indexed or converted (for
        example a missing value, a placeholder string, or a coordinate with
        fewer than three items).

    Notes:
        The previous implementation guarded the computation with
        ``coordMut != np.NaN``. That comparison is always true for scalars
        (NaN never compares equal), raises for multi-element NumPy arrays,
        and fails outright on NumPy 2.0 where the ``np.NaN`` alias no longer
        exists. Unusable input therefore always ended in the ``'nan'``
        fallback; that observable result is kept here without the guard.
    """
    try:
        dist = distance(
            float(coordMut[0]), float(coordMut[1]), float(coordMut[2]),
            float(coordAnnot[0]), float(coordAnnot[1]), float(coordAnnot[2]),
        )
    except (TypeError, ValueError, LookupError, OverflowError):
        # TypeError: not subscriptable / not convertible (None, float NaN).
        # ValueError: non-numeric text such as the characters of 'nan'.
        # LookupError: fewer than three items.
        # OverflowError: magnitude too large for the distance computation.
        return 'nan'
    return "%.2f" % dist
61
-
62
-
63
# Three-letter residue name -> one-letter code.
_THREE_TO_ONE = {
    "ALA": "A",
    "ARG": "R",
    "VAL": "V",
    "GLU": "E",
    "PRO": "P",
    "LEU": "L",
    "GLY": "G",
    "ASN": "N",
    "SER": "S",
    "GLN": "Q",
    "THR": "T",
    "MET": "M",
    "LYS": "K",
    "ASP": "D",
    "ILE": "I",
    "PHE": "F",
    "TRP": "W",
    "TYR": "Y",
    "HIS": "H",
    "CYS": "C",
    "UNK": "X",
    # Ambiguous / rare residues. ASX used to be mapped to 'O', which is the
    # pyrrolysine letter; the one-letter code for Asx is 'B'.
    "ASX": "B",
    "GLX": "Z",
    "SEC": "U",
    "PYL": "O",
    "XLE": "J",
}


def threeToOne(variant):
    """
    Convert an upper-case three-letter residue name to its one-letter code.

    Args:
        variant: Residue name such as ``"ALA"``. The lookup is
            case-sensitive.

    Returns:
        The one-letter code when ``variant`` is a known name; otherwise
        ``variant`` itself, unchanged (so a value that is already a
        one-letter code passes straight through).
    """
    try:
        return _THREE_TO_ONE.get(variant, variant)
    except TypeError:
        # Unhashable input (e.g. a list) cannot be a residue name; hand it
        # back untouched, as the comparison chain used to do.
        return variant
109
-
110
-
111
# Characters treated as gaps in a rendered alignment row.
_GAP_SYMBOLS = ('.', '-')

# In mode 2, positions above this value are never looked up.
# NOTE(review): presumably the length of one structure-model fragment -- confirm.
_MODE2_MAX_POSITION = 1400


def _locate_position(alignment_rows, annot):
    """
    Map a sequence position onto the structure row of one alignment.

    ``alignment_rows`` is the rendered alignment split into lines; row 0 is
    walked to find the ``annot``-th non-gap character and row 2 is inspected
    at the same column.

    Returns:
        ``(posAtom, in_structure)`` where ``posAtom`` is the 1-based count of
        non-gap characters in row 2 up to that column and ``in_structure``
        tells whether row 2 holds a residue (not ``'-'``) there.
    """
    sequence_row = alignment_rows[0]
    structure_row = alignment_rows[2]
    target = float(annot)

    # Leading gaps are pre-counted only when the row starts with '.', exactly
    # as before; the walk below then resumes after them.
    startGap = 0
    if sequence_row.startswith('.'):
        for symbol in sequence_row:
            if symbol in _GAP_SYMBOLS:
                startGap += 1
            else:
                break

    countGap = startGap
    countResidue = 0
    for symbol in sequence_row[startGap:]:
        if symbol in _GAP_SYMBOLS:
            countGap += 1
        else:
            countResidue += 1
        if countResidue == target:
            break

    column = countResidue + countGap - 1
    countGap_pdb = sum(
        1 for symbol in structure_row[0:column] if symbol in _GAP_SYMBOLS
    )
    posAtom = countResidue + countGap - countGap_pdb

    # Number of leading gap characters in the structure row.
    realpdbStart = len(structure_row) - len(structure_row.lstrip('.-'))

    in_structure = (
        len(structure_row) > column
        and structure_row[column] != '-'
        and target >= float(realpdbStart) + 1
    )
    return posAtom, in_structure


def _residue_at(alignment_rows, annot, coordinates, residue_numbers):
    """Return ``(coordinate, posAtom, residue_number)`` with ``'nan'`` placeholders on a miss."""
    posAtom, in_structure = _locate_position(alignment_rows, annot)
    if not in_structure:
        return 'nan', posAtom, 'nan'
    try:
        coordWeWant = coordinates[posAtom - 1]
        residue_number_we_want = residue_numbers[posAtom - 1]
    except (LookupError, TypeError):
        # Position outside the coordinate tables, or tables not indexable.
        return 'nan', posAtom, 'nan'
    return coordWeWant, posAtom, residue_number_we_want


def get_coords(annot, alignments, coords, resnums_for_sasa, mode):
    """
    Look up the structure coordinate that corresponds to a sequence position.

    Assumes ``str(alignment)`` renders as three lines: first sequence, match
    line, second sequence. NOTE(review): confirm this for the Biopython
    version in use.

    Args:
        annot: 1-based position, counted over the non-gap characters of the
            first alignment row. In mode 2 the string ``'nan'`` means
            "no position".
        alignments: Mode 1: an indexable of
            ``(alignment list, coordinates, residue numbers)``; only the
            first alignment of the list is used. Mode 2: a single alignment.
        coords: Coordinate table, used in mode 2 only.
        resnums_for_sasa: Residue-number table, used in mode 2 only.
        mode: ``1`` or ``2``.

    Returns:
        ``(coordinate, posAtom, residue_number)``. ``coordinate`` and
        ``residue_number`` are the string ``'nan'`` when the position falls
        on a gap, before the first structure residue, or outside the tables.
        ``(nan, nan, nan)`` (float NaN) is returned when mode 2 receives
        ``annot == 'nan'`` or a position above 1400, when mode 1 receives no
        alignment, or when ``mode`` is neither 1 nor 2.

    Notes:
        * Mode 1 previously left the residue number unassigned on every
          fallback path, so the ``return`` raised ``UnboundLocalError``; it
          now returns ``'nan'`` like mode 2, and shares mode 2's row-length
          guard.
        * ``np.nan`` replaces ``np.NaN``, which NumPy 2.0 removed.
        * NOTE(review): indentation was lost in the reviewed copy of this
          file. This version assumes the mode-1 ``return`` sat inside the
          loop (first alignment wins) and that the two trailing
          ``else: return NaN`` clauses belonged to the ``annot`` checks.
    """
    if mode == 1:
        for alignment in alignments[0]:
            alignment_rows = str(alignment).strip().split('\n')
            try:
                coordinates, residue_numbers = alignments[1], alignments[2]
            except (LookupError, TypeError):
                # Missing tables behave like empty ones: lookups yield 'nan'.
                coordinates = residue_numbers = ()
            return _residue_at(alignment_rows, annot, coordinates, residue_numbers)
        return np.nan, np.nan, np.nan

    if mode == 2:
        if annot == 'nan' or int(annot) > _MODE2_MAX_POSITION:
            return np.nan, np.nan, np.nan
        alignment_rows = str(alignments).strip().split('\n')
        return _residue_at(alignment_rows, annot, coords, resnums_for_sasa)

    return np.nan, np.nan, np.nan
213
-
214
-
215
# Seconds to wait for the RCSB download before giving up.
_PDB_DOWNLOAD_TIMEOUT = 30


def _read_ca_atoms(lines, accept_chain, resnum_stop=26):
    """
    Collect the CA atoms of ATOM records from PDB-format lines.

    Fields are read by fixed column position, so records whose fields touch
    each other (four-digit residue numbers, wide negative coordinates) are
    still split correctly.

    Args:
        lines: Iterable of PDB-format text lines.
        accept_chain: Callable receiving the chain identifier character and
            returning whether the record should be kept.
        resnum_stop: Exclusive end index of the residue-number slice; 27
            also keeps the insertion-code column.

    Returns:
        ``(one-letter sequence, [[x, y, z], ...], [residue number, ...])``
        with coordinates and residue numbers kept as strings.
    """
    residues = []
    coords = []
    resnums = []
    for line in lines:
        if line[0:6].strip() != 'ATOM' or line[12:16].strip() != 'CA':
            continue
        if not accept_chain(line[21:22]):
            continue
        residues.append(threeToOne(line[17:20].strip()))
        # x, y, z occupy columns 31-38, 39-46 and 47-54 (1-based).
        coords.append([line[30:38].strip(), line[38:46].strip(), line[46:54].strip()])
        resnums.append(line[22:resnum_stop].strip())
    return ''.join(residues), coords, resnums


def get_alignments_3D(identifier, model_num, pdb_path, pdbSequence, source, chain, pdbID, mode,
                      path_3D_alignment, file_format='gzip'):
    """
    Align a protein sequence against the CA-atom sequence of a structure.

    The structure is read from RCSB (``source == 'PDB'``, downloaded by
    ``pdbID``) or from the file at ``pdb_path`` (``'SWISSMODEL'`` or
    ``'MODBASE'``). Both sequences go through
    ``convert_non_standard_amino_acids`` and are aligned with the module-level
    ``aligner`` (local mode, BLOSUM62, gap open -11, gap extend -1).

    Args:
        identifier, model_num, path_3D_alignment, file_format: Accepted for
            interface compatibility; not used by this function.
        pdb_path: Structure file for the SWISSMODEL / MODBASE sources.
        pdbSequence: Sequence to align (first argument of the aligner).
        source: ``'PDB'``, ``'SWISSMODEL'`` or ``'MODBASE'``.
        chain: Chain to extract. Matched exactly for PDB; case-insensitively
            for SWISSMODEL, where records with a blank chain are kept too;
            ignored for MODBASE.
        pdbID: PDB identifier used to build the download URL.
        mode: Only ``1`` is supported.

    Returns:
        ``(alignments, coords, resnums_for_sasa)``: the list of alignments,
        the CA coordinates as ``[x, y, z]`` string triples and the matching
        residue numbers as strings.

    Raises:
        ValueError: Unsupported ``mode`` or ``source``. The original assigns
            no atom sequence in that case, so there is nothing to align.
        requests.HTTPError: The PDB download returned an error status.

    Notes:
        * The PDB branch used to split ATOM records on whitespace; records
          with fused fields (e.g. chain and residue number ``A1000``) were
          silently dropped or misread. It now uses column slices and keeps
          the insertion code in the residue number, as the split did.
        * The file branches sliced coordinates one column too far right,
          cutting the sign or leading digit off full-width values such as
          ``-123.456``.
        * The download now has a timeout so a stalled connection cannot hang
          the run.
    """
    pdbSequence = convert_non_standard_amino_acids(pdbSequence)

    if mode != 1:
        raise ValueError(f"Unsupported mode for get_alignments_3D: {mode!r}")

    if source == 'PDB':
        pdb_url = f"https://files.rcsb.org/download/{pdbID}.pdb"
        response = requests.get(pdb_url, timeout=_PDB_DOWNLOAD_TIMEOUT)
        response.raise_for_status()  # Check for a successful response
        atomSequence, coords, resnums_for_sasa = _read_ca_atoms(
            response.text.splitlines(),
            lambda chain_id: chain_id == chain,
            resnum_stop=27,
        )
    elif source == 'SWISSMODEL':
        wanted_chain = chain.upper()
        with open(pdb_path, encoding="utf8") as f:
            atomSequence, coords, resnums_for_sasa = _read_ca_atoms(
                f,
                lambda chain_id: chain_id.upper() == wanted_chain or chain_id == ' ',
            )
    elif source == 'MODBASE':
        with open(pdb_path, encoding="utf8") as f:
            atomSequence, coords, resnums_for_sasa = _read_ca_atoms(
                f,
                lambda chain_id: True,
            )
    else:
        raise ValueError(f"Unsupported structure source: {source!r}")

    aligner.mode = 'local'
    aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
    aligner.open_gap_score = -11
    aligner.extend_gap_score = -1

    atomSequence = convert_non_standard_amino_acids(atomSequence)

    alignments = list(aligner.align(pdbSequence, atomSequence))

    return alignments, coords, resnums_for_sasa