fatmacankara committed on
Commit
97b1b63
·
1 Parent(s): 993c0d3

Delete code/add_alignment.py

Browse files
Files changed (1) hide show
  1. code/add_alignment.py +0 -350
code/add_alignment.py DELETED
@@ -1,350 +0,0 @@
1
from Bio import Align
from Bio.Align import substitution_matrices
from pathlib import Path

# Module-level pairwise aligner shared by every do_alignment() call; its mode,
# substitution matrix and gap scores are assigned inside do_alignment() before
# each alignment is computed.
aligner = Align.PairwiseAligner()
from Bio.pairwise2 import format_alignment
7
-
8
def convert_non_standard_amino_acids(sequence):
    """
    Replace non-standard or ambiguous amino acid codes by a standard residue.

    Each element of ``sequence`` is looked up in a fixed substitution table:
    B->D, Z->E, X->A, U->C, J->L, O->K, and '*' is dropped. Any element not in
    the table is kept unchanged (the lookup is case-sensitive).

    Returns the converted sequence as a single string.
    """
    # One-letter codes that have no column of their own in the standard
    # 20-residue alphabet, mapped to the residue used in their place.
    closest_standard = dict(B='D', Z='E', X='A', U='C', J='L', O='K')
    # Stop symbol: mapped to the empty string so it disappears from the output.
    closest_standard['*'] = ''

    residues = []
    for residue in sequence:
        residues.append(closest_standard.get(residue, residue))
    return ''.join(residues)
29
def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
    """
    Locally align a UniProt sequence against a PDB sequence.

    Both sequences are first passed through convert_non_standard_amino_acids().
    The shared module-level ``aligner`` is configured for local alignment with
    BLOSUM62, gap-open score -11 and gap-extend score -1, and every alignment
    it yields is converted to text.

    Parameters:
        identifier: label printed in the progress message only.
        uniprotSequence: sequence passed as the first argument to the aligner.
        pdbSequence: sequence passed as the second argument to the aligner.
        alignment_path: not used by this function.

    Returns:
        A list with one entry per alignment. Each entry is the list of text
        lines of ``str(alignment)`` (stripped, split on newlines) with every
        space replaced by '.'.
    """
    print(f'Aligning Datapoint: {identifier}')
    if len(pdbSequence) < 1:
        # NOTE(review): the source this was recovered from had lost its
        # indentation; it is assumed the whole body (including the return)
        # lived under `if len(pdbSequence) >= 1`, so an empty PDB sequence
        # yields None. Confirm against the original file.
        return None

    target = convert_non_standard_amino_acids(uniprotSequence)
    query = convert_non_standard_amino_acids(pdbSequence)

    aligner.mode = 'local'
    aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
    aligner.open_gap_score = -11
    aligner.extend_gap_score = -1

    alignment_list = []
    for found in list(aligner.align(target, query)):
        rows = str(found).strip().split('\n')
        # Padding spaces become '.' so downstream code can treat them like gaps.
        alignment_list.append([row.replace(' ', '.') for row in rows])
    return alignment_list
46
-
47
-
48
def mutation_position_on_pdb(alignment_list, pos):
    """
    Map a 1-based UniProt residue position onto PDB numbering.

    Each alignment is a sequence of text rows where row 0 is the UniProt
    sequence, row 1 the match symbols and row 2 the PDB sequence; '.' and '-'
    are treated as gap characters. Alignments are tried in order and the
    search stops at the first one whose match symbol at the position's column
    is '|' or 'X'.

    Parameters:
        alignment_list: non-empty list of alignments as described above.
        pos: residue position (anything accepted by ``int``), counted over the
            non-gap characters of row 0.

    Returns:
        A tuple ``(pdb_alignStatus, mutationPositionOnPDB, startGap, alignment)``:
        * pdb_alignStatus: 'aligned' (same residue in both rows), 'aligned*'
          (different residues) or 'not_aligned'.
        * mutationPositionOnPDB: the position as a string, counted over the
          non-gap characters of row 2, or 'nan'.
        * startGap: number of leading gap characters in row 0 of the returned
          alignment.
        * alignment: the alignment the search stopped on (the last one tried
          when none aligned).

    Raises:
        ValueError: if ``alignment_list`` is empty, or ``pos`` is not an integer.

    Compared with the previous implementation, a position that is never
    reached in row 0, or that lies beyond the end of row 1 / row 2, is reported
    as 'not_aligned' instead of being mapped to the last column or raising,
    and the status is always defined when the function returns.
    """
    if not alignment_list:
        raise ValueError('alignment_list is empty; there is no alignment to map the position onto')

    gap_chars = ('.', '-')
    target_pos = int(pos)
    pdb_alignStatus = 'not_aligned'
    mutationPositionOnPDB = 'nan'
    startGap = 0
    which_alignment_to_go = 0

    for alignment in alignment_list:
        which_alignment_to_go += 1
        alignment_uniprot = alignment[0]
        match_line = alignment[1]
        alignment_pdb = alignment[2]

        # Leading gap columns of the UniProt row are skipped before counting.
        startGap = len(alignment_uniprot) - len(alignment_uniprot.lstrip('.-'))

        # Find the alignment column holding the pos-th UniProt residue.
        column = None
        residues_seen = 0
        for index in range(startGap, len(alignment_uniprot)):
            if alignment_uniprot[index] not in gap_chars:
                residues_seen += 1
                if residues_seen == target_pos:
                    column = index
                    break

        # Every outcome other than a '|' / 'X' match is "not aligned".
        pdb_alignStatus = 'not_aligned'
        mutationPositionOnPDB = 'nan'

        if column is None or column >= len(match_line) or column >= len(alignment_pdb):
            # Position not present in this alignment, or the other rows are
            # shorter than the UniProt row (e.g. trailing padding stripped).
            continue

        if match_line[column] in ('|', 'X'):
            canonicalRes = alignment_uniprot[column]
            pdbRes = alignment_pdb[column]
            pdb_alignStatus = 'aligned' if canonicalRes == pdbRes else 'aligned*'
            if pdbRes not in gap_chars:
                # PDB numbering = 1-based column minus the gaps preceding it.
                gaps_before = sum(1 for ch in alignment_pdb[:column] if ch in gap_chars)
                mutationPositionOnPDB = str(column + 1 - gaps_before)
            break

    return (pdb_alignStatus, mutationPositionOnPDB, startGap, alignment_list[which_alignment_to_go - 1])
114
-
115
-
116
def find_position_on_pdb_for_range_annotations(posAnnotation, startGap, alignment_to_use):
    """
    Map a 'start-end' range annotation onto alignment columns and PDB numbering.

    Parameters:
        posAnnotation: string of the form 'start-end'; both parts are parsed
            with ``int``.
        startGap: number of leading columns of ``alignment_to_use[0]`` that are
            skipped before residues are counted.
        alignment_to_use: indexable alignment whose row 0 is scanned for
            residues, row 1 is compared against '|', 'X', '.' and '-', and
            row 2 is scanned for '.' / '-' gap characters.

    Returns:
        ``(annotation_on_up_start, annotation_on_up_end,
        annotation_on_pdb_start, annotation_on_pdb_end)``. The first two are
        1-based column numbers in the alignment; the last two are integers or
        the string 'nan'.

    NOTE(review): this block was recovered from a source that had lost its
    indentation; the nesting below is the most plausible reading and should be
    confirmed against the original file.
    """
    annotation_on_pdb_start = 'nan'
    annotation_on_pdb_end = 'nan'
    pos1 = int(posAnnotation.split('-')[0])
    count_gap = startGap
    count_residue = 0
    for j in alignment_to_use[0][startGap:]:
        if j == '.' or j == '-':
            count_gap += 1
        else:
            count_residue += 1
        if int(count_residue) == int(pos1):  # count gaps until the start position
            break
    # 1-based alignment column of the range start (residue index + gaps before it).
    annotation_on_up_start = int(pos1) + int(count_gap)

    pos2 = int(posAnnotation.split('-')[1])
    count_gap = startGap
    count_residue = 0
    for j in alignment_to_use[0][startGap:]:
        if j == '.' or j == '-':
            count_gap += 1
        else:
            count_residue += 1
        if int(count_residue) == int(pos2):  # count gaps until the end position
            break

    # 1-based alignment column of the range end.
    annotation_on_up_end = int(pos2) + int(count_gap)
    try:
        pdb_residue_start = alignment_to_use[2][annotation_on_up_start - 1].strip()
        if (pdb_residue_start == '.') or (pdb_residue_start == '-'):
            # Start column is a gap in row 2: move the start forward to the
            # first column inside the range that is a non-gap in row 2 and is
            # marked '|' or 'X' in row 1.
            for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
                if (alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end][ran] != '.') and \
                        (alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end][ran] != '-') and \
                        ((alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == '|') or
                         (alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == 'X')):
                    annotation_on_up_start += ran
                    break
        elif (pdb_residue_start != '.') and (pdb_residue_start != '-') and \
                ((alignment_to_use[1][annotation_on_up_start - 1] == '.') or (
                        alignment_to_use[1][annotation_on_up_start - 1] == '-')):
            # Row 2 has a residue here but row 1 marks the column '.'/'-':
            # move the start forward to the first '|' / 'X' column in the range.
            for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
                if ((alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == '|') or
                        (alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == 'X')):
                    annotation_on_up_start += ran
                    break
        count_gap_pdb = 0
        # annotation_on_up_start is always an int at this point, so this
        # comparison with the string 'nan' is always true.
        if annotation_on_up_start != 'nan':
            # Count row-2 gap characters before the start column.
            for q in alignment_to_use[2][0:annotation_on_up_start - 1]:
                if q == '.' or q == '-':
                    count_gap_pdb += 1
            # NOTE(review): row 1 is indexed at annotation_on_up_start here,
            # whereas the end branch below uses annotation_on_up_end - 1;
            # confirm whether the missing "- 1" is intended.
            if alignment_to_use[1][annotation_on_up_start] == '-' or alignment_to_use[1][annotation_on_up_start] == '.':
                annotation_on_pdb_start = 'nan'
            else:
                annotation_on_pdb_start = int(annotation_on_up_start) - count_gap_pdb
        else:
            annotation_on_pdb_start = 'nan'
    except:
        # Bare except: every exception raised above is swallowed. The line
        # below is a no-op expression statement, not an exception filter.
        IndexError
    try:
        pdb_residue_end = alignment_to_use[2][annotation_on_up_end - 1].strip()
        if pdb_residue_end == '.' or pdb_residue_end == '-':
            # End column is a gap in row 2: look for the first '.'/'-' in row 1
            # from the start column and collapse start and end onto the column
            # just before it.
            # NOTE(review): the second operand slices row 1 without an upper
            # bound, unlike the first operand; confirm that is intended.
            for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
                if ((alignment_to_use[1][annotation_on_up_start - 1:annotation_on_up_end][ran] == '.') or
                        (alignment_to_use[1][(annotation_on_up_start - 1):][ran] == '-')):
                    annotation_on_up_start += (ran - 1)
                    annotation_on_up_end = annotation_on_up_start
                    break
        elif (pdb_residue_end != '.') and (pdb_residue_end != '-') and \
                ((alignment_to_use[1][annotation_on_up_end - 1] == '.') or (
                        alignment_to_use[1][annotation_on_up_end - 1] == '-')):
            # Same adjustment when row 2 has a residue but row 1 marks the end
            # column '.'/'-'.
            for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
                if ((alignment_to_use[1][annotation_on_up_start - 1:annotation_on_up_end][ran] == '.') or
                        (alignment_to_use[1][(annotation_on_up_start - 1):][ran] == '-')):
                    annotation_on_up_start += (ran - 1)
                    annotation_on_up_end = annotation_on_up_start
                    break
        count_gap_pdb = 0
        # Always true: annotation_on_up_end is an int, never the string 'nan'.
        if annotation_on_up_end != 'nan':
            # Count row-2 gap characters before the end column.
            for q in alignment_to_use[2][0:annotation_on_up_end - 1]:
                if q == '.' or q == '-':
                    count_gap_pdb += 1
            # NOTE(review): `and` binds tighter than `or`, so each condition
            # below reads `A or (B and C)`; confirm that `(A or B) and C` was
            # not what was meant.
            if alignment_to_use[1][annotation_on_up_end - 1] == '-' or alignment_to_use[1][
                annotation_on_up_end - 1] == '.' and annotation_on_pdb_start == 'nan':
                annotation_on_pdb_end = 'nan'
            elif alignment_to_use[1][annotation_on_up_end - 1] == '-' or alignment_to_use[1][
                annotation_on_up_end - 1] == '.' and annotation_on_pdb_start != 'nan':
                annotation_on_pdb_end = int(annotation_on_up_end) - count_gap_pdb
            else:
                annotation_on_pdb_end = int(annotation_on_up_end) - count_gap_pdb
        else:
            annotation_on_pdb_end = 'nan'
    except:
        # Bare except again; the expression below does not restrict what is caught.
        # Original author's example: isoform 2 is matched with length 100 but the
        # canonical sequence is 150 aa long; an annotation at position 105 then
        # raises an index error for the isoform.
        IndexError

    if annotation_on_pdb_start == 'nan' and annotation_on_pdb_end != 'nan':
        # Only the end could be mapped: derive the start from the (possibly
        # adjusted) start column using the gap count of the end computation.
        annotation_on_pdb_start = annotation_on_up_start - count_gap_pdb
        # NOTE(review): assumed to be nested under the `if` above - confirm.
        if annotation_on_pdb_start == annotation_on_pdb_end:
            annotation_on_pdb_start = 'nan'
            annotation_on_pdb_end = 'nan'
    return annotation_on_up_start, annotation_on_up_end, annotation_on_pdb_start, annotation_on_pdb_end
216
-
217
-
218
def annotation_pos_on_pdb(annot_positions, startGap, alignment_to_use, identifier):
    """
    Convert the positions of one annotation from UniProt to PDB numbering.

    Parameters:
        annot_positions: the string 'nan', or a value whose ``str()`` is a
            comma-separated list of positions, optionally wrapped in brackets
            and quotes. Items containing '-' are treated as 'start-end' ranges.
        startGap: number of leading columns of ``alignment_to_use[0]`` skipped
            before residues are counted.
        alignment_to_use: indexable alignment; row 0 is scanned for residues,
            row 1 is compared against '|', 'X', '.' and '-', row 2 is scanned
            for '.' / '-' gap characters.
        identifier: not used in this function.

    Returns:
        A list with the converted positions. Single positions are appended as
        integers (entries equal to the string 'nan' are removed at the end);
        ranges are appended as 'start-end' strings, which are never equal to
        'nan' and are therefore kept even when both parts are 'nan'.

    NOTE(review): this block was recovered from a source that had lost its
    indentation; the nesting below is the most plausible reading and should be
    confirmed against the original file.
    """
    newpos = []
    if annot_positions != 'nan':
        # Strip list punctuation from the textual form, then split into items.
        annot_positions = (str(annot_positions).replace("'", ''))
        annot_positions = (str(annot_positions).replace('[', ''))
        annot_positions = (str(annot_positions).replace("]", ''))
        positionList_perAnnotation = annot_positions.split(',')
        positionList_perAnnotation = [h.strip() for h in positionList_perAnnotation]

        position_start_on_pdb = 'nan'
        position_end_on_pdb = 'nan'
        try:
            positionList_perAnnotation = [i for i in positionList_perAnnotation if i != 'nan']
        except:
            # Bare except; the expression below is a no-op, not a filter.
            TypeError
        for position in range(len(positionList_perAnnotation)):
            # Single position: no '-', not '?', not empty.
            if ('-' not in str(positionList_perAnnotation[position])) and (str(positionList_perAnnotation[position]) != '?') and (str(positionList_perAnnotation[position]) != '') and (len(str(positionList_perAnnotation[position])) != 0):
                count_gap = startGap
                count_residue = 0
                # Walk row 0 until the requested residue is reached, counting gaps.
                for j in alignment_to_use[0][startGap:]:
                    if j == '.' or j == '-':
                        count_gap += 1
                    else:
                        count_residue += 1
                    try:
                        if int(count_residue) == int(positionList_perAnnotation[position]):
                            break
                    except:
                        # A non-numeric item is ignored here, but the int()
                        # conversion just below is not guarded.
                        ValueError

                # 1-based alignment column of the annotated residue.
                annotation_on_up = int(positionList_perAnnotation[position]) + int(count_gap)
                try:
                    pdb_residue_start = alignment_to_use[2][annotation_on_up - 1].strip()
                except:
                    IndexError
                    pdb_residue_start = 'nan'
                if pdb_residue_start != 'nan':
                    try:
                        # The slices below span exactly one column
                        # ([annotation_on_up - 1:annotation_on_up]), so `ran`
                        # is only ever 0 and `annotation_on_up += ran` leaves
                        # the column unchanged.
                        if (pdb_residue_start == '.') or (pdb_residue_start == '-'):
                            for ran in range(len(alignment_to_use[2][(annotation_on_up - 1):annotation_on_up])):
                                if (alignment_to_use[2][(annotation_on_up - 1):annotation_on_up][ran] != '.') and \
                                        (alignment_to_use[2][(annotation_on_up - 1):annotation_on_up][
                                             ran] != '-') and \
                                        ((alignment_to_use[1][(annotation_on_up - 1):annotation_on_up][
                                              ran] == '|') or
                                         (alignment_to_use[1][(annotation_on_up - 1):annotation_on_up][
                                              ran] == 'X')):
                                    annotation_on_up += ran
                                    break
                        elif (pdb_residue_start != '.') and (pdb_residue_start != '-') and \
                                ((alignment_to_use[1][annotation_on_up - 1] == '.') or (
                                        alignment_to_use[1][annotation_on_up - 1] == '-')):
                            for ran in range(len(alignment_to_use[2][(annotation_on_up - 1):annotation_on_up])):
                                if ((alignment_to_use[1][(annotation_on_up - 1):annotation_on_up][ran] == '|') or
                                        (alignment_to_use[1][(annotation_on_up - 1):annotation_on_up][ran] == 'X')):
                                    annotation_on_up += ran
                                    break
                        count_gap_pdb = 0
                        # Count row-2 gap characters before the annotated column.
                        for q in alignment_to_use[2][0:annotation_on_up - 1]:
                            if q == '.' or q == '-':
                                count_gap_pdb += 1
                        # NOTE(review): row 1 is indexed at annotation_on_up
                        # (not annotation_on_up - 1) - confirm this is intended.
                        if alignment_to_use[1][annotation_on_up] == '-' or alignment_to_use[1][
                            annotation_on_up] == '.':
                            annotation_on_pdb = 'nan'
                        else:
                            annotation_on_pdb = int(annotation_on_up) - count_gap_pdb

                        if count_gap_pdb == annotation_on_up:
                            annotation_on_pdb = 'nan'
                        try:
                            # When annotation_on_pdb is the string 'nan' the
                            # addition below raises TypeError, which the bare
                            # except then turns back into 'nan'.
                            if alignment_to_use[2][count_gap_pdb + annotation_on_pdb - 1] == '.' or alignment_to_use[2][
                                count_gap_pdb + annotation_on_pdb - 1] == '-':
                                annotation_on_pdb = 'nan'
                        except:
                            IndexError
                            annotation_on_pdb = 'nan'
                    except:
                        IndexError
                        annotation_on_pdb = 'nan'

                    # NOTE(review): assumed to sit inside `if pdb_residue_start != 'nan'`,
                    # where annotation_on_pdb is always bound - confirm.
                    newpos.append(annotation_on_pdb)

            # Range annotation ('start-end').
            elif ('-' in str(positionList_perAnnotation[position])) and (
                    str(positionList_perAnnotation[position]) != '?') and (
                    str(positionList_perAnnotation[position]) != ' ') and (
                    len(str(positionList_perAnnotation[position])) != 0):
                try:
                    position_start_on_pdb = \
                        find_position_on_pdb_for_range_annotations(positionList_perAnnotation[position],
                                                                   startGap, alignment_to_use)[2]
                    position_end_on_pdb = \
                        find_position_on_pdb_for_range_annotations(positionList_perAnnotation[position],
                                                                   startGap, alignment_to_use)[3]
                except:
                    # On failure the start/end values from a previous item (or
                    # the initial 'nan') are reused below.
                    ValueError
                newpositions = str(position_start_on_pdb) + '-' + str(position_end_on_pdb)
                newpos.append(newpositions)
            else:
                pass
    try:
        newpos = [i for i in newpos if i != 'nan']
    except:
        TypeError
    return newpos
322
-
323
-
324
def final_stage(df, annotation_list, alignment_path):
    """
    Align every row of ``df`` and write the PDB-numbered results back into it.

    For each row the UniProt and PDB sequences are aligned with do_alignment(),
    the mutation position is mapped with mutation_position_on_pdb(), every
    column named in ``annotation_list`` is replaced by the output of
    annotation_pos_on_pdb(), and the domain range (``domStart``/``domEnd``) is
    mapped with find_position_on_pdb_for_range_annotations().

    Parameters:
        df: DataFrame read and written through ``df.at``; the columns read are
            'uniprotID', 'pdbID', 'chain', 'uniprotSequence', 'pdbSequence',
            'pos', 'domStart', 'domEnd' and every name in ``annotation_list``.
        annotation_list: names of the annotation columns to convert in place.
        alignment_path: forwarded unchanged to do_alignment().

    Returns:
        The same ``df`` object, with 'pdb_alignStatus', 'mutationPositionOnPDB',
        'domainStartonPDB' and 'domainEndonPDB' filled in.

    The mutation mapping is computed once per row (it used to be recomputed
    four times with identical arguments), and the fallback branch is a plain
    ``else``: the previous ``elif`` chained ``!=`` tests with ``or`` and was
    therefore always true.
    """
    # String forms of domStart/domEnd that mean "no domain information".
    missing_domain = ('nan', '-1', '-1.0')

    for i in df.index:
        identifier = df.at[i, 'uniprotID'] + '_' + df.at[i, 'pdbID'] + '_' + df.at[i, 'chain'] + '_'
        alignment_list = do_alignment(identifier, df.at[i, 'uniprotSequence'], df.at[i, 'pdbSequence'], alignment_path)

        align_status, position_on_pdb, startGap, alignment_to_use = \
            mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])
        df.at[i, 'pdb_alignStatus'] = align_status
        df.at[i, 'mutationPositionOnPDB'] = position_on_pdb

        for annot in annotation_list:
            df.at[i, annot] = annotation_pos_on_pdb(df.at[i, annot], startGap, alignment_to_use, identifier)

        dom_start = str(df.at[i, 'domStart'])
        dom_end = str(df.at[i, 'domEnd'])
        if dom_start not in missing_domain and dom_end not in missing_domain:
            # Drop any fractional part ('12.0' -> '12') before building 'start-end'.
            domainLoc = dom_start.split('.')[0] + '-' + dom_end.split('.')[0]
            domain_pos = find_position_on_pdb_for_range_annotations(domainLoc, startGap, alignment_to_use)
            df.at[i, 'domainStartonPDB'] = domain_pos[2]
            df.at[i, 'domainEndonPDB'] = domain_pos[3]
        else:
            df.at[i, 'domainStartonPDB'] = 'nan'
            df.at[i, 'domainEndonPDB'] = 'nan'
    return df
346
-
347
def alignment(dataframe_to_align, annotation_list, alignment_path):
    """
    Entry point of the module: run final_stage() on the given dataframe.

    Parameters:
        dataframe_to_align: dataframe passed to final_stage() as ``df``.
        annotation_list: names of the annotation columns to convert.
        alignment_path: forwarded unchanged to final_stage().

    Returns:
        Whatever final_stage() returns.
    """
    return final_stage(dataframe_to_align, annotation_list, alignment_path)