fatmacankara committed on
Commit
1ca7fde
·
1 Parent(s): eb181e2

Delete code/add_3Dalignment.py

Browse files
Files changed (1) hide show
  1. code/add_3Dalignment.py +0 -284
code/add_3Dalignment.py DELETED
@@ -1,284 +0,0 @@
1
- """
2
- This code file produces alignments between the structure and the sequence for a given protein.
3
-
4
- """
5
-
6
- import math
7
- import glob
8
- import numpy as np
9
- from Bio import Align
10
- import gzip
11
- from pathlib import Path
12
- from Bio.Align import substitution_matrices
13
# Module-level pairwise aligner shared by every call in this file;
# get_alignments_3D sets its mode, substitution matrix and gap scores
# each time before aligning.
aligner = Align.PairwiseAligner()
14
def convert_non_standard_amino_acids(sequence):
    """
    Swap non-standard or ambiguous one-letter residue codes for standard ones.

    The codes B, Z, X, U, J and O are each replaced by a single standard
    amino acid, stop markers ('*') are removed, and every other character is
    copied through unchanged. The result is returned as a new string.
    """
    # Non-standard code -> substitute. '*' maps to the empty string so that
    # stop markers simply disappear from the output.
    substitutes = {
        'B': 'D',  # Asx -> aspartic acid
        'Z': 'E',  # Glx -> glutamic acid
        'X': 'A',  # unknown/ambiguous -> alanine placeholder
        'U': 'C',  # selenocysteine -> cysteine
        'J': 'L',  # Leu/Ile ambiguity -> leucine
        'O': 'K',  # pyrrolysine -> lysine
        '*': '',
    }

    pieces = []
    for residue in sequence:
        if residue in substitutes:
            pieces.append(substitutes[residue])
        else:
            pieces.append(residue)
    return ''.join(pieces)
35
def distance(x1, y1, z1, x2, y2, z2):
    """Return the Euclidean distance between (x1, y1, z1) and (x2, y2, z2)."""
    delta_x = x2 - x1
    delta_y = y2 - y1
    delta_z = z2 - z1
    squared_length = (math.pow(delta_x, 2)
                      + math.pow(delta_y, 2)
                      + math.pow(delta_z, 2) * 1.0)
    return math.sqrt(squared_length)
40
-
41
-
42
- def find_distance(coordMut, coordAnnot):
43
- if coordMut != np.NaN:
44
- try:
45
- dist = distance(float(coordMut[0]), float(coordMut[1]), float(coordMut[2]), float(coordAnnot[0]),
46
- float(coordAnnot[1]), float(coordAnnot[2]))
47
- return "%.2f" % dist
48
- except:
49
- ValueError
50
- dist = 'nan'
51
- return dist
52
- else:
53
- return np.NaN
54
-
55
-
56
def threeToOne(variant):
    """
    Convert a three-letter residue name to its one-letter code.

    Parameters
    ----------
    variant : str
        Upper-case three-letter residue name, e.g. "ALA".

    Returns
    -------
    str
        The one-letter code. A name that is not in the table is returned
        unchanged, so callers that build a sequence from the result should
        be aware that an unrecognised name contributes more than one
        character.

    Notes
    -----
    'ASX' (Asp/Asn ambiguity) maps to 'B'. It was previously mapped to 'O',
    which is the code for pyrrolysine. 'GLX', 'SEC' and 'PYL' map to their
    standard codes 'Z', 'U' and 'O'.
    """
    three_to_one = {
        "ALA": "A", "ARG": "R", "VAL": "V", "GLU": "E", "PRO": "P",
        "LEU": "L", "GLY": "G", "ASN": "N", "SER": "S", "GLN": "Q",
        "THR": "T", "MET": "M", "LYS": "K", "ASP": "D", "ILE": "I",
        "PHE": "F", "TRP": "W", "TYR": "Y", "HIS": "H", "CYS": "C",
        "UNK": "X",  # unknown residue
        "ASX": "B",  # Asp or Asn
        "GLX": "Z",  # Glu or Gln
        "SEC": "U",  # selenocysteine
        "PYL": "O",  # pyrrolysine
    }
    return three_to_one.get(variant, variant)
102
-
103
-
104
def _locate_in_alignment(alignment_rows, annot):
    """
    Map a 1-based sequence position onto the structure row of an alignment.

    `alignment_rows` is the alignment text split into lines: row 0 is read
    as the aligned sequence and row 2 as the aligned structure sequence.
    '.' and '-' are treated as gap characters.

    Returns `(pos_atom, resolved)`, where `pos_atom` is the 1-based index of
    the matching residue among the structure's residues and `resolved` is
    True only when the alignment column exists, holds a residue (not '-')
    in the structure row, and lies at or after the first structure residue.
    """
    gap_chars = ('.', '-')
    seq_row = alignment_rows[0]
    pdb_row = alignment_rows[2]
    position = float(annot)

    # Leading gaps are pre-counted only when the row starts with '.'; a row
    # starting with '-' has its leading gaps counted by the main loop below.
    start_gap = 0
    if seq_row.startswith('.'):
        for char in seq_row:
            if char in gap_chars:
                start_gap += 1
            else:
                break

    # Walk the sequence row until `position` residues have been seen.
    count_gap = start_gap
    count_residue = 0
    for char in seq_row[start_gap:]:
        if char in gap_chars:
            count_gap += 1
        else:
            count_residue += 1
        if count_residue == position:
            break

    # 0-based alignment column of the requested residue.
    column = count_residue + count_gap - 1

    # Gaps in the structure row before that column do not correspond to
    # atoms, so they are subtracted to obtain the index into the atom list.
    count_gap_pdb = sum(1 for char in pdb_row[0:column] if char in gap_chars)
    pos_atom = count_residue + count_gap - count_gap_pdb

    # Number of gap characters before the first structure residue.
    real_pdb_start = len(pdb_row) - len(pdb_row.lstrip('.-'))

    resolved = (len(pdb_row) > column
                and pdb_row[column] != '-'
                and position >= float(real_pdb_start) + 1)
    return pos_atom, resolved


def get_coords(annot, alignments, coords, resnums_for_sasa, mode):
    """
    Look up the structure coordinates that correspond to a sequence position.

    Parameters
    ----------
    annot
        1-based position in the sequence (number or numeric string). In
        mode 2 the string 'nan' marks a missing position.
    alignments
        Mode 1: an indexable triple `(alignment_list, coordinates,
        residue_numbers)`; only the first alignment in `alignment_list` is
        used. Mode 2: a single alignment whose `str()` is parsed.
    coords, resnums_for_sasa
        Used in mode 2 only: per-residue coordinates and residue numbers of
        the structure, in structure order.
    mode : int
        1 or 2, selecting the input layout described above.

    Returns
    -------
    tuple or None
        `(coordinate, pos_atom, residue_number)`. `coordinate` and
        `residue_number` are the string 'nan' when the position cannot be
        resolved on the structure. In mode 2, `(nan, nan, nan)` (floats) is
        returned when `annot` is 'nan' or greater than 1400. None is
        returned when mode 1 receives no alignments or `mode` is neither 1
        nor 2.

    Notes
    -----
    The parser takes lines 0 and 2 of `str(alignment)` as the two aligned
    rows. NOTE(review): newer Biopython releases print alignments with
    label/coordinate prefixes on each line; confirm the installed version
    still produces the three-line layout this relies on.

    Changes from the previous version: mode 1 no longer raises
    UnboundLocalError when the position is unresolved (the residue number is
    now 'nan'), mode 1 applies the same length guard as mode 2, and
    `np.nan` replaces the `np.NaN` alias that NumPy 2.0 removed.
    """
    if mode == 1:
        for alignment in alignments[0]:
            rows = str(alignment).strip().split('\n')
            posAtom, resolved = _locate_in_alignment(rows, annot)
            coordWeWant = 'nan'
            residue_number_we_want = 'nan'
            if resolved:
                try:
                    coordWeWant = alignments[1][posAtom - 1]
                    residue_number_we_want = alignments[2][posAtom - 1]
                except (LookupError, TypeError):
                    coordWeWant = 'nan'
                    residue_number_we_want = 'nan'
            # Only the first alignment is consulted.
            return coordWeWant, posAtom, residue_number_we_want
        return None

    if mode == 2:
        if annot == 'nan':
            return np.nan, np.nan, np.nan
        # NOTE(review): 1400 is an undocumented cut-off; presumably the
        # maximum length of the structure models handled in this mode.
        if int(annot) > 1400:
            return np.nan, np.nan, np.nan

        rows = str(alignments).strip().split('\n')
        posAtom, resolved = _locate_in_alignment(rows, annot)
        coordWeWant = 'nan'
        residue_number_we_want = 'nan'
        if resolved:
            try:
                coordWeWant = coords[posAtom - 1]
                residue_number_we_want = resnums_for_sasa[posAtom - 1]
            except (LookupError, TypeError):
                coordWeWant = 'nan'
                residue_number_we_want = 'nan'
        return coordWeWant, posAtom, residue_number_we_want

    return None
206
-
207
-
208
def _read_ca_trace(lines, chain, any_chain):
    """
    Collect the C-alpha trace from PDB-format ATOM records.

    A record is used when it is an ATOM line whose atom-name slice is 'CA'.
    Unless `any_chain` is true, its chain column must also either equal
    `chain` (case-insensitively) or be blank.

    Returns `(sequence, coords, resnums)`: the one-letter sequence, a list
    of [x, y, z] string triples, and a list of residue-number strings, all
    in file order.
    """
    residues = []
    coords = []
    resnums = []
    for line in lines:
        if line[0:4].strip() != 'ATOM' or line[13:15].strip() != 'CA':
            continue
        if not any_chain and line[21].upper() != chain.upper() and line[21] != ' ':
            continue
        residues.append(threeToOne(line[17:20].strip()))
        # x, y and z occupy the 8-character columns 31-38, 39-46 and 47-54
        # (1-based). Slicing from 30/38/46 keeps the first character of each
        # field, i.e. the sign or leading digit of wide values.
        coords.append([line[30:38].strip(), line[38:46].strip(), line[46:54].strip()])
        resnums.append(line[22:26].strip())
    return ''.join(residues), coords, resnums


def _align_and_write(pdbSequence, atomSequence, out_path):
    """
    Locally align the two sequences, write every alignment to `out_path`
    (each followed by a blank line) and return the alignments as a list.
    """
    atomSequence = convert_non_standard_amino_acids(atomSequence)
    aligner.mode = 'local'
    aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
    aligner.open_gap_score = -11
    aligner.extend_gap_score = -1
    alignments = list(aligner.align(pdbSequence, atomSequence))
    # The context manager guarantees the file is flushed and closed.
    with open(out_path, "w") as out:
        for alignment in alignments:
            out.write(str(alignment))
            out.write('\n\n')
    return alignments


def get_alignments_3D(identifier, model_num, pdb_path, pdbSequence, source, chain, pdbID, mode, path_3D_alignment,file_format = 'gzip'):
    """
    Align a protein sequence against the residues found in a structure file.

    The C-alpha atoms of the structure are read, converted to a one-letter
    sequence and locally aligned (BLOSUM62, gap open -11, gap extend -1)
    with `pdbSequence`. All alignments are written to a text file inside
    `path_3D_alignment`.

    Parameters
    ----------
    identifier
        Used in the output file name.
    model_num
        Used in the mode 2 output file name.
    pdb_path
        Path of the structure file to read.
    pdbSequence : str
        Sequence to align; non-standard codes are converted first.
    source
        Mode 1 only: when equal to 'MODBASE', atoms of every chain are used;
        otherwise only atoms of `chain` or with a blank chain column.
    chain
        Chain identifier (mode 1 filtering and output file name).
    pdbID
        Used in the mode 1 output file name.
    mode : int
        1: plain-text structure file, output
        `{identifier}_{pdbID}_{chain}_alignment.txt`.
        2: file read according to `file_format`, every chain used, output
        `{identifier}_{model_num}_3Dalignment.txt`.
    path_3D_alignment : pathlib.Path
        Directory for the output file (combined with the `/` operator).
    file_format : str
        Mode 2 only: 'txt' for plain text or 'gzip' for a gzip-compressed
        file. Any other value leaves the structure sequence empty.

    Returns
    -------
    tuple or None
        `(alignments, coords, resnums_for_sasa)`, or None when `mode` is
        neither 1 nor 2.

    Notes
    -----
    Fixes relative to the previous version: the 'txt' branch of mode 2
    opened an undefined name instead of `pdb_path`; coordinate fields were
    sliced one column too late, truncating 8-character values; the output
    file was never closed; and the MODBASE branch tested `line[0:7]`, which
    rejected ATOM records with a serial number of 10000 or more.
    """
    pdbSequence = convert_non_standard_amino_acids(pdbSequence)

    if mode == 1:
        with open(pdb_path, encoding="utf8") as handle:
            atomSequence, coords, resnums_for_sasa = _read_ca_trace(
                handle, chain, any_chain=(source == 'MODBASE'))
        out_path = Path(path_3D_alignment / f'{identifier}_{pdbID}_{str(chain)}_alignment.txt')
    elif mode == 2:
        if file_format == 'txt':
            with open(pdb_path, encoding="utf8") as handle:
                atomSequence, coords, resnums_for_sasa = _read_ca_trace(
                    handle, chain, any_chain=True)
        elif file_format == 'gzip':
            with gzip.open(pdb_path, mode='rt', encoding="utf8") as handle:
                atomSequence, coords, resnums_for_sasa = _read_ca_trace(
                    handle, chain, any_chain=True)
        else:
            # Unknown format: nothing is read, as before.
            atomSequence, coords, resnums_for_sasa = '', [], []
        out_path = Path(path_3D_alignment / f'{identifier}_{str(model_num)}_3Dalignment.txt')
    else:
        return None

    alignments = _align_and_write(pdbSequence, atomSequence, out_path)
    return alignments, coords, resnums_for_sasa