fatmacankara committed on
Commit
f761ce4
·
1 Parent(s): b31c4ca

Update code/pdb_featureVector.py

Browse files
Files changed (1) hide show
  1. code/pdb_featureVector.py +349 -349
code/pdb_featureVector.py CHANGED
@@ -82,377 +82,377 @@ def pdb(input_set, mode, impute):
82
  out_path = path_to_output_files / 'log.txt'
83
  #sys.stdout = open(out_path, 'w')
84
  data = clean_data(input_set)
85
- if len(data) != 0:
 
 
86
  data = add_uniprot_sequence(data)
87
  match = data[(data.wt_sequence_match == 'm')]
88
  org_len = len(match)
89
  iso = data[(data.wt_sequence_match == 'i')]
90
  noMatch = data[(data.wt_sequence_match != 'm') & (data.wt_sequence_match != 'i')]
91
- if len(data) == 0:
92
- st.write('Feature vectore generation terminated. Please enter a query.')
 
 
 
 
 
 
 
 
93
  else:
94
- if len(noMatch) == len(data) :
95
- st.write('Aminoacid at the position could not be mapped to canonical or isoform sequence. Please check the input amino acid.')
96
- elif len(noMatch) > 0:
97
- st.write(
98
- f'{len(noMatch)} of {len(data)} datapoints has not been mapped to any sequence. These datapoints are omitted.')
99
- if len(iso) > 0:
100
- st.write(f'{len(iso)} of {len(data)} datapoints has been mapped to isoform sequences. These datapoints are omitted.')
101
- if len(match) == 0:
102
- st.write('Feature generation terminated due to failed mapping of input amino acid to UniProt sequence.')
103
- else:
104
- st.write(f'{len(match)} of {len(data)} datapoints has been mapped to canonical sequences. Proceeding with these datapoins.')
105
- if (len(iso) != 0) | (len(noMatch) != 0):
106
- st.write('Omitted datapoints are:', noMatch.datapoint.to_list() + iso.datapoint.to_list())
107
- st.write('\n')
108
- st.write('Check log file for updates.')
109
-
110
- data = match[['uniprotID', 'wt', 'pos', 'mut', 'datapoint']]
111
- print('>> Feature vector generation started...\n')
112
- print('\n>> Creating directories...')
113
- print('\n>> Adding physicochemical properties...\n')
114
- data = add_physicochemical(data)
115
- print('\n>> Adding domains\n')
116
- data = add_domains(data, path_to_domains)
117
- print('\n>> Adding sequence annotations...\n')
118
- data = add_annotations(data)
119
- print('\n>> Retrieving PDB structure information...\n')
120
- pdb_info = addPDBinfo(data, path_to_output_files)
121
- if len(pdb_info) != 0:
122
- data = pd.merge(data, pdb_info, on='uniprotID', how='left')
123
- # Spare datapoint if there is no associated PDB.
124
- no_pdb = data[data.pdbID.isna()].drop_duplicates()
125
- pdb = data[~data.pdbID.isna()].drop_duplicates()
126
- # Spare datapoint if associated PDB does not cover mutated area.
127
- pdb.pos = pdb.pos.apply(lambda x:int(x))
128
- pdb.start = pdb.start.apply(lambda x: int(x))
129
- pdb.end = pdb.end.apply(lambda x: int(x))
130
- no_pdb_add = pdb[~((pdb.pos > pdb.start) & (pdb.pos < pdb.end))]
131
-
132
- pdb = pdb[(pdb.pos > pdb.start) & (pdb.pos < pdb.end)] # do not change order
133
-
134
- pdb.reset_index(drop=True, inplace=True)
135
- # Delete spared datapoint from no_pdb list if it has any other PDB that spans the mutated area.
136
- no_pdb_add = no_pdb_add[~no_pdb_add.datapoint.isin(pdb.datapoint.to_list())]
137
- # Final collection of datapoints without PDB associaton.
138
- no_pdb = pd.concat([no_pdb, no_pdb_add])
139
- no_pdb = no_pdb[SIMPLE_COLS]
140
- no_pdb = no_pdb.drop_duplicates()
141
-
142
- pdb = pdb.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
143
  pdb.reset_index(drop=True, inplace=True)
144
- pdb.fillna(np.NaN, inplace=True)
145
- # Get position mapping from added structures
146
- print('\n>> Adding structure residue positions...\n')
147
- if len(pdb) > 0: # there are mapped structures, and some of them span the mutated area.
148
- pdb.replace({'[]': np.NaN, 'nan-nan': np.NaN, '': np.NaN}, inplace=True)
149
- pdb = pdbMapping(pdb, Path(path_to_output_files / 'pdb_structures'))
 
 
 
 
150
  pdb.reset_index(drop=True, inplace=True)
151
- pdb = pdb.fillna(np.NaN)
152
- no_pdb_add_ = pdb[pdb.AAonPDB.isna()]
153
- no_pdb_add = pdb[pdb.MATCHDICT.isna()]
154
- no_pdb = pd.concat([no_pdb_add_, no_pdb, no_pdb_add])
155
- no_pdb.reset_index(inplace=True, drop=True)
156
- pdb = pdb[~(pdb.MATCHDICT.isna())]
157
- pdb = pdb[~(pdb.AAonPDB.isna())]
158
- if len(pdb) > 0:
159
- print('\n>> Mapping to PDB residues...\n')
160
- pdb = changeUPtoPDB(pdb)
161
- pdb.reset_index(drop=True, inplace=True)
162
- print('\n>> Calculating 3D distances for PDB structures...\n')
163
- pdb = isZeroDistance(pdb)
164
- pdb = processFile(pdb, path_to_output_files)
165
- pdb = match3D(pdb)
166
- pdb = selectMaxAnnot(pdb)
167
- pdb = pdb.sort_values(by=['datapoint', 'resolution', 'annotTotal'], ascending=[True, True, True])
168
- pdb = pdb.drop_duplicates(['datapoint'])
169
- pdb.replace({'[]': np.NaN, 'hit':0.0}, inplace=True)
170
- print('\n>> PDB matching is completed...\n')
171
- else:
172
- # There was no residue match in the associated PDB. So we cannot use PDB data.
173
- pdb = pdb[SIMPLE_COLS]
174
- print('\n>>> No PDB structure could be matched.')
175
-
176
  else:
 
177
  pdb = pdb[SIMPLE_COLS]
178
  print('\n>>> No PDB structure could be matched.')
179
-
180
-
181
  else:
182
- pdb = pd.DataFrame(columns = SIMPLE_COLS)
183
  print('\n>>> No PDB structure could be matched.')
184
- no_pdb = data.copy()
185
- no_pdb = no_pdb[SIMPLE_COLS]
186
-
187
- print(
188
- 'PDB phase is finished...\nPDB structures are found for %d of %d.\n%d of %d failed to match with PDB structure.\n'
189
- % (len(pdb.drop_duplicates(['datapoint'])), len(data.drop_duplicates(['datapoint'])),
190
- len(no_pdb.drop_duplicates(['datapoint'])), len(data.drop_duplicates(['datapoint']))))
191
-
192
-
193
-
194
- print('\n>>> Proceeding to SwissModel search...')
195
- print('------------------------------------\n')
196
- swiss = no_pdb.copy()
 
 
 
 
 
 
 
 
 
 
 
197
  if len(swiss) > 0:
198
- print('\n>> Adding SwissModel residue positions...\n')
199
- swiss.replace({'[]': np.NaN, 'nan-nan': np.NaN, '': np.NaN}, inplace=True)
200
- swiss = swiss.fillna(np.NaN)
201
- swiss, no_swiss_models= addSwissModels(swiss, path_to_input_files, path_to_output_files)
202
- print('\n>> Mapping to SwissModels...\n')
203
- if len(swiss) > 0:
204
- swiss.reset_index(drop=True, inplace=True)
205
- swiss = changeUPtoModels(swiss)
206
- swiss.reset_index(drop=True, inplace=True)
207
- print('\n>> Calculating 3D distances for SwissModels...\n')
208
- swiss = isZeroDistance(swiss)
209
- swiss = match3DModels(swiss)
210
- swiss = selectMaxAnnot(swiss)
211
- swiss = swiss.sort_values(by=['datapoint', 'qmean_norm', 'distance', 'hitTotal', 'annotTotal'], ascending=[True, False, True, False, True])
212
- swiss = swiss.drop_duplicates(['datapoint'])
213
- swiss.replace({'[]': np.NaN, 'hit': 0.0}, inplace=True)
214
- else:
215
- swiss = swiss[SIMPLE_COLS]
216
-
217
- if len(no_swiss_models) > 0:
218
- no_swiss_models = no_swiss_models[SIMPLE_COLS]
219
- no_swiss_models.reset_index(inplace=True, drop=True)
220
-
221
  else:
222
  swiss = swiss[SIMPLE_COLS]
223
- no_swiss_models = no_pdb.copy()
224
 
225
- if len(no_swiss_models) >0:
226
- modbase = no_swiss_models.copy()
227
- print('Proceeding to Modbase search...')
228
- print('------------------------------------\n')
229
-
230
- modbase = modbase[SIMPLE_COLS]
231
- modbase.replace({'[]': np.NaN, 'nan-nan': np.NaN, '': np.NaN}, inplace=True)
232
- modbase = modbase.fillna(np.NaN)
233
- print('\n>> Adding Modbase residue positions...\n')
234
- modbase_simple = modbase[['uniprotID', 'wt', 'pos', 'mut','datapoint']]
235
- modbase_simple = modbase_simple.drop_duplicates(['uniprotID', 'wt', 'pos' ,'mut','datapoint'])
236
- modbaseOut, no_modbase_models_updated = addModbaseModels(modbase_simple, path_to_input_files, path_to_output_files)
237
 
238
- if len(modbaseOut) > 0:
239
- modbase = modbase.merge(modbaseOut, on = ['uniprotID', 'wt', 'pos', 'mut','datapoint'], how = 'left')
240
- no_modbase_models_updated['sasa'] = np.NaN
241
- modbase.reset_index(inplace=True, drop=True)
242
- no_modbase_add = modbase[pd.isna(modbase.coordinates)]
243
- modbase = modbase[~pd.isna(modbase.coordinates)]
244
- no_modbase_models_updated = pd.concat([no_modbase_models_updated, no_modbase_add])
245
- print('\n>> Mapping to Modbase models...\n')
246
- modbase = changeUPtoModels(modbase)
247
- print('\n>> Calculating 3D distances for Modbase models...\n')
248
- modbase = isZeroDistance(modbase)
249
- modbase = match3DModels(modbase)
250
- modbase = selectMaxAnnot(modbase)
251
- modbase = modbase.sort_values(by=['datapoint', 'quality_score', 'distance','hitTotal', 'annotTotal'], ascending=[True, False, True, False, True])
252
- modbase = modbase.drop_duplicates(['datapoint'])
253
- modbase.replace({'[]': np.NaN, 'hit': 0.0}, inplace=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  else:
256
- modbase = pd.DataFrame(columns = SIMPLE_COLS)
257
-
258
- else:
259
- no_modbase_models_updated = pd.DataFrame(columns = SIMPLE_COLS)
260
- modbase= pd.DataFrame(columns = SIMPLE_COLS)
261
-
262
- COLS = ['uniprotID', 'wt', 'pos', 'mut', 'datapoint', 'composition', 'polarity', 'volume', 'granthamScore', 'domain', 'domStart', 'domEnd', 'distance',
263
- 'region', 'crosslink', 'peptide', 'disulfide', 'signalPeptide', 'propeptide', 'naturalVariant', 'nucleotideBinding', 'modifiedResidue', 'site',
264
- 'caBinding', 'turn', 'transmembrane', 'repeat', 'glycosylation', 'intramembrane', 'metalBinding', 'bindingSite', 'dnaBinding', 'activeSite',
265
- 'coiledCoil', 'helix', 'mutagenesis', 'zincFinger', 'transitPeptide', 'intMet', 'strand', 'lipidation', 'motif', 'topologicalDomain',
266
- 'disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', 'nucleotideBindingBinary',
267
- 'lipidationBinary', 'siteBinary', 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
268
- 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', 'modifiedResidueBinary', 'zincFingerBinary',
269
- 'motifBinary', 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', 'glycosylationBinary', 'propeptideBinary', 'sasa']
270
-
271
- if len(no_modbase_models_updated) == 0:
272
- no_modbase_models_updated = pd.DataFrame(columns = SIMPLE_COLS)
273
- no_modbase_models_updated = no_modbase_models_updated[~no_modbase_models_updated.datapoint.isin(modbase.datapoint.to_list())]
274
- no_modbase_models_updated = no_modbase_models_updated[['uniprotID', 'wt', 'pos', 'mut', 'datapoint']]
275
- no_modbase_models_updated.pos = no_modbase_models_updated.pos.astype(int)
276
- no_modbase_models_updated = no_modbase_models_updated.drop_duplicates()
277
 
278
-
279
- if len(pdb)>0:
280
- pdb = pdb[COLS]
281
- pdb['Source'] = 'PDB'
282
- else:
283
- pdb = pd.DataFrame()
284
- if len(swiss) > 0:
285
- swiss = swiss[COLS]
286
- swiss['Source'] = 'SWISS-Model'
287
- else:
288
- swiss = pd.DataFrame()
289
- if len(modbase) > 0:
290
- modbase = modbase[COLS]
291
- modbase['Source'] = 'Modbase'
292
- else:
293
- modbase = pd.DataFrame()
294
-
295
-
296
- # st.write('======PDB==========')
297
- # st.write(pdb.to_string())
298
- # st.write('======SWISS==========')
299
- # st.write(swiss.to_string())
300
- # st.write('======MODBASE==========')
301
- # st.write(modbase.to_string())
302
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
 
304
-
305
- allData = pd.concat([pdb, swiss, modbase])
306
- allData.reset_index(inplace=True, drop=True)
307
- allData.replace({np.NaN: ''}, inplace=True)
308
- # st.write('======ALL DATA==========')
309
- # st.write(allData.to_string())
310
- if len(allData)>0:
311
- allData.distance.replace({-1000: ''}, inplace=True)
312
-
313
-
314
- # Get interface positions from ECLAIR. Download HQ human
315
- print()
316
- print('Assigning surface regions...')
317
- print('------------------------------------\n')
318
-
319
- print('Extracting interface residues...\n')
320
- data_interface = pd.read_csv(path_to_interfaces, sep='\t')
321
-
322
- positions = get_interface_positions(data_interface, 'P1', 'P2')
323
-
324
- interface_dataframe = pd.DataFrame()
325
-
326
- for key, val in positions.items():
327
- k = pd.Series((key, str(list(set(val)))))
328
- interface_dataframe = interface_dataframe.append(k, ignore_index=True)
329
- interface_dataframe.columns = ['uniprotID', 'positions']
330
- final_data = finalTouch(allData)
331
- final_data = final_data.merge(interface_dataframe, on='uniprotID', how='left')
332
- final_data.positions = final_data.positions.astype('str')
333
- for i in final_data.index:
334
- if (str(final_data.at[i, 'pos']) in final_data.at[i, 'positions']) and final_data.at[i, 'trsh4'] == 'surface':
335
- final_data.at[i, 'threeState_trsh4_HQ'] = 'interface'
336
- elif (str(final_data.at[i, 'pos']) not in final_data.at[i, 'positions']) and final_data.at[i, 'trsh4'] == 'surface':
337
- final_data.at[i, 'threeState_trsh4_HQ'] = 'surface'
338
- elif (str(final_data.at[i, 'pos']) not in final_data.at[i, 'positions']) and final_data.at[i, 'trsh4'] == 'core':
339
- final_data.at[i, 'threeState_trsh4_HQ'] = 'core'
340
- elif (str(final_data.at[i, 'pos']) in final_data.at[i, 'positions']) and final_data.at[i, 'trsh4'] == 'core':
341
- final_data.at[i, 'threeState_trsh4_HQ'] = 'conflict'
342
- elif final_data.at[i, 'trsh4'] == 'nan':
343
- final_data.at[i, 'threeState_trsh4_HQ'] = 'nan'
344
-
345
- final_data.drop(['positions'], axis=1, inplace=True)
346
-
347
- fisherResult = pd.read_csv(fisher_path, sep='\t')
348
- significant_domains = fisherResult.domain.to_list()
349
- for i in final_data.index:
350
- if final_data.at[i, 'domain'] in significant_domains:
351
- final_data.at[i, 'domain_fisher'] = final_data.at[i, 'domain']
352
- else:
353
- final_data.at[i, 'domain_fisher'] = 'NULL'
354
- print('Final adjustments are being done...\n')
355
- binaryCols = UNIPROT_ANNOTATION_COLS[-30:]
356
- final_data = final_data.astype(str)
357
- final_data.replace({'NaN': 'nan'}, inplace=True)
358
- for i in final_data.index:
359
- for j in binaryCols:
360
- final_data[j] = final_data[j].astype('str')
361
- if (final_data.at[i, j] == '0') or (final_data.at[i, j] == '0.0'):
362
- final_data.at[i, j] = '1'
363
- elif final_data.at[i, j] == 'nan':
364
- final_data.at[i, j] = '0'
365
- elif (final_data.at[i, j] == '1') or (final_data.at[i, j] == '1.0'):
366
- final_data.at[i, j] = '2'
367
-
368
- annotCols = UNIPROT_ANNOTATION_COLS[:30]
369
-
370
- for i in final_data.index:
371
- for annot in annotCols:
372
- binaryName = str(annot) + 'Binary'
373
- if final_data.at[i, binaryName] == '2':
374
- final_data.at[i, annot] = '0.0'
375
- final_data.rename(
376
- columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue',
377
- 'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db',
378
- 'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig',
379
- 'distance': 'domains_3Ddist', 'threeState_trsh4_HQ': 'location_3state',
380
- 'disulfideBinary': 'disulfide_bin', 'intMetBinary': 'intMet_bin',
381
- 'intramembraneBinary': 'intramembrane_bin',
382
- 'naturalVariantBinary': 'naturalVariant_bin', 'dnaBindingBinary': 'dnaBinding_bin',
383
- 'activeSiteBinary': 'activeSite_bin',
384
- 'nucleotideBindingBinary': 'nucleotideBinding_bin', 'lipidationBinary': 'lipidation_bin',
385
- 'siteBinary': 'site_bin',
386
- 'transmembraneBinary': 'transmembrane_bin', 'crosslinkBinary': 'crosslink_bin',
387
- 'mutagenesisBinary': 'mutagenesis_bin',
388
- 'strandBinary': 'strand_bin', 'helixBinary': 'helix_bin', 'turnBinary': 'turn_bin',
389
- 'metalBindingBinary': 'metalBinding_bin',
390
- 'repeatBinary': 'repeat_bin', 'topologicalDomainBinary': 'topologicalDomain_bin',
391
- 'caBindingBinary': 'caBinding_bin',
392
- 'bindingSiteBinary': 'bindingSite_bin', 'regionBinary': 'region_bin',
393
- 'signalPeptideBinary': 'signalPeptide_bin',
394
- 'modifiedResidueBinary': 'modifiedResidue_bin', 'zincFingerBinary': 'zincFinger_bin',
395
- 'motifBinary': 'motif_bin',
396
- 'coiledCoilBinary': 'coiledCoil_bin', 'peptideBinary': 'peptide_bin',
397
- 'transitPeptideBinary': 'transitPeptide_bin',
398
- 'glycosylationBinary': 'glycosylation_bin', 'propeptideBinary': 'propeptide_bin',
399
- 'disulfide': 'disulfide_dist', 'intMet': 'intMet_dist',
400
- 'intramembrane': 'intramembrane_dist', 'naturalVariant': 'naturalVariant_dist',
401
- 'dnaBinding': 'dnaBinding_dist', 'activeSite': 'activeSite_dist',
402
- 'nucleotideBinding': 'nucleotideBinding_dist', 'lipidation': 'lipidation_dist',
403
- 'site': 'site_dist',
404
- 'transmembrane': 'transmembrane_dist', 'crosslink': 'crosslink_dist',
405
- 'mutagenesis': 'mutagenesis_dist', 'strand': 'strand_dist', 'helix': 'helix_dist',
406
- 'turn': 'turn_dist',
407
- 'metalBinding': 'metalBinding_dist', 'repeat': 'repeat_dist',
408
- 'topologicalDomain': 'topologicalDomain_dist', 'caBinding': 'caBinding_dist',
409
- 'bindingSite': 'bindingSite_dist', 'region': 'region_dist',
410
- 'signalPeptide': 'signalPeptide_dist', 'modifiedResidue': 'modifiedResidue_dist',
411
- 'zincFinger': 'zincFinger_dist', 'motif': 'motif_dist', 'coiledCoil': 'coiledCoil_dist',
412
- 'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist',
413
- 'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True)
414
-
415
- final_data = final_data[
416
- ['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position','Source', 'meta_merged', 'composition', 'polarity',
417
- 'volume',
418
- 'granthamScore', 'domains_all',
419
- 'domains_sig', 'domains_3Ddist', 'sasa', 'location_3state', 'disulfide_bin', 'intMet_bin',
420
- 'intramembrane_bin', 'naturalVariant_bin', 'dnaBinding_bin',
421
- 'activeSite_bin', 'nucleotideBinding_bin', 'lipidation_bin', 'site_bin',
422
- 'transmembrane_bin', 'crosslink_bin', 'mutagenesis_bin', 'strand_bin',
423
- 'helix_bin', 'turn_bin', 'metalBinding_bin', 'repeat_bin',
424
- 'caBinding_bin', 'topologicalDomain_bin', 'bindingSite_bin',
425
- 'region_bin', 'signalPeptide_bin', 'modifiedResidue_bin',
426
- 'zincFinger_bin', 'motif_bin', 'coiledCoil_bin', 'peptide_bin',
427
- 'transitPeptide_bin', 'glycosylation_bin', 'propeptide_bin', 'disulfide_dist', 'intMet_dist',
428
- 'intramembrane_dist',
429
- 'naturalVariant_dist', 'dnaBinding_dist', 'activeSite_dist',
430
- 'nucleotideBinding_dist', 'lipidation_dist', 'site_dist',
431
- 'transmembrane_dist', 'crosslink_dist', 'mutagenesis_dist',
432
- 'strand_dist', 'helix_dist', 'turn_dist', 'metalBinding_dist',
433
- 'repeat_dist', 'caBinding_dist', 'topologicalDomain_dist',
434
- 'bindingSite_dist', 'region_dist', 'signalPeptide_dist',
435
- 'modifiedResidue_dist', 'zincFinger_dist', 'motif_dist',
436
- 'coiledCoil_dist', 'peptide_dist', 'transitPeptide_dist',
437
- 'glycosylation_dist', 'propeptide_dist']]
438
- # Imputation
439
- if (impute == 'True') or (impute == 'true') or (impute == True):
440
- filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99,
441
- 16.82,
442
- 20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
443
- col_index = 0
444
- for col_ in final_data.columns[-30:]:
445
- final_data[col_] = final_data[col_].fillna(filler[col_index])
446
- final_data[col_] = final_data[col_].replace({'nan': filler[col_index]})
447
- col_index += 1
448
- final_data['domains_3Ddist'] = final_data['domains_3Ddist'].fillna(24.5)
449
- final_data['sasa'] = final_data['sasa'].fillna(29.5)
450
- final_data['location_3state'] = final_data['location_3state'].fillna('unknown')
451
- elif (impute == 'False') or (impute == 'false'):
452
- pass
453
- final_data = final_data.replace({'nan': np.NaN})
454
- final_data.domains_all = final_data.domains_all.replace({-1: 'NULL'})
455
-
456
  # ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
457
  if len(final_data) == 0:
458
  print(
 
82
  out_path = path_to_output_files / 'log.txt'
83
  #sys.stdout = open(out_path, 'w')
84
  data = clean_data(input_set)
85
+ if len(data) == 0:
86
+ st.write('Feature vectore generation terminated. Please enter a query or check your input format.')
87
+ else:
88
  data = add_uniprot_sequence(data)
89
  match = data[(data.wt_sequence_match == 'm')]
90
  org_len = len(match)
91
  iso = data[(data.wt_sequence_match == 'i')]
92
  noMatch = data[(data.wt_sequence_match != 'm') & (data.wt_sequence_match != 'i')]
93
+
94
+ if len(noMatch) == len(data) :
95
+ st.write('Aminoacid at the position could not be mapped to canonical or isoform sequence. Please check the input amino acid.')
96
+ elif len(noMatch) > 0:
97
+ st.write(
98
+ f'{len(noMatch)} of {len(data)} datapoints has not been mapped to any sequence. These datapoints are omitted.')
99
+ if len(iso) > 0:
100
+ st.write(f'{len(iso)} of {len(data)} datapoints has been mapped to isoform sequences. These datapoints are omitted.')
101
+ if len(match) == 0:
102
+ st.write('Feature generation terminated due to failed mapping of input amino acid to UniProt sequence.')
103
  else:
104
+ st.write(f'{len(match)} of {len(data)} datapoints has been mapped to canonical sequences. Proceeding with these datapoins.')
105
+ if (len(iso) != 0) | (len(noMatch) != 0):
106
+ st.write('Omitted datapoints are:', noMatch.datapoint.to_list() + iso.datapoint.to_list())
107
+ st.write('\n')
108
+ st.write('Check log file for updates.')
109
+
110
+ data = match[['uniprotID', 'wt', 'pos', 'mut', 'datapoint']]
111
+ print('>> Feature vector generation started...\n')
112
+ print('\n>> Creating directories...')
113
+ print('\n>> Adding physicochemical properties...\n')
114
+ data = add_physicochemical(data)
115
+ print('\n>> Adding domains\n')
116
+ data = add_domains(data, path_to_domains)
117
+ print('\n>> Adding sequence annotations...\n')
118
+ data = add_annotations(data)
119
+ print('\n>> Retrieving PDB structure information...\n')
120
+ pdb_info = addPDBinfo(data, path_to_output_files)
121
+ if len(pdb_info) != 0:
122
+ data = pd.merge(data, pdb_info, on='uniprotID', how='left')
123
+ # Spare datapoint if there is no associated PDB.
124
+ no_pdb = data[data.pdbID.isna()].drop_duplicates()
125
+ pdb = data[~data.pdbID.isna()].drop_duplicates()
126
+ # Spare datapoint if associated PDB does not cover mutated area.
127
+ pdb.pos = pdb.pos.apply(lambda x:int(x))
128
+ pdb.start = pdb.start.apply(lambda x: int(x))
129
+ pdb.end = pdb.end.apply(lambda x: int(x))
130
+ no_pdb_add = pdb[~((pdb.pos > pdb.start) & (pdb.pos < pdb.end))]
131
+
132
+ pdb = pdb[(pdb.pos > pdb.start) & (pdb.pos < pdb.end)] # do not change order
133
+
134
+ pdb.reset_index(drop=True, inplace=True)
135
+ # Delete spared datapoint from no_pdb list if it has any other PDB that spans the mutated area.
136
+ no_pdb_add = no_pdb_add[~no_pdb_add.datapoint.isin(pdb.datapoint.to_list())]
137
+ # Final collection of datapoints without PDB associaton.
138
+ no_pdb = pd.concat([no_pdb, no_pdb_add])
139
+ no_pdb = no_pdb[SIMPLE_COLS]
140
+ no_pdb = no_pdb.drop_duplicates()
141
+
142
+ pdb = pdb.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
143
+ pdb.reset_index(drop=True, inplace=True)
144
+ pdb.fillna(np.NaN, inplace=True)
145
+ # Get position mapping from added structures
146
+ print('\n>> Adding structure residue positions...\n')
147
+ if len(pdb) > 0: # there are mapped structures, and some of them span the mutated area.
148
+ pdb.replace({'[]': np.NaN, 'nan-nan': np.NaN, '': np.NaN}, inplace=True)
149
+ pdb = pdbMapping(pdb, Path(path_to_output_files / 'pdb_structures'))
 
 
 
150
  pdb.reset_index(drop=True, inplace=True)
151
+ pdb = pdb.fillna(np.NaN)
152
+ no_pdb_add_ = pdb[pdb.AAonPDB.isna()]
153
+ no_pdb_add = pdb[pdb.MATCHDICT.isna()]
154
+ no_pdb = pd.concat([no_pdb_add_, no_pdb, no_pdb_add])
155
+ no_pdb.reset_index(inplace=True, drop=True)
156
+ pdb = pdb[~(pdb.MATCHDICT.isna())]
157
+ pdb = pdb[~(pdb.AAonPDB.isna())]
158
+ if len(pdb) > 0:
159
+ print('\n>> Mapping to PDB residues...\n')
160
+ pdb = changeUPtoPDB(pdb)
161
  pdb.reset_index(drop=True, inplace=True)
162
+ print('\n>> Calculating 3D distances for PDB structures...\n')
163
+ pdb = isZeroDistance(pdb)
164
+ pdb = processFile(pdb, path_to_output_files)
165
+ pdb = match3D(pdb)
166
+ pdb = selectMaxAnnot(pdb)
167
+ pdb = pdb.sort_values(by=['datapoint', 'resolution', 'annotTotal'], ascending=[True, True, True])
168
+ pdb = pdb.drop_duplicates(['datapoint'])
169
+ pdb.replace({'[]': np.NaN, 'hit':0.0}, inplace=True)
170
+ print('\n>> PDB matching is completed...\n')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  else:
172
+ # There was no residue match in the associated PDB. So we cannot use PDB data.
173
  pdb = pdb[SIMPLE_COLS]
174
  print('\n>>> No PDB structure could be matched.')
175
+
 
176
  else:
177
+ pdb = pdb[SIMPLE_COLS]
178
  print('\n>>> No PDB structure could be matched.')
179
+
180
+
181
+ else:
182
+ pdb = pd.DataFrame(columns = SIMPLE_COLS)
183
+ print('\n>>> No PDB structure could be matched.')
184
+ no_pdb = data.copy()
185
+ no_pdb = no_pdb[SIMPLE_COLS]
186
+
187
+ print(
188
+ 'PDB phase is finished...\nPDB structures are found for %d of %d.\n%d of %d failed to match with PDB structure.\n'
189
+ % (len(pdb.drop_duplicates(['datapoint'])), len(data.drop_duplicates(['datapoint'])),
190
+ len(no_pdb.drop_duplicates(['datapoint'])), len(data.drop_duplicates(['datapoint']))))
191
+
192
+
193
+
194
+ print('\n>>> Proceeding to SwissModel search...')
195
+ print('------------------------------------\n')
196
+ swiss = no_pdb.copy()
197
+ if len(swiss) > 0:
198
+ print('\n>> Adding SwissModel residue positions...\n')
199
+ swiss.replace({'[]': np.NaN, 'nan-nan': np.NaN, '': np.NaN}, inplace=True)
200
+ swiss = swiss.fillna(np.NaN)
201
+ swiss, no_swiss_models= addSwissModels(swiss, path_to_input_files, path_to_output_files)
202
+ print('\n>> Mapping to SwissModels...\n')
203
  if len(swiss) > 0:
204
+ swiss.reset_index(drop=True, inplace=True)
205
+ swiss = changeUPtoModels(swiss)
206
+ swiss.reset_index(drop=True, inplace=True)
207
+ print('\n>> Calculating 3D distances for SwissModels...\n')
208
+ swiss = isZeroDistance(swiss)
209
+ swiss = match3DModels(swiss)
210
+ swiss = selectMaxAnnot(swiss)
211
+ swiss = swiss.sort_values(by=['datapoint', 'qmean_norm', 'distance', 'hitTotal', 'annotTotal'], ascending=[True, False, True, False, True])
212
+ swiss = swiss.drop_duplicates(['datapoint'])
213
+ swiss.replace({'[]': np.NaN, 'hit': 0.0}, inplace=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  else:
215
  swiss = swiss[SIMPLE_COLS]
 
216
 
217
+ if len(no_swiss_models) > 0:
218
+ no_swiss_models = no_swiss_models[SIMPLE_COLS]
219
+ no_swiss_models.reset_index(inplace=True, drop=True)
 
 
 
 
 
 
 
 
 
220
 
221
+ else:
222
+ swiss = swiss[SIMPLE_COLS]
223
+ no_swiss_models = no_pdb.copy()
224
+
225
+ if len(no_swiss_models) >0:
226
+ modbase = no_swiss_models.copy()
227
+ print('Proceeding to Modbase search...')
228
+ print('------------------------------------\n')
229
+
230
+ modbase = modbase[SIMPLE_COLS]
231
+ modbase.replace({'[]': np.NaN, 'nan-nan': np.NaN, '': np.NaN}, inplace=True)
232
+ modbase = modbase.fillna(np.NaN)
233
+ print('\n>> Adding Modbase residue positions...\n')
234
+ modbase_simple = modbase[['uniprotID', 'wt', 'pos', 'mut','datapoint']]
235
+ modbase_simple = modbase_simple.drop_duplicates(['uniprotID', 'wt', 'pos' ,'mut','datapoint'])
236
+ modbaseOut, no_modbase_models_updated = addModbaseModels(modbase_simple, path_to_input_files, path_to_output_files)
237
+
238
+ if len(modbaseOut) > 0:
239
+ modbase = modbase.merge(modbaseOut, on = ['uniprotID', 'wt', 'pos', 'mut','datapoint'], how = 'left')
240
+ no_modbase_models_updated['sasa'] = np.NaN
241
+ modbase.reset_index(inplace=True, drop=True)
242
+ no_modbase_add = modbase[pd.isna(modbase.coordinates)]
243
+ modbase = modbase[~pd.isna(modbase.coordinates)]
244
+ no_modbase_models_updated = pd.concat([no_modbase_models_updated, no_modbase_add])
245
+ print('\n>> Mapping to Modbase models...\n')
246
+ modbase = changeUPtoModels(modbase)
247
+ print('\n>> Calculating 3D distances for Modbase models...\n')
248
+ modbase = isZeroDistance(modbase)
249
+ modbase = match3DModels(modbase)
250
+ modbase = selectMaxAnnot(modbase)
251
+ modbase = modbase.sort_values(by=['datapoint', 'quality_score', 'distance','hitTotal', 'annotTotal'], ascending=[True, False, True, False, True])
252
+ modbase = modbase.drop_duplicates(['datapoint'])
253
+ modbase.replace({'[]': np.NaN, 'hit': 0.0}, inplace=True)
254
+
255
+ else:
256
+ modbase = pd.DataFrame(columns = SIMPLE_COLS)
257
 
258
+ else:
259
+ no_modbase_models_updated = pd.DataFrame(columns = SIMPLE_COLS)
260
+ modbase= pd.DataFrame(columns = SIMPLE_COLS)
261
+
262
+ COLS = ['uniprotID', 'wt', 'pos', 'mut', 'datapoint', 'composition', 'polarity', 'volume', 'granthamScore', 'domain', 'domStart', 'domEnd', 'distance',
263
+ 'region', 'crosslink', 'peptide', 'disulfide', 'signalPeptide', 'propeptide', 'naturalVariant', 'nucleotideBinding', 'modifiedResidue', 'site',
264
+ 'caBinding', 'turn', 'transmembrane', 'repeat', 'glycosylation', 'intramembrane', 'metalBinding', 'bindingSite', 'dnaBinding', 'activeSite',
265
+ 'coiledCoil', 'helix', 'mutagenesis', 'zincFinger', 'transitPeptide', 'intMet', 'strand', 'lipidation', 'motif', 'topologicalDomain',
266
+ 'disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', 'nucleotideBindingBinary',
267
+ 'lipidationBinary', 'siteBinary', 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
268
+ 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', 'modifiedResidueBinary', 'zincFingerBinary',
269
+ 'motifBinary', 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', 'glycosylationBinary', 'propeptideBinary', 'sasa']
270
+
271
+ if len(no_modbase_models_updated) == 0:
272
+ no_modbase_models_updated = pd.DataFrame(columns = SIMPLE_COLS)
273
+ no_modbase_models_updated = no_modbase_models_updated[~no_modbase_models_updated.datapoint.isin(modbase.datapoint.to_list())]
274
+ no_modbase_models_updated = no_modbase_models_updated[['uniprotID', 'wt', 'pos', 'mut', 'datapoint']]
275
+ no_modbase_models_updated.pos = no_modbase_models_updated.pos.astype(int)
276
+ no_modbase_models_updated = no_modbase_models_updated.drop_duplicates()
277
+
278
+
279
+ if len(pdb)>0:
280
+ pdb = pdb[COLS]
281
+ pdb['Source'] = 'PDB'
282
+ else:
283
+ pdb = pd.DataFrame()
284
+ if len(swiss) > 0:
285
+ swiss = swiss[COLS]
286
+ swiss['Source'] = 'SWISS-Model'
287
+ else:
288
+ swiss = pd.DataFrame()
289
+ if len(modbase) > 0:
290
+ modbase = modbase[COLS]
291
+ modbase['Source'] = 'Modbase'
292
+ else:
293
+ modbase = pd.DataFrame()
294
+
295
+
296
+ # st.write('======PDB==========')
297
+ # st.write(pdb.to_string())
298
+ # st.write('======SWISS==========')
299
+ # st.write(swiss.to_string())
300
+ # st.write('======MODBASE==========')
301
+ # st.write(modbase.to_string())
302
+
303
+
304
+
305
+ allData = pd.concat([pdb, swiss, modbase])
306
+ allData.reset_index(inplace=True, drop=True)
307
+ allData.replace({np.NaN: ''}, inplace=True)
308
+ # st.write('======ALL DATA==========')
309
+ # st.write(allData.to_string())
310
+ if len(allData)>0:
311
+ allData.distance.replace({-1000: ''}, inplace=True)
312
+
313
+
314
+ # Get interface positions from ECLAIR. Download HQ human
315
+ print()
316
+ print('Assigning surface regions...')
317
+ print('------------------------------------\n')
318
+
319
+ print('Extracting interface residues...\n')
320
+ data_interface = pd.read_csv(path_to_interfaces, sep='\t')
321
+
322
+ positions = get_interface_positions(data_interface, 'P1', 'P2')
323
+
324
+ interface_dataframe = pd.DataFrame()
325
+
326
+ for key, val in positions.items():
327
+ k = pd.Series((key, str(list(set(val)))))
328
+ interface_dataframe = interface_dataframe.append(k, ignore_index=True)
329
+ interface_dataframe.columns = ['uniprotID', 'positions']
330
+ final_data = finalTouch(allData)
331
+ final_data = final_data.merge(interface_dataframe, on='uniprotID', how='left')
332
+ final_data.positions = final_data.positions.astype('str')
333
+ for i in final_data.index:
334
+ if (str(final_data.at[i, 'pos']) in final_data.at[i, 'positions']) and final_data.at[i, 'trsh4'] == 'surface':
335
+ final_data.at[i, 'threeState_trsh4_HQ'] = 'interface'
336
+ elif (str(final_data.at[i, 'pos']) not in final_data.at[i, 'positions']) and final_data.at[i, 'trsh4'] == 'surface':
337
+ final_data.at[i, 'threeState_trsh4_HQ'] = 'surface'
338
+ elif (str(final_data.at[i, 'pos']) not in final_data.at[i, 'positions']) and final_data.at[i, 'trsh4'] == 'core':
339
+ final_data.at[i, 'threeState_trsh4_HQ'] = 'core'
340
+ elif (str(final_data.at[i, 'pos']) in final_data.at[i, 'positions']) and final_data.at[i, 'trsh4'] == 'core':
341
+ final_data.at[i, 'threeState_trsh4_HQ'] = 'conflict'
342
+ elif final_data.at[i, 'trsh4'] == 'nan':
343
+ final_data.at[i, 'threeState_trsh4_HQ'] = 'nan'
344
+
345
+ final_data.drop(['positions'], axis=1, inplace=True)
346
+
347
+ fisherResult = pd.read_csv(fisher_path, sep='\t')
348
+ significant_domains = fisherResult.domain.to_list()
349
+ for i in final_data.index:
350
+ if final_data.at[i, 'domain'] in significant_domains:
351
+ final_data.at[i, 'domain_fisher'] = final_data.at[i, 'domain']
352
  else:
353
+ final_data.at[i, 'domain_fisher'] = 'NULL'
354
+ print('Final adjustments are being done...\n')
355
+ binaryCols = UNIPROT_ANNOTATION_COLS[-30:]
356
+ final_data = final_data.astype(str)
357
+ final_data.replace({'NaN': 'nan'}, inplace=True)
358
+ for i in final_data.index:
359
+ for j in binaryCols:
360
+ final_data[j] = final_data[j].astype('str')
361
+ if (final_data.at[i, j] == '0') or (final_data.at[i, j] == '0.0'):
362
+ final_data.at[i, j] = '1'
363
+ elif final_data.at[i, j] == 'nan':
364
+ final_data.at[i, j] = '0'
365
+ elif (final_data.at[i, j] == '1') or (final_data.at[i, j] == '1.0'):
366
+ final_data.at[i, j] = '2'
 
 
 
 
 
 
 
367
 
368
+ annotCols = UNIPROT_ANNOTATION_COLS[:30]
369
+
370
+ for i in final_data.index:
371
+ for annot in annotCols:
372
+ binaryName = str(annot) + 'Binary'
373
+ if final_data.at[i, binaryName] == '2':
374
+ final_data.at[i, annot] = '0.0'
375
+ final_data.rename(
376
+ columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue',
377
+ 'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db',
378
+ 'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig',
379
+ 'distance': 'domains_3Ddist', 'threeState_trsh4_HQ': 'location_3state',
380
+ 'disulfideBinary': 'disulfide_bin', 'intMetBinary': 'intMet_bin',
381
+ 'intramembraneBinary': 'intramembrane_bin',
382
+ 'naturalVariantBinary': 'naturalVariant_bin', 'dnaBindingBinary': 'dnaBinding_bin',
383
+ 'activeSiteBinary': 'activeSite_bin',
384
+ 'nucleotideBindingBinary': 'nucleotideBinding_bin', 'lipidationBinary': 'lipidation_bin',
385
+ 'siteBinary': 'site_bin',
386
+ 'transmembraneBinary': 'transmembrane_bin', 'crosslinkBinary': 'crosslink_bin',
387
+ 'mutagenesisBinary': 'mutagenesis_bin',
388
+ 'strandBinary': 'strand_bin', 'helixBinary': 'helix_bin', 'turnBinary': 'turn_bin',
389
+ 'metalBindingBinary': 'metalBinding_bin',
390
+ 'repeatBinary': 'repeat_bin', 'topologicalDomainBinary': 'topologicalDomain_bin',
391
+ 'caBindingBinary': 'caBinding_bin',
392
+ 'bindingSiteBinary': 'bindingSite_bin', 'regionBinary': 'region_bin',
393
+ 'signalPeptideBinary': 'signalPeptide_bin',
394
+ 'modifiedResidueBinary': 'modifiedResidue_bin', 'zincFingerBinary': 'zincFinger_bin',
395
+ 'motifBinary': 'motif_bin',
396
+ 'coiledCoilBinary': 'coiledCoil_bin', 'peptideBinary': 'peptide_bin',
397
+ 'transitPeptideBinary': 'transitPeptide_bin',
398
+ 'glycosylationBinary': 'glycosylation_bin', 'propeptideBinary': 'propeptide_bin',
399
+ 'disulfide': 'disulfide_dist', 'intMet': 'intMet_dist',
400
+ 'intramembrane': 'intramembrane_dist', 'naturalVariant': 'naturalVariant_dist',
401
+ 'dnaBinding': 'dnaBinding_dist', 'activeSite': 'activeSite_dist',
402
+ 'nucleotideBinding': 'nucleotideBinding_dist', 'lipidation': 'lipidation_dist',
403
+ 'site': 'site_dist',
404
+ 'transmembrane': 'transmembrane_dist', 'crosslink': 'crosslink_dist',
405
+ 'mutagenesis': 'mutagenesis_dist', 'strand': 'strand_dist', 'helix': 'helix_dist',
406
+ 'turn': 'turn_dist',
407
+ 'metalBinding': 'metalBinding_dist', 'repeat': 'repeat_dist',
408
+ 'topologicalDomain': 'topologicalDomain_dist', 'caBinding': 'caBinding_dist',
409
+ 'bindingSite': 'bindingSite_dist', 'region': 'region_dist',
410
+ 'signalPeptide': 'signalPeptide_dist', 'modifiedResidue': 'modifiedResidue_dist',
411
+ 'zincFinger': 'zincFinger_dist', 'motif': 'motif_dist', 'coiledCoil': 'coiledCoil_dist',
412
+ 'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist',
413
+ 'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True)
414
+
415
+ final_data = final_data[
416
+ ['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position','Source', 'meta_merged', 'composition', 'polarity',
417
+ 'volume',
418
+ 'granthamScore', 'domains_all',
419
+ 'domains_sig', 'domains_3Ddist', 'sasa', 'location_3state', 'disulfide_bin', 'intMet_bin',
420
+ 'intramembrane_bin', 'naturalVariant_bin', 'dnaBinding_bin',
421
+ 'activeSite_bin', 'nucleotideBinding_bin', 'lipidation_bin', 'site_bin',
422
+ 'transmembrane_bin', 'crosslink_bin', 'mutagenesis_bin', 'strand_bin',
423
+ 'helix_bin', 'turn_bin', 'metalBinding_bin', 'repeat_bin',
424
+ 'caBinding_bin', 'topologicalDomain_bin', 'bindingSite_bin',
425
+ 'region_bin', 'signalPeptide_bin', 'modifiedResidue_bin',
426
+ 'zincFinger_bin', 'motif_bin', 'coiledCoil_bin', 'peptide_bin',
427
+ 'transitPeptide_bin', 'glycosylation_bin', 'propeptide_bin', 'disulfide_dist', 'intMet_dist',
428
+ 'intramembrane_dist',
429
+ 'naturalVariant_dist', 'dnaBinding_dist', 'activeSite_dist',
430
+ 'nucleotideBinding_dist', 'lipidation_dist', 'site_dist',
431
+ 'transmembrane_dist', 'crosslink_dist', 'mutagenesis_dist',
432
+ 'strand_dist', 'helix_dist', 'turn_dist', 'metalBinding_dist',
433
+ 'repeat_dist', 'caBinding_dist', 'topologicalDomain_dist',
434
+ 'bindingSite_dist', 'region_dist', 'signalPeptide_dist',
435
+ 'modifiedResidue_dist', 'zincFinger_dist', 'motif_dist',
436
+ 'coiledCoil_dist', 'peptide_dist', 'transitPeptide_dist',
437
+ 'glycosylation_dist', 'propeptide_dist']]
438
+ # Imputation
439
+ if (impute == 'True') or (impute == 'true') or (impute == True):
440
+ filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99,
441
+ 16.82,
442
+ 20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
443
+ col_index = 0
444
+ for col_ in final_data.columns[-30:]:
445
+ final_data[col_] = final_data[col_].fillna(filler[col_index])
446
+ final_data[col_] = final_data[col_].replace({'nan': filler[col_index]})
447
+ col_index += 1
448
+ final_data['domains_3Ddist'] = final_data['domains_3Ddist'].fillna(24.5)
449
+ final_data['sasa'] = final_data['sasa'].fillna(29.5)
450
+ final_data['location_3state'] = final_data['location_3state'].fillna('unknown')
451
+ elif (impute == 'False') or (impute == 'false'):
452
+ pass
453
+ final_data = final_data.replace({'nan': np.NaN})
454
+ final_data.domains_all = final_data.domains_all.replace({-1: 'NULL'})
455
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
456
  # ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
457
  if len(final_data) == 0:
458
  print(