Spaces:

HUBioDataLab
/

ASCARIS

Sleeping

App Files Files Community

fatmacankara committed on Dec 13, 2023

Commit

97b1b63

1 Parent(s): 993c0d3

Delete code/add_alignment.py

Browse files

Files changed (1) hide show

code/add_alignment.py +0 -350

code/add_alignment.py DELETED Viewed

@@ -1,350 +0,0 @@
from pathlib import Path

from Bio import Align
from Bio.Align import substitution_matrices
from Bio.pairwise2 import format_alignment

# Module-level aligner instance; do_alignment() configures its mode, scoring
# matrix and gap penalties before each use.  Created after the import group
# so that no executable statement sits between imports.
aligner = Align.PairwiseAligner()
# Translation table built once at import time.  Keys are non-standard or
# ambiguous one-letter codes, values are the standard residue substituted for
# them; '*' (stop codon) maps to None so that it is removed.
_NON_STANDARD_AA_TABLE = str.maketrans({
    'B': 'D',   # Asx -> aspartic acid
    'Z': 'E',   # Glx -> glutamic acid
    'X': 'A',   # unknown/ambiguous -> alanine placeholder
    'U': 'C',   # selenocysteine -> cysteine
    'J': 'L',   # Leu/Ile -> leucine
    'O': 'K',   # pyrrolysine -> lysine
    '*': None,  # stop codon -> dropped
})


def convert_non_standard_amino_acids(sequence):
    """Replace non-standard or ambiguous amino acid codes with a standard one.

    Args:
        sequence: One-letter amino acid codes, as a ``str`` or as any
            iterable of single-character strings.

    Returns:
        A ``str`` in which B, Z, X, U, J and O are replaced by D, E, A, C, L
        and K respectively and every '*' is removed.  All other characters
        (including lowercase letters) are passed through unchanged.
    """
    if not isinstance(sequence, str):
        # Keep supporting iterables of characters without calling a
        # same-named ``translate`` method that a non-str object might define.
        sequence = ''.join(sequence)
    return sequence.translate(_NON_STANDARD_AA_TABLE)
def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
    """Locally align a UniProt sequence against a PDB chain sequence.

    Both sequences are first passed through
    ``convert_non_standard_amino_acids``.  The module-level ``aligner`` is
    then configured for local alignment with BLOSUM62, gap-open -11 and
    gap-extend -1.

    Args:
        identifier: Label printed in the progress message.
        uniprotSequence: Target sequence (first row of each alignment).
        pdbSequence: Query sequence (third row of each alignment).
        alignment_path: Not used by this function; kept so existing callers
            keep working.

    Returns:
        A list with one entry per alignment produced by the aligner.  Each
        entry is the list of text rows of ``str(alignment)`` with every space
        replaced by '.'.  An empty list is returned when ``pdbSequence`` is
        empty.

    NOTE(review): the downstream helpers index rows 0/1/2 as
    UniProt / match line / PDB, which presumes the three-row text layout of
    ``str(alignment)`` -- confirm against the installed Biopython version.
    """
    print(f'Aligning Datapoint: {identifier}')
    alignment_list = []
    if len(pdbSequence) < 1:
        # Previously this path hit ``return alignment_list`` with the name
        # never assigned (UnboundLocalError).
        return alignment_list

    uniprotSequence = convert_non_standard_amino_acids(uniprotSequence)
    pdbSequence = convert_non_standard_amino_acids(pdbSequence)
    aligner.mode = 'local'
    aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
    aligner.open_gap_score = -11
    aligner.extend_gap_score = -1

    # Iterate the result directly instead of materialising it with list().
    for alignment in aligner.align(uniprotSequence, pdbSequence):
        # Strip only the surrounding newlines.  A plain .strip() would also
        # remove leading padding spaces from the first row (and trailing ones
        # from the last row), shifting that row relative to the other two and
        # hiding the leading gap that callers measure as ``startGap``.
        rows = str(alignment).strip('\n').split('\n')
        alignment_list.append([row.replace(' ', '.') for row in rows])
    return alignment_list
def mutation_position_on_pdb(alignment_list, pos):
    """Map a UniProt residue number onto the PDB sequence numbering.

    The alignments are tried in order and the search stops at the first one
    whose match row carries '|' or 'X' in the column of residue ``pos``.

    Args:
        alignment_list: Alignments as produced by ``do_alignment``: each one
            is a sequence of rows where row 0 is the UniProt sequence, row 1
            the match line and row 2 the PDB sequence, with '.' or '-' as gap
            / padding characters.
        pos: 1-based residue number in the UniProt sequence (anything
            ``int()`` accepts).

    Returns:
        A 4-tuple ``(pdb_alignStatus, mutationPositionOnPDB, startGap,
        alignment)``:

        * ``pdb_alignStatus`` -- 'aligned' when the UniProt and PDB residues
          in the matched column are equal, 'aligned*' when they differ,
          'not_aligned' when no alignment matched.
        * ``mutationPositionOnPDB`` -- 1-based index of the residue within
          the PDB row counted without gap characters, as a string, or 'nan'.
        * ``startGap`` -- number of leading gap characters in the UniProt row
          of the last alignment examined (0 if none was examined).
        * ``alignment`` -- the last alignment examined, or ``None`` when
          ``alignment_list`` is empty.
    """
    gap_chars = ('.', '-')
    target = int(pos)

    # Defaults cover an empty alignment_list, which previously raised
    # UnboundLocalError / IndexError at the return statement.
    pdb_alignStatus = 'not_aligned'
    mutationPositionOnPDB = 'nan'
    startGap = 0
    alignment_used = None

    for alignment in alignment_list:
        alignment_used = alignment
        alignment_uniprot = alignment[0]
        match_row = alignment[1]
        alignment_pdb = alignment[2]

        # Every outcome except a match leaves this alignment "not aligned";
        # resetting here also covers the branch that used to leave
        # pdb_alignStatus unassigned.
        pdb_alignStatus = 'not_aligned'
        mutationPositionOnPDB = 'nan'

        # Number of leading gap/padding characters in the UniProt row.
        startGap = len(alignment_uniprot) - len(alignment_uniprot.lstrip('.-'))

        # Find the 0-based column holding the pos-th UniProt residue.
        column = None
        residues_seen = 0
        for index in range(startGap, len(alignment_uniprot)):
            if alignment_uniprot[index] not in gap_chars:
                residues_seen += 1
                if residues_seen == target:
                    column = index
                    break

        # A position that is not present in this row cannot be mapped.  The
        # previous code fell through and inspected the last column instead,
        # which could report a bogus aligned position.
        if column is None or column >= len(match_row):
            continue
        if match_row[column] not in ('|', 'X'):
            continue

        canonicalRes = alignment_uniprot[column]
        pdbRes = alignment_pdb[column] if column < len(alignment_pdb) else 'nan'
        pdb_alignStatus = 'aligned' if canonicalRes == pdbRes else 'aligned*'
        if pdbRes != 'nan' and pdbRes not in gap_chars:
            gaps_before = sum(1 for symbol in alignment_pdb[:column] if symbol in gap_chars)
            mutationPositionOnPDB = str(column + 1 - gaps_before)
        break

    return (pdb_alignStatus, mutationPositionOnPDB, startGap, alignment_used)
def _range_annotation_column(uniprot_row, start_gap, residue_number):
    """Return the 1-based alignment column of a UniProt residue number.

    Gap characters ('.' or '-') are counted from ``start_gap`` onwards until
    ``residue_number`` residues have been seen; the result is the residue
    number plus ``start_gap`` plus the gaps counted.  If the row ends first,
    all gaps in the row are included.
    """
    gap_total = start_gap
    residues_seen = 0
    for symbol in uniprot_row[start_gap:]:
        if symbol in ('.', '-'):
            gap_total += 1
        else:
            residues_seen += 1
        if residues_seen == residue_number:
            break
    return residue_number + gap_total


def find_position_on_pdb_for_range_annotations(posAnnotation, startGap, alignment_to_use):
    """Map a UniProt range annotation ("start-end") onto PDB numbering.

    Args:
        posAnnotation: Range string of the form ``"<start>-<end>"`` with
            1-based UniProt residue numbers.
        startGap: Number of leading gap characters in the UniProt row.
        alignment_to_use: Rows of one alignment (row 0 UniProt, row 1 match
            line, row 2 PDB) with '.' / '-' as gap characters.  When empty or
            ``None`` nothing can be mapped.

    Returns:
        ``(annotation_on_up_start, annotation_on_up_end,
        annotation_on_pdb_start, annotation_on_pdb_end)``.  The first two are
        1-based alignment columns (possibly moved by the boundary
        adjustments below); the last two are PDB positions counted without
        gap characters, or the string 'nan' when unavailable.  If both PDB
        values come out equal they are both reported as 'nan'.

    Raises:
        ValueError: if either side of ``posAnnotation`` is not an integer.
    """
    gap_chars = ('.', '-')
    match_chars = ('|', 'X')
    annotation_on_pdb_start = 'nan'
    annotation_on_pdb_end = 'nan'
    count_gap_pdb = 0

    bounds = posAnnotation.split('-')
    pos1 = int(bounds[0])
    pos2 = int(bounds[1])

    if not alignment_to_use:
        # No alignment available: report the raw positions, nothing mapped.
        return pos1, pos2, annotation_on_pdb_start, annotation_on_pdb_end

    uniprot_row = alignment_to_use[0]
    annotation_on_up_start = _range_annotation_column(uniprot_row, startGap, pos1)
    annotation_on_up_end = _range_annotation_column(uniprot_row, startGap, pos2)

    # --- start of the range -------------------------------------------------
    # An IndexError here means the column lies beyond the aligned rows (for
    # example an annotation past the end of a shorter isoform); the start
    # then simply stays 'nan'.
    try:
        pdb_row = alignment_to_use[2]
        match_row = alignment_to_use[1]
        pdb_residue_start = pdb_row[annotation_on_up_start - 1].strip()
        window = slice(annotation_on_up_start - 1, annotation_on_up_end)
        if pdb_residue_start in gap_chars:
            # Start falls on a PDB gap: move it to the first matched,
            # non-gap column inside the annotated window.
            pdb_window = pdb_row[window]
            match_window = match_row[window]
            for ran in range(len(pdb_window)):
                if pdb_window[ran] not in gap_chars and match_window[ran] in match_chars:
                    annotation_on_up_start += ran
                    break
        elif match_row[annotation_on_up_start - 1] in gap_chars:
            # PDB residue present but not matched: move to the first matched
            # column inside the window.
            match_window = match_row[window]
            for ran in range(len(pdb_row[window])):
                if match_window[ran] in match_chars:
                    annotation_on_up_start += ran
                    break
        count_gap_pdb = sum(
            1 for symbol in pdb_row[0:annotation_on_up_start - 1] if symbol in gap_chars)
        # NOTE(review): this inspects the match column *after* the start
        # (index annotation_on_up_start, not annotation_on_up_start - 1),
        # unlike the end-of-range check below.  Kept as in the original;
        # confirm whether this is intended.
        if match_row[annotation_on_up_start] not in gap_chars:
            annotation_on_pdb_start = annotation_on_up_start - count_gap_pdb
    except IndexError:
        pass

    # --- end of the range ---------------------------------------------------
    try:
        pdb_row = alignment_to_use[2]
        match_row = alignment_to_use[1]
        pdb_residue_end = pdb_row[annotation_on_up_end - 1].strip()
        if pdb_residue_end in gap_chars or match_row[annotation_on_up_end - 1] in gap_chars:
            # End is a gap or unmatched: collapse start and end onto the
            # column just before the first unmatched column of the window.
            window = slice(annotation_on_up_start - 1, annotation_on_up_end)
            match_window = match_row[window]
            for ran in range(len(pdb_row[window])):
                if match_window[ran] in gap_chars:
                    annotation_on_up_start += ran - 1
                    annotation_on_up_end = annotation_on_up_start
                    break
        count_gap_pdb = sum(
            1 for symbol in pdb_row[0:annotation_on_up_end - 1] if symbol in gap_chars)
        end_marker = match_row[annotation_on_up_end - 1]
        # NOTE(review): the original condition was written as
        # ``a == '-' or b == '.' and start == 'nan'``; Python evaluates that
        # as ``a or (b and c)``, which is what is reproduced here.  Confirm
        # whether ``(a or b) and c`` was meant.
        if not (end_marker == '-' or (end_marker == '.' and annotation_on_pdb_start == 'nan')):
            annotation_on_pdb_end = annotation_on_up_end - count_gap_pdb
    except IndexError:
        pass

    if annotation_on_pdb_start == 'nan' and annotation_on_pdb_end != 'nan':
        annotation_on_pdb_start = annotation_on_up_start - count_gap_pdb
    if annotation_on_pdb_start == annotation_on_pdb_end:
        annotation_on_pdb_start = 'nan'
        annotation_on_pdb_end = 'nan'
    return annotation_on_up_start, annotation_on_up_end, annotation_on_pdb_start, annotation_on_pdb_end
def _single_annotation_pdb_position(token, start_gap, alignment):
    """Map one UniProt residue number onto PDB numbering.

    Returns the PDB position as an ``int``, or ``None`` when ``token`` is not
    an integer or the residue cannot be mapped.
    """
    gap_chars = ('.', '-')
    try:
        residue_number = int(token)
    except ValueError:
        return None

    uniprot_row, match_row, pdb_row = alignment[0], alignment[1], alignment[2]

    # 1-based alignment column of the residue: its number plus every gap
    # character that precedes it in the UniProt row.
    gap_total = start_gap
    residues_seen = 0
    for symbol in uniprot_row[start_gap:]:
        if symbol in gap_chars:
            gap_total += 1
        else:
            residues_seen += 1
        if residues_seen == residue_number:
            break
    column = residue_number + gap_total

    try:
        pdb_symbol = pdb_row[column - 1]
        # NOTE(review): this reads the match column *after* the residue
        # (index ``column``), so a residue in the last column is never
        # mapped.  Kept as in the original; confirm whether intended.
        next_marker = match_row[column]
    except IndexError:
        return None

    gaps_before = sum(1 for symbol in pdb_row[0:column - 1] if symbol in gap_chars)
    if next_marker in gap_chars or gaps_before == column or pdb_symbol in gap_chars:
        return None
    return column - gaps_before


def annotation_pos_on_pdb(annot_positions, startGap, alignment_to_use, identifier):
    """Translate UniProt annotation positions into PDB numbering.

    Args:
        annot_positions: The string 'nan', or a list (or the ``str()`` of a
            list) whose items are single residue numbers or "start-end"
            ranges.  Quotes and square brackets are stripped before the value
            is split on commas.
        startGap: Number of leading gap characters in the UniProt row.
        alignment_to_use: Rows of one alignment (row 0 UniProt, row 1 match
            line, row 2 PDB).  When empty or ``None`` an empty list is
            returned.
        identifier: Not used by this function; kept so existing callers keep
            working.

    Returns:
        A list containing, in input order, an ``int`` for every single
        position that could be mapped and a ``"<start>-<end>"`` string for
        every range item (either side may be 'nan').  Items equal to 'nan',
        '?' or the empty string, non-numeric single items and unmappable
        single positions are omitted.
    """
    newpos = []
    if not alignment_to_use or annot_positions == 'nan':
        return newpos

    cleaned = str(annot_positions).translate(str.maketrans('', '', "'[]"))
    tokens = [token.strip() for token in cleaned.split(',')]

    for token in tokens:
        if token in ('nan', '?', ''):
            continue
        if '-' in token:
            # Reset per item: previously, when the range lookup raised, the
            # start/end of an earlier range in the same list were reused.
            position_start_on_pdb = 'nan'
            position_end_on_pdb = 'nan'
            try:
                mapped = find_position_on_pdb_for_range_annotations(
                    token, startGap, alignment_to_use)
                position_start_on_pdb = mapped[2]
                position_end_on_pdb = mapped[3]
            except (ValueError, IndexError, TypeError):
                # Malformed range text or unusable alignment rows: keep the
                # 'nan-nan' placeholder for this item.
                pass
            newpos.append(str(position_start_on_pdb) + '-' + str(position_end_on_pdb))
        else:
            annotation_on_pdb = _single_annotation_pdb_position(
                token, startGap, alignment_to_use)
            if annotation_on_pdb is not None:
                newpos.append(annotation_on_pdb)
    return newpos
def final_stage(df, annotation_list, alignment_path):
    """Align every row of ``df`` and write the PDB-mapped positions back.

    For each row the UniProt and PDB sequences are aligned with
    ``do_alignment``; the mutation position, every annotation column named
    in ``annotation_list`` and the domain boundaries are then translated to
    PDB numbering.

    Args:
        df: DataFrame read and updated in place.  Columns read: 'uniprotID',
            'pdbID', 'chain', 'uniprotSequence', 'pdbSequence', 'pos',
            'domStart', 'domEnd' and each name in ``annotation_list``.
            Columns written: 'pdb_alignStatus', 'mutationPositionOnPDB',
            'domainStartonPDB', 'domainEndonPDB' and each annotation column.
        annotation_list: Names of the annotation columns to convert.
        alignment_path: Forwarded unchanged to ``do_alignment``.

    Returns:
        The same ``df`` object.
    """
    missing_domain_values = ('nan', '-1', '-1.0')
    for i in df.index:
        identifier = df.at[i, 'uniprotID'] + '_' + df.at[i, 'pdbID'] + '_' + df.at[i, 'chain'] + '_'
        alignment_list = do_alignment(identifier, df.at[i, 'uniprotSequence'], df.at[i, 'pdbSequence'], alignment_path)

        # One call instead of four identical ones; the result is a 4-tuple.
        pdb_alignStatus, mutationPositionOnPDB, startGap, alignment_to_use = \
            mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])
        df.at[i, 'pdb_alignStatus'] = pdb_alignStatus
        df.at[i, 'mutationPositionOnPDB'] = mutationPositionOnPDB

        for annot in annotation_list:
            df.at[i, annot] = annotation_pos_on_pdb(df.at[i, annot], startGap, alignment_to_use, identifier)

        domStart = str(df.at[i, 'domStart'])
        domEnd = str(df.at[i, 'domEnd'])
        if domStart not in missing_domain_values and domEnd not in missing_domain_values:
            # Drop a trailing ".0" so float-typed boundaries parse as ints.
            domainLoc = domStart.split('.')[0] + '-' + domEnd.split('.')[0]
            domain_pos = find_position_on_pdb_for_range_annotations(domainLoc, startGap, alignment_to_use)
            df.at[i, 'domainStartonPDB'] = domain_pos[2]
            df.at[i, 'domainEndonPDB'] = domain_pos[3]
        else:
            # The original ``elif`` chained ``!= '-1' or != '-1.0'`` tests on
            # the same value, which can never be false, so it was already an
            # unconditional fallback.
            df.at[i, 'domainStartonPDB'] = 'nan'
            df.at[i, 'domainEndonPDB'] = 'nan'
    return df
def alignment(dataframe_to_align, annotation_list, alignment_path):
    """Entry point: run ``final_stage`` on the dataframe and return its result.

    Args:
        dataframe_to_align: DataFrame passed to ``final_stage``.
        annotation_list: Annotation column names passed to ``final_stage``.
        alignment_path: Passed through to ``final_stage``.

    Returns:
        Whatever ``final_stage`` returns for these arguments.
    """
    return final_stage(dataframe_to_align, annotation_list, alignment_path)