# import sys, os # SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) # sys.path.append(SCRIPT_DIR) from Unified_parser.globals import * # contains helper functions used in parser.py # repeated replacement of a subtring sub with tar in input until no change happens def rec_replace(input : str, sub : str, tar : str): while True: output = input.replace(sub, tar) if output == input: break input = output return output # function - RemoveUnwanted() - referenced in lines 63 - 109 of unified.y def RemoveUnwanted(input : str) -> str: # ignore punctuations punctuationList = ["!",";",":","@","#","$","%","^","&","*",",",".","/","'","’","”","“","।", "]", "[", "×", "ñ", "∙","•"] # replacing problematic unicode characters that look the same but have different encodings. # punjabi update replaceDict = {"ऩ":"ऩ", "ऱ":"ऱ", "क़":"क़", "ख़":"ख़", "ग़":"ग़", "ज़":"ज़", "ड़":"ड़", "ढ़":"ढ़", "ढ़":"ढ़", "फ़":"फ़", "य़":"य़", "ऴ":"ऴ", "ொ":"ொ", "ோ":"ோ", "ൊ":"ൊ", "ോ":"ോ", "ല്‍‌":"ൽ", "ള്‍":"ൾ", "ര്‍":"ർ", "ന്‍":"ൻ", "ണ്‍":"ൺ"} output = "" for c in input: if c in punctuationList: continue output += c for k in replaceDict.keys(): output = rec_replace(output, k, replaceDict[k]) return output # function to replace GetFile in lines 132 - 156 of unified.y # gives the filename according to language and type def GetFile(g : GLOBALS, LangId : int, type : int) -> str: fileName = g.rootPath # return common file that contains the CPS mapping if type == 0: fileName += g.commonFile #print("file",fileName) return fileName elif type == 1: fileName += "dict/" elif type == 2: fileName += "rules/" langIdNameMapping = { 1 : "malayalam", 2 : "tamil", 3 : "telugu", 4 : "kannada", 5 : "hindi", 6 : "bengali", 7 : "gujarathi", 8 : "odiya", 9 : "punjabi", 10 : "english" } if LangId in langIdNameMapping.keys(): fileName += langIdNameMapping[LangId] if type == 1: fileName += ".dict" elif type == 2: fileName += ".rules" return fileName # function to replace SetlangId in lines 62-80 of unified.y def SetlangId(g : GLOBALS, fl : str): id = ord(fl) if(id>=3328 and id<=3455): g.currLang = g.MALAYALAM; #malayalam elif(id>=2944 and id<=3055): g.currLang = g.TAMIL; #tamil elif(id>=3202 and id<=3311): g.currLang = g.KANNADA; #KANNADA elif(id>=3072 and id<=3198): g.currLang = g.TELUGU; #telugu elif(id>=2304 and id<=2431): g.currLang = g.HINDI; #hindi elif(id>=2432 and id<=2559): g.currLang = g.BENGALI; #BENGALI elif(id>=2688 and id<=2815): g.currLang = g.GUJARATHI; #gujarathi elif(id>=2816 and id<=2943): g.currLang = g.ODIYA; #odia elif(id>=2560 and id <= 2687): # punjabi g.currLang = g.PUNJABI elif(id>=64 and id<=123): g.currLang = g.ENGLISH; #english g.langId = g.currLang if(g.langId < 5): g.isSouth = 1 if(g.langId == 0): print(f"UNKNOWN LANGUAGE - id = {fl}") exit(0) return 1 # replacement for function in lins 158 - 213. Sets the lanuage features def SetlanguageFeat(g : GLOBALS, input : str) -> int: # open common file #print("entered here") try: with open(GetFile(g, 0,0), 'r') as infile: lines = infile.readlines() #print("linessss", lines) except: print("Couldn't open common file for reading") return 0 str1 = input length = len(str1) if (length == 0): length = 1 for j in range(0,length): # for skipping invisible char if (ord(str1[j]) < 8204): firstLet = str1[j] break SetlangId(g, firstLet) # set global langId for i in range(len(lines)): l = lines[i].strip().split('\t') g.symbolTable[i][1] = l[1] g.symbolTable[i][0] = l[1 + g.langId] return 1 # replacement for function in lines 52 - 59. Check if symbol is in symbolTable def CheckSymbol(g : GLOBALS, input : str) -> int: i = 0 for i in range(g.ROW): if (g.symbolTable[i][1] == input): return 1 return 0 # replacement for function in lines 249 - 276. Convert utf-8 to cps symbols def ConvertToSymbols(g : GLOBALS, input : str) -> str: str1 = input g.words.syllabifiedWord = "&" for j in range(len(str1)): if (ord(str1[j]) < 8204): g.words.syllabifiedWord += "&" + g.symbolTable[ord(str1[j])%128][1] g.words.syllabifiedWord = g.words.syllabifiedWord[1:] return g.words.syllabifiedWord # function in lines 1278 - 1299. save answer in g.answer def WriteFile(g : GLOBALS, text : str): g.answer = f"(set! wordstruct '( {text}))" # function in lines 588-597. checnk if vowel is in input. 'q' special case, 'rq' special case def CheckVowel(input : str, q : int, rq : int) -> int: if (input.find("a") != -1): return 1 if (input.find("e") != -1): return 1 if (input.find("i") != -1): return 1 if (input.find("o") != -1): return 1 if (input.find("u") != -1): return 1 if (q and input.find("q") != -1): return 1 if (rq and input.find("rq") != -1): return 1 return 0 # function in lines 599-602. def Checkeuv(input : str) -> int: if (input.find("euv") != -1): return 1 return 0 # function in lines 605-613 def CheckSingleVowel(input : str, q : int) -> int: if (input in ['a', 'e', 'i', 'o', 'u']): return 1 if (q != 0 and input == 'q'): return 1 return 0 # function in lines 616 - 629. get the type of phone in the position def GetPhoneType(g : GLOBALS, input : str, pos : int) -> int: phone = input phone = phone.split('&') phone = list(filter(lambda x : x != '', phone)) pos = min(pos, len(phone)) pch = phone[pos - 1] if (g.flags.DEBUG): print(f'input : {input}') print(f"str : {pch} {GetType(g, pch)}") return GetType(g, pch) # function in lines 631 - 637. get the type of given input def GetType(g : GLOBALS, input : str): for i in range(g.VOWELSSIZE): if g.VOWELS[i] == input: return 1 for i in range(g.CONSONANTSSIZE): if g.CONSONANTS[i] == input: return 2 for i in range(g.SEMIVOWELSSIZE): if g.SEMIVOWELS[i] == input: return 3 return 0 # function in lines 640 - 647. check if chillaksharas are there --for malayalam def CheckChillu(input : str) -> int: l = ["nwv", "nnv", "rwv", "lwv", "lnv"] for x in l: if (input.find(x) != -1): return 1 return 0 # function in lines 650 - 660. get UTF-8 from CPS def GetUTF(g : GLOBALS, input : str) -> str : for i in range(g.ROW): if (input == g.symbolTable[i][1]): return g.symbolTable[i][0] return 0 # function in lines 663 - 666. verify the letter is english char -- CLS def isEngLetter(p : str) -> int: if (ord(p) >= 97 and ord(p) <= 122): return 1 return 0 # function in lines 669-682. remove unwanted Symbols from word def CleanseWord(phone : str) -> str: phonecopy = "" for c in phone: if (c != '&' and isEngLetter(c) == 0): c = '#' phonecopy += c phonecopy = rec_replace(phonecopy, '$','') phonecopy = rec_replace(phonecopy, '&&','&') return phonecopy # replacement for funciton in lines 321 - 356. Correct if there is a vowel in the middle def MiddleVowel(g : GLOBALS, phone : str) -> str: c1 = '' c2 = '' phonecopy = phone for i in range(g.CONSONANTSSIZE): for j in range(g.VOWELSSIZE): c1 = f'&{g.CONSONANTS[i]}&{g.VOWELS[j]}&' c2 = f'&{g.CONSONANTS[i]}&av&{g.VOWELS[j]}&' phonecopy = phonecopy.replace(c1, c2) for i in range(g.SEMIVOWELSSIZE): for j in range(g.VOWELSSIZE): c1 = f'&{g.SEMIVOWELS[i]}&{g.VOWELS[j]}&' c2 = f'&{g.SEMIVOWELS[i]}&av&{g.VOWELS[j]}&' phonecopy = phonecopy.replace(c1, c2) return phonecopy # replacement for function in lines 435 - 459. //cant use this as break syllable rules. # NOT USED ANYWHERE def DoubleModifierCorrection(phone : str) -> str: doubleModifierList = ["&nwv&","&nnv&","&rwv&","&lwv&","&lnv&","&aav&","&iiv&","&uuv&","&rqv&","&eev&", "&eiv&","&ouv&","&axv&","&oov&","&aiv&","&auv&","&aev&", "&iv&","&ov&","&ev&","&uv&"] phonecopy = phone for i in range(0,21): for j in range(0,21): c1 = f'{doubleModifierList[i]}#{doubleModifierList[j]}' c2 = f'{doubleModifierList[i]}{doubleModifierList[j]}#&' phonecopy = phonecopy.replace(c1, c2) phonecopy = rec_replace(phonecopy, "&#&hq&","&hq&#&") phonecopy = rec_replace(phonecopy, "&&","&") return phonecopy # replacement for funciton in lines 462 - 495. //for eu&C&C&V def SchwaDoubleConsonent(phone : str) -> str: consonentList = ["k","kh","lx","rx","g","gh","ng","c","ch","j","jh","nj","tx","txh","dx","dxh","nx","t","th","d","dh","n","p","ph","b","bh","m","y","r","l","w","sh","sx","zh","y","s","h","f","dxq"] vowelList = ["av&","nwv&","nnv&","rwv&","lwv&","lnv&","aav&","iiv&","uuv&","rqv&","eev&","eiv&","ouv&", "axv&","oov&","aiv&","nnx&","nxx&","rrx&","llx&","lxx&", "aa&","iv&","ov&","mq&","aa&","ii&","uu&","rq&", "ee&","ei&","ou&","oo&","ax&","ai&","ev&","uv&", "a&","e&","i&","o&","u&"] phonecopy = phone for i in range(0,39): for j in range(0,39): for k in range(0,42): c1 = f'&euv&{consonentList[i]}&{consonentList[j]}&{vowelList[k]}' c2 = f'&euv&{consonentList[i]}&av&{consonentList[j]}&{vowelList[k]}' phonecopy = phonecopy.replace(c1, c2) phonecopy = rec_replace(phonecopy, "$","") return phonecopy # replacement for function in lines 498 - 585. //halant specific correction for aryan langs def SchwaSpecificCorrection(g : GLOBALS, phone : str) -> str: schwaList = ["k","kh","g","gh","ng","c","ch","j","jh","nj","tx","txh","dx","dxh", "nx","t","th","d","dh","n","p","ph","b","bh","m","y", "r","l","s","w","sh","sx","zh","h","lx","rx","f","dxq"] vowelList = ["av&","nwv&","nnv&","rwv&","lwv&","lnv&","aav&","iiv&","uuv&","rqv&","eev&","eiv&","ouv&", "axv&","oov&","aiv&","nnx&","nxx&","rrx&","llx&","lxx&", "aa&","iv&","ov&","mq&","aa&","ii&","uu&","rq&", "ee&","ei&","ou&","oo&","ax&","ai&","ev&","uv&", "a&","e&","i&","o&","u&"] if (g.flags.DEBUG): print(f'{len(phone)}') phonecopy = phone + '!' if (g.flags.DEBUG): print(f'phone cur - {phonecopy}') # // for end correction &av&t&aav&. //dont want av for i in range(0,38): for j in range(1,42): c1 = f'&av&{schwaList[i]}&{vowelList[j]}!' c2 = f'&euv&{schwaList[i]}&{vowelList[j]}!' phonecopy = phonecopy.replace(c1, c2) phonecopy = rec_replace(phonecopy, '!', '') for i in range(0,38): c1 = f'&av&{schwaList[i]}&av&' c2 = f'&euv$&{schwaList[i]}&av$&' phonecopy = phonecopy.replace(c1, c2) if(g.flags.DEBUG): print(f"inside schwa {phonecopy}") for i in range(0,38): c1 = f'&av&{schwaList[i]}&' c3 = f'&{schwaList[i]}&' for j in range(0,41): c4 = f'&euv&{c3}${vowelList[j]}' c2 = f'{c1}{vowelList[j]}' phonecopy = phonecopy.replace(c2, c4) phonecopy = rec_replace(phonecopy, '$', '') #//&q&w&eu& - CORRECTED TO 38 - CHECK for i in range(0,38): c1 = f'&q&{schwaList[i]}&euv&' c2 = f'&q&{schwaList[i]}&av&' phonecopy = phonecopy.replace(c1, c2) return phonecopy # replacement for function in lines . //correct the geminate syllabification ,isReverse --reverse correction def GeminateCorrection(phone : str, isReverse : int) -> str: geminateList = ["k","kh","lx","rx","g","gh","ng","c","ch","j","jh","nj","tx","txh","dx","dxh","nx","t","th","d","dh","n","p","ph","b","bh","m","y", "r","l","w","sh","sx","zh","y","s","h","f","dxq"] phonecopy = phone for i in range(0, 39): c1 = f'&{geminateList[i]}&eu&{geminateList[i]}&' c2 = f'&{geminateList[i]}&{geminateList[i]}&' phonecopy = rec_replace(phonecopy, c2, c1) if isReverse != 0 else rec_replace(phonecopy, c1, c2) return phonecopy # replacement for function in lines 356 - 430. //Syllabilfy the words def Syllabilfy(phone : str) -> str: phonecopy = phone phonecopy = rec_replace(phonecopy, "&&","&") phonecopy = phonecopy.replace("&eu&","&eu&#&") phonecopy = phonecopy.replace("&euv&","&euv&#&") phonecopy = rec_replace(phonecopy, "&avq","&q&av") phonecopy = phonecopy.replace("&av&","&av&#&") phonecopy = phonecopy.replace("&q","&q&#") removeList = ["&nwv&","&nnv&","&rwv&","&lwv&","&lnv&","&aav&","&iiv&","&uuv&","&rqv&","&eev&", "&eiv&","&ouv&","&axv&","&oov&","&aiv&","&auv&","&aev&", "&nnx&","&nxx&","&rrx&","&llx&","&lxx&", "&aa&","&iv&","&ov&","&mq&","&aa&","&ii&","&uu&","&rq&","&au&","&ee&", "&ei&","&ou&","&oo&","&ax&","&ai&","&ev&","&uv&","&ae&", "&a&","&e&","&i&","&o&","&u&"] for i in range(0,45): c1 = removeList[i] c2 = c1 + '#&' phonecopy = phonecopy.replace(c1, c2) phonecopy = rec_replace(phonecopy, "&#&hq&","&hq&#&") # //for vowel in between correction pureVowelList = ["&a&","&e&","&i&","&o&","&u&"] for i in range(0,5): c1 = f'&#{pureVowelList[i]}' phonecopy = phonecopy.replace(pureVowelList[i], c1) consonantList = ["k","kh","g","gh","ng","c","ch","j","jh","nj","tx","txh","dx","dxh", "nx","t","th","d","dh","n","p","ph","b","bh","m","y", "r","l","w","sh","sx","zh","y","s","h","lx","rx","f","dxq"] # // &eu&#&r&eu&#& syllabification correction for i in range(0,39): c1 = f'&eu&#&{consonantList[i]}&euv&#&' c2 = f'&eu&{consonantList[i]}&av&#&' phonecopy = phonecopy.replace(c1, c2) for i in range(0,39): c1 = f'&euv&#&{consonantList[i]}&euv&#&' c2 = f'&euv&{consonantList[i]}&av&#&' phonecopy = phonecopy.replace(c1, c2) phonecopy = phonecopy.replace("&eu&","&eu&#&") return phonecopy # replacement for function in lines 279 - 317. //check the word in Dict. # REMOVED EXIT(1) ON ENGLISH. WAS USELESS def CheckDictionary(g : GLOBALS, input : str) -> int: fileName = GetFile(g, g.langId, 1) if (g.flags.DEBUG): print(f'dict : {fileName}') try: with open(fileName, 'r') as output: cnts = output.readlines() except: if g.flags.DEBUG: print(f'Dict not found') if(g.langId == g.ENGLISH): exit(1) return 0 if (g.langId == g.ENGLISH): input1 = '' for c in input: if ord(c) < 97: c = c.lower() input1 += c input = input1 for l in cnts: l = l.strip().split('\t') assert(len(l) == 3) if g.flags.DEBUG: print(f"word : {l[0]}") if input == l[0]: if g.flags.DEBUG: print(f"match found") print(f'Syllables : {l[1]}') print(f'monophones : {l[2]}') if g.flags.writeFormat == 1: WriteFile(g, l[1]) if g.flags.writeFormat == 0: WriteFile(g, l[2]) return 1 return 0 # replacement for function in lines 801-821. def PositionCorrection(phone : str, left : str, right :str, isReverse:int) -> str: geminateList = ["k","kh","lx","rx","g","gh","ng","c","ch","j","jh","nj","tx","txh","dx","dxh","nx","t","th","d","dh", "n","p","ph","b","bh","m","y","r","l","w","sh","sx","zh","y","s","h","f","dxq"] phonecopy = phone for i in range(0,39): c1 = left c2 = right c1 = c1.replace('@', geminateList[i]) c2 = c2.replace('@', geminateList[i]) phonecopy = rec_replace(phonecopy, c2, c1) if isReverse != 0 else rec_replace(phonecopy, c1, c2) return phonecopy # replacement for function in lines 711 - 713. def CountChars(s : str, c : str) -> int: count = 0 for x in s: if x == c: count += 1 return count # replacement for function in lines 719 - 744. def GenerateAllCombinations(g : GLOBALS, j : int, s : str, c : list, isRight : int): t = '' if (c[j][0][0] == '#'): if isRight == 1: g.combvars.rightStr[g.combvars.bi] = s + '&' g.combvars.bi += 1 else: g.combvars.leftStr[g.combvars.bi] = s + '&' g.combvars.bi += 1 else: i = 0 while (c[j][i][0] != '#'): t = s + '&' + c[j][i] GenerateAllCombinations(g, j+1, t, c, isRight) i += 1 # replacement for function in lines 746 - 768. def GenerateMatrix(g : GLOBALS, combMatrix : list, regex : str): row, col, item = 0, 0, 0 for i in range(0, len(regex)): if regex[i] == '&': combMatrix[row][col+1] = '#' row += 1 col = 0 item = 0 combMatrix[row][col] = '' elif regex[i] == '|': col += 1 item = 0 combMatrix[row][col] = '' else: combMatrix[row][col] = combMatrix[row][col][:item] + regex[i] + combMatrix[row][col][(item+1):] item += 1 if g.flags.DEBUG: print(f'{row} {col} {combMatrix[row][col]}') combMatrix[row][col+1] = '#' combMatrix[row+1][0] = '#' # replacement for function in lines 770 - 799. def CombinationCorrection(g : GLOBALS, phone : str, left : str, right : str, isReverse : int) -> str: leftComb = [['' for _ in range(256)] for _ in range(256)] rightComb = [['' for _ in range(256)] for _ in range(256)] GenerateMatrix(g, leftComb, left) GenerateMatrix(g, rightComb, right) g.combvars.bi = 0 GenerateAllCombinations(g, 0, '', leftComb, 0) g.combvars.bi = 0 GenerateAllCombinations(g, 0, '', rightComb, 1) i = 0 phonecopy = phone while g.combvars.leftStr[i] != '': if isReverse != 0: phonecopy = phonecopy.replace(g.combvars.rightStr[i], g.combvars.leftStr[i]) else: phonecopy = phonecopy.replace(g.combvars.leftStr[i], g.combvars.rightStr[i]) if g.flags.DEBUG: print(f'{g.combvars.leftStr[i]} {g.combvars.rightStr[i]}') i += 1 g.combvars.refresh() return phonecopy # replacement for function in lines 825 - 930. //Language specific corrections def LangSpecificCorrection(g : GLOBALS, phone : str, langSpecFlag : int) -> str: phonecopy = phone if g.isSouth: phonecopy = rec_replace(phonecopy,"&ei&","&ai&") phonecopy = rec_replace(phonecopy,"&eiv&","&aiv&") else: phonecopy = rec_replace(phonecopy,"&oo&","&o&") phonecopy = rec_replace(phonecopy,"&oov&","&ov&") phonecopy = phonecopy.replace("&q&","&av&q&") phonecopy = rec_replace(phonecopy, "&a&av&","&a&") phonecopy = rec_replace(phonecopy, "&e&av&","&e&") phonecopy = rec_replace(phonecopy, "&i&av&","&i&") phonecopy = rec_replace(phonecopy, "&o&av&","&o&") phonecopy = rec_replace(phonecopy, "&u&av&","&u&") phonecopy = rec_replace(phonecopy,"&a&rqv&","&rq&") phonecopy = rec_replace(phonecopy,"&aa&av&","&aa&") phonecopy = rec_replace(phonecopy,"&ae&av&","&ae&") phonecopy = rec_replace(phonecopy,"&ax&av&","&ax&") phonecopy = rec_replace(phonecopy,"&ee&av&","&ee&") phonecopy = rec_replace(phonecopy,"&ii&av&","&ii&") phonecopy = rec_replace(phonecopy,"&ai&av&","&ai&") phonecopy = rec_replace(phonecopy,"&au&av&","&au&") phonecopy = rec_replace(phonecopy,"&oo&av&","&oo&") phonecopy = rec_replace(phonecopy,"&uu&av&","&uu&") phonecopy = rec_replace(phonecopy,"&rq&av&","&rq&") phonecopy = rec_replace(phonecopy,"&av&av&","&av&") phonecopy = rec_replace(phonecopy,"&ev&av&","&ev&") phonecopy = rec_replace(phonecopy,"&iv&av&","&iv&") phonecopy = rec_replace(phonecopy,"&ov&av&","&ov&") phonecopy = rec_replace(phonecopy,"&uv&av&","&uv&") phonecopy = rec_replace(phonecopy, "&av&rqv&","&rqv&") phonecopy = rec_replace(phonecopy, "&aav&av&","&aav&") phonecopy = rec_replace(phonecopy, "&aev&av&","&aev&") phonecopy = rec_replace(phonecopy, "&auv&av&","&auv&") phonecopy = rec_replace(phonecopy, "&axv&av&","&axv&") phonecopy = rec_replace(phonecopy, "&aiv&av&","&aiv&") phonecopy = rec_replace(phonecopy, "&eev&av&","&eev&") phonecopy = rec_replace(phonecopy, "&eiv&av&","&eiv&") phonecopy = rec_replace(phonecopy, "&iiv&av&","&iiv&") phonecopy = rec_replace(phonecopy, "&oov&av&","&oov&") phonecopy = rec_replace(phonecopy, "&ouv&av&","&ouv&") phonecopy = rec_replace(phonecopy, "&uuv&av&","&uuv&") phonecopy = rec_replace(phonecopy, "&rqv&av&","&rqv&") if langSpecFlag == 0: return phonecopy fileName = GetFile(g, g.langId, 2) with open(fileName, 'r') as output: cnts = output.readlines() left = '' right = '' phonecopy = '^' + phonecopy + '$' if (g.flags.DEBUG): print(f'phone : {phonecopy}') for l in cnts: l = l.strip() if (l.find('#') != -1): continue l = l.split('\t') assert(len(l) == 2) left, right = l[0], l[1] if left.find('|') != -1: a1 = left[1:-1] a2 = right[1:-1] phonecopy = CombinationCorrection(g, phonecopy, a1, a2, 0) if g.flags.DEBUG: print(f'{a1}\t{a2}') elif left.find('@') != -1: phonecopy = PositionCorrection(phonecopy, left, right, 0) else: phonecopy = phonecopy.replace(left, right) # //remove head and tail in phone phonecopy = phonecopy.replace('^', '') phonecopy = phonecopy.replace('$', '') # //end correction count = 0 for i in range(len(phonecopy)): if phonecopy[i] == '&': count = i return phonecopy[:(count+1)] # Replacement for function in lines 934 - 991. //Reverse syllable correction for syllable parsing def SyllableReverseCorrection(g : GLOBALS, phone : str, langSpecFlag : int) -> str: phonecopy = phone if g.isSouth: phonecopy = rec_replace(phonecopy, "&ai&","&ei&") phonecopy = rec_replace(phonecopy, "&aiv&","&eiv&") else: phonecopy = rec_replace(phonecopy, "&o&","&oo&") phonecopy = rec_replace(phonecopy, "&ov&","&oov&") if langSpecFlag == 0: return phonecopy fileName = GetFile(g, g.langId, 2) with open(fileName, 'r') as output: cnts = output.readlines() left = '' right = '' # //update head and tail in phone phonecopy = '^' + phonecopy + '$' if g.flags.DEBUG: print(f'before phone : {phonecopy}') for l in cnts: l = l.strip() if (l.find('#') != -1): continue l = l.split('\t') assert(len(l) == 2) left, right = l[0], l[1] if left.find('|') != -1: a1 = left[1:-1] a2 = right[1:-1] phonecopy = CombinationCorrection(g, phonecopy, a1, a2, 1) if g.flags.DEBUG: print(f'{a1}\t{a2}') elif left.find('@') != -1: phonecopy = PositionCorrection(phonecopy, left, right, 1) else: phonecopy = phonecopy.replace(right, left) # //remove head and tail in phone phonecopy = phonecopy.replace('^', '') phonecopy = phonecopy.replace('$', '') # //end correction if (g.flags.DEBUG): print(f'after phone : {phonecopy}') return phonecopy # //language specific syllable correction def LangSyllableCorrection(input : str) -> int: if input == "&av&q&": return 1 else: return 0 # replacement for function in lines 1000 - 1160. //split into syllable array def SplitSyllables(g : GLOBALS, input : str) -> int: incopy = input if g.flags.writeFormat == 2: i = 0 j = 0 fullList = ["k","kh","lx","rx","g","gh","ng","c","ch","j","jh","nj","tx","txh","dx","dxh","nx","t","th","d","dh","n","p","ph","b","bh","m","y","r","l","w","sh","sx","zh","y","s","h","f","dxq"] for i in range(0,39): for j in range(0,39): c1 = f'&{fullList[i]}&{fullList[j]}&' c2 = f'&{fullList[i]}&euv&#&{fullList[j]}&' incopy = incopy.replace(c1, c2) incopy = rec_replace(incopy, "&#&mq&","&mq&") incopy = rec_replace(incopy, "&#&q&","&q&") pch = incopy.split('#') g.syllableList = [] for c in pch: if c != '&': g.syllableList.append(c) # ln -> len ln = len(g.syllableList) if (ln == 0): return 1 if g.flags.DEBUG: for i in range(ln): print(f"initStack : {g.syllableList[i]}") # //south specific av addition if CheckVowel(g.syllableList[ln-1],1,0) == 0 and CheckChillu(g.syllableList[ln-1]) == 0: if g.isSouth: g.syllableList[ln-1] += '&av&' else: g.syllableList[ln-1] += '&euv&' # //round 2 correction if g.flags.writeFormat == 2: g.syllableCount = ln g.flags.writeFormat = 1 return 1 euFlag = 1 if ln > 1: for i in range(ln-1,-1,-1): if LangSyllableCorrection(g.syllableList[i]) == 1: g.syllableList[i-1] += g.syllableList[i] g.syllableList[i] = '' if g.syllableList[i].find("&eu&") != -1: g.syllableList[i] = g.syllableList[i].replace("&eu&", "!") euFlag = 1 if g.syllableList[i].find("&euv&") != -1: g.syllableList[i] = g.syllableList[i].replace("&euv&", "!") euFlag = 2 if CheckVowel(g.syllableList[i],0,1) == 0: if i-1 >= 0: g.syllableList[i-1] += g.syllableList[i] g.syllableList[i] = '' else: g.syllableList[i] += g.syllableList[i+1] g.syllableList[i+1] = '' if i-1 > 0: if euFlag == 1: g.syllableList[i-1] = g.syllableList[i-1].replace("!","&eu&") elif euFlag == 2: g.syllableList[i-1] = g.syllableList[i-1].replace("!","&euv&") g.syllableList[i-1] = rec_replace(g.syllableList[i-1], "&&","&") if euFlag == 1: g.syllableList[i] = g.syllableList[i].replace("!","&eu&") elif euFlag == 2: g.syllableList[i] = g.syllableList[i].replace("!","&euv&") else: if (CheckVowel(g.syllableList[0],1,0) == 0 and g.flags.writeFormat != 3) or Checkeuv(g.syllableList[0]) != 0: g.syllableList[0] += '&av' if g.flags.DEBUG: for i in range(ln): print(f'syllablifiedStack : {g.syllableList[i]}') # //round 3 double syllable correction for i in range(ln): # //corrections g.syllableList[i] = g.syllableList[i].replace('1','') if g.flags.DEBUG: print(f'LenStack : {len(g.syllableList[i])}') if len(g.syllableList[i]) > 0: if g.syllableList[i].find("&eu&") != -1: g.syllableList[i] = g.syllableList[i].replace("&eu&", "!") euFlag = 1 if g.syllableList[i].find("&euv&") != -1: g.syllableList[i] = g.syllableList[i].replace("&euv&", "!") euFlag = 2 if CheckVowel(g.syllableList[i],0,1) == 0 and g.flags.writeFormat != 3: if g.flags.DEBUG: print(f'Stack : {g.syllableList[i]}') g.syllableList[i] += '&av' if g.syllableList[i].find('!') != -1: if euFlag == 1: g.syllableList[i] = g.syllableList[i].replace("!","&eu&") elif euFlag == 2: g.syllableList[i] = g.syllableList[i].replace("!","&euv&") g.syllableList[i] = g.syllableList[i].replace('!', 'eu') g.syllableList[i] = rec_replace(g.syllableList[i], '&&', '&') g.syllableList[i] = GeminateCorrection(g.syllableList[i],1) if g.flags.DEBUG: for i in range(ln): print(f'syllablifiedStack1 : {g.syllableList[i]}') print(f'No of syllables : {ln}') g.syllableCount = ln if g.flags.writeFormat == 3: g.flags.writeFormat = 0 return 1 # replacement for function in lines 1164 - 1275. //make to write format def WritetoFiles(g : GLOBALS) -> int: if g.flags.DEBUG: for i in range(0,g.syllableCount): print(f'syllablifiedStackfinal : {g.syllableList[i]}') validSyllable = 0 for i in range(0,g.syllableCount): if g.syllableList[i] != '': validSyllable += 1 if g.flags.DEBUG: print(f'a correction {g.syllableList[0]}') g.words.outputText = '' # //phone j = 0 if g.flags.writeFormat == 0: syllablesPrint = 0 for i in range(g.syllableCount): g.words.outputText += '(( ' l = g.syllableList[i].split('&') for pch in l: if pch == '': continue if g.flags.DEBUG: print(f'syl {pch}') j = 1 g.words.outputText += f'"{pch}" ' if j != 0: if g.flags.syllTagFlag != 0: if syllablesPrint == 0: g.words.outputText += '_beg' elif syllablesPrint == validSyllable - 1: g.words.outputText += '_end' else: g.words.outputText += '_mid' syllablesPrint += 1 g.words.outputText += ') 0) ' else: g.words.outputText = g.words.outputText[:(len(g.words.outputText) - 3)] j = 0 g.words.outputText = g.words.outputText.replace('v', '') g.words.outputText = g.words.outputText.replace(" \"eu\"","") g.words.outputText = g.words.outputText.replace('!', '') # //syllable elif g.flags.writeFormat == 1: syllablesPrint = 0 for i in range(g.syllableCount): g.syllableList[i] = rec_replace(g.syllableList[i], 'euv', 'eu') g.syllableList[i] = SyllableReverseCorrection(g, g.syllableList[i], g.flags.LangSpecificCorrectionFlag) if g.flags.DEBUG: print(f'{g.syllableList[i]}') g.words.outputText += '(( "' l = g.syllableList[i].split('&') for pch in l: if pch == '': continue if g.flags.DEBUG: print(f'syl {pch}') j = 1 if CheckSymbol(g, pch) != 0: g.words.outputText += GetUTF(g, pch) if pch == 'av' and g.flags.DEBUG: print('av found') if j != 0: if g.flags.syllTagFlag != 0: if syllablesPrint == 0: g.words.outputText += '_beg' elif syllablesPrint == validSyllable - 1: g.words.outputText += '_end' else: g.words.outputText += '_mid' syllablesPrint += 1 g.words.outputText += '" ) 0) ' else: g.words.outputText = g.words.outputText[:(len(g.words.outputText) - 4)] j = 0 g.words.outputText = g.words.outputText.replace('#', '') g.words.outputText = g.words.outputText.replace(' ', ' ') if g.flags.DEBUG: print(f'Print text : {g.words.outputText}') WriteFile(g, g.words.outputText) return 1 def load_mapping_file(g: GLOBALS): # open common file try: # print('1.entered') with open("/speech/utkarsh/tts_api/Unified_parser/common_hindi.map", 'r') as infile: lines = infile.readlines() # print(lines) except: print("Couldn't open common file for reading") return 0 table=[] for i in range(len(lines)): l = lines[i].strip().split('\t') table.append(l) # g.symbolTable[i][1] = l[1] # g.symbolTable[i][0] = l[1 + g.langId] return table def set_lang_id(language): if language == "malayalam": lang_id=1 elif language == "tamil": lang_id=2 elif language == "telugu": lang_id=3 elif language == "kannada": lang_id=4 elif language == "hindi": lang_id=5 elif language == "bengali": lang_id=6 elif language == "gujrathi": lang_id=7 elif language == "odiya": lang_id=8 elif language == "punjabi": lang_id=9 return lang_id def convert_to_main_lang(g : GLOBALS,input_str,final_lang:str): s= input_str final_lang = "telugu" # print("input_str:",input_str) final_lang_id=set_lang_id(final_lang) c=1 # print(s,final_lang_id) temp_string='' new_string='&' table=load_mapping_file(g) # print(final_lang_id) # print(table) for i in range(1,len(s)): if s[i]=="&": c=1 continue if c==1: temp_string+=s[i] if s[i+1]=="&": c=0 # print("new_string_1:",new_string) # print("old_string_1:",temp_string) if temp_string=="#": new_string+=temp_string+"&" temp_string='' continue if temp_string =='av': new_string+=temp_string+"&" temp_string='' # print("new_string_1-av/aiv:",new_string) continue if temp_string =='eu' or temp_string =='euv'or temp_string =='aiv': new_string+=temp_string+"&" # print("new_string_1-eu:",new_string) # print("old_string_1-euv:",s) temp_string='' continue # print("new_string_before_table:",new_string) # print("old_string_before_table:",s) for j in range(len(table)): if table[j][1]==temp_string: # print("2:",table[j][1],temp_string) # print("3:",table[j][final_lang_id+1],ord(table[j][final_lang_id+1][0])) if ord(table[j][final_lang_id+1][0]) < 122: new_string=new_string+table[j][final_lang_id+1]+"&" temp_string='' # print("new string_2:",new_string) break else: new_string+=temp_string+"&" # print("new string_3:",new_string) temp_string='' break return new_string