utkarsh2299's picture
Upload 97 files
2c8dc05 verified
import sys, os
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(SCRIPT_DIR)
# combined lexical analyzer and parser
from ply.lex import Lexer
from ply.yacc import yacc
from globals import *
from helpers import *
import sys
from sys import exit
# tokens used
tokens = ('kaki_c', 'conjsyll2_c', 'fullvowel_b', 'kaki_a', 'kaki_b', 'conjsyll2_b', 'conjsyll2_a',
'conjsyll1', 'nukchan_b','nukchan_a', 'yarule', 'fullvowel_a', 'vowel')
# parser part
def p_sentence(p):
'''
sentence : words
'''
if p.parser.g.flags.parseLevel == 0:
p.parser.g.words.syllabifiedWordOut = p[1]
if p.parser.g.words.syllabifiedWordOut.find('&&') != -1:
p.parser.g.words.syllabifiedWordOut = rec_replace(p.parser.g.words.syllabifiedWordOut,'&&','&')
p.parser.g.flags.parseLevel += 1
else:
p.parser.g.words.phonifiedWord = p[1]
def p_words_syltoken(p):
'''
words : syltoken
'''
if(p.parser.g.flags.DEBUG):
print(f"Syll:\t{p[1]}")
p[0] = p[1]
def p_words_wordsandsyltoken(p):
'''
words : words syltoken
'''
if(p.parser.g.flags.DEBUG):
print(f"Syll:\t{p[2]}")
p[0] = p[1] + p[2]
def p_syltoken(p):
'''
syltoken : fullvowel_b
| fullvowel_a
| conjsyll2_c
| conjsyll2_b
| conjsyll2_a
| conjsyll1
| nukchan_b
| nukchan_a
| yarule
| vowel
'''
p[0] = p[1]
def p_syltoken1(p):
'''
syltoken :
| kaki_c
| kaki_a
| kaki_b
'''
if (p.parser.g.flags.DEBUG):
print(f'kaki : {p[1]}')
p[0] = p[1]
def p_error(p):
print('parse error')
exit(1)
# print the help of syntax
def printHelp():
print("UnifiedParser - Usage Instructions")
print("Run python3 parser.py wd lsflag wfflag clearflag")
print("wd - word to parse in unicode.")
print("lsflag - always 0. we are not using this.")
print("wfflag - 0 for Monophone parsing, 1 for syllable parsing, 2 for Akshara Parsing")
print("clearflag - 1 for removing the lisp like format of output and to just produce space separated output. Otherwise, 0.")
def wordparse(wd : str, lsflag : int, wfflag : int, clearflag : int):
g = GLOBALS()
lexer = Lexer()
parser = yacc()
parser.g = g
g.flags.DEBUG = False
wd = wd.strip('  ') # hidden characters
if lsflag not in [0,1] or wfflag not in [0,1,2]:
print("Invalid input")
exit(1)
g.flags.LangSpecificCorrectionFlag = lsflag
g.flags.writeFormat = wfflag
if wfflag == 4:
g.flags.writeFormat = 1
g.flags.syllTagFlag = 1
word = wd
if g.flags.DEBUG:
print(f'Word : {word}')
word = RemoveUnwanted(word)
if g.flags.DEBUG:
print(f'Cleared Word : {word}')
if SetlanguageFeat(g, word) == 0:
return 0
if CheckDictionary(g, word) != 0:
return 0
if g.flags.DEBUG:
print(f'langId : {g.langId}')
word = ConvertToSymbols(g, word)
if g.flags.DEBUG:
print(f"Symbols code : {g.words.unicodeWord}")
print(f"Symbols syllables : {g.words.syllabifiedWord}")
parser.parse(g.words.syllabifiedWord, lexer=lexer)
if(g.flags.DEBUG):
print(f"Syllabified Word : {g.words.syllabifiedWordOut}")
g.words.syllabifiedWordOut = rec_replace(g.words.syllabifiedWordOut, '&#&','&') + '&'
if(g.flags.DEBUG):
print(f"Syllabified Word out : {g.words.syllabifiedWordOut}")
g.words.syllabifiedWordOut = LangSpecificCorrection(g, g.words.syllabifiedWordOut, g.flags.LangSpecificCorrectionFlag)
if(g.flags.DEBUG):
print(f"Syllabified Word langCorr : {g.words.syllabifiedWordOut}")
if(g.flags.DEBUG):
print(f"Syllabified Word gemCorr : {g.words.syllabifiedWordOut}")
g.words.syllabifiedWordOut = CleanseWord(g.words.syllabifiedWordOut)
if(g.flags.DEBUG):
print(f"Syllabified Word memCorr : {g.words.syllabifiedWordOut}")
if not g.isSouth:
if g.flags.DEBUG:
print('NOT SOUTH')
count = 0
for i in range(len(g.words.syllabifiedWordOut)):
if g.words.syllabifiedWordOut[i] == '&':
count += 1
splitPosition = 2
if GetPhoneType(g, g.words.syllabifiedWordOut, 1) == 1:
if count > 2:
tpe = GetPhoneType(g, g.words.syllabifiedWordOut, 2)
if tpe == 2:
splitPosition = 1
elif tpe == 3:
splitPosition = 3
else:
splitPosition = 1
count = 0
for i in range(len(g.words.syllabifiedWordOut)):
if g.words.syllabifiedWordOut[i] == '&':
count += 1
if count > splitPosition:
count = i
break
start, end = g.words.syllabifiedWordOut, g.words.syllabifiedWordOut
end = end[count:]
start = start[:count]
if(g.flags.DEBUG):
print(f"posi {count} {start} {end}")
end = SchwaSpecificCorrection(g, end)
if(g.flags.DEBUG):
print(f"prefinal : {g.words.syllabifiedWordOut}")
g.words.syllabifiedWordOut = start + end
if(g.flags.DEBUG):
print(f"prefinal1 : {g.words.syllabifiedWordOut}")
g.words.syllabifiedWordOut = CleanseWord(g.words.syllabifiedWordOut)
if(g.flags.DEBUG):
print(f"final : {g.words.syllabifiedWordOut}")
g.words.syllabifiedWordOut = SchwaDoubleConsonent(g.words.syllabifiedWordOut)
if(g.flags.DEBUG):
print(f"final0 : {g.words.syllabifiedWordOut}")
g.words.syllabifiedWordOut = GeminateCorrection(g.words.syllabifiedWordOut, 0)
g.words.syllabifiedWordOut = MiddleVowel(g, g.words.syllabifiedWordOut)
g.words.syllabifiedWordOut = Syllabilfy(g.words.syllabifiedWordOut)
SplitSyllables(g,g.words.syllabifiedWordOut)
WritetoFiles(g)
if clearflag == 1:
t = g.words.outputText
t = t.split('"')
ln = len(t)
i = 1
g.answer = ''
while i < ln:
g.answer += t[i] + ' '
i += 2
g.answer.strip()
return g.answer
if __name__ == '__main__':
if (len(sys.argv) != 5):
printHelp()
exit(-1)
ans = wordparse(sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]))
print(ans)