|
import sys, os |
|
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) |
|
sys.path.append(SCRIPT_DIR) |
|
|
|
|
|
|
|
from ply.lex import Lexer |
|
from ply.yacc import yacc |
|
from globals import * |
|
from helpers import * |
|
import sys |
|
from sys import exit |
|
|
|
|
|
# Terminal symbol names used by the grammar rules below.  PLY's yacc()
# reads this module-level ``tokens`` name when it builds the parser, so the
# name must stay exactly ``tokens``.
tokens = (
    'kaki_c',
    'conjsyll2_c',
    'fullvowel_b',
    'kaki_a',
    'kaki_b',
    'conjsyll2_b',
    'conjsyll2_a',
    'conjsyll1',
    'nukchan_b',
    'nukchan_a',
    'yarule',
    'fullvowel_a',
    'vowel',
)
|
|
|
|
|
|
|
def p_sentence(p):
    '''
    sentence : words
    '''
    # NOTE: the docstring above is the PLY grammar rule, not documentation.
    # The parser state object is attached to the parser as ``parser.g``.
    state = p.parser.g

    # Any parse after the first one (parseLevel != 0) only records the
    # phonified form of the word.
    if state.flags.parseLevel != 0:
        state.words.phonifiedWord = p[1]
        return

    # First parse: store the syllabified text, collapsing doubled '&'
    # markers through rec_replace, then advance the parse level.
    state.words.syllabifiedWordOut = p[1]
    if '&&' in state.words.syllabifiedWordOut:
        state.words.syllabifiedWordOut = rec_replace(
            state.words.syllabifiedWordOut, '&&', '&')
    state.flags.parseLevel += 1
|
|
|
def p_words_syltoken(p):
    '''
    words : syltoken
    '''
    # NOTE: the docstring above is the PLY grammar rule, not documentation.
    # A single syllable token starts the accumulated `words` string.
    syllable = p[1]
    if p.parser.g.flags.DEBUG:
        print(f"Syll:\t{syllable}")
    p[0] = syllable
|
|
|
def p_words_wordsandsyltoken(p):
    '''
    words : words syltoken
    '''
    # NOTE: the docstring above is the PLY grammar rule, not documentation.
    # Append the newly recognised syllable to the text accumulated so far.
    so_far, syllable = p[1], p[2]
    if p.parser.g.flags.DEBUG:
        print(f"Syll:\t{syllable}")
    p[0] = so_far + syllable
|
|
|
def p_syltoken(p):
    '''
    syltoken : fullvowel_b
             | fullvowel_a
             | conjsyll2_c
             | conjsyll2_b
             | conjsyll2_a
             | conjsyll1
             | nukchan_b
             | nukchan_a
             | yarule
             | vowel
    '''
    # NOTE: the docstring above is the PLY grammar rule, not documentation.
    # Every alternative is a single terminal; its value is passed through
    # unchanged as the value of `syltoken`.
    token_value = p[1]
    p[0] = token_value
|
|
|
def p_syltoken1(p):
    '''
    syltoken :
             | kaki_c
             | kaki_a
             | kaki_b
    '''
    # NOTE: the docstring above is the PLY grammar rule, not documentation.
    # Its first alternative is empty, so when that alternative is reduced
    # `p` holds no right-hand-side symbol and p[1] would raise IndexError.
    # Yield '' in that case so the `words` rules can still concatenate it.
    value = p[1] if len(p) > 1 else ''
    if p.parser.g.flags.DEBUG:
        print(f'kaki : {value}')
    p[0] = value
|
|
|
def p_error(p):
    # PLY syntax-error hook: report the failure on stdout and terminate the
    # process with exit status 1.  The offending token `p` is not inspected.
    print('parse error')
    raise SystemExit(1)
|
|
|
|
|
def printHelp():
    """Print the command-line usage instructions to standard output."""
    usage_lines = (
        "UnifiedParser - Usage Instructions",
        "Run python3 parser.py wd lsflag wfflag clearflag",
        "wd - word to parse in unicode.",
        "lsflag - always 0. we are not using this.",
        "wfflag - 0 for Monophone parsing, 1 for syllable parsing, 2 for Akshara Parsing",
        "clearflag - 1 for removing the lisp like format of output and to just produce space separated output. Otherwise, 0.",
    )
    for usage_line in usage_lines:
        print(usage_line)
|
|
|
|
|
def _schwa_split_index(syllabified, split_position):
    """Return the index at which wordparse splits *syllabified*.

    Scans for '&' characters and returns the index of the first one that
    brings the running count above *split_position*.  If the string holds
    too few '&' characters for that to happen, the final count itself is
    returned; this mirrors the behaviour of the inline loop this helper
    replaces.
    """
    seen = 0
    for index, char in enumerate(syllabified):
        if char == '&':
            seen += 1
            if seen > split_position:
                return index
    return seen


def wordparse(wd : str, lsflag : int, wfflag : int, clearflag : int):
    """Run the full parsing pipeline on one word and return the answer.

    Args:
        wd: Word to parse; surrounding spaces are stripped first.
        lsflag: Must be 0 or 1.  Stored as ``LangSpecificCorrectionFlag``
            and forwarded to ``LangSpecificCorrection``.
        wfflag: Must be 0, 1 or 2.  Stored as ``writeFormat``.
        clearflag: When 1, the answer is rebuilt as the space-separated
            contents of the double-quoted fields of ``g.words.outputText``.

    Returns:
        ``g.answer`` after the pipeline has run, or ``0`` when
        ``SetlanguageFeat`` returns 0 or ``CheckDictionary`` returns a
        non-zero value.  When ``clearflag`` is not 1, ``g.answer`` is
        whatever the earlier steps left there (presumably set by
        ``WritetoFiles`` -- TODO confirm).

    Exits the process with status 1 (after printing "Invalid input") when
    ``lsflag`` or ``wfflag`` is out of range.
    """
    g = GLOBALS()
    lexer = Lexer()
    parser = yacc()
    parser.g = g  # grammar actions reach the shared state via p.parser.g
    g.flags.DEBUG = False
    wd = wd.strip(' ')

    if lsflag not in [0, 1] or wfflag not in [0, 1, 2]:
        print("Invalid input")
        exit(1)

    g.flags.LangSpecificCorrectionFlag = lsflag

    g.flags.writeFormat = wfflag
    # NOTE(review): unreachable as written -- the validation above already
    # rejects wfflag == 4.  Kept so the intent is not lost; confirm whether
    # 4 should be an accepted value.
    if wfflag == 4:
        g.flags.writeFormat = 1
        g.flags.syllTagFlag = 1

    word = wd
    if g.flags.DEBUG:
        print(f'Word : {word}')

    word = RemoveUnwanted(word)
    if g.flags.DEBUG:
        print(f'Cleared Word : {word}')

    if SetlanguageFeat(g, word) == 0:
        return 0

    if CheckDictionary(g, word) != 0:
        return 0

    if g.flags.DEBUG:
        print(f'langId : {g.langId}')

    word = ConvertToSymbols(g, word)

    if g.flags.DEBUG:
        print(f"Symbols code : {g.words.unicodeWord}")
        print(f"Symbols syllables : {g.words.syllabifiedWord}")

    # The grammar action for `sentence` stores its result in
    # g.words.syllabifiedWordOut.
    parser.parse(g.words.syllabifiedWord, lexer=lexer)
    if g.flags.DEBUG:
        print(f"Syllabified Word : {g.words.syllabifiedWordOut}")
    g.words.syllabifiedWordOut = rec_replace(g.words.syllabifiedWordOut, '&#&', '&') + '&'
    if g.flags.DEBUG:
        print(f"Syllabified Word out : {g.words.syllabifiedWordOut}")
    g.words.syllabifiedWordOut = LangSpecificCorrection(
        g, g.words.syllabifiedWordOut, g.flags.LangSpecificCorrectionFlag)
    if g.flags.DEBUG:
        print(f"Syllabified Word langCorr : {g.words.syllabifiedWordOut}")
        print(f"Syllabified Word gemCorr : {g.words.syllabifiedWordOut}")
    g.words.syllabifiedWordOut = CleanseWord(g.words.syllabifiedWordOut)
    if g.flags.DEBUG:
        print(f"Syllabified Word memCorr : {g.words.syllabifiedWordOut}")

    if not g.isSouth:
        if g.flags.DEBUG:
            print('NOT SOUTH')

        # Choose after how many '&' separators the word is split: only the
        # part from the split point onward gets the schwa correction.
        separator_count = g.words.syllabifiedWordOut.count('&')
        splitPosition = 2
        if GetPhoneType(g, g.words.syllabifiedWordOut, 1) == 1:
            if separator_count > 2:
                tpe = GetPhoneType(g, g.words.syllabifiedWordOut, 2)
                if tpe == 2:
                    splitPosition = 1
                elif tpe == 3:
                    splitPosition = 3
            else:
                splitPosition = 1

        split_index = _schwa_split_index(g.words.syllabifiedWordOut, splitPosition)
        start = g.words.syllabifiedWordOut[:split_index]
        end = g.words.syllabifiedWordOut[split_index:]
        if g.flags.DEBUG:
            print(f"posi {split_index} {start} {end}")
        end = SchwaSpecificCorrection(g, end)
        if g.flags.DEBUG:
            print(f"prefinal : {g.words.syllabifiedWordOut}")
        g.words.syllabifiedWordOut = start + end
        if g.flags.DEBUG:
            print(f"prefinal1 : {g.words.syllabifiedWordOut}")
        g.words.syllabifiedWordOut = CleanseWord(g.words.syllabifiedWordOut)
        if g.flags.DEBUG:
            print(f"final : {g.words.syllabifiedWordOut}")
        g.words.syllabifiedWordOut = SchwaDoubleConsonent(g.words.syllabifiedWordOut)
        if g.flags.DEBUG:
            print(f"final0 : {g.words.syllabifiedWordOut}")

    g.words.syllabifiedWordOut = GeminateCorrection(g.words.syllabifiedWordOut, 0)
    g.words.syllabifiedWordOut = MiddleVowel(g, g.words.syllabifiedWordOut)
    g.words.syllabifiedWordOut = Syllabilfy(g.words.syllabifiedWordOut)

    SplitSyllables(g, g.words.syllabifiedWordOut)

    WritetoFiles(g)
    if clearflag == 1:
        # Splitting on '"' leaves the quoted fields at the odd indices.
        quoted_fields = g.words.outputText.split('"')[1::2]
        # str.strip() returns a new string; the original discarded that
        # result, so the trailing separator space was never removed.
        g.answer = ' '.join(quoted_fields).strip()
    return g.answer
|
|
|
if __name__ == '__main__':
    # Command-line entry point: expects exactly four arguments after the
    # script name -- the word followed by three integer flags.
    if len(sys.argv) != 5:
        printHelp()
        exit(-1)

    # The three flags are converted with int() in order (lsflag, wfflag,
    # clearflag) before being handed to wordparse.
    ans = wordparse(sys.argv[1], *(int(flag) for flag in sys.argv[2:5]))
    print(ans)
|
|