|
import re |
|
|
|
def cat(file):
    """Return the entire contents of the text file at *file* as one string.

    The file is opened in text mode with the platform default encoding and
    is closed before returning. Errors raised by ``open`` (for example
    ``FileNotFoundError``) propagate to the caller.
    """
    with open(file, 'r') as f:
        return f.read()
|
|
|
def ortho_to_phonetic(input_file, phone_list_file, output_file):
    """Split the words of *input_file* into phone labels, one per output line.

    *input_file* is read as whitespace-separated words. Each word is scanned
    left to right; at every position the longest slice of 6 down to 2
    characters that occurs in the phone list text as a whole word (regex
    word boundaries on both sides) is written to *output_file* and skipped
    over. When no slice matches, the single character at that position is
    written, except "," and ".", which are dropped.

    A word equal to "SIL" is written as "SIL" and ends processing: words
    after it are not converted.

    Args:
        input_file: path of the text file holding the words to convert.
        phone_list_file: path of the text file listing the known phones.
        output_file: path of the file to (over)write with one label per line.

    Raises:
        OSError: if any of the three files cannot be opened.
    """
    with open(input_file, 'r') as f:
        words = f.read().split()

    # Read the phone list once up front; every candidate lookup below
    # searches this in-memory text instead of re-reading the file.
    with open(phone_list_file, 'r') as f:
        phone_text = f.read()

    with open(output_file, 'w') as phn_handle:
        for word in words:
            if word == "SIL":
                phn_handle.write("SIL\n")
                # NOTE(review): the flattened original has a `break` right
                # after this write, read here as "stop at the first SIL
                # token" -- confirm that later words should be skipped.
                break

            pos = 0
            while pos < len(word):
                phone = _longest_phone(word, pos, phone_text)
                if phone is not None:
                    phn_handle.write(phone + "\n")
                    # Near the end of the word the slice may be shorter than
                    # the length tried; advancing by its real length lands
                    # exactly on the end of the word.
                    pos += len(phone)
                    continue

                # No multi-character phone here: emit the character itself,
                # silently dropping the punctuation marks "," and ".".
                char = word[pos]
                if char not in (",", "."):
                    phn_handle.write(char + "\n")
                pos += 1


def _longest_phone(word, start, phone_text):
    """Return the longest 6..2-char slice of *word* at *start* found in *phone_text*, or None."""
    # Longest candidate first so that e.g. a 3-character phone wins over a
    # 2-character phone sharing the same prefix.
    for length in range(6, 1, -1):
        candidate = word[start:start + length]
        if re.search(rf'\b{re.escape(candidate)}\b', phone_text):
            return candidate
    return None
|
|
|
if __name__ == "__main__":
    import sys

    # Exactly three positional arguments are expected after the script name.
    if len(sys.argv) != 4:
        # Passing a string to sys.exit prints it to stderr and exits with
        # status 1, so a calling shell or pipeline can detect the misuse
        # (the previous exit status of 0 signalled success).
        sys.exit("Usage: python script.py input_file phone_list output_file")

    input_file, phone_list_file, output_file = sys.argv[1:]

    ortho_to_phonetic(input_file, phone_list_file, output_file)
|
|