File size: 2,806 Bytes
76690c5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import re
def cat(file):
    """Return the entire text content of ``file``.

    The file is opened in text mode (``'r'``) with the platform default
    encoding and is closed before the function returns.

    Args:
        file: Path of the file to read.

    Returns:
        The file content as a single string.
    """
    with open(file, 'r') as f:
        return f.read()
def ortho_to_phonetic(input_file, phone_list_file, output_file):
    """Split the words of ``input_file`` into phone labels, one per line.

    Every whitespace-separated word is scanned left to right.  At each
    position the slices of 6, 5, 4, 3 and 2 characters are tried in that
    order; the first one that occurs in the phone list text between regex
    word boundaries is written to ``output_file`` and the scan moves past
    it.  When no slice matches, the single character at the position is
    written instead, except that ``,`` and ``.`` are dropped.

    The word ``SIL`` is written as ``SIL`` and ends processing; words after
    it are not read.

    Args:
        input_file: Path of the text file holding the words to convert.
        phone_list_file: Path of the text file searched for known phones.
        output_file: Path of the file to write; it is truncated first.
    """
    with open(input_file, 'r') as f:
        words = f.read().split()

    # The phone list does not change while we run, so read it exactly once
    # instead of re-reading the file for every candidate slice.
    with open(phone_list_file, 'r') as f:
        phone_text = f.read()

    # Candidate slice -> whether it was found; avoids repeating a regex
    # search for slices that recur across words.
    match_cache = {}

    def is_phone(candidate):
        """Return True if ``candidate`` occurs in the phone list as a whole word."""
        if candidate not in match_cache:
            pattern = rf'\b{re.escape(candidate)}\b'
            match_cache[candidate] = re.search(pattern, phone_text) is not None
        return match_cache[candidate]

    # Longest candidate first, so a longer phone wins over its own prefix.
    slice_widths = (6, 5, 4, 3, 2)

    with open(output_file, 'w') as phn_handle:
        for word in words:
            if word == "SIL":
                phn_handle.write("SIL\n")
                # NOTE(review): the source's indentation was lost; the break
                # is kept in this branch because it is the only placement
                # that leaves the word counter increment reachable - confirm
                # that stopping at the first SIL is intended.
                break

            pos = 0
            while pos < len(word):
                for width in slice_widths:
                    piece = word[pos:pos + width]
                    if is_phone(piece):
                        phn_handle.write(piece + "\n")
                        pos += width
                        break
                else:
                    # No multi-character phone starts here: emit the single
                    # character, silently skipping comma and period.
                    char = word[pos]
                    if char not in (",", "."):
                        phn_handle.write(char + "\n")
                    pos += 1
if __name__ == "__main__":
    import sys

    # Exactly three arguments are expected after the script name.
    if len(sys.argv) != 4:
        print("Usage: python script.py input_file phone_list output_file",
              file=sys.stderr)
        # Exit non-zero so shells and calling scripts can detect the misuse.
        sys.exit(1)

    input_file, phone_list_file, output_file = sys.argv[1:4]
    ortho_to_phonetic(input_file, phone_list_file, output_file)
|