File size: 2,806 Bytes
76690c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import re

def cat(file):
    """Return the entire contents of the text file at *file* as one string."""
    with open(file, mode='r') as stream:
        contents = stream.read()
    return contents

def ortho_to_phonetic(input_file, phone_list_file, output_file):
    """Split the words of *input_file* into units and write one unit per line.

    The input file is split on whitespace into words. Each word is scanned
    left to right; at every position the longest slice (6 characters down
    to 2) that occurs as a whole word (regex ``\\b`` boundaries) anywhere in
    the text of *phone_list_file* is written to *output_file* and skipped
    over. When no slice matches, the single character at that position is
    written instead, unless it is ``","`` or ``"."``, which is dropped.

    A word equal to ``"SIL"`` is written as ``SIL`` and ends processing:
    words after it are not converted.

    Args:
        input_file: Path of the text file holding the words to convert.
        phone_list_file: Path of the text file listing the known units.
        output_file: Path of the file to write; it is overwritten.

    Raises:
        OSError: If any of the files cannot be opened.
    """
    with open(input_file, 'r') as f:
        words = f.read().split()

    # Read the phone list once up front instead of re-reading the file for
    # every candidate slice at every position.
    with open(phone_list_file, 'r') as f:
        phone_text = f.read()

    # candidate slice -> whether it occurs as a whole word in the phone list.
    lookup_cache = {}

    def is_listed(candidate):
        """Return True if *candidate* appears as a whole word in the list."""
        found = lookup_cache.get(candidate)
        if found is None:
            pattern = rf'\b{re.escape(candidate)}\b'
            found = re.search(pattern, phone_text) is not None
            lookup_cache[candidate] = found
        return found

    with open(output_file, 'w') as phn_handle:
        for word in words:
            if word == "SIL":
                phn_handle.write("SIL\n")
                # NOTE(review): conversion stops at the first "SIL" word, so
                # any following words are ignored. Kept as in the original;
                # confirm this is intended.
                break

            position = 0
            while position < len(word):
                # Longest match wins: try 6-character slices first, then
                # progressively shorter ones down to 2 characters.
                for width in (6, 5, 4, 3, 2):
                    candidate = word[position:position + width]
                    if is_listed(candidate):
                        phn_handle.write(candidate + "\n")
                        position += width
                        break
                else:
                    # No multi-character slice matched: emit the single
                    # character, silently dropping commas and periods.
                    char = word[position]
                    if char not in (",", "."):
                        phn_handle.write(char + "\n")
                    position += 1

if __name__ == "__main__":
    import sys

    # Exactly three arguments are expected after the script name.
    if len(sys.argv) != 4:
        # Passing a string to sys.exit prints it to stderr and exits with
        # status 1, so calling scripts can detect the misuse instead of
        # seeing a successful (0) exit.
        sys.exit("Usage: python script.py input_file phone_list output_file")

    input_file, phone_list_file, output_file = sys.argv[1:4]
    ortho_to_phonetic(input_file, phone_list_file, output_file)