Update app.py
Browse files
app.py
CHANGED
@@ -9,17 +9,31 @@ import editdistance
|
|
9 |
from jiwer import wer
|
10 |
import json
|
11 |
|
12 |
-
# Load
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
# Arabic diacritic marks U+064B-U+0652 plus the IPA length mark U+02D0,
# compiled once so every call reuses the same pattern object.
_PHONEME_MARKS_RE = re.compile(r'[\u064B-\u0652\u02D0]')


def clean_phonemes(ipa):
    """Remove diacritics and length markers from phonemes.

    Args:
        ipa: Phoneme string to clean.

    Returns:
        ``ipa`` with every character in the range U+064B-U+0652 and every
        U+02D0 deleted; all other characters are kept in their original
        order.
    """
    return _PHONEME_MARKS_RE.sub('', ipa)
|
21 |
|
22 |
def analyze_phonemes(language, reference_text, audio_file):
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
# Convert reference text to phonemes
|
24 |
ref_phonemes = []
|
25 |
for word in reference_text.split():
|
@@ -46,6 +60,7 @@ def analyze_phonemes(language, reference_text, audio_file):
|
|
46 |
|
47 |
# Prepare results in JSON format
|
48 |
results = {
|
|
|
49 |
"reference_text": reference_text,
|
50 |
"transcription": transcription,
|
51 |
"word_alignment": [],
|
@@ -110,17 +125,43 @@ def analyze_phonemes(language, reference_text, audio_file):
|
|
110 |
|
111 |
return json.dumps(results, indent=2, ensure_ascii=False)
|
112 |
|
113 |
-
# Create Gradio interface
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
|
126 |
demo.launch()
|
|
|
9 |
from jiwer import wer
|
10 |
import json
|
11 |
|
12 |
+
# Load both models at startup.
# Each language maps to (wav2vec2 checkpoint id, Epitran language code); the
# checkpoint id is written once so the processor and the model cannot drift
# apart.
_MODEL_SPECS = {
    "Arabic": ("jonatasgrosman/wav2vec2-large-xlsr-53-arabic", "ara-Arab"),
    "English": ("facebook/wav2vec2-large-960h-lv60-self", "eng-Latn"),
}

# language name -> {"processor", "model", "epitran"}; built eagerly at import
# time, in the same order as the spec table above.
MODELS = {
    language_name: {
        "processor": Wav2Vec2Processor.from_pretrained(checkpoint),
        "model": Wav2Vec2ForCTC.from_pretrained(checkpoint),
        "epitran": epitran.Epitran(epitran_code),
    }
    for language_name, (checkpoint, epitran_code) in _MODEL_SPECS.items()
}
|
25 |
|
26 |
# Arabic diacritic marks U+064B-U+0652 plus the IPA length mark U+02D0,
# compiled once so every call reuses the same pattern object.
_PHONEME_MARKS_RE = re.compile(r'[\u064B-\u0652\u02D0]')


def clean_phonemes(ipa):
    """Remove diacritics and length markers from phonemes.

    Args:
        ipa: Phoneme string to clean.

    Returns:
        ``ipa`` with every character in the range U+064B-U+0652 and every
        U+02D0 deleted; all other characters are kept in their original
        order.
    """
    return _PHONEME_MARKS_RE.sub('', ipa)
|
29 |
|
30 |
def analyze_phonemes(language, reference_text, audio_file):
|
31 |
+
# Get the appropriate model components
|
32 |
+
lang_models = MODELS[language]
|
33 |
+
processor = lang_models["processor"]
|
34 |
+
model = lang_models["model"]
|
35 |
+
epi = lang_models["epitran"]
|
36 |
+
|
37 |
# Convert reference text to phonemes
|
38 |
ref_phonemes = []
|
39 |
for word in reference_text.split():
|
|
|
60 |
|
61 |
# Prepare results in JSON format
|
62 |
results = {
|
63 |
+
"language": language,
|
64 |
"reference_text": reference_text,
|
65 |
"transcription": transcription,
|
66 |
"word_alignment": [],
|
|
|
125 |
|
126 |
return json.dumps(results, indent=2, ensure_ascii=False)
|
127 |
|
128 |
+
# Create Gradio interface with language-specific default text
|
129 |
+
def get_default_text(language):
    """Return the default reference sentence for ``language``.

    Args:
        language: Language name; "Arabic" and "English" have defaults.

    Returns:
        The default reference text for that language, or an empty string
        when the language has no entry.
    """
    # NOTE(review): the Arabic sentence was restored from a mis-decoded copy
    # (UTF-8 bytes rendered through a Thai code page, with a stray line break
    # inside the literal). The letters and mark positions match the surviving
    # byte pattern; confirm the exact diacritics against the upstream file.
    return {
        "Arabic": "فَبِأَيِّ آلَاءِ رَبِّكُمَا تُكَذِّبَانِ",
        "English": "The quick brown fox jumps over the lazy dog",
    }.get(language, "")
|
134 |
+
|
135 |
+
# NOTE(review): indentation was lost in this copy of the file; the nesting
# below (dropdown + textbox inside the Row, everything else directly under
# Blocks, launch at module level) is reconstructed from the blank-line
# grouping — confirm against the upstream file.
with gr.Blocks() as demo:
    gr.Markdown("# Multilingual Phoneme Alignment Analysis")
    gr.Markdown("Compare audio pronunciation with reference text at phoneme level")

    with gr.Row():
        language = gr.Dropdown(
            ["Arabic", "English"],
            label="Language",
            value="Arabic"
        )
        reference_text = gr.Textbox(
            label="Reference Text",
            value=get_default_text("Arabic")
        )

    # NOTE(review): type="file" is the older Gradio spelling; newer releases
    # accept "filepath"/"binary" instead — confirm against the pinned Gradio
    # version before changing, since analyze_phonemes consumes this value.
    audio_input = gr.File(label="Upload Audio File", type="file")
    submit_btn = gr.Button("Analyze")
    output = gr.JSON(label="Phoneme Alignment Results")

    # Update default text when language changes
    language.change(
        fn=get_default_text,
        inputs=language,
        outputs=reference_text
    )

    # Run the analysis on the selected language, reference text and upload,
    # and show the returned JSON in the results panel.
    submit_btn.click(
        fn=analyze_phonemes,
        inputs=[language, reference_text, audio_input],
        outputs=output
    )

demo.launch()
|