from transformers import pipeline
from transformers.utils import logging
import torch
import pandas as pd
import time
import gradio as gr

logging.set_verbosity_error()

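# Speech recognition: Whisper large-v3 is multilingual and transcribes
# audio in whatever language it detects.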
asr = pipeline(task="automatic-speech-recognition",
               model="openai/whisper-large-v3")

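# Translation: NLLB-200 covers 200 languages, addressed by FLORES-200
# codes such as "fra_Latn".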
translator = pipeline(task="translation",
                      model="facebook/nllb-200-3.3B",
                      # model="facebook/nllb-200-distilled-600M",
                      max_length=5120,
                      torch_dtype=torch.bfloat16)

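# Lookup table mapping human-readable language names to FLORES-200 codes;
# it backs both the dropdown choices and the translator's language arguments.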
flores_200_df = pd.read_csv("Flores200_language_codes.csv", encoding='cp1252')
flores_200 = dict(zip(flores_200_df['Language'], flores_200_df['FLORES_200_code']))
flores_200_languages = list(flores_200.keys())

def transcribe_audio(filepath, src_language, tgt_language):
    """Transcribe the audio at `filepath` and translate it into the target language.

    The dropdowns use type='index', so src_language and tgt_language arrive
    as integer row positions into flores_200_df.
    """
    source_language = flores_200_df.loc[int(src_language), 'Language']
    target_language = flores_200_df.loc[int(tgt_language), 'Language']
    print(f"Selected Source Language: {source_language}, Target Language: {target_language}")

    if filepath is None:
        gr.Warning("No audio found, please retry.")
        return ""

    # Brief pause so a just-recorded file has time to finish writing.
    time.sleep(5)

    # Whisper transcribes in the detected spoken language; chunking keeps
    # long recordings within the model's 30-second window.
    transcript = asr(
        filepath,
        # max_new_tokens=256,
        chunk_length_s=30,
        batch_size=8,
    )['text']
    print(transcript)

    # Translate sentence by sentence, from the selected source language to
    # the selected target language, to keep each input comfortably short.
    src_code = flores_200_df.loc[int(src_language), 'FLORES_200_code']
    tgt_code = flores_200_df.loc[int(tgt_language), 'FLORES_200_code']
    translations = []
    for segment in transcript.split('.'):
        if not segment.strip():
            continue  # skip the empty tail left by a trailing period
        translation = translator(segment, src_lang=src_code,
                                 tgt_lang=tgt_code)[0]['translation_text']
        translations.append(translation + '.')

    output = ' '.join(translations)
    print(output)
    return output

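# Two interfaces share the same transcribe_audio function: one for live
# microphone input, one for uploaded audio files.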
demo = gr.Blocks()

mic_transcribe = gr.Interface(
    fn=transcribe_audio,
    title="Transcribe Audio of Any Language into Any Language - test and demo app by Srinivas.V",
    description="Speak into your system microphone, select your source & target languages, and submit (if an error appears, retry).",
    inputs=[
        gr.Audio(sources="microphone", type="filepath"),
        gr.Dropdown(flores_200_df.Language.tolist(), type='index',
                    label='Select Source Language'),
        gr.Dropdown(flores_200_df.Language.tolist(), type='index',
                    label='Select Target Language'),
    ],
    outputs=gr.Textbox(label="Transcription in Selected Target Language", lines=3),
    allow_flagging="never",
)

file_transcribe = gr.Interface(
    fn=transcribe_audio,
    title="Transcribe Audio of Any Language into Any Language - test and demo app by Srinivas.V",
    description="Upload an audio file, select your source & target languages, and submit (if an error appears, retry).",
    inputs=[
        gr.Audio(sources="upload", type="filepath"),
        gr.Dropdown(flores_200_df.Language.tolist(), type='index',
                    label='Select Source Language'),
        gr.Dropdown(flores_200_df.Language.tolist(), type='index',
                    label='Select Target Language'),
    ],
    outputs=gr.Textbox(label="Transcription in Selected Target Language", lines=3),
    allow_flagging="never",
)

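# Present both interfaces as tabs in a single Blocks app.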
with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Speak Through Microphone", "Upload Audio File"],
    )
demo.launch(debug=True)
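# debug=True blocks the process and surfaces server-side errors in the
# console; pass share=True to launch() if a temporary public URL is needed.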