navodit17 committed
Commit 0c21945 · 1 Parent(s): 4d1cda2

working demo

Files changed (6)
  1. .DS_Store +0 -0
  2. app.py +98 -0
  3. app2.py +166 -0
  4. generated_speech.pt +3 -0
  5. packages.txt +1 -0
  6. requirements.txt +6 -0
.DS_Store ADDED
Binary file (6.15 kB).
 
app.py ADDED
@@ -0,0 +1,98 @@
+ import gradio as gr
+ import torch
+ import numpy as np
+ from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, MarianTokenizer, MarianMTModel
+ from indic_transliteration import sanscript
+ from indic_transliteration.sanscript import transliterate
+ from transformers.models.whisper.english_normalizer import BasicTextNormalizer
+ import soundfile as sf
+ from datasets import load_dataset
+
+
+ title = "Cascaded STST"
+ description = """
+ Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Hindi.
+
+ The demo uses OpenAI's [Whisper Small](https://huggingface.co/openai/whisper-small) model for speech translation to English,
+ then the MarianMT [opus-mt-en-hi](https://huggingface.co/Helsinki-NLP/opus-mt-en-hi) model for translation from English to Hindi,
+ and finally [SpeechT5 TTS](https://huggingface.co/navodit17/speecht5_finetuned_indic_tts_hi), Microsoft's SpeechT5 fine-tuned for Hindi on the IndicTTS dataset, for text-to-speech:
+ ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
+
+ ### NOTE: The goal is not to generate perfect Hindi speech or translation, but to demonstrate the cascaded STST approach using multiple models.
+
+ The models may give poor results for very short utterances (one or two words); try sending longer audio in that case.
+
+ ---
+ """
+
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+ # load speech translation checkpoint
+ asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=device)
+
+ # load text-to-speech checkpoint and speaker embeddings
+ processor = SpeechT5Processor.from_pretrained("navodit17/speecht5_finetuned_indic_tts_hi")
+ model = SpeechT5ForTextToSpeech.from_pretrained("navodit17/speecht5_finetuned_indic_tts_hi").to(device)
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+ # load English-to-Hindi translation checkpoint
+ tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
+ model_en_hi = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
+
+ normalizer = BasicTextNormalizer()
+
+ def translate_en_hi(text):
+     inputs = tokenizer(text, return_tensors="pt")
+     outputs = model_en_hi.generate(**inputs, max_new_tokens=256)
+     return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+ def translate(audio):
+     # Whisper's "translate" task transcribes the source speech directly into English
+     outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
+     # print(f"Translated text - English: {outputs['text']}")
+     translated_text = translate_en_hi(outputs["text"])
+     # print(f"Translated text - Hindi: {translated_text}")
+     return translated_text
+
+ def synthesise(text):
+     # romanize the Devanagari translation to ITRANS, then normalize, before tokenizing for SpeechT5
+     text = normalizer(transliterate(text, sanscript.DEVANAGARI, sanscript.ITRANS))
+     # print(f"Normalized Text: {text}")
+     inputs = processor(text=text, return_tensors="pt")
+     # print(f"Inputs: {inputs['input_ids'].shape}")
+     speech = model.generate_speech(input_ids=inputs["input_ids"].to(device), speaker_embeddings=speaker_embeddings.to(device), vocoder=vocoder)
+     return speech.cpu()
+
+
+ def speech_to_speech_translation(audio):
+     translated_text = translate(audio)
+     synthesised_speech = synthesise(translated_text)
+     # scale the float waveform to 16-bit PCM for Gradio's numpy audio output
+     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
+     return 16000, synthesised_speech
+
+ demo = gr.Blocks()
+
+ file_translate = gr.Interface(
+     fn=speech_to_speech_translation,
+     inputs=gr.Audio(sources="upload", type="filepath"),
+     outputs=gr.Audio(label="Generated Speech", type="numpy"),
+     title=title,
+     description=description,
+ )
+
+ mic_translate = gr.Interface(
+     fn=speech_to_speech_translation,
+     inputs=gr.Audio(sources="microphone", type="filepath"),
+     outputs=gr.Audio(label="Generated Speech", type="numpy"),
+     title=title,
+     description=description,
+ )
+
+
+ with demo:
+     gr.TabbedInterface([file_translate, mic_translate], ["Audio File", "Microphone"])
+
+ demo.launch()
+
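For reference, a minimal sketch of what the transliteration and normalization stage in synthesise does before tokenization, presumably because the fine-tuned checkpoint expects romanized input, which is what the transliterate call implies. The example string and its outputs are illustrative assumptions, not fixtures from this repo:

    from indic_transliteration import sanscript
    from indic_transliteration.sanscript import transliterate
    from transformers.models.whisper.english_normalizer import BasicTextNormalizer

    normalizer = BasicTextNormalizer()

    # MarianMT emits Devanagari; synthesise() romanizes it to ITRANS before
    # feeding the SpeechT5 processor.
    hindi_text = "नमस्ते"  # hypothetical example input
    romanized = transliterate(hindi_text, sanscript.DEVANAGARI, sanscript.ITRANS)
    print(romanized)              # expected: namaste
    print(normalizer(romanized))  # the normalizer lowercases and strips punctuation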
app2.py ADDED
@@ -0,0 +1,166 @@
+ import gradio as gr
+ import torch
+ import numpy as np
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+ from datasets import load_dataset
+
+ # pre-generated sample waveform (see generated_speech.pt); used only by the commented-out fallback below
+ sample_tensor = torch.load("generated_speech.pt")
+ sample_text = "namaste kya apa mujhe bata sakate haim ki kala apaki phlaita kisa samaya hai kya apa taiksi lemge"
+
+
+ # TTS model
+ tts_checkpoint = "navodit17/speecht5_finetuned_indic_tts_hi"
+ tts_processor = SpeechT5Processor.from_pretrained(tts_checkpoint)
+ tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint)
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+ print(speaker_embeddings.shape)
+
+ def synthesise(text):
+     inputs = tts_processor(text=text, return_tensors="pt")
+     print(inputs["input_ids"].shape)
+     print(inputs["input_ids"].dtype)
+     speech = tts_model.generate_speech(
+         inputs["input_ids"],
+         speaker_embeddings=speaker_embeddings,
+         vocoder=vocoder,
+     )
+     return speech
+
+ def speech_to_speech_translation(audio):
+     # debug harness: the audio input is ignored and the fixed sample text is synthesised
+     synthesised_speech = synthesise(sample_text)
+     print(f"Synthesised speech shape: {synthesised_speech.shape}")
+     print("Generated waveform dtype:", synthesised_speech.dtype)
+     # scale the float waveform to 16-bit PCM for Gradio's numpy audio output
+     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
+     # synthesised_speech = (sample_tensor.numpy() * 32767).astype(np.int16)
+     return 16000, synthesised_speech
+
+
+ # Gradio Interface
+
+ file_translate = gr.Interface(
+     fn=speech_to_speech_translation,
+     inputs=gr.Audio(sources="upload", type="filepath"),
+     outputs=gr.Audio(label="Generated Speech", type="numpy"),
+ )
+
+ mic_translate = gr.Interface(
+     fn=speech_to_speech_translation,
+     inputs=gr.Audio(sources="microphone", type="filepath"),
+     outputs=gr.Audio(label="Generated Speech", type="numpy"),
+ )
+
+ demo = gr.Blocks()
+
+ with demo:
+     gr.TabbedInterface([file_translate, mic_translate], ["Audio File", "Microphone"])
+
+ demo.launch(debug=True)
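Since app2.py exists to debug the TTS stage in isolation, a quick way to audit its output offline is to write the waveform to disk. A minimal sketch, assuming soundfile is available (app.py imports it, though requirements.txt does not pin it):

    import soundfile as sf

    # synthesise() returns a float32 waveform; SpeechT5's HiFi-GAN vocoder runs at 16 kHz
    speech = synthesise(sample_text)
    sf.write("tts_check.wav", speech.numpy(), 16000)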
generated_speech.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fdb21e9e7398413a124550ea387051103f097e4094a39e6912ce48bf99108c85
+ size 416969
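The lines above are a Git LFS pointer: only the sha256 and byte size live in the repo, and the tensor itself is fetched at checkout. A minimal loading sketch, assuming the file holds the raw waveform saved with torch.save (which is how app2.py's torch.load call treats it):

    import torch

    sample = torch.load("generated_speech.pt")
    print(sample.shape, sample.dtype)  # expected: a 1-D float waveform at 16 kHz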
packages.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
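On Hugging Face Spaces, packages.txt lists apt packages installed into the container at build time; ffmpeg is what lets the Whisper pipeline decode the uploaded and recorded audio files.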
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ torch
+ transformers
+ indic-transliteration
+ sentencepiece
+ sacremoses
+ datasets
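Note that gradio itself is not listed: Gradio Spaces install it automatically from the SDK version configured in the Space's README. soundfile, imported by app.py, is likewise unpinned and assumed to be present in the Space's environment.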