working demo
Browse files- .DS_Store +0 -0
- app.py +98 -0
- app2.py +166 -0
- generated_speech.pt +3 -0
- packages.txt +1 -0
- requirements.txt +6 -0
.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import torch
|
3 |
+
import numpy as np
|
4 |
+
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, MarianTokenizer, MarianMTModel
|
5 |
+
from indic_transliteration import sanscript
|
6 |
+
from indic_transliteration.sanscript import transliterate
|
7 |
+
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
|
8 |
+
import soundfile as sf
|
9 |
+
from datasets import load_dataset
|
10 |
+
|
11 |
+
|
12 |
+
# Heading and Markdown blurb passed to both Gradio interfaces below.
title = "Cascaded STST"
# The Whisper link label now names the checkpoint that is actually linked
# and loaded (openai/whisper-small); it previously read "Whisper Base".
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Hindi.

Demo uses OpenAI's [Whisper Small](https://huggingface.co/openai/whisper-small) model for speech translation to English,
then MarianMT's [opus-mt-en-hi](https://huggingface.co/Helsinki-NLP/opus-mt-en-hi) model for translation to Hindi,
and finally microsoft/speechT5 fine-tuned for Hindi on IndicTTS dataset for text-to-speech.
[SpeechT5 TTS](https://huggingface.co/navodit17/speecht5_finetuned_indic_tts_hi) model for text-to-speech:


### NOTE: The goal is not to generate perfect Hindi speech or translation, but to demonstrate the cascaded STST approach using multiple models.

The model might give poor result for very short sentences (1-2 words or so). Try to send longer audio in that case.

---
"""
|
28 |
+
|
29 |
+
# Use the first CUDA device when one is available, otherwise the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"

# Speech translation stage: Whisper Small behind the ASR pipeline.
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    device=device,
)

# Text-to-speech stage: the Hindi fine-tuned SpeechT5 checkpoint and the
# HiFi-GAN vocoder, both moved to `device`.
processor = SpeechT5Processor.from_pretrained("navodit17/speecht5_finetuned_indic_tts_hi")
model = SpeechT5ForTextToSpeech.from_pretrained("navodit17/speecht5_finetuned_indic_tts_hi")
model = model.to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device)

# Speaker x-vector: row 7306 of the CMU Arctic x-vector validation split,
# with a leading dimension of size 1 added by unsqueeze(0).
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# English -> Hindi translation stage: MarianMT tokenizer and model.
# No explicit device move is applied to this model.
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
model_en_hi = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-hi")

# Text normaliser applied to the transliterated text before synthesis.
normalizer = BasicTextNormalizer()
|
47 |
+
|
48 |
+
def translate_en_hi(text):
    """Translate English text to Hindi with the MarianMT checkpoint.

    Args:
        text: English source text.

    Returns:
        The decoded Hindi translation with special tokens stripped, or an
        empty string when ``text`` is empty or contains only whitespace.
    """
    # Nothing to translate (for example, the recogniser produced no words):
    # skip the model instead of decoding whatever it generates for an empty
    # prompt.
    if not text or not text.strip():
        return ""

    # truncation=True caps the prompt at the tokenizer's configured maximum
    # length so an unusually long transcript cannot overflow the model input.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    # Keep the inputs on whichever device the translation model lives on.
    inputs = inputs.to(model_en_hi.device)
    outputs = model_en_hi.generate(**inputs, max_new_tokens=256)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
52 |
+
|
53 |
+
def translate(audio):
    """Translate the speech in ``audio`` into Hindi text.

    The ASR pipeline is run with ``task="translate"`` and its ``"text"``
    output is then passed to ``translate_en_hi``.

    Args:
        audio: Input accepted by the ASR pipeline. The Gradio interfaces in
            this file use ``type="filepath"``, so this is a file path, or
            ``None`` when nothing was uploaded or recorded.

    Returns:
        The Hindi translation as a string.

    Raises:
        gr.Error: If no audio was supplied.
    """
    # Gradio passes None when the user submits without providing audio;
    # report that in the UI instead of failing inside the pipeline.
    if audio is None:
        raise gr.Error("No audio received. Please upload or record a clip first.")

    # Generation options go through generate_kwargs; recent transformers
    # releases deprecate passing max_new_tokens as a top-level pipeline
    # argument (NOTE(review): confirm against the installed version).
    outputs = asr_pipe(
        audio,
        generate_kwargs={"task": "translate", "max_new_tokens": 256},
    )
    return translate_en_hi(outputs["text"])
|
59 |
+
|
60 |
+
def synthesise(text):
    """Generate speech for Devanagari ``text`` with the SpeechT5 model.

    The text is transliterated from Devanagari to ITRANS, run through the
    module-level normaliser, tokenised by the SpeechT5 processor and passed
    to ``generate_speech`` together with the module-level speaker
    embedding and vocoder.

    Args:
        text: Text in Devanagari script.

    Returns:
        The generated speech tensor, moved to the CPU.
    """
    romanised = transliterate(text, sanscript.DEVANAGARI, sanscript.ITRANS)
    cleaned = normalizer(romanised)
    token_ids = processor(text=cleaned, return_tensors="pt")["input_ids"]
    waveform = model.generate_speech(
        input_ids=token_ids.to(device),
        speaker_embeddings=speaker_embeddings.to(device),
        vocoder=vocoder,
    )
    return waveform.cpu()
|
67 |
+
|
68 |
+
|
69 |
+
def speech_to_speech_translation(audio):
    """Gradio callback: translate input speech and speak the result in Hindi.

    Args:
        audio: Value forwarded unchanged to ``translate``.

    Returns:
        A ``(16000, samples)`` tuple, where ``samples`` is the synthesised
        waveform scaled by 32767 and cast to ``int16``.
    """
    hindi_text = translate(audio)
    waveform = synthesise(hindi_text).numpy()
    pcm = (waveform * 32767).astype(np.int16)
    return 16000, pcm
|
74 |
+
|
75 |
+
# Gradio UI: one interface per audio source, shown as two tabs.
demo = gr.Blocks()

# Arguments shared by both interfaces.
_interface_kwargs = dict(
    fn=speech_to_speech_translation,
    title=title,
    description=description,
)

file_translate = gr.Interface(
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    **_interface_kwargs,
)

mic_translate = gr.Interface(
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    **_interface_kwargs,
)

with demo:
    gr.TabbedInterface(
        [file_translate, mic_translate],
        ["Audio File", "Microphone"],
    )

demo.launch()
|
98 |
+
|
app2.py
ADDED
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import torch
|
3 |
+
import numpy as np
|
4 |
+
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
5 |
+
from datasets import load_dataset
|
6 |
+
|
7 |
+
# Pre-generated sample waveform. In the visible code it is only referenced
# from a commented-out line in speech_to_speech_translation.
# NOTE: torch.load unpickles the file; only load files you trust.
sample_tensor = torch.load("generated_speech.pt")

# Fixed romanised Hindi sentence that the demo synthesises.
sample_text = (
    "namaste kya apa mujhe bata sakate haim ki kala apaki "
    "phlaita kisa samaya hai kya apa taiksi lemge"
)

# Text-to-speech model (the variables carry an `stt_` prefix, but the
# objects loaded are the SpeechT5 text-to-speech processor and model),
# plus the HiFi-GAN vocoder.
stt_checkpoint = "navodit17/speecht5_finetuned_indic_tts_hi"
stt_processor = SpeechT5Processor.from_pretrained(stt_checkpoint)
stt_model = SpeechT5ForTextToSpeech.from_pretrained(stt_checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
17 |
+
# speaker_embeddings = torch.tensor([[
|
18 |
+
# -7.9096e-02, 3.2141e-02, 4.4062e-02, 2.2458e-02, 2.4818e-02,
|
19 |
+
# 1.0870e-02, -6.9989e-02, 2.9361e-02, 4.9650e-02, 1.4352e-02,
|
20 |
+
# -8.0856e-02, -6.9597e-02, 5.1308e-02, 2.0282e-02, 5.4853e-02,
|
21 |
+
# 7.3063e-02, 1.3617e-02, 2.8050e-02, 1.6760e-03, 4.5911e-03,
|
22 |
+
# 2.2377e-02, 3.5073e-02, -1.2015e-02, -5.6730e-02, -5.4457e-02,
|
23 |
+
# -5.6295e-03, -6.0393e-02, 1.5303e-02, 4.4781e-02, 4.2038e-02,
|
24 |
+
# -2.6033e-03, 4.3630e-02, 1.7581e-02, 1.0797e-02, 4.1421e-02,
|
25 |
+
# -6.9390e-02, 5.0698e-02, 3.5905e-02, 2.7092e-02, -4.9815e-02,
|
26 |
+
# 3.9357e-02, 1.8817e-02, 2.8782e-02, 4.3815e-02, 2.6008e-02,
|
27 |
+
# -7.6643e-02, -1.4641e-02, 9.5107e-03, -1.0249e-01, 4.3864e-02,
|
28 |
+
# 2.4998e-02, 3.7842e-02, 2.0267e-02, 4.0481e-02, -8.5064e-02,
|
29 |
+
# -2.1616e-03, 1.5996e-02, 1.0865e-02, 3.2461e-02, 3.8486e-02,
|
30 |
+
# 3.6491e-02, -5.9280e-03, -1.1070e-02, 1.6478e-02, 2.4622e-02,
|
31 |
+
# 4.9041e-02, 2.8210e-02, -5.2851e-02, -5.6465e-02, -5.9150e-02,
|
32 |
+
# 2.7470e-02, 5.4932e-03, 7.2415e-03, 1.3297e-02, 4.7910e-02,
|
33 |
+
# 3.7617e-02, 1.5726e-02, 2.0426e-02, -5.4756e-02, -7.1068e-02,
|
34 |
+
# -7.2809e-02, -7.8266e-02, -5.7242e-02, -7.1732e-02, -2.2402e-02,
|
35 |
+
# -6.6500e-02, -7.6155e-02, 6.3509e-02, -6.7639e-03, -6.8526e-03,
|
36 |
+
# 2.5241e-02, -4.5859e-02, 2.0980e-02, -5.7519e-02, 4.9442e-02,
|
37 |
+
# -2.2280e-02, 5.3843e-02, 4.3883e-02, -4.6968e-02, -8.6317e-02,
|
38 |
+
# 6.1945e-03, -7.8510e-02, -7.2955e-02, 4.6143e-03, 6.8236e-02,
|
39 |
+
# -5.7523e-02, 2.6043e-02, 5.1134e-02, 5.0466e-02, 1.9361e-02,
|
40 |
+
# -7.3835e-02, 5.8783e-02, 7.8403e-02, 1.4997e-02, 1.8314e-02,
|
41 |
+
# 6.3094e-02, -7.9442e-02, 3.4601e-02, -5.3276e-02, -5.0826e-02,
|
42 |
+
# 2.4606e-02, -6.8361e-02, 2.6284e-02, 4.2329e-02, -3.7599e-02,
|
43 |
+
# 4.1646e-02, -9.5280e-02, 3.7492e-02, 3.7636e-02, 2.5985e-02,
|
44 |
+
# -3.0050e-02, 3.2662e-02, 2.7723e-02, 3.8104e-02, 1.8247e-02,
|
45 |
+
# -7.3857e-02, -7.5490e-02, 3.2894e-02, -7.2749e-02, -3.6701e-02,
|
46 |
+
# 2.2667e-02, 4.1351e-03, -1.4796e-02, 4.1243e-02, -6.9272e-02,
|
47 |
+
# 2.4523e-02, 2.1793e-02, -3.3412e-03, 2.7912e-02, -5.5684e-02,
|
48 |
+
# 4.8057e-02, -5.1125e-02, -8.6508e-02, 1.6578e-02, 1.4219e-02,
|
49 |
+
# 3.8626e-02, 2.8588e-02, -8.8628e-02, -7.7785e-02, 4.5904e-02,
|
50 |
+
# 2.6973e-02, 1.1173e-02, 3.4062e-02, 1.5100e-02, -1.1940e-02,
|
51 |
+
# 5.4919e-03, 5.3976e-02, 3.3862e-02, 1.7793e-02, 2.7416e-02,
|
52 |
+
# 5.0325e-02, -9.2786e-02, 3.4933e-02, -6.3649e-02, 1.7891e-02,
|
53 |
+
# 4.2497e-02, -6.2080e-02, 3.1213e-02, 2.6646e-02, -7.2364e-02,
|
54 |
+
# 2.3743e-02, -6.4803e-02, -6.8434e-05, 2.2999e-02, -7.8435e-02,
|
55 |
+
# -7.1068e-03, 2.0802e-02, 3.8085e-02, -5.5679e-02, -5.2630e-02,
|
56 |
+
# -6.0600e-02, 2.3879e-02, -5.8713e-02, 1.2526e-02, 8.7441e-03,
|
57 |
+
# 1.2976e-02, 2.3702e-02, 2.0858e-02, 2.4530e-02, -6.1161e-03,
|
58 |
+
# 1.6387e-02, 2.9424e-02, -6.9881e-02, 1.0703e-02, 5.4566e-02,
|
59 |
+
# 9.7716e-03, 4.1892e-02, 7.9958e-03, 4.0326e-02, -3.9815e-03,
|
60 |
+
# -8.0707e-04, -5.9334e-02, -5.2023e-02, 2.4852e-02, -6.4731e-02,
|
61 |
+
# 5.9305e-02, 3.0249e-02, -5.8866e-02, 4.2771e-02, 2.5907e-02,
|
62 |
+
# -4.9304e-02, 1.9540e-02, -6.2296e-02, -1.7946e-02, -8.4763e-03,
|
63 |
+
# 2.7271e-02, 2.8420e-02, 5.1065e-02, 3.1372e-02, 4.7098e-02,
|
64 |
+
# 2.6642e-02, 4.0554e-02, 3.0486e-02, -1.4875e-02, 2.8971e-02,
|
65 |
+
# -2.8165e-02, 4.5303e-02, 1.7752e-02, 1.1463e-03, 4.1254e-02,
|
66 |
+
# -5.5486e-02, 1.4259e-02, -8.8242e-02, 4.6154e-02, -4.7821e-02,
|
67 |
+
# 4.6743e-02, 2.4079e-02, 5.9683e-02, 4.8124e-02, 4.4341e-02,
|
68 |
+
# 2.6699e-02, 4.2861e-02, 5.9677e-03, -5.6233e-02, 2.2145e-02,
|
69 |
+
# 3.6767e-02, -3.2707e-03, 2.9193e-02, -8.3184e-02, 2.9720e-02,
|
70 |
+
# 1.6997e-02, -8.8428e-02, 3.9235e-02, 2.8460e-02, 5.2879e-04,
|
71 |
+
# 3.4858e-02, 4.1993e-02, 4.5816e-02, 6.6310e-03, -4.0764e-03,
|
72 |
+
# 4.1234e-02, -1.3845e-02, 3.9914e-03, 4.9223e-02, -6.4104e-02,
|
73 |
+
# 1.7539e-02, 5.6693e-02, 1.4442e-03, -7.4935e-02, -6.3044e-02,
|
74 |
+
# 4.0006e-03, 4.8351e-02, 3.9536e-02, -8.7633e-02, 2.9052e-02,
|
75 |
+
# 5.1906e-02, 1.2489e-02, 5.8764e-02, -6.9203e-02, 4.2202e-03,
|
76 |
+
# 4.1723e-02, 7.5111e-03, 2.1593e-02, 3.7314e-02, 1.9330e-02,
|
77 |
+
# 8.8582e-03, 1.7286e-02, 4.1805e-02, 2.7086e-02, 1.3443e-02,
|
78 |
+
# -4.9905e-02, 4.1805e-02, 1.4801e-02, -5.4013e-02, 2.9406e-02,
|
79 |
+
# 4.8653e-02, 1.5568e-02, 3.5359e-02, 5.9202e-02, 1.8950e-02,
|
80 |
+
# 6.6025e-02, 3.6152e-02, -6.8674e-02, 3.7966e-02, -7.0162e-02,
|
81 |
+
# -6.5415e-02, -2.1472e-02, 1.3661e-02, -6.1625e-02, 5.2603e-02,
|
82 |
+
# -4.2280e-02, -3.5216e-03, -3.5183e-02, 1.0041e-02, 3.2294e-02,
|
83 |
+
# -6.7117e-02, 3.7613e-02, 4.9022e-02, -6.9640e-02, -6.6330e-02,
|
84 |
+
# -5.7341e-02, 3.2368e-02, 5.1048e-02, 3.5536e-02, -5.7165e-02,
|
85 |
+
# 3.9687e-02, 5.5177e-02, 3.6400e-03, 2.2232e-04, 1.0508e-02,
|
86 |
+
# 9.4798e-03, -4.9671e-02, 1.5729e-02, -1.6415e-03, -6.5341e-02,
|
87 |
+
# 1.1698e-02, 4.2636e-02, 3.0220e-02, -5.1484e-02, -5.8948e-02,
|
88 |
+
# -6.5812e-02, -3.3869e-02, -8.1614e-02, 9.3173e-02, 2.7790e-02,
|
89 |
+
# -3.7140e-02, -7.4221e-03, 4.7300e-02, -5.5298e-02, 2.2071e-02,
|
90 |
+
# -8.1595e-02, 6.8659e-03, 8.6731e-03, 1.7781e-03, 4.9692e-02,
|
91 |
+
# 1.8681e-02, -4.8615e-02, 4.8314e-03, 2.4954e-02, 6.3759e-02,
|
92 |
+
# 7.7778e-03, 4.2505e-02, -6.9391e-02, 3.7088e-02, -9.7483e-03,
|
93 |
+
# -6.1993e-04, 5.2777e-02, 1.2955e-02, 6.6815e-02, 4.6009e-02,
|
94 |
+
# -9.4540e-02, -8.8816e-02, 3.7671e-02, 3.2664e-03, 3.8003e-02,
|
95 |
+
# 2.6832e-02, 6.7603e-02, -6.1109e-02, 3.9013e-02, 4.2810e-02,
|
96 |
+
# 6.7511e-03, 3.2843e-02, -4.2086e-02, 5.1029e-02, 2.9837e-02,
|
97 |
+
# -1.5323e-02, 2.4238e-02, -6.2738e-02, 6.7823e-03, 3.2687e-02,
|
98 |
+
# -5.8093e-03, -4.2954e-02, 2.5780e-02, 4.0528e-02, 2.4579e-02,
|
99 |
+
# 9.3824e-03, 4.6847e-04, -1.0616e-01, -5.4627e-02, -7.2340e-02,
|
100 |
+
# 1.0230e-02, -5.9172e-03, 2.7507e-02, -1.1043e-02, 4.9054e-02,
|
101 |
+
# -7.6480e-02, 3.5482e-02, -7.6632e-02, -1.0587e-01, -5.0521e-03,
|
102 |
+
# -8.1530e-02, 1.9312e-02, 5.2127e-02, 6.3460e-03, -5.5675e-02,
|
103 |
+
# 3.2647e-02, 5.0215e-02, 5.0249e-02, 3.8105e-02, -4.5931e-02,
|
104 |
+
# -4.5676e-02, -9.0062e-03, -1.7592e-02, 2.7216e-02, 3.1704e-02,
|
105 |
+
# 2.2958e-02, 5.5290e-02, 9.7256e-03, 1.7059e-02, -6.9045e-02,
|
106 |
+
# 1.3051e-02, 2.0521e-02, 4.5282e-02, 2.9233e-03, -3.0717e-02,
|
107 |
+
# 2.6435e-02, 1.9568e-02, 9.9746e-03, 3.0479e-02, -9.6263e-03,
|
108 |
+
# 3.9715e-02, 2.3348e-02, 6.4526e-02, 3.9307e-02, 2.5429e-02,
|
109 |
+
# 2.4707e-02, 3.0577e-02, -7.1778e-02, -8.8073e-02, 3.7356e-02,
|
110 |
+
# -7.6534e-03, 2.5788e-02, 3.7859e-02, 4.2421e-02, -1.0225e-01,
|
111 |
+
# 3.5744e-02, 4.9693e-02, 8.0407e-04, 3.3523e-02, 2.7724e-02,
|
112 |
+
# -2.7828e-03, 6.0185e-02, 2.4983e-02, 1.8167e-03, 6.2133e-03,
|
113 |
+
# -6.7665e-02, 2.4738e-02, -5.1167e-03, 2.7496e-02, 3.8240e-02,
|
114 |
+
# -7.5278e-02, -4.1977e-02, 3.0779e-02, 5.3046e-02, 2.9874e-02,
|
115 |
+
# 8.0589e-02, -6.3608e-02, 1.8703e-02, 9.5655e-03, -1.2092e-02,
|
116 |
+
# -5.1363e-02, 3.7178e-02, -3.4604e-02, 4.1522e-02, -9.3374e-03,
|
117 |
+
# -2.2800e-02, -6.7766e-02, 3.4822e-02, -5.2781e-02, -6.7118e-02,
|
118 |
+
# 5.7408e-03, 4.5285e-02, -6.4813e-02, 3.5704e-02, 1.0203e-02,
|
119 |
+
# -9.9155e-03, 1.6483e-02, 3.9745e-02, 6.8487e-02, 1.9586e-02,
|
120 |
+
# 3.5887e-02, -5.5557e-02
|
121 |
+
# ]])
|
122 |
+
|
123 |
+
# Speaker x-vector: row 7306 of the CMU Arctic x-vector validation split.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"])
# Add a leading dimension of size 1 before handing it to the model.
speaker_embeddings = speaker_embeddings.unsqueeze(0)
# Debug output: shape of the embedding tensor.
print(speaker_embeddings.shape)
|
126 |
+
|
127 |
+
def synthesise(text):
    """Generate speech for ``text`` with the SpeechT5 model.

    The text is tokenised as-is by the processor, then passed to
    ``generate_speech`` with the module-level speaker embedding and
    vocoder. The token tensor's shape and dtype are printed for debugging.

    Args:
        text: Text handed directly to the SpeechT5 processor.

    Returns:
        The value returned by ``generate_speech``.
    """
    token_ids = stt_processor(text=text, return_tensors="pt")["input_ids"]
    # Debug output.
    print(token_ids.shape)
    print(token_ids.dtype)
    return stt_model.generate_speech(
        token_ids,
        speaker_embeddings=speaker_embeddings,
        vocoder=vocoder,
    )
|
137 |
+
|
138 |
+
def speech_to_speech_translation(audio):
    """Gradio callback that synthesises the fixed ``sample_text``.

    Args:
        audio: Accepted to match the interface signature; not used by
            this function.

    Returns:
        A ``(16000, samples)`` tuple, where ``samples`` is the synthesised
        waveform scaled by 32767 and cast to ``int16``.
    """
    waveform = synthesise(sample_text)
    # Debug output.
    print(f"Synthesised speech shape: {waveform.shape}")
    print("Generated waveform dtype:", waveform.dtype)
    pcm = (waveform.numpy() * 32767).astype(np.int16)
    return 16000, pcm
|
145 |
+
|
146 |
+
|
147 |
+
# Gradio UI: one interface per audio source, shown as two tabs.
file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

with gr.Blocks() as demo:
    gr.TabbedInterface(
        [file_translate, mic_translate],
        ["Audio File", "Microphone"],
    )

# debug=True is passed through to launch() unchanged.
demo.launch(debug=True)
|
generated_speech.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fdb21e9e7398413a124550ea387051103f097e4094a39e6912ce48bf99108c85
|
3 |
+
size 416969
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
ffmpeg
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
transformers
|
3 |
+
indic-transliteration
|
4 |
+
sentencepiece
|
5 |
+
sacremoses
|
6 |
+
datasets
|