navodit17 committed
Commit 0c21945 · 1 Parent(s): 4d1cda2

working demo

Files changed (6)
  1. .DS_Store +0 -0
  2. app.py +98 -0
  3. app2.py +166 -0
  4. generated_speech.pt +3 -0
  5. packages.txt +1 -0
  6. requirements.txt +6 -0
.DS_Store ADDED
Binary file (6.15 kB).
 
app.py ADDED
@@ -0,0 +1,98 @@
+ import gradio as gr
+ import torch
+ import numpy as np
+ from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, MarianTokenizer, MarianMTModel
+ from indic_transliteration import sanscript
+ from indic_transliteration.sanscript import transliterate
+ from transformers.models.whisper.english_normalizer import BasicTextNormalizer
+ import soundfile as sf
+ from datasets import load_dataset
+
+
+ title = "Cascaded STST"
+ description = """
+ Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Hindi.
+
+ The demo uses OpenAI's [Whisper Small](https://huggingface.co/openai/whisper-small) model for speech translation to English,
+ then the MarianMT [opus-mt-en-hi](https://huggingface.co/Helsinki-NLP/opus-mt-en-hi) model for translation from English to Hindi,
+ and finally [SpeechT5 TTS](https://huggingface.co/navodit17/speecht5_finetuned_indic_tts_hi), Microsoft's SpeechT5 fine-tuned for Hindi on the IndicTTS dataset, for text-to-speech:
+ ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
+
+ ### NOTE: The goal is not to generate perfect Hindi speech or translation, but to demonstrate the cascaded STST approach using multiple models.
+
+ The models may give poor results for very short utterances (one or two words); try sending longer audio in that case.
+
+ ---
+ """
+
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+ # load speech translation checkpoint
+ asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=device)
+
+ # load text-to-speech checkpoint and speaker embeddings
+ processor = SpeechT5Processor.from_pretrained("navodit17/speecht5_finetuned_indic_tts_hi")
+ model = SpeechT5ForTextToSpeech.from_pretrained("navodit17/speecht5_finetuned_indic_tts_hi").to(device)
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+ # load English-to-Hindi translation checkpoint
+ tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
+ model_en_hi = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
+
+ normalizer = BasicTextNormalizer()
+
+ def translate_en_hi(text):
+     inputs = tokenizer(text, return_tensors="pt")
+     outputs = model_en_hi.generate(**inputs, max_new_tokens=256)
+     return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+ def translate(audio):
+     # Whisper's "translate" task transcribes the source speech directly into English
+     outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
+     # print(f"Translated text - English: {outputs['text']}")
+     translated_text = translate_en_hi(outputs["text"])
+     # print(f"Translated text - Hindi: {translated_text}")
+     return translated_text
+
+ def synthesise(text):
+     # romanize the Devanagari translation to ITRANS, then normalize, before tokenizing for SpeechT5
+     text = normalizer(transliterate(text, sanscript.DEVANAGARI, sanscript.ITRANS))
+     # print(f"Normalized Text: {text}")
+     inputs = processor(text=text, return_tensors="pt")
+     # print(f"Inputs: {inputs['input_ids'].shape}")
+     speech = model.generate_speech(input_ids=inputs["input_ids"].to(device), speaker_embeddings=speaker_embeddings.to(device), vocoder=vocoder)
+     return speech.cpu()
+
+
+ def speech_to_speech_translation(audio):
+     translated_text = translate(audio)
+     synthesised_speech = synthesise(translated_text)
+     # scale the float waveform to 16-bit PCM for Gradio's numpy audio output
+     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
+     return 16000, synthesised_speech
+
+ demo = gr.Blocks()
+
+ file_translate = gr.Interface(
+     fn=speech_to_speech_translation,
+     inputs=gr.Audio(sources="upload", type="filepath"),
+     outputs=gr.Audio(label="Generated Speech", type="numpy"),
+     title=title,
+     description=description,
+ )
+
+ mic_translate = gr.Interface(
+     fn=speech_to_speech_translation,
+     inputs=gr.Audio(sources="microphone", type="filepath"),
+     outputs=gr.Audio(label="Generated Speech", type="numpy"),
+     title=title,
+     description=description,
+ )
+
+
+ with demo:
+     gr.TabbedInterface([file_translate, mic_translate], ["Audio File", "Microphone"])
+
+ demo.launch()
+
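For reference, a minimal sketch of what the transliteration and normalization stage in synthesise does before tokenization, presumably because the fine-tuned checkpoint expects romanized input, which is what the transliterate call implies. The example string and its outputs are illustrative assumptions, not fixtures from this repo:

    from indic_transliteration import sanscript
    from indic_transliteration.sanscript import transliterate
    from transformers.models.whisper.english_normalizer import BasicTextNormalizer

    normalizer = BasicTextNormalizer()

    # MarianMT emits Devanagari; synthesise() romanizes it to ITRANS before
    # feeding the SpeechT5 processor.
    hindi_text = "नमस्ते"  # hypothetical example input
    romanized = transliterate(hindi_text, sanscript.DEVANAGARI, sanscript.ITRANS)
    print(romanized)              # expected: namaste
    print(normalizer(romanized))  # the normalizer lowercases and strips punctuation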
app2.py ADDED
@@ -0,0 +1,166 @@
+ import gradio as gr
+ import torch
+ import numpy as np
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+ from datasets import load_dataset
+
+ # pre-generated sample waveform (see generated_speech.pt); used only by the commented-out fallback below
+ sample_tensor = torch.load("generated_speech.pt")
+ sample_text = "namaste kya apa mujhe bata sakate haim ki kala apaki phlaita kisa samaya hai kya apa taiksi lemge"
+
+
+ # TTS model
+ tts_checkpoint = "navodit17/speecht5_finetuned_indic_tts_hi"
+ tts_processor = SpeechT5Processor.from_pretrained(tts_checkpoint)
+ tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint)
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+ print(speaker_embeddings.shape)
+
+ def synthesise(text):
+     inputs = tts_processor(text=text, return_tensors="pt")
+     print(inputs["input_ids"].shape)
+     print(inputs["input_ids"].dtype)
+     speech = tts_model.generate_speech(
+         inputs["input_ids"],
+         speaker_embeddings=speaker_embeddings,
+         vocoder=vocoder,
+     )
+     return speech
+
+ def speech_to_speech_translation(audio):
+     # debug harness: the audio input is ignored and the fixed sample text is synthesised
+     synthesised_speech = synthesise(sample_text)
+     print(f"Synthesised speech shape: {synthesised_speech.shape}")
+     print("Generated waveform dtype:", synthesised_speech.dtype)
+     # scale the float waveform to 16-bit PCM for Gradio's numpy audio output
+     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
+     # synthesised_speech = (sample_tensor.numpy() * 32767).astype(np.int16)
+     return 16000, synthesised_speech
+
+
+ # Gradio Interface
+
+ file_translate = gr.Interface(
+     fn=speech_to_speech_translation,
+     inputs=gr.Audio(sources="upload", type="filepath"),
+     outputs=gr.Audio(label="Generated Speech", type="numpy"),
+ )
+
+ mic_translate = gr.Interface(
+     fn=speech_to_speech_translation,
+     inputs=gr.Audio(sources="microphone", type="filepath"),
+     outputs=gr.Audio(label="Generated Speech", type="numpy"),
+ )
+
+ demo = gr.Blocks()
+
+ with demo:
+     gr.TabbedInterface([file_translate, mic_translate], ["Audio File", "Microphone"])
+
+ demo.launch(debug=True)
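Since app2.py exists to debug the TTS stage in isolation, a quick way to audit its output offline is to write the waveform to disk. A minimal sketch, assuming soundfile is available (app.py imports it, though requirements.txt does not pin it):

    import soundfile as sf

    # synthesise() returns a float32 waveform; SpeechT5's HiFi-GAN vocoder runs at 16 kHz
    speech = synthesise(sample_text)
    sf.write("tts_check.wav", speech.numpy(), 16000)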
generated_speech.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fdb21e9e7398413a124550ea387051103f097e4094a39e6912ce48bf99108c85
+ size 416969
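The lines above are a Git LFS pointer: only the sha256 and byte size live in the repo, and the tensor itself is fetched at checkout. A minimal loading sketch, assuming the file holds the raw waveform saved with torch.save (which is how app2.py's torch.load call treats it):

    import torch

    sample = torch.load("generated_speech.pt")
    print(sample.shape, sample.dtype)  # expected: a 1-D float waveform at 16 kHz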
packages.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
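On Hugging Face Spaces, packages.txt lists apt packages installed into the container at build time; ffmpeg is what lets the Whisper pipeline decode the uploaded and recorded audio files.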
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ torch
+ transformers
+ indic-transliteration
+ sentencepiece
+ sacremoses
+ datasets
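Note that gradio itself is not listed: Gradio Spaces install it automatically from the SDK version configured in the Space's README. soundfile, imported by app.py, is likewise unpinned and assumed to be present in the Space's environment.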