speech-to-speech

Paused

App Files Files Community

zongxiao committed on Oct 9, 2023

Commit

dbc99da

1 Parent(s): 5ddbaa2

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -32

app.py CHANGED Viewed

@@ -1,36 +1,29 @@
-import gradio as gr
-import numpy as np
 import torch
-from datasets import load_dataset
-from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-# load speech translation checkpoint
-asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
-# load text-to-speech checkpoint and speaker embeddings
-processor = SpeechT5Processor.from_pretrained("sanchit-gandhi/speecht5_tts_vox_nl")
-model = SpeechT5ForTextToSpeech.from_pretrained("sanchit-gandhi/speecht5_tts_vox_nl").to(device)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
-embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 def translate(audio):
-    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "dutch"})
     return outputs["text"]
-def synthesise(text):
-    inputs = processor(text=text, return_tensors="pt")
-    max_length = processor.tokenizer.model_max_length
-    speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
-    return speech.cpu()
 def speech_to_speech_translation(audio):
@@ -39,19 +32,28 @@ def speech_to_speech_translation(audio):
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
     return 16000, synthesised_speech
-title = "English to Dutch Cascaded STST"
 description = """
-Demo for cascaded speech-to-speech translation (STST), mapping from source speech in English to target speech in Dutch. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and a fine-tuned version of Microsoft's
-SpeechT5 [Speecht5_tts_vox_nl](https://huggingface.co/sanchit-gandhi/speecht5_tts_vox_nl) model for text-to-speech:
 ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
 """
 demo = gr.Blocks()
 mic_translate = gr.Interface(
-    fn=speech_to_speech_translation,
     inputs=gr.Audio(source="microphone", type="filepath"),
     outputs=gr.Audio(label="Generated Speech", type="numpy"),
     title=title,
@@ -59,7 +61,7 @@ mic_translate = gr.Interface(
 )
 file_translate = gr.Interface(
-    fn=speech_to_speech_translation,
     inputs=gr.Audio(source="upload", type="filepath"),
     outputs=gr.Audio(label="Generated Speech", type="numpy"),
     title=title,

 import torch
+from transformers import pipeline
+device="cpu"
+pipe = pipeline(
+    "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
+)
 def translate(audio):
+    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe","language":"chinese"})
     return outputs["text"]
+from transformers import BarkModel
+from transformers import AutoProcessor
+model = BarkModel.from_pretrained("suno/bark-small")
+processor = AutoProcessor.from_pretrained("suno/bark")
+model = model.to(device)
+synthesised_rate = model.generation_config.sample_rate
+def synthesise(text_prompt,voice_preset="v2/zh_speaker_1"):
+    inputs = processor(text_prompt, voice_preset=voice_preset)
+    speech_output = model.generate(**inputs.to(device),pad_token_id=10000)
+    #print(speech_output[0].cpu().numpy())
+    return speech_output
 def speech_to_speech_translation(audio):
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
     return 16000, synthesised_speech
+import numpy as np
+def speech_to_speech_translation(audio,voice_preset="v2/zh_speaker_1"):
+    translated_text = translate(audio)
+    #print(translated_text)
+    synthesised_speech = synthesise(translated_text,voice_preset)
+    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
+    #print(synthesised_speech)
+    return synthesised_rate , synthesised_speech
+def speech_to_speech_translation_fix(audio,voice_preset="v2/zh_speaker_1"):
+    synthesised_rate,synthesised_speech = speech_to_speech_translation(audio,voice_preset)
+    return synthesised_rate,synthesised_speech.T
+title = "Multilanguage to Chinese(mandarin) Cascaded STST"
 description = """
+Demo for cascaded speech-to-speech translation (STST), mapping from source speech in Multilanguage to target speech in Chinese(mandarin). Demo uses OpenAI's [Whisper large-v2](https://huggingface.co/openai/whisper-large-v2) model for speech translation, and Suno's [bark-small](https://huggingface.co/suno/bark-small) model for text-to-speech:
 ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
 """
 demo = gr.Blocks()
 mic_translate = gr.Interface(
+    fn=speech_to_speech_translation_fix,
     inputs=gr.Audio(source="microphone", type="filepath"),
     outputs=gr.Audio(label="Generated Speech", type="numpy"),
     title=title,
 )
 file_translate = gr.Interface(
+    fn=speech_to_speech_translation_fix,
     inputs=gr.Audio(source="upload", type="filepath"),
     outputs=gr.Audio(label="Generated Speech", type="numpy"),
     title=title,