zongxiao committed
Commit dbc99da · 1 Parent(s): 5ddbaa2

Update app.py

Files changed (1): app.py (+34, -32)
app.py CHANGED
@@ -1,36 +1,29 @@
- import gradio as gr
- import numpy as np
  import torch
- from datasets import load_dataset
-
- from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
+ from transformers import pipeline
-
-
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ device = "cpu"
-
- # load speech translation checkpoint
- asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
+ pipe = pipeline(
+     "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
+ )
-
- # load text-to-speech checkpoint and speaker embeddings
- processor = SpeechT5Processor.from_pretrained("sanchit-gandhi/speecht5_tts_vox_nl")
-
- model = SpeechT5ForTextToSpeech.from_pretrained("sanchit-gandhi/speecht5_tts_vox_nl").to(device)
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
-
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
- speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 
  def translate(audio):
-     outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "dutch"})
+     outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "chinese"})
      return outputs["text"]
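Note the language forcing here: Whisper's built-in `translate` task only targets English, so the commit keeps `task: "transcribe"` and forces `language: "chinese"`, which asks the multilingual checkpoint to emit Chinese text directly. A hedged illustration of the difference (the input file name is hypothetical, not part of the commit):

```python
# Illustrative only; assumes `pipe` from app.py and a local speech_en.wav.
zh = pipe("speech_en.wav", generate_kwargs={"task": "transcribe", "language": "chinese"})
en = pipe("speech_en.wav", generate_kwargs={"task": "translate"})  # X -> English only
print(zh["text"])
print(en["text"])
```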
 
+ from transformers import BarkModel
+ from transformers import AutoProcessor
+ model = BarkModel.from_pretrained("suno/bark-small")
+ processor = AutoProcessor.from_pretrained("suno/bark")
+
+ model = model.to(device)
+ synthesised_rate = model.generation_config.sample_rate
+
- def synthesise(text):
-     inputs = processor(text=text, return_tensors="pt")
-     max_length = processor.tokenizer.model_max_length
-     speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
-     return speech.cpu()
+ def synthesise(text_prompt, voice_preset="v2/zh_speaker_1"):
+     inputs = processor(text_prompt, voice_preset=voice_preset)
+     speech_output = model.generate(**inputs.to(device), pad_token_id=10000)
+     return speech_output
 
 
  def speech_to_speech_translation(audio):
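This hunk swaps the TTS stage from SpeechT5 (with xvector speaker embeddings and a HiFi-GAN vocoder) to Bark, which bundles its own vocoder and selects voices via a preset string. A minimal smoke test of the two new helpers might look like the sketch below; the `scipy` write-out and `sample.wav` are illustrative assumptions, not part of the commit:

```python
# Hypothetical end-to-end check of the new helpers; assumes app.py has run
# and a local sample.wav exists.
import scipy.io.wavfile as wavfile

text = translate("sample.wav")       # Whisper large-v2, forced to Chinese output
speech = synthesise(text)            # Bark waveform tensor, shape (1, n_samples)
wavfile.write(
    "out.wav",
    synthesised_rate,                # Bark's sample rate (24 kHz for bark-small)
    speech.cpu().numpy().squeeze(),  # drop the batch dim for a mono wav
)
```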
@@ -39,19 +32,28 @@ def speech_to_speech_translation(audio):
      synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
      return 16000, synthesised_speech
 
-
-
- title = "English to Dutch Cascaded STST"
+ import numpy as np
+ def speech_to_speech_translation(audio, voice_preset="v2/zh_speaker_1"):
+     translated_text = translate(audio)
+     synthesised_speech = synthesise(translated_text, voice_preset)
+     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
+     return synthesised_rate, synthesised_speech
+ def speech_to_speech_translation_fix(audio, voice_preset="v2/zh_speaker_1"):
+     synthesised_rate, synthesised_speech = speech_to_speech_translation(audio, voice_preset)
+     return synthesised_rate, synthesised_speech.T
+
+ title = "Multilanguage to Chinese (Mandarin) Cascaded STST"
  description = """
- Demo for cascaded speech-to-speech translation (STST), mapping from source speech in English to target speech in Dutch. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and a fine-tuned version of Microsoft's
- SpeechT5 [Speecht5_tts_vox_nl](https://huggingface.co/sanchit-gandhi/speecht5_tts_vox_nl) model for text-to-speech:
+ Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Chinese (Mandarin). The demo uses OpenAI's [Whisper large-v2](https://huggingface.co/openai/whisper-large-v2) model for speech translation and Suno's [bark-small](https://huggingface.co/suno/bark-small) model for text-to-speech:
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
  """
 
  demo = gr.Blocks()
 
  mic_translate = gr.Interface(
-     fn=speech_to_speech_translation,
+     fn=speech_to_speech_translation_fix,
      inputs=gr.Audio(source="microphone", type="filepath"),
      outputs=gr.Audio(label="Generated Speech", type="numpy"),
      title=title,
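The added definition shadows the earlier `speech_to_speech_translation` left in place above it, and `speech_to_speech_translation_fix` exists only to transpose Bark's output: after the int16 conversion the array has shape (1, n_samples), while Gradio's numpy audio output expects samples in the leading dimension. A toy shape check with fabricated data (one second at 24 kHz):

```python
import numpy as np

# Stand-in for one second of converted Bark output (batch dim first).
fake = (np.random.rand(1, 24000) * 32767).astype(np.int16)
print(fake.shape, fake.T.shape)  # (1, 24000) -> (24000, 1): samples first for Gradio
```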
@@ -59,7 +61,7 @@ mic_translate = gr.Interface(
  )
 
  file_translate = gr.Interface(
-     fn=speech_to_speech_translation,
+     fn=speech_to_speech_translation_fix,
      inputs=gr.Audio(source="upload", type="filepath"),
      outputs=gr.Audio(label="Generated Speech", type="numpy"),
      title=title,
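The diff ends before the demo wiring, which is unchanged. For orientation, the usual pattern for combining these two interfaces under `demo` is sketched below; this is an assumption about the untouched remainder of app.py, not part of this commit. Note it also presumes gradio is imported somewhere in the new file, even though the first hunk removes the top-level `import gradio as gr` without a visible replacement:

```python
# Sketch of the typical, unchanged tail of the app (assumed, not shown in the diff).
with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch()
```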