zongxiao committed
Commit 0041ae6 · 1 Parent(s): c8d6046

Update app.py

Files changed (1): app.py (+15, -9)
app.py CHANGED
@@ -1,5 +1,6 @@
 import torch
 import numpy as np
+import soundfile as sf
 from transformers import pipeline
 from transformers import BarkModel
 from transformers import AutoProcessor
@@ -9,26 +10,29 @@ device="cpu"
 pipe = pipeline(
     "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
 )
+label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
 processor = AutoProcessor.from_pretrained("suno/bark")
 model = BarkModel.from_pretrained("suno/bark")
 model = model.to(device)
 synthesised_rate = model.generation_config.sample_rate
 
-def translate(audio):
+def translate(audio_file):
+    audio, sampling_rate = sf.read(audio_file)
     outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe","language":"chinese"})
-    return outputs["text"]
+    language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
+    label_outputs = {}
+    for pred in language_prediction:
+        label_outputs[pred["label"]] = pred["score"]
+    return outputs["text"],label_outputs
 def synthesise(text_prompt,voice_preset="v2/zh_speaker_1"):
     inputs = processor(text_prompt, voice_preset=voice_preset)
     speech_output = model.generate(**inputs.to(device),pad_token_id=10000)
     return speech_output
 def speech_to_speech_translation(audio,voice_preset="v2/zh_speaker_1"):
-    translated_text = translate(audio)
+    translated_text, label_outputs = translate(audio)
     synthesised_speech = synthesise(translated_text,voice_preset)
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
-    return synthesised_rate , synthesised_speech ,translated_text
-def speech_to_speech_translation_fix(audio,voice_preset="v2/zh_speaker_1"):
-    synthesised_rate,synthesised_speech,translated_text = speech_to_speech_translation(audio,voice_preset)
-    return (synthesised_rate,synthesised_speech.T),translated_text
+    return (synthesised_rate, synthesised_speech.T),translated_text,label_outputs
 
 title = "Multilanguage to Chinese(mandarin) Cascaded STST"
 description = """
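
Note on the hunk above: a transformers audio-classification pipeline returns a list of {"label": ..., "score": ...} dicts sorted by score, which the new loop flattens into a plain {label: score} mapping for display. A minimal sketch of the language-ID step in isolation, assuming a mono 16 kHz recording at the hypothetical path "sample.wav":

import soundfile as sf
from transformers import pipeline

# Language-ID pipeline over the same MMS checkpoint the commit adds.
lid = pipeline("audio-classification", model="facebook/mms-lid-126", device="cpu")

# "sample.wav" is a stand-in path used only for illustration.
audio, sampling_rate = sf.read("sample.wav")
predictions = lid({"array": audio, "sampling_rate": sampling_rate})

# Each entry looks like {"label": "eng", "score": 0.97}; flatten as translate() does.
label_outputs = {pred["label"]: pred["score"] for pred in predictions}
print(label_outputs)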
@@ -51,22 +55,24 @@ import gradio as gr
 
 demo = gr.Blocks()
 file_transcribe = gr.Interface(
-    fn=speech_to_speech_translation_fix,
+    fn=speech_to_speech_translation,
     inputs=gr.Audio(source="upload", type="filepath"),
     outputs=[
         gr.Audio(label="Generated Speech", type="numpy"),
         gr.Text(label="Transcription"),
+        gr.Label(label="Language prediction"),
     ],
     title=title,
     description=description,
     examples=examples,
 )
 mic_transcribe = gr.Interface(
-    fn=speech_to_speech_translation_fix,
+    fn=speech_to_speech_translation,
     inputs=gr.Audio(source="microphone", type="filepath"),
     outputs=[
         gr.Audio(label="Generated Speech", type="numpy"),
         gr.Text(label="Transcription"),
+        gr.Label(label="Language prediction"),
     ],
     title=title,
     description=description,
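
Note on the Gradio wiring: with type="numpy", gr.Audio expects a (sample_rate, data) tuple whose data is an int16 array shaped (samples,) or (samples, channels). That is why the old speech_to_speech_translation_fix wrapper could be dropped: the transpose of Bark's (1, num_samples) batch output now happens inside speech_to_speech_translation itself. A small sketch of the conversion, using a silent dummy waveform in place of real Bark output:

import numpy as np

# Dummy stand-in for Bark's generate() output: float32 in [-1, 1], shape (1, num_samples).
speech_output = np.zeros((1, 24000), dtype=np.float32)

# Scale to 16-bit PCM as the commit does, then transpose so Gradio receives
# (num_samples, num_channels) rather than a (1, num_samples) batch.
pcm = (speech_output * 32767).astype(np.int16).T
print(pcm.shape)  # (24000, 1)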
 
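The third output component works because gr.Label accepts a {label: confidence} dict, which is exactly what translate() now returns alongside the transcription. A self-contained sketch with made-up confidences standing in for real MMS LID scores:

import gradio as gr

with gr.Blocks() as demo:
    btn = gr.Button("Predict")
    out = gr.Label(label="Language prediction")
    # Hypothetical fixed confidences, for illustration only.
    btn.click(fn=lambda: {"cmn": 0.85, "eng": 0.10, "yue": 0.05}, inputs=None, outputs=out)

demo.launch()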