zongxiao committed on
Commit
b9359f0
·
1 Parent(s): 1237c43

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -43
app.py CHANGED
@@ -21,19 +21,14 @@ def synthesise(text_prompt,voice_preset="v2/zh_speaker_1"):
21
  inputs = processor(text_prompt, voice_preset=voice_preset)
22
  speech_output = model.generate(**inputs.to(device),pad_token_id=10000)
23
  return speech_output
24
- def speech_to_speech_translation(audio):
25
- translated_text = translate(audio)
26
- synthesised_speech = synthesise(translated_text)
27
- synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
28
- return 16000, synthesised_speech
29
  def speech_to_speech_translation(audio,voice_preset="v2/zh_speaker_1"):
30
  translated_text = translate(audio)
31
  synthesised_speech = synthesise(translated_text,voice_preset)
32
  synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
33
- return synthesised_rate , synthesised_speech
34
  def speech_to_speech_translation_fix(audio,voice_preset="v2/zh_speaker_1"):
35
- synthesised_rate,synthesised_speech = speech_to_speech_translation(audio,voice_preset)
36
- return synthesised_rate,synthesised_speech.T
37
 
38
  title = "Multilanguage to Chinese(mandarin) Cascaded STST"
39
  description = """
@@ -41,52 +36,48 @@ Demo for cascaded speech-to-speech translation (STST), mapping from source speec
41
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
42
  """
43
  examples = [
44
- ["./mama (1).mp3", None],
45
- ["./mama (2).mp3", None],
46
- ["./mama (3).mp3", None],
47
- ["./mama (4).mp3", None],
48
- ["./mama (5).mp3", None],
49
- ["./mama (6).mp3", None],
50
- ["./mama (7).mp3", None],
51
- ["./mama (8).mp3", None],
 
 
 
 
 
52
  ]
53
  import gradio as gr
54
- demo = gr.Blocks()
55
-
56
- # Muti_translate=gr.Interface(
57
- # fn=speech_to_speech_translation_fix,
58
- # inputs=[
59
- # gr.Audio(label="Upload Speech", source="upload", type="filepath"),
60
- # gr.Audio(label="Record Speech", source="microphone", type="filepath"),
61
- # ],
62
- # outputs=[
63
- # gr.Audio(label="Generated Speech", type="numpy"),
64
- # gr.Text(label="Transcription"),
65
- # ],
66
- # title=title,
67
- # description=description,
68
- # examples=examples,
69
- # )
70
 
71
- mic_translate = gr.Interface(
 
72
  fn=speech_to_speech_translation_fix,
73
- inputs=gr.Audio(source="microphone", type="filepath"),
74
- outputs=gr.Audio(label="Generated Speech", type="numpy"),
 
 
 
75
  title=title,
76
  description=description,
 
77
  )
78
-
79
- file_translate = gr.Interface(
80
  fn=speech_to_speech_translation_fix,
81
- inputs=gr.Audio(source="upload", type="filepath"),
82
- outputs=gr.Audio(label="Generated Speech", type="numpy"),
83
- examples=examples,
 
 
84
  title=title,
85
  description=description,
86
  )
87
-
88
  with demo:
89
- gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
90
- #gr.TabbedInterface([Muti_translate], ["Record or upload your speech"])
 
 
91
 
92
  demo.launch(share=True)
 
21
  inputs = processor(text_prompt, voice_preset=voice_preset)
22
  speech_output = model.generate(**inputs.to(device),pad_token_id=10000)
23
  return speech_output
 
 
 
 
 
24
  def speech_to_speech_translation(audio,voice_preset="v2/zh_speaker_1"):
25
  translated_text = translate(audio)
26
  synthesised_speech = synthesise(translated_text,voice_preset)
27
  synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
28
+ return synthesised_rate , synthesised_speech ,translated_text
29
  def speech_to_speech_translation_fix(audio,voice_preset="v2/zh_speaker_1"):
30
+ synthesised_rate,synthesised_speech,translated_text = speech_to_speech_translation(audio,voice_preset)
31
+ return (synthesised_rate,synthesised_speech.T),translated_text
32
 
33
  title = "Multilanguage to Chinese(mandarin) Cascaded STST"
34
  description = """
 
36
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
37
  """
38
  examples = [
39
+ ["./cs-CZ.wav", None],
40
+ ["./de-DE.wav", None],
41
+ ["./en-AU.wav", None],
42
+ ["./en-GB.wav", None],
43
+ ["./en-US.wav", None],
44
+ ["./es-ES.wav", None],
45
+ ["./fr-FR.wav", None],
46
+ ["./it-IT.wav", None],
47
+ ["./ko-KR.wav", None],
48
+ ["./nl-NL.wav", None],
49
+ ["./pl-PL.wav", None],
50
+ ["./pt-PT.wav", None],
51
+ ["./ru-RU.wav", None],
52
  ]
53
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
+ demo = gr.Blocks()
56
+ file_transcribe = gr.Interface(
57
  fn=speech_to_speech_translation_fix,
58
+ inputs=gr.Audio(source="upload", type="filepath"),
59
+ outputs=[
60
+ gr.Audio(label="Generated Speech", type="numpy"),
61
+ gr.Text(label="Transcription"),
62
+ ],
63
  title=title,
64
  description=description,
65
+ examples=examples,
66
  )
67
+ mic_transcribe = gr.Interface(
 
68
  fn=speech_to_speech_translation_fix,
69
+ inputs=gr.Audio(source="microphone", type="filepath"),
70
+ outputs=[
71
+ gr.Audio(label="Generated Speech", type="numpy"),
72
+ gr.Text(label="Transcription"),
73
+ ],
74
  title=title,
75
  description=description,
76
  )
 
77
  with demo:
78
+ gr.TabbedInterface(
79
+ [file_transcribe, mic_transcribe],
80
+ ["Transcribe Audio File", "Transcribe Microphone"],
81
+ )
82
 
83
  demo.launch(share=True)