speech-to-speech

Paused

App Files Files Community

zongxiao committed on Oct 12, 2023

Commit

282149c

1 Parent(s): 0035001

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -13

app.py CHANGED Viewed

@@ -1,3 +1,87 @@
 import torch
 import numpy as np
 import soundfile as sf
@@ -10,33 +94,34 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
 pipe = pipeline(
     "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
 )
-label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
 processor = AutoProcessor.from_pretrained("suno/bark")
 model = BarkModel.from_pretrained("suno/bark")
 model = model.to(device)
 synthesised_rate = model.generation_config.sample_rate
 def translate(audio_file):
-    audio, sampling_rate = sf.read(audio_file)
     outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe","language":"chinese"})
-    language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
-    label_outputs = {}
-    for pred in language_prediction:
-        label_outputs[pred["label"]] = pred["score"]
-    return outputs["text"],label_outputs
 def synthesise(text_prompt,voice_preset="v2/zh_speaker_1"):
     inputs = processor(text_prompt, voice_preset=voice_preset)
     speech_output = model.generate(**inputs.to(device),pad_token_id=10000)
     return speech_output
 def speech_to_speech_translation(audio,voice_preset="v2/zh_speaker_1"):
-    translated_text, label_outputs= translate(audio)
     synthesised_speech = synthesise(translated_text,voice_preset)
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
-    return (synthesised_rate , synthesised_speech.T),translated_text,label_outputs
 title = "外国话转中文话"
 description = """
-作为[Hugging Face Audio course](https://huggingface.co/learn/audio-course/chapter0/introduction) 的结课大作业，本演示调用了三个自然语言处理的大模型，一个用于将外国话翻译成中文，一个用于判断说的哪个国家的话，一个用于将中文转成语音输出。演示同时支持语音上传和麦克风输入，转换速度比较慢因为租不起GPU的服务器（支出增加20倍），建议您通过已经缓存Examples体验效果。欢迎添加我的微信号：ESGGTP 与我的平行人交流。
 ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
 """
@@ -46,7 +131,10 @@ examples = [
     ["./de.mp3", None],
     ["./fr.mp3", None],
     ["./it.mp3", None],
 ]
 import gradio as gr
@@ -57,7 +145,7 @@ file_transcribe = gr.Interface(
     outputs=[
         gr.Audio(label="Generated Speech", type="numpy"),
         gr.Text(label="Transcription"),
-        gr.Label(label="Language prediction"),
     ],
     title=title,
     description=description,
@@ -69,7 +157,7 @@ mic_transcribe = gr.Interface(
     outputs=[
         gr.Audio(label="Generated Speech", type="numpy"),
         gr.Text(label="Transcription"),
-        gr.Label(label="Language prediction"),
     ],
     title=title,
     description=description,

+# import torch
+# import numpy as np
+# import soundfile as sf
+# from transformers import pipeline
+# from transformers import BarkModel
+# from transformers import AutoProcessor
+# device = "cuda:0" if torch.cuda.is_available() else "cpu"
+# pipe = pipeline(
+#     "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
+# )
+# label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
+# processor = AutoProcessor.from_pretrained("suno/bark")
+# model = BarkModel.from_pretrained("suno/bark")
+# model = model.to(device)
+# synthesised_rate = model.generation_config.sample_rate
+# def translate(audio_file):
+#     audio, sampling_rate = sf.read(audio_file)
+#     outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe","language":"chinese"})
+#     language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
+#     label_outputs = {}
+#     for pred in language_prediction:
+#         label_outputs[pred["label"]] = pred["score"]
+#     return outputs["text"],label_outputs
+# def synthesise(text_prompt,voice_preset="v2/zh_speaker_1"):
+#     inputs = processor(text_prompt, voice_preset=voice_preset)
+#     speech_output = model.generate(**inputs.to(device),pad_token_id=10000)
+#     return speech_output
+# def speech_to_speech_translation(audio,voice_preset="v2/zh_speaker_1"):
+#     translated_text, label_outputs= translate(audio)
+#     synthesised_speech = synthesise(translated_text,voice_preset)
+#     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
+#     return (synthesised_rate , synthesised_speech.T),translated_text,label_outputs
+# title = "外国话转中文话"
+# description = """
+# 作为[Hugging Face Audio course](https://huggingface.co/learn/audio-course/chapter0/introduction) 的结课大作业，本演示调用了三个自然语言处理的大模型，一个用于将外国话翻译成中文，一个用于判断说的哪个国家的话，一个用于将中文转成语音输出。演示同时支持语音上传和麦克风输入，转换速度比较慢因为租不起GPU的服务器（支出增加20倍），建议您通过已经缓存Examples体验效果。欢迎添加我的微信号：ESGGTP 与我的平行人交流。
+# ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
+# """
+# examples = [
+#     ["./en.mp3", None],
+#     ["./de.mp3", None],
+#     ["./fr.mp3", None],
+#     ["./it.mp3", None],
+# ]
+# import gradio as gr
+# demo = gr.Blocks()
+# file_transcribe = gr.Interface(
+#     fn=speech_to_speech_translation,
+#     inputs=gr.Audio(source="upload", type="filepath"),
+#     outputs=[
+#         gr.Audio(label="Generated Speech", type="numpy"),
+#         gr.Text(label="Transcription"),
+#         gr.Label(label="Language prediction"),
+#     ],
+#     title=title,
+#     description=description,
+#     examples=examples,
+# )
+# mic_transcribe = gr.Interface(
+#     fn=speech_to_speech_translation,
+#     inputs=gr.Audio(source="microphone", type="filepath"),
+#     outputs=[
+#         gr.Audio(label="Generated Speech", type="numpy"),
+#         gr.Text(label="Transcription"),
+#         gr.Label(label="Language prediction"),
+#     ],
+#     title=title,
+#     description=description,
+# )
+# with demo:
+#     gr.TabbedInterface(
+#         [file_transcribe, mic_transcribe],
+#         ["Transcribe Audio File", "Transcribe Microphone"],
+#     )
+# demo.launch(share=True)
+###########################################################################################################################
 import torch
 import numpy as np
 import soundfile as sf
 # Speech-recognition pipeline backed by the openai/whisper-large-v2 checkpoint,
 # placed on `device`.
 pipe = pipeline(
     "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
 )
 # The audio-classification (facebook/mms-lid-126) pipeline below is disabled.
+#label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
 # Bark processor and model, both loaded from the suno/bark checkpoint; the model
 # is then moved onto `device`.
 processor = AutoProcessor.from_pretrained("suno/bark")
 model = BarkModel.from_pretrained("suno/bark")
 model = model.to(device)
 # Sample rate taken from the Bark generation config; it is returned alongside
 # the generated waveform by speech_to_speech_translation.
 synthesised_rate = model.generation_config.sample_rate
 def translate(audio_file):
     """Run the speech-recognition pipeline on an audio input and return its text.

     Args:
         audio_file: Audio input forwarded unchanged to ``pipe``. Expected to be
             a path to an audio file -- TODO confirm against the Gradio input
             component's ``type``.

     Returns:
         The ``"text"`` entry of the pipeline output.
     """
     # The previous body referenced ``audio``, which was only defined by a
     # ``sf.read(audio_file)`` call that has since been commented out, so every
     # call raised NameError. Hand the input to the pipeline directly instead;
     # the ASR pipeline accepts a filename and decodes the audio itself.
     outputs = pipe(
         audio_file,
         max_new_tokens=256,
         generate_kwargs={"task": "transcribe", "language": "chinese"},
     )
     return outputs["text"]
 def synthesise(text_prompt, voice_preset="v2/zh_speaker_1"):
     """Generate speech for ``text_prompt`` with the Bark model.

     Args:
         text_prompt: Text passed to the Bark processor.
         voice_preset: Voice preset name passed to the processor.

     Returns:
         Whatever ``model.generate`` returns for the processed prompt.
     """
     # Build the model inputs and move them to the same device as the model.
     bark_inputs = processor(text_prompt, voice_preset=voice_preset).to(device)
     return model.generate(**bark_inputs, pad_token_id=10000)
 def speech_to_speech_translation(audio, voice_preset="v2/zh_speaker_1"):
     """Turn input speech into text and then into synthesised speech.

     Args:
         audio: Audio input forwarded to ``translate``.
         voice_preset: Voice preset name forwarded to ``synthesise``.

     Returns:
         A 2-tuple ``((synthesised_rate, waveform), translated_text)`` where
         ``waveform`` is the generated audio as a transposed ``int16`` array.
     """
     translated_text = translate(audio)
     synthesised_speech = synthesise(translated_text, voice_preset)
     # The model may live on a CUDA device; a CUDA tensor cannot be converted
     # with ``.numpy()`` directly, so copy it to host memory first (``.cpu()``
     # is a no-op for tensors that are already on the CPU).
     waveform = synthesised_speech.cpu().numpy()
     # Scale to the int16 range -- presumably the model emits floats in
     # [-1, 1]; TODO confirm.
     synthesised_speech = (waveform * 32767).astype(np.int16)
     return (synthesised_rate, synthesised_speech.T), translated_text
 # Title and description text for the demo; both are passed to the interface
 # definitions below as `title=` and `description=`.
 title = "外国话转中文话"
 description = """
+作为[Hugging Face Audio course](https://github.com/danfouer/HFAudioCourse) 的结课大作业，本演示调用了两个自然语言处理的大模型，一个用于将外国话翻译成中文，一个用于将中文转成语音输出。演示同时支持语音上传和麦克风输入，转换速度比较慢因为租不起GPU的服务器（支出增加20倍），建议您通过已经缓存Examples体验效果。欢迎添加我的微信号：ESGGTP 与我的平行人交流。
 ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
 """
     ["./de.mp3", None],
     ["./fr.mp3", None],
     ["./it.mp3", None],
+    ["./nl.mp3", None],
+    ["./fi.mp3", None],
+    ["./cs.mp3", None],
+    ["./pl.mp3", None],
 ]
 import gradio as gr
     outputs=[
         gr.Audio(label="Generated Speech", type="numpy"),
         gr.Text(label="Transcription"),
+#        gr.Label(label="Language prediction"),
     ],
     title=title,
     description=description,
     outputs=[
         gr.Audio(label="Generated Speech", type="numpy"),
         gr.Text(label="Transcription"),
+#        gr.Label(label="Language prediction"),
     ],
     title=title,
     description=description,