zongxiao committed
Commit 282149c · Parent(s): 0035001

Update app.py

Files changed (1):
  1. app.py +101 -13
app.py CHANGED
@@ -1,3 +1,87 @@
+ # import torch
+ # import numpy as np
+ # import soundfile as sf
+ # from transformers import pipeline
+ # from transformers import BarkModel
+ # from transformers import AutoProcessor
+
+ # device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+ # pipe = pipeline(
+ #     "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
+ # )
+ # label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
+ # processor = AutoProcessor.from_pretrained("suno/bark")
+ # model = BarkModel.from_pretrained("suno/bark")
+ # model = model.to(device)
+ # synthesised_rate = model.generation_config.sample_rate
+
+ # def translate(audio_file):
+ #     audio, sampling_rate = sf.read(audio_file)
+ #     outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "chinese"})
+ #     language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
+ #     label_outputs = {}
+ #     for pred in language_prediction:
+ #         label_outputs[pred["label"]] = pred["score"]
+ #     return outputs["text"], label_outputs
+ # def synthesise(text_prompt, voice_preset="v2/zh_speaker_1"):
+ #     inputs = processor(text_prompt, voice_preset=voice_preset)
+ #     speech_output = model.generate(**inputs.to(device), pad_token_id=10000)
+ #     return speech_output
+ # def speech_to_speech_translation(audio, voice_preset="v2/zh_speaker_1"):
+ #     translated_text, label_outputs = translate(audio)
+ #     synthesised_speech = synthesise(translated_text, voice_preset)
+ #     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
+ #     return (synthesised_rate, synthesised_speech.T), translated_text, label_outputs
+
+ # title = "外国话转中文话"
+ # description = """
+ # 作为[Hugging Face Audio course](https://huggingface.co/learn/audio-course/chapter0/introduction) 的结课大作业,本演示调用了三个自然语言处理的大模型,一个用于将外国话翻译成中文,一个用于判断说的哪个国家的话,一个用于将中文转成语音输出。演示同时支持语音上传和麦克风输入,转换速度比较慢因为租不起GPU的服务器(支出增加20倍),建议您通过已经缓存Examples体验效果。欢迎添加我的微信号:ESGGTP 与我的平行人交流。
+
+ # ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
+ # """
+
+ # examples = [
+ #     ["./en.mp3", None],
+ #     ["./de.mp3", None],
+ #     ["./fr.mp3", None],
+ #     ["./it.mp3", None],
+
+ # ]
+ # import gradio as gr
+
+ # demo = gr.Blocks()
+ # file_transcribe = gr.Interface(
+ #     fn=speech_to_speech_translation,
+ #     inputs=gr.Audio(source="upload", type="filepath"),
+ #     outputs=[
+ #         gr.Audio(label="Generated Speech", type="numpy"),
+ #         gr.Text(label="Transcription"),
+ #         gr.Label(label="Language prediction"),
+ #     ],
+ #     title=title,
+ #     description=description,
+ #     examples=examples,
+ # )
+ # mic_transcribe = gr.Interface(
+ #     fn=speech_to_speech_translation,
+ #     inputs=gr.Audio(source="microphone", type="filepath"),
+ #     outputs=[
+ #         gr.Audio(label="Generated Speech", type="numpy"),
+ #         gr.Text(label="Transcription"),
+ #         gr.Label(label="Language prediction"),
+ #     ],
+ #     title=title,
+ #     description=description,
+ # )
+ # with demo:
+ #     gr.TabbedInterface(
+ #         [file_transcribe, mic_transcribe],
+ #         ["Transcribe Audio File", "Transcribe Microphone"],
+ #     )
+
+ # demo.launch(share=True)
+ ###########################################################################################################################
import torch
import numpy as np
import soundfile as sf
@@ -10,33 +94,34 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
)
- label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
+ # label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark")
model = model.to(device)
synthesised_rate = model.generation_config.sample_rate

def translate(audio_file):
-     audio, sampling_rate = sf.read(audio_file)
-     outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "chinese"})
-     language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
-     label_outputs = {}
-     for pred in language_prediction:
-         label_outputs[pred["label"]] = pred["score"]
-     return outputs["text"], label_outputs
+     # audio, sampling_rate = sf.read(audio_file)
+     outputs = pipe(audio_file, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "chinese"})
+     # language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
+     # label_outputs = {}
+     # for pred in language_prediction:
+     #     label_outputs[pred["label"]] = pred["score"]
+     return outputs["text"]  # , label_outputs
def synthesise(text_prompt, voice_preset="v2/zh_speaker_1"):
    inputs = processor(text_prompt, voice_preset=voice_preset)
    speech_output = model.generate(**inputs.to(device), pad_token_id=10000)
    return speech_output
def speech_to_speech_translation(audio, voice_preset="v2/zh_speaker_1"):
-     translated_text, label_outputs = translate(audio)
+     # translated_text, label_outputs = translate(audio)
+     translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text, voice_preset)
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
-     return (synthesised_rate, synthesised_speech.T), translated_text, label_outputs
+     return (synthesised_rate, synthesised_speech.T), translated_text  # , label_outputs

title = "外国话转中文话"
description = """
- 作为[Hugging Face Audio course](https://huggingface.co/learn/audio-course/chapter0/introduction) 的结课大作业,本演示调用了三个自然语言处理的大模型,一个用于将外国话翻译成中文,一个用于判断说的哪个国家的话,一个用于将中文转成语音输出。演示同时支持语音上传和麦克风输入,转换速度比较慢因为租不起GPU的服务器(支出增加20倍),建议您通过已经缓存Examples体验效果。欢迎添加我的微信号:ESGGTP 与我的平行人交流。
+ 作为[Hugging Face Audio course](https://github.com/danfouer/HFAudioCourse) 的结课大作业,本演示调用了两个自然语言处理的大模型,一个用于将外国话翻译成中文,一个用于将中文转成语音输出。演示同时支持语音上传和麦克风输入,转换速度比较慢因为租不起GPU的服务器(支出增加20倍),建议您通过已经缓存Examples体验效果。欢迎添加我的微信号:ESGGTP 与我的平行人交流。

![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
@@ -46,7 +131,10 @@ examples = [
    ["./de.mp3", None],
    ["./fr.mp3", None],
    ["./it.mp3", None],
-
+     ["./nl.mp3", None],
+     ["./fi.mp3", None],
+     ["./cs.mp3", None],
+     ["./pl.mp3", None],
]
import gradio as gr

@@ -57,7 +145,7 @@ file_transcribe = gr.Interface(
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Text(label="Transcription"),
-         gr.Label(label="Language prediction"),
+         # gr.Label(label="Language prediction"),
    ],
    title=title,
    description=description,
@@ -69,7 +157,7 @@ mic_transcribe = gr.Interface(
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Text(label="Transcription"),
-         gr.Label(label="Language prediction"),
+         # gr.Label(label="Language prediction"),
    ],
    title=title,
    description=description,
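
After this commit, speech_to_speech_translation returns only the audio tuple and the translated Chinese text, since the MMS language-ID model was dropped. Below is a minimal smoke test of the cascaded pipeline (a sketch, not part of the commit): it assumes the definitions from app.py are already in scope, for example pasted into a session before the Gradio wiring runs, and that ./en.mp3 from the examples list is present locally.

# Hypothetical check of the cascaded pipeline: Whisper turns the foreign-language
# clip into Chinese text, then Bark reads that text aloud. Assumes app.py's
# definitions (pipe, model, speech_to_speech_translation, ...) are in scope.
import soundfile as sf  # already imported at the top of app.py

(rate, samples), chinese_text = speech_to_speech_translation("./en.mp3")
print(chinese_text)  # the Chinese rendering produced by Whisper
# samples is int16 with shape (num_frames, 1) after the transpose in app.py
sf.write("translated.wav", samples, rate)  # save Bark's output as a WAV file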