Spaces: commit "Upload app.py"

app.py (CHANGED, hunk @@ -1,80 +1,99 @@). The previous revision is identical to the first 65 lines below; the commit appends the Gradio UI (new lines 66-99). Updated file:
import torch
import gradio as gr
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Load the model and processor
def load_model(model_path, use_gpu=True, use_flash_attention_2=False, use_bettertransformer=False):
    device = "cuda:0" if torch.cuda.is_available() and use_gpu else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() and use_gpu else torch.float32

    processor = AutoProcessor.from_pretrained(model_path)
    # Note: use_flash_attention_2 is deprecated in newer transformers releases
    # in favor of attn_implementation="flash_attention_2".
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_path, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True,
        use_flash_attention_2=use_flash_attention_2
    )
    if use_bettertransformer and not use_flash_attention_2:
        model = model.to_bettertransformer()

    model.to(device)

    return processor, model, device, torch_dtype

# Initialize the model
processor, model, device, torch_dtype = load_model(
    model_path=r"panlr/whisper-finetune-teochew",
    use_gpu=True,
    use_flash_attention_2=False,
    use_bettertransformer=False
)

# Create the inference pipeline
infer_pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device
)

# Inference function
def transcribe_audio(audio_path, num_beams=1):
    generate_kwargs = {"num_beams": num_beams}

    # Force Chinese transcription output. convert_tokens_to_ids looks each special
    # token up directly; tokenizer.encode() would prepend the default prompt tokens,
    # so encode("<|zh|>")[0] would return <|startoftranscript|> rather than <|zh|>.
    # <|startoftranscript|> is already the decoder start token, so the forced ids
    # begin at position 1 with the language token, followed by the task token.
    forced_decoder_ids = [
        (1, processor.tokenizer.convert_tokens_to_ids("<|zh|>")),
        (2, processor.tokenizer.convert_tokens_to_ids("<|transcribe|>")),
    ]
    model.generation_config.forced_decoder_ids = forced_decoder_ids
    # if language is not None:
    #     generate_kwargs["language"] = language

    result = infer_pipe(audio_path, return_timestamps=False, generate_kwargs=generate_kwargs)

    return result['text']
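
# A quick sketch of direct (non-UI) use, e.g. from a Python shell:
#   text = transcribe_audio("example.wav", num_beams=5)
# Larger num_beams trades speed for accuracy; num_beams=1 (greedy) is the default here.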

# Gradio interface wrapper
def gradio_interface(audio):
    return transcribe_audio(audio)

# Build the Gradio UI
with gr.Blocks() as interface:
    gr.Markdown("## Whisper Teochew (standard orthography) speech transcription")

    audio_input = gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Input audio",
        value="./example.wav"  # default example audio file
    )

    output_text = gr.Textbox(label="Transcription result")

    # Usage notes shown below the input widget
    gr.Markdown("""
    📢 **Usage notes**
    - This demo runs on CPU, so inference is slow. Recognition is fairly good for formal speech; colloquial and idiomatic Teochew still needs more training data.
    - You can **upload an audio file** or **record with the microphone**.
    - Audio should be clearly and carefully enunciated.
    - An example clip is loaded by default; click "Submit" to see its transcription.
    - Reference transcript of the example audio: 【状元 林大钦,兵部尚(siên7)书 翁万达,了佮 工部 左侍郎(se6 neng5) 陈一松,拢是 嘉靖 年间 介 进士】
    """)

    submit_btn = gr.Button("Submit")
    submit_btn.click(gradio_interface, inputs=audio_input, outputs=output_text)

# Launch the Gradio app
interface.launch()
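
For reference, a minimal sketch of calling the Space programmatically with gradio_client. The Space id below is a placeholder, handle_file assumes gradio_client >= 1.0, and fn_index=0 assumes the submit click is the app's only registered event:

    from gradio_client import Client, handle_file

    client = Client("user/space-id")    # hypothetical Space id, replace with the real one
    text = client.predict(
        handle_file("my_clip.wav"),     # local audio file to transcribe
        fn_index=0,                     # the submit_btn.click event defined above
    )
    print(text)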
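
The Space also needs a requirements.txt for its dependencies; a minimal sketch, assuming recent library versions (pin exact versions as appropriate):

    # requirements.txt (sketch)
    torch
    transformers
    accelerate    # required for low_cpu_mem_usage=True in from_pretrained
    gradio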