panlr committed on
Commit 6336b0f · verified · 1 Parent(s): 276b406

Upload app.py

Files changed (1)
  1. app.py +99 -80
app.py CHANGED
@@ -1,80 +1,99 @@
- import torch
- import gradio as gr
- from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-
-
-
- # Load the model and processor
- def load_model(model_path, use_gpu=True, use_flash_attention_2=False, use_bettertransformer=False):
-     device = "cuda:0" if torch.cuda.is_available() and use_gpu else "cpu"
-     torch_dtype = torch.float16 if torch.cuda.is_available() and use_gpu else torch.float32
-
-     processor = AutoProcessor.from_pretrained(model_path)
-     model = AutoModelForSpeechSeq2Seq.from_pretrained(
-         model_path, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True,
-         use_flash_attention_2=use_flash_attention_2
-     )
-     if use_bettertransformer and not use_flash_attention_2:
-         model = model.to_bettertransformer()
-
-
-     model.to(device)
-
-     return processor, model, device, torch_dtype
-
- # Initialize the model
- processor, model, device, torch_dtype = load_model(
-     model_path=r"panlr/whisper-finetune-teochew",
-     use_gpu=True,
-     use_flash_attention_2=False,
-     use_bettertransformer=False
- )
-
- # Create the inference pipeline
- infer_pipe = pipeline(
-     "automatic-speech-recognition",
-     model=model,
-     tokenizer=processor.tokenizer,
-     feature_extractor=processor.feature_extractor,
-     max_new_tokens=128,
-     chunk_length_s=30,
-     batch_size=16,
-     torch_dtype=torch_dtype,
-     device=device
- )
-
- # Define the inference function
- def transcribe_audio(audio_path, num_beams=1):
-     generate_kwargs = {"num_beams": num_beams}
-
-     forced_decoder_ids = [
-         (1, processor.tokenizer.encode("<|startoftranscript|>")[0]),
-         (2, processor.tokenizer.encode("<|zh|>")[0]),
-         (3, processor.tokenizer.encode("<|transcribe|>")[0]),
-     ]
-     model.generation_config.forced_decoder_ids = forced_decoder_ids
-     # if language is not None:
-     #     generate_kwargs["language"] = language
-
-     result = infer_pipe(audio_path, return_timestamps=False, generate_kwargs=generate_kwargs)
-
-     return result['text']
-
- # Gradio interface
- def gradio_interface(audio):
-     return transcribe_audio(audio)
-
- # Create the Gradio app
- interface = gr.Interface(
-     fn=gradio_interface,
-     inputs=[
-         gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input audio"),
-     ],
-     outputs=gr.Textbox(label="Transcription"),
-     title="Whisper Teochew Orthographic Speech Transcription",
-     description="Upload an audio file or use the microphone to transcribe speech."
- )
-
- # Launch the Gradio app
- interface.launch()
-
+ import torch
+ import gradio as gr
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+
+
+
+ # Load the model and processor
+ def load_model(model_path, use_gpu=True, use_flash_attention_2=False, use_bettertransformer=False):
+     device = "cuda:0" if torch.cuda.is_available() and use_gpu else "cpu"
+     torch_dtype = torch.float16 if torch.cuda.is_available() and use_gpu else torch.float32
+
+     processor = AutoProcessor.from_pretrained(model_path)
+     model = AutoModelForSpeechSeq2Seq.from_pretrained(
+         model_path, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True,
+         use_flash_attention_2=use_flash_attention_2
+     )
+     if use_bettertransformer and not use_flash_attention_2:
+         model = model.to_bettertransformer()
+
+
+     model.to(device)
+
+     return processor, model, device, torch_dtype
+
+ # Initialize the model
+ processor, model, device, torch_dtype = load_model(
+     model_path=r"panlr/whisper-finetune-teochew",
+     use_gpu=True,
+     use_flash_attention_2=False,
+     use_bettertransformer=False
+ )
+
+ # Create the inference pipeline (30 s chunking keeps long audio within Whisper's input window)
+ infer_pipe = pipeline(
+     "automatic-speech-recognition",
+     model=model,
+     tokenizer=processor.tokenizer,
+     feature_extractor=processor.feature_extractor,
+     max_new_tokens=128,
+     chunk_length_s=30,
+     batch_size=16,
+     torch_dtype=torch_dtype,
+     device=device
+ )
+
+ # Define the inference function
+ def transcribe_audio(audio_path, num_beams=1):
+     generate_kwargs = {"num_beams": num_beams}
+
+     # Force Chinese transcription. get_decoder_prompt_ids returns the
+     # (position, token_id) pairs Whisper expects after <|startoftranscript|>;
+     # tokenizer.encode("<|zh|>")[0] would return the prepended
+     # <|startoftranscript|> prefix id rather than the language token.
+     model.generation_config.forced_decoder_ids = processor.get_decoder_prompt_ids(
+         language="zh", task="transcribe"
+     )
+
+     result = infer_pipe(audio_path, return_timestamps=False, generate_kwargs=generate_kwargs)
+
+     return result['text']
+
+ # Gradio interface
+ def gradio_interface(audio):
+     return transcribe_audio(audio)
+
+
+ # Build the Gradio interface (Blocks layout)
+ with gr.Blocks() as interface:
+     gr.Markdown("## Whisper Teochew Orthographic Speech Transcription")
+
+     audio_input = gr.Audio(
+         sources=["microphone", "upload"],
+         type="filepath",
+         label="Input audio",
+         value="./example.wav"  # default example audio file
+     )
+
+     output_text = gr.Textbox(label="Transcription")
+
+     # Usage notes shown below the input widget
+     gr.Markdown("""
+     📢 **Usage notes**
+     - This demo runs on a CPU, so inference is slow. Recognition is fairly good for formal speech; colloquial and idiomatic Teochew still needs more training data.
+     - You can **upload an audio file** or **use the microphone** as input.
+     - Audio works best when pronunciation is clear and standard.
+     - A sample audio file is loaded by default; click "Submit" to see its transcription.
+     - Reference text for the sample audio: 【状元 林大钦,兵部尚(siên7)书 翁万达,了佮 工部 左侍郎(se6 neng5) 陈一松,拢是 嘉靖 年间 介 进士】
+     """)
+
+     submit_btn = gr.Button("Submit")
+     submit_btn.click(gradio_interface, inputs=audio_input, outputs=output_text)
+
+ # Launch the Gradio app
+ interface.launch()
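
For reviewers who want to sanity-check the committed pipeline outside Gradio, here is a minimal sketch. It assumes only the panlr/whisper-finetune-teochew checkpoint referenced in the code and a local example.wav like the one the new UI bundles; it is illustrative, not part of the commit:

    import torch
    from transformers import pipeline

    # Load the same checkpoint app.py uses; fall back to CPU as load_model() does.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    asr = pipeline(
        "automatic-speech-recognition",
        model="panlr/whisper-finetune-teochew",
        torch_dtype=torch.float16 if device != "cpu" else torch.float32,
        chunk_length_s=30,
        device=device,
    )

    # Transcribe the sample; num_beams=1 mirrors transcribe_audio()'s default.
    print(asr("example.wav", generate_kwargs={"num_beams": 1})["text"])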