import spaces
import os
import re

import gradio as gr
import torch
import librosa
import numpy as np

from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
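
# Gradio demo for Qwen2-Audio: upload an audio clip and the app returns the model's
# reasoning trace, semantic elements, and final description, parsed out of the
# <think>, <semantic_elements>, and <answer> tags in the generated text.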

# Local checkpoint directory; defined here but not used below
# (the base Instruct model is loaded instead).
model_path = "./model"
base_model_id = "Qwen/Qwen2-Audio-7B-Instruct"


def load_model():
    processor = AutoProcessor.from_pretrained(
        base_model_id,
        trust_remote_code=True,
    )

    model = Qwen2AudioForConditionalGeneration.from_pretrained(
        base_model_id,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
    )
    model.eval()

    return model, processor


model, processor = load_model()
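
# Note: if the weights in `model_path` are what should actually be served, one option
# (assuming ./model holds a full Qwen2-Audio checkpoint in Hugging Face format) would
# be to pass `model_path` instead of `base_model_id` to `from_pretrained(...)` above.
# That is an assumption about the checkpoint layout, not something this script verifies.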


def extract_components(text):
    """Split the generated text into its <think>, <semantic_elements>, and <answer> parts."""
    thinking = ""
    semantic = ""
    answer = ""

    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if think_match:
        thinking = think_match.group(1).strip()

    semantic_match = re.search(r"<semantic_elements>(.*?)</semantic_elements>", text, re.DOTALL)
    if semantic_match:
        semantic = semantic_match.group(1).strip()

    answer_match = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
    if answer_match:
        answer = answer_match.group(1).strip()

    return thinking, semantic, answer
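
# Hypothetical example of the expected tag format and what extract_components returns:
#
#   text = ("<think>low rumble, then a voice</think>"
#           "<semantic_elements>engine; male speech</semantic_elements>"
#           "<answer>A car idles while a man speaks.</answer>")
#   extract_components(text)
#   -> ("low rumble, then a voice", "engine; male speech", "A car idles while a man speaks.")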


@spaces.GPU
def process_audio(audio_file):
    # Load the clip at its native rate, then resample to the 16 kHz expected by Qwen2-Audio.
    y, sr = librosa.load(audio_file, sr=None)

    if sr != 16000:
        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
        sr = 16000

    # librosa.load returns mono by default; collapse any extra channels just in case.
    if y.ndim > 1:
        y = librosa.to_mono(y)

    sampling_rate = 16000

    conversation = [
        {"role": "user", "content": [
            {"type": "audio", "audio": y},
            {"type": "text", "text": "Describe the audio in detail."}
        ]}
    ]

    chat_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)

    inputs = processor(
        text=chat_text,
        audios=[y],
        return_tensors="pt",
        sampling_rate=sampling_rate,
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=768,
            do_sample=False,
        )

    # Decode with special tokens kept so the assistant turn can be isolated,
    # then strip the chat markers.
    generated_text = processor.tokenizer.decode(outputs[0], skip_special_tokens=False)
    assistant_text = generated_text.split("<|im_start|>assistant\n")[-1].split("<|im_end|>")[0].strip()

    thinking, semantic, answer = extract_components(assistant_text)

    return thinking, semantic, answer
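
# An equivalent way to recover the assistant turn (a sketch, not what the code above
# does) is to decode only the newly generated tokens by slicing off the prompt, e.g.:
#
#   new_tokens = outputs[:, inputs["input_ids"].shape[1]:]
#   assistant_text = processor.tokenizer.decode(new_tokens[0], skip_special_tokens=True)
#
# assuming the <think>/<semantic_elements>/<answer> tags are ordinary text rather than
# registered special tokens, so skip_special_tokens does not strip them.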


demo = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=[
        gr.Textbox(label="Thinking Process", lines=10),
        gr.Textbox(label="Semantic Elements", lines=5),
        gr.Textbox(label="Answer", lines=5),
    ],
    title="Qwen2Audio Audio Description Demo",
    description="Upload an audio file and the model will provide a detailed analysis and description.",
    examples=[],
    cache_examples=False,
)
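
# On Spaces it is common (though not required) to enable the request queue before
# launching, e.g. `demo.queue().launch()`, so concurrent GPU requests are handled
# in order; the plain `launch()` below works as well.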


if __name__ == "__main__":
    demo.launch()