import spaces
import os
import re
import gradio as gr
import torch
import librosa
import numpy as np
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration, TextIteratorStreamer
import torchaudio
from threading import Thread
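
# Local fine-tuned checkpoints: "./model" backs the "Think" option and
# "./model2" the "Think + Semantics" option; both reuse the processor of the
# Qwen2-Audio-7B-Instruct base model.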
model_path_1 = "./model"
model_path_2 = "./model2"
base_model_id = "Qwen/Qwen2-Audio-7B-Instruct"

# Cache of already-loaded (model, processor) pairs, keyed by checkpoint path.
loaded_models = {}
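

# Load a checkpoint and its processor, reusing a cached copy when available.
# The processor always comes from the base Qwen2-Audio-7B-Instruct repo.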
def load_model(model_path):
    if model_path in loaded_models:
        return loaded_models[model_path]

    processor = AutoProcessor.from_pretrained(
        base_model_id,
        trust_remote_code=True,
    )

    model = Qwen2AudioForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
    )
    model.eval()

    loaded_models[model_path] = (model, processor)
    return model, processor


model, processor = load_model(model_path_1)
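

# Pretty-print streamed chunks: add newlines around the <think>,
# <semantic_elements> and <answer> tags and unescape literal "\n" sequences
# so the output renders cleanly in the textbox.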
def process_output(output):
    if "<think>" in output:
        rest = output.split("<think>")[1]
        output = "<think>\n" + rest
    elif "<semantic_elements>" in output:
        rest = output.split("<semantic_elements>")[1]
        output = "<semantic_elements>\n" + rest
    elif "<answer>" in output:
        rest = output.split("<answer>")[1]
        output = "<answer>\n" + rest
    elif "</think>" in output:
        rest = output.split("</think>")[0]
        output = rest + "\n</think>\n\n"
    elif "</semantic_elements>" in output:
        rest = output.split("</semantic_elements>")[0]
        output = rest + "\n</semantic_elements>\n\n"
    elif "</answer>" in output:
        rest = output.split("</answer>")[0]
        output = rest + "\n</answer>\n"
    output = output.replace("\\n", "\n")
    output = output.replace("\\", "\n")
    output = output.replace("\n-", "-")
    return output
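

# ZeroGPU entry point: load the uploaded audio, resample it to 16 kHz mono,
# and stream the selected model's description token by token.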
@spaces.GPU
def process_audio_streaming(audio_file, model_choice):
    model_path = model_path_1 if model_choice == "Think" else model_path_2
    model, processor = load_model(model_path)

    waveform, sr = torchaudio.load(audio_file)

    # Resample to the 16 kHz rate expected by the processor.
    if sr != 16000:
        waveform = torchaudio.functional.resample(waveform, sr, 16000)
        sr = 16000

    # Downmix multi-channel audio to mono.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    y = waveform.squeeze().numpy()
    sampling_rate = 16000

    conversation = [
        {"role": "user", "content": [
            {"type": "audio", "audio": y},
            {"type": "text", "text": "Describe the audio in detail."}
        ]}
    ]

    chat_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)

    inputs = processor(
        text=chat_text,
        audios=[y],
        return_tensors="pt",
        sampling_rate=sampling_rate,
    ).to(model.device)

    streamer = TextIteratorStreamer(
        processor.tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True
    )

    accumulated_output = ""

    # Run generation in a background thread so partial output can be yielded
    # as soon as the streamer produces it.
    with torch.no_grad():
        generate_kwargs = dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=768,
            do_sample=False,
        )
        t = Thread(target=model.generate, kwargs=generate_kwargs)
        t.start()

    for output in streamer:
        output = process_output(output)
        accumulated_output += output
        yield accumulated_output
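

# Gradio UI: upload an audio clip and pick between the "Think" and
# "Think + Semantics" checkpoints; live=True starts processing on upload.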
audio_demo = gr.Interface(
    fn=process_audio_streaming,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio"),
        gr.Radio(["Think", "Think + Semantics"], label="Select Model", value="Think + Semantics")
    ],
    outputs=gr.Textbox(label="Generated Output", lines=30),
    title="SemThink",
    description="Upload an audio file and the model will provide a detailed analysis and description. Choose between the two model versions.",
    examples=[["examples/1.wav", "Think + Semantics"]],
    cache_examples=False,
    live=True
)


if __name__ == "__main__":
    audio_demo.launch()