# SemThink / app.py (author: Gijs Wijngaard)
import spaces
import re

import gradio as gr
import torch
import librosa
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
# Model path and configuration
model_path = "./model"
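# NOTE: model_path is currently unused below; it presumably points to a local fine-tuned checkpoint.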
base_model_id = "Qwen/Qwen2-Audio-7B-Instruct"
# Load the model and processor
def load_model():
    # Load the processor from the base model
    processor = AutoProcessor.from_pretrained(
        base_model_id,
        trust_remote_code=True,
    )
    # Load the base model
    model = Qwen2AudioForConditionalGeneration.from_pretrained(
        base_model_id,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
    )
    model.eval()
    return model, processor
# Initialize model and processor
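# (Done once at module import so the weights are downloaded and ready before the first request.)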
model, processor = load_model()
# Function to extract components from model output
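# The model output is expected to contain three tagged sections:
#   <think> ... </think>, <semantic_elements> ... </semantic_elements>, <answer> ... </answer>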
def extract_components(text):
    thinking = ""
    semantic = ""
    answer = ""
    # Extract thinking
    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if think_match:
        thinking = think_match.group(1).strip()
    # Extract semantic elements
    semantic_match = re.search(r"<semantic_elements>(.*?)</semantic_elements>", text, re.DOTALL)
    if semantic_match:
        semantic = semantic_match.group(1).strip()
    # Extract answer
    answer_match = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
    if answer_match:
        answer = answer_match.group(1).strip()
    return thinking, semantic, answer
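# Example:
#   extract_components("<think>rain on a roof</think><semantic_elements>rain, roof</semantic_elements><answer>Rain falls on a roof.</answer>")
#   returns ("rain on a roof", "rain, roof", "Rain falls on a roof.")

# @spaces.GPU requests a GPU for the duration of each call when running on a Hugging Face ZeroGPU Space.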
@spaces.GPU
def process_audio(audio_file):
    # Load the audio file with librosa at its native sampling rate
    y, sr = librosa.load(audio_file, sr=None)
    # Convert to mono if the file is multi-channel (librosa returns shape (channels, samples))
    if y.ndim > 1:
        y = librosa.to_mono(y)
    # Resample to 16 kHz if needed
    if sr != 16000:
        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
        sr = 16000
    # Sampling rate for the processor
    sampling_rate = 16000
    # Build the conversation in the chat format expected by the processor
    conversation = [
        {"role": "user", "content": [
            {"type": "audio", "audio": y},
            {"type": "text", "text": "Describe the audio in detail."}
        ]}
    ]
    # Render the chat template to a prompt string
    chat_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    # Tokenize the text and extract audio features
    inputs = processor(
        text=chat_text,
        audios=[y],
        return_tensors="pt",
        sampling_rate=sampling_rate,
    ).to(model.device)
    # Generate the output deterministically (greedy decoding)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=768,
            do_sample=False,
        )
    # Decode the full sequence and keep only the assistant's turn
    generated_text = processor.tokenizer.decode(outputs[0], skip_special_tokens=False)
    assistant_text = generated_text.split("<|im_start|>assistant\n")[-1].split("<|im_end|>")[0].strip()
    # Extract the tagged components
    thinking, semantic, answer = extract_components(assistant_text)
    return thinking, semantic, answer
# Create Gradio interface
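# The three values returned by process_audio fill the three output textboxes, in order.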
demo = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=[
        gr.Textbox(label="Thinking Process", lines=10),
        gr.Textbox(label="Semantic Elements", lines=5),
        gr.Textbox(label="Answer", lines=5),
    ],
    title="Qwen2Audio Audio Description Demo",
    description="Upload an audio file and the model will provide a detailed analysis and description.",
    examples=[],  # Add example files here if available
    cache_examples=False,
)
# Launch the app
if __name__ == "__main__":
    demo.launch()