import spaces
import os
import re
import gradio as gr
import torch
import librosa
import numpy as np
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
# Model path and configuration
model_path = "./model"
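# NOTE: model_path is currently unused; load_model() below loads the base model directly.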
base_model_id = "Qwen/Qwen2-Audio-7B-Instruct"
# Load the model and processor
def load_model():
    # Load the processor from the base model
    processor = AutoProcessor.from_pretrained(
        base_model_id,
        trust_remote_code=True,
    )
    # Load the base model
    model = Qwen2AudioForConditionalGeneration.from_pretrained(
        base_model_id,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
    )
    model.eval()
    return model, processor
# Initialize model and processor
model, processor = load_model()
# Function to extract components from model output
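# The regexes below parse a response of the form:
#   <think>...</think> <semantic_elements>...</semantic_elements> <answer>...</answer>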
def extract_components(text):
    thinking = ""
    semantic = ""
    answer = ""
    # Extract thinking
    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if think_match:
        thinking = think_match.group(1).strip()
    # Extract semantic elements
    semantic_match = re.search(r"<semantic_elements>(.*?)</semantic_elements>", text, re.DOTALL)
    if semantic_match:
        semantic = semantic_match.group(1).strip()
    # Extract answer
    answer_match = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
    if answer_match:
        answer = answer_match.group(1).strip()
    return thinking, semantic, answer
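# @spaces.GPU requests a GPU for the duration of each call when running on Hugging Face ZeroGPU Spaces.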
@spaces.GPU
def process_audio(audio_file):
    # Load and process the audio with librosa at its native sampling rate
    y, sr = librosa.load(audio_file, sr=None)
    # Resample to 16kHz if needed
    if sr != 16000:
        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
        sr = 16000
    # Convert to mono if stereo
    if y.ndim > 1:  # librosa returns shape (channels, samples) for multi-channel audio
        y = librosa.to_mono(y)
    # Set sampling rate for the processor
    sampling_rate = 16000
    # Create conversation format
    conversation = [
        {"role": "user", "content": [
            {"type": "audio", "audio": y},
            {"type": "text", "text": "Describe the audio in detail."}
        ]}
    ]
    # Format the chat
    chat_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    # Process the inputs
    inputs = processor(
        text=chat_text,
        audios=[y],
        return_tensors="pt",
        sampling_rate=sampling_rate,
    ).to(model.device)
    # Generate the output
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=768,
            do_sample=False,
        )
    # Decode the output, keeping special tokens so the assistant turn can be isolated
    generated_text = processor.tokenizer.decode(outputs[0], skip_special_tokens=False)
    assistant_text = generated_text.split("<|im_start|>assistant\n")[-1].split("<|im_end|>")[0].strip()
    # Extract components
    thinking, semantic, answer = extract_components(assistant_text)
    return thinking, semantic, answer
# Create Gradio interface
demo = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=[
        gr.Textbox(label="Thinking Process", lines=10),
        gr.Textbox(label="Semantic Elements", lines=5),
        gr.Textbox(label="Answer", lines=5),
    ],
    title="Qwen2-Audio Audio Description Demo",
    description="Upload an audio file and the model will provide a detailed analysis and description.",
    examples=[],  # Add example files here if available
    cache_examples=False,
)
# Launch the app
if __name__ == "__main__":
    demo.launch()