Gijs Wijngaard committed
Commit · b770eaa
Parent(s): 5ee12ec

Finished

Files changed:
- app.py +96 -40
- examples/1.wav +0 -0

app.py CHANGED
@@ -1,11 +1,13 @@
-import spaces
+# import spaces
import os
import re
import gradio as gr
import torch
import librosa
import numpy as np
-from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
+from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration, TextIteratorStreamer
+import torchaudio
+from threading import Thread

# Model path and configuration
model_path = "./model"
@@ -21,7 +23,7 @@ def load_model():

    # Load the base model
    model = Qwen2AudioForConditionalGeneration.from_pretrained(
-
+        model_path,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
@@ -57,21 +59,78 @@ def extract_components(text):

    return thinking, semantic, answer

+# Function to handle chat messages
+def chat(message, history):
+    chat = []
+    for item in history:
+        chat.append({"role": "user", "content": item[0]})
+        if item[1] is not None:
+            chat.append({"role": "assistant", "content": item[1]})
+    chat.append({"role": "user", "content": message})
+    messages = processor.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+    # Tokenize the messages string
+    model_inputs = processor([messages], return_tensors="pt").to(model.device)
+    streamer = TextIteratorStreamer(
+        processor.tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = dict(
+        model_inputs,
+        streamer=streamer,
+        max_new_tokens=1024,
+        do_sample=True,
+        top_p=0.95,
+        top_k=1000,
+        temperature=0.75,
+        num_beams=1,
+    )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+
+    # Initialize an empty string to store the generated text
+    partial_text = ""
+    for new_text in streamer:
+        # print(new_text)
+        partial_text += new_text
+        # Yield an empty string to cleanup the message textbox and the updated conversation history
+        yield partial_text
+

+def process_output(output):
+    if "<think>" in output:
+        rest = output.split("<think>")[1]
+        output = "<think>\n" + rest
+    elif "<semantic_elements>" in output:
+        rest = output.split("<semantic_elements>")[1]
+        output = "<semantic_elements>\n" + rest
+    elif "<answer>" in output:
+        rest = output.split("<answer>")[1]
+        output = "<answer>\n" + rest
+    elif "</think>" in output:
+        rest = output.split("</think>")[0]
+        output = rest + "\n</think>\n"
+    elif "</semantic_elements>" in output:
+        rest = output.split("</semantic_elements>")[0]
+        output = rest + "\n</semantic_elements>\n"
+    elif "</answer>" in output:
+        rest = output.split("</answer>")[0]
+        output = rest + "\n</answer>\n"
+    return output

-
-def process_audio(audio_file):
-    # Load and process the audio with
-
+# Keep only the process_audio_streaming function that's actually used in the Gradio interface
+def process_audio_streaming(audio_file):
+    # Load and process the audio with torchaudio
+    waveform, sr = torchaudio.load(audio_file)

    # Resample to 16kHz if needed
    if sr != 16000:
-
+        waveform = torchaudio.functional.resample(waveform, sr, 16000)
        sr = 16000

    # Convert to mono if stereo
-    if
-
+    if waveform.shape[0] > 1:
+        waveform = torch.mean(waveform, dim=0, keepdim=True)
+
+    # Get the audio data as numpy array
+    y = waveform.squeeze().numpy()

    # Set sampling rate for the processor
    sampling_rate = 16000
@@ -95,45 +154,42 @@ def process_audio(audio_file):
        sampling_rate=sampling_rate,
    ).to(model.device)

-    #
+    # Create a streamer instance
+    streamer = TextIteratorStreamer(
+        processor.tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
+
+    # Initialize an empty string to store the generated text
+    accumulated_output = ""
+
+    # Generate the output with streaming
    with torch.no_grad():
-
+        generate_kwargs = dict(
            **inputs,
+            streamer=streamer,
            max_new_tokens=768,
            do_sample=False,
        )
-
-
-
-
-
-
-
-
-        assistant_text = assistant_text.replace("<think>", "\n<think>")
-
-    if "<semantic_elements>" in assistant_text:
-        assistant_text = assistant_text.replace("<semantic_elements>", "\n<semantic_elements>")
-
-    if "<answer>" in assistant_text:
-        assistant_text = assistant_text.replace("<answer>", "\n<answer>")
-
-
-    # Combine all components into a single output
-
-    return assistant_text
+        t = Thread(target=model.generate, kwargs=generate_kwargs)
+        t.start()
+
+        # Yield the final outputs
+        for output in streamer:
+            output = process_output(output)
+            accumulated_output += output # Append new output to the accumulated string
+            yield accumulated_output # Yield the accumulated output

-# Create Gradio interface
-
-    fn=
+# Create Gradio interface for audio processing
+audio_demo = gr.Interface(
+    fn=process_audio_streaming,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
-    outputs=gr.Textbox(label="
-    title="
+    outputs=gr.Textbox(label="Generated Output", lines=24),
+    title="SemThink",
    description="Upload an audio file and the model will provide detailed analysis and description.",
-    examples=[], # Add example files here if available
+    examples=["examples/1.wav"], # Add example files here if available
    cache_examples=False,
+    live=True # Enable live updates
)

-# Launch the
+# Launch the apps
if __name__ == "__main__":
-
+    audio_demo.launch()
examples/1.wav ADDED
Binary file (163 kB).
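
For reference, the preprocessing that process_audio_streaming now applies to every upload (load with torchaudio, resample to 16 kHz, downmix to mono, convert to a NumPy array) can be run standalone against this example file. This is only an illustrative sketch of those steps, not part of the commit:

# Illustrative sketch of the torchaudio preprocessing introduced in process_audio_streaming.
import torch
import torchaudio

waveform, sr = torchaudio.load("examples/1.wav")  # shape: (channels, samples)

# Resample to the 16 kHz rate the processor expects
if sr != 16000:
    waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)
    sr = 16000

# Downmix stereo to mono by averaging channels
if waveform.shape[0] > 1:
    waveform = torch.mean(waveform, dim=0, keepdim=True)

# 1-D NumPy array passed to the processor as raw audio
y = waveform.squeeze().numpy()
print(y.shape, sr)  # shape depends on the file
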
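On the Gradio side, process_audio_streaming being a generator is what makes the textbox update incrementally: gr.Interface re-renders the output on every yield. A toy interface with the same wiring, where slow_echo is made up purely for illustration:

# Toy Gradio interface mirroring the streaming wiring; slow_echo is illustrative only.
import time

import gradio as gr

def slow_echo(text):
    partial = ""
    for ch in text:
        time.sleep(0.05)
        partial += ch
        yield partial  # each yield re-renders the output textbox

demo = gr.Interface(
    fn=slow_echo,
    inputs=gr.Textbox(label="Input"),
    outputs=gr.Textbox(label="Streamed Output", lines=4),
    live=True,
)

if __name__ == "__main__":
    demo.launch()  # older Gradio 3.x releases may need demo.queue() for generator outputs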