Gijs Wijngaard committed
Commit fbe7912 · 1 Parent(s): d1d89ce

read in audio

Files changed (2)
  1. app.py +18 -4
  2. requirements.txt +2 -1
app.py CHANGED
@@ -3,6 +3,8 @@ import os
 import re
 import gradio as gr
 import torch
+import librosa
+import numpy as np
 from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
 
 # Model path and configuration
@@ -59,13 +61,25 @@ def extract_components(text):
 
 @spaces.GPU
 def process_audio(audio_file):
-    # Load and process the audio
-    sampling_rate = processor.feature_extractor.sampling_rate
+    # Load and process the audio with librosa
+    y, sr = librosa.load(audio_file, sr=None)  # Load audio file
+
+    # Resample to 16kHz if needed
+    if sr != 16000:
+        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
+        sr = 16000
+
+    # Convert to mono if stereo
+    if len(y.shape) > 1 and y.shape[1] > 1:
+        y = librosa.to_mono(y)
+
+    # Set sampling rate for the processor
+    sampling_rate = 16000
 
     # Create conversation format
     conversation = [
         {"role": "user", "content": [
-            {"type": "audio", "audio": audio_file},
+            {"type": "audio", "audio": y},
             {"type": "text", "text": "Describe the audio in detail."}
         ]}
     ]
@@ -76,7 +90,7 @@ def process_audio(audio_file):
     # Process the inputs
     inputs = processor(
         text=chat_text,
-        audios=[audio_file],
+        audios=[y],
         return_tensors="pt",
         sampling_rate=sampling_rate,
     ).to(model.device)
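
Note: the hunks above show only the changed lines of process_audio. Below is a rough sketch of how the pieces presumably fit together after this commit; the model path, chat templating, generation, and decoding steps are assumptions based on the standard Qwen2-Audio usage pattern and are not part of this diff.

import librosa
import torch
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration

# Placeholder model path (assumption): app.py defines its own MODEL_PATH elsewhere.
MODEL_PATH = "Qwen/Qwen2-Audio-7B-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = Qwen2AudioForConditionalGeneration.from_pretrained(MODEL_PATH, device_map="auto")

def process_audio(audio_file):
    # Load the waveform; librosa.load defaults to mono=True, so y is a 1-D float array
    y, sr = librosa.load(audio_file, sr=None)
    if sr != 16000:
        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
        sr = 16000

    # Chat-style prompt expected by Qwen2-Audio
    conversation = [
        {"role": "user", "content": [
            {"type": "audio", "audio": y},
            {"type": "text", "text": "Describe the audio in detail."},
        ]}
    ]
    chat_text = processor.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=False
    )

    # Pass the raw waveform (not the file path) to the processor, as in the diff above
    inputs = processor(
        text=chat_text,
        audios=[y],
        sampling_rate=sr,
        return_tensors="pt",
    ).to(model.device)

    # Generation and decoding are not shown in this commit; assumed continuation
    generated = model.generate(**inputs, max_new_tokens=256)
    generated = generated[:, inputs.input_ids.size(1):]
    return processor.batch_decode(generated, skip_special_tokens=True)[0]

Since librosa.load returns mono audio by default, the stereo branch in the committed code acts mainly as a safeguard; the hard-coded 16 kHz matches the sampling rate Qwen2-Audio's feature extractor expects, which is why the commit replaces the processor-derived sampling_rate with a fixed value.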
requirements.txt CHANGED
@@ -3,4 +3,5 @@ torch
 transformers
 peft
 matplotlib
-soundfile
+soundfile
+librosa
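
librosa decodes audio through soundfile (with audioread as a fallback), so the two entries above complement rather than duplicate each other. A minimal, hypothetical sanity check for the Space environment ("sample.wav" is a placeholder for any short test clip):

# Hypothetical check that the new audio dependencies import and can decode a file.
import librosa
import soundfile as sf

print("librosa", librosa.__version__, "| soundfile", sf.__version__)
y, sr = librosa.load("sample.wav", sr=16000)  # placeholder path, resampled to the model's expected rate
print(y.shape, sr)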