Gijs Wijngaard
committed on
Commit
·
fbe7912
1
Parent(s):
d1d89ce
read in audio
Browse files
- app.py +18 -4
- requirements.txt +2 -1
app.py
CHANGED
@@ -3,6 +3,8 @@ import os
|
|
3 |
import re
|
4 |
import gradio as gr
|
5 |
import torch
|
|
|
|
|
6 |
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
|
7 |
|
8 |
# Model path and configuration
|
@@ -59,13 +61,25 @@ def extract_components(text):
|
|
59 |
|
60 |
@spaces.GPU
|
61 |
def process_audio(audio_file):
|
62 |
-
# Load and process the audio
|
63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
|
65 |
# Create conversation format
|
66 |
conversation = [
|
67 |
{"role": "user", "content": [
|
68 |
-
{"type": "audio", "audio":
|
69 |
{"type": "text", "text": "Describe the audio in detail."}
|
70 |
]}
|
71 |
]
|
@@ -76,7 +90,7 @@ def process_audio(audio_file):
|
|
76 |
# Process the inputs
|
77 |
inputs = processor(
|
78 |
text=chat_text,
|
79 |
-
audios=[
|
80 |
return_tensors="pt",
|
81 |
sampling_rate=sampling_rate,
|
82 |
).to(model.device)
|
|
|
3 |
import re
|
4 |
import gradio as gr
|
5 |
import torch
|
6 |
+
import librosa
|
7 |
+
import numpy as np
|
8 |
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
|
9 |
|
10 |
# Model path and configuration
|
|
|
61 |
|
62 |
@spaces.GPU
|
63 |
def process_audio(audio_file):
|
64 |
+
# Load and process the audio with librosa
|
65 |
+
y, sr = librosa.load(audio_file, sr=None) # Load audio file
|
66 |
+
|
67 |
+
# Resample to 16kHz if needed
|
68 |
+
if sr != 16000:
|
69 |
+
y = librosa.resample(y, orig_sr=sr, target_sr=16000)
|
70 |
+
sr = 16000
|
71 |
+
|
72 |
+
# Convert to mono if stereo
|
73 |
+
if len(y.shape) > 1 and y.shape[1] > 1:
|
74 |
+
y = librosa.to_mono(y)
|
75 |
+
|
76 |
+
# Set sampling rate for the processor
|
77 |
+
sampling_rate = 16000
|
78 |
|
79 |
# Create conversation format
|
80 |
conversation = [
|
81 |
{"role": "user", "content": [
|
82 |
+
{"type": "audio", "audio": y},
|
83 |
{"type": "text", "text": "Describe the audio in detail."}
|
84 |
]}
|
85 |
]
|
|
|
90 |
# Process the inputs
|
91 |
inputs = processor(
|
92 |
text=chat_text,
|
93 |
+
audios=[y],
|
94 |
return_tensors="pt",
|
95 |
sampling_rate=sampling_rate,
|
96 |
).to(model.device)
|
requirements.txt
CHANGED
@@ -3,4 +3,5 @@ torch
|
|
3 |
transformers
|
4 |
peft
|
5 |
matplotlib
|
6 |
-
soundfile
|
|
|
|
3 |
transformers
|
4 |
peft
|
5 |
matplotlib
|
6 |
+
soundfile
|
7 |
+
librosa
|