Gijs Wijngaard committed
Commit fbe7912 · 1 Parent(s): d1d89ce

read in audio

Files changed (2)
  1. app.py +18 -4
  2. requirements.txt +2 -1
app.py CHANGED
@@ -3,6 +3,8 @@ import os
 import re
 import gradio as gr
 import torch
+import librosa
+import numpy as np
 from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
 
 # Model path and configuration
@@ -59,13 +61,25 @@ def extract_components(text):
 
 @spaces.GPU
 def process_audio(audio_file):
-    # Load and process the audio
-    sampling_rate = processor.feature_extractor.sampling_rate
+    # Load and process the audio with librosa
+    y, sr = librosa.load(audio_file, sr=None)  # Load audio file
+
+    # Resample to 16kHz if needed
+    if sr != 16000:
+        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
+        sr = 16000
+
+    # Convert to mono if stereo
+    if len(y.shape) > 1 and y.shape[1] > 1:
+        y = librosa.to_mono(y)
+
+    # Set sampling rate for the processor
+    sampling_rate = 16000
 
     # Create conversation format
     conversation = [
         {"role": "user", "content": [
-            {"type": "audio", "audio": audio_file},
+            {"type": "audio", "audio": y},
             {"type": "text", "text": "Describe the audio in detail."}
         ]}
     ]
@@ -76,7 +90,7 @@ def process_audio(audio_file):
     # Process the inputs
     inputs = processor(
         text=chat_text,
-        audios=[audio_file],
+        audios=[y],
         return_tensors="pt",
         sampling_rate=sampling_rate,
     ).to(model.device)
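
Note: the hunks above show only the changed lines of process_audio. Below is a rough sketch of how the pieces presumably fit together after this commit; the model path, chat templating, generation, and decoding steps are assumptions based on the standard Qwen2-Audio usage pattern and are not part of this diff.

import librosa
import torch
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration

# Placeholder model path (assumption): app.py defines its own MODEL_PATH elsewhere.
MODEL_PATH = "Qwen/Qwen2-Audio-7B-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = Qwen2AudioForConditionalGeneration.from_pretrained(MODEL_PATH, device_map="auto")

def process_audio(audio_file):
    # Load the waveform; librosa.load defaults to mono=True, so y is a 1-D float array
    y, sr = librosa.load(audio_file, sr=None)
    if sr != 16000:
        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
        sr = 16000

    # Chat-style prompt expected by Qwen2-Audio
    conversation = [
        {"role": "user", "content": [
            {"type": "audio", "audio": y},
            {"type": "text", "text": "Describe the audio in detail."},
        ]}
    ]
    chat_text = processor.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=False
    )

    # Pass the raw waveform (not the file path) to the processor, as in the diff above
    inputs = processor(
        text=chat_text,
        audios=[y],
        sampling_rate=sr,
        return_tensors="pt",
    ).to(model.device)

    # Generation and decoding are not shown in this commit; assumed continuation
    generated = model.generate(**inputs, max_new_tokens=256)
    generated = generated[:, inputs.input_ids.size(1):]
    return processor.batch_decode(generated, skip_special_tokens=True)[0]

Since librosa.load returns mono audio by default, the stereo branch in the committed code acts mainly as a safeguard; the hard-coded 16 kHz matches the sampling rate Qwen2-Audio's feature extractor expects, which is why the commit replaces the processor-derived sampling_rate with a fixed value.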
requirements.txt CHANGED
@@ -3,4 +3,5 @@ torch
 transformers
 peft
 matplotlib
-soundfile
+soundfile
+librosa
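
librosa decodes audio through soundfile (with audioread as a fallback), so the two entries above complement rather than duplicate each other. A minimal, hypothetical sanity check for the Space environment ("sample.wav" is a placeholder for any short test clip):

# Hypothetical check that the new audio dependencies import and can decode a file.
import librosa
import soundfile as sf

print("librosa", librosa.__version__, "| soundfile", sf.__version__)
y, sr = librosa.load("sample.wav", sr=16000)  # placeholder path, resampled to the model's expected rate
print(y.shape, sr)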