Spaces:
Running
Running
Michael Hu
committed on
Commit
·
a4f48aa
1
Parent(s):
2477bc4
fix audio string to data
Browse files
- requirements.txt +2 -0
- utils/stt.py +5 -1
requirements.txt
CHANGED
@@ -12,4 +12,6 @@ phonemizer>=3.0
|
|
12 |
scipy>=1.11
|
13 |
munch>=2.5
|
14 |
accelerate>=1.2.0
|
|
|
|
|
15 |
# git+https://github.com/hexgrad/Kokoro-82M
|
|
|
12 |
scipy>=1.11
|
13 |
munch>=2.5
|
14 |
accelerate>=1.2.0
|
15 |
+
soundfile>=0.13.0
|
16 |
+
libsndfile1
|
17 |
# git+https://github.com/hexgrad/Kokoro-82M
|
utils/stt.py
CHANGED
@@ -9,6 +9,7 @@ logger = logging.getLogger(__name__)
|
|
9 |
import torch
|
10 |
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
|
11 |
from pydub import AudioSegment
|
|
|
12 |
|
13 |
def transcribe_audio(audio_path):
|
14 |
"""
|
@@ -46,8 +47,11 @@ def transcribe_audio(audio_path):
|
|
46 |
|
47 |
# Processing
|
48 |
logger.info("Processing audio input")
|
|
|
|
|
|
|
49 |
inputs = processor(
|
50 |
-
|
51 |
sampling_rate=16000,
|
52 |
return_tensors="pt",
|
53 |
truncation=True,
|
|
|
9 |
import torch
|
10 |
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
|
11 |
from pydub import AudioSegment
|
12 |
+
import soundfile as sf # Add this import
|
13 |
|
14 |
def transcribe_audio(audio_path):
|
15 |
"""
|
|
|
47 |
|
48 |
# Processing
|
49 |
logger.info("Processing audio input")
|
50 |
+
logger.debug("Loading audio data")
|
51 |
+
audio_data, sample_rate = sf.read(wav_path)
|
52 |
+
audio_data = audio_data.astype(np.float32)
|
53 |
inputs = processor(
|
54 |
+
audio_data, # Pass audio array instead of path
|
55 |
sampling_rate=16000,
|
56 |
return_tensors="pt",
|
57 |
truncation=True,
|