Michael Hu committed on
Commit
a4f48aa
·
1 Parent(s): 2477bc4

fix audio string to data

Browse files
Files changed (2) hide show
  1. requirements.txt +2 -0
  2. utils/stt.py +5 -1
requirements.txt CHANGED
@@ -12,4 +12,6 @@ phonemizer>=3.0
12
  scipy>=1.11
13
  munch>=2.5
14
  accelerate>=1.2.0
 
 
15
  # git+https://github.com/hexgrad/Kokoro-82M
 
12
  scipy>=1.11
13
  munch>=2.5
14
  accelerate>=1.2.0
15
+ soundfile>=0.13.0
16
+ libsndfile1
17
  # git+https://github.com/hexgrad/Kokoro-82M
utils/stt.py CHANGED
@@ -9,6 +9,7 @@ logger = logging.getLogger(__name__)
9
  import torch
10
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
11
  from pydub import AudioSegment
 
12
 
13
  def transcribe_audio(audio_path):
14
  """
@@ -46,8 +47,11 @@ def transcribe_audio(audio_path):
46
 
47
  # Processing
48
  logger.info("Processing audio input")
 
 
 
49
  inputs = processor(
50
- wav_path,
51
  sampling_rate=16000,
52
  return_tensors="pt",
53
  truncation=True,
 
9
  import torch
10
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
11
  from pydub import AudioSegment
12
+ import soundfile as sf # Add this import
13
 
14
  def transcribe_audio(audio_path):
15
  """
 
47
 
48
  # Processing
49
  logger.info("Processing audio input")
50
+ logger.debug("Loading audio data")
51
+ audio_data, sample_rate = sf.read(wav_path)
52
+ audio_data = audio_data.astype(np.float32)
53
  inputs = processor(
54
+ audio_data, # Pass audio array instead of path
55
  sampling_rate=16000,
56
  return_tensors="pt",
57
  truncation=True,