GavinHuang committed on
Commit 779d79b · 1 Parent(s): fe027e3

fix: improve audio processing in transcribe function and add soundfile dependency

Files changed (2)
  1. app.py +31 -36
  2. requirements.txt +1 -0
app.py CHANGED
@@ -15,45 +15,40 @@ model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("nvidia/parakeet-tdt-
 
 print(f"Model loaded on device: {model.device}")
 
-@spaces.GPU(duration=120)  # Increase duration if inference takes >60s
+import numpy as np
+import soundfile as sf
+audio_buffer = []
+
+@spaces.GPU(duration=120)
 def transcribe(audio, state=""):
-    """
-    Transcribe audio in real-time
-    """
-    # Skip processing if no audio is provided
-    if audio is None:
+    global model, audio_buffer
+    if audio is None or isinstance(audio, int):
+        print(f"Skipping invalid audio input: {type(audio)}")
         return state, state
 
-    if isinstance(audio, tuple):
-        # If audio is a tuple, assume the first element is the file path
-        print("Received tuple input, extracting first element as file path")
-        audio = audio[0] if len(audio) > 0 else None
-    elif not isinstance(audio, str):
-        raise ValueError(f"Expected audio as a file path (str), got {type(audio)}")
-
-    if not audio:
-        raise ValueError("No valid audio input provided")
-
-    global model
-    # Move model to GPU if available
-    if torch.cuda.is_available():
-        print(f"CUDA device: {torch.cuda.get_device_name(0)}")
-        model = model.cuda()
-
-    # Get the sample rate from the audio
-    sample_rate = 16000  # Default to 16kHz if not specified
-
-    # Process the audio with the ASR model
-    with torch.no_grad():
-        transcription = model.transcribe([audio])[0]
-
-    # Append new transcription to the state
-    if state == "":
-        new_state = transcription
-    else:
-        new_state = state + " " + transcription
-    model.cpu()
-    return new_state, new_state
+    # Append NumPy array to buffer
+    if isinstance(audio, np.ndarray):
+        audio_buffer.append(audio)
+        # Process if buffer has enough data (e.g., 5 seconds at 16kHz)
+        if len(np.concatenate(audio_buffer)) >= 5 * 16000:
+            # Concatenate and preprocess
+            audio_data = np.concatenate(audio_buffer)
+            audio_data = audio_data.mean(axis=1) if audio_data.ndim > 1 else audio_data  # To mono
+            temp_file = "temp_audio.wav"
+            sf.write(temp_file, audio_data, samplerate=16000)
+
+            # Transcribe
+            if torch.cuda.is_available():
+                model = model.cuda()
+            transcription = model.transcribe([temp_file])[0]
+            model = model.cpu()
+            os.remove(temp_file)
+
+            # Clear buffer
+            audio_buffer = []
+            new_state = state + " " + transcription if state else transcription
+            return new_state, new_state
+    return state, state
 
 # Define the Gradio interface
 with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
@@ -91,7 +86,7 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
         inputs=[audio_input, state],
         outputs=[state, streaming_text],
     )
-
+
    # Clear the transcription
    def clear_transcription():
        return "", "", ""
requirements.txt CHANGED
@@ -4,3 +4,4 @@ nemo_toolkit[asr]>=1.18.0
 omegaconf>=2.2.0
 numpy>=1.22.0
 cuda-python>=12.3
+soundfile
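As a quick sanity check for the new dependency (not part of the commit; the file name and test tone are illustrative), soundfile can round-trip the kind of 16 kHz mono WAV the updated transcribe() writes. soundfile wraps libsndfile; recent pip wheels bundle it, though some minimal Linux images may still need the system library.

```python
import numpy as np
import soundfile as sf

# Write a 1 s, 440 Hz test tone the same way transcribe() writes its buffer.
tone = 0.1 * np.sin(2 * np.pi * 440 * np.arange(16000) / 16000).astype(np.float32)
sf.write("check.wav", tone, samplerate=16000)

# Read it back and confirm the sample rate and length survived.
data, sr = sf.read("check.wav", dtype="float32")
assert sr == 16000 and data.shape == (16000,)
```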