Spaces:

GavinHuang
/

asr-demo

Running on Zero

App Files Files Community

GavinHuang committed 16 days ago

Commit

2b5f9bc

1 Parent(s): 50ea265

feat: add file transcription functionality and enhance UI for model selection

Browse files

Files changed (1) hide show

app.py +107 -29

app.py CHANGED Viewed

@@ -140,6 +140,33 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
     print(f"Invalid audio input format: {type(audio)}")
     return state, state, audio_buffer, last_processed_time
 # Define the Gradio interface
 with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
     gr.Markdown("# 🎙️ Real-time Speech-to-Text Transcription")
@@ -159,28 +186,50 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
     # Status indicator for model loading
     model_status = gr.Textbox(value=f"Current model: {current_model_name}", label="Model Status")
-    with gr.Row():
-        with gr.Column(scale=2):
-            audio_input = gr.Audio(
-                sources=["microphone"],
-                type="numpy",
-                streaming=True,
-                label="Speak into your microphone"
-            )
-            clear_btn = gr.Button("Clear Transcript")
-        with gr.Column(scale=3):
-            text_output = gr.Textbox(
-                label="Transcription",
-                placeholder="Your speech will appear here...",
-                lines=10
-            )
-            streaming_text = gr.Textbox(
-                label="Real-time Transcription",
-                placeholder="Real-time results will appear here...",
-                lines=2
-            )
       # State to store the ongoing transcription
     state = gr.State("")
     audio_buffer = gr.State(value=None)
@@ -188,9 +237,20 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
     # Function to handle model selection
     def update_model(model_name):
-        global current_model_name
         current_model_name = model_name
-        return f"Current model: {model_name}", None, 0  # Reset audio buffer and last processed time
     # Load model button event
     load_button.click(
@@ -198,16 +258,24 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
         inputs=[model_dropdown],
         outputs=[model_status, audio_buffer, last_processed_time]
     )
-    # Handle the audio stream
     audio_input.stream(
         fn=transcribe,
         inputs=[audio_input, model_dropdown, state, audio_buffer, last_processed_time],
         outputs=[state, streaming_text, audio_buffer, last_processed_time],
-    )    # Clear the transcription
     def clear_transcription():
-        return "", "", None, 0
     clear_btn.click(
         fn=clear_transcription,
         inputs=[],
@@ -220,14 +288,24 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
         inputs=[state],
         outputs=[text_output]
     )
     gr.Markdown("## 📝 Instructions")
     gr.Markdown("""
     1. Select an ASR model from the dropdown menu
     2. Click 'Load Selected Model' to load the model
     3. Click the microphone button to start recording
     4. Speak clearly into your microphone
     5. The transcription will appear in real-time
     6. Click 'Clear Transcript' to start a new transcription
     """)
 # Launch the app

     print(f"Invalid audio input format: {type(audio)}")
     return state, state, audio_buffer, last_processed_time
+@spaces.GPU(duration=120)
+def transcribe_file(audio_file, model_name="nvidia/parakeet-tdt-0.6b-v2"):
+    """Transcribe a whole audio file in one pass and return the text.
+
+    Args:
+        audio_file: Path to the audio file to transcribe, or None when
+            no audio was provided.
+        model_name: Name of the ASR model passed to ``load_model``.
+
+    Returns:
+        The transcription text on success; otherwise a human-readable
+        message (missing input, or the error that occurred), so the
+        caller always receives a string to display.
+    """
+    # Check if audio file is provided
+    if audio_file is None:
+        return "No audio file provided. Please upload an audio file."
+    try:
+        # Load the model inside the GPU worker process
+        model = load_model(model_name)
+        print(f"Processing file: {audio_file}")
+        # Transcribe the entire file at once
+        hypothesis = model.transcribe([audio_file])[0]
+        transcription = hypothesis.text
+        print(f"File transcription: {transcription}")
+        return transcription
+    except Exception as e:
+        # UI boundary: report the failure in the output box instead of raising
+        print(f"Error transcribing file: {e}")
+        return f"Error transcribing file: {str(e)}"
 # Define the Gradio interface
 with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
     gr.Markdown("# 🎙️ Real-time Speech-to-Text Transcription")
     # Status indicator for model loading
     model_status = gr.Textbox(value=f"Current model: {current_model_name}", label="Model Status")
+    # Create tabs for real-time and file-based transcription
+    with gr.Tabs():
+        # Real-time transcription tab
+        with gr.TabItem("Real-time Transcription"):
+            with gr.Row():
+                with gr.Column(scale=2):
+                    audio_input = gr.Audio(
+                        sources=["microphone"],
+                        type="numpy",
+                        streaming=True,
+                        label="Speak into your microphone"
+                    )
+                    clear_btn = gr.Button("Clear Transcript")
+                with gr.Column(scale=3):
+                    text_output = gr.Textbox(
+                        label="Transcription",
+                        placeholder="Your speech will appear here...",
+                        lines=10
+                    )
+                    streaming_text = gr.Textbox(
+                        label="Real-time Transcription",
+                        placeholder="Real-time results will appear here...",
+                        lines=2
+                    )
+          # File-based transcription tab
+        with gr.TabItem("File Transcription"):
+            with gr.Row():
+                with gr.Column(scale=2):
+                    # Audio input that is handed to the callback as a file path.
+                    # Both sources are enabled so the component matches its label
+                    # and the instructions, which promise recording *or* uploading.
+                    audio_recorder = gr.Audio(
+                        sources=["microphone", "upload"],
+                        type="filepath",
+                        label="Record or upload audio file"
+                    )
+                    transcribe_btn = gr.Button("Transcribe Audio File")
+                with gr.Column(scale=3):
+                    file_transcription = gr.Textbox(
+                        label="File Transcription",
+                        placeholder="Transcription will appear here after clicking 'Transcribe Audio File'",
+                        lines=10
+                    )
       # State to store the ongoing transcription
     state = gr.State("")
     audio_buffer = gr.State(value=None)
     # Function to handle model selection
     def update_model(model_name):
+        """Switch the active ASR model and report the load status.
+
+        Sets the module-level ``current_model_name`` and tries to load the
+        model right away via ``load_model``, storing it in the module-level
+        ``model``. A load failure is not fatal: it is printed and the status
+        text says the model will be loaded on first use.
+
+        Args:
+            model_name: Name of the model to make current.
+
+        Returns:
+            A 3-tuple ``(status_message, None, 0)``; the ``None`` and ``0``
+            reset the audio buffer and the last processed time.
+        """
+        global current_model_name, model
         current_model_name = model_name
+        # Load the model immediately if we're in a GPU context
+        try:
+            # This will load the model in the GPU worker
+            model = load_model(model_name)
+            status_message = f"Current model: {model_name} (loaded)"
+            print(f"Model {model_name} loaded successfully")
+        except Exception as e:
+            status_message = f"Current model: {model_name} (will be loaded on first use)"
+            print(f"Model will be loaded on first use: {e}")
+        return status_message, None, 0  # Reset audio buffer and last processed time
     # Load model button event
     load_button.click(
         inputs=[model_dropdown],
         outputs=[model_status, audio_buffer, last_processed_time]
     )
+      # Handle the audio stream for real-time transcription
     audio_input.stream(
         fn=transcribe,
         inputs=[audio_input, model_dropdown, state, audio_buffer, last_processed_time],
         outputs=[state, streaming_text, audio_buffer, last_processed_time],
+    )
+    # Handle file transcription
+    transcribe_btn.click(
+        fn=transcribe_file,
+        inputs=[audio_recorder, model_dropdown],
+        outputs=[file_transcription]
+    )
+    # Clear the transcription
     def clear_transcription():
+        """Return the reset values: two empty strings, ``None`` and ``0``.
+
+        NOTE(review): the ``outputs`` list of ``clear_btn.click`` is not
+        visible in this hunk; presumably these map to the transcript state,
+        the displayed text, the audio buffer and the last processed time.
+        Confirm against the full file.
+        """
+        return "", "", None, 0
     clear_btn.click(
         fn=clear_transcription,
         inputs=[],
         inputs=[state],
         outputs=[text_output]
     )
     gr.Markdown("## 📝 Instructions")
     gr.Markdown("""
+    ### Real-time Transcription:
     1. Select an ASR model from the dropdown menu
     2. Click 'Load Selected Model' to load the model
     3. Click the microphone button to start recording
     4. Speak clearly into your microphone
     5. The transcription will appear in real-time
     6. Click 'Clear Transcript' to start a new transcription
+    ### File Transcription:
+    1. Select an ASR model from the dropdown menu
+    2. Click 'Load Selected Model' to load the model
+    3. Switch to the 'File Transcription' tab
+    4. Record audio by clicking the microphone button or upload an existing audio file
+    5. Click 'Transcribe Audio File' to process the recording
+    6. The complete transcription will appear in the text box
     """)
 # Launch the app