Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,40 +1,91 @@
 import spaces
 import torch
-
 import gradio as gr
 import yt_dlp as youtube_dl
 from transformers import pipeline
 from transformers.pipelines.audio_utils import ffmpeg_read
-
 import tempfile
 import os
+import time

+# Environment and model configuration
 hf_token = os.getenv('HF_TOKEN')
 MODEL_NAME = "nyrahealth/CrisperWhisper"
 BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
 YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

+# Device setup
 device = 0 if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+# Timestamp adjustment function
+def adjust_pauses_for_hf_pipeline_output(pipeline_output, split_threshold=0.12):
+    """
+    Adjust pause timings by distributing pauses up to the threshold evenly between adjacent words.
+    """
+    adjusted_chunks = pipeline_output["chunks"].copy()
+
+    for i in range(len(adjusted_chunks) - 1):
+        current_chunk = adjusted_chunks[i]
+        next_chunk = adjusted_chunks[i + 1]
+
+        current_start, current_end = current_chunk["timestamp"]
+        next_start, next_end = next_chunk["timestamp"]
+        pause_duration = next_start - current_end
+
+        if pause_duration > 0:
+            if pause_duration > split_threshold:
+                distribute = split_threshold / 2
+            else:
+                distribute = pause_duration / 2
+
+            # Adjust current chunk end time
+            adjusted_chunks[i]["timestamp"] = (current_start, current_end + distribute)
+            # Adjust next chunk start time
+            adjusted_chunks[i + 1]["timestamp"] = (next_start - distribute, next_end)
+
+    pipeline_output["chunks"] = adjusted_chunks
+    return pipeline_output

+# Initialize pipeline
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=MODEL_NAME,
     token=hf_token,
+    torch_dtype=torch_dtype,
     chunk_length_s=30,
     device=device,
+    return_timestamps='word',  # Enable word-level timestamps
 )

-
+# Transcribe function for microphone and file inputs
 @spaces.GPU
 def transcribe(inputs, task):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
+
+    # Get full pipeline output
+    raw_output = pipe(
+        inputs,
+        batch_size=BATCH_SIZE,
+        generate_kwargs={"task": task},
+        return_timestamps='word'
+    )
+
+    # Apply timestamp adjustment
+    adjusted_output = adjust_pauses_for_hf_pipeline_output(raw_output)
+
+    # Format output with timestamps
+    formatted_text = ""
+    for chunk in adjusted_output["chunks"]:
+        start = chunk["timestamp"][0]
+        text = chunk["text"]
+        formatted_text += f"[{start:.2f}] {text}\n"
+
+    return formatted_text

-
-    return text
-
-
+# YouTube HTML embed function
 def _return_yt_html_embed(yt_url):
     video_id = yt_url.split("?v=")[-1]
     HTML_str = (
@@ -43,6 +94,7 @@ def _return_yt_html_embed(yt_url):
     )
     return HTML_str

+# YouTube audio download function
 def download_yt_audio(yt_url, filename):
     info_loader = youtube_dl.YoutubeDL()

@@ -74,6 +126,7 @@ def download_yt_audio(yt_url, filename):
     except youtube_dl.utils.ExtractorError as err:
         raise gr.Error(str(err))

+# Transcribe function for YouTube inputs
 @spaces.GPU
 def yt_transcribe(yt_url, task, max_filesize=75.0):
     html_embed_str = _return_yt_html_embed(yt_url)
@@ -87,11 +140,27 @@ def yt_transcribe(yt_url, task, max_filesize=75.0):
     inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}

-
-
-
-
+    # Get full pipeline output
+    raw_output = pipe(
+        inputs,
+        batch_size=BATCH_SIZE,
+        generate_kwargs={"task": task},
+        return_timestamps='word'
+    )
+
+    # Apply timestamp adjustment
+    adjusted_output = adjust_pauses_for_hf_pipeline_output(raw_output)
+
+    # Format output with timestamps
+    formatted_text = ""
+    for chunk in adjusted_output["chunks"]:
+        start = chunk["timestamp"][0]
+        text = chunk["text"]
+        formatted_text += f"[{start:.2f}] {text}\n"
+
+    return html_embed_str, formatted_text

+# Gradio interface setup
 demo = gr.Blocks()

 mf_transcribe = gr.Interface(
@@ -142,8 +211,9 @@ yt_transcribe = gr.Interface(
     allow_flagging="never",
 )

+# Combine interfaces into a tabbed layout
 with demo:
     gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])

-
-
+# Launch the app
+demo.queue().launch()
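For reference, here is a minimal sketch of how the new adjust_pauses_for_hf_pipeline_output helper behaves. It assumes the function defined in the diff above is in scope; sample_output and its word timings are hypothetical stand-ins for real word-level pipeline output.

# Hypothetical word-level chunks, shaped like the output produced with
# return_timestamps='word'; the values below are made up for illustration.
sample_output = {
    "chunks": [
        {"text": "Hello", "timestamp": (0.0, 0.4)},
        {"text": "world", "timestamp": (0.5, 0.9)},   # 0.1 s pause, under the threshold
        {"text": "again", "timestamp": (1.4, 1.8)},   # 0.5 s pause, over the threshold
    ]
}

adjusted = adjust_pauses_for_hf_pipeline_output(sample_output, split_threshold=0.12)
for chunk in adjusted["chunks"]:
    print(chunk["text"], chunk["timestamp"])
# The 0.1 s gap is closed completely, half to each side: "Hello" now ends
# near 0.45 and "world" starts near 0.45.
# The 0.5 s gap exceeds the threshold, so each boundary moves by only
# split_threshold / 2 = 0.06: "world" ends near 0.96 and "again" starts near 1.34.

In short, pauses at or below split_threshold are absorbed entirely into the neighboring words, while longer pauses are trimmed by at most split_threshold and otherwise left as silence between words.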