nb-whisper-demo

Running on T4

App Files Community

pere committed on Oct 15, 2024

Commit

d761860

verified ·

1 Parent(s): 143ef7b

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -35

app.py CHANGED Viewed

@@ -18,9 +18,7 @@ except ImportError:
 import yt_dlp  # Added import for yt-dlp
 MODEL_NAME = "NbAiLab/nb-whisper-large"
-#lang = "no"
-max_audio_length= 30 * 60
 share = (os.environ.get("SHARE", "False")[0].lower() in "ty1") or None
 auth_token = os.environ.get("AUTH_TOKEN") or True
@@ -28,7 +26,7 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 print(f"Bruker enhet: {device}")
 @spaces.GPU(duration=60 * 2)
-def pipe(file, return_timestamps=False,lang="no"):
     asr = pipeline(
         task="automatic-speech-recognition",
         model=MODEL_NAME,
@@ -46,24 +44,20 @@ def pipe(file, return_timestamps=False,lang="no"):
     return asr(file, return_timestamps=return_timestamps, batch_size=24, generate_kwargs={'task': 'transcribe', 'language': lang})
 def format_output(text):
-    # Add a line break after ".", "!", ":", or "?" unless part of sequences like "..."
-    #text = re.sub(r'(?<!\.)[.!:?](?!\.)', lambda m: m.group() + '<br>', text)
-    # Ensure line break after sequences like "..." or other punctuation patterns
     text = re.sub(r'(\.{3,}|[.!:?])', lambda m: m.group() + '<br>', text)
     return text
-def transcribe(file, return_timestamps=False,lang_nn=False):
     waveform, sample_rate = torchaudio.load(file)
     audio_duration = waveform.size(1) / sample_rate
-    warning_message=None
     if audio_duration > max_audio_length:
         warning_message = (
             "<b style='color:red;'>⚠️ Advarsel:</b> "
             "Lydfilen er lengre enn 30 minutter. Kun de første 30 minuttene vil bli transkribert."
         )
-        # Trim the waveform to the first 30 minutes
         waveform = waveform[:, :int(max_audio_length * sample_rate)]
         truncated_file = "truncated_audio.wav"
         torchaudio.save(truncated_file, waveform, sample_rate)
@@ -73,7 +67,6 @@ def transcribe(file, return_timestamps=False,lang_nn=False):
         file_to_transcribe = file
         truncated = False
     if not lang_nn:
         if not return_timestamps:
             text = pipe(file_to_transcribe)["text"]
@@ -89,10 +82,10 @@ def transcribe(file, return_timestamps=False,lang_nn=False):
             formatted_text = "<br>".join(text)
     else:
         if not return_timestamps:
-            text = pipe(file_to_transcribe,lang="nn")["text"]
             formatted_text = format_output(text)
         else:
-            chunks = pipe(file_to_transcribe, return_timestamps=True,lang="nn")["chunks"]
             text = []
             for chunk in chunks:
                 start_time = time.strftime('%H:%M:%S', time.gmtime(chunk["timestamp"][0])) if chunk["timestamp"][0] is not None else "??:??:??"
@@ -175,35 +168,15 @@ with demo:
         outputs=[
             gr.HTML(label="Varsel"),
             gr.HTML(label="text"),
-            gr.File(label="Last ned transkripsjon")
         ],
-        #outputs="text",
         description=(
             "Demoen bruker"
             f" modellen [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) til å transkribere lydfiler opp til 30 minutter."
         ),
         allow_flagging="never",
-        #show_submit_button=False,
     )
-    # Uncomment to add the YouTube transcription interface if needed
-    # yt_transcribe_interface = gr.Interface(
-    #     fn=yt_transcribe,
-    #     inputs=[
-    #         gr.components.Textbox(lines=1, placeholder="Lim inn URL til en YouTube-video her", label="YouTube URL"),
-    #         gr.components.Checkbox(label="Inkluder tidsstempler"),
-    #     ],
-    #     examples=[["https://www.youtube.com/watch?v=mukeSSa5GKo"]],
-    #     outputs=["html", "text"],
-    #     title="Whisper Demo: Transkriber YouTube",
-    #     description=(
-    #         "Transkriber lange YouTube-videoer med et enkelt klikk! Demoen bruker den fintunede modellen:"
-    #         f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) og 🤗 Transformers til å transkribere lydfiler av"
-    #         " vilkårlig lengde."
-    #     ),
-    #     allow_flagging="never",
-    # )
 # Start demoen uten faner
-demo.launch(share=share, show_api=False,allowed_paths=["Logonew.png"]).queue()

 import yt_dlp  # Added import for yt-dlp
 MODEL_NAME = "NbAiLab/nb-whisper-large"
+max_audio_length = 30 * 60
 share = (os.environ.get("SHARE", "False")[0].lower() in "ty1") or None
 auth_token = os.environ.get("AUTH_TOKEN") or True
 print(f"Bruker enhet: {device}")
 @spaces.GPU(duration=60 * 2)
+def pipe(file, return_timestamps=False, lang="no"):
     asr = pipeline(
         task="automatic-speech-recognition",
         model=MODEL_NAME,
     return asr(file, return_timestamps=return_timestamps, batch_size=24, generate_kwargs={'task': 'transcribe', 'language': lang})
 def format_output(text):
     text = re.sub(r'(\.{3,}|[.!:?])', lambda m: m.group() + '<br>', text)
     return text
+def transcribe(file, return_timestamps=False, lang_nn=False):
     waveform, sample_rate = torchaudio.load(file)
     audio_duration = waveform.size(1) / sample_rate
+    warning_message = None
     if audio_duration > max_audio_length:
         warning_message = (
             "<b style='color:red;'>⚠️ Advarsel:</b> "
             "Lydfilen er lengre enn 30 minutter. Kun de første 30 minuttene vil bli transkribert."
         )
         waveform = waveform[:, :int(max_audio_length * sample_rate)]
         truncated_file = "truncated_audio.wav"
         torchaudio.save(truncated_file, waveform, sample_rate)
         file_to_transcribe = file
         truncated = False
     if not lang_nn:
         if not return_timestamps:
             text = pipe(file_to_transcribe)["text"]
             formatted_text = "<br>".join(text)
     else:
         if not return_timestamps:
+            text = pipe(file_to_transcribe, lang="nn")["text"]
             formatted_text = format_output(text)
         else:
+            chunks = pipe(file_to_transcribe, return_timestamps=True, lang="nn")["chunks"]
             text = []
             for chunk in chunks:
                 start_time = time.strftime('%H:%M:%S', time.gmtime(chunk["timestamp"][0])) if chunk["timestamp"][0] is not None else "??:??:??"
         outputs=[
             gr.HTML(label="Varsel"),
             gr.HTML(label="text"),
+            gr.File(label="Last ned transkripsjon")  # Removed right side space in the box
         ],
         description=(
             "Demoen bruker"
             f" modellen [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) til å transkribere lydfiler opp til 30 minutter."
         ),
         allow_flagging="never",
     )
 # Start demoen uten faner
+demo.launch(share=share, show_api=False, allowed_paths=["Logonew.png"]).queue()