Spaces:
Running
on
T4
Running
on
T4
File size: 6,835 Bytes
69dba80 07ebfb1 0f57ece 07ebfb1 37b5da4 e1a5899 07ebfb1 5b11f8b 5aa1892 07ebfb1 311ebef 99d9b3e 07ebfb1 446a864 d761860 37b5da4 07ebfb1 6351056 0f57ece 07ebfb1 1c4706b d761860 84d6345 983c638 1c4706b ddb3a35 983c638 35af703 b8fdc42 983c638 a051d76 43e8833 07ebfb1 0f57ece 420784b 0f57ece d761860 dc1ede9 37b5da4 b8abf27 d761860 37b5da4 d18f6de 404feeb 6e36262 404feeb d18f6de 37b5da4 dc1ede9 4553dcb dc1ede9 f1787c5 dc1ede9 f1787c5 dc1ede9 6a3ae5e dc1ede9 d761860 dc1ede9 d761860 dc1ede9 3b6c548 37b5da4 10f081b 37b5da4 8b58bde 37b5da4 78f663d 37b5da4 5f56b9e 37b5da4 b8abf27 07ebfb1 e1a5899 f4d4476 99d9b3e b9fdb45 e1a5899 07ebfb1 f4d4476 07ebfb1 0f57ece 07ebfb1 535cd88 569a668 07ebfb1 143ef7b 210c7da 143ef7b 81a1d8a 210c7da 143ef7b 81a1d8a 0f57ece 143ef7b dc1ede9 5c6b453 3b6c548 ac4dacd b8abf27 d761860 3b6c548 143ef7b 0f57ece 143ef7b 0f57ece 99d9b3e 143ef7b 0b3cc6d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
import os
import re
import tempfile
import time
import urllib.parse

import torch
import torchaudio
import gradio as gr
import spaces
from transformers import AutoFeatureExtractor, AutoTokenizer, WhisperForConditionalGeneration, WhisperProcessor, pipeline
from huggingface_hub import model_info
import yt_dlp # Added import for yt-dlp

try:
    import flash_attn
    FLASH_ATTENTION = True
except ImportError:
    FLASH_ATTENTION = False
# Hub id of the checkpoint handed to the ASR pipeline.
MODEL_NAME = "NbAiLab/nb-whisper-large"

# Upper bound, in seconds, on how much audio is transcribed per request.
max_audio_length = 30 * 60

# Gradio share flag: True when the SHARE environment variable starts with
# "t", "y" or "1" (case-insensitive), otherwise None.
# The value is sliced ([:1]) rather than indexed ([0]) so that an empty SHARE
# does not raise IndexError, and compared against a tuple because the empty
# string would count as a substring of "ty1".
share = (os.environ.get("SHARE", "False")[:1].lower() in ("t", "y", "1")) or None

# Value passed as `token=` to the pipeline: AUTH_TOKEN when set and non-empty,
# otherwise True.
auth_token = os.environ.get("AUTH_TOKEN") or True

# First CUDA device when one is available, CPU otherwise.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Bruker enhet: {device}")
@spaces.GPU(duration=60 * 2)
def pipe(file, return_timestamps=False, lang="no"):
    """Run speech recognition on ``file`` with a newly constructed pipeline.

    A fresh ``automatic-speech-recognition`` pipeline for ``MODEL_NAME`` is
    built on every call, its decoder prompt is set for the requested language
    and timestamp mode, and the pipeline is then applied to ``file``.

    Args:
        file: Audio input forwarded unchanged to the pipeline call.
        return_timestamps: Forwarded to the pipeline call; also decides whether
            the decoder prompt asks for timestamps.
        lang: Language code used for the decoder prompt and for generation.

    Returns:
        Whatever the pipeline call returns. Callers in this file read its
        ``"text"`` or ``"chunks"`` entry.
    """
    # NOTE(review): "language" is only placed in model_kwargs on the
    # flash-attention path; the sdpa path omits it. Confirm this asymmetry is
    # intentional.
    if FLASH_ATTENTION:
        model_options = {"attn_implementation": "flash_attention_2", "num_beams": 5, "language": lang}
    else:
        model_options = {"attn_implementation": "sdpa", "num_beams": 5}

    asr = pipeline(
        task="automatic-speech-recognition",
        model=MODEL_NAME,
        chunk_length_s=28,
        device=device,
        token=auth_token,
        torch_dtype=torch.float16,
        model_kwargs=model_options,
    )

    # Pin the decoder prompt to the requested language/task/timestamp mode.
    prompt_ids = asr.tokenizer.get_decoder_prompt_ids(
        language=lang,
        task="transcribe",
        no_timestamps=not return_timestamps,
    )
    asr.model.config.forced_decoder_ids = prompt_ids

    generation_options = {'task': 'transcribe', 'language': lang}
    return asr(
        file,
        return_timestamps=return_timestamps,
        batch_size=24,
        generate_kwargs=generation_options,
    )
# Sentence-ending punctuation (an ellipsis of three or more dots, or a single
# . ! : ?), optionally followed by closing quotes/brackets, that is directly
# followed by whitespace or the end of the text.
_SENTENCE_END_RE = re.compile(r'((?:\.{3,}|[.!:?])["\'”»)\]]*)(?=\s|$)')


def format_output(text):
    """Insert an HTML line break after each sentence-ending punctuation mark.

    A ``<br>`` is appended only where the punctuation ends a token, i.e. it is
    followed by whitespace or the end of the text. Punctuation inside a token
    ("12:30", "3.5", "nb.no") is left alone, and stacked marks such as "?!"
    produce a single break instead of one per character.

    Args:
        text: Plain transcription text.

    Returns:
        The text with ``<br>`` inserted after sentence-ending punctuation.
    """
    return _SENTENCE_END_RE.sub(r'\1<br>', text)
def _format_timestamp(seconds):
    """Render a chunk boundary given in seconds as HH:MM:SS, or ??:??:?? when it is None."""
    if seconds is None:
        return "??:??:??"
    return time.strftime('%H:%M:%S', time.gmtime(seconds))


def _chunks_to_html(chunks):
    """Join timestamped chunks into ``<br>``-separated "[start -> end] text" lines."""
    return "<br>".join(
        f"[{_format_timestamp(chunk['timestamp'][0])} -> {_format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
        for chunk in chunks
    )


def transcribe(file, return_timestamps=False, lang_nn=False):
    """Transcribe an audio file and prepare the result for display and download.

    Audio longer than ``max_audio_length`` seconds is cut to that length before
    transcription and a warning is returned alongside the text. The
    transcription is also written, with ``<br>`` turned into newlines, to a
    text file that can be offered for download.

    Args:
        file: Path to the audio file to transcribe.
        return_timestamps: When true, the output is one
            "[HH:MM:SS -> HH:MM:SS] text" line per chunk; otherwise the plain
            text is passed through ``format_output``.
        lang_nn: When true, transcribe with language code "nn", otherwise "no".

    Returns:
        A ``(warning_message, formatted_text, output_file)`` tuple:
        ``warning_message`` is an HTML string when the audio was truncated and
        ``None`` otherwise, ``formatted_text`` is the HTML transcription, and
        ``output_file`` is the path of the written text file.

    Raises:
        gr.Error: If no audio file was supplied.
    """
    if file is None:
        raise gr.Error("Ingen lydfil ble sendt inn. Last opp eller ta opp en lydfil først.")

    waveform, sample_rate = torchaudio.load(file)
    audio_duration = waveform.size(1) / sample_rate
    truncated = audio_duration > max_audio_length
    lang = "nn" if lang_nn else "no"

    warning_message = None
    truncated_file = None
    try:
        if truncated:
            warning_message = (
                "<b style='color:red;'>⚠️ Advarsel:</b> "
                "Lydfilen er lengre enn 30 minutter. Kun de første 30 minuttene vil bli transkribert."
            )
            waveform = waveform[:, :int(max_audio_length * sample_rate)]
            # A unique temp file per request: a fixed name in the working
            # directory would be overwritten by concurrent requests.
            with tempfile.NamedTemporaryFile(prefix="truncated_audio_", suffix=".wav", delete=False) as tmp:
                truncated_file = tmp.name
            torchaudio.save(truncated_file, waveform, sample_rate)
            file_to_transcribe = truncated_file
        else:
            file_to_transcribe = file

        if return_timestamps:
            chunks = pipe(file_to_transcribe, return_timestamps=True, lang=lang)["chunks"]
            formatted_text = _chunks_to_html(chunks)
        else:
            text = pipe(file_to_transcribe, lang=lang)["text"]
            formatted_text = format_output(text)
    finally:
        # The truncated clip is only needed while the pipeline runs.
        if truncated_file is not None:
            try:
                os.remove(truncated_file)
            except FileNotFoundError:
                pass  # already gone; nothing left to clean up

    # Each request gets its own directory so the download keeps the name
    # "transcription.txt" without colliding with other requests. UTF-8 is set
    # explicitly because the text contains non-ASCII characters.
    output_file = os.path.join(tempfile.mkdtemp(prefix="nb_whisper_"), "transcription.txt")
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(formatted_text.replace('<br>', '\n'))

    # The notices below are appended after the file is written, so they are
    # shown on the page but are not part of the downloadable text.
    if truncated:
        link="https://github.com/NbAiLab/nostram/blob/main/leverandorer.md"
        disclaimer = (
            "\n\n Dette er en demo. Det er ikke tillatt å bruke denne teksten i profesjonell sammenheng. "
            "Vi anbefaler at hvis du trenger å transkribere lengre opptak, så kjører du enten modellen lokalt "
            "eller sjekker denne siden for å se hvem som leverer løsninger basert på NB-Whisper: "
            f"<a href='{link}' target='_blank'>denne siden</a>."
        )
        formatted_text += f"<br><br><i>{disclaimer}</i>"

    formatted_text += "<br><br><i>Transkribert med NB-Whisper demo</i>"
    return warning_message, formatted_text, output_file
def _return_yt_html_embed(yt_url):
video_id = yt_url.split("?v=")[-1]
HTML_str = (
f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
" </center>"
)
return HTML_str
def yt_transcribe(yt_url, return_timestamps=False):
    """Download the audio of a YouTube video and transcribe it.

    The audio is downloaded with yt-dlp, converted to MP3 by its
    ``FFmpegExtractAudio`` post-processor, and handed to ``transcribe``.

    Args:
        yt_url: URL of the video to download.
        return_timestamps: Forwarded to ``transcribe``.

    Returns:
        A ``(html_embed_str, result)`` tuple, where ``html_embed_str`` is the
        iframe HTML for the video and ``result`` is whatever ``transcribe``
        returns for the downloaded audio.
    """
    html_embed_str = _return_yt_html_embed(yt_url)

    # Download into a per-call directory: a fixed "audio.mp3" in the working
    # directory would be overwritten by concurrent requests and never removed.
    with tempfile.TemporaryDirectory(prefix="yt_audio_") as workdir:
        # "%" is the template marker in outtmpl, so escape any in the directory path.
        template_dir = workdir.replace('%', '%%')
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': os.path.join(template_dir, 'audio.%(ext)s'),
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
            'quiet': True,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([yt_url])

        # transcribe finishes reading the audio before it returns, so the
        # download can be discarded when this block exits.
        text = transcribe(os.path.join(workdir, "audio.mp3"), return_timestamps=return_timestamps)

    return html_embed_str, text
# Build the Gradio app without tabs
demo = gr.Blocks(theme=gr.themes.Default(primary_hue=gr.themes.colors.green, secondary_hue=gr.themes.colors.red))

# NOTE(review): the nesting below assumes the two columns and the Interface are
# direct children of the Blocks context — confirm against the deployed layout.
with demo:
    with gr.Column():
        gr.HTML("<img src='file/Logonew.png' style='width:190px;'>")
    with gr.Column(scale=8):
        # Use Markdown for title and description
        gr.Markdown('\n<h1 style="font-size: 3.5em;">NB-Whisper Demo</h1>\n')

    mf_transcribe = gr.Interface(
        fn=transcribe,
        inputs=[
            gr.components.Audio(sources=['upload', 'microphone'], type="filepath"),
            gr.components.Checkbox(label="Inkluder tidskoder"),
            gr.components.Checkbox(label="Nynorsk"),
        ],
        outputs=[
            gr.HTML(label="Varsel"),
            gr.HTML(label="text"),
            gr.File(label="Last ned transkripsjon"),  # Removed right side space in the box
        ],
        description=(
            "Demoen bruker"
            f" modellen [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) til å transkribere lydfiler opp til 30 minutter."
        ),
        allow_flagging="never",
    )

# Start the demo without tabs.
# queue() must be called on the Blocks object before launch(): launch() does
# not return the Blocks, so chaining .queue() after it fails once launch()
# returns and never configures the queue.
demo.queue().launch(share=share, show_api=False, allowed_paths=["Logonew.png"])