Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,40 +1,91 @@
 import spaces
 import torch
-
 import gradio as gr
 import yt_dlp as youtube_dl
 from transformers import pipeline
 from transformers.pipelines.audio_utils import ffmpeg_read
-
 import tempfile
 import os
+import time

+# Environment and model configuration
 hf_token = os.getenv('HF_TOKEN')
 MODEL_NAME = "nyrahealth/CrisperWhisper"
 BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
 YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

+# Device setup
 device = 0 if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+# Timestamp adjustment function
+def adjust_pauses_for_hf_pipeline_output(pipeline_output, split_threshold=0.12):
+    """
+    Adjust pause timings by distributing pauses up to the threshold evenly between adjacent words.
+    """
+    adjusted_chunks = pipeline_output["chunks"].copy()
+
+    for i in range(len(adjusted_chunks) - 1):
+        current_chunk = adjusted_chunks[i]
+        next_chunk = adjusted_chunks[i + 1]
+
+        current_start, current_end = current_chunk["timestamp"]
+        next_start, next_end = next_chunk["timestamp"]
+        pause_duration = next_start - current_end
+
+        if pause_duration > 0:
+            if pause_duration > split_threshold:
+                distribute = split_threshold / 2
+            else:
+                distribute = pause_duration / 2
+
+            # Adjust current chunk end time
+            adjusted_chunks[i]["timestamp"] = (current_start, current_end + distribute)
+            # Adjust next chunk start time
+            adjusted_chunks[i + 1]["timestamp"] = (next_start - distribute, next_end)
+
+    pipeline_output["chunks"] = adjusted_chunks
+    return pipeline_output

+# Initialize pipeline
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=MODEL_NAME,
     token=hf_token,
+    torch_dtype=torch_dtype,
     chunk_length_s=30,
     device=device,
+    return_timestamps='word',  # Enable word-level timestamps
 )

-
+# Transcribe function for microphone and file inputs
 @spaces.GPU
 def transcribe(inputs, task):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
+
+    # Get full pipeline output
+    raw_output = pipe(
+        inputs,
+        batch_size=BATCH_SIZE,
+        generate_kwargs={"task": task},
+        return_timestamps='word'
+    )
+
+    # Apply timestamp adjustment
+    adjusted_output = adjust_pauses_for_hf_pipeline_output(raw_output)
+
+    # Format output with timestamps
+    formatted_text = ""
+    for chunk in adjusted_output["chunks"]:
+        start = chunk["timestamp"][0]
+        text = chunk["text"]
+        formatted_text += f"[{start:.2f}] {text}\n"
+
+    return formatted_text

-
-    return text
-
-
+# YouTube HTML embed function
 def _return_yt_html_embed(yt_url):
     video_id = yt_url.split("?v=")[-1]
     HTML_str = (
@@ -43,6 +94,7 @@ def _return_yt_html_embed(yt_url):
     )
     return HTML_str

+# YouTube audio download function
 def download_yt_audio(yt_url, filename):
     info_loader = youtube_dl.YoutubeDL()

@@ -74,6 +126,7 @@ def download_yt_audio(yt_url, filename):
     except youtube_dl.utils.ExtractorError as err:
         raise gr.Error(str(err))

+# Transcribe function for YouTube inputs
 @spaces.GPU
 def yt_transcribe(yt_url, task, max_filesize=75.0):
     html_embed_str = _return_yt_html_embed(yt_url)
@@ -87,11 +140,27 @@ def yt_transcribe(yt_url, task, max_filesize=75.0):
     inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}

-
-
-
-
+    # Get full pipeline output
+    raw_output = pipe(
+        inputs,
+        batch_size=BATCH_SIZE,
+        generate_kwargs={"task": task},
+        return_timestamps='word'
+    )
+
+    # Apply timestamp adjustment
+    adjusted_output = adjust_pauses_for_hf_pipeline_output(raw_output)
+
+    # Format output with timestamps
+    formatted_text = ""
+    for chunk in adjusted_output["chunks"]:
+        start = chunk["timestamp"][0]
+        text = chunk["text"]
+        formatted_text += f"[{start:.2f}] {text}\n"
+
+    return html_embed_str, formatted_text

+# Gradio interface setup
 demo = gr.Blocks()

 mf_transcribe = gr.Interface(
@@ -142,8 +211,9 @@ yt_transcribe = gr.Interface(
     allow_flagging="never",
 )

+# Combine interfaces into a tabbed layout
 with demo:
     gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])

-
-
+# Launch the app
+demo.queue().launch()
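For reference, here is a minimal sketch of how the new adjust_pauses_for_hf_pipeline_output helper behaves. It assumes the function defined in the diff above is in scope; sample_output and its word timings are hypothetical stand-ins for real word-level pipeline output.

# Hypothetical word-level chunks, shaped like the output produced with
# return_timestamps='word'; the values below are made up for illustration.
sample_output = {
    "chunks": [
        {"text": "Hello", "timestamp": (0.0, 0.4)},
        {"text": "world", "timestamp": (0.5, 0.9)},   # 0.1 s pause, under the threshold
        {"text": "again", "timestamp": (1.4, 1.8)},   # 0.5 s pause, over the threshold
    ]
}

adjusted = adjust_pauses_for_hf_pipeline_output(sample_output, split_threshold=0.12)
for chunk in adjusted["chunks"]:
    print(chunk["text"], chunk["timestamp"])
# The 0.1 s gap is closed completely, half to each side: "Hello" now ends
# near 0.45 and "world" starts near 0.45.
# The 0.5 s gap exceeds the threshold, so each boundary moves by only
# split_threshold / 2 = 0.06: "world" ends near 0.96 and "again" starts near 1.34.

In short, pauses at or below split_threshold are absorbed entirely into the neighboring words, while longer pauses are trimmed by at most split_threshold and otherwise left as silence between words.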