razhan committed on
Commit
9040397
·
verified ·
1 Parent(s): 3c2ecdf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +162 -220
app.py CHANGED
@@ -1,20 +1,27 @@
 
 
 
 
1
  # import torch
2
 
3
  # import gradio as gr
4
  # import yt_dlp as youtube_dl
 
 
5
 
6
  # from transformers import pipeline
7
- # from huggingface_hub import model_info
8
- # import re
9
  # import tempfile
10
  # import os
11
 
12
- # MODEL_NAME = "razhan/whisper-small-ckb"
13
  # BATCH_SIZE = 1
14
  # FILE_LIMIT_MB = 10
15
- # YT_LENGTH_LIMIT_S = 60 * 10
16
 
17
  # device = 0 if torch.cuda.is_available() else "cpu"
 
18
  # pipe = pipeline(
19
  # task="automatic-speech-recognition",
20
  # model=MODEL_NAME,
@@ -22,164 +29,173 @@
22
  # device=device,
23
  # )
24
 
25
- # pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(task="transcribe")
26
-
27
- # def transcribe(microphone, file_upload):
28
- # warn_output = ""
29
- # if (microphone is not None) and (file_upload is not None):
30
- # warn_output = (
31
- # "WARNING: You've uploaded an audio file and used the microphone. "
32
- # "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
33
- # )
34
 
35
- # elif (microphone is None) and (file_upload is None):
36
- # return "ERROR: You have to either use the microphone or upload an audio file"
 
 
37
 
38
- # file = microphone if microphone is not None else file_upload
39
-
40
- # text = pipe(file)["text"]
41
-
42
- # return warn_output + text
43
 
44
 
45
  # def _return_yt_html_embed(yt_url):
46
- # if 'youtu.be' in yt_url:
47
- # video_id = yt_url.split('/')[-1].split('?')[0]
48
- # else:
49
- # video_id = yt_url.split("?v=")[-1].split('&')[0]
50
-
51
  # HTML_str = (
52
- # f'<center><iframe width="560" height="315" src="https://www.youtube.com/embed/{video_id}" '
53
- # 'frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" '
54
- # 'allowfullscreen></iframe></center>'
55
  # )
56
  # return HTML_str
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
 
 
 
59
 
 
 
 
 
 
 
60
 
 
 
61
 
62
- # def yt_transcribe(yt_url, task="transcribe", max_filesize=75.0, progress=gr.Progress()):
63
- # html_embed_str = _return_yt_html_embed(yt_url)
64
 
65
- # with tempfile.TemporaryDirectory() as tmpdirname:
66
- # filepath = os.path.join(tmpdirname, "video.mp4")
67
- # download_yt_audio(yt_url, filepath)
68
- # with open(filepath, "rb") as f:
69
- # inputs = f.read()
70
 
71
- # inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
72
- # inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
73
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
 
 
75
 
76
- # start_time = time.time()
77
- # outputs = pipe(inputs, chunk_length_s=30, batch_size=BATCH_SIZE, generate_kwargs={"task": task, "language": "persian"}, return_timestamps=False)
78
- # exec_time = time.time() - start_time
79
- # logging.info(print(f"transcribe: {exec_time} sec."))
80
-
81
- # return html_embed_str, txt, exec_time
82
 
 
 
83
 
84
- # def download_yt_audio(yt_url, filename, progress=gr.Progress()):
85
- # if '&list' in yt_url:
86
- # yt_url = yt_url.split('&list')[0]
87
-
88
- # info_loader = youtube_dl.YoutubeDL()
89
 
90
- # try:
91
- # info = info_loader.extract_info(yt_url, download=False)
92
- # except youtube_dl.utils.DownloadError as err:
93
- # raise gr.Error(str(err))
94
-
95
- # file_length = info["duration_string"]
96
- # file_h_m_s = file_length.split(":")
97
- # file_h_m_s = [int(sub_length) for sub_length in file_h_m_s]
98
-
99
- # if len(file_h_m_s) == 1:
100
- # file_h_m_s.insert(0, 0)
101
- # if len(file_h_m_s) == 2:
102
- # file_h_m_s.insert(0, 0)
103
- # file_length_s = file_h_m_s[0] * 3600 + file_h_m_s[1] * 60 + file_h_m_s[2]
104
-
105
- # if file_length_s > YT_LENGTH_LIMIT_S:
106
- # yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
107
- # file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
108
- # raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")
109
-
110
- # # ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
111
- # ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
112
-
113
- # with youtube_dl.YoutubeDL(ydl_opts) as ydl:
114
- # try:
115
- # ydl.download([yt_url])
116
- # except youtube_dl.utils.ExtractorError as err:
117
- # raise gr.Error(str(err))
118
- # progress(1, desc="Video downloaded from YouTube!")
119
 
 
120
 
121
  # mf_transcribe = gr.Interface(
122
  # fn=transcribe,
123
  # inputs=[
124
  # gr.Audio(sources="microphone", type="filepath"),
125
- # gr.Audio(sources="upload", type="filepath"),
126
  # ],
127
  # outputs="text",
128
- # title="Whisper Central Kurdish‌ (Sorani) Demo: Transcribe Audio",
129
  # description=(
130
- # "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the the fine-tuned"
131
  # f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
132
  # " of arbitrary length."
133
  # ),
134
- # allow_flagging="never",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  # )
136
 
137
  # yt_transcribe = gr.Interface(
138
  # fn=yt_transcribe,
139
- # inputs=[gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
140
- # outputs=["html",
141
- # gr.Textbox(
142
- # label="Output",
143
- # rtl=True,
144
- # show_copy_button=True,
145
- # ),
146
- # gr.Text(label="Transcription Time")
147
- # ],
148
- # title="Whisper Central Kurdish‌ (Sorani) Demo: Transcribe YouTube",
149
  # description=(
150
- # "Transcribe long-form YouTube videos with the click of a button! Demo uses the the fine-tuned checkpoint:"
151
- # f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of"
152
  # " arbitrary length."
153
  # ),
154
- # allow_flagging="never",
155
  # )
156
 
 
 
 
157
 
158
- # demo = gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])
159
-
160
- # if __name__ == "__main__":
161
- # demo.launch()
162
-
163
-
164
 
165
  import spaces
166
  import torch
167
-
168
  import gradio as gr
169
- import yt_dlp as youtube_dl
170
  from pytubefix import YouTube
171
  from pytubefix.cli import on_progress
172
-
173
  from transformers import pipeline
174
  from transformers.pipelines.audio_utils import ffmpeg_read
175
-
176
  import tempfile
177
  import os
178
 
179
  MODEL_NAME = "razhan/whisper-base-hawrami-transcription"
180
  BATCH_SIZE = 1
181
- FILE_LIMIT_MB = 10
182
- YT_LENGTH_LIMIT_S = 60 * 10 # limit to 10-minute YouTube files (60 * 10 seconds)
183
 
184
  device = 0 if torch.cuda.is_available() else "cpu"
185
 
@@ -190,157 +206,83 @@ pipe = pipeline(
190
  device=device,
191
  )
192
 
193
-
194
- # @spaces.GPU
195
  def transcribe(inputs, task="transcribe"):
196
  if inputs is None:
197
- raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
198
-
199
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
200
- return text
201
 
 
 
202
 
203
  def _return_yt_html_embed(yt_url):
204
  video_id = yt_url.split("?v=")[-1]
205
- HTML_str = (
206
- f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
207
- " </center>"
208
- )
209
- return HTML_str
210
-
211
- # def download_yt_audio(yt_url, filename):
212
- # info_loader = youtube_dl.YoutubeDL()
213
-
214
- # try:
215
- # info = info_loader.extract_info(yt_url, download=False)
216
- # except youtube_dl.utils.DownloadError as err:
217
- # raise gr.Error(str(err))
218
-
219
- # file_length = info["duration_string"]
220
- # file_h_m_s = file_length.split(":")
221
- # file_h_m_s = [int(sub_length) for sub_length in file_h_m_s]
222
-
223
- # if len(file_h_m_s) == 1:
224
- # file_h_m_s.insert(0, 0)
225
- # if len(file_h_m_s) == 2:
226
- # file_h_m_s.insert(0, 0)
227
- # file_length_s = file_h_m_s[0] * 3600 + file_h_m_s[1] * 60 + file_h_m_s[2]
228
-
229
- # if file_length_s > YT_LENGTH_LIMIT_S:
230
- # yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
231
- # file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
232
- # raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")
233
-
234
- # ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
235
-
236
- # with youtube_dl.YoutubeDL(ydl_opts) as ydl:
237
- # try:
238
- # ydl.download([yt_url])
239
- # except youtube_dl.utils.ExtractorError as err:
240
- # raise gr.Error(str(err))
241
- # yt = pt.YouTube(yt_url)
242
- # stream = yt.streams.filter(only_audio=True)[0]
243
- # stream.download(filename=filename)
244
-
245
- # @spaces.GPU
246
- # def yt_transcribe(yt_url, task="transcribe", max_filesize=75.0):
247
- # html_embed_str = _return_yt_html_embed(yt_url)
248
-
249
- # with tempfile.TemporaryDirectory() as tmpdirname:
250
- # # filepath = os.path.join(tmpdirname, "video.mp4")
251
- # filepath = os.path.join(tmpdirname, "audio.mp3")
252
- # download_yt_audio(yt_url, filepath)
253
- # with open(filepath, "rb") as f:
254
- # inputs = f.read()
255
-
256
- # inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
257
- # inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
258
-
259
- # text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
260
-
261
- # return html_embed_str, text
262
-
263
 
264
- def yt_transcribe(yt_url, task="transcribe", progress=gr.Progress(), max_filesize=75.0):
265
  progress(0, desc="Loading audio file...")
266
- html_embed_str = _return_yt_html_embed(yt_url)
 
267
  try:
268
- # yt = pytube.YouTube(yt_url)
269
- # stream = yt.streams.filter(only_audio=True)[0]
270
- yt = YouTube(yt_url, on_progress_callback = on_progress, use_po_token=True)
271
-
272
  stream = yt.streams.get_audio_only()
273
-
274
- except:
275
- raise gr.Error("An error occurred while loading the YouTube video. Please try again.")
276
-
277
- if stream.filesize_mb > max_filesize:
278
- raise gr.Error(f"Maximum YouTube file size is {max_filesize}MB, got {stream.filesize_mb:.2f}MB.")
279
-
280
- # stream.download(filename="audio.mp3")
281
- stream.download(filename="audio.mp3", mp3=True)
282
 
283
- with open("audio.mp3", "rb") as f:
284
- inputs = f.read()
285
-
286
- inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
287
- inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
288
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
289
- return html_embed_str, text
290
 
 
 
 
 
 
291
 
292
  demo = gr.Blocks(theme=gr.themes.Ocean())
293
 
 
 
 
 
294
  mf_transcribe = gr.Interface(
295
  fn=transcribe,
296
  inputs=[
297
  gr.Audio(sources="microphone", type="filepath"),
298
- # gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
299
  ],
300
  outputs="text",
301
- title="Whisper Horami Demo: Transcribe Audio",
302
- description=(
303
- "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
304
- f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
305
- " of arbitrary length."
306
- ),
307
- flagging_mode="never",
308
  )
309
 
310
  file_transcribe = gr.Interface(
311
  fn=transcribe,
312
  inputs=[
313
  gr.Audio(sources="upload", type="filepath", label="Audio file"),
314
- # gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
315
  ],
316
  outputs="text",
317
- title="Whisper Horami Demo: Transcribe Audio",
318
- description=(
319
- "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
320
- f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
321
- " of arbitrary length."
322
- ),
323
- flagging_mode="never",
324
  )
325
 
326
- yt_transcribe = gr.Interface(
327
  fn=yt_transcribe,
328
  inputs=[
329
- gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
330
- # gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
331
  ],
332
  outputs=["html", "text"],
333
- title="Whisper Horami Demo: Translate YouTube",
334
- description=(
335
- "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
336
- f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
337
- " arbitrary length."
338
- ),
339
- flagging_mode="never",
340
  )
341
 
342
  with demo:
343
- # gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])
344
- gr.TabbedInterface([mf_transcribe, file_transcribe], ["Microphone", "Audio file"])
 
 
345
 
346
  demo.queue().launch(ssr_mode=False)
 
1
+
2
+
3
+
4
+ # import spaces
5
  # import torch
6
 
7
  # import gradio as gr
8
  # import yt_dlp as youtube_dl
9
+ # from pytubefix import YouTube
10
+ # from pytubefix.cli import on_progress
11
 
12
  # from transformers import pipeline
13
+ # from transformers.pipelines.audio_utils import ffmpeg_read
14
+
15
  # import tempfile
16
  # import os
17
 
18
+ # MODEL_NAME = "razhan/whisper-base-hawrami-transcription"
19
  # BATCH_SIZE = 1
20
  # FILE_LIMIT_MB = 10
21
+ # YT_LENGTH_LIMIT_S = 60 * 10 # limit to 10-minute YouTube files (60 * 10 seconds)
22
 
23
  # device = 0 if torch.cuda.is_available() else "cpu"
24
+
25
  # pipe = pipeline(
26
  # task="automatic-speech-recognition",
27
  # model=MODEL_NAME,
 
29
  # device=device,
30
  # )
31
 
 
 
 
 
 
 
 
 
 
32
 
33
+ # # @spaces.GPU
34
+ # def transcribe(inputs, task="transcribe"):
35
+ # if inputs is None:
36
+ # raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
37
 
38
+ # text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
39
+ # return text
 
 
 
40
 
41
 
42
  # def _return_yt_html_embed(yt_url):
43
+ # video_id = yt_url.split("?v=")[-1]
 
 
 
 
44
  # HTML_str = (
45
+ # f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
46
+ # " </center>"
 
47
  # )
48
  # return HTML_str
49
 
50
+ # # def download_yt_audio(yt_url, filename):
51
+ # # info_loader = youtube_dl.YoutubeDL()
52
+
53
+ # # try:
54
+ # # info = info_loader.extract_info(yt_url, download=False)
55
+ # # except youtube_dl.utils.DownloadError as err:
56
+ # # raise gr.Error(str(err))
57
+
58
+ # # file_length = info["duration_string"]
59
+ # # file_h_m_s = file_length.split(":")
60
+ # # file_h_m_s = [int(sub_length) for sub_length in file_h_m_s]
61
+
62
+ # # if len(file_h_m_s) == 1:
63
+ # # file_h_m_s.insert(0, 0)
64
+ # # if len(file_h_m_s) == 2:
65
+ # # file_h_m_s.insert(0, 0)
66
+ # # file_length_s = file_h_m_s[0] * 3600 + file_h_m_s[1] * 60 + file_h_m_s[2]
67
+
68
+ # # if file_length_s > YT_LENGTH_LIMIT_S:
69
+ # # yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
70
+ # # file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
71
+ # # raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")
72
+
73
+ # # ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
74
+
75
+ # # with youtube_dl.YoutubeDL(ydl_opts) as ydl:
76
+ # # try:
77
+ # # ydl.download([yt_url])
78
+ # # except youtube_dl.utils.ExtractorError as err:
79
+ # # raise gr.Error(str(err))
80
+ # # yt = pt.YouTube(yt_url)
81
+ # # stream = yt.streams.filter(only_audio=True)[0]
82
+ # # stream.download(filename=filename)
83
 
84
+ # # @spaces.GPU
85
+ # # def yt_transcribe(yt_url, task="transcribe", max_filesize=75.0):
86
+ # # html_embed_str = _return_yt_html_embed(yt_url)
87
 
88
+ # # with tempfile.TemporaryDirectory() as tmpdirname:
89
+ # # # filepath = os.path.join(tmpdirname, "video.mp4")
90
+ # # filepath = os.path.join(tmpdirname, "audio.mp3")
91
+ # # download_yt_audio(yt_url, filepath)
92
+ # # with open(filepath, "rb") as f:
93
+ # # inputs = f.read()
94
 
95
+ # # inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
96
+ # # inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
97
 
98
+ # # text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
 
99
 
100
+ # # return html_embed_str, text
 
 
 
 
101
 
 
 
102
 
103
+ # def yt_transcribe(yt_url, task="transcribe", progress=gr.Progress(), max_filesize=75.0):
104
+ # progress(0, desc="Loading audio file...")
105
+ # html_embed_str = _return_yt_html_embed(yt_url)
106
+ # try:
107
+ # # yt = pytube.YouTube(yt_url)
108
+ # # stream = yt.streams.filter(only_audio=True)[0]
109
+ # yt = YouTube(yt_url, on_progress_callback = on_progress, use_po_token=True)
110
+
111
+ # stream = yt.streams.get_audio_only()
112
+
113
+ # except:
114
+ # raise gr.Error("An error occurred while loading the YouTube video. Please try again.")
115
 
116
+ # if stream.filesize_mb > max_filesize:
117
+ # raise gr.Error(f"Maximum YouTube file size is {max_filesize}MB, got {stream.filesize_mb:.2f}MB.")
118
 
119
+ # # stream.download(filename="audio.mp3")
120
+ # stream.download(filename="audio.mp3", mp3=True)
 
 
 
 
121
 
122
+ # with open("audio.mp3", "rb") as f:
123
+ # inputs = f.read()
124
 
125
+ # inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
126
+ # inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
127
+ # text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
128
+ # return html_embed_str, text
 
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
+ # demo = gr.Blocks(theme=gr.themes.Ocean())
132
 
133
  # mf_transcribe = gr.Interface(
134
  # fn=transcribe,
135
  # inputs=[
136
  # gr.Audio(sources="microphone", type="filepath"),
137
+ # # gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
138
  # ],
139
  # outputs="text",
140
+ # title="Whisper Horami Demo: Transcribe Audio",
141
  # description=(
142
+ # "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
143
  # f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
144
  # " of arbitrary length."
145
  # ),
146
+ # flagging_mode="never",
147
+ # )
148
+
149
+ # file_transcribe = gr.Interface(
150
+ # fn=transcribe,
151
+ # inputs=[
152
+ # gr.Audio(sources="upload", type="filepath", label="Audio file"),
153
+ # # gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
154
+ # ],
155
+ # outputs="text",
156
+ # title="Whisper Horami Demo: Transcribe Audio",
157
+ # description=(
158
+ # "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
159
+ # f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
160
+ # " of arbitrary length."
161
+ # ),
162
+ # flagging_mode="never",
163
  # )
164
 
165
  # yt_transcribe = gr.Interface(
166
  # fn=yt_transcribe,
167
+ # inputs=[
168
+ # gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
169
+ # # gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
170
+ # ],
171
+ # outputs=["html", "text"],
172
+ # title="Whisper Horami Demo: Translate YouTube",
 
 
 
 
173
  # description=(
174
+ # "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
175
+ # f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
176
  # " arbitrary length."
177
  # ),
178
+ # flagging_mode="never",
179
  # )
180
 
181
+ # with demo:
182
+ # # gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])
183
+ # gr.TabbedInterface([mf_transcribe, file_transcribe], ["Microphone", "Audio file"])
184
 
185
+ # demo.queue().launch(ssr_mode=False)
 
 
 
 
 
186
 
187
  import spaces
188
  import torch
 
189
  import gradio as gr
 
190
  from pytubefix import YouTube
191
  from pytubefix.cli import on_progress
 
192
  from transformers import pipeline
193
  from transformers.pipelines.audio_utils import ffmpeg_read
 
194
  import tempfile
195
  import os
196
 
197
  MODEL_NAME = "razhan/whisper-base-hawrami-transcription"
198
  BATCH_SIZE = 1
 
 
199
 
200
  device = 0 if torch.cuda.is_available() else "cpu"
201
 
 
206
  device=device,
207
  )
208
 
 
 
209
  def transcribe(inputs, task="transcribe"):
210
  if inputs is None:
211
+ raise gr.Error("Please upload or record an audio file before submitting.")
 
 
 
212
 
213
+ result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
214
+ return result["text"]
215
 
216
  def _return_yt_html_embed(yt_url):
217
  video_id = yt_url.split("?v=")[-1]
218
+ return f'<center><iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"></iframe></center>'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
+ def yt_transcribe(yt_url, task="transcribe", progress=gr.Progress()):
221
  progress(0, desc="Loading audio file...")
222
+ html_embed = _return_yt_html_embed(yt_url)
223
+
224
  try:
225
+ yt = YouTube(yt_url, on_progress_callback=on_progress, use_po_token=True)
 
 
 
226
  stream = yt.streams.get_audio_only()
227
+ except Exception as e:
228
+ raise gr.Error(f"Error loading YouTube video: {str(e)}")
 
 
 
 
 
 
 
229
 
230
+ with tempfile.TemporaryDirectory() as tmpdir:
231
+ file_path = os.path.join(tmpdir, "audio.mp3")
232
+ stream.download(filename=file_path)
233
+
234
+ with open(file_path, "rb") as f:
235
+ audio_data = f.read()
 
236
 
237
+ audio = ffmpeg_read(audio_data, pipe.feature_extractor.sampling_rate)
238
+ inputs = {"array": audio, "sampling_rate": pipe.feature_extractor.sampling_rate}
239
+
240
+ result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
241
+ return html_embed, result["text"]
242
 
243
  demo = gr.Blocks(theme=gr.themes.Ocean())
244
 
245
+ common_inputs = [
246
+ gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
247
+ ]
248
+
249
  mf_transcribe = gr.Interface(
250
  fn=transcribe,
251
  inputs=[
252
  gr.Audio(sources="microphone", type="filepath"),
253
+ *common_inputs
254
  ],
255
  outputs="text",
256
+ title="Whisper Horami: Live Transcription",
257
+ description="Transcribe audio from your microphone in real-time"
 
 
 
 
 
258
  )
259
 
260
  file_transcribe = gr.Interface(
261
  fn=transcribe,
262
  inputs=[
263
  gr.Audio(sources="upload", type="filepath", label="Audio file"),
264
+ *common_inputs
265
  ],
266
  outputs="text",
267
+ title="Whisper Horami: File Transcription",
268
+ description="Upload an audio file for transcription"
 
 
 
 
 
269
  )
270
 
271
+ yt_interface = gr.Interface(
272
  fn=yt_transcribe,
273
  inputs=[
274
+ gr.Textbox(placeholder="YouTube URL", label="Video URL"),
275
+ *common_inputs
276
  ],
277
  outputs=["html", "text"],
278
+ title="Whisper Horami: YouTube Transcription",
279
+ description="Transcribe audio from YouTube videos"
 
 
 
 
 
280
  )
281
 
282
  with demo:
283
+ gr.TabbedInterface(
284
+ [mf_transcribe, file_transcribe],
285
+ ["Microphone", "Audio File",]
286
+ )
287
 
288
  demo.queue().launch(ssr_mode=False)