rafaaa2105 committed
Commit 9db60e1 · verified · 1 Parent(s): 92ea310

Update app.py

Files changed (1):
app.py +83 -13
app.py CHANGED
@@ -1,40 +1,91 @@
 import spaces
 import torch
-
 import gradio as gr
 import yt_dlp as youtube_dl
 from transformers import pipeline
 from transformers.pipelines.audio_utils import ffmpeg_read
-
 import tempfile
 import os
+import time
 
+# Environment and model configuration
 hf_token = os.getenv('HF_TOKEN')
 MODEL_NAME = "nyrahealth/CrisperWhisper"
 BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
 YT_LENGTH_LIMIT_S = 3600 # limit to 1 hour YouTube files
 
+# Device setup
 device = 0 if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+# Timestamp adjustment function
+def adjust_pauses_for_hf_pipeline_output(pipeline_output, split_threshold=0.12):
+    """
+    Adjust pause timings by distributing pauses up to the threshold evenly between adjacent words.
+    """
+    adjusted_chunks = pipeline_output["chunks"].copy()
+
+    for i in range(len(adjusted_chunks) - 1):
+        current_chunk = adjusted_chunks[i]
+        next_chunk = adjusted_chunks[i + 1]
+
+        current_start, current_end = current_chunk["timestamp"]
+        next_start, next_end = next_chunk["timestamp"]
+        pause_duration = next_start - current_end
+
+        if pause_duration > 0:
+            if pause_duration > split_threshold:
+                distribute = split_threshold / 2
+            else:
+                distribute = pause_duration / 2
+
+            # Adjust current chunk end time
+            adjusted_chunks[i]["timestamp"] = (current_start, current_end + distribute)
+            # Adjust next chunk start time
+            adjusted_chunks[i + 1]["timestamp"] = (next_start - distribute, next_end)
+
+    pipeline_output["chunks"] = adjusted_chunks
+    return pipeline_output
 
+# Initialize pipeline
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=MODEL_NAME,
     token=hf_token,
+    torch_dtype=torch_dtype,
     chunk_length_s=30,
     device=device,
+    return_timestamps='word', # Enable word-level timestamps
 )
 
-
+# Transcribe function for microphone and file inputs
 @spaces.GPU
 def transcribe(inputs, task):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
+
+    # Get full pipeline output
+    raw_output = pipe(
+        inputs,
+        batch_size=BATCH_SIZE,
+        generate_kwargs={"task": task},
+        return_timestamps='word'
+    )
+
+    # Apply timestamp adjustment
+    adjusted_output = adjust_pauses_for_hf_pipeline_output(raw_output)
+
+    # Format output with timestamps
+    formatted_text = ""
+    for chunk in adjusted_output["chunks"]:
+        start = chunk["timestamp"][0]
+        text = chunk["text"]
+        formatted_text += f"[{start:.2f}] {text}\n"
+
+    return formatted_text
 
-    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
-    return text
-
-
+# YouTube HTML embed function
 def _return_yt_html_embed(yt_url):
     video_id = yt_url.split("?v=")[-1]
     HTML_str = (
@@ -43,6 +94,7 @@ def _return_yt_html_embed(yt_url):
     )
     return HTML_str
 
+# YouTube audio download function
 def download_yt_audio(yt_url, filename):
     info_loader = youtube_dl.YoutubeDL()
 
@@ -74,6 +126,7 @@ def download_yt_audio(yt_url, filename):
     except youtube_dl.utils.ExtractorError as err:
         raise gr.Error(str(err))
 
+# Transcribe function for YouTube inputs
 @spaces.GPU
 def yt_transcribe(yt_url, task, max_filesize=75.0):
     html_embed_str = _return_yt_html_embed(yt_url)
@@ -87,11 +140,27 @@ def yt_transcribe(yt_url, task, max_filesize=75.0):
     inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
 
-    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
-
-    return html_embed_str, text
-
+    # Get full pipeline output
+    raw_output = pipe(
+        inputs,
+        batch_size=BATCH_SIZE,
+        generate_kwargs={"task": task},
+        return_timestamps='word'
+    )
+
+    # Apply timestamp adjustment
+    adjusted_output = adjust_pauses_for_hf_pipeline_output(raw_output)
+
+    # Format output with timestamps
+    formatted_text = ""
+    for chunk in adjusted_output["chunks"]:
+        start = chunk["timestamp"][0]
+        text = chunk["text"]
+        formatted_text += f"[{start:.2f}] {text}\n"
+
+    return html_embed_str, formatted_text
 
+# Gradio interface setup
 demo = gr.Blocks()
 
 mf_transcribe = gr.Interface(
@@ -142,8 +211,9 @@ yt_transcribe = gr.Interface(
     allow_flagging="never",
 )
 
+# Combine interfaces into a tabbed layout
 with demo:
     gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])
 
-demo.queue().launch()
-
+# Launch the app
+demo.queue().launch()
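
For reference, a minimal sketch (not part of the commit) of how the pause adjustment and the new formatting behave. It assumes adjust_pauses_for_hf_pipeline_output from the diff above is in scope, and uses a hand-made chunk list in the {"text": ..., "chunks": [...]} shape that the transformers ASR pipeline returns with return_timestamps='word'; the words and timings are illustrative only.

# Hypothetical word-level pipeline output (shape only; values are made up).
sample_output = {
    "text": " I think it works",
    "chunks": [
        {"text": " I",     "timestamp": (0.00, 0.12)},
        {"text": " think", "timestamp": (0.30, 0.55)},  # 0.18 s pause before this word
        {"text": " it",    "timestamp": (0.60, 0.66)},
        {"text": " works", "timestamp": (0.70, 1.10)},
    ],
}

# Helper added in this commit: pauses longer than split_threshold are only
# partially redistributed; shorter pauses are split evenly between neighbors.
adjusted = adjust_pauses_for_hf_pipeline_output(sample_output, split_threshold=0.12)

# Same formatting loop as the new transcribe()/yt_transcribe() bodies.
formatted_text = ""
for chunk in adjusted["chunks"]:
    start = chunk["timestamp"][0]
    formatted_text += f"[{start:.2f}] {chunk['text']}\n"

print(formatted_text)
# Prints one "[start] word" line per chunk, with starts shifted by the pause split, e.g.:
# [0.00]  I
# [0.24]  think
# ...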