ankandrew committed on
Commit
bef4e11
·
verified ·
1 Parent(s): b8e512a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -1
app.py CHANGED
@@ -15,6 +15,7 @@ import whisper
15
 
16
  YT_AUDIO_FORMAT = "bestaudio[ext=m4a]"
17
 
 
18
  MODEL_SIZES = ["tiny", "base", "small", "medium", "large", "turbo"]
19
  for size in MODEL_SIZES:
20
  whisper.load_model(size, device="cpu")
@@ -73,6 +74,9 @@ def transcribe_audio(
73
  youtube_url: str,
74
  return_timestamps: bool,
75
  temperature: float,
 
 
 
76
  ):
77
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
78
  results = []
@@ -84,6 +88,9 @@ def transcribe_audio(
84
  word_timestamps=return_timestamps,
85
  temperature=temperature,
86
  verbose=False,
 
 
 
87
  )
88
  text = out["text"].strip()
89
  segments = out["segments"] if return_timestamps else []
@@ -129,6 +136,28 @@ def build_demo() -> gr.Blocks:
129
  step=0.01,
130
  )
131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  audio_input = gr.Audio(
133
  label="Upload or record audio",
134
  sources=["upload"],
@@ -151,7 +180,16 @@ def build_demo() -> gr.Blocks:
151
 
152
  transcribe_btn.click(
153
  transcribe_audio,
154
- inputs=[model_choices, audio_input, yt_input, ts_checkbox, temp_slider],
 
 
 
 
 
 
 
 
 
155
  outputs=[out_table],
156
  )
157
 
 
15
 
16
  YT_AUDIO_FORMAT = "bestaudio[ext=m4a]"
17
 
18
+
19
  MODEL_SIZES = ["tiny", "base", "small", "medium", "large", "turbo"]
20
  for size in MODEL_SIZES:
21
  whisper.load_model(size, device="cpu")
 
74
  youtube_url: str,
75
  return_timestamps: bool,
76
  temperature: float,
77
+ logprob_threshold: float = -1.0,
78
+ no_speech_threshold: float = 0.6,
79
+ compression_ratio_threshold: float = 2.4,
80
  ):
81
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
82
  results = []
 
88
  word_timestamps=return_timestamps,
89
  temperature=temperature,
90
  verbose=False,
91
+ logprob_threshold=logprob_threshold,
92
+ no_speech_threshold=no_speech_threshold,
93
+ compression_ratio_threshold=compression_ratio_threshold,
94
  )
95
  text = out["text"].strip()
96
  segments = out["segments"] if return_timestamps else []
 
136
  step=0.01,
137
  )
138
 
139
+ logprob_slider = gr.Slider(
140
+ label="Average log-probability threshold",
141
+ minimum=-10.0,
142
+ maximum=0.0,
143
+ value=-1.0,
144
+ step=0.1,
145
+ )
146
+ no_speech_slider = gr.Slider(
147
+ label="No-speech probability threshold",
148
+ minimum=0.0,
149
+ maximum=1.0,
150
+ value=0.6,
151
+ step=0.01,
152
+ )
153
+ compression_slider = gr.Slider(
154
+ label="Compression ratio threshold",
155
+ minimum=1.0,
156
+ maximum=5.0,
157
+ value=2.4,
158
+ step=0.1,
159
+ )
160
+
161
  audio_input = gr.Audio(
162
  label="Upload or record audio",
163
  sources=["upload"],
 
180
 
181
  transcribe_btn.click(
182
  transcribe_audio,
183
+ inputs=[
184
+ model_choices,
185
+ audio_input,
186
+ yt_input,
187
+ ts_checkbox,
188
+ temp_slider,
189
+ logprob_slider,
190
+ no_speech_slider,
191
+ compression_slider,
192
+ ],
193
  outputs=[out_table],
194
  )
195