kostissz committed on
Commit
41deb18
·
verified ·
1 Parent(s): b7a92bd

Fix mic audio input

Browse files
Files changed (1) hide show
  1. app.py +24 -5
app.py CHANGED
@@ -1,10 +1,12 @@
1
  import csv
2
- from pathlib import Path
 
3
  from typing import Tuple
4
 
5
  import gradio as gr
6
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
7
  from whisper_bidec import decode_wav, get_logits_processor, load_corpus_from_sentences
 
8
 
9
 
10
  def _parse_file(file_path: str) -> list[str]:
@@ -22,9 +24,22 @@ def _parse_file(file_path: str) -> list[str]:
22
  return sentences
23
 
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  def transcribe(
26
  processor_name: str,
27
- audio: str,
28
  bias_strength: float,
29
  bias_text: str | None,
30
  bias_text_file: str | None,
@@ -36,21 +51,25 @@ def transcribe(
36
 
37
  if bias_text:
38
  sentences = bias_text.split(",")
39
- elif Path(bias_text_file).is_file():
40
  sentences = _parse_file(bias_text_file)
41
 
 
 
42
  if sentences:
43
  corpus = load_corpus_from_sentences(sentences, processor)
44
  logits_processor = get_logits_processor(
45
  corpus=corpus, processor=processor, bias_towards_lm=bias_strength
46
  )
47
  text_with_bias = decode_wav(
48
- model, processor, audio, logits_processor=logits_processor
49
  )
50
  else:
51
  text_with_bias = ""
52
 
53
- text_no_bias = decode_wav(model, processor, audio, logits_processor=None)
 
 
54
 
55
  return text_no_bias, text_with_bias
56
 
 
1
  import csv
2
+ import os
3
+ import tempfile
4
  from typing import Tuple
5
 
6
  import gradio as gr
7
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
8
  from whisper_bidec import decode_wav, get_logits_processor, load_corpus_from_sentences
9
+ from pydub import AudioSegment
10
 
11
 
12
  def _parse_file(file_path: str) -> list[str]:
 
24
  return sentences
25
 
26
 
27
def _convert_audio(input_audio_path: str) -> str:
    """Convert an audio file to a 16 kHz mono WAV file and return its path.

    The Whisper decoder expects wav files with a 16kHz sample rate and a
    single (mono) channel. The input is decoded, converted to that format,
    and written to a freshly created temporary ``.wav`` file.

    Args:
        input_audio_path: Path of the audio file to convert.

    Returns:
        Path of the temporary WAV file. Nothing in this function deletes
        it, so the caller is responsible for removing it when done.

    Raises:
        Exception: Whatever ``AudioSegment.from_file`` or ``export`` raise
            is propagated unchanged; no temporary file is left behind in
            either case.
    """
    # Decode and resample before creating the temp file, so an unreadable
    # input cannot leave an orphaned file on disk.
    audio = AudioSegment.from_file(input_audio_path)
    audio = audio.set_channels(1).set_frame_rate(16000)

    fd, tmp_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # Only the path is needed; export() is given the path.

    try:
        audio.export(tmp_path, format="wav")
    except Exception:
        # Export failed: remove the file instead of leaking it, then
        # let the original error reach the caller.
        os.remove(tmp_path)
        raise
    return tmp_path
38
+
39
+
40
  def transcribe(
41
  processor_name: str,
42
+ audio_path: str,
43
  bias_strength: float,
44
  bias_text: str | None,
45
  bias_text_file: str | None,
 
51
 
52
  if bias_text:
53
  sentences = bias_text.split(",")
54
+ elif bias_text_file:
55
  sentences = _parse_file(bias_text_file)
56
 
57
+ converted_audio_path = _convert_audio(audio_path)
58
+
59
  if sentences:
60
  corpus = load_corpus_from_sentences(sentences, processor)
61
  logits_processor = get_logits_processor(
62
  corpus=corpus, processor=processor, bias_towards_lm=bias_strength
63
  )
64
  text_with_bias = decode_wav(
65
+ model, processor, converted_audio_path, logits_processor=logits_processor
66
  )
67
  else:
68
  text_with_bias = ""
69
 
70
+ text_no_bias = decode_wav(
71
+ model, processor, converted_audio_path, logits_processor=None
72
+ )
73
 
74
  return text_no_bias, text_with_bias
75