reallynicejam committed on
Commit
9e18d98
·
verified ·
1 Parent(s): 5dc63c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -5
app.py CHANGED
@@ -1,6 +1,5 @@
1
- pip install torchaudio
2
  import gradio as gr
3
- import torchaudio
4
  import IPython.display as ipd
5
  from pathlib import Path
6
  from fairseq import hub_utils
@@ -10,6 +9,7 @@ from fairseq.models.text_to_speech import CodeHiFiGANVocoder
10
  from fairseq.models.text_to_speech.hub_interface import VocoderHubInterface
11
  from huggingface_hub import snapshot_download
12
  import json
 
13
 
14
  # Load speech-to-text model
15
  models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
@@ -47,7 +47,13 @@ vocoder = CodeHiFiGANVocoder(x["args"]["model_path"][0], vocoder_cfg)
47
  tts_model = VocoderHubInterface(vocoder_cfg, vocoder)
48
 
49
 
50
- def transcribe_and_synthesize(audio):
 
 
 
 
 
 
51
  # Speech-to-Text
52
  sample = S2THubInterface.get_model_input(task, audio)
53
  unit = S2THubInterface.get_prediction(task, models[0], generator, sample)
@@ -60,6 +66,5 @@ def transcribe_and_synthesize(audio):
60
 
61
 
62
  # Gradio Interface
63
- audio_input = gr.Audio(preprocessing="raw", type="microphone", label="Record your audio")
64
- iface = gr.Interface(fn=transcribe_and_synthesize, inputs=audio_input, outputs="audio")
65
  iface.launch()
 
 
1
  import gradio as gr
2
+ import numpy as np
3
  import IPython.display as ipd
4
  from pathlib import Path
5
  from fairseq import hub_utils
 
9
  from fairseq.models.text_to_speech.hub_interface import VocoderHubInterface
10
  from huggingface_hub import snapshot_download
11
  import json
12
+ import sounddevice as sd
13
 
14
  # Load speech-to-text model
15
  models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
 
47
  tts_model = VocoderHubInterface(vocoder_cfg, vocoder)
48
 
49
 
50
+ def record_and_transcribe_synthesize():
51
+ # Record audio using sounddevice
52
+ sr = 16000 # Sample rate
53
+ duration = 5 # Recording duration in seconds
54
+ audio = sd.rec(int(sr * duration), samplerate=sr, channels=1, dtype=np.int16)
55
+ sd.wait()
56
+
57
  # Speech-to-Text
58
  sample = S2THubInterface.get_model_input(task, audio)
59
  unit = S2THubInterface.get_prediction(task, models[0], generator, sample)
 
66
 
67
 
68
  # Gradio Interface
69
+ iface = gr.Interface(fn=record_and_transcribe_synthesize, inputs=None, outputs="audio")
 
70
  iface.launch()