reallynicejam committed on
Commit
9e18d98
·
verified ·
1 Parent(s): 5dc63c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -5
app.py CHANGED
@@ -1,6 +1,5 @@
1
- pip install torchaudio
2
  import gradio as gr
3
- import torchaudio
4
  import IPython.display as ipd
5
  from pathlib import Path
6
  from fairseq import hub_utils
@@ -10,6 +9,7 @@ from fairseq.models.text_to_speech import CodeHiFiGANVocoder
10
  from fairseq.models.text_to_speech.hub_interface import VocoderHubInterface
11
  from huggingface_hub import snapshot_download
12
  import json
 
13
 
14
  # Load speech-to-text model
15
  models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
@@ -47,7 +47,13 @@ vocoder = CodeHiFiGANVocoder(x["args"]["model_path"][0], vocoder_cfg)
47
  tts_model = VocoderHubInterface(vocoder_cfg, vocoder)
48
 
49
 
50
- def transcribe_and_synthesize(audio):
 
 
 
 
 
 
51
  # Speech-to-Text
52
  sample = S2THubInterface.get_model_input(task, audio)
53
  unit = S2THubInterface.get_prediction(task, models[0], generator, sample)
@@ -60,6 +66,5 @@ def transcribe_and_synthesize(audio):
60
 
61
 
62
  # Gradio Interface
63
- audio_input = gr.Audio(preprocessing="raw", type="microphone", label="Record your audio")
64
- iface = gr.Interface(fn=transcribe_and_synthesize, inputs=audio_input, outputs="audio")
65
  iface.launch()
 
 
1
  import gradio as gr
2
+ import numpy as np
3
  import IPython.display as ipd
4
  from pathlib import Path
5
  from fairseq import hub_utils
 
9
  from fairseq.models.text_to_speech.hub_interface import VocoderHubInterface
10
  from huggingface_hub import snapshot_download
11
  import json
12
+ import sounddevice as sd
13
 
14
  # Load speech-to-text model
15
  models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
 
47
  tts_model = VocoderHubInterface(vocoder_cfg, vocoder)
48
 
49
 
50
+ def record_and_transcribe_synthesize():
51
+ # Record audio using sounddevice
52
+ sr = 16000 # Sample rate
53
+ duration = 5 # Recording duration in seconds
54
+ audio = sd.rec(int(sr * duration), samplerate=sr, channels=1, dtype=np.int16)
55
+ sd.wait()
56
+
57
  # Speech-to-Text
58
  sample = S2THubInterface.get_model_input(task, audio)
59
  unit = S2THubInterface.get_prediction(task, models[0], generator, sample)
 
66
 
67
 
68
  # Gradio Interface
69
+ iface = gr.Interface(fn=record_and_transcribe_synthesize, inputs=None, outputs="audio")
 
70
  iface.launch()