toolbox-tts / app.py
kennethli319's picture
update app
e6e581b
raw
history blame
1.49 kB
import gradio as gr
import torch
import torchaudio
import tempfile
import numpy as np
from nemo.collections.tts.models import FastPitchModel
from nemo.collections.tts.models import HifiGanModel
from nemo.collections.tts.models import MixerTTSModel
from transformers import pipeline
# spec_generator_2 = MixerTTSModel.from_pretrained("tts_en_lj_mixerttsx")
# model1 = HifiGanModel.from_pretrained(model_name="tts_en_lj_hifigan_ft_mixerttsx")
def greet(name: str) -> str:
    """Return a greeting of the form ``"Hello <name>!!"``.

    Args:
        name: Text placed between the greeting prefix and the trailing
            exclamation marks. It is concatenated as-is, so it must be a
            string.

    Returns:
        The assembled greeting string.
    """
    prefix = "Hello "
    suffix = "!!"
    return prefix + name + suffix
def run():
    """Load the NeMo TTS models and serve a Gradio text-to-speech demo.

    A FastPitch model turns the input text into a spectrogram and a HiFi-GAN
    model turns that spectrogram into audio. Both models are loaded once and
    shared by every request through the nested ``generate_tts`` closure.
    The Gradio app is then launched on ``0.0.0.0:7860``.
    """
    spec_generator = FastPitchModel.from_pretrained("tts_en_fastpitch_multispeaker")
    spec_generator.eval()
    voc_model = HifiGanModel.from_pretrained(model_name="tts_en_hifitts_hifigan_ft_fastpitch")
    voc_model.eval()
    # The "suno/bark-small" transformers pipeline that used to be created here
    # was never referenced, so it only cost startup time and memory.

    def generate_tts(text: str, speaker: int = 0):
        """Synthesize ``text`` and return it in Gradio's numpy audio format.

        Args:
            text: Sentence(s) to synthesize. Must contain non-whitespace
                characters.
            speaker: Speaker id forwarded to the spectrogram generator. It is
                coerced to ``int`` because UI sliders may deliver a float.

        Returns:
            A ``(sample_rate, samples)`` tuple, where ``samples`` is a numpy
            array, as accepted by ``gr.Audio(type="numpy")``.

        Raises:
            gr.Error: If ``text`` is empty or whitespace only.
        """
        if not text or not text.strip():
            # Surface a readable message in the UI instead of an opaque
            # failure from inside the model.
            raise gr.Error("Please enter some text to synthesize.")

        # NOTE(review): assumed to match the vocoder's output sample rate --
        # confirm against the model configuration.
        sr = 44100

        # eval() does not turn autograd off; run inference without gradient
        # tracking so no graph is built and the result converts to numpy.
        with torch.no_grad():
            parsed = spec_generator.parse(text)
            spectrogram = spec_generator.generate_spectrogram(
                tokens=parsed, speaker=int(speaker)
            )
            audio = voc_model.convert_spectrogram_to_audio(spec=spectrogram)

        # detach() is a no-op under no_grad but keeps the conversion safe
        # should the tensor ever carry gradient history.
        return (sr, audio.squeeze(0).detach().cpu().numpy())

    demo = gr.Interface(
        fn=generate_tts,
        inputs=[
            gr.Textbox(value="This is a test.", label="Text to Synthesize"),
            gr.Slider(0, 10, step=1, label="Speaker"),
        ],
        outputs=gr.Audio(label="Output", type="numpy"),
    )
    demo.launch(server_name="0.0.0.0", server_port=7860)
# Start the demo only when this file is executed as a script, so importing
# the module does not load models or open a server.
if __name__ == "__main__":
    run()