"""Gradio text-to-speech demo.

Loads NVIDIA NeMo FastPitch/HiFi-GAN models (currently unused — see
``generate_tts``) and a Hugging Face Bark pipeline, then serves a simple
Gradio interface that synthesizes speech from text.
"""

import tempfile

import gradio as gr
import numpy as np
import torch
import torchaudio
from nemo.collections.tts.models import FastPitchModel, HifiGanModel, MixerTTSModel
from transformers import pipeline

# NOTE(review): the original file executed `Audio(output["audio"],
# rate=output["sampling_rate"])` at module level. `Audio` (an IPython display
# helper) was never imported and `output` was undefined, so importing this
# module raised NameError immediately. That stray notebook line is removed.

# Alternative NeMo Mixer-TTS pipeline, kept for reference:
# spec_generator_2 = MixerTTSModel.from_pretrained("tts_en_lj_mixerttsx")
# model1 = HifiGanModel.from_pretrained(model_name="tts_en_lj_hifigan_ft_mixerttsx")

# NeMo FastPitch spectrogram generator + HiFi-GAN vocoder. Loaded and put in
# eval mode at import time, but the active synthesis path in generate_tts
# currently uses the Bark pipeline below instead.
spec_generator = FastPitchModel.from_pretrained("tts_en_fastpitch_multispeaker")
spec_generator.eval()
voc_model = HifiGanModel.from_pretrained(model_name="tts_en_hifitts_hifigan_ft_fastpitch")
voc_model.eval()

# Hugging Face Bark TTS pipeline — the active synthesis backend.
pipe = pipeline("text-to-speech", model="suno/bark-small")


def greet(name):
    """Return a friendly greeting for *name* (demo helper, unused by the UI)."""
    return "Hello " + name + "!!"


def generate_tts(text: str, speaker: int = 0):
    """Synthesize *text* to speech with the Bark pipeline.

    Args:
        text: Text to synthesize.
        speaker: Speaker index; only consumed by the commented-out NeMo path
            below, kept in the signature so the Gradio slider still binds.

    Returns:
        A ``(sampling_rate, audio)`` tuple of an int and a 1-D numpy array,
        the format expected by a ``gr.Audio(type="numpy")`` output component.
    """
    # NeMo FastPitch + HiFi-GAN path, kept for reference:
    # parsed = spec_generator.parse(text)
    # spectrogram = spec_generator.generate_spectrogram(tokens=parsed, speaker=speaker)
    # audio = voc_model.convert_spectrogram_to_audio(spec=spectrogram)
    # with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
    #     torchaudio.save(fp.name, audio.to('cpu'), sample_rate=44100)
    #     return fp.name
    output = pipe(text)
    # Bark may return audio with a leading channel axis — TODO confirm against
    # the installed transformers version. squeeze() is a no-op on 1-D arrays,
    # so this is safe either way and keeps Gradio's numpy audio input happy.
    return (output["sampling_rate"], np.squeeze(output["audio"]))


def run():
    """Build and launch the Gradio interface on 0.0.0.0:7860."""
    demo = gr.Interface(
        fn=generate_tts,
        inputs=[
            gr.Textbox(value="This is a test.", label="Text to Synthesize"),
            gr.Slider(0, 10, step=1, label="Speaker"),
        ],
        outputs=gr.Audio(label="Output", type="numpy"),
    )
    demo.launch(server_name="0.0.0.0", server_port=7860)


if __name__ == "__main__":
    run()