"""Gradio demo: English text-to-speech with the facebook/mms-tts-eng VITS model."""

from transformers import VitsModel, AutoTokenizer
import torch
import gradio as gr
import spaces
import numpy as np

# Hub id shared by the model weights and the matching tokenizer.
MODEL_ID = "facebook/mms-tts-eng"

# Fall back to CPU so the app still starts on machines without CUDA instead
# of failing in `.to(device)` below.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

model = VitsModel.from_pretrained(MODEL_ID).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)


@spaces.GPU
def process_text(text: str):
    """Synthesize speech for *text* with the loaded VITS model.

    Args:
        text: The sentence(s) to convert to speech.

    Returns:
        A ``(sample_rate, audio)`` tuple, the format accepted by
        ``gr.Audio(type="numpy")``. ``sample_rate`` is read from the model
        config and ``audio`` is the generated waveform as a NumPy array with
        singleton dimensions removed.

    Raises:
        gr.Error: If *text* is empty or contains only whitespace.
    """
    # Reject blank input up front: there is nothing to synthesize, and a
    # clear UI message is better than an opaque tokenizer/model failure.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    inputs = tokenizer(text, return_tensors="pt").to(device)

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        output = model(**inputs).waveform

    # Move to host memory and drop the singleton dimension(s) before handing
    # the array to Gradio.
    audio_numpy = output.cpu().numpy().squeeze()
    sample_rate = model.config.sampling_rate
    return (sample_rate, audio_numpy)


# One-element rows: each example fills the single `input_text` component.
examples = [
    ["Hello, welcome to text-to-speech system!"],
    ["How amazing is artificial intelligence technology?"],
    ["The weather is beautiful today, isn't it?"],
    ["Learning new things makes life exciting."],
    ["This audio was generated by artificial intelligence."],
]

with gr.Blocks() as demo:
    gr.Markdown("## 🎤 MMS-TTS English Text-to-Speech System")
    gr.Markdown("Enter text below and convert it to natural sounding speech!")

    with gr.Row():
        # Left column: text entry, clickable examples and the trigger button.
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter text here...",
            )
            gr.Examples(
                examples=examples,
                inputs=input_text,
                label="Example Texts",
            )
            submit_btn = gr.Button("Generate Speech")

        # Right column: player for the synthesized audio.
        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech", type="numpy")

    submit_btn.click(fn=process_text, inputs=input_text, outputs=audio_output)

# Launch only when executed as a script so that importing this module (for
# reuse or inspection of `demo`) does not start a web server.
if __name__ == "__main__":
    demo.launch()