from transformers import VitsModel, AutoTokenizer
import torch
import gradio as gr
import spaces
import numpy as np

# Pick the GPU when torch can see one and fall back to the CPU otherwise, so
# the app still starts on a host without CUDA instead of failing in `.to()`.
# NOTE(review): on ZeroGPU Spaces the `spaces` package is expected to make
# CUDA appear available at import time - confirm against the deployment.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the checkpoint and its tokenizer once at import time so every request
# handled by `process_text` reuses the same objects.
model = VitsModel.from_pretrained("facebook/mms-tts-eng").to(device)
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")

@spaces.GPU
def process_text(text):
    """Synthesize speech for *text* with the module-level TTS model.

    Args:
        text: The text to speak, as received from the Gradio textbox.

    Returns:
        A ``(sample_rate, waveform)`` tuple. ``sample_rate`` is read from
        ``model.config.sampling_rate``; ``waveform`` is the model output
        moved to the CPU, converted to a NumPy array and squeezed of
        size-1 dimensions.

    Raises:
        gr.Error: If *text* is empty or contains only whitespace.
    """
    # Reject blank prompts up front: there is nothing to synthesize, and a
    # clear message is more useful to the user than whatever the model does
    # with an empty token sequence.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert to speech.")

    inputs = tokenizer(text, return_tensors="pt").to(device)

    # Inference only - skip gradient tracking.
    with torch.no_grad():
        output = model(**inputs).waveform

    audio_numpy = output.cpu().numpy().squeeze()
    sample_rate = model.config.sampling_rate
    return (sample_rate, audio_numpy)

# Sample prompts offered in the UI. Each sentence is wrapped in its own
# one-element list because the examples widget is bound to a single input.
examples = [
    [sentence]
    for sentence in (
        "Hello, welcome to text-to-speech system!",
        "How amazing is artificial intelligence technology?",
        "The weather is beautiful today, isn't it?",
        "Learning new things makes life exciting.",
        "This audio was generated by artificial intelligence.",
    )
]

# Page layout: a heading and a short description, then one row with two
# columns - text entry, example prompts and the trigger button in the first,
# the audio player in the second.
with gr.Blocks() as demo:
    gr.Markdown("## 🎤 MMS-TTS English Text-to-Speech System")
    gr.Markdown("Enter text below and convert it to natural sounding speech!")

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                placeholder="Enter text here...",
                label="Input Text",
            )
            # Selecting an example fills the textbox above.
            gr.Examples(examples, input_text, label="Example Texts")
            submit_btn = gr.Button(value="Generate Speech")

        with gr.Column():
            audio_output = gr.Audio(type="numpy", label="Generated Speech")

    # Clicking the button sends the textbox content through process_text and
    # routes its return value to the audio component.
    submit_btn.click(process_text, input_text, audio_output)

demo.launch()