# regional-tts / app.py
# argodinho's picture
# updated app.py
# 2e8f85a verified
import os
import torch
import time
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf
from google.generativeai import GenerativeModel, configure
import gradio as gr
# Initialize with prints
print("⚡ Initializing models...")
start_load = time.time()

# Pick the torch device once so the status line and the model placement
# below always agree.
device = "cuda" if torch.cuda.is_available() else "cpu"

# 1. Load Gemini
GEMINI_KEY = os.environ.get('GEMINI_API_KEY')
if not GEMINI_KEY:
    # Keep starting up so the UI still comes up, but surface the missing
    # variable now instead of only at the first translation request.
    print(" ⚠️ GEMINI_API_KEY is not set; Gemini requests may fail unless another credential is configured")
configure(api_key=GEMINI_KEY)
gemini = GenerativeModel('gemini-2.0-flash')
print(f" ✅ Gemini loaded (device: {'GPU' if device == 'cuda' else 'CPU'})")

# 2. Load Indic-TTS
tts_model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts").to(device)
# Tokenizer for the text that will be spoken (passed as the prompt).
tts_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
# Tokenizer for the voice description, loaded from the path recorded in the
# model's text-encoder config.
desc_tokenizer = AutoTokenizer.from_pretrained(tts_model.config.text_encoder._name_or_path)
# The elapsed time is measured from start_load, so it also covers the
# Gemini setup above.
print(f" ✅ Indic-TTS loaded in {time.time() - start_load:.2f}s\n")
def hinglish_to_devnagri(text):
    """Ask Gemini to rewrite ``text`` as colloquial Hindi in Devanagari script.

    Args:
        text: The sentence entered by the user.

    Returns:
        The text of Gemini's reply with surrounding whitespace removed.

    Raises:
        gr.Error: If ``text`` is empty or whitespace-only, if the Gemini call
            (or reading its reply) fails, or if the reply contains no text.
    """
    # Reject blank input up front: there is nothing to convert, and the guard
    # sits outside the try block so it is not re-wrapped as a "Gemini error".
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert.")

    try:
        print(f"🔠 Converting Hinglish to Devnagri: '{text[:30]}...'")
        start = time.time()
        # NOTE(review): the prompt says "English sentence" while the UI label
        # asks for Hinglish text - confirm the intended wording.
        response = gemini.generate_content(
            f"""
Convert the following English sentence to Hindi.
Rules:
- The translation must be in Devanagari script.
- It should sound like what a normal Hindi speaker would say in daily life.
- Avoid formal or highly Sanskritized words.
- Keep proper nouns and technical/educational terms in English if commonly used that way (like 'if condition', 'laptop', etc.)
- Focus on fluency and ease of understanding.
- Remove the code if you find them in backticks ```.
Sentence:
"{text}"
Output ONLY the Hindi translation in Devanagari script.
"""
        )
        print(f" ✓ Translation done in {time.time() - start:.2f}s")
        # Strip so stray leading/trailing newlines from the model do not end
        # up in the TTS prompt or the output textbox.
        hindi_text = response.text.strip()
    except Exception as e:
        print(f"❌ Gemini error: {str(e)}")
        # Chain the cause so the original traceback stays visible in logs.
        raise gr.Error(f"Gemini error: {str(e)}") from e

    if not hindi_text:
        print("❌ Gemini error: empty response")
        raise gr.Error("Gemini error: empty response")
    return hindi_text
def generate_speech(text):
    """Run the whole pipeline for one request: convert, synthesize, save.

    Args:
        text: The sentence entered by the user.

    Returns:
        A ``(wav_path, hindi_text)`` tuple: the path of the WAV file that was
        written and the converted text returned by ``hinglish_to_devnagri``.
    """
    separator = "=" * 50
    print("\n" + separator)
    print("🎤 Starting Hinglish-to-Speech pipeline")

    # Step 1: text conversion
    hindi_text = hinglish_to_devnagri(text)
    print(f" Hindi text: {hindi_text[:50]}...")

    # Step 2: audio generation
    print("\n🔊 Generating audio...")
    audio_started = time.time()
    # Voice description; it is fed to the model as input_ids, while the text
    # to speak is fed as the prompt.
    voice_description = "एक महिला वक्ता हिंदी में बोल रही हैं"
    description_batch = desc_tokenizer(voice_description, return_tensors="pt").to(device)
    prompt_batch = tts_tokenizer(hindi_text, return_tensors="pt").to(device)
    waveform = tts_model.generate(
        input_ids=description_batch.input_ids,
        attention_mask=description_batch.attention_mask,
        prompt_input_ids=prompt_batch.input_ids,
        prompt_attention_mask=prompt_batch.attention_mask,
    )

    # Step 3: save the output
    wav_path = "output.wav"
    samples = waveform.cpu().numpy().squeeze()
    sf.write(wav_path, samples, tts_model.config.sampling_rate)
    print(f"\n💾 Audio generated in {time.time() - audio_started:.2f}s")
    print(separator + "\n")
    return wav_path, hindi_text
# Gradio UI
# NOTE(review): the nesting below was reconstructed from a flattened copy of
# this file - confirm which components belong inside each row.
with gr.Blocks() as app:
    gr.Markdown("## 🚀 Hinglish-to-Speech (Gemini + Indic-TTS)")

    # Input row: the text box and the trigger button.
    with gr.Row():
        inp = gr.Textbox(
            label="Enter Hinglish Text",
            placeholder="Aaj mood nahi hai...",
        )
        btn = gr.Button("Generate")

    # Output row: the synthesized audio next to the converted text.
    with gr.Row():
        audio_out = gr.Audio(label="Speech Output")
        text_out = gr.Textbox(label="Devnagri Translation")

    # generate_speech returns (wav path, text), matching the two outputs.
    btn.click(generate_speech, inputs=[inp], outputs=[audio_out, text_out])

print("\n🚀 App ready! Waiting for input...")
app.launch()