import os
import time

import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf
from google.generativeai import GenerativeModel, configure
import gradio as gr

# Hugging Face identifier of the TTS checkpoint (used for model and tokenizer).
TTS_MODEL_ID = "ai4bharat/indic-parler-tts"
# File the generated waveform is written to on every request.
OUTPUT_WAV_PATH = "output.wav"
# Voice description passed to the TTS model alongside the text to be spoken.
VOICE_DESCRIPTION = "एक महिला वक्ता हिंदी में बोल रही हैं"

# Prompt sent to Gemini. Kept as a module-level template (filled in with
# str.format) so that its layout does not depend on code indentation.
_TRANSLATION_PROMPT = """\
Convert the following English sentence to Hindi.

Rules:
- The translation must be in Devanagari script.
- It should sound like what a normal Hindi speaker would say in daily life.
- Avoid formal or highly Sanskritized words.
- Keep proper nouns and technical/educational terms in English if commonly used that way (like 'if condition', 'laptop', etc.)
- Focus on fluency and ease of understanding.
- Remove the code if you find them in backticks ```.

Sentence: "{text}"

Output ONLY the Hindi translation in Devanagari script.
"""

# Initialize with prints
print("⚡ Initializing models...")
start_load = time.time()

# 1. Load Gemini
GEMINI_KEY = os.environ.get('GEMINI_API_KEY')
if not GEMINI_KEY:
    # Warn instead of aborting: configure() is still called below so start-up
    # behaviour is unchanged.
    # NOTE(review): the client library may pick up credentials from another
    # source when api_key is None -- confirm before turning this into an error.
    print("⚠️ GEMINI_API_KEY is not set; Gemini requests may fail.")
configure(api_key=GEMINI_KEY)
gemini = GenerativeModel('gemini-2.0-flash')
print(f" ✅ Gemini loaded (device: {'GPU' if torch.cuda.is_available() else 'CPU'})")

# 2. Load Indic-TTS
device = "cuda" if torch.cuda.is_available() else "cpu"
tts_model = ParlerTTSForConditionalGeneration.from_pretrained(TTS_MODEL_ID).to(device)
# Tokenizer for the text that will be spoken.
tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_ID)
# Tokenizer for the voice description; loaded from the model's text encoder.
desc_tokenizer = AutoTokenizer.from_pretrained(tts_model.config.text_encoder._name_or_path)
print(f" ✅ Indic-TTS loaded in {time.time() - start_load:.2f}s\n")


def hinglish_to_devnagri(text):
    """Ask Gemini to rewrite ``text`` as colloquial Hindi in Devanagari script.

    Args:
        text: The user's input sentence.

    Returns:
        The text of Gemini's response with surrounding whitespace removed.

    Raises:
        gr.Error: If the Gemini call (or reading its response) fails, or if
            the response contains no text.
    """
    try:
        print(f"🔠 Converting Hinglish to Devnagri: '{text[:30]}...'")
        start = time.time()
        response = gemini.generate_content(_TRANSLATION_PROMPT.format(text=text))
        # Strip so stray leading/trailing whitespace is neither shown in the
        # UI nor passed on to the TTS tokenizer.
        hindi_text = response.text.strip()
        print(f" ✓ Translation done in {time.time() - start:.2f}s")
    except Exception as e:
        # Top-level boundary for the remote call: surface any failure in the
        # Gradio UI while keeping the original exception as the cause.
        print(f"❌ Gemini error: {str(e)}")
        raise gr.Error(f"Gemini error: {str(e)}") from e

    if not hindi_text:
        raise gr.Error("Gemini returned an empty translation.")
    return hindi_text


def generate_speech(text):
    """Run the full pipeline: translate ``text`` and synthesise it as audio.

    Args:
        text: The sentence typed into the UI textbox.

    Returns:
        A ``(wav_path, hindi_text)`` tuple: the path of the written WAV file
        and the Devanagari text that was spoken.

    Raises:
        gr.Error: If ``text`` is empty/blank, or if translation fails.
    """
    if not text or not text.strip():
        # Reject blank input before spending a Gemini call and a TTS pass.
        raise gr.Error("Please enter some text.")

    print("\n" + "=" * 50)
    print("🎤 Starting Hinglish-to-Speech pipeline")

    # 1. Text Conversion
    hindi_text = hinglish_to_devnagri(text)
    print(f" Hindi text: {hindi_text[:50]}...")

    # 2. Audio Generation
    print("\n🔊 Generating audio...")
    start_audio = time.time()

    desc_inputs = desc_tokenizer(VOICE_DESCRIPTION, return_tensors="pt").to(device)
    text_inputs = tts_tokenizer(hindi_text, return_tensors="pt").to(device)

    # The voice description goes in as input_ids; the text to speak goes in
    # as the prompt.
    audio = tts_model.generate(
        input_ids=desc_inputs.input_ids,
        attention_mask=desc_inputs.attention_mask,
        prompt_input_ids=text_inputs.input_ids,
        prompt_attention_mask=text_inputs.attention_mask,
    )

    # 3. Save Output
    waveform = audio.cpu().numpy().squeeze()
    sf.write(OUTPUT_WAV_PATH, waveform, tts_model.config.sampling_rate)
    print(f"\n💾 Audio generated in {time.time() - start_audio:.2f}s")
    print("=" * 50 + "\n")

    return OUTPUT_WAV_PATH, hindi_text


# Gradio UI
with gr.Blocks() as app:
    gr.Markdown("## 🚀 Hinglish-to-Speech (Gemini + Indic-TTS)")
    with gr.Row():
        inp = gr.Textbox(label="Enter Hinglish Text", placeholder="Aaj mood nahi hai...")
        btn = gr.Button("Generate")
    with gr.Row():
        audio_out = gr.Audio(label="Speech Output")
        text_out = gr.Textbox(label="Devnagri Translation")
    btn.click(fn=generate_speech, inputs=inp, outputs=[audio_out, text_out])

print("\n🚀 App ready! Waiting for input...")
app.launch()