# Spaces: Running  (status header left over from the hosting page; not part of the program)
import os
import tempfile
import time

import gradio as gr
import soundfile as sf
import torch
from google.generativeai import GenerativeModel, configure
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
# Model initialisation: runs once when the script starts, with progress prints.
print("⚡ Initializing models...")
start_load = time.time()

# Pick the torch device once; the log line and the TTS model both use it.
device = "cuda" if torch.cuda.is_available() else "cpu"

# 1. Load Gemini
GEMINI_KEY = os.environ.get('GEMINI_API_KEY')
if not GEMINI_KEY:
    # Do not abort startup: the UI can still come up, but make the
    # misconfiguration visible in the logs instead of passing None silently.
    print("⚠️ GEMINI_API_KEY is not set; Gemini requests will fail unless the SDK finds credentials elsewhere.")
configure(api_key=GEMINI_KEY)
gemini = GenerativeModel('gemini-2.0-flash')
print(f" ✅ Gemini loaded (device: {'GPU' if device == 'cuda' else 'CPU'})")

# 2. Load Indic-TTS
tts_model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts").to(device)
# Tokenizer for the text that will be spoken (the "prompt").
tts_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
# Tokenizer for the voice description; taken from the model's text-encoder config.
desc_tokenizer = AutoTokenizer.from_pretrained(tts_model.config.text_encoder._name_or_path)
# The reported time is measured from start_load, so it includes the Gemini setup above.
print(f" ✅ Indic-TTS loaded in {time.time() - start_load:.2f}s\n")
def hinglish_to_devnagri(text):
    """Ask Gemini to rewrite the user's sentence as everyday Hindi in Devanagari.

    Args:
        text: The sentence typed into the UI.

    Returns:
        Gemini's reply with leading/trailing whitespace removed.

    Raises:
        gr.Error: If ``text`` is empty or blank, if the Gemini call (or reading
            its reply) fails, or if Gemini returns an empty reply.
    """
    # Reject blank input up front instead of spending an API call on it.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert.")

    # Same instructions as before, assembled line by line so the prompt does
    # not depend on how this source file happens to be indented.
    prompt = "\n".join([
        "Convert the following English sentence to Hindi.",
        "Rules:",
        "- The translation must be in Devanagari script.",
        "- It should sound like what a normal Hindi speaker would say in daily life.",
        "- Avoid formal or highly Sanskritized words.",
        "- Keep proper nouns and technical/educational terms in English if commonly used that way (like 'if condition', 'laptop', etc.)",
        "- Focus on fluency and ease of understanding.",
        "- Remove the code if you find them in backticks ```.",
        "Sentence:",
        f'"{text}"',
        "Output ONLY the Hindi translation in Devanagari script.",
    ])

    print(f"🔠 Converting Hinglish to Devnagri: '{text[:30]}...'")
    start = time.time()
    try:
        response = gemini.generate_content(prompt)
        # Reading .text is kept inside the try so a failure there is reported
        # the same way as a failed request.
        hindi = response.text.strip()
    except Exception as e:
        print(f"❌ Gemini error: {str(e)}")
        # Chain the cause so the server log keeps the original traceback.
        raise gr.Error(f"Gemini error: {str(e)}") from e

    if not hindi:
        # An empty translation would otherwise be fed straight into TTS.
        print("❌ Gemini error: empty response")
        raise gr.Error("Gemini error: empty response")

    print(f" ✓ Translation done in {time.time() - start:.2f}s")
    return hindi
def generate_speech(text):
    """Run the full pipeline: translate the input, synthesise speech, save a WAV.

    Args:
        text: The sentence typed into the UI.

    Returns:
        A ``(wav_path, hindi_text)`` tuple: the path of the written WAV file
        and the Devanagari text that was spoken.

    Raises:
        gr.Error: Propagated from ``hinglish_to_devnagri`` when translation fails.
    """
    print("\n" + "="*50)
    print("🎤 Starting Hinglish-to-Speech pipeline")

    # 1. Text Conversion
    hindi_text = hinglish_to_devnagri(text)
    print(f" Hindi text: {hindi_text[:50]}...")

    # 2. Audio Generation
    print("\n🔊 Generating audio...")
    start_audio = time.time()
    # Voice description (Hindi for "a female speaker is speaking in Hindi").
    desc = "एक महिला वक्ता हिंदी में बोल रही हैं"
    desc_inputs = desc_tokenizer(desc, return_tensors="pt").to(device)
    text_inputs = tts_tokenizer(hindi_text, return_tensors="pt").to(device)
    # The description goes in as input_ids; the text to speak goes in as the prompt.
    audio = tts_model.generate(
        input_ids=desc_inputs.input_ids,
        attention_mask=desc_inputs.attention_mask,
        prompt_input_ids=text_inputs.input_ids,
        prompt_attention_mask=text_inputs.attention_mask,
    )

    # 3. Save Output
    # Use a unique file per request: a fixed "output.wav" would be overwritten
    # if two requests overlap, and one user could receive another's audio.
    # delete=False keeps the file on disk after the handle closes so the
    # returned path is still readable by the caller.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    waveform = audio.cpu().numpy().squeeze()
    sf.write(wav_path, waveform, tts_model.config.sampling_rate)

    print(f"\n💾 Audio generated in {time.time() - start_audio:.2f}s")
    print("="*50 + "\n")
    return wav_path, hindi_text
# Gradio UI: one text input and a button on the first row, the generated
# audio and the Devanagari text on the second row.
with gr.Blocks() as app:
    gr.Markdown("## 🚀 Hinglish-to-Speech (Gemini + Indic-TTS)")

    with gr.Row():
        inp = gr.Textbox(
            label="Enter Hinglish Text",
            placeholder="Aaj mood nahi hai...",
        )
        btn = gr.Button("Generate")

    with gr.Row():
        audio_out = gr.Audio(label="Speech Output")
        text_out = gr.Textbox(label="Devnagri Translation")

    # generate_speech returns (wav path, Hindi text), matching this output order.
    btn.click(
        fn=generate_speech,
        inputs=inp,
        outputs=[audio_out, text_out],
    )

print("\n🚀 App ready! Waiting for input...")
app.launch()