File size: 3,713 Bytes
62e10c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e8f85a
62e10c5
 
2e8f85a
 
 
 
 
 
 
 
 
 
62e10c5
2e8f85a
62e10c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e8f85a
62e10c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os
import torch
import time
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf
from google.generativeai import GenerativeModel, configure
import gradio as gr

# --- One-time start-up: build the Gemini client and load the TTS stack ---
# Progress is echoed to stdout; `start_load` times the whole start-up phase.
print("⚡ Initializing models...")
start_load = time.time()

# Pick the torch device once; it is reused for the TTS model and its inputs.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Step 1: Gemini client, keyed from the GEMINI_API_KEY environment variable.
GEMINI_KEY = os.getenv('GEMINI_API_KEY')
configure(api_key=GEMINI_KEY)
gemini = GenerativeModel('gemini-2.0-flash')
# The device shown in this message reflects local CUDA availability.
print(f"   ✅ Gemini loaded (device: {'GPU' if device == 'cuda' else 'CPU'})")

# Step 2: Indic Parler-TTS model plus its two tokenizers. The prompt tokenizer
# comes from the TTS checkpoint itself; the description tokenizer comes from
# the checkpoint named by the model's text-encoder config.
_TTS_CHECKPOINT = "ai4bharat/indic-parler-tts"
tts_model = ParlerTTSForConditionalGeneration.from_pretrained(_TTS_CHECKPOINT)
tts_model = tts_model.to(device)
tts_tokenizer = AutoTokenizer.from_pretrained(_TTS_CHECKPOINT)
desc_tokenizer = AutoTokenizer.from_pretrained(
    tts_model.config.text_encoder._name_or_path
)
print(f"   ✅ Indic-TTS loaded in {time.time() - start_load:.2f}s\n")

def hinglish_to_devnagri(text):
    """Translate ``text`` into conversational Hindi (Devanagari) via Gemini.

    The input is embedded in a fixed instruction prompt and sent to the
    module-level ``gemini`` model. The reply text is returned with
    surrounding whitespace removed.

    Args:
        text: The sentence to convert, as entered by the user.

    Returns:
        The translated sentence as a string.

    Raises:
        gr.Error: If ``text`` is empty or blank, if the Gemini request (or
            reading its reply) fails, or if the reply is empty.
    """
    # Reject blank input before the try-block: it would waste an API call,
    # and raising here keeps the message from being re-wrapped as a
    # "Gemini error" by the handler below.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert.")

    try:
        print(f"🔠 Converting Hinglish to Devnagri: '{text[:30]}...'")
        start = time.time()

        response = gemini.generate_content(
            f"""
            Convert the following English sentence to Hindi. 

            Rules:
            - The translation must be in Devanagari script.
            - It should sound like what a normal Hindi speaker would say in daily life.
            - Avoid formal or highly Sanskritized words.
            - Keep proper nouns and technical/educational terms in English if commonly used that way (like 'if condition', 'laptop', etc.)
            - Focus on fluency and ease of understanding.
            - Remove the code if you find them in backticks ```.


            Sentence:
            "{text}"

            Output ONLY the Hindi translation in Devanagari script.
            """
        )

        print(f"   ✓ Translation done in {time.time() - start:.2f}s")
        # Strip stray leading/trailing whitespace so it is neither shown in
        # the UI nor passed on to the TTS tokenizer.
        hindi_text = response.text.strip()
    except Exception as e:
        print(f"❌ Gemini error: {str(e)}")
        # Chain the original exception so the root cause stays in tracebacks.
        raise gr.Error(f"Gemini error: {str(e)}") from e

    # An empty reply would give the speech stage nothing to synthesize.
    if not hindi_text:
        raise gr.Error("Gemini error: empty translation returned")
    return hindi_text

def generate_speech(text):
    """Translate ``text`` and synthesize the translation to a WAV file.

    Calls ``hinglish_to_devnagri`` for the translation, tokenizes a fixed
    voice description and the translated text with the module-level
    tokenizers, runs ``tts_model.generate`` and writes the result to
    ``output.wav`` at the model's configured sampling rate. Progress is
    printed to stdout along the way.

    Args:
        text: The user's input sentence.

    Returns:
        A ``(wav_path, hindi_text)`` tuple: the path of the written audio
        file and the translated text.
    """
    separator = "=" * 50
    wav_path = "output.wav"

    print("\n" + separator)
    print("🎤 Starting Hinglish-to-Speech pipeline")

    # Stage 1: translation.
    hindi_text = hinglish_to_devnagri(text)
    print(f"   Hindi text: {hindi_text[:50]}...")

    # Stage 2: synthesis.
    print("\n🔊 Generating audio...")
    started_at = time.time()

    voice_description = "एक महिला वक्ता हिंदी में बोल रही हैं"
    description_batch = desc_tokenizer(voice_description, return_tensors="pt").to(device)
    prompt_batch = tts_tokenizer(hindi_text, return_tensors="pt").to(device)

    # The description goes in as the main input; the text to be spoken goes
    # in as the prompt.
    generate_kwargs = {
        "input_ids": description_batch.input_ids,
        "attention_mask": description_batch.attention_mask,
        "prompt_input_ids": prompt_batch.input_ids,
        "prompt_attention_mask": prompt_batch.attention_mask,
    }
    waveform = tts_model.generate(**generate_kwargs)

    # Stage 3: move the result to host memory, drop size-1 axes and write it.
    samples = waveform.cpu().numpy().squeeze()
    sf.write(wav_path, samples, tts_model.config.sampling_rate)
    print(f"\n💾 Audio generated in {time.time() - started_at:.2f}s")
    print(separator + "\n")

    return wav_path, hindi_text

# --- Gradio front end ---
# Layout: a title, an input row (text box + button) and an output row
# (audio player + translated text). Clicking the button runs generate_speech
# and routes its two return values to the two output components in order.
with gr.Blocks() as app:
    gr.Markdown("## 🚀 Hinglish-to-Speech (Gemini + Indic-TTS)")

    # Input row
    with gr.Row():
        inp = gr.Textbox(
            placeholder="Aaj mood nahi hai...",
            label="Enter Hinglish Text",
        )
        btn = gr.Button(value="Generate")

    # Output row
    with gr.Row():
        audio_out = gr.Audio(label="Speech Output")
        text_out = gr.Textbox(label="Devnagri Translation")

    btn.click(generate_speech, inputs=[inp], outputs=[audio_out, text_out])

# Start serving; this runs as soon as the module is executed.
print("\n🚀 App ready! Waiting for input...")
app.launch()