File size: 3,633 Bytes
740846d
b8a34b4
cb63aa0
b8a34b4
 
cb63aa0
a493c8c
bdfd7a5
740846d
cb63aa0
e8ee7fe
bdfd7a5
740846d
 
af3c122
 
e8ee7fe
 
 
 
 
43ac355
740846d
a493c8c
740846d
e8ee7fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
740846d
e8ee7fe
740846d
a493c8c
 
740846d
a493c8c
 
af3c122
a493c8c
 
af3c122
a493c8c
740846d
a493c8c
 
 
af3c122
e8ee7fe
af3c122
e8ee7fe
 
 
cb63aa0
a493c8c
 
740846d
 
af3c122
a493c8c
af3c122
a493c8c
af3c122
a493c8c
740846d
a493c8c
 
43ac355
af3c122
 
a493c8c
af3c122
a493c8c
b1483f2
af3c122
cb63aa0
a493c8c
e8ee7fe
43ac355
af3c122
a493c8c
af3c122
 
 
b8a34b4
a493c8c
bdfd7a5
5f3d5cb
a493c8c
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import gradio as gr
import asyncio
import numpy as np
from google import genai
from google.genai import types
import soundfile as sf
import io

# Configuration
SAMPLE_RATE = 24000  # Sample rate (Hz) used when writing the generated audio
# Gemini model identifier.
# NOTE(review): the "-exp" suffix suggests an experimental build rather than a
# stable release - confirm the intended model before relying on it.
MODEL = "models/gemini-2.0-flash-exp"

class GeminiTTS:
    """Ask Gemini for a text reply and pair it with a placeholder audio clip.

    No real speech synthesis happens here: the audio is a fixed-pitch sine
    tone whose length is derived from the length of the model's reply. Swap
    the tone generation for a real TTS call when one is available.
    """

    # Placeholder-tone parameters.
    SECONDS_PER_CHAR = 0.1   # tone length contributed by each reply character
    MIN_DURATION_S = 1.0     # floor, so an empty reply still yields audible output
    MAX_DURATION_S = 10.0    # ceiling, so long replies do not produce huge clips
    TONE_HZ = 220
    TONE_AMPLITUDE = 0.5

    def __init__(self, api_key):
        """Create the Gemini client and the generation settings.

        Args:
            api_key: Gemini API key; must be non-empty.

        Raises:
            ValueError: If ``api_key`` is empty or otherwise falsy.
        """
        if not api_key:
            raise ValueError("API key cannot be empty")
        self.client = genai.Client(api_key=api_key)
        # generate_content takes its settings as a GenerateContentConfig.
        self.config = types.GenerateContentConfig(
            candidate_count=1,
            max_output_tokens=2048,
            temperature=0.9,
        )

    @classmethod
    def _placeholder_duration(cls, text):
        """Return the placeholder tone length in seconds for ``text``.

        The length grows with ``len(text)`` and is clamped to
        ``[MIN_DURATION_S, MAX_DURATION_S]``.
        """
        raw_seconds = len(text) * cls.SECONDS_PER_CHAR
        return min(max(raw_seconds, cls.MIN_DURATION_S), cls.MAX_DURATION_S)

    async def text_to_speech(self, text):
        """Generate a reply to ``text`` and a placeholder WAV clip.

        Args:
            text: Prompt sent to the model.

        Returns:
            ``(wav_bytes, reply_text)`` on success, where ``wav_bytes`` is a
            complete WAV file at ``SAMPLE_RATE``; ``(None, "Error: ...")`` if
            anything fails. Errors are reported through the return value
            rather than raised so the UI can display them.
        """
        try:
            # Async requests go through the client's ``aio`` namespace.
            response = await self.client.aio.models.generate_content(
                model=MODEL,
                contents=[
                    types.Content(role="user", parts=[types.Part(text=text)])
                ],
                config=self.config,
            )

            # ``response.text`` may be None when the reply has no text parts.
            text_response = response.text or ""

            # Generate synthetic audio (replace with an actual TTS API call).
            duration = self._placeholder_duration(text_response)
            t = np.linspace(0, duration, int(SAMPLE_RATE * duration), False)
            audio_data = np.sin(2 * np.pi * self.TONE_HZ * t) * self.TONE_AMPLITUDE

            # Serialize to WAV bytes for the caller.
            with io.BytesIO() as wav_buffer:
                sf.write(wav_buffer, audio_data, SAMPLE_RATE, format='WAV')
                return wav_buffer.getvalue(), text_response

        except Exception as e:
            return None, f"Error: {str(e)}"

def create_interface():
    """Build the Gradio UI for initializing the engine and generating audio.

    Returns:
        The assembled ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    # NOTE(review): this single engine (and therefore the API key it was
    # created with) is shared by every session served by this process.
    # Consider per-session state if the app is exposed to multiple users.
    tts_engine = None

    def init_engine(api_key):
        """Create the shared engine from ``api_key``; return a status string."""
        nonlocal tts_engine
        try:
            tts_engine = GeminiTTS(api_key)
            return "✅ TTS Initialized Successfully"
        except Exception as e:
            return f"❌ Initialization Failed: {str(e)}"

    async def generate_speech(text):
        """Return ``(audio, message)`` for the audio and text outputs.

        Raises:
            gr.Error: If the engine has not been initialized or ``text`` is
                empty.
        """
        if not tts_engine:
            raise gr.Error("Please initialize the TTS first")
        if not text or not text.strip():
            raise gr.Error("Please enter some text to speak")

        wav_bytes, message = await tts_engine.text_to_speech(text)
        if not wav_bytes:
            # The engine reports failures as (None, "Error: ...").
            return None, message

        # The engine hands back a complete WAV file, but gr.Audio's tuple
        # form needs (sample_rate, ndarray) - decode the bytes into samples.
        samples, sample_rate = sf.read(io.BytesIO(wav_bytes), dtype="int16")
        return (sample_rate, samples), message

    with gr.Blocks(title="Gemini TTS") as app:
        gr.Markdown("# 🎀 Gemini Text-to-Speech")

        # Step 1: API key entry and engine initialization.
        with gr.Row():
            api_key = gr.Textbox(
                label="API Key",
                type="password",
                placeholder="Enter your Gemini API key"
            )
            init_btn = gr.Button("Initialize")

        init_status = gr.Textbox(label="Status", interactive=False)
        init_btn.click(init_engine, inputs=api_key, outputs=init_status)

        # Step 2: prompt entry and generation.
        with gr.Group():
            text_input = gr.Textbox(
                label="Input Text",
                lines=3,
                placeholder="Type something to speak..."
            )
            generate_btn = gr.Button("Generate Speech")

        audio_output = gr.Audio(label="Output Audio")
        text_output = gr.Textbox(label="Response Message", interactive=False)

        generate_btn.click(
            generate_speech,
            inputs=text_input,
            outputs=[audio_output, text_output]
        )

    return app

if __name__ == "__main__":
    # Script entry point: build the UI and serve it on all interfaces, port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    app = create_interface()
    app.launch(**launch_options)