import asyncio
import io
import logging

import gradio as gr
import numpy as np
import soundfile as sf
from google import genai
from google.genai import types


# Sample rate, in Hz, used when synthesizing the audio samples, writing the
# WAV data, and reporting the rate to the audio player.
SAMPLE_RATE = 24000

# Identifier of the Gemini model this app targets.
MODEL = "models/gemini-2.0-flash-exp"


class GeminiTTS:
    """Send text to Gemini and return its text reply with placeholder audio.

    NOTE: no speech is synthesized here. The audio is a fixed-pitch sine tone
    whose length scales with the length of the model's text reply.
    """

    # Parameters of the placeholder tone.
    TONE_FREQUENCY_HZ = 220
    TONE_AMPLITUDE = 0.5
    SECONDS_PER_CHARACTER = 0.1
    # NOTE(review): the original clamp expression was malformed
    # (`min(max(n * 0.1, 10)`); the 1 s lower bound is an assumption, the
    # 10 s upper bound is taken from the original code.
    MIN_DURATION_SECONDS = 1.0
    MAX_DURATION_SECONDS = 10.0

    def __init__(self, api_key):
        """Create the Gemini client and the request configuration.

        Args:
            api_key: Gemini API key; must be non-empty.

        Raises:
            ValueError: If ``api_key`` is empty or ``None``.
        """
        if not api_key:
            raise ValueError("API key cannot be empty")

        self.client = genai.Client(api_key=api_key)
        # generate_content takes its options as a GenerateContentConfig.
        self.config = types.GenerateContentConfig(
            candidate_count=1,
            max_output_tokens=2048,
            temperature=0.9,
        )

    @classmethod
    def _tone_duration(cls, text_length):
        """Return the tone length in seconds for a reply of ``text_length`` characters."""
        scaled = text_length * cls.SECONDS_PER_CHARACTER
        return min(max(scaled, cls.MIN_DURATION_SECONDS), cls.MAX_DURATION_SECONDS)

    @classmethod
    def _placeholder_wav(cls, text_length):
        """Render the placeholder tone and return it as WAV-encoded bytes."""
        duration = cls._tone_duration(text_length)
        timeline = np.linspace(
            0, duration, int(SAMPLE_RATE * duration), endpoint=False
        )
        samples = (
            np.sin(2 * np.pi * cls.TONE_FREQUENCY_HZ * timeline) * cls.TONE_AMPLITUDE
        )
        with io.BytesIO() as wav_buffer:
            sf.write(wav_buffer, samples, SAMPLE_RATE, format="WAV")
            return wav_buffer.getvalue()

    async def text_to_speech(self, text):
        """Ask Gemini to respond to ``text`` and build the placeholder audio.

        Args:
            text: Prompt text sent to the model.

        Returns:
            ``(wav_bytes, reply_text)`` on success, or
            ``(None, "Error: <message>")`` if the request or the audio
            rendering fails. Failures are reported through the return value
            rather than raised.
        """
        try:
            # Async requests go through the client's `aio` namespace, and the
            # model must be named explicitly on every call.
            response = await self.client.aio.models.generate_content(
                model=MODEL,
                contents=[types.Content(parts=[types.Part(text=text)])],
                config=self.config,
            )
            # The reply text may be None (e.g. a response with no text
            # parts); treat that as an empty reply.
            text_response = response.text or ""
            return self._placeholder_wav(len(text_response)), text_response
        except Exception as e:
            # Keep the traceback in the logs; the UI only gets the message.
            logging.getLogger(__name__).exception("Gemini request failed")
            return None, f"Error: {str(e)}"


def create_interface():
    """Build the Gradio Blocks UI for the Gemini TTS demo.

    The UI has an API-key row with an Initialize button and status box, a
    text input with a Generate button, and audio/text outputs.

    Returns:
        The ``gr.Blocks`` app. It is not launched here.
    """
    # NOTE(review): this is a single closure variable, so the engine created
    # by the most recent "Initialize" click is the one every later request
    # uses. Consider per-session state if the app serves several users.
    tts_engine = None

    def init_engine(api_key):
        """Create the engine from ``api_key`` and return a status line."""
        nonlocal tts_engine
        try:
            tts_engine = GeminiTTS(api_key)
        except Exception as e:
            # U+274C is the cross-mark emoji.
            return f"\u274c Initialization Failed: {str(e)}"
        # U+2705 is the check-mark emoji.
        return "\u2705 TTS Initialized Successfully"

    async def generate_speech(text):
        """Run the engine on ``text`` and return ``(audio, message)`` for the UI.

        Raises:
            gr.Error: If the engine has not been initialized or ``text`` is
                blank.
        """
        if tts_engine is None:
            raise gr.Error("Please initialize the TTS first")
        if not text or not text.strip():
            raise gr.Error("Please enter some text to speak")

        audio_data, message = await tts_engine.text_to_speech(text)
        if not audio_data:
            return None, message

        # The engine returns an encoded WAV file as bytes, while the
        # (sample_rate, data) form of gr.Audio needs decoded samples.
        samples, sample_rate = sf.read(io.BytesIO(audio_data), dtype="int16")
        return (sample_rate, samples), message

    # NOTE(review): the nesting below is reconstructed from the statement
    # grouping; confirm the intended layout.
    with gr.Blocks(title="Gemini TTS") as app:
        # U+1F3A4 is the microphone emoji.
        gr.Markdown("# \U0001f3a4 Gemini Text-to-Speech")

        with gr.Row():
            api_key = gr.Textbox(
                label="API Key",
                type="password",
                placeholder="Enter your Gemini API key",
            )
            init_btn = gr.Button("Initialize")

        init_status = gr.Textbox(label="Status", interactive=False)
        init_btn.click(init_engine, inputs=api_key, outputs=init_status)

        with gr.Group():
            text_input = gr.Textbox(
                label="Input Text",
                lines=3,
                placeholder="Type something to speak...",
            )
            generate_btn = gr.Button("Generate Speech")

        audio_output = gr.Audio(label="Output Audio")
        text_output = gr.Textbox(label="Response Message", interactive=False)

        generate_btn.click(
            generate_speech,
            inputs=text_input,
            outputs=[audio_output, text_output],
        )

    return app


if __name__ == "__main__":
    # Script entry point: build the UI and serve it on port 7860.
    # "0.0.0.0" makes the server listen on every network interface.
    app = create_interface()
    app.launch(server_name="0.0.0.0", server_port=7860)