|
import asyncio
import io
import logging

import gradio as gr
import numpy as np
import soundfile as sf
from google import genai
from google.genai import types
|
|
|
|
|
# Sample rate in Hz that is reported alongside the generated audio.
SAMPLE_RATE = 24000

# Identifier of the Gemini model requested when the live session is opened.
MODEL = "gemini-2.0-flash-exp"
|
|
|
class GeminiTTS:
    """Text-to-speech helper built on a Gemini live session.

    Each request opens a live connection, sends the text as one completed
    user turn and gathers whatever audio (or text) the model streams back.
    """

    def __init__(self, api_key):
        """Create the API client and the live-session configuration.

        Args:
            api_key: Gemini API key; must be non-empty.

        Raises:
            ValueError: If ``api_key`` is empty or ``None``.
        """
        if not api_key:
            raise ValueError("API key cannot be empty")

        self.client = genai.Client(
            http_options={"api_version": "v1alpha"},
            api_key=api_key,
        )
        # Audio-only replies, spoken with the prebuilt "Puck" voice.
        self.config = types.LiveConnectConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Puck")
                )
            ),
            system_instruction=types.Content(
                parts=[types.Part.from_text(text="Speak exactly what the user says")],
                role="user",
            ),
        )

    @staticmethod
    def _decode_pcm16(chunks):
        """Join raw PCM chunks into one peak-normalized float32 signal.

        Args:
            chunks: Iterable of ``bytes`` objects holding 16-bit little-endian
                PCM samples, in playback order. Chunk boundaries need not
                fall on sample boundaries.

        Returns:
            A 1-D ``float32`` NumPy array. Non-silent audio is scaled so its
            largest absolute sample is 1.0; all-zero or empty input is
            returned unscaled (possibly with zero length).
        """
        raw = b"".join(chunks)
        # One sample is two bytes; drop a dangling odd byte so that
        # np.frombuffer does not reject the whole buffer.
        raw = raw[: len(raw) - (len(raw) % 2)]
        # Convert to float before abs(): abs(-32768) overflows in int16.
        samples = np.frombuffer(raw, dtype="<i2").astype(np.float32)
        if samples.size == 0:
            return samples

        peak = np.max(np.abs(samples))
        if peak > 0:
            samples = samples / peak
        return samples

    async def text_to_speech(self, text):
        """Synthesize ``text`` and return the complete utterance.

        All audio chunks of the model's turn are collected before decoding,
        so the result is the whole reply rather than its first fragment.

        Args:
            text: Text to speak. Empty or ``None`` is sent as a single space.

        Returns:
            ``(SAMPLE_RATE, samples)`` with ``samples`` a float32 NumPy array
            when audio was received; the concatenated text when the model
            answered with text only; ``"Error: <message>"`` when anything in
            the request failed; ``None`` when the stream ended without
            content.
        """
        audio_chunks = []
        text_parts = []
        try:
            async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
                await session.send(input=text or " ", end_of_turn=True)

                async for response in session.receive():
                    if audio_data := response.data:
                        audio_chunks.append(audio_data)
                    if text_response := response.text:
                        text_parts.append(text_response)

            if audio_chunks:
                samples = self._decode_pcm16(audio_chunks)
                if samples.size == 0:
                    # Audio was announced but nothing decodable arrived:
                    # fall back to half a second of silence.
                    samples = np.zeros(int(SAMPLE_RATE * 0.5), dtype=np.float32)
                return (SAMPLE_RATE, samples)

            if text_parts:
                return "".join(text_parts)

            return None
        except Exception as e:
            # Callers expect a string on failure rather than an exception,
            # so keep that contract but do not lose the traceback.
            logging.getLogger(__name__).exception("Gemini TTS request failed")
            return f"Error: {str(e)}"
|
|
|
def create_interface():
    """Build the Gradio app for the Gemini text-to-speech demo.

    The page has an API-key row with an "Initialize" button and status box,
    a text input with a "Generate Speech" button, an audio player and a
    message box.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    # NOTE(review): this single engine lives in the closure, so it appears to
    # be shared by every browser session this process serves -- the key
    # entered last would be used for everyone. Confirm that is intended.
    tts_engine = None

    def init_engine(api_key):
        """Create the engine from ``api_key`` and return a status message."""
        nonlocal tts_engine
        try:
            tts_engine = GeminiTTS(api_key)
        except Exception as e:
            return f"❌ Initialization Failed: {str(e)}"
        else:
            return "✅ TTS Initialized Successfully"

    async def generate_speech(text):
        """Run TTS for ``text`` and return ``(audio, message)`` for the UI.

        Raises:
            gr.Error: If the engine has not been initialized yet.
        """
        if not tts_engine:
            raise gr.Error("Please initialize the TTS first")

        result = await tts_engine.text_to_speech(text)

        # A string result is an error or text-only reply: show it as a message.
        if isinstance(result, str):
            return None, result
        if result:
            return result, ""
        return None, "No response received"

    with gr.Blocks(title="Gemini TTS") as app:
        gr.Markdown("# 🎤 Gemini Text-to-Speech")

        with gr.Row():
            api_key = gr.Textbox(
                label="API Key",
                type="password",
                placeholder="Enter your Gemini API key",
            )
            init_btn = gr.Button("Initialize")

        init_status = gr.Textbox(label="Status", interactive=False)
        init_btn.click(init_engine, inputs=api_key, outputs=init_status)

        with gr.Group():
            text_input = gr.Textbox(
                label="Input Text",
                lines=3,
                placeholder="Type something to speak...",
            )
            generate_btn = gr.Button("Generate Speech")

        audio_output = gr.Audio(label="Output Audio")
        text_output = gr.Textbox(label="Messages", interactive=False)

        generate_btn.click(
            generate_speech,
            inputs=text_input,
            outputs=[audio_output, text_output],
        )

    return app
|
|
|
if __name__ == "__main__":
    # When run as a script, build the UI and serve it on all network
    # interfaces (0.0.0.0) at port 7860.
    app = create_interface()
    app.launch(server_name="0.0.0.0", server_port=7860)