# Gradio demo: text-to-speech through the Gemini Live API.
import gradio as gr
import asyncio
import numpy as np
from google import genai
from google.genai import types
import soundfile as sf
import io
# Configuration
# Sample rate in Hz used for the generated audio.
SAMPLE_RATE = 24000
# Model identifier passed to the Live API connect call.
# NOTE(review): presumably this must name a Live-API-capable model — confirm
# against the currently supported model list.
MODEL = "models/gemini-2.0-flash"
class GeminiTTS:
    """Text-to-speech helper built on a Gemini Live API session.

    Each call to :meth:`text_to_speech` opens a fresh live session, sends the
    text as one complete user turn and collects the model's reply.
    """

    def __init__(self, api_key):
        """Create the API client and the fixed live-session configuration.

        Args:
            api_key: Gemini API key; must be non-empty.

        Raises:
            ValueError: If ``api_key`` is empty or ``None``.
        """
        if not api_key:
            raise ValueError("API key cannot be empty")
        self.client = genai.Client(
            http_options={"api_version": "v1alpha"}, api_key=api_key
        )
        self.config = types.LiveConnectConfig(
            response_modalities=["audio"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Puck")
                )
            ),
            system_instruction=types.Content(
                parts=[types.Part.from_text(text="Speak exactly what the user says")],
                role="user",
            ),
        )

    @staticmethod
    def _pcm16_from_bytes(pcm_bytes):
        """Decode raw little-endian 16-bit PCM bytes into a native int16 array.

        A dangling odd byte (half a sample) is dropped so ``np.frombuffer``
        never raises on a buffer that is not a multiple of the sample size.
        """
        usable = len(pcm_bytes) - (len(pcm_bytes) % 2)
        return np.frombuffer(bytes(pcm_bytes[:usable]), dtype="<i2").astype(np.int16)

    async def text_to_speech(self, text):
        """Synthesize ``text`` and return the whole spoken reply.

        Args:
            text: Text to speak. Empty/``None`` is sent as a single space.

        Returns:
            ``bytes`` holding a complete 16-bit PCM WAV file when the model
            produced audio; otherwise a ``str`` with the model's text reply,
            or ``None`` if the turn contained neither. Any failure is reported
            as a ``str`` of the form ``"Error: <message>"`` instead of being
            raised, so callers can show it directly.
        """
        try:
            pcm = bytearray()
            text_parts = []
            async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
                await session.send(input=text or " ", end_of_turn=True)
                # Audio is streamed in many small chunks; keep reading until
                # the turn is finished instead of returning the first chunk.
                async for response in session.receive():
                    if chunk := response.data:
                        pcm.extend(chunk)
                    elif piece := response.text:
                        text_parts.append(piece)

            # The Live API emits raw 16-bit PCM, not float32 samples, so the
            # bytes are decoded as int16 and written out unchanged.
            samples = self._pcm16_from_bytes(pcm)
            if samples.size:
                with io.BytesIO() as wav_buffer:
                    sf.write(wav_buffer, samples, SAMPLE_RATE, format="WAV", subtype="PCM_16")
                    return wav_buffer.getvalue()
            if text_parts:
                return "".join(text_parts)
            return None
        except Exception as e:
            # UI boundary: surface every failure as a message string.
            return f"Error: {e}"
def create_interface():
    """Build the Gradio Blocks app for the text-to-speech demo.

    The TTS engine is kept in per-session ``gr.State`` rather than in a
    variable shared by all visitors, so one user's API key is never used to
    serve another user's requests.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """

    def init_engine(api_key):
        """Create the session's engine; return ``(engine_or_None, status)``."""
        try:
            return GeminiTTS(api_key), "✅ TTS Initialized Successfully"
        except Exception as e:
            return None, f"❌ Initialization Failed: {e}"

    async def generate_speech(text, tts_engine):
        """Run TTS for ``text``; return ``(audio_or_None, message)``."""
        if tts_engine is None:
            raise gr.Error("Please initialize the TTS first")
        result = await tts_engine.text_to_speech(text)
        if isinstance(result, str):
            return None, result  # Error message or text-only reply
        if result:
            # The engine returns an encoded WAV file; gr.Audio's tuple form
            # needs (sample_rate, numpy samples), so decode it first.
            samples, sample_rate = sf.read(io.BytesIO(result), dtype="int16")
            return (sample_rate, samples), ""
        return None, "No response received"

    # NOTE(review): the source had no indentation, so the Row/Group nesting
    # below is a reconstruction — confirm against the intended layout.
    with gr.Blocks(title="Gemini TTS") as app:
        gr.Markdown("# 🎤 Gemini Text-to-Speech")
        engine_state = gr.State(None)

        with gr.Row():
            api_key = gr.Textbox(
                label="API Key",
                type="password",
                placeholder="Enter your Gemini API key",
            )
            init_btn = gr.Button("Initialize")
        init_status = gr.Textbox(label="Status", interactive=False)
        init_btn.click(
            init_engine,
            inputs=api_key,
            outputs=[engine_state, init_status],
        )

        with gr.Group():
            text_input = gr.Textbox(
                label="Input Text",
                lines=3,
                placeholder="Type something to speak...",
            )
            generate_btn = gr.Button("Generate Speech")
        audio_output = gr.Audio(label="Output Audio")
        text_output = gr.Textbox(label="Messages", interactive=False)
        generate_btn.click(
            generate_speech,
            inputs=[text_input, engine_state],
            outputs=[audio_output, text_output],
        )
    return app
if __name__ == "__main__":
    # Script entry point: build the UI and serve it on all network
    # interfaces at port 7860.
    app = create_interface()
    app.launch(server_name="0.0.0.0", server_port=7860)