Update app.py
Browse files
app.py
CHANGED
@@ -3,24 +3,22 @@ import asyncio
|
|
3 |
import base64
|
4 |
import io
|
5 |
import cv2
|
6 |
-
import
|
|
|
7 |
import PIL.Image
|
8 |
import mss
|
9 |
from google import genai
|
10 |
from google.genai import types
|
|
|
11 |
|
12 |
# Configuration
|
13 |
-
|
14 |
-
CHANNELS = 1
|
15 |
-
SEND_SAMPLE_RATE = 16000
|
16 |
-
RECEIVE_SAMPLE_RATE = 24000
|
17 |
CHUNK_SIZE = 1024
|
18 |
MODEL = "models/gemini-2.0-flash-exp"
|
19 |
|
20 |
class GeminiTTS:
|
21 |
def __init__(self, api_key):
|
22 |
self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
|
23 |
-
self.pya = pyaudio.PyAudio()
|
24 |
self.audio_in_queue = asyncio.Queue()
|
25 |
self.out_queue = asyncio.Queue(maxsize=5)
|
26 |
self.session = None
|
@@ -61,6 +59,21 @@ class GeminiTTS:
|
|
61 |
image_io.seek(0)
|
62 |
return {"mime_type": "image/jpeg", "data": base64.b64encode(image_io.read()).decode()}
|
63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
async def process_input(self, text=None, mode="text"):
|
65 |
try:
|
66 |
async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
|
@@ -83,7 +96,11 @@ class GeminiTTS:
|
|
83 |
turn = session.receive()
|
84 |
async for response in turn:
|
85 |
if data := response.data:
|
86 |
-
|
|
|
|
|
|
|
|
|
87 |
if text := response.text:
|
88 |
return text
|
89 |
|
@@ -105,18 +122,27 @@ def create_gradio_interface():
|
|
105 |
|
106 |
result = await tts_handler.process_input(text, mode)
|
107 |
|
108 |
-
if isinstance(result,
|
109 |
-
# Audio response
|
110 |
-
|
111 |
-
wav_buffer.write(result)
|
112 |
-
wav_buffer.seek(0)
|
113 |
-
return (RECEIVE_SAMPLE_RATE, wav_buffer.read())
|
114 |
else:
|
115 |
# Text response
|
116 |
return result
|
117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
with gr.Blocks(title="Gemini TTS Interface") as demo:
|
119 |
-
gr.Markdown("# 🎤 Gemini Text-to-Speech Interface")
|
120 |
|
121 |
with gr.Row():
|
122 |
api_key = gr.Textbox(label="Gemini API Key", type="password")
|
@@ -133,6 +159,11 @@ def create_gradio_interface():
|
|
133 |
text_output = gr.Audio(label="Generated Speech")
|
134 |
text_btn.click(generate_response, inputs=[text_input, gr.Text("text", visible=False)], outputs=text_output)
|
135 |
|
|
|
|
|
|
|
|
|
|
|
136 |
with gr.Tab("Camera Input"):
|
137 |
camera_btn = gr.Button("Capture and Process")
|
138 |
camera_output = gr.Audio(label="Generated Speech from Camera")
|
|
|
3 |
import base64
|
4 |
import io
|
5 |
import cv2
|
6 |
+
import sounddevice as sd
|
7 |
+
import numpy as np
|
8 |
import PIL.Image
|
9 |
import mss
|
10 |
from google import genai
|
11 |
from google.genai import types
|
12 |
+
import soundfile as sf
|
13 |
|
14 |
# Configuration
|
15 |
+
SAMPLE_RATE = 24000
|
|
|
|
|
|
|
16 |
CHUNK_SIZE = 1024
|
17 |
MODEL = "models/gemini-2.0-flash-exp"
|
18 |
|
19 |
class GeminiTTS:
|
20 |
def __init__(self, api_key):
|
21 |
self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
|
|
|
22 |
self.audio_in_queue = asyncio.Queue()
|
23 |
self.out_queue = asyncio.Queue(maxsize=5)
|
24 |
self.session = None
|
|
|
59 |
image_io.seek(0)
|
60 |
return {"mime_type": "image/jpeg", "data": base64.b64encode(image_io.read()).decode()}
|
61 |
|
62 |
+
async def record_audio(self, duration=5):
    """Record mono float32 audio from the default input device.

    Args:
        duration: Seconds of audio to capture (default 5).

    Returns:
        numpy array of captured samples, dtype float32, as produced by
        ``sd.rec`` (shape is presumably ``(duration * SAMPLE_RATE, 1)`` —
        TODO confirm against the sounddevice version in use).

    Why the worker thread: ``sd.wait()`` blocks until the recording
    finishes. Calling it directly inside a coroutine would freeze the
    whole asyncio event loop for `duration` seconds, so the blocking
    capture is pushed onto a thread via ``asyncio.to_thread``.
    """
    print(f"Recording for {duration} seconds...")

    def _capture():
        # sd.rec starts an asynchronous capture; sd.wait blocks this
        # worker thread (not the event loop) until it completes.
        frames = sd.rec(int(duration * SAMPLE_RATE),
                        samplerate=SAMPLE_RATE,
                        channels=1,
                        dtype='float32')
        sd.wait()
        return frames

    return await asyncio.to_thread(_capture)
|
71 |
+
|
72 |
+
async def play_audio(self, audio_data):
    """Play `audio_data` on the default output device at SAMPLE_RATE.

    Args:
        audio_data: Array-like audio samples accepted by ``sd.play``.

    Why the worker thread: ``sd.play``/``sd.wait`` block until playback
    finishes; running them inline in a coroutine would stall the asyncio
    event loop for the length of the clip, so playback is delegated to a
    thread via ``asyncio.to_thread``.
    """
    def _playback():
        sd.play(audio_data, samplerate=SAMPLE_RATE)
        sd.wait()  # blocks this worker thread only, until playback ends

    await asyncio.to_thread(_playback)
|
76 |
+
|
77 |
async def process_input(self, text=None, mode="text"):
|
78 |
try:
|
79 |
async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
|
|
|
96 |
turn = session.receive()
|
97 |
async for response in turn:
|
98 |
if data := response.data:
|
99 |
+
# Save audio to buffer
|
100 |
+
with io.BytesIO() as wav_buffer:
|
101 |
+
sf.write(wav_buffer, data, SAMPLE_RATE, format='WAV')
|
102 |
+
wav_buffer.seek(0)
|
103 |
+
return (SAMPLE_RATE, wav_buffer.read())
|
104 |
if text := response.text:
|
105 |
return text
|
106 |
|
|
|
122 |
|
123 |
result = await tts_handler.process_input(text, mode)
|
124 |
|
125 |
+
if isinstance(result, tuple) and len(result) == 2:
|
126 |
+
# Audio response (sample_rate, audio_data)
|
127 |
+
return result
|
|
|
|
|
|
|
128 |
else:
|
129 |
# Text response
|
130 |
return result
|
131 |
|
132 |
+
async def record_and_process():
    """Record 5 seconds of microphone audio, play it back, and return it.

    Returns:
        ``(SAMPLE_RATE, ndarray)`` tuple for the ``gr.Audio`` output
        component.

    Raises:
        gr.Error: if the TTS handler has not been initialized with an
            API key yet.
    """
    if not tts_handler:
        raise gr.Error("Please initialize the TTS system first with your API key")

    # Record audio
    recording = await tts_handler.record_audio(duration=5)

    # Process audio (you would need to implement this part)
    # For now, we'll just play it back
    await tts_handler.play_audio(recording)

    # FIX: the original returned recording.tobytes(), but Gradio's numeric
    # audio format is (sample_rate, numpy_array) — raw bytes are not
    # accepted. Flatten the (n, 1) capture to 1-D and return the array.
    # NOTE(review): verify against the installed Gradio version.
    return (SAMPLE_RATE, recording.reshape(-1))
|
143 |
+
|
144 |
with gr.Blocks(title="Gemini TTS Interface") as demo:
|
145 |
+
gr.Markdown("# 🎤 Gemini Text-to-Speech Interface with SoundDevice")
|
146 |
|
147 |
with gr.Row():
|
148 |
api_key = gr.Textbox(label="Gemini API Key", type="password")
|
|
|
159 |
text_output = gr.Audio(label="Generated Speech")
|
160 |
text_btn.click(generate_response, inputs=[text_input, gr.Text("text", visible=False)], outputs=text_output)
|
161 |
|
162 |
+
with gr.Tab("Voice Input"):
|
163 |
+
record_btn = gr.Button("Record and Process (5 sec)")
|
164 |
+
voice_output = gr.Audio(label="Processed Audio")
|
165 |
+
record_btn.click(record_and_process, outputs=voice_output)
|
166 |
+
|
167 |
with gr.Tab("Camera Input"):
|
168 |
camera_btn = gr.Button("Capture and Process")
|
169 |
camera_output = gr.Audio(label="Generated Speech from Camera")
|