Athspi committed on
Commit
5f3d5cb
verified
1 Parent(s): 3c0565b

Create app.py

Files changed (1)
  1. app.py +294 -0
app.py ADDED
@@ -0,0 +1,294 @@
+ ## Documentation
+ # Quickstart: https://github.com/google-gemini/cookbook/blob/main/quickstarts/Get_started_LiveAPI.py
+ #
+ ## Setup
+ #
+ # To install the dependencies for this script, run:
+ #
+ # ```
+ # pip install google-genai opencv-python pyaudio pillow mss
+ # ```
+
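+ # Note: this script uses asyncio.TaskGroup, which requires Python 3.11 or newer.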
+
+ import asyncio
+ import base64
+ import io
+ import os
+ import traceback
+
+ import cv2
+ import pyaudio
+ import PIL.Image
+ import mss
+
+ from google import genai
+ from google.genai import types
+
+ import gradio as gr
+
+ FORMAT = pyaudio.paInt16
+ CHANNELS = 1
+ SEND_SAMPLE_RATE = 16000
+ RECEIVE_SAMPLE_RATE = 24000
+ CHUNK_SIZE = 1024
+
+ MODEL = "models/gemini-2.0-flash-exp"
+
+ DEFAULT_MODE = "camera"
+
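+ # The Live API consumes 16 kHz mono PCM from the client and returns 24 kHz
+ # audio, hence the separate send/receive sample rates above.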
+ # Read the API key from the environment rather than hard-coding it here.
+ # client = genai.Client(http_options={"api_version": "v1alpha"}, api_key="YOUR_API_KEY")
+ client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=os.environ.get("GEMINI_API_KEY"))
+
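+ # For example: `export GEMINI_API_KEY=your-key` before launching (assumes a
+ # POSIX shell; adjust for your environment).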
+ # While Gemini 2.0 Flash is in experimental preview mode, only one of AUDIO or
+ # TEXT may be passed here.
+ CONFIG = types.LiveConnectConfig(
+     response_modalities=[
+         "audio",
+     ],
+     speech_config=types.SpeechConfig(
+         voice_config=types.VoiceConfig(
+             prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Puck")
+         )
+     ),
+     system_instruction=types.Content(
+         parts=[types.Part.from_text(text="Repeat exactly what the user says; do not add any other words or explanation.")],
+         role="user",
+     ),
+ )
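+ # A text-only variant would swap the modality, e.g. (sketch, same API surface
+ # assumed): types.LiveConnectConfig(response_modalities=["text"])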
+
+ pya = pyaudio.PyAudio()
+
+
+ class AudioLoop:
+     def __init__(self, video_mode=DEFAULT_MODE):
+         self.video_mode = video_mode
+
+         self.audio_in_queue = None
+         self.out_queue = None
+
+         self.session = None
+         self.audio_stream = None  # created in listen_audio(); checked on cleanup
+
+         self.send_text_task = None
+         self.receive_audio_task = None
+         self.play_audio_task = None
+
+     async def send_text(self, text):
+         # The original CLI version read text from stdin in a loop here; in this
+         # Gradio app the text arrives as an argument, so we forward one turn.
+         # end_of_turn=True marks the client turn as complete so the model replies.
+         await self.session.send(input=text or ".", end_of_turn=True)
+
+     def _get_frame(self, cap):
+         # Read the frame
+         ret, frame = cap.read()
+         # Check if the frame was read successfully
+         if not ret:
+             return None
+         # Convert BGR to RGB: OpenCV captures in BGR but PIL expects RGB.
+         # This prevents a blue tint in the video feed.
+         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+         img = PIL.Image.fromarray(frame_rgb)  # now using the RGB frame
+         img.thumbnail([1024, 1024])
+
+         image_io = io.BytesIO()
+         img.save(image_io, format="jpeg")
+         image_io.seek(0)
+
+         mime_type = "image/jpeg"
+         image_bytes = image_io.read()
+         return {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}
+
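+     # _get_frame returns the inline-media payload shape that
+     # session.send(input=...) accepts: a mime type plus base64-encoded data.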
+     async def get_frames(self):
+         # Opening the camera takes about a second and would block the event
+         # loop (overflowing the audio pipeline) if it were not run in a thread.
+         cap = await asyncio.to_thread(
+             cv2.VideoCapture, 0
+         )  # 0 represents the default camera
+
+         while True:
+             frame = await asyncio.to_thread(self._get_frame, cap)
+             if frame is None:
+                 break
+
+             await asyncio.sleep(1.0)
+
+             await self.out_queue.put(frame)
+
+         # Release the VideoCapture object
+         cap.release()
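+
+     # Note: the 1-second sleep in the loop above throttles capture to roughly
+     # one frame per second, keeping out_queue (maxsize 5) from backing up.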
+
+     def _get_screen(self):
+         sct = mss.mss()
+         monitor = sct.monitors[0]
+
+         i = sct.grab(monitor)
+
+         mime_type = "image/jpeg"
+         # mss grabs raw pixels; encode to PNG first, then re-encode as JPEG.
+         image_bytes = mss.tools.to_png(i.rgb, i.size)
+         img = PIL.Image.open(io.BytesIO(image_bytes))
+
+         image_io = io.BytesIO()
+         img.save(image_io, format="jpeg")
+         image_io.seek(0)
+
+         image_bytes = image_io.read()
+         return {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}
+
+     async def get_screen(self):
+
+         while True:
+             frame = await asyncio.to_thread(self._get_screen)
+             if frame is None:
+                 break
+
+             await asyncio.sleep(1.0)
+
+             await self.out_queue.put(frame)
+
+     async def send_realtime(self):
+         while True:
+             msg = await self.out_queue.get()
+             await self.session.send(input=msg)
+
+     async def listen_audio(self):
+         mic_info = pya.get_default_input_device_info()
+         self.audio_stream = await asyncio.to_thread(
+             pya.open,
+             format=FORMAT,
+             channels=CHANNELS,
+             rate=SEND_SAMPLE_RATE,
+             input=True,
+             input_device_index=mic_info["index"],
+             frames_per_buffer=CHUNK_SIZE,
+         )
+         if __debug__:
+             kwargs = {"exception_on_overflow": False}
+         else:
+             kwargs = {}
+         while True:
+             data = await asyncio.to_thread(self.audio_stream.read, CHUNK_SIZE, **kwargs)
+             await self.out_queue.put({"data": data, "mime_type": "audio/pcm"})
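+             # Each chunk is raw 16-bit PCM at 16 kHz mono, matching the
+             # "audio/pcm" mime type declared here.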
+
+     async def receive_audio(self):
+         "Background task that reads from the websocket and writes PCM chunks to the output queue."
+         while True:
+             turn = self.session.receive()
+             async for response in turn:
+                 if data := response.data:
+                     self.audio_in_queue.put_nowait(data)
+                     continue
+                 if text := response.text:
+                     # Don't print to the console; return the text to Gradio.
+                     return text
+
+             # If you interrupt the model, it sends a turn_complete. For
+             # interruptions to work, we need to stop playback, so empty the
+             # audio queue: it may hold much more audio than has played yet.
+             while not self.audio_in_queue.empty():
+                 self.audio_in_queue.get_nowait()
+
+     async def play_audio(self):
+         stream = await asyncio.to_thread(
+             pya.open,
+             format=FORMAT,
+             channels=CHANNELS,
+             rate=RECEIVE_SAMPLE_RATE,
+             output=True,
+         )
+         while True:
+             bytestream = await self.audio_in_queue.get()
+             await asyncio.to_thread(stream.write, bytestream)
+
+     async def run(self):
+         result = None
+         try:
+             async with (
+                 client.aio.live.connect(model=MODEL, config=CONFIG) as session,
+                 asyncio.TaskGroup() as tg,
+             ):
+                 self.session = session
+
+                 self.audio_in_queue = asyncio.Queue()
+                 self.out_queue = asyncio.Queue(maxsize=5)
+
+                 tg.create_task(self.send_realtime())
+                 tg.create_task(self.listen_audio())
+                 if self.video_mode == "camera":
+                     tg.create_task(self.get_frames())
+                 elif self.video_mode == "screen":
+                     tg.create_task(self.get_screen())
+
+                 tg.create_task(self.play_audio())
+
+                 # Await the first text response directly (not as a task), then
+                 # cancel the group: the looping tasks above never finish on
+                 # their own, so a plain return here would hang the TaskGroup.
+                 # NOTE: with response_modalities=["audio"], a text response may
+                 # never arrive; switch the config to "text" if this call stalls.
+                 result = await self.receive_audio()
+                 raise asyncio.CancelledError("Turn complete")
+
+         except asyncio.CancelledError:
+             pass
+         except ExceptionGroup as EG:
+             if self.audio_stream is not None:
+                 self.audio_stream.close()
+             traceback.print_exception(EG)
+         except Exception as e:
+             traceback.print_exc()  # print the traceback for debugging
+             result = f"Error: {e}"
+         return result
+
+
+ # Global instance
+ audio_loop = None  # initialized lazily on the first request
+
+ async def transcribe_audio(text_input):
+     """
+     Sends the text through the AudioLoop class and returns the model's reply.
+     """
+     global audio_loop
+     if audio_loop is None:
+         # Instantiate the class only once; adapt the video mode to your needs.
+         audio_loop = AudioLoop(video_mode="none")
+
+     # Gradio awaits this coroutine on its own event loop, so no manual
+     # event-loop handling is needed here.
+
+     if audio_loop.session is None:
+         try:
+             return await audio_loop.run()
+         except Exception as e:
+             print(f"Error in run(): {e}")
+             traceback.print_exc()
+             return f"Error: {e}"
+     else:
+         try:
+             await audio_loop.send_text(text_input)
+             # receive_audio returns the next text response as a string
+             return await audio_loop.receive_audio()
+         except Exception as e:
+             print(f"Error after session is established: {e}")
+             traceback.print_exc()
+             return f"Error: {e}"
+
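+ # Note: a single global AudioLoop is shared across Gradio requests; the first
+ # request opens the Live session and later requests reuse it.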
+
+ # Gradio interface
+ if __name__ == "__main__":
+     iface = gr.Interface(
+         fn=transcribe_audio,
+         inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
+         outputs="text",
+         title="Gemini Live Connect Demo with Gradio",
+         description="Enter text, and the model will repeat what you said. This is a demo of the Gemini Live Connect API with Gradio.",
+     )
+     iface.launch()
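+ # To try it locally (assuming the dependencies above are installed):
+ #     GEMINI_API_KEY=your-key python app.py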