Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -165,7 +165,7 @@ class GeminiInteractionLoop:
|
|
165 |
return
|
166 |
try:
|
167 |
logging.info(f"Sending text to Gemini: '{user_text[:50]}...'")
|
168 |
-
await self.gemini_session.send_client_content(
|
169 |
except Exception as e:
|
170 |
logging.error(
|
171 |
f"Error sending text message to Gemini: {e}", exc_info=True)
|
@@ -180,6 +180,29 @@ class GeminiInteractionLoop:
|
|
180 |
if not all(k in media_data for k in ["data", "mime_type"]):
|
181 |
logging.warning(f"Media data missing required fields")
|
182 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
183 |
|
184 |
# Check if it's an image and needs resizing
|
185 |
if media_data["mime_type"].startswith("image/"):
|
@@ -246,21 +269,37 @@ class GeminiInteractionLoop:
|
|
246 |
try:
|
247 |
validated_media = self._validate_media_payload(media_data)
|
248 |
if validated_media:
|
249 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
250 |
await self.gemini_session.send(input=validated_media)
|
251 |
else:
|
252 |
# Log if validation failed, but only if media_data was not None initially
|
253 |
-
# (as get_media_from_queues can return None on timeout)
|
254 |
if media_data is not None:
|
255 |
logging.warning(f"Media validation failed for payload. Type: {media_data.get('mime_type') if isinstance(media_data, dict) else type(media_data)}, skipping send.")
|
256 |
except websockets.exceptions.ConnectionClosedError as e_conn_closed:
|
|
|
|
|
|
|
257 |
logging.error(f"Connection closed while sending media: {e_conn_closed}", exc_info=True)
|
258 |
-
|
259 |
-
#
|
260 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
261 |
except Exception as e:
|
262 |
-
logging.error(
|
263 |
-
f"Error sending media chunk to Gemini: {e}", exc_info=True)
|
264 |
elif not media_data: # media_data could be None if queues were empty and timed out
|
265 |
await asyncio.sleep(0.05) # Yield to other tasks if no media
|
266 |
except asyncio.CancelledError:
|
@@ -393,7 +432,7 @@ class GeminiInteractionLoop:
|
|
393 |
f"Gemini session established with API for model {MODEL_NAME}.")
|
394 |
try:
|
395 |
logging.info("Sending system prompt to Gemini...")
|
396 |
-
await self.gemini_session.send_client_content(
|
397 |
logging.info("System prompt sent successfully.")
|
398 |
except Exception as e:
|
399 |
logging.error(
|
@@ -513,33 +552,48 @@ class AudioProcessor(AudioProcessorBase):
|
|
513 |
if audio_chunks_to_gemini_q is None:
|
514 |
return
|
515 |
for frame in audio_frames:
|
516 |
-
audio_data = frame.planes[0].to_bytes()
|
517 |
-
|
518 |
-
# Skip empty audio frames
|
519 |
-
if not audio_data or len(audio_data) == 0:
|
520 |
-
continue
|
521 |
-
|
522 |
-
# Fix for the WebSocket error 1007 (invalid payload data)
|
523 |
-
# Use the correct mime type format and ensure the audio data is valid
|
524 |
-
# The audio format must match one of the formats supported by the Gemini API
|
525 |
-
# Using standard audio/L16 with 16kHz sample rate instead of 24kHz
|
526 |
-
mime_type = f"audio/L16;rate=16000;channels=1"
|
527 |
-
|
528 |
try:
|
529 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
530 |
if isinstance(audio_data, bytes) and len(audio_data) > 0:
|
531 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
532 |
|
|
|
533 |
if audio_chunks_to_gemini_q.full():
|
534 |
try:
|
|
|
535 |
await asyncio.wait_for(audio_chunks_to_gemini_q.get(), timeout=0.01)
|
|
|
536 |
except asyncio.TimeoutError:
|
537 |
logging.warning("Audio queue full, chunk dropped.")
|
538 |
continue
|
|
|
|
|
539 |
audio_chunks_to_gemini_q.put_nowait(api_data)
|
|
|
|
|
540 |
except Exception as e:
|
541 |
-
logging.error(
|
542 |
-
f"Error queueing audio chunk: {e}", exc_info=True)
|
543 |
|
544 |
async def recv(self, frames):
|
545 |
try:
|
|
|
165 |
return
|
166 |
try:
|
167 |
logging.info(f"Sending text to Gemini: '{user_text[:50]}...'")
|
168 |
+
await self.gemini_session.send_client_content(types.Part(text=user_text))
|
169 |
except Exception as e:
|
170 |
logging.error(
|
171 |
f"Error sending text message to Gemini: {e}", exc_info=True)
|
|
|
180 |
if not all(k in media_data for k in ["data", "mime_type"]):
|
181 |
logging.warning(f"Media data missing required fields")
|
182 |
return None
|
183 |
+
|
184 |
+
# Handle audio data - ensure proper format for Gemini API
|
185 |
+
if media_data["mime_type"].startswith("audio/"):
|
186 |
+
try:
|
187 |
+
# Ensure audio data is in bytes format
|
188 |
+
if isinstance(media_data["data"], bytes):
|
189 |
+
# No need to base64 encode binary audio data for Gemini API
|
190 |
+
# Just ensure the mime_type is correctly formatted
|
191 |
+
if "rate=" not in media_data["mime_type"]:
|
192 |
+
# Default to 16kHz if not specified
|
193 |
+
media_data["mime_type"] = f"audio/L16;rate=16000;channels=1"
|
194 |
+
|
195 |
+
# Create a new dict to avoid modifying the original
|
196 |
+
return {
|
197 |
+
"mime_type": media_data["mime_type"],
|
198 |
+
"data": media_data["data"]
|
199 |
+
}
|
200 |
+
else:
|
201 |
+
logging.warning(f"Audio data is not in bytes format: {type(media_data['data'])}")
|
202 |
+
return None
|
203 |
+
except Exception as e:
|
204 |
+
logging.error(f"Error processing audio data: {e}", exc_info=True)
|
205 |
+
return None
|
206 |
|
207 |
# Check if it's an image and needs resizing
|
208 |
if media_data["mime_type"].startswith("image/"):
|
|
|
269 |
try:
|
270 |
validated_media = self._validate_media_payload(media_data)
|
271 |
if validated_media:
|
272 |
+
# Log media type and size before sending
|
273 |
+
data_size = len(validated_media.get('data', b'')) if isinstance(validated_media.get('data'), bytes) else len(validated_media.get('data', ''))
|
274 |
+
logging.debug(f"Sending media to Gemini. Type: {validated_media.get('mime_type')}, Data size: {data_size} bytes")
|
275 |
+
|
276 |
+
# Ensure we're not exceeding WebSocket payload limits
|
277 |
+
if data_size > MAX_PAYLOAD_SIZE_BYTES:
|
278 |
+
logging.warning(f"Media payload exceeds maximum size ({data_size} > {MAX_PAYLOAD_SIZE_BYTES}), skipping")
|
279 |
+
continue
|
280 |
+
|
281 |
+
# Send the validated media to Gemini
|
282 |
await self.gemini_session.send(input=validated_media)
|
283 |
else:
|
284 |
# Log if validation failed, but only if media_data was not None initially
|
|
|
285 |
if media_data is not None:
|
286 |
logging.warning(f"Media validation failed for payload. Type: {media_data.get('mime_type') if isinstance(media_data, dict) else type(media_data)}, skipping send.")
|
287 |
except websockets.exceptions.ConnectionClosedError as e_conn_closed:
|
288 |
+
error_code = getattr(e_conn_closed, 'code', None)
|
289 |
+
error_reason = getattr(e_conn_closed, 'reason', 'Unknown reason')
|
290 |
+
logging.error(f"WebSocket connection closed with code {error_code}: {error_reason}")
|
291 |
logging.error(f"Connection closed while sending media: {e_conn_closed}", exc_info=True)
|
292 |
+
|
293 |
+
# If we get a 1007 error (invalid frame payload data), log more details
|
294 |
+
if error_code == 1007:
|
295 |
+
logging.error(f"Invalid frame payload data error. This is likely due to malformed media data.")
|
296 |
+
if isinstance(media_data, dict):
|
297 |
+
logging.error(f"Media type: {media_data.get('mime_type', 'unknown')}, Data type: {type(media_data.get('data', None))}")
|
298 |
+
|
299 |
+
# Stop the interaction loop if connection is lost
|
300 |
+
self.is_running = False
|
301 |
except Exception as e:
|
302 |
+
logging.error(f"Error sending media chunk to Gemini: {e}", exc_info=True)
|
|
|
303 |
elif not media_data: # media_data could be None if queues were empty and timed out
|
304 |
await asyncio.sleep(0.05) # Yield to other tasks if no media
|
305 |
except asyncio.CancelledError:
|
|
|
432 |
f"Gemini session established with API for model {MODEL_NAME}.")
|
433 |
try:
|
434 |
logging.info("Sending system prompt to Gemini...")
|
435 |
+
await self.gemini_session.send_client_content(types.Part(text=MEDICAL_ASSISTANT_SYSTEM_PROMPT))
|
436 |
logging.info("System prompt sent successfully.")
|
437 |
except Exception as e:
|
438 |
logging.error(
|
|
|
552 |
if audio_chunks_to_gemini_q is None:
|
553 |
return
|
554 |
for frame in audio_frames:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
555 |
try:
|
556 |
+
# Extract audio data from frame
|
557 |
+
audio_data = frame.planes[0].to_bytes()
|
558 |
+
|
559 |
+
# Skip empty audio frames
|
560 |
+
if not audio_data or len(audio_data) == 0:
|
561 |
+
continue
|
562 |
+
|
563 |
+
# Ensure we're using the correct format for Gemini API
|
564 |
+
# WebSocket error 1007 occurs with invalid frame payload data
|
565 |
+
# Using standard audio/L16 with 16kHz sample rate (matches SEND_SAMPLE_RATE)
|
566 |
+
mime_type = "audio/L16;rate=16000;channels=1"
|
567 |
+
|
568 |
+
# Validate audio data before queueing
|
569 |
if isinstance(audio_data, bytes) and len(audio_data) > 0:
|
570 |
+
# Check if data size is reasonable (avoid oversized payloads)
|
571 |
+
if len(audio_data) > MAX_PAYLOAD_SIZE_BYTES:
|
572 |
+
logging.warning(f"Audio chunk too large ({len(audio_data)} bytes), skipping")
|
573 |
+
continue
|
574 |
+
|
575 |
+
# Create properly formatted API data
|
576 |
+
api_data = {
|
577 |
+
"data": audio_data, # Keep as bytes, don't base64 encode
|
578 |
+
"mime_type": mime_type
|
579 |
+
}
|
580 |
|
581 |
+
# Handle queue overflow
|
582 |
if audio_chunks_to_gemini_q.full():
|
583 |
try:
|
584 |
+
# Remove oldest item if queue is full
|
585 |
await asyncio.wait_for(audio_chunks_to_gemini_q.get(), timeout=0.01)
|
586 |
+
audio_chunks_to_gemini_q.task_done()
|
587 |
except asyncio.TimeoutError:
|
588 |
logging.warning("Audio queue full, chunk dropped.")
|
589 |
continue
|
590 |
+
|
591 |
+
# Queue the validated audio data
|
592 |
audio_chunks_to_gemini_q.put_nowait(api_data)
|
593 |
+
else:
|
594 |
+
logging.warning(f"Invalid audio data format: {type(audio_data)}, skipping")
|
595 |
except Exception as e:
|
596 |
+
logging.error(f"Error processing audio chunk: {e}", exc_info=True)
|
|
|
597 |
|
598 |
async def recv(self, frames):
|
599 |
try:
|