Spaces:

renesistech
/

Spatial-awares

Running

App Files Community

noumanjavaid committed 5 days ago

Commit

90b4c24

verified ·

1 Parent(s): c3e1dff

Update app.py

Browse files

Files changed (1) hide show

app.py +79 -25

app.py CHANGED Viewed

@@ -165,7 +165,7 @@ class GeminiInteractionLoop:
             return
         try:
             logging.info(f"Sending text to Gemini: '{user_text[:50]}...'")
-            await self.gemini_session.send_client_content(content=[types.Part(text=user_text)], end_of_turn=True)
         except Exception as e:
             logging.error(
                 f"Error sending text message to Gemini: {e}", exc_info=True)
@@ -180,6 +180,29 @@ class GeminiInteractionLoop:
         if not all(k in media_data for k in ["data", "mime_type"]):
             logging.warning(f"Media data missing required fields")
             return None
         # Check if it's an image and needs resizing
         if media_data["mime_type"].startswith("image/"):
@@ -246,21 +269,37 @@ class GeminiInteractionLoop:
                     try:
                         validated_media = self._validate_media_payload(media_data)
                         if validated_media:
-                            logging.debug(f"Sending media to Gemini. Type: {validated_media.get('mime_type')}, Data size: {len(validated_media.get('data', b'')) if isinstance(validated_media.get('data'), bytes) else len(validated_media.get('data', ''))}")
                             await self.gemini_session.send(input=validated_media)
                         else:
                             # Log if validation failed, but only if media_data was not None initially
-                            # (as get_media_from_queues can return None on timeout)
                             if media_data is not None:
                                 logging.warning(f"Media validation failed for payload. Type: {media_data.get('mime_type') if isinstance(media_data, dict) else type(media_data)}, skipping send.")
                     except websockets.exceptions.ConnectionClosedError as e_conn_closed:
                         logging.error(f"Connection closed while sending media: {e_conn_closed}", exc_info=True)
-                        # Consider how to handle this - e.g., attempt to reconnect or stop the loop.
-                        # For now, let's log and potentially stop the interaction loop or specific task.
-                        self.is_running = False # Example: stop if connection is lost
                     except Exception as e:
-                        logging.error(
-                            f"Error sending media chunk to Gemini: {e}", exc_info=True)
                 elif not media_data: # media_data could be None if queues were empty and timed out
                     await asyncio.sleep(0.05)  # Yield to other tasks if no media
         except asyncio.CancelledError:
@@ -393,7 +432,7 @@ class GeminiInteractionLoop:
                     f"Gemini session established with API for model {MODEL_NAME}.")
                 try:
                     logging.info("Sending system prompt to Gemini...")
-                    await self.gemini_session.send_client_content(content=[types.Part(text=MEDICAL_ASSISTANT_SYSTEM_PROMPT)], end_of_turn=True)
                     logging.info("System prompt sent successfully.")
                 except Exception as e:
                     logging.error(
@@ -513,33 +552,48 @@ class AudioProcessor(AudioProcessorBase):
         if audio_chunks_to_gemini_q is None:
             return
         for frame in audio_frames:
-            audio_data = frame.planes[0].to_bytes()
-            # Skip empty audio frames
-            if not audio_data or len(audio_data) == 0:
-                continue
-            # Fix for the WebSocket error 1007 (invalid payload data)
-            # Use the correct mime type format and ensure the audio data is valid
-            # The audio format must match one of the formats supported by the Gemini API
-            # Using standard audio/L16 with 16kHz sample rate instead of 24kHz
-            mime_type = f"audio/L16;rate=16000;channels=1"
             try:
-                # Prepare API data - making sure all data is valid
                 if isinstance(audio_data, bytes) and len(audio_data) > 0:
-                    api_data = {"data": audio_data, "mime_type": mime_type}
                     if audio_chunks_to_gemini_q.full():
                         try:
                             await asyncio.wait_for(audio_chunks_to_gemini_q.get(), timeout=0.01)
                         except asyncio.TimeoutError:
                             logging.warning("Audio queue full, chunk dropped.")
                             continue
                     audio_chunks_to_gemini_q.put_nowait(api_data)
             except Exception as e:
-                logging.error(
-                    f"Error queueing audio chunk: {e}", exc_info=True)
     async def recv(self, frames):
         try:

             return
         try:
             logging.info(f"Sending text to Gemini: '{user_text[:50]}...'")
+            await self.gemini_session.send_client_content(types.Part(text=user_text))
         except Exception as e:
             logging.error(
                 f"Error sending text message to Gemini: {e}", exc_info=True)
         if not all(k in media_data for k in ["data", "mime_type"]):
             logging.warning(f"Media data missing required fields")
             return None
+        # Handle audio data - ensure proper format for Gemini API
+        if media_data["mime_type"].startswith("audio/"):
+            try:
+                # Ensure audio data is in bytes format
+                if isinstance(media_data["data"], bytes):
+                    # No need to base64 encode binary audio data for Gemini API
+                    # Just ensure the mime_type is correctly formatted
+                    if "rate=" not in media_data["mime_type"]:
+                        # Default to 16kHz if not specified
+                        media_data["mime_type"] = f"audio/L16;rate=16000;channels=1"
+                    # Create a new dict to avoid modifying the original
+                    return {
+                        "mime_type": media_data["mime_type"],
+                        "data": media_data["data"]
+                    }
+                else:
+                    logging.warning(f"Audio data is not in bytes format: {type(media_data['data'])}")
+                    return None
+            except Exception as e:
+                logging.error(f"Error processing audio data: {e}", exc_info=True)
+                return None
         # Check if it's an image and needs resizing
         if media_data["mime_type"].startswith("image/"):
                     try:
                         validated_media = self._validate_media_payload(media_data)
                         if validated_media:
+                            # Log media type and size before sending
+                            data_size = len(validated_media.get('data', b'')) if isinstance(validated_media.get('data'), bytes) else len(validated_media.get('data', ''))
+                            logging.debug(f"Sending media to Gemini. Type: {validated_media.get('mime_type')}, Data size: {data_size} bytes")
+                            # Ensure we're not exceeding WebSocket payload limits
+                            if data_size > MAX_PAYLOAD_SIZE_BYTES:
+                                logging.warning(f"Media payload exceeds maximum size ({data_size} > {MAX_PAYLOAD_SIZE_BYTES}), skipping")
+                                continue
+                            # Send the validated media to Gemini
                             await self.gemini_session.send(input=validated_media)
                         else:
                             # Log if validation failed, but only if media_data was not None initially
                             if media_data is not None:
                                 logging.warning(f"Media validation failed for payload. Type: {media_data.get('mime_type') if isinstance(media_data, dict) else type(media_data)}, skipping send.")
                     except websockets.exceptions.ConnectionClosedError as e_conn_closed:
+                        error_code = getattr(e_conn_closed, 'code', None)
+                        error_reason = getattr(e_conn_closed, 'reason', 'Unknown reason')
+                        logging.error(f"WebSocket connection closed with code {error_code}: {error_reason}")
                         logging.error(f"Connection closed while sending media: {e_conn_closed}", exc_info=True)
+                        # If we get a 1007 error (invalid frame payload data), log more details
+                        if error_code == 1007:
+                            logging.error(f"Invalid frame payload data error. This is likely due to malformed media data.")
+                            if isinstance(media_data, dict):
+                                logging.error(f"Media type: {media_data.get('mime_type', 'unknown')}, Data type: {type(media_data.get('data', None))}")
+                        # Stop the interaction loop if connection is lost
+                        self.is_running = False
                     except Exception as e:
+                        logging.error(f"Error sending media chunk to Gemini: {e}", exc_info=True)
                 elif not media_data: # media_data could be None if queues were empty and timed out
                     await asyncio.sleep(0.05)  # Yield to other tasks if no media
         except asyncio.CancelledError:
                     f"Gemini session established with API for model {MODEL_NAME}.")
                 try:
                     logging.info("Sending system prompt to Gemini...")
+                    await self.gemini_session.send_client_content(types.Part(text=MEDICAL_ASSISTANT_SYSTEM_PROMPT))
                     logging.info("System prompt sent successfully.")
                 except Exception as e:
                     logging.error(
         if audio_chunks_to_gemini_q is None:
             return
         for frame in audio_frames:
             try:
+                # Extract audio data from frame
+                audio_data = frame.planes[0].to_bytes()
+                # Skip empty audio frames
+                if not audio_data or len(audio_data) == 0:
+                    continue
+                # Ensure we're using the correct format for Gemini API
+                # WebSocket error 1007 occurs with invalid frame payload data
+                # Using standard audio/L16 with 16kHz sample rate (matches SEND_SAMPLE_RATE)
+                mime_type = "audio/L16;rate=16000;channels=1"
+                # Validate audio data before queueing
                 if isinstance(audio_data, bytes) and len(audio_data) > 0:
+                    # Check if data size is reasonable (avoid oversized payloads)
+                    if len(audio_data) > MAX_PAYLOAD_SIZE_BYTES:
+                        logging.warning(f"Audio chunk too large ({len(audio_data)} bytes), skipping")
+                        continue
+                    # Create properly formatted API data
+                    api_data = {
+                        "data": audio_data,  # Keep as bytes, don't base64 encode
+                        "mime_type": mime_type
+                    }
+                    # Handle queue overflow
                     if audio_chunks_to_gemini_q.full():
                         try:
+                            # Remove oldest item if queue is full
                             await asyncio.wait_for(audio_chunks_to_gemini_q.get(), timeout=0.01)
+                            audio_chunks_to_gemini_q.task_done()
                         except asyncio.TimeoutError:
                             logging.warning("Audio queue full, chunk dropped.")
                             continue
+                    # Queue the validated audio data
                     audio_chunks_to_gemini_q.put_nowait(api_data)
+                else:
+                    logging.warning(f"Invalid audio data format: {type(audio_data)}, skipping")
             except Exception as e:
+                logging.error(f"Error processing audio chunk: {e}", exc_info=True)
     async def recv(self, frames):
         try: