Spaces:

renesistech
/

Spatial-aware

Running

App Files Files Community

noumanjavaid committed 7 days ago

Commit

a4d774b

verified ·

1 Parent(s): ed44aa7

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +47 -30

src/streamlit_app.py CHANGED Viewed

@@ -47,12 +47,10 @@ VIDEO_FPS_TO_GEMINI = 2
 VIDEO_API_RESIZE = (1024, 1024)
 # !!! IMPORTANT: Verify this model name is correct for the Live API !!!
-# This is a common point of failure for ConnectionClosedError.
 MODEL_NAME = "models/gemini-2.0-flash-live-001"
 logging.info(f"Using Gemini Model: {MODEL_NAME}")
 MEDICAL_ASSISTANT_SYSTEM_PROMPT = """You are an AI Medical Assistant. Your primary function is to analyze visual information from the user's camera or screen and respond via voice.
 Your responsibilities are:
 1.  **Visual Observation and Description:** Carefully examine the images or video feed. Describe relevant details you observe.
 2.  **General Information (Non-Diagnostic):** Provide general information related to what is visually presented, if applicable. You are not a diagnostic tool.
@@ -63,7 +61,6 @@ Your responsibilities are:
     *   If you see something that *appears* visually concerning (e.g., an unusual skin lesion, signs of injury), you may gently suggest it might be wise to have it looked at by a professional, without speculating on what it is.
 4.  **Tone:** Maintain a helpful, empathetic, and calm tone.
 5.  **Interaction:** After this initial instruction, you can make a brief acknowledgment of your role (e.g., "I'm ready to assist by looking at what you show me. Please remember to consult a doctor for medical advice."). Then, focus on responding to the user's visual input and questions.
 Example of a disclaimer you might use: "As an AI assistant, I can describe what I see, but I can't provide medical advice or diagnoses. For any health concerns, it's always best to speak with a doctor or other healthcare professional."
 """
@@ -101,16 +98,18 @@ else:
     logging.critical("GEMINI_API_KEY not found.")
     st.stop()
-# Gemini LiveConnectConfig - HIGHLY SIMPLIFIED FOR DEBUGGING ConnectionClosedError
-# Start with the absolute minimum. If this connects, incrementally add back features.
-# If this still fails, the issue is likely MODEL_NAME or API Key/Project permissions.
 LIVE_CONNECT_CONFIG = types.LiveConnectConfig(
-    response_modalities=["audio"], # Start with text only
     speech_config=types.SpeechConfig(
         voice_config=types.VoiceConfig(
-            prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Zephyr")
-        ),
-    )
 # --- Backend Gemini Interaction Loop ---
 class GeminiInteractionLoop:
@@ -126,6 +125,8 @@ class GeminiInteractionLoop:
             return
         try:
             logging.info(f"Sending text to Gemini: '{user_text[:50]}...'")
             await self.gemini_session.send(input=user_text, end_of_turn=True)
         except Exception as e:
             logging.error(f"Error sending text message to Gemini: {e}", exc_info=True)
@@ -153,7 +154,10 @@ class GeminiInteractionLoop:
                 media_data = await get_media_from_queues()
                 if media_data is None and not self.is_running: break # Sentinel and stop signal
                 if media_data and self.gemini_session and self.is_running:
-                    try: await self.gemini_session.send(input=media_data)
                     except Exception as e: logging.error(f"Error sending media chunk to Gemini: {e}", exc_info=True)
                 elif not media_data: await asyncio.sleep(0.05) # No data, yield
         except asyncio.CancelledError: logging.info("Task cancelled: stream_media_to_gemini.")
@@ -176,7 +180,6 @@ class GeminiInteractionLoop:
                             logging.info(f"Gemini text response: {text_response[:100]}")
                             if 'chat_messages' not in st.session_state: st.session_state.chat_messages = []
                             st.session_state.chat_messages = st.session_state.chat_messages + [{"role": "assistant", "content": text_response}]
-                            # Consider using st.rerun() via a thread-safe mechanism if immediate UI update is critical
                 except types.generation_types.StopCandidateException: logging.info("Gemini response stream ended normally.")
                 except Exception as e:
                     if self.is_running: logging.error(f"Error receiving from Gemini: {e}", exc_info=True)
@@ -221,8 +224,8 @@ class GeminiInteractionLoop:
     def signal_stop(self):
         logging.info("Signal to stop GeminiInteractionLoop received.")
         self.is_running = False
-        for q_name, q_obj_ref in [("video_q", video_frames_to_gemini_q),
-                                  ("audio_in_q", audio_chunks_to_gemini_q),
                                   ("audio_out_q", audio_from_gemini_playback_q)]:
             if q_obj_ref:
                 try: q_obj_ref.put_nowait(None)
@@ -255,7 +258,7 @@ class GeminiInteractionLoop:
                     logging.error(f"Failed to send system prompt: {e}", exc_info=True)
                     self.is_running = False; return
-                # Python 3.9 does not have asyncio.TaskGroup, so manage tasks individually
                 tasks = []
                 try:
                     logging.info("Creating async tasks for Gemini interaction...")
@@ -263,12 +266,23 @@ class GeminiInteractionLoop:
                     tasks.append(asyncio.create_task(self.process_gemini_responses(), name="process_gemini_responses"))
                     tasks.append(asyncio.create_task(self.play_gemini_audio(), name="play_gemini_audio"))
                     logging.info("All Gemini interaction tasks created.")
-                    await asyncio.gather(*tasks) # Wait for all tasks to complete
-                except Exception as e_gather: # Catch errors from tasks gathered
-                    logging.error(f"Error during asyncio.gather: {e_gather}", exc_info=True)
                     for task in tasks:
-                        if not task.done(): task.cancel() # Cancel pending tasks
-                    await asyncio.gather(*tasks, return_exceptions=True) # Wait for cancellations
                 logging.info("Gemini interaction tasks finished or cancelled.")
         except asyncio.CancelledError: logging.info("GeminiInteractionLoop.run_main_loop() was cancelled.")
@@ -276,15 +290,22 @@ class GeminiInteractionLoop:
             logging.error(f"Exception in GeminiInteractionLoop run_main_loop: {type(e).__name__}: {e}", exc_info=True)
         finally:
             logging.info("GeminiInteractionLoop.run_main_loop() finishing...")
-            self.is_running = False # Ensure flag is set for all tasks
-            self.signal_stop() # Send sentinels again to be sure
             self.gemini_session = None
-            # Clear global queues by setting them to None
             video_frames_to_gemini_q = None
             audio_chunks_to_gemini_q = None
             audio_from_gemini_playback_q = None
             logging.info("GeminiInteractionLoop finished and global queues set to None.")
 # --- WebRTC Media Processors ---
 class VideoProcessor(VideoProcessorBase):
     def __init__(self):
@@ -311,8 +332,7 @@ class VideoProcessor(VideoProcessorBase):
             video_frames_to_gemini_q.put_nowait(api_data)
         except Exception as e: logging.error(f"Error processing/queueing video frame: {e}", exc_info=True)
-    async def recv(self
-, frame):
         img_bgr = frame.to_ndarray(format="bgr24")
         try:
             loop = asyncio.get_running_loop()
@@ -325,11 +345,8 @@ class AudioProcessor(AudioProcessorBase):
         if audio_chunks_to_gemini_q is None: return
         for frame in audio_frames:
             audio_data = frame.planes[0].to_bytes()
-            # Note: Ensure this mime_type and the actual audio data format (sample rate, channels, bit depth)
-            # are compatible with what the Gemini Live API expects for PCM audio.
             mime_type = f"audio/L16;rate={frame.sample_rate};channels={frame.layout.channels}"
-            api_data = {"data"
-: audio_data, "mime_type": mime_type}
             try:
                 if audio_chunks_to_gemini_q.full():
                     try: await asyncio.wait_for(audio_chunks_to_gemini_q.get(), timeout=0.01)
@@ -414,7 +431,7 @@ def run_streamlit_app():
         if webrtc_ctx.state.playing:
             st.caption("WebRTC connected. Streaming your camera and microphone.")
-        elif st.session_state.gemini_session_active: # Check if session is supposed to be active
             st.caption("WebRTC attempting to connect. Ensure camera/microphone permissions are granted in your browser.")
             if hasattr(webrtc_ctx.state, 'error') and webrtc_ctx.state.error:
                 st.error(f"WebRTC Connection Error: {webrtc_ctx.state.error}")

 VIDEO_API_RESIZE = (1024, 1024)
 # !!! IMPORTANT: Verify this model name is correct for the Live API !!!
 MODEL_NAME = "models/gemini-2.0-flash-live-001"
 logging.info(f"Using Gemini Model: {MODEL_NAME}")
 MEDICAL_ASSISTANT_SYSTEM_PROMPT = """You are an AI Medical Assistant. Your primary function is to analyze visual information from the user's camera or screen and respond via voice.
 Your responsibilities are:
 1.  **Visual Observation and Description:** Carefully examine the images or video feed. Describe relevant details you observe.
 2.  **General Information (Non-Diagnostic):** Provide general information related to what is visually presented, if applicable. You are not a diagnostic tool.
     *   If you see something that *appears* visually concerning (e.g., an unusual skin lesion, signs of injury), you may gently suggest it might be wise to have it looked at by a professional, without speculating on what it is.
 4.  **Tone:** Maintain a helpful, empathetic, and calm tone.
 5.  **Interaction:** After this initial instruction, you can make a brief acknowledgment of your role (e.g., "I'm ready to assist by looking at what you show me. Please remember to consult a doctor for medical advice."). Then, focus on responding to the user's visual input and questions.
 Example of a disclaimer you might use: "As an AI assistant, I can describe what I see, but I can't provide medical advice or diagnoses. For any health concerns, it's always best to speak with a doctor or other healthcare professional."
 """
     logging.critical("GEMINI_API_KEY not found.")
     st.stop()
+# Gemini LiveConnectConfig: request audio responses using the prebuilt "Puck" voice.
+# Ensure this configuration is valid for your API key and model.
 LIVE_CONNECT_CONFIG = types.LiveConnectConfig(
+    response_modalities=["audio"], # Requesting audio response
     speech_config=types.SpeechConfig(
         voice_config=types.VoiceConfig(
+            prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Puck") # Using Puck voice
+        )
+    ) # <---------------------------------- CORRECTED: Added missing closing parenthesis
+)
+logging.info(f"Attempting connection with LiveConnectConfig: {LIVE_CONNECT_CONFIG}")
 # --- Backend Gemini Interaction Loop ---
 class GeminiInteractionLoop:
             return
         try:
             logging.info(f"Sending text to Gemini: '{user_text[:50]}...'")
+            # Use the specific method as suggested by the deprecation warning if possible
+            # For now, keeping session.send as it was working functionally
             await self.gemini_session.send(input=user_text, end_of_turn=True)
         except Exception as e:
             logging.error(f"Error sending text message to Gemini: {e}", exc_info=True)
                 media_data = await get_media_from_queues()
                 if media_data is None and not self.is_running: break # Sentinel and stop signal
                 if media_data and self.gemini_session and self.is_running:
+                    try:
+                        # Use the specific method as suggested by the deprecation warning if possible
+                        # For now, keeping session.send as it was working functionally
+                        await self.gemini_session.send(input=media_data)
                     except Exception as e: logging.error(f"Error sending media chunk to Gemini: {e}", exc_info=True)
                 elif not media_data: await asyncio.sleep(0.05) # No data, yield
         except asyncio.CancelledError: logging.info("Task cancelled: stream_media_to_gemini.")
                             logging.info(f"Gemini text response: {text_response[:100]}")
                             if 'chat_messages' not in st.session_state: st.session_state.chat_messages = []
                             st.session_state.chat_messages = st.session_state.chat_messages + [{"role": "assistant", "content": text_response}]
                 except types.generation_types.StopCandidateException: logging.info("Gemini response stream ended normally.")
                 except Exception as e:
                     if self.is_running: logging.error(f"Error receiving from Gemini: {e}", exc_info=True)
     def signal_stop(self):
         logging.info("Signal to stop GeminiInteractionLoop received.")
         self.is_running = False
+        for q_name, q_obj_ref in [("video_q", video_frames_to_gemini_q),
+                                  ("audio_in_q", audio_chunks_to_gemini_q),
                                   ("audio_out_q", audio_from_gemini_playback_q)]:
             if q_obj_ref:
                 try: q_obj_ref.put_nowait(None)
                     logging.error(f"Failed to send system prompt: {e}", exc_info=True)
                     self.is_running = False; return
+                # Python 3.9 has no asyncio.TaskGroup, so tasks are created and awaited manually (asyncio.wait / asyncio.gather)
                 tasks = []
                 try:
                     logging.info("Creating async tasks for Gemini interaction...")
                     tasks.append(asyncio.create_task(self.process_gemini_responses(), name="process_gemini_responses"))
                     tasks.append(asyncio.create_task(self.play_gemini_audio(), name="play_gemini_audio"))
                     logging.info("All Gemini interaction tasks created.")
+                    # Wait for tasks to complete or raise an exception
+                    done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
+                    # Check results of completed tasks for errors
+                    for future in done:
+                        try:
+                            future.result() # Raise exception if task failed
+                        except Exception as task_exc:
+                            logging.error(f"Task {future.get_name()} failed: {task_exc}", exc_info=True)
+                            # Optionally cancel remaining tasks if one fails critically
+                            for p_task in pending: p_task.cancel()
+                    # If loop completes normally (e.g., user stops), pending tasks will be handled by finally block
+                except Exception as e_gather: # Catch errors during task creation/gathering
+                    logging.error(f"Error during task management: {e_gather}", exc_info=True)
                     for task in tasks:
+                        if not task.done(): task.cancel()
+                    # Wait for cancellations to complete
+                    await asyncio.gather(*tasks, return_exceptions=True)
                 logging.info("Gemini interaction tasks finished or cancelled.")
         except asyncio.CancelledError: logging.info("GeminiInteractionLoop.run_main_loop() was cancelled.")
             logging.error(f"Exception in GeminiInteractionLoop run_main_loop: {type(e).__name__}: {e}", exc_info=True)
         finally:
             logging.info("GeminiInteractionLoop.run_main_loop() finishing...")
+            self.is_running = False
+            self.signal_stop() # Ensure sentinels are sent
+            # Clean up any remaining tasks (important if gather didn't complete)
+            # current_tasks = [t for t in asyncio.all_tasks(self.async_event_loop) if t is not asyncio.current_task()]
+            # if current_tasks:
+            #     logging.info(f"Cancelling {len(current_tasks)} remaining tasks...")
+            #     for task in current_tasks: task.cancel()
+            #     await asyncio.gather(*current_tasks, return_exceptions=True)
             self.gemini_session = None
             video_frames_to_gemini_q = None
             audio_chunks_to_gemini_q = None
             audio_from_gemini_playback_q = None
             logging.info("GeminiInteractionLoop finished and global queues set to None.")
 # --- WebRTC Media Processors ---
 class VideoProcessor(VideoProcessorBase):
     def __init__(self):
             video_frames_to_gemini_q.put_nowait(api_data)
         except Exception as e: logging.error(f"Error processing/queueing video frame: {e}", exc_info=True)
+    async def recv(self, frame):
         img_bgr = frame.to_ndarray(format="bgr24")
         try:
             loop = asyncio.get_running_loop()
         if audio_chunks_to_gemini_q is None: return
         for frame in audio_frames:
             audio_data = frame.planes[0].to_bytes()
             mime_type = f"audio/L16;rate={frame.sample_rate};channels={frame.layout.channels}"
+            api_data = {"data": audio_data, "mime_type": mime_type}
             try:
                 if audio_chunks_to_gemini_q.full():
                     try: await asyncio.wait_for(audio_chunks_to_gemini_q.get(), timeout=0.01)
         if webrtc_ctx.state.playing:
             st.caption("WebRTC connected. Streaming your camera and microphone.")
+        elif st.session_state.gemini_session_active:
             st.caption("WebRTC attempting to connect. Ensure camera/microphone permissions are granted in your browser.")
             if hasattr(webrtc_ctx.state, 'error') and webrtc_ctx.state.error:
                 st.error(f"WebRTC Connection Error: {webrtc_ctx.state.error}")