noumanjavaid commited on
Commit
7630a47
·
verified ·
1 Parent(s): 4afb504

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +298 -660
app.py CHANGED
@@ -1,69 +1,25 @@
1
- # -*- coding: utf-8 -*-
2
- import streamlit as st
3
  import os
 
4
  import asyncio
5
- import base64
6
- import io
7
- import threading
8
- import traceback
9
- import atexit
10
- import time
11
  import logging
12
- from dotenv import load_dotenv
13
-
 
14
  import cv2
15
- import pyaudio
16
- import PIL.Image
17
-
18
- # Import websockets for explicit exception handling
19
- import websockets.exceptions
20
-
21
- from google import genai
22
- from google.genai import types
23
  from google.genai.types import Content, Part
 
 
 
 
24
 
25
- from streamlit_webrtc import (
26
- webrtc_streamer,
27
- WebRtcMode,
28
- AudioProcessorBase,
29
- VideoProcessorBase,
30
- )
31
-
32
- load_dotenv()
33
-
34
-
35
- # Audio configuration - fix audio format issues
36
- FORMAT = pyaudio.paInt16
37
- CHANNELS = 1
38
- SEND_SAMPLE_RATE = 16000 # Changed to match mime_type for consistency
39
- RECEIVE_SAMPLE_RATE = 16000 # Changed from 24000 to 16000 to match send rate
40
- CHUNK_SIZE = 1024
41
-
42
- # Map PyAudio format to a more descriptive name for clarity.
43
- PYAUDIO_FORMAT = FORMAT # pyaudio.paInt16
44
- PYAUDIO_CHANNELS = CHANNELS
45
- PYAUDIO_PLAYBACK_CHUNK_SIZE = CHUNK_SIZE
46
- GEMINI_AUDIO_RECEIVE_SAMPLE_RATE = RECEIVE_SAMPLE_RATE
47
-
48
- # Video configuration
49
- VIDEO_FPS_TO_GEMINI = 1 # Reduced from 2 to lower bandwidth
50
- VIDEO_API_RESIZE = (512, 512) # Reduced from 1024x1024 to lower payload size
51
- MAX_PAYLOAD_SIZE_BYTES = 60000 # Just under 64KB WebSocket limit
52
-
53
- # Queue sizes
54
- MEDIA_TO_GEMINI_QUEUE_MAXSIZE = 10
55
- AUDIO_PLAYBACK_QUEUE_MAXSIZE = 10
56
-
57
- # WebRTC settings
58
- WEBRTC_REQUESTED_SEND_SAMPLE_RATE = SEND_SAMPLE_RATE
59
- WEBRTC_REQUESTED_AUDIO_CHANNELS = CHANNELS
60
 
61
-
62
- # !!! IMPORTANT: Verify this model name is correct for the Live API !!!
63
- MODEL_NAME = "models/gemini-2.0-flash-live-001"
64
- logging.info(f"Using Gemini Model: {MODEL_NAME}")
65
-
66
- MEDICAL_ASSISTANT_SYSTEM_PROMPT = """You are an AI Medical Assistant. Your primary function is to analyze visual information from the user's camera or screen and respond via voice.
67
  Your responsibilities are:
68
  1. **Visual Observation and Description:** Carefully examine the images or video feed. Describe relevant details you observe.
69
  2. **General Information (Non-Diagnostic):** Provide general information related to what is visually presented, if applicable. You are not a diagnostic tool.
@@ -75,641 +31,323 @@ Your responsibilities are:
75
  4. **Tone:** Maintain a helpful, empathetic, and calm tone.
76
  5. **Interaction:** After this initial instruction, you can make a brief acknowledgment of your role (e.g., "I'm ready to assist by looking at what you show me. Please remember to consult a doctor for medical advice."). Then, focus on responding to the user's visual input and questions.
77
  Example of a disclaimer you might use: "As an AI assistant, I can describe what I see, but I can't provide medical advice or diagnoses. For any health concerns, it's always best to speak with a doctor or other healthcare professional."
78
- """
79
-
80
- # --- PyAudio Global Instance and Cleanup ---
81
- pya = None
82
- try:
83
- pya = pyaudio.PyAudio()
84
-
85
- def cleanup_pyaudio():
86
- logging.info("Terminating PyAudio instance.")
87
- if pya:
88
- pya.terminate()
89
- atexit.register(cleanup_pyaudio)
90
- logging.info("PyAudio initialized successfully.")
91
- except Exception as e_pyaudio:
92
- logging.warning(
93
- f"PyAudio initialization failed (expected in some server environments): {e_pyaudio}")
94
- pya = None
95
-
96
- # --- Global Queues - Declare as None, initialize later ---
97
- video_frames_to_gemini_q: asyncio.Queue = None
98
- audio_chunks_to_gemini_q: asyncio.Queue = None
99
- audio_from_gemini_playback_q: asyncio.Queue = None
100
-
101
- # --- Gemini Client Setup ---
102
- # Try to get API key from environment or use a manually provided one
103
- def initialize_gemini_client():
104
- # Check for API key in various places
105
- api_key = os.environ.get("GEMINI_API_KEY")
106
-
107
- # Look for .env file (original or new)
108
- if not api_key:
109
- # Hardcoded API key from the user's message as fallback
110
- api_key = "AIzaSyBy5-l1xR1FN78jQB-MbJhQbRzq-ruoXuI"
111
-
112
- # Try reading from .env.new which we know exists and has permissions
113
- env_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".env.new")
114
- try:
115
- if os.path.exists(env_file):
116
- with open(env_file, "r") as f:
117
- for line in f:
118
- if line.startswith("GEMINI_API_KEY="):
119
- api_key = line.strip().split("=", 1)[1]
120
- # Remove quotes if present
121
- api_key = api_key.strip('\'"')
122
- break
123
- except (PermissionError, IOError) as e:
124
- logging.warning(f"Could not read {env_file}: {e}")
125
- # Continue with the hardcoded key
126
-
127
- # Initialize client with the API key
128
- if api_key:
129
- try:
130
- client = genai.Client(http_options={"api_version": "v1beta"}, api_key=api_key)
131
- logging.info("Gemini client initialized successfully.")
132
- return client
133
- except Exception as e:
134
- logging.critical(f"Gemini client initialization failed: {e}", exc_info=True)
135
- return None
136
- else:
137
- logging.critical("GEMINI_API_KEY not found.")
138
- return None
139
-
140
- client = initialize_gemini_client()
141
-
142
- # Configure the Gemini Live connection with proper settings
143
- LIVE_CONNECT_CONFIG = types.LiveConnectConfig(
144
- response_modalities=["audio"], # Only requesting audio and text responses
145
- speech_config=types.SpeechConfig(
146
- voice_config=types.VoiceConfig(
147
- prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Zephyr")
148
- )
149
- )
150
- )
151
- logging.info(f"Attempting connection with LiveConnectConfig: {LIVE_CONNECT_CONFIG}")
152
-
153
 
154
- # --- Backend Gemini Interaction Loop ---
155
  class GeminiInteractionLoop:
156
- def __init__(self):
 
 
 
 
 
 
 
 
 
157
  self.gemini_session = None
158
- self.async_event_loop = None
159
- self.is_running = True
160
- self.playback_stream = None
161
-
162
- async def send_text_input_to_gemini(self, user_text):
163
- if not user_text or not self.gemini_session or not self.is_running:
164
- logging.warning(
165
- "Cannot send text. Session not active, no text, or not running.")
166
- return
167
- try:
168
- logging.info(f"Sending text to Gemini: '{user_text[:50]}...'")
169
- # Use send_client_content as specified in the error message
170
- content = Content(parts=[Part(text=user_text)])
171
- await self.gemini_session.send_client_content(content)
172
- except Exception as e:
173
- logging.error(
174
- f"Error sending text message to Gemini: {e}", exc_info=True)
175
-
176
- # Helper function to validate and possibly resize media data
177
- def _validate_media_payload(self, media_data):
178
- """Validate and potentially reduce size of media payload"""
179
- if not isinstance(media_data, dict):
180
- logging.warning(f"Invalid media data type: {type(media_data)}")
181
- return None
182
-
183
- if not all(k in media_data for k in ["data", "mime_type"]):
184
- logging.warning(f"Media data missing required fields")
185
- return None
186
 
187
- # Handle audio data - ensure proper format for Gemini API
188
- if media_data["mime_type"].startswith("audio/"):
 
189
  try:
190
- # Ensure audio data is in bytes format
191
- if isinstance(media_data["data"], bytes):
192
- # No need to base64 encode binary audio data for Gemini API
193
- # Just ensure the mime_type is correctly formatted
194
- if "rate=" not in media_data["mime_type"]:
195
- # Default to 16kHz if not specified
196
- media_data["mime_type"] = f"audio/L16;rate=16000;channels=1"
197
-
198
- # Create a new dict to avoid modifying the original
199
- return {
200
- "mime_type": media_data["mime_type"],
201
- "data": media_data["data"]
202
- }
203
- else:
204
- logging.warning(f"Audio data is not in bytes format: {type(media_data['data'])}")
205
- return None
206
  except Exception as e:
207
- logging.error(f"Error processing audio data: {e}", exc_info=True)
208
- return None
209
-
210
- # Check if it's an image and needs resizing
211
- if media_data["mime_type"].startswith("image/"):
212
  try:
213
- data_size = len(media_data["data"])
214
- if data_size > MAX_PAYLOAD_SIZE_BYTES:
215
- logging.warning(f"Image payload too large ({data_size} bytes), reducing quality")
216
- # Decode base64 image
217
- img_bytes = base64.b64decode(media_data["data"])
218
- img = PIL.Image.open(io.BytesIO(img_bytes))
219
-
220
- # Try lower quality JPEG
221
- buffer = io.BytesIO()
222
- img.save(buffer, format="JPEG", quality=70)
223
- buffer.seek(0)
224
- smaller_bytes = buffer.getvalue()
225
-
226
- # Update the data with reduced size image
227
- media_data["data"] = base64.b64encode(smaller_bytes).decode()
228
  except Exception as e:
229
- logging.error(f"Error resizing image: {e}", exc_info=True)
230
-
231
- return media_data
232
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  async def stream_media_to_gemini(self):
234
- logging.info("Task started: Stream media from WebRTC queues to Gemini.")
235
-
236
- async def get_media_from_queues():
237
- if video_frames_to_gemini_q is None or audio_chunks_to_gemini_q is None:
238
- await asyncio.sleep(0.1)
239
- return None
240
- try:
241
- video_frame = await asyncio.wait_for(video_frames_to_gemini_q.get(), timeout=0.02)
242
- if video_frame is None:
243
- return None # Sentinel received
244
- video_frames_to_gemini_q.task_done()
245
- return video_frame
246
- except asyncio.TimeoutError:
247
- pass
248
- except Exception as e:
249
- logging.error(f"Error getting video from queue: {e}", exc_info=True)
250
- try:
251
- audio_chunk = await asyncio.wait_for(audio_chunks_to_gemini_q.get(), timeout=0.02)
252
- if audio_chunk is None:
253
- return None # Sentinel received
254
- audio_chunks_to_gemini_q.task_done()
255
- return audio_chunk
256
- except asyncio.TimeoutError:
257
- return None
258
- except Exception as e:
259
- logging.error(f"Error getting audio from queue: {e}", exc_info=True)
260
- return None
261
-
262
  try:
 
 
 
263
  while self.is_running:
264
- if not self.gemini_session:
265
- await asyncio.sleep(0.1)
266
- continue
267
- media_data = await get_media_from_queues()
268
- if media_data is None and not self.is_running:
269
- break # Sentinel and stop signal
270
-
271
- if media_data and self.gemini_session and self.is_running:
272
- try:
273
- validated_media = self._validate_media_payload(media_data)
274
- if validated_media:
275
- # Log media type and size before sending
276
- data_size = len(validated_media.get('data', b'')) if isinstance(validated_media.get('data'), bytes) else len(validated_media.get('data', ''))
277
- logging.debug(f"Sending media to Gemini. Type: {validated_media.get('mime_type')}, Data size: {data_size} bytes")
278
-
279
- # Ensure we're not exceeding WebSocket payload limits
280
- if data_size > MAX_PAYLOAD_SIZE_BYTES:
281
- logging.warning(f"Media payload exceeds maximum size ({data_size} > {MAX_PAYLOAD_SIZE_BYTES}), skipping")
282
- continue
283
-
284
- # Send the validated media to Gemini
285
- await self.gemini_session.send(input=validated_media)
286
- else:
287
- # Log if validation failed, but only if media_data was not None initially
288
- if media_data is not None:
289
- logging.warning(f"Media validation failed for payload. Type: {media_data.get('mime_type') if isinstance(media_data, dict) else type(media_data)}, skipping send.")
290
- except websockets.exceptions.ConnectionClosedError as e_conn_closed:
291
- error_code = getattr(e_conn_closed, 'code', None)
292
- error_reason = getattr(e_conn_closed, 'reason', 'Unknown reason')
293
- logging.error(f"WebSocket connection closed with code {error_code}: {error_reason}")
294
- logging.error(f"Connection closed while sending media: {e_conn_closed}", exc_info=True)
295
-
296
- # If we get a 1007 error (invalid frame payload data), log more details
297
- if error_code == 1007:
298
- logging.error(f"Invalid frame payload data error. This is likely due to malformed media data.")
299
- if isinstance(media_data, dict):
300
- logging.error(f"Media type: {media_data.get('mime_type', 'unknown')}, Data type: {type(media_data.get('data', None))}")
301
 
302
- # Stop the interaction loop if connection is lost
303
- self.is_running = False
304
- except Exception as e:
305
- logging.error(f"Error sending media chunk to Gemini: {e}", exc_info=True)
306
- elif not media_data: # media_data could be None if queues were empty and timed out
307
- await asyncio.sleep(0.05) # Yield to other tasks if no media
308
- except asyncio.CancelledError:
309
- logging.info("Task cancelled: stream_media_to_gemini.")
310
- finally:
311
- logging.info("Task finished: stream_media_to_gemini.")
312
-
313
- async def process_gemini_responses(self):
314
- logging.info("Task started: Process responses from Gemini.")
315
- try:
316
- while self.is_running:
317
- if not self.gemini_session:
318
- await asyncio.sleep(0.1)
319
- continue
320
- if audio_from_gemini_playback_q is None:
321
- await asyncio.sleep(0.1)
322
- continue
323
- try:
324
- turn_response = self.gemini_session.receive()
325
- async for chunk in turn_response:
326
- if not self.is_running:
327
- break
328
- if audio_data := chunk.data:
329
- if not audio_from_gemini_playback_q.full():
330
- audio_from_gemini_playback_q.put_nowait(audio_data)
331
- else:
332
- logging.warning(
333
- "Audio playback queue full, discarding Gemini audio data.")
334
- if text_response := chunk.text:
335
- logging.info(f"Gemini text response: {text_response[:100]}")
336
- except types.generation_types.StopCandidateException:
337
- logging.info("Gemini response stream ended normally.")
338
- except Exception as e:
339
- if self.is_running:
340
- logging.error(
341
- f"Error receiving from Gemini: {e}", exc_info=True)
342
- await asyncio.sleep(0.1)
343
- except asyncio.CancelledError:
344
- logging.info("Task cancelled: process_gemini_responses.")
345
- finally:
346
- logging.info("Task finished: process_gemini_responses.")
347
-
348
- async def play_gemini_audio(self):
349
- logging.info("Task started: Play Gemini audio responses.")
350
- if pya is None:
351
- logging.warning(
352
- "PyAudio not available. Audio playback task will not run.")
353
  return
354
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
  try:
356
- while audio_from_gemini_playback_q is None and self.is_running:
357
- await asyncio.sleep(0.1)
358
- if not self.is_running:
359
- return
360
-
361
- self.playback_stream = await asyncio.to_thread(
362
- pya.open, format=PYAUDIO_FORMAT, channels=PYAUDIO_CHANNELS, rate=GEMINI_AUDIO_RECEIVE_SAMPLE_RATE, output=True, frames_per_buffer=PYAUDIO_PLAYBACK_CHUNK_SIZE
363
- )
364
- logging.info(
365
- f"PyAudio playback stream opened at {GEMINI_AUDIO_RECEIVE_SAMPLE_RATE} Hz.")
366
  while self.is_running:
367
- try:
368
- audio_chunk = await asyncio.wait_for(audio_from_gemini_playback_q.get(), timeout=1.0)
369
- if audio_chunk is None and not self.is_running:
370
- break # Sentinel and stop signal
371
- if audio_chunk:
372
- await asyncio.to_thread(self.playback_stream.write, audio_chunk)
373
- if audio_chunk:
374
- audio_from_gemini_playback_q.task_done()
375
- except asyncio.TimeoutError:
376
- continue
377
- except Exception as e:
378
- logging.error(f"Error playing audio chunk: {e}", exc_info=True)
379
- await asyncio.sleep(0.01)
380
  except Exception as e:
381
- logging.error(
382
- f"Failed to open or use PyAudio playback stream (might be expected in this environment): {e}", exc_info=True)
383
- finally:
384
- if self.playback_stream:
385
- logging.info("Stopping and closing PyAudio playback stream.")
386
- try:
387
- await asyncio.to_thread(self.playback_stream.stop_stream)
388
- await asyncio.to_thread(self.playback_stream.close)
389
- except Exception as e_close:
390
- logging.error(
391
- f"Error closing playback stream: {e_close}", exc_info=True)
392
- self.playback_stream = None
393
- logging.info("Task finished: play_gemini_audio.")
394
-
395
- def signal_stop(self):
396
- logging.info("Signal to stop GeminiInteractionLoop received.")
397
- self.is_running = False
398
- for q_name, q_obj_ref in [("video_q", video_frames_to_gemini_q),
399
- ("audio_in_q", audio_chunks_to_gemini_q),
400
- ("audio_out_q", audio_from_gemini_playback_q)]:
401
- if q_obj_ref:
402
- try:
403
- q_obj_ref.put_nowait(None)
404
- except asyncio.QueueFull:
405
- logging.warning(
406
- f"Queue {q_name} was full when trying to put sentinel for stop signal.")
407
- except Exception as e:
408
- logging.error(
409
- f"Error putting sentinel in {q_name}: {e}", exc_info=True)
410
-
411
- async def run_main_loop(self):
412
- global video_frames_to_gemini_q, audio_chunks_to_gemini_q, audio_from_gemini_playback_q
413
-
414
- self.async_event_loop = asyncio.get_running_loop()
415
- self.is_running = True
416
- logging.info("GeminiInteractionLoop run_main_loop starting...")
417
-
418
- video_frames_to_gemini_q = asyncio.Queue(
419
- maxsize=MEDIA_TO_GEMINI_QUEUE_MAXSIZE)
420
- audio_chunks_to_gemini_q = asyncio.Queue(
421
- maxsize=MEDIA_TO_GEMINI_QUEUE_MAXSIZE)
422
- audio_from_gemini_playback_q = asyncio.Queue(
423
- maxsize=AUDIO_PLAYBACK_QUEUE_MAXSIZE)
424
- logging.info("Asyncio queues initialized in GeminiInteractionLoop.")
425
-
426
- if client is None:
427
- logging.critical(
428
- "Gemini client is None in run_main_loop. Aborting.")
429
- return
430
-
431
  try:
432
- async with client.aio.live.connect(model=MODEL_NAME, config=LIVE_CONNECT_CONFIG) as session:
433
- self.gemini_session = session
434
- logging.info(
435
- f"Gemini session established with API for model {MODEL_NAME}.")
436
  try:
437
- logging.info("Sending system prompt to Gemini...")
438
- # Use send_client_content with proper format
439
- content = Content(parts=[Part(text=MEDICAL_ASSISTANT_SYSTEM_PROMPT)])
440
- await self.gemini_session.send_client_content(content)
441
- logging.info("System prompt sent successfully.")
 
 
442
  except Exception as e:
443
- logging.error(
444
- f"Failed to send system prompt: {e}", exc_info=True)
445
- self.is_running = False
446
- return
447
-
448
- tasks = []
449
- try:
450
- logging.info("Creating async tasks for Gemini interaction...")
451
- media_stream_task = asyncio.create_task(self.stream_media_to_gemini(), name="stream_media_to_gemini")
452
- response_process_task = asyncio.create_task(self.process_gemini_responses(), name="process_gemini_responses")
453
- audio_play_task = asyncio.create_task(self.play_gemini_audio(), name="play_gemini_audio")
454
- tasks = [media_stream_task, response_process_task, audio_play_task]
455
- logging.info("All Gemini interaction tasks created.")
456
-
457
- # Wait for all tasks to complete, collecting all results/exceptions
458
- results = await asyncio.gather(*tasks, return_exceptions=True)
459
-
460
- for i, result in enumerate(results):
461
- if isinstance(result, Exception):
462
- task_name = tasks[i].get_name() if hasattr(tasks[i], 'get_name') else f"Task-{i}"
463
- logging.error(f"Task '{task_name}' failed: {result}", exc_info=result)
464
- # If one task fails, we might want to signal others to stop.
465
- # self.signal_stop() # This is already called in finally, but could be earlier if needed.
466
- except asyncio.CancelledError:
467
- logging.info("One or more tasks were cancelled during gather.")
468
- except Exception as e_gather:
469
- logging.error(f"Error during task management with asyncio.gather: {e_gather}", exc_info=True)
470
- finally:
471
- # Ensure all tasks are cancelled if not already done, before main loop finally block
472
- for task in tasks:
473
- if task and not task.done():
474
- task.cancel()
475
- # Await their cancellation (or completion if they finished cleanly before cancel)
476
- if tasks: # Ensure tasks list is not empty
477
- await asyncio.gather(*tasks, return_exceptions=True) # Suppress errors from already handled/cancelled tasks
478
- logging.info("Gemini interaction tasks processing completed or handled.")
479
-
480
- except websockets.exceptions.ConnectionClosedError as e:
481
- logging.error(f"WebSocket connection closed with error code {e.code}: {e}")
482
- st.error(f"Connection to Gemini API failed: {e}. Please try again.")
483
- except asyncio.CancelledError:
484
- logging.info("GeminiInteractionLoop.run_main_loop() was cancelled.")
485
- except Exception as e: # General catch-all
486
- logging.error(
487
- f"Exception in GeminiInteractionLoop run_main_loop: {type(e).__name__}: {e}", exc_info=True)
488
- finally:
489
- logging.info("GeminiInteractionLoop.run_main_loop() finishing...")
490
  self.is_running = False
491
- self.signal_stop() # Ensure sentinels are sent
492
-
493
- self.gemini_session = None
494
- video_frames_to_gemini_q = None
495
- audio_chunks_to_gemini_q = None
496
- audio_from_gemini_playback_q = None
497
- logging.info(
498
- "GeminiInteractionLoop finished and global queues set to None.")
499
-
500
-
501
- # --- WebRTC Media Processors ---
502
- class VideoProcessor(VideoProcessorBase):
503
- def __init__(self):
504
- self.frame_counter = 0
505
- self.last_gemini_send_time = time.monotonic()
506
-
507
- async def _process_and_queue_frame_async(self, frame_ndarray):
508
- if video_frames_to_gemini_q is None:
509
  return
510
- self.frame_counter += 1
511
- current_time = time.monotonic()
512
- if (current_time - self.last_gemini_send_time) < (1.0 / VIDEO_FPS_TO_GEMINI):
 
 
 
 
 
 
 
 
 
 
 
513
  return
514
- self.last_gemini_send_time = current_time
515
  try:
516
- img_rgb = cv2.cvtColor(frame_ndarray, cv2.COLOR_BGR2RGB)
517
- pil_img = PIL.Image.fromarray(img_rgb)
518
- pil_img.thumbnail(VIDEO_API_RESIZE) # Smaller resolution
519
- image_io = io.BytesIO()
520
- pil_img.save(image_io, format="jpeg", quality=85) # Lower quality
521
- image_bytes = image_io.getvalue()
 
 
522
 
523
- # Check if image size is too large before encoding to base64
524
- if len(image_bytes) > MAX_PAYLOAD_SIZE_BYTES:
525
- logging.warning(f"Image too large ({len(image_bytes)} bytes), reducing quality further")
526
- image_io = io.BytesIO()
527
- pil_img.save(image_io, format="jpeg", quality=60) # Even lower quality
528
- image_bytes = image_io.getvalue()
529
 
530
- api_data = {"mime_type": "image/jpeg",
531
- "data": base64.b64encode(image_bytes).decode()}
532
-
533
- if video_frames_to_gemini_q.full():
534
- try:
535
- await asyncio.wait_for(video_frames_to_gemini_q.get(), timeout=0.01)
536
- except asyncio.TimeoutError:
537
- logging.warning("Video queue full, frame dropped.")
538
- return
539
- video_frames_to_gemini_q.put_nowait(api_data)
540
  except Exception as e:
541
- logging.error(
542
- f"Error processing/queueing video frame: {e}", exc_info=True)
543
-
544
- async def recv(self, frame):
545
- img_bgr = frame.to_ndarray(format="bgr24")
546
  try:
547
- loop = asyncio.get_running_loop()
548
- loop.create_task(self._process_and_queue_frame_async(img_bgr))
549
- except RuntimeError:
550
- logging.error(
551
- "VideoProcessor.recv: No running asyncio loop in current thread for create_task.")
552
- return frame
553
-
554
-
555
- class AudioProcessor(AudioProcessorBase):
556
- async def _process_and_queue_audio_async(self, audio_frames):
557
- if audio_chunks_to_gemini_q is None:
558
- return
559
- for frame in audio_frames:
 
 
 
 
 
 
560
  try:
561
- # Extract audio data from frame
562
- audio_data = frame.planes[0].to_bytes()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
 
564
- # Skip empty audio frames
565
- if not audio_data or len(audio_data) == 0:
566
- continue
567
 
568
- # Ensure we're using the correct format for Gemini API
569
- # WebSocket error 1007 occurs with invalid frame payload data
570
- # Using standard audio/L16 with 16kHz sample rate (matches SEND_SAMPLE_RATE)
571
- mime_type = "audio/L16;rate=16000;channels=1"
572
 
573
- # Validate audio data before queueing
574
- if isinstance(audio_data, bytes) and len(audio_data) > 0:
575
- # Check if data size is reasonable (avoid oversized payloads)
576
- if len(audio_data) > MAX_PAYLOAD_SIZE_BYTES:
577
- logging.warning(f"Audio chunk too large ({len(audio_data)} bytes), skipping")
578
- continue
579
-
580
- # Create properly formatted API data
581
- api_data = {
582
- "data": audio_data, # Keep as bytes, don't base64 encode
583
- "mime_type": mime_type
584
- }
585
-
586
- # Handle queue overflow
587
- if audio_chunks_to_gemini_q.full():
588
- try:
589
- # Remove oldest item if queue is full
590
- await asyncio.wait_for(audio_chunks_to_gemini_q.get(), timeout=0.01)
591
- audio_chunks_to_gemini_q.task_done()
592
- except asyncio.TimeoutError:
593
- logging.warning("Audio queue full, chunk dropped.")
594
- continue
595
-
596
- # Queue the validated audio data
597
- audio_chunks_to_gemini_q.put_nowait(api_data)
598
- else:
599
- logging.warning(f"Invalid audio data format: {type(audio_data)}, skipping")
600
  except Exception as e:
601
- logging.error(f"Error processing audio chunk: {e}", exc_info=True)
602
-
603
- async def recv(self, frames):
604
- try:
605
- loop = asyncio.get_running_loop()
606
- loop.create_task(self._process_and_queue_audio_async(frames))
607
- except RuntimeError:
608
- logging.error(
609
- "AudioProcessor.recv: No running asyncio loop in current thread for create_task.")
610
- return frames
611
-
612
-
613
- # --- Streamlit UI and Application Logic ---
614
- def initialize_app_session_state():
615
- defaults = {
616
- 'gemini_session_active': False,
617
- 'gemini_loop_instance': None,
618
- 'webrtc_component_key': f"webrtc_streamer_key_{int(time.time())}",
619
- }
620
- for key, value in defaults.items():
621
- if key not in st.session_state:
622
- st.session_state[key] = value
623
-
624
-
625
- def run_streamlit_app():
626
- st.set_page_config(page_title="Voice AI Medical Assistant", layout="wide")
627
- initialize_app_session_state()
628
-
629
- st.title("Voice AI Medical Assistant")
630
 
631
- # Display prominent error if client is not initialized
632
- if client is None:
633
- st.error("⚠️ Gemini API key not found or invalid. Please set a valid GEMINI_API_KEY in your .env file.")
634
- st.info("You can create a .env file in the project directory with content: GEMINI_API_KEY=your_api_key_here")
635
-
636
- st.warning("IMPORTANT: This is a VOICE-ONLY interface. Speak to the assistant through your microphone.")
637
- st.info("Remember: This AI cannot provide medical diagnoses. Always consult a healthcare professional for medical advice.")
638
-
639
- with st.sidebar:
640
- st.header("Session Control")
641
- if not st.session_state.gemini_session_active:
642
- # Fixed emojis
643
- if st.button("🚀 Start Voice Assistant", type="primary", use_container_width=True, key="start_session_btn"):
644
- st.session_state.gemini_session_active = True
645
-
646
- gemini_loop = GeminiInteractionLoop()
647
- st.session_state.gemini_loop_instance = gemini_loop
648
- threading.Thread(target=lambda: asyncio.run(gemini_loop.run_main_loop()), name="GeminiLoopThread", daemon=True).start()
649
- st.success("Voice Assistant starting... Please allow camera/microphone access in your browser if prompted.")
650
- st.session_state.webrtc_component_key = f"webrtc_streamer_key_{int(time.time())}"
651
- st.rerun()
652
- else:
653
- # Fixed emojis
654
- if st.button("🛑 Stop Session", type="secondary", use_container_width=True, key="stop_session_btn"):
655
- if st.session_state.gemini_loop_instance:
656
- st.session_state.gemini_loop_instance.signal_stop()
657
- st.session_state.gemini_loop_instance = None
658
- st.session_state.gemini_session_active = False
659
- st.warning("Session stopping...")
660
- time.sleep(0.5)
661
- st.rerun()
662
-
663
- if st.session_state.gemini_session_active:
664
- st.subheader("Your Live Feed (from your browser)")
665
-
666
- MEDIA_STREAM_CONSTRAINTS = {
667
- "video": True,
668
- "audio": {
669
- "sampleRate": {"ideal": WEBRTC_REQUESTED_SEND_SAMPLE_RATE},
670
- "channelCount": {"exact": WEBRTC_REQUESTED_AUDIO_CHANNELS},
671
- "echoCancellation": True,
672
- "noiseSuppression": True
673
- }
674
- }
675
-
676
- webrtc_ctx = webrtc_streamer(
677
- key=st.session_state.webrtc_component_key,
678
- mode=WebRtcMode.SENDONLY,
679
- rtc_configuration={
680
- "iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]
681
- },
682
- media_stream_constraints=MEDIA_STREAM_CONSTRAINTS,
683
- video_processor_factory=VideoProcessor,
684
- audio_processor_factory=AudioProcessor,
685
- async_processing=True,
686
  )
687
-
688
- if webrtc_ctx.state.playing:
689
- st.success("🎤 Voice Assistant is now ACTIVE. Speak to interact!")
690
- st.caption("The assistant is listening through your microphone and watching through your camera.")
691
- elif st.session_state.gemini_session_active:
692
- st.caption("Connecting... Ensure camera/microphone permissions are granted in your browser.")
693
- if hasattr(webrtc_ctx.state, 'error') and webrtc_ctx.state.error:
694
- st.error(f"WebRTC Connection Error: {webrtc_ctx.state.error}")
695
- else:
696
- st.info("Click 'Start Voice Assistant' in the sidebar to begin.")
697
 
698
- # Visual indicator for voice activity
699
- if st.session_state.gemini_session_active and webrtc_ctx.state.playing:
700
- with st.container():
701
- st.markdown("### How to use the Voice Assistant")
702
- st.markdown("""
703
- 1. **Speak naturally** - The assistant is listening through your microphone
704
- 2. **Show things to the camera** - The assistant can see what you're showing
705
- 3. **Listen for responses** - The assistant will speak back to you
706
-
707
- You do not need to type anything. This is a completely voice-controlled interface.
708
- """)
709
-
710
 
711
  if __name__ == "__main__":
712
- if client is None:
713
- logging.critical("Gemini client could not be initialized. Application cannot start.")
714
- else:
715
- run_streamlit_app()
 
 
 
1
  import os
2
+ import sys
3
  import asyncio
 
 
 
 
 
 
4
  import logging
5
+ import datetime
6
+ import argparse
7
+ import numpy as np
8
  import cv2
9
+ from queue import Queue
10
+ import time
11
+ import google as genai
 
 
 
 
 
12
  from google.genai.types import Content, Part
13
+ from azure.cognitiveservices.speech import SpeechConfig, SpeechSynthesizer, AudioConfig, ResultReason, CancellationReason
14
+ import sounddevice as sd
15
+ import soundfile as sf
16
+ import uuid
17
 
18
+ # Configure logging
19
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s:%(name)s:%(message)s')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
+ # Define system prompt for the medical assistant
22
+ MEDICAL_ASSISTANT_SYSTEM_PROMPT = '''You are an AI Medical Assistant. Your primary function is to analyze visual information from the user's camera or screen and respond via voice.
 
 
 
 
23
  Your responsibilities are:
24
  1. **Visual Observation and Description:** Carefully examine the images or video feed. Describe relevant details you observe.
25
  2. **General Information (Non-Diagnostic):** Provide general information related to what is visually presented, if applicable. You are not a diagnostic tool.
 
31
  4. **Tone:** Maintain a helpful, empathetic, and calm tone.
32
  5. **Interaction:** After this initial instruction, you can make a brief acknowledgment of your role (e.g., "I'm ready to assist by looking at what you show me. Please remember to consult a doctor for medical advice."). Then, focus on responding to the user's visual input and questions.
33
  Example of a disclaimer you might use: "As an AI assistant, I can describe what I see, but I can't provide medical advice or diagnoses. For any health concerns, it's always best to speak with a doctor or other healthcare professional."
34
+ '''
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
+ # Class to handle Gemini-Azure interaction
37
class GeminiInteractionLoop:
    """Drives the assistant's main loop.

    Responsibilities:
      * capture frames from the local camera and stream them to a Gemini session;
      * accept typed user input from stdin and forward it to Gemini;
      * read Gemini's text responses and speak them via Azure TTS + sounddevice.

    Fixes vs. the original revision: every blocking call (stdin ``input()``,
    camera ``read()``, Azure ``speak_text_async(...).get()``, ``sd.wait()``)
    is now dispatched with ``asyncio.to_thread`` so the event loop is never
    stalled while other coroutines (e.g. response streaming) need to run.
    """

    def __init__(self, gemini_api_key, azure_speech_key, azure_speech_region, use_camera=True, use_speech=True):
        """Store credentials and eagerly initialize camera / speech resources.

        Camera or speech failures are logged and downgrade the corresponding
        feature flag instead of raising, so the loop can still run degraded.
        """
        self.gemini_api_key = gemini_api_key
        self.azure_speech_key = azure_speech_key
        self.azure_speech_region = azure_speech_region
        self.use_camera = use_camera
        self.use_speech = use_speech

        # Initialize Gemini API (requires `import google.generativeai as genai`).
        genai.configure(api_key=self.gemini_api_key)
        self.model = genai.GenerativeModel('gemini-pro-vision')
        self.gemini_session = None  # created later in run_main_loop()

        # Initialize camera; on any failure fall back to camera-less mode.
        self.camera = None
        if self.use_camera:
            try:
                self.camera = cv2.VideoCapture(0)
                if not self.camera.isOpened():
                    logging.error("Failed to open camera device")
                    self.use_camera = False
            except Exception as e:
                logging.error(f"Error initializing camera: {e}")
                self.use_camera = False

        # Initialize Azure Speech Service; on failure disable speech output.
        if self.use_speech:
            try:
                self.speech_config = SpeechConfig(subscription=self.azure_speech_key, region=self.azure_speech_region)
                self.speech_config.speech_synthesis_voice_name = "en-US-JennyNeural"
                self.output_path = os.path.join(os.getcwd(), "temp_audio")
                os.makedirs(self.output_path, exist_ok=True)
            except Exception as e:
                logging.error(f"Error initializing Azure Speech Service: {e}")
                self.use_speech = False

        # Thread-safe queue bridging the response reader to the TTS worker.
        self.text_to_speech_queue = Queue()
        self.is_running = True

    def capture_image(self):
        """Grab one frame from the camera; return it (BGR ndarray) or None."""
        if not self.use_camera or self.camera is None:
            return None

        ret, frame = self.camera.read()
        if not ret:
            logging.error("Failed to capture image from camera")
            return None

        return frame

    async def stream_media_to_gemini(self):
        """Send a JPEG-encoded camera frame to Gemini every few seconds."""
        logging.info("Starting media stream to Gemini...")
        try:
            interval = 5  # seconds between frames
            last_capture_time = 0

            while self.is_running:
                current_time = time.time()
                if current_time - last_capture_time >= interval:
                    # cv2 camera reads block — keep them off the event loop.
                    frame = await asyncio.to_thread(self.capture_image)
                    if frame is not None:
                        _, encoded_image = cv2.imencode(".jpg", frame)
                        image_bytes = encoded_image.tobytes()

                        try:
                            # Convert to the format expected by Gemini.
                            image_part = Part.from_data(mime_type="image/jpeg", data=image_bytes)
                            content = Content(role="user", parts=[image_part])

                            # NOTE(review): assigning `.content` and then calling
                            # send_client_content() with no args mirrors the
                            # original code; confirm against the actual session
                            # API (google.generativeai vs google.genai are mixed
                            # in this file).
                            self.gemini_session.content = content
                            await self.gemini_session.send_client_content()

                            logging.info("Sent image to Gemini")
                        except Exception as e:
                            logging.error(f"Error sending image to Gemini: {e}")

                    last_capture_time = current_time

                await asyncio.sleep(1)
        except Exception as e:
            logging.error(f"Exception in stream_media_to_gemini: {e}")

    async def send_text_input_to_gemini(self, text):
        """Forward one line of user text to the Gemini session (no-op if empty
        or the session is not yet established)."""
        if not text or not self.gemini_session:
            return

        try:
            # Create content with text.
            text_part = Part.from_text(text)
            content = Content(role="user", parts=[text_part])

            # Send to Gemini (see NOTE(review) in stream_media_to_gemini about
            # this send pattern).
            self.gemini_session.content = content
            await self.gemini_session.send_client_content()

            logging.info(f"Sent text to Gemini: {text}")
        except Exception as e:
            logging.error(f"Error sending text to Gemini: {e}")

    async def process_text_input(self):
        """Read user text from stdin until 'exit'; relay each line to Gemini."""
        logging.info("Starting text input processing...")
        try:
            while self.is_running:
                # input() blocks — run it in a worker thread so the event loop
                # (frame streaming, response handling) keeps running meanwhile.
                user_input = await asyncio.to_thread(input, "Enter text (or 'exit' to quit): ")
                if user_input.lower() == 'exit':
                    self.is_running = False
                    break

                await self.send_text_input_to_gemini(user_input)
        except Exception as e:
            logging.error(f"Exception in process_text_input: {e}")
            self.is_running = False

    async def process_gemini_responses(self):
        """Consume streamed Gemini responses; enqueue any text for TTS."""
        logging.info("Starting Gemini response processing...")
        try:
            async for response in self.gemini_session:
                if not self.is_running:
                    break

                try:
                    # Only text parts are handled; other modalities are ignored.
                    if hasattr(response, 'text'):
                        text = response.text
                        if text:
                            logging.info(f"Gemini response: {text}")
                            if self.use_speech:
                                self.text_to_speech_queue.put(text)
                except Exception as e:
                    logging.error(f"Error processing Gemini response: {e}")
        except Exception as e:
            logging.error(f"Exception in process_gemini_responses: {e}")
            self.is_running = False

    async def text_to_speech_processor(self):
        """Drain the TTS queue, speaking each pending text in arrival order.

        Keeps running after shutdown is signalled until the queue is empty so
        queued responses are not dropped.
        """
        logging.info("Starting text-to-speech processor...")
        if not self.use_speech:
            return

        try:
            while self.is_running or not self.text_to_speech_queue.empty():
                if not self.text_to_speech_queue.empty():
                    # Single consumer, so empty()-then-get() cannot race here.
                    text = self.text_to_speech_queue.get()
                    await self._synthesize_speech(text)
                else:
                    await asyncio.sleep(0.5)
        except Exception as e:
            logging.error(f"Exception in text_to_speech_processor: {e}")

    async def _synthesize_speech(self, text):
        """Synthesize *text* to a temp WAV via Azure, then play it."""
        if not self.use_speech:
            return

        try:
            # Unique filename so concurrent/overlapping clips never collide.
            file_path = os.path.join(self.output_path, f"speech_{uuid.uuid4()}.wav")

            audio_config = AudioConfig(filename=file_path)
            synthesizer = SpeechSynthesizer(speech_config=self.speech_config, audio_config=audio_config)

            # speak_text_async(...).get() blocks until synthesis completes;
            # run it in a worker thread to keep the event loop responsive.
            result = await asyncio.to_thread(lambda: synthesizer.speak_text_async(text).get())

            if result.reason == ResultReason.SynthesizingAudioCompleted:
                logging.info(f"Speech synthesized and saved to {file_path}")
                await self._play_audio(file_path)
            elif result.reason == ResultReason.Canceled:
                cancellation = result.cancellation_details
                logging.error(f"Speech synthesis canceled: {cancellation.reason}")
                if cancellation.reason == CancellationReason.Error:
                    logging.error(f"Error details: {cancellation.error_details}")
        except Exception as e:
            logging.error(f"Error in speech synthesis: {e}")

    async def _play_audio(self, file_path):
        """Play a WAV file to completion, then delete it (best-effort)."""
        try:
            def _blocking_play():
                data, fs = sf.read(file_path)
                sd.play(data, fs)
                sd.wait()  # returns only when playback finishes

            # Playback blocks for the clip's full duration — off the loop.
            await asyncio.to_thread(_blocking_play)

            # Clean up the temp file; failure here is non-fatal.
            try:
                os.remove(file_path)
            except Exception as e:
                logging.warning(f"Failed to remove temp audio file {file_path}: {e}")
        except Exception as e:
            logging.error(f"Error playing audio: {e}")

    async def run_main_loop(self):
        """Open the Gemini session, send the system prompt, run all workers.

        Always releases the camera and closes the session on exit, whether
        shutdown was clean, cancelled, or caused by an exception.
        """
        # Hoisted so the cancellation sweep below can never hit an unbound name.
        tasks = []
        try:
            logging.info("Initializing Gemini session...")
            # NOTE(review): start_session_async() is not part of the public
            # google.generativeai GenerativeModel API (start_chat is) —
            # preserved from the original; verify against the SDK in use.
            self.gemini_session = await self.model.start_session_async()

            # Send the system prompt before starting any worker task.
            try:
                logging.info("Sending system prompt to Gemini...")
                system_content = Content(
                    role="user",
                    parts=[Part(text=MEDICAL_ASSISTANT_SYSTEM_PROMPT)]
                )
                self.gemini_session.content = system_content
                await self.gemini_session.send_client_content()
                logging.info("System prompt sent successfully.")
            except Exception as e:
                logging.error(f"Failed to send system prompt: {e}", exc_info=True)
                self.is_running = False
                return

            try:
                logging.info("Creating async tasks for Gemini interaction...")
                tasks.append(asyncio.create_task(self.stream_media_to_gemini(), name="stream_media_to_gemini"))
                tasks.append(asyncio.create_task(self.process_text_input(), name="process_text_input"))
                tasks.append(asyncio.create_task(self.process_gemini_responses(), name="process_gemini_responses"))
                if self.use_speech:
                    tasks.append(asyncio.create_task(self.text_to_speech_processor(), name="text_to_speech_processor"))

                await asyncio.gather(*tasks)
            except asyncio.CancelledError:
                logging.info("Main loop tasks cancelled")
            except Exception as e:
                logging.error(f"Exception in main loop tasks: {e}")
            finally:
                # Cancel any still-running workers and wait for them to stop.
                for task in tasks:
                    if not task.done():
                        task.cancel()
                        try:
                            await task
                        except asyncio.CancelledError:
                            logging.info(f"Task {task.get_name()} cancelled")
        except Exception as e:
            logging.error(f"Exception in run_main_loop: {e}")
        finally:
            # Cleanup runs on every exit path.
            logging.info("Cleaning up resources...")
            if self.camera is not None and self.use_camera:
                self.camera.release()

            if self.gemini_session is not None:
                await self.gemini_session.close()

    def cleanup(self):
        """Synchronous fallback cleanup for callers outside the event loop."""
        logging.info("Cleaning up resources...")
        if self.camera is not None and self.use_camera:
            self.camera.release()
311
+
312
+ # Main function
313
def main():
    """CLI entry point: parse arguments, validate credentials, run the loop.

    Returns:
        int: 0 on clean shutdown, 1 on missing credentials or an
        unhandled runtime failure.
    """
    parser = argparse.ArgumentParser(description="Medical Assistant using Gemini and Azure Speech")
    parser.add_argument("--gemini-api-key", help="Gemini API Key", default=os.environ.get("GEMINI_API_KEY"))
    parser.add_argument("--azure-speech-key", help="Azure Speech API Key", default=os.environ.get("AZURE_SPEECH_KEY"))
    parser.add_argument("--azure-speech-region", help="Azure Speech Region", default=os.environ.get("AZURE_SPEECH_REGION", "eastus"))
    parser.add_argument("--no-camera", help="Disable camera usage", action="store_true")
    parser.add_argument("--no-speech", help="Disable speech synthesis", action="store_true")
    args = parser.parse_args()

    # Guard clauses: each cloud service needs a key unless explicitly disabled.
    if not args.gemini_api_key:
        print("Error: Gemini API Key is required. Provide it via --gemini-api-key or GEMINI_API_KEY environment variable.")
        return 1
    if not args.azure_speech_key and not args.no_speech:
        print("Error: Azure Speech Key is required for speech synthesis. Provide it via --azure-speech-key or AZURE_SPEECH_KEY environment variable, or use --no-speech to disable speech.")
        return 1

    try:
        loop_runner = GeminiInteractionLoop(
            gemini_api_key=args.gemini_api_key,
            azure_speech_key=args.azure_speech_key,
            azure_speech_region=args.azure_speech_region,
            use_camera=not args.no_camera,
            use_speech=not args.no_speech,
        )
        # Drive the whole interaction loop to completion.
        asyncio.run(loop_runner.run_main_loop())
    except KeyboardInterrupt:
        logging.info("Keyboard interrupt received. Shutting down...")
    except Exception as e:
        logging.error(f"Unhandled exception: {e}", exc_info=True)
        return 1

    return 0
 
 
 
351
 
352
# Script entry point: propagate main()'s status code as the process exit code.
if __name__ == "__main__":
    sys.exit(main())