noumanjavaid committed
Commit a46c042 (verified)
Parent: 09684b3

Update src/streamlit_app.py

Files changed (1)
  1. src/streamlit_app.py +28 -44
src/streamlit_app.py CHANGED
@@ -4,9 +4,9 @@ import os
 import asyncio
 import base64
 import io
-import atexit
 import threading
 import traceback
+import atexit # Correctly imported
 import time
 import logging
 from dotenv import load_dotenv
@@ -24,8 +24,9 @@ from streamlit_webrtc import (
     WebRtcMode,
     AudioProcessorBase,
     VideoProcessorBase,
+    # ClientSettings # Removed as it's not used in this version
 )
-from aiortc import RTCIceServer, RTCConfiguration # For STUN server configuration
+# from aiortc import RTCIceServer, RTCConfiguration # RTCConfiguration object not needed directly for webrtc_streamer
 
 # --- Configuration ---
 load_dotenv()
@@ -42,8 +43,8 @@ AUDIO_PLAYBACK_QUEUE_MAXSIZE = 50
 MEDIA_TO_GEMINI_QUEUE_MAXSIZE = 30
 
 # Video configuration
-VIDEO_FPS_TO_GEMINI = 2 # Target FPS to send to Gemini (increased slightly)
-VIDEO_API_RESIZE = (1024, 1024) # Max size for images sent to API
+VIDEO_FPS_TO_GEMINI = 2
+VIDEO_API_RESIZE = (1024, 1024)
 
 MODEL_NAME = "models/gemini-2.0-flash-live-001"
 
@@ -67,7 +68,7 @@ Example of a disclaimer you might use: "As an AI assistant, I can describe what
 pya = pyaudio.PyAudio()
 def cleanup_pyaudio():
     logging.info("Terminating PyAudio instance.")
-    if pya: # Check if pya is not None
+    if pya:
         pya.terminate()
 atexit.register(cleanup_pyaudio)
 
@@ -83,10 +84,9 @@ if GEMINI_API_KEY:
     try:
         client = genai.Client(http_options={"api_version": "v1beta"}, api_key=GEMINI_API_KEY)
     except Exception as e:
-        # This error will be shown in Streamlit UI if it happens at startup
         st.error(f"Failed to initialize Gemini client: {e}")
         logging.critical(f"Gemini client initialization failed: {e}", exc_info=True)
-        st.stop() # Stop Streamlit app if client fails
+        st.stop()
 else:
     st.error("GEMINI_API_KEY not found in environment variables. Please set it for the application to run.")
     logging.critical("GEMINI_API_KEY not found.")
@@ -166,7 +166,6 @@ class GeminiInteractionLoop:
                 logging.info(f"Gemini text response: {text_response[:100]}")
                 if 'chat_messages' not in st.session_state: st.session_state.chat_messages = []
                 st.session_state.chat_messages = st.session_state.chat_messages + [{"role": "assistant", "content": text_response}]
-                # Consider st.experimental_rerun() if a mechanism exists to call it from main thread
         except types.generation_types.StopCandidateException: logging.info("Gemini response stream ended normally.")
         except Exception as e:
             if self.is_running: logging.error(f"Error receiving from Gemini: {e}", exc_info=True)
@@ -184,7 +183,7 @@ class GeminiInteractionLoop:
         while self.is_running:
             try:
                 audio_chunk = await asyncio.wait_for(audio_from_gemini_playback_q.get(), timeout=1.0)
-                if audio_chunk: # Not None (sentinel)
+                if audio_chunk:
                     await asyncio.to_thread(self.playback_stream.write, audio_chunk)
                 if audio_chunk: audio_from_gemini_playback_q.task_done()
             except asyncio.TimeoutError: continue
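
The playback loop above, together with signal_stop() in the next hunk, relies on a simple convention: a None item pushed into each asyncio.Queue acts as a stop sentinel, and the one-second timeout keeps the consumer responsive while it waits. A minimal, self-contained sketch of that pattern (generic names, not the app's classes):

import asyncio

async def consumer(q: asyncio.Queue) -> None:
    # Drain the queue until a None sentinel arrives; the timeout keeps the loop
    # responsive even if the producer stops without sending a sentinel.
    while True:
        try:
            item = await asyncio.wait_for(q.get(), timeout=1.0)
        except asyncio.TimeoutError:
            continue
        if item is None:  # sentinel: stop consuming
            q.task_done()
            break
        print(f"processing {item!r}")
        q.task_done()

async def main() -> None:
    q = asyncio.Queue(maxsize=10)
    task = asyncio.create_task(consumer(q))
    for chunk in (b"a", b"b", b"c"):
        await q.put(chunk)
    q.put_nowait(None)  # same sentinel that signal_stop() enqueues
    await task

asyncio.run(main())
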
@@ -204,17 +203,15 @@ class GeminiInteractionLoop:
         logging.info("Signal to stop GeminiInteractionLoop received.")
         self.is_running = False
         for q in [video_frames_to_gemini_q, audio_chunks_to_gemini_q, audio_from_gemini_playback_q]:
-            try: q.put_nowait(None) # Sentinel to unblock .get()
+            try: q.put_nowait(None)
            except asyncio.QueueFull: logging.warning(f"Queue was full when trying to put sentinel for stop signal.")
            except Exception as e: logging.error(f"Error putting sentinel in queue: {e}", exc_info=True)
 
-
     async def run_main_loop(self):
         self.async_event_loop = asyncio.get_running_loop()
         self.is_running = True
         logging.info("GeminiInteractionLoop run_main_loop starting...")
         if client is None:
-
             logging.critical("Gemini client is None in run_main_loop. Aborting.")
             return
 
@@ -255,7 +252,6 @@ class VideoProcessor(VideoProcessorBase):
     def __init__(self):
         self.frame_counter = 0
         self.last_gemini_send_time = time.monotonic()
-        # No need to get loop here if create_task is used on the default loop
 
     async def _process_and_queue_frame_async(self, frame_ndarray):
         self.frame_counter += 1
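
The hunk above keeps frame_counter and last_gemini_send_time, and the configuration hunk sets VIDEO_FPS_TO_GEMINI and VIDEO_API_RESIZE, but the body of _process_and_queue_frame_async is not shown in this diff. A hedged sketch of how those values are typically combined to throttle and downscale frames before queueing (the helper name, JPEG encoding, and raw-bytes payload are assumptions, not taken from the file):

import io
import time
import numpy as np
from PIL import Image  # Pillow; assumed available for converting frames for the API

VIDEO_FPS_TO_GEMINI = 2            # values from the configuration hunk
VIDEO_API_RESIZE = (1024, 1024)

def maybe_prepare_frame(frame_bgr: np.ndarray, last_send_time: float):
    """Return (api_data, new_last_send_time); api_data is None when the frame is throttled."""
    now = time.monotonic()
    if now - last_send_time < 1.0 / VIDEO_FPS_TO_GEMINI:
        return None, last_send_time                            # skip to stay under the target FPS
    img = Image.fromarray(np.ascontiguousarray(frame_bgr[:, :, ::-1]))  # BGR from WebRTC -> RGB
    img.thumbnail(VIDEO_API_RESIZE)                             # cap the size sent to the API
    buf = io.BytesIO()
    img.save(buf, format="JPEG")
    return {"mime_type": "image/jpeg", "data": buf.getvalue()}, now

# Example with a dummy frame:
frame = np.zeros((480, 640, 3), dtype=np.uint8)
api_data, t = maybe_prepare_frame(frame, last_send_time=0.0)
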
@@ -279,28 +275,15 @@ class VideoProcessor(VideoProcessorBase):
                 video_frames_to_gemini_q.put_nowait(api_data)
         except Exception as e: logging.error(f"Error processing/queueing video frame: {e}", exc_info=True)
 
-    async def recv(self, frame): # Called by streamlit-webrtc
+    async def recv(self, frame):
         img_bgr = frame.to_ndarray(format="bgr24")
         asyncio.create_task(self._process_and_queue_frame_async(img_bgr))
-        return frame # Return original frame for WebRTC to display
+        return frame
 
 class AudioProcessor(AudioProcessorBase):
     async def _process_and_queue_audio_async(self, audio_frames):
-        for frame in audio_frames: # frame is an AudioFrame from aiortc
-            # frame.planes[0].to_bytes() is the raw audio data
-            # frame.sample_rate, frame.layout.channels
-            # logging.info(f"Audio frame: {len(frame.planes[0].to_bytes())} bytes, SR={frame.sample_rate}, C={frame.layout.channels}")
-
-            # CRITICAL NOTE: This sends audio as received from WebRTC.
-            # If Gemini requires a specific sample rate (e.g., 16000 Hz) and WebRTC provides
-            # a different one (e.g., 48000 Hz), audio recognition may be poor.
-            # Proper solution: Implement resampling here. This is omitted for brevity.
+        for frame in audio_frames:
             audio_data = frame.planes[0].to_bytes()
-            # Mime type should reflect the actual data being sent.
-            # Example: "audio/L16;rate=48000;channels=1" if that's what WebRTC provides.
-            # Gemini documentation should specify what it accepts for PCM.
-            # Assuming "audio/pcm" is generic enough, or be more specific.
-            # Forcing L16 (16-bit linear PCM) as that's common.
             mime_type = f"audio/L16;rate={frame.sample_rate};channels={frame.layout.channels}"
             api_data = {"data": audio_data, "mime_type": mime_type}
 
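
The comments removed in this hunk warned that audio is forwarded at whatever sample rate WebRTC delivers (commonly 48000 Hz) even though Gemini may expect a lower rate such as 16000 Hz, and that resampling was omitted. A minimal sketch of one way to close that gap before building the L16 mime type (naive linear interpolation, mono only; a real implementation would use a proper resampler and follow whatever rate the Gemini docs require):

import numpy as np

def resample_pcm16_mono(data: bytes, src_rate: int, dst_rate: int = 16000) -> bytes:
    """Sketch: linearly resample 16-bit mono PCM from src_rate to dst_rate."""
    if src_rate == dst_rate or not data:
        return data
    samples = np.frombuffer(data, dtype=np.int16).astype(np.float32)
    dst_len = max(1, int(len(samples) * dst_rate / src_rate))
    positions = np.linspace(0, len(samples) - 1, num=dst_len)
    resampled = np.interp(positions, np.arange(len(samples)), samples)
    return resampled.astype(np.int16).tobytes()

# Hypothetical use inside _process_and_queue_audio_async:
# audio_data = resample_pcm16_mono(frame.planes[0].to_bytes(), frame.sample_rate)
# mime_type = "audio/L16;rate=16000;channels=1"
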
@@ -311,7 +294,7 @@ class AudioProcessor(AudioProcessorBase):
                 audio_chunks_to_gemini_q.put_nowait(api_data)
             except Exception as e: logging.error(f"Error queueing audio chunk: {e}", exc_info=True)
 
-    async def recv(self, frames): # Called by streamlit-webrtc
+    async def recv(self, frames):
         asyncio.create_task(self._process_and_queue_audio_async(frames))
         return frames
 
@@ -321,7 +304,7 @@ def initialize_app_session_state():
         'gemini_session_active': False,
         'gemini_loop_instance': None,
         'chat_messages': [],
-        'webrtc_component_key': f"webrtc_streamer_key_{int(time.time())}", # Initial dynamic key
+        'webrtc_component_key': f"webrtc_streamer_key_{int(time.time())}",
     }
     for key, value in defaults.items():
         if key not in st.session_state:
@@ -329,7 +312,7 @@
 
 def run_streamlit_app():
     st.set_page_config(page_title="Live AI Medical Assistant (HF Spaces)", layout="wide")
-    initialize_app_session_state() # Ensure state is initialized
+    initialize_app_session_state()
 
     st.title("Live AI Medical Assistant")
     st.markdown("Utilizing Gemini Live API via WebRTC on Hugging Face Spaces")
@@ -350,9 +333,9 @@ def run_streamlit_app():
                 st.session_state.gemini_loop_instance = gemini_loop
                 threading.Thread(target=lambda: asyncio.run(gemini_loop.run_main_loop()), name="GeminiLoopThread", daemon=True).start()
                 st.success("Gemini session starting... WebRTC will attempt to connect.")
-                st.session_state.webrtc_component_key = f"webrtc_streamer_key_{int(time.time())}" # Force re-render of WebRTC
+                st.session_state.webrtc_component_key = f"webrtc_streamer_key_{int(time.time())}"
                 st.rerun()
-        else: # Session is active
+        else:
             if st.button("🛑 Stop Session", type="secondary", use_container_width=True, key="stop_session_btn"):
                 if st.session_state.gemini_loop_instance:
                     st.session_state.gemini_loop_instance.signal_stop()
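
The line kept above regenerates webrtc_component_key each time a session starts. Because Streamlit identifies a component instance by its key, a fresh key forces webrtc_streamer to be torn down and remounted on the next rerun, so the browser renegotiates the media stream. The same trick with a generic widget as a stand-in (sketch only; note that time-based keys can collide within the same second):

import time
import streamlit as st

if "feed_key" not in st.session_state:
    st.session_state.feed_key = f"feed_{int(time.time())}"

if st.button("Restart feed"):
    # A new key makes Streamlit discard the old component instance and mount a fresh one.
    st.session_state.feed_key = f"feed_{int(time.time())}"
    st.rerun()

st.text_input("stand-in for the WebRTC component", key=st.session_state.feed_key)
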
@@ -364,39 +347,40 @@ def run_streamlit_app():
 
     if st.session_state.gemini_session_active:
         st.subheader("Your Live Feed (from your browser)")
-        RTC_CONFIGURATION = RTCConfiguration({"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]})
+
         MEDIA_STREAM_CONSTRAINTS = {
-            "video": True, # Or specific constraints like {"width": 640, "height": 480}
-            "audio": { # Request specific audio format
+            "video": True,
+            "audio": {
                 "sampleRate": {"ideal": WEBRTC_REQUESTED_SEND_SAMPLE_RATE},
                 "channelCount": {"exact": WEBRTC_REQUESTED_AUDIO_CHANNELS},
-                "echoCancellation": True, # Recommended for voice
-                "noiseSuppression": True # Recommended for voice
+                "echoCancellation": True,
+                "noiseSuppression": True
             }
         }
 
         webrtc_ctx = webrtc_streamer(
             key=st.session_state.webrtc_component_key,
             mode=WebRtcMode.SENDONLY,
-            rtc_configuration=RTC_CONFIGURATION,
+            rtc_configuration={ # MODIFIED HERE: Pass dictionary directly
+                "iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]
+            },
             media_stream_constraints=MEDIA_STREAM_CONSTRAINTS,
             video_processor_factory=VideoProcessor,
             audio_processor_factory=AudioProcessor,
             async_processing=True,
-            # desired_playing_state=st.session_state.gemini_session_active # Let it be controlled by rendering
         )
 
         if webrtc_ctx.state.playing:
             st.caption("WebRTC connected. Streaming your camera and microphone.")
         elif st.session_state.gemini_session_active:
             st.caption("WebRTC attempting to connect. Ensure camera/microphone permissions are granted in your browser.")
-        if webrtc_ctx.state.error:
+        if hasattr(webrtc_ctx.state, 'error') and webrtc_ctx.state.error: # Check if error attribute exists
             st.error(f"WebRTC Connection Error: {webrtc_ctx.state.error}")
     else:
         st.info("Click 'Start Session' in the sidebar to enable the live feed and assistant.")
 
     st.subheader("Chat with Assistant")
-    chat_placeholder = st.container() # Use a container for chat messages
+    chat_placeholder = st.container()
     with chat_placeholder:
         for msg in st.session_state.get('chat_messages', []):
             with st.chat_message(msg["role"]):
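
The hunk above is also where the commit drops the aiortc RTCConfiguration object in favour of a plain dict: streamlit-webrtc accepts the dictionary form for rtc_configuration and builds the underlying configuration itself, which is why the aiortc import could be commented out earlier in the diff. A stripped-down sketch of the call (the commented TURN entry is a hypothetical placeholder; only the Google STUN server appears in the commit):

import streamlit as st
from streamlit_webrtc import WebRtcMode, webrtc_streamer

ctx = webrtc_streamer(
    key="demo_streamer",
    mode=WebRtcMode.SENDONLY,
    rtc_configuration={
        "iceServers": [
            {"urls": ["stun:stun.l.google.com:19302"]},
            # {"urls": ["turn:turn.example.com:3478"], "username": "user", "credential": "pass"},
        ]
    },
    media_stream_constraints={"video": True, "audio": True},
)
if ctx.state.playing:
    st.caption("WebRTC connected.")
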
@@ -427,7 +411,7 @@ def run_streamlit_app():
         st.rerun()
 
 if __name__ == "__main__":
-    if client is None: # Final check before running
+    if client is None:
         logging.critical("Gemini client could not be initialized. Application cannot start.")
     else:
         run_streamlit_app()
 