# qa.py
"""Follow-up Q&A support: buffer microphone audio from streamlit-webrtc
and turn a user question into an LLM answer plus TTS audio."""

import os
import json
import tempfile
import wave

import av
import numpy as np
import pydub
import streamlit as st
from streamlit_webrtc import (
    AudioProcessorBase,
    RTCConfiguration,
    WebRtcMode,
    webrtc_streamer,
)

from utils import generate_audio_mp3, call_groq_api_for_qa

# STUN server config so streamlit-webrtc can negotiate a peer connection
# for streaming audio from the browser microphone.
RTC_CONFIGURATION = RTCConfiguration(
    {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
)


class AudioBufferProcessor(AudioProcessorBase):
    """
    A custom audio processor that accumulates raw audio frames in memory.
    When the user stops, we can finalize them into a single WAV for STT.
    """

    def __init__(self) -> None:
        # One mono pydub AudioSegment per received frame, in arrival order.
        self.frames: list = []

    def recv_audio(self, frame: av.AudioFrame) -> av.AudioFrame:
        """Convert an incoming frame to a mono int16 segment and buffer it.

        The frame is returned unchanged so playback/monitoring still works.
        """
        pcm = frame.to_ndarray()  # shape is (channels, samples)
        if pcm.ndim == 2 and pcm.shape[0] > 1:
            # If stereo, just take the first channel for STT.
            pcm = pcm[0, :]
        segment = pydub.AudioSegment(
            data=pcm.astype(np.int16).tobytes(),
            sample_width=2,  # int16
            frame_rate=frame.sample_rate,
            channels=1,
        )
        self.frames.append(segment)
        return frame

    def finalize_wav(self) -> str:
        """
        Once the user stops recording, combine frames into a single WAV file.
        Returns path to the wav file, or "" if nothing was recorded.
        The caller is responsible for deleting the temp file.
        """
        if not self.frames:
            return ""
        # pydub defines __radd__ specifically so sum() works on segments.
        combined = sum(self.frames)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_wav:
            combined.export(tmp_wav.name, format="wav")
        return tmp_wav.name


def handle_qa_exchange(
    conversation_so_far: str, user_question: str
) -> "tuple[bytes | None, str]":
    """Answer a follow-up question in John's voice.

    1) Build system prompt from conversation_so_far + user_question
    2) Call the LLM to get short JSON
    3) TTS the answer
    4) Return (audio_bytes, answer_text); (None, "") when the LLM gives
       no usable answer or returns malformed JSON.
    """
    system_prompt = f"""
You are John, the guest speaker. The user is asking a follow-up question.

Conversation so far:
{conversation_so_far}

New user question:
{user_question}

Please respond in JSON with keys "speaker" and "text", e.g.:
{{
  "speaker": "John",
  "text": "Sure, here's my answer..."
}}
"""
    raw_json_response = call_groq_api_for_qa(system_prompt)
    try:
        response_dict = json.loads(raw_json_response)
    except json.JSONDecodeError:
        # LLM did not return valid JSON; treat as "no answer" rather than
        # crashing the Streamlit callback.
        return (None, "")

    answer_text = response_dict.get("text", "")
    speaker = response_dict.get("speaker", "John")

    if not answer_text.strip():
        return (None, "")

    # TTS — voice the answer with the speaker the LLM reported
    # (defaults to "John", matching the prompt's persona).
    audio_file_path = generate_audio_mp3(answer_text, speaker)
    with open(audio_file_path, "rb") as f:
        audio_bytes = f.read()

    return (audio_bytes, answer_text)