# qa.py
import json
import os
import tempfile
from typing import Optional, Tuple

import av
import numpy as np
import pydub
import streamlit as st
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration, AudioProcessorBase

from utils import generate_audio_mp3, call_groq_api_for_qa
# WebRTC needs an ICE/STUN configuration so the browser can stream mic audio
# to the server even from behind NAT.
RTC_CONFIGURATION = RTCConfiguration(
    {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
)
class AudioBufferProcessor(AudioProcessorBase):
    """
    A custom audio processor that accumulates raw audio frames in memory.
    When the user stops recording, they can be finalized into a single WAV for STT.
    """

    def __init__(self) -> None:
        self.frames = []  # buffered pydub.AudioSegment chunks
    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        # streamlit_webrtc calls AudioProcessorBase.recv() for each incoming frame.
        pcm = frame.to_ndarray()
        # For planar audio the shape is (channels, samples); keep only the
        # first channel for STT.
        if pcm.ndim == 2 and pcm.shape[0] > 1:
            pcm = pcm[0, :]
        sample_rate = frame.sample_rate
        # Assumes frames arrive as 16-bit PCM; other sample formats would need
        # rescaling before this cast.
        samples = pcm.astype(np.int16).tobytes()
        segment = pydub.AudioSegment(
            data=samples,
            sample_width=2,  # int16
            frame_rate=sample_rate,
            channels=1,
        )
        self.frames.append(segment)
        return frame
    def finalize_wav(self) -> str:
        """
        Once the user stops recording, combine frames into a single WAV file.
        Returns the path to the WAV file, or "" if nothing was recorded.
        """
        if not self.frames:
            return ""
        combined = sum(self.frames)  # AudioSegment.__radd__ lets sum() concatenate segments
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_wav:
            combined.export(tmp_wav.name, format="wav")
        return tmp_wav.name
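
# --- Usage sketch (illustrative, not part of the original app wiring) ---
# Shows how a Streamlit page could plug AudioBufferProcessor into
# webrtc_streamer and turn the buffered mic audio into a WAV path for STT.
# The "qa_mic" key and the button label are assumptions made for this example.
def _example_mic_capture() -> str:
    ctx = webrtc_streamer(
        key="qa_mic",
        mode=WebRtcMode.SENDONLY,
        rtc_configuration=RTC_CONFIGURATION,
        media_stream_constraints={"audio": True, "video": False},
        audio_processor_factory=AudioBufferProcessor,
    )
    wav_path = ""
    # ctx.audio_processor is the live AudioBufferProcessor instance (if any).
    if ctx.audio_processor and st.button("Stop & transcribe"):
        wav_path = ctx.audio_processor.finalize_wav()
    return wav_path
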
def handle_qa_exchange(conversation_so_far: str, user_question: str) -> Tuple[Optional[bytes], str]:
    """
    1) Build a system prompt from conversation_so_far + user_question
    2) Call the LLM to get a short JSON reply
    3) TTS the answer
    4) Return (audio_bytes, answer_text)
    """
    system_prompt = f"""
You are John, the guest speaker. The user is asking a follow-up question.
Conversation so far:
{conversation_so_far}
New user question:
{user_question}
Please respond in JSON with keys "speaker" and "text", e.g.:
{{ "speaker": "John", "text": "Sure, here's my answer..." }}
"""
    raw_json_response = call_groq_api_for_qa(system_prompt)
    try:
        response_dict = json.loads(raw_json_response)
    except json.JSONDecodeError:
        # The LLM occasionally returns non-JSON text; treat that as "no answer".
        return (None, "")

    answer_text = response_dict.get("text", "")
    speaker = response_dict.get("speaker", "John")  # parsed but unused: the TTS voice below stays "John"
    if not answer_text.strip():
        return (None, "")

    # TTS
    audio_file_path = generate_audio_mp3(answer_text, "John")
    with open(audio_file_path, "rb") as f:
        audio_bytes = f.read()
    return (audio_bytes, answer_text)
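
# --- Usage sketch (illustrative, not part of the original file) ---
# How a Streamlit page might call handle_qa_exchange() and play the spoken
# answer. The "conversation_so_far" session-state key and the widget labels
# are assumptions made for this example.
def _example_qa_ui() -> None:
    conversation_so_far = st.session_state.get("conversation_so_far", "")
    user_question = st.text_input("Ask John a follow-up question")
    if user_question and st.button("Ask"):
        audio_bytes, answer_text = handle_qa_exchange(conversation_so_far, user_question)
        if audio_bytes:
            st.audio(audio_bytes, format="audio/mp3")  # generate_audio_mp3 produces an MP3
            st.markdown(f"**John:** {answer_text}")
        else:
            st.warning("No answer was generated. Please try again.")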