# qa.py
import os
import json
import tempfile
import wave
from typing import Optional, Tuple

import av
import numpy as np
import pydub
import streamlit as st
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration, AudioProcessorBase

from utils import generate_audio_mp3, call_groq_api_for_qa

# For streaming from the mic, we need an RTC configuration with a STUN server.
RTC_CONFIGURATION = RTCConfiguration(
    {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
)


class AudioBufferProcessor(AudioProcessorBase):
    """
    A custom audio processor that accumulates raw audio frames in memory.
    When the user stops recording, the frames are finalized into a single
    WAV file for STT.
    """

    def __init__(self) -> None:
        self.frames = []

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        # streamlit_webrtc invokes recv() once per incoming audio frame.
        pcm = frame.to_ndarray()
        # For planar formats the shape is (channels, samples); we assume a
        # single channel, or keep only the first channel for STT.
        if pcm.ndim == 2 and pcm.shape[0] > 1:
            pcm = pcm[0, :]
        sample_rate = frame.sample_rate
        # Assumes 16-bit PCM input; astype() casts without rescaling.
        samples = pcm.astype(np.int16).tobytes()
        segment = pydub.AudioSegment(
            data=samples,
            sample_width=2,  # int16
            frame_rate=sample_rate,
            channels=1,
        )
        self.frames.append(segment)
        return frame

    def finalize_wav(self) -> str:
        """
        Once the user stops recording, combine the buffered frames into a
        single WAV file. Returns the path to that file, or an empty string
        if nothing was recorded.
        """
        if not self.frames:
            return ""
        # pydub's AudioSegment supports sum() for concatenating segments.
        combined = sum(self.frames)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_wav:
            combined.export(tmp_wav.name, format="wav")
        return tmp_wav.name
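

# A minimal sketch (not wired into the app here) of how AudioBufferProcessor
# is meant to plug into streamlit_webrtc. The widget key and button label are
# illustrative assumptions, not names from the real app.
def _example_recorder_ui() -> None:
    ctx = webrtc_streamer(
        key="qa-audio",  # illustrative key
        mode=WebRtcMode.SENDONLY,
        rtc_configuration=RTC_CONFIGURATION,
        media_stream_constraints={"audio": True, "video": False},
        audio_processor_factory=AudioBufferProcessor,
    )
    # Once the user stops talking, collapse the buffered frames to one WAV
    # that can be handed to the STT step.
    if ctx.audio_processor and st.button("Stop & transcribe"):
        wav_path = ctx.audio_processor.finalize_wav()
        if wav_path:
            st.write(f"Recorded audio saved to {wav_path}")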


def handle_qa_exchange(conversation_so_far: str, user_question: str) -> Tuple[Optional[bytes], str]:
    """
    1) Build the system prompt from conversation_so_far + user_question
    2) Call the LLM for a short JSON reply
    3) Run TTS on the answer
    4) Return (audio_bytes, answer_text), or (None, "") if there is no usable answer
    """
    system_prompt = f"""
    You are John, the guest speaker. The user is asking a follow-up question.
    Conversation so far:
    {conversation_so_far}
    New user question:
    {user_question}
    Please respond in JSON with keys "speaker" and "text", e.g.:
    {{ "speaker": "John", "text": "Sure, here's my answer..." }}
    """

    raw_json_response = call_groq_api_for_qa(system_prompt)
    try:
        response_dict = json.loads(raw_json_response)
    except json.JSONDecodeError:
        # The model occasionally returns malformed JSON; treat it as no answer.
        return (None, "")

    answer_text = response_dict.get("text", "")
    speaker = response_dict.get("speaker", "John")
    if not answer_text.strip():
        return (None, "")

    # TTS the answer in the responding speaker's voice.
    audio_file_path = generate_audio_mp3(answer_text, speaker)
    with open(audio_file_path, "rb") as f:
        audio_bytes = f.read()
    return (audio_bytes, answer_text)
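

# A minimal sketch (not wired into the app here) of a single Q&A turn using
# handle_qa_exchange. The session-state key and widget labels are illustrative
# assumptions; the real page layout may differ. Assumes generate_audio_mp3
# produces MP3, as its name suggests.
def _example_qa_turn() -> None:
    conversation = st.session_state.get("conversation_so_far", "")
    question = st.text_input("Ask John a follow-up question")
    if question:
        audio_bytes, answer_text = handle_qa_exchange(conversation, question)
        if audio_bytes:
            st.audio(audio_bytes, format="audio/mp3")
            st.markdown(f"**John:** {answer_text}")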