# qa.py

import json
import tempfile
import streamlit as st
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration, AudioProcessorBase

from utils import generate_audio_mp3, call_groq_api_for_qa

import av
import pydub
import numpy as np

# Streaming from the mic uses WebRTC, which needs at least one STUN server
# so the browser and server can negotiate a peer connection.
RTC_CONFIGURATION = RTCConfiguration(
    {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
)
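
# Note (assumption, not from the original code): behind symmetric/restrictive
# NATs a TURN relay is usually required in addition to STUN, e.g.:
#   RTCConfiguration({"iceServers": [
#       {"urls": ["stun:stun.l.google.com:19302"]},
#       {"urls": ["turn:turn.example.com:3478"],      # hypothetical TURN host
#        "username": "user", "credential": "secret"},
#   ]})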

class AudioBufferProcessor(AudioProcessorBase):
    """
    A custom audio processor that accumulates raw audio frames in memory.
    When the user stops, we can finalize them into a single WAV for STT.
    """
    def __init__(self) -> None:
        self.frames: list[pydub.AudioSegment] = []

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        # streamlit-webrtc calls `recv` on AudioProcessorBase subclasses;
        # a method named `recv_audio` would never be invoked.
        pcm = frame.to_ndarray()
        n_channels = len(frame.layout.channels)
        # Packed formats arrive as shape (1, samples * channels) with the
        # channels interleaved; planar formats as (channels, samples).
        # Either way, keep only the first channel for STT.
        if pcm.ndim == 2:
            if pcm.shape[0] == 1 and n_channels > 1:
                pcm = pcm.reshape(-1, n_channels)[:, 0]
            else:
                pcm = pcm[0, :]

        segment = pydub.AudioSegment(
            data=pcm.astype(np.int16).tobytes(),
            sample_width=2,  # int16
            frame_rate=frame.sample_rate,
            channels=1,
        )
        self.frames.append(segment)
        return frame

    def finalize_wav(self) -> str:
        """
        Once the user stops recording, combine frames into a single WAV file.
        Returns path to the wav file.
        """
        if not self.frames:
            return ""
        combined = sum(self.frames)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_wav:
            combined.export(tmp_wav.name, format="wav")
            return tmp_wav.name
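

# Hedged usage sketch (assumed wiring, not part of the original module): one
# way to mount AudioBufferProcessor in a Streamlit page via webrtc_streamer.
# The widget key below is an assumed placeholder.
def render_mic_recorder() -> str:
    """
    Render a send-only mic widget; once the user stops streaming, flush the
    buffered frames to a temporary WAV file for STT. Returns the WAV path,
    or "" while still recording / if nothing was captured.
    """
    ctx = webrtc_streamer(
        key="qa-mic",  # assumed key; must be unique within the page
        mode=WebRtcMode.SENDONLY,
        rtc_configuration=RTC_CONFIGURATION,
        media_stream_constraints={"audio": True, "video": False},
        audio_processor_factory=AudioBufferProcessor,
    )
    if ctx.audio_processor and not ctx.state.playing:
        return ctx.audio_processor.finalize_wav()
    return ""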


def handle_qa_exchange(conversation_so_far: str, user_question: str) -> tuple[bytes | None, str]:
    """
    1) Build system prompt from conversation_so_far + user_question
    2) Call the LLM to get short JSON
    3) TTS the answer
    4) Return (audio_bytes, answer_text)
    """
    system_prompt = f"""
    You are John, the guest speaker. The user is asking a follow-up question.
    Conversation so far:
    {conversation_so_far}

    New user question:
    {user_question}

    Please respond in JSON with keys "speaker" and "text", e.g.:
    {{ "speaker": "John", "text": "Sure, here's my answer..." }}
    """

    raw_json_response = call_groq_api_for_qa(system_prompt)
    try:
        response_dict = json.loads(raw_json_response)
    except json.JSONDecodeError:
        # LLMs occasionally return malformed JSON; treat that as "no answer".
        return (None, "")
    answer_text = response_dict.get("text", "")
    speaker = response_dict.get("speaker", "John")

    if not answer_text.strip():
        return (None, "")

    # TTS: voice the answer with the speaker the model returned (defaults to "John")
    audio_file_path = generate_audio_mp3(answer_text, speaker)
    with open(audio_file_path, "rb") as f:
        audio_bytes = f.read()

    return (audio_bytes, answer_text)
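
# Hedged end-to-end sketch (assumed helper, not part of the original module):
# feed a transcribed question through handle_qa_exchange and play the spoken
# answer back in Streamlit. `user_question` would normally come from an STT
# pass over the WAV produced by AudioBufferProcessor.finalize_wav(); that
# transcription step is assumed to live elsewhere.
def render_answer(conversation_so_far: str, user_question: str) -> None:
    audio_bytes, answer_text = handle_qa_exchange(conversation_so_far, user_question)
    if audio_bytes:
        st.markdown(f"**John:** {answer_text}")
        st.audio(audio_bytes, format="audio/mp3")
    else:
        st.warning("No answer was generated for that question.")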