MyPod_10

Running

App Files Files Community

siddhartharyaai committed on Jan 14

Commit

b0e78f5

verified ·

1 Parent(s): 07e7e91

Update qa.py

Browse files

Files changed (1) hide show

qa.py +57 -51

qa.py CHANGED Viewed

@@ -1,43 +1,70 @@
 # qa.py
 import os
-import requests
 import json
 import tempfile
 import streamlit as st
-from utils import generate_audio_mp3  # Reuse your existing TTS function
-def transcribe_audio_deepgram(local_audio_path: str) -> str:
-    """
-    Send a local audio file to Deepgram's speech-to-text endpoint.
-
-    Args:
-        local_audio_path: Path of the audio file to upload. The request is
-            always sent with Content-Type "audio/wav".
-
-    Returns:
-        The "transcript" of the first alternative of the first channel in
-        the JSON response, or "" when that alternative has no such key.
-
-    Raises:
-        ValueError: If DEEPGRAM_API_KEY is missing or empty in the environment.
-        requests.HTTPError: If the response carries an error status
-            (raised by response.raise_for_status()).
-    """
-    DEEPGRAM_API_KEY = os.environ.get("DEEPGRAM_API_KEY")
-    if not DEEPGRAM_API_KEY:
-        raise ValueError("Deepgram API key not found in environment variables.")
-    url = "https://api.deepgram.com/v1/listen?model=nova-2&smart_format=true"
-    # For WAV -> "audio/wav". If user uploads MP3, you'd use "audio/mpeg".
-    headers = {
-        "Authorization": f"Token {DEEPGRAM_API_KEY}",
-        "Content-Type": "audio/wav"
-    }
-    # The open file object is passed as the request body, so the audio is
-    # uploaded straight from disk.
-    # NOTE(review): no timeout is passed to requests.post -- confirm that is intended.
-    with open(local_audio_path, "rb") as f:
-        response = requests.post(url, headers=headers, data=f)
-    response.raise_for_status()
-    data = response.json()
-    # Extract the transcript
-    # NOTE(review): the indexing below assumes the response contains
-    # results -> channels[0] -> alternatives[0]; a differently shaped payload
-    # would raise KeyError/IndexError here.
-    transcript = data["results"]["channels"][0]["alternatives"][0].get("transcript", "")
-    return transcript
-def call_llm_for_qa(conversation_so_far: str, user_question: str) -> dict:
     """
-    Minimal function that calls your LLM (Groq) to answer a follow-up question.
-    Returns a Python dict, e.g.: {"speaker": "John", "text": "..."}
     """
     system_prompt = f"""
     You are John, the guest speaker. The user is asking a follow-up question.
@@ -51,37 +78,16 @@ def call_llm_for_qa(conversation_so_far: str, user_question: str) -> dict:
     {{ "speaker": "John", "text": "Sure, here's my answer..." }}
     """
-    from utils import call_groq_api_for_qa
     raw_json_response = call_groq_api_for_qa(system_prompt)
-    # Expect a JSON string: {"speaker": "John", "text": "some short answer"}
     response_dict = json.loads(raw_json_response)
-    return response_dict
-def handle_qa_exchange(user_question: str) -> (bytes, str):
-    """
-    1) Read conversation_so_far from session_state
-    2) Call the LLM for a short follow-up answer
-    3) Generate TTS audio
-    4) Return (audio_bytes, answer_text)
-    """
-    conversation_so_far = st.session_state.get("conversation_history", "")
-    # Ask the LLM
-    response_dict = call_llm_for_qa(conversation_so_far, user_question)
     answer_text = response_dict.get("text", "")
     speaker = response_dict.get("speaker", "John")
-    # Update conversation
-    new_history = conversation_so_far + f"\nUser: {user_question}\n{speaker}: {answer_text}\n"
-    st.session_state["conversation_history"] = new_history
     if not answer_text.strip():
         return (None, "")
     # TTS
-    audio_file_path = generate_audio_mp3(answer_text, "John")  # always John
     with open(audio_file_path, "rb") as f:
         audio_bytes = f.read()

 # qa.py
 import os
 import json
 import tempfile
 import streamlit as st
+from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration, AudioProcessorBase
+from utils import generate_audio_mp3, call_groq_api_for_qa
+import av
+import pydub
+import wave
+import numpy as np
+# For streaming from the mic, we need some RTC configuration
+RTC_CONFIGURATION = RTCConfiguration(
+    {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
+)
+class AudioBufferProcessor(AudioProcessorBase):
+    """
+    A custom audio processor that accumulates raw audio frames in memory.
+    When the user stops, we can finalize them into a single WAV for STT.
+
+    Each received frame is stored in ``self.frames`` as a mono, 16-bit
+    pydub.AudioSegment; ``finalize_wav`` concatenates them into a temp file.
     """
+    def __init__(self) -> None:
+        # Buffered pydub.AudioSegment objects, in arrival order.
+        self.frames = []
+    def recv_audio(self, frame: av.AudioFrame) -> av.AudioFrame:
+        # Buffers one incoming frame and returns the frame itself unchanged.
+        # NOTE(review): confirm that streamlit-webrtc actually invokes a method
+        # named `recv_audio`; its documented AudioProcessorBase callbacks appear
+        # to be `recv` / `recv_queued` -- verify against the library docs.
+        # Convert the audio frame to a pydub AudioSegment
+        pcm = frame.to_ndarray()
+        # The shape is (channels, samples)
+        # NOTE(review): presumably true for planar sample formats only; confirm
+        # the array layout PyAV returns for packed (interleaved) formats.
+        # We'll assume single channel or handle the first channel
+        if pcm.ndim == 2 and pcm.shape[0] > 1:
+            # If stereo, just take the first channel for STT
+            pcm = pcm[0, :]
+        sample_rate = frame.sample_rate
+        # astype() casts values without rescaling -- assumes the frame already
+        # carries 16-bit integer samples; TODO confirm for float formats.
+        samples = pcm.astype(np.int16).tobytes()
+        segment = pydub.AudioSegment(
+            data=samples,
+            sample_width=2,  # int16
+            frame_rate=sample_rate,
+            channels=1
+        )
+        self.frames.append(segment)
+        return frame
+    def finalize_wav(self) -> str:
+        """
+        Once the user stops recording, combine frames into a single WAV file.
+        Returns path to the wav file, or "" when no frames were buffered.
+
+        The temp file is created with delete=False, so it is not removed when
+        the ``with`` block exits; nothing in this method deletes it.
+        """
+        if not self.frames:
+            return ""
+        # sum() starts from 0, so this relies on AudioSegment accepting
+        # `0 + segment` -- presumably via __radd__; verify against pydub.
+        combined = sum(self.frames)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_wav:
+            # Export by name (not via the open handle) while the handle is still open.
+            combined.export(tmp_wav.name, format="wav")
+            return tmp_wav.name
+def handle_qa_exchange(conversation_so_far: str, user_question: str) -> (bytes, str):
+    """
+    1) Build system prompt from conversation_so_far + user_question
+    2) Call the LLM to get short JSON
+    3) TTS the answer
+    4) Return (audio_bytes, answer_text)
     """
     system_prompt = f"""
     You are John, the guest speaker. The user is asking a follow-up question.
     {{ "speaker": "John", "text": "Sure, here's my answer..." }}
     """
     raw_json_response = call_groq_api_for_qa(system_prompt)
     response_dict = json.loads(raw_json_response)
     answer_text = response_dict.get("text", "")
     speaker = response_dict.get("speaker", "John")
     if not answer_text.strip():
         return (None, "")
     # TTS
+    audio_file_path = generate_audio_mp3(answer_text, "John")
     with open(audio_file_path, "rb") as f:
         audio_bytes = f.read()