siddhartharyaai committed
Commit b0e78f5 · verified · 1 Parent(s): 07e7e91

Update qa.py

Files changed (1): qa.py (+57, -51)
qa.py CHANGED
@@ -1,43 +1,70 @@
  # qa.py
  
  import os
- import requests
  import json
  import tempfile
  import streamlit as st
+ from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration, AudioProcessorBase
  
- from utils import generate_audio_mp3 # Reuse your existing TTS function
+ from utils import generate_audio_mp3, call_groq_api_for_qa
  
- def transcribe_audio_deepgram(local_audio_path: str) -> str:
-     """
-     Sends a local audio file to Deepgram for STT.
-     Returns the transcript text if successful, or raises an error if failed.
-     """
-     DEEPGRAM_API_KEY = os.environ.get("DEEPGRAM_API_KEY")
-     if not DEEPGRAM_API_KEY:
-         raise ValueError("Deepgram API key not found in environment variables.")
- 
-     url = "https://api.deepgram.com/v1/listen?model=nova-2&smart_format=true"
-     # For WAV -> "audio/wav". If user uploads MP3, you'd use "audio/mpeg".
-     headers = {
-         "Authorization": f"Token {DEEPGRAM_API_KEY}",
-         "Content-Type": "audio/wav"
-     }
- 
-     with open(local_audio_path, "rb") as f:
-         response = requests.post(url, headers=headers, data=f)
-         response.raise_for_status()
+ import av
+ import pydub
+ import wave
+ import numpy as np
  
-     data = response.json()
-     # Extract the transcript
-     transcript = data["results"]["channels"][0]["alternatives"][0].get("transcript", "")
-     return transcript
+ # For streaming from the mic, we need some RTC configuration
+ RTC_CONFIGURATION = RTCConfiguration(
+     {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
+ )
  
- 
- def call_llm_for_qa(conversation_so_far: str, user_question: str) -> dict:
+ class AudioBufferProcessor(AudioProcessorBase):
+     """
+     A custom audio processor that accumulates raw audio frames in memory.
+     When the user stops, we can finalize them into a single WAV for STT.
      """
-     Minimal function that calls your LLM (Groq) to answer a follow-up question.
-     Returns a Python dict, e.g.: {"speaker": "John", "text": "..."}
+     def __init__(self) -> None:
+         self.frames = []
+ 
+     def recv_audio(self, frame: av.AudioFrame) -> av.AudioFrame:
+         # Convert the audio frame to a pydub AudioSegment
+         pcm = frame.to_ndarray()
+         # The shape is (channels, samples)
+         # We'll assume single channel or handle the first channel
+         if pcm.ndim == 2 and pcm.shape[0] > 1:
+             # If stereo, just take the first channel for STT
+             pcm = pcm[0, :]
+ 
+         sample_rate = frame.sample_rate
+         samples = pcm.astype(np.int16).tobytes()
+         segment = pydub.AudioSegment(
+             data=samples,
+             sample_width=2, # int16
+             frame_rate=sample_rate,
+             channels=1
+         )
+         self.frames.append(segment)
+         return frame
+ 
+     def finalize_wav(self) -> str:
+         """
+         Once the user stops recording, combine frames into a single WAV file.
+         Returns path to the wav file.
+         """
+         if not self.frames:
+             return ""
+         combined = sum(self.frames)
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_wav:
+             combined.export(tmp_wav.name, format="wav")
+         return tmp_wav.name
+ 
+ 
+ def handle_qa_exchange(conversation_so_far: str, user_question: str) -> (bytes, str):
+     """
+     1) Build system prompt from conversation_so_far + user_question
+     2) Call the LLM to get short JSON
+     3) TTS the answer
+     4) Return (audio_bytes, answer_text)
      """
      system_prompt = f"""
      You are John, the guest speaker. The user is asking a follow-up question.
@@ -51,37 +78,16 @@ def call_llm_for_qa(conversation_so_far: str, user_question: str) -> dict:
      {{ "speaker": "John", "text": "Sure, here's my answer..." }}
      """
  
-     from utils import call_groq_api_for_qa
- 
      raw_json_response = call_groq_api_for_qa(system_prompt)
-     # Expect a JSON string: {"speaker": "John", "text": "some short answer"}
      response_dict = json.loads(raw_json_response)
-     return response_dict
- 
- 
- def handle_qa_exchange(user_question: str) -> (bytes, str):
-     """
-     1) Read conversation_so_far from session_state
-     2) Call the LLM for a short follow-up answer
-     3) Generate TTS audio
-     4) Return (audio_bytes, answer_text)
-     """
-     conversation_so_far = st.session_state.get("conversation_history", "")
- 
-     # Ask the LLM
-     response_dict = call_llm_for_qa(conversation_so_far, user_question)
      answer_text = response_dict.get("text", "")
      speaker = response_dict.get("speaker", "John")
  
-     # Update conversation
-     new_history = conversation_so_far + f"\nUser: {user_question}\n{speaker}: {answer_text}\n"
-     st.session_state["conversation_history"] = new_history
- 
      if not answer_text.strip():
          return (None, "")
  
      # TTS
-     audio_file_path = generate_audio_mp3(answer_text, "John") # always John
+     audio_file_path = generate_audio_mp3(answer_text, "John")
      with open(audio_file_path, "rb") as f:
          audio_bytes = f.read()
  
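The diff reads as a pipeline: AudioBufferProcessor buffers mic frames streamed by streamlit-webrtc, finalize_wav() flushes them to a single WAV, and handle_qa_exchange turns a transcribed question into spoken-answer bytes. Below is a minimal caller-side sketch, not part of this commit, assuming the standard streamlit-webrtc wiring (webrtc_streamer with an audio_processor_factory); transcribe_wav is a hypothetical placeholder, since this commit removes the Deepgram STT helper from qa.py without adding a replacement here.

```python
# app.py (illustrative sketch only; assumes the standard streamlit-webrtc
# API, i.e. webrtc_streamer + audio_processor_factory; not in this commit)
import streamlit as st
from streamlit_webrtc import webrtc_streamer, WebRtcMode
from qa import RTC_CONFIGURATION, AudioBufferProcessor, handle_qa_exchange

def transcribe_wav(wav_path: str) -> str:
    # Hypothetical STT hook: transcribe_audio_deepgram was removed from
    # qa.py in this commit, so plug in whatever transcription you now use.
    return ""

ctx = webrtc_streamer(
    key="qa-mic",
    mode=WebRtcMode.SENDONLY,                      # capture mic only
    rtc_configuration=RTC_CONFIGURATION,
    media_stream_constraints={"audio": True, "video": False},
    audio_processor_factory=AudioBufferProcessor,  # buffers frames in memory
)

if ctx.audio_processor and not ctx.state.playing:
    # Streaming has stopped: merge the buffered frames into one WAV for STT.
    wav_path = ctx.audio_processor.finalize_wav()
    if wav_path:
        question = transcribe_wav(wav_path)
        history = st.session_state.get("conversation_history", "")
        audio_bytes, answer = handle_qa_exchange(history, question)
        if audio_bytes:
            st.audio(audio_bytes, format="audio/mp3")
```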
 
 
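One behavioral change worth flagging: handle_qa_exchange now receives conversation_so_far as an argument and no longer writes the updated history back to st.session_state, so the caller must persist each exchange itself. A minimal sketch, assuming the app keeps history under the same conversation_history key the old version used (the hunk's context ends at audio_bytes = f.read(); the tuple return is implied by the signature and docstring):

```python
# Caller-side persistence (sketch): the rewritten handle_qa_exchange no
# longer appends to st.session_state["conversation_history"] itself.
import streamlit as st
from qa import handle_qa_exchange

question = st.text_input("Ask John a follow-up question")
if st.button("Ask") and question.strip():
    history = st.session_state.get("conversation_history", "")
    audio_bytes, answer_text = handle_qa_exchange(history, question)
    if answer_text:
        # Mirror the history format the previous version maintained.
        st.session_state["conversation_history"] = (
            history + f"\nUser: {question}\nJohn: {answer_text}\n"
        )
    if audio_bytes:
        st.audio(audio_bytes, format="audio/mp3")
```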
93