Spaces:
Sleeping
Sleeping
File size: 2,853 Bytes
48c504d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
# qa.py
import os
import requests
import json
import tempfile
import streamlit as st
from utils import generate_audio_mp3 # Reuse your existing TTS function
def transcribe_audio_deepgram(local_audio_path: str, timeout=(10, 300)) -> str:
    """
    Send a local audio file to Deepgram for speech-to-text.

    Args:
        local_audio_path: Path of the audio file to upload. The file is
            streamed as the request body.
        timeout: Forwarded to ``requests.post``. Either a single number of
            seconds or a ``(connect, read)`` tuple. Defaults to 10 s to
            connect and 300 s to wait for the transcription response, so a
            stalled connection can no longer block the caller forever.

    Returns:
        The "transcript" field of the first alternative of the first channel
        in the response, or "" if that alternative has no "transcript" key.

    Raises:
        ValueError: If the DEEPGRAM_API_KEY environment variable is unset or
            empty.
        requests.exceptions.RequestException: On connection failure, timeout,
            or a non-success HTTP status (via ``raise_for_status``).
        KeyError, IndexError: If the response JSON lacks the
            results/channels/alternatives structure.
    """
    api_key = os.environ.get("DEEPGRAM_API_KEY")
    if not api_key:
        raise ValueError("Deepgram API key not found in environment variables.")

    url = "https://api.deepgram.com/v1/listen?model=nova-2&smart_format=true"
    headers = {
        "Authorization": f"Token {api_key}",
        # NOTE(review): the content type is fixed to WAV regardless of the
        # file actually passed in -- confirm callers only send WAV audio.
        "Content-Type": "audio/wav",
    }

    # Hand the open file object to requests so the audio is streamed rather
    # than read into memory first.
    with open(local_audio_path, "rb") as audio_file:
        response = requests.post(
            url,
            headers=headers,
            data=audio_file,
            timeout=timeout,
        )
    response.raise_for_status()

    payload = response.json()
    # Extract the transcript of the top alternative on the first channel.
    best_alternative = payload["results"]["channels"][0]["alternatives"][0]
    return best_alternative.get("transcript", "")
def call_llm_for_qa(conversation_so_far: str, user_question: str) -> dict:
    """
    Ask the Groq LLM to answer a follow-up question in the voice of "John".

    The conversation so far and the new question are embedded in a single
    prompt that instructs the model to reply as JSON with the keys
    "speaker" and "text". The reply is parsed with ``json.loads`` and
    returned as-is.

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    # Imported here, as in the original, rather than at module level.
    from utils import call_groq_api_for_qa

    # NOTE(review): the line layout of the prompt is reproduced exactly as
    # it appears in the visible source (no leading whitespace per line).
    prompt_lines = [
        "",
        "You are John, the guest speaker. The user is asking a follow-up question.",
        "Conversation so far:",
        f"{conversation_so_far}",
        "New user question:",
        f"{user_question}",
        'Please respond in JSON with keys "speaker" and "text", e.g.:',
        '{ "speaker": "John", "text": "Sure, here\'s my answer..." }',
        "",
    ]
    system_prompt = "\n".join(prompt_lines)

    # The helper is expected to hand back a JSON string such as
    # {"speaker": "John", "text": "some short answer"}.
    return json.loads(call_groq_api_for_qa(system_prompt))
def handle_qa_exchange(user_question: str) -> "tuple[bytes | None, str]":
    """
    Run one question/answer round and return the spoken answer.

    Steps:
        1) Read "conversation_history" from Streamlit session state
           (defaults to "" when absent).
        2) Call the LLM for a follow-up answer.
        3) Append the question and answer to the stored history.
        4) Generate TTS audio for the answer and read it into memory.

    Args:
        user_question: The follow-up question text.

    Returns:
        ``(audio_bytes, answer_text)``. When the LLM produced no usable
        answer text, returns ``(None, "")``; the history is still updated
        with the question in that case, matching the previous behaviour.
    """
    conversation_so_far = st.session_state.get("conversation_history", "")

    # Ask the LLM.
    response_dict = call_llm_for_qa(conversation_so_far, user_question)

    # The model's JSON may carry null or a non-string under "text" or
    # "speaker"; normalise so the string operations below cannot fail.
    answer_text = response_dict.get("text") or ""
    if not isinstance(answer_text, str):
        answer_text = str(answer_text)
    speaker = response_dict.get("speaker") or "John"

    # Update the conversation.
    st.session_state["conversation_history"] = (
        conversation_so_far
        + f"\nUser: {user_question}\n{speaker}: {answer_text}\n"
    )

    if not answer_text.strip():
        return (None, "")

    # TTS: the voice is always "John", whatever speaker the model named.
    audio_file_path = generate_audio_mp3(answer_text, "John")
    try:
        with open(audio_file_path, "rb") as audio_file:
            audio_bytes = audio_file.read()
    finally:
        # Delete the generated file even if reading it failed, so failed
        # rounds do not leave audio files behind.
        os.remove(audio_file_path)

    return (audio_bytes, answer_text)
|