Spaces:
Sleeping
Sleeping
File size: 4,755 Bytes
d439419 0bb8b62 d439419 0bb8b62 e10a51a d439419 0bb8b62 d439419 0bb8b62 b74ae51 4a0a44f 0bb8b62 f383782 0bb8b62 d439419 0bb8b62 d439419 4a0a44f 0bb8b62 d439419 0bb8b62 4a0a44f d6d49d6 0bb8b62 d439419 4a0a44f 0bb8b62 e10a51a 0bb8b62 e10a51a 0bb8b62 4a0a44f e10a51a 4a0a44f 0bb8b62 4a0a44f 0bb8b62 d439419 0bb8b62 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
import gradio as gr
import os
import json
import uuid
import threading
import time
import re
from dotenv import load_dotenv
from openai import OpenAI
from realtime_transcriber import WebSocketClient, connections, WEBSOCKET_URI, WEBSOCKET_HEADERS
# ------------------ Load Secrets ------------------
# Read settings from a .env file (when present) before querying the environment.
load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
ASSISTANT_ID = os.environ.get("ASSISTANT_ID")

# Both settings are mandatory; stop at import time when either is unset or empty.
if not (OPENAI_API_KEY and ASSISTANT_ID):
    raise ValueError("Missing OPENAI_API_KEY or ASSISTANT_ID")

# OpenAI client shared by the functions in this module.
client = OpenAI(api_key=OPENAI_API_KEY)

# Maps a UI session id to the id of the OpenAI thread that backs it.
session_threads = {}
# ------------------ Session & Chat Logic ------------------
def reset_session():
    """Start a fresh chat session and return its id.

    A new OpenAI thread is created and stored in ``session_threads`` under a
    newly generated UUID4 string; that string is the return value.
    """
    new_thread = client.beta.threads.create()
    session_key = str(uuid.uuid4())
    session_threads[session_key] = new_thread.id
    return session_key
# Run states after which polling can stop because the status will not change.
_TERMINAL_RUN_STATUSES = frozenset(
    {"completed", "failed", "cancelled", "expired", "incomplete"}
)


def process_chat(message, history, session_id):
    """Send ``message`` to the assistant and return the text of its reply.

    Args:
        message: User text to post to the session's thread.
        history: Chat history from the UI; accepted for interface
            compatibility and not used here.
        session_id: Key into ``session_threads``. A thread is created and
            registered on the fly when the session has none yet.

    Returns:
        The text of the newest assistant message produced by this run, or a
        short notice when the run did not complete or produced no text.
    """
    thread_id = session_threads.get(session_id)
    if not thread_id:
        thread_id = client.beta.threads.create().id
        session_threads[session_id] = thread_id

    client.beta.threads.messages.create(thread_id=thread_id, role="user", content=message)
    run = client.beta.threads.runs.create(thread_id=thread_id, assistant_id=ASSISTANT_ID)

    # Poll until the run reaches any terminal state, not only "completed";
    # otherwise a failed, cancelled or expired run would spin here forever.
    status = run.status
    while status not in _TERMINAL_RUN_STATUSES:
        time.sleep(1)
        status = client.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run.id).status
    if status != "completed":
        return f"Assistant run ended with status: {status}"

    # Ask for newest-first explicitly and accept only messages created by
    # this run, so a later turn never returns the answer to an earlier one.
    messages = client.beta.threads.messages.list(thread_id=thread_id, order="desc")
    for msg in messages.data:
        if msg.role != "assistant" or getattr(msg, "run_id", None) != run.id:
            continue
        for part in msg.content:
            # A reply may contain non-text parts; only text can be shown here.
            if getattr(part, "type", None) == "text":
                return part.text.value
    return "β οΈ Assistant did not respond."
# Matches a raw.githubusercontent.com link ending in ".png". The lazy
# quantifier stops at the first ".png", so two links written back to back
# (no whitespace between them) are not merged into a single bogus match.
_PNG_URL_RE = re.compile(r'https://raw\.githubusercontent\.com/[^\s"]+?\.png')


def extract_image_url(text):
    """Return the first raw GitHub ``.png`` URL found in ``text``.

    Args:
        text: Text to scan; ``None`` or an empty string yields ``None``.

    Returns:
        The matched URL string, or ``None`` when no such URL is present.
    """
    if not text:
        return None
    match = _PNG_URL_RE.search(text)
    return match.group(0) if match else None
def handle_chat(message, history, session_id):
    """Run one chat turn and return the updated UI state.

    Args:
        message: Text the user submitted.
        history: Chatbot history as a list of ``(user, assistant)`` pairs;
            ``None`` is treated as an empty history.
        session_id: Session key forwarded to ``process_chat``.

    Returns:
        ``(history, image)`` where ``image`` is the PNG URL found in the
        reply, or ``None`` when the reply contains none. For a blank
        ``message`` nothing is sent and the image output is left unchanged.
    """
    history = list(history) if history else []
    if not message or not message.strip():
        # Nothing to ask: skip the API round trip and leave the image as is.
        return history, gr.update()
    response = process_chat(message, history, session_id)
    history.append((message, response))
    return history, extract_image_url(response)
# ------------------ Transcript Actions ------------------
def ask_from_transcript(transcript, history, session_id):
    """Submit the current transcript as a chat message.

    Delegates to ``handle_chat`` with ``transcript`` as the message and
    returns its result unchanged.
    """
    chat_result = handle_chat(transcript, history, session_id)
    return chat_result
def clear_all(client_id):
    """Reset both the transcript and the chat history.

    The stored transcript for ``client_id`` is blanked when that client is
    registered. Returns ``("", [])``: an empty transcript and an empty
    chat history.
    """
    # clear_transcript performs the same conditional reset and returns "".
    return clear_transcript(client_id), []
# ------------------ Real-Time Voice WebSocket ------------------
def create_websocket_client():
    """Create, register and start a ``WebSocketClient``.

    The client is stored in ``connections`` under a fresh UUID4 string and
    its ``run`` method is started on a daemon thread. Returns that id.
    """
    client_id = str(uuid.uuid4())
    ws_client = WebSocketClient(WEBSOCKET_URI, WEBSOCKET_HEADERS, client_id)
    connections[client_id] = ws_client
    worker = threading.Thread(target=ws_client.run, daemon=True)
    worker.start()
    return client_id
def clear_transcript(client_id):
    """Blank the stored transcript for ``client_id`` when it is registered.

    Always returns an empty string.
    """
    if client_id not in connections:
        return ""
    connections[client_id].transcript = ""
    return ""
def send_audio_chunk(audio, client_id):
    """Queue one streamed audio chunk and return the transcript so far.

    Args:
        audio: ``(sample_rate, samples)`` pair from the streaming audio
            input, or ``None`` when a stream event carries no audio.
        client_id: Key of this client's entry in ``connections``.

    Returns:
        The connection's current transcript, or a placeholder message while
        no connection is registered for ``client_id``.
    """
    if client_id not in connections:
        return "Initializing connection..."
    connection = connections[client_id]
    # An event without audio has nothing to enqueue, and unpacking None
    # would raise; report the transcript as it stands instead.
    if audio is None:
        return connection.transcript
    sr, y = audio
    connection.enqueue_audio_chunk(sr, y)
    return connection.transcript
# ------------------ UI ------------------
# Build the two-pane page: document image on the left, chat on the right,
# and a collapsible voice section underneath.
# NOTE(review): the nesting below is reconstructed; confirm that the voice
# accordion belongs under the main row rather than inside the chat column.
with gr.Blocks(theme=gr.themes.Soft(), css="""
@media (max-width: 768px) {
.gr-col { width: 100% !important; }
.gr-row > div { flex-direction: column !important; }
}
""") as demo:
    gr.Markdown("# π§ Document AI + ποΈ Voice Assistant")

    # Per-visitor state. Both values are filled in by the load handlers at
    # the bottom of this block. Calling reset_session() here instead would
    # run it once at build time and give every visitor the same session id,
    # and therefore the same assistant thread.
    session_id = gr.State()
    client_id = gr.State()

    with gr.Row():
        with gr.Column(scale=1):
            image_display = gr.Image(label="π Extracted Document Image", show_label=True, height=480)
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="π¬ Document Assistant", height=480)
            text_input = gr.Textbox(label="Ask about the document", placeholder="e.g. What is clause 3.2?")
            send_btn = gr.Button("Send")

    # The button and the Enter key trigger the same chat handler.
    send_btn.click(handle_chat, inputs=[text_input, chatbot, session_id], outputs=[chatbot, image_display])
    text_input.submit(handle_chat, inputs=[text_input, chatbot, session_id], outputs=[chatbot, image_display])

    with gr.Accordion("π€ Or Use Voice Instead", open=False):
        with gr.Row():
            transcript_box = gr.Textbox(label="Live Transcript", lines=6, interactive=False, autoscroll=True)
        with gr.Row():
            mic_input = gr.Audio(label="ποΈ Record", streaming=True)
        with gr.Row():
            ask_btn = gr.Button("π€ Ask from Transcript", size="lg")
            clear_btn = gr.Button("π§Ή Clear Chat", size="lg")

    # Each streamed microphone chunk refreshes the live transcript box.
    mic_input.stream(fn=send_audio_chunk, inputs=[mic_input, client_id], outputs=transcript_box)
    ask_btn.click(fn=ask_from_transcript, inputs=[transcript_box, chatbot, session_id], outputs=[chatbot, image_display])
    clear_btn.click(fn=clear_all, inputs=[client_id], outputs=[transcript_box, chatbot])

    # Run once per page load: a private chat session and a private
    # transcription connection for this visitor.
    demo.load(fn=reset_session, outputs=session_id)
    demo.load(fn=create_websocket_client, outputs=client_id)

demo.launch()
|