import gradio as gr
import os
import json
import uuid
import threading
import time
import re
from dotenv import load_dotenv
from openai import OpenAI
from realtime_transcriber import WebSocketClient, connections, WEBSOCKET_URI, WEBSOCKET_HEADERS

# ------------------ Load Secrets ------------------
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ASSISTANT_ID = os.getenv("ASSISTANT_ID")

if not OPENAI_API_KEY or not ASSISTANT_ID:
    raise ValueError("Missing OPENAI_API_KEY or ASSISTANT_ID")

client = OpenAI(api_key=OPENAI_API_KEY)
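# Maps each UI session id to its own OpenAI Assistants thread id.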
session_threads = {}

# ------------------ Session & Chat Logic ------------------
def reset_session():
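    # Create a fresh session id and register a brand-new assistant thread for it.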
    session_id = str(uuid.uuid4())
    session_threads[session_id] = client.beta.threads.create().id
    return session_id

def process_chat(message, history, session_id):
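    # Post the user message to this session's assistant thread, run the assistant, and return its reply text.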
    thread_id = session_threads.get(session_id)
    if not thread_id:
        thread_id = client.beta.threads.create().id
        session_threads[session_id] = thread_id

    client.beta.threads.messages.create(thread_id=thread_id, role="user", content=message)
    run = client.beta.threads.runs.create(thread_id=thread_id, assistant_id=ASSISTANT_ID)

    # Poll until the run reaches a terminal state so a failed, cancelled, or expired run cannot hang forever.
    terminal_states = {"completed", "failed", "cancelled", "expired", "incomplete"}
    while (status := client.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run.id).status) not in terminal_states:
        time.sleep(1)
    if status != "completed":
        return f"⚠️ Run ended with status: {status}"

    # messages.list returns newest first, so take the first assistant message rather than the oldest one.
    messages = client.beta.threads.messages.list(thread_id=thread_id)
    for msg in messages.data:
        if msg.role == "assistant":
            return msg.content[0].text.value
    return "⚠️ Assistant did not respond."

def extract_image_url(text):
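    # Return the first raw.githubusercontent.com PNG link in the text, if any.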
    match = re.search(r'https://raw\.githubusercontent\.com/[^\s"]+\.png', text)
    return match.group(0) if match else None

def handle_chat(message, history, session_id):
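    # Get the assistant's reply, append the exchange to the chat history, and pull out any linked document image.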
    response = process_chat(message, history, session_id)
    history.append((message, response))
    image = extract_image_url(response)
    return history, image

# ------------------ Transcript Actions ------------------
def ask_from_transcript(transcript, history, session_id):
    return handle_chat(transcript, history, session_id)

def clear_all(client_id):
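    # Reset this client's live transcript and clear both the transcript box and the chat history.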
    if client_id in connections:
        connections[client_id].transcript = ""
    return "", []

# ------------------ Real-Time Voice WebSocket ------------------
def create_websocket_client():
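    # Start a dedicated transcription WebSocket client for this browser session on a background daemon thread.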
    client_id = str(uuid.uuid4())
    connections[client_id] = WebSocketClient(WEBSOCKET_URI, WEBSOCKET_HEADERS, client_id)
    threading.Thread(target=connections[client_id].run, daemon=True).start()
    return client_id

def clear_transcript(client_id):
    if client_id in connections:
        connections[client_id].transcript = ""
    return ""

def send_audio_chunk(audio, client_id):
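    # Queue the incoming microphone chunk for transcription and return the transcript accumulated so far.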
    if client_id not in connections:
        return "Initializing connection..."
    sr, y = audio
    connections[client_id].enqueue_audio_chunk(sr, y)
    return connections[client_id].transcript

# ------------------ UI ------------------
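# The inline CSS stacks the two main columns vertically on screens narrower than 768px (mobile layout).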
with gr.Blocks(theme=gr.themes.Soft(), css="""
@media (max-width: 768px) {
  .gr-col { width: 100% !important; }
  .gr-row > div { flex-direction: column !important; }
}
""") as demo:
    gr.Markdown("# 🧠 Document AI + πŸŽ™οΈ Voice Assistant")

    # Per-browser-session state; both values are filled in by the load handlers below,
    # so each visitor gets their own assistant thread and transcriber connection.
    session_id = gr.State()
    client_id = gr.State()

    with gr.Row():
        with gr.Column(scale=1):
            image_display = gr.Image(label="πŸ“‘ Extracted Document Image", show_label=True, height=480)
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="πŸ’¬ Document Assistant", height=480)
            text_input = gr.Textbox(label="Ask about the document", placeholder="e.g. What is clause 3.2?")
            send_btn = gr.Button("Send")

    send_btn.click(handle_chat, inputs=[text_input, chatbot, session_id], outputs=[chatbot, image_display])
    text_input.submit(handle_chat, inputs=[text_input, chatbot, session_id], outputs=[chatbot, image_display])

    with gr.Accordion("🎀 Or Use Voice Instead", open=False):
        with gr.Row():
            transcript_box = gr.Textbox(label="Live Transcript", lines=6, interactive=False, autoscroll=True)
        with gr.Row():
            mic_input = gr.Audio(label="πŸŽ™οΈ Record", streaming=True)
        with gr.Row():
            ask_btn = gr.Button("πŸ€– Ask from Transcript", size="lg")
            clear_btn = gr.Button("🧹 Clear Chat", size="lg")

        mic_input.stream(fn=send_audio_chunk, inputs=[mic_input, client_id], outputs=transcript_box)
        ask_btn.click(fn=ask_from_transcript, inputs=[transcript_box, chatbot, session_id], outputs=[chatbot, image_display])
        clear_btn.click(fn=clear_all, inputs=[client_id], outputs=[transcript_box, chatbot])
        demo.load(fn=create_websocket_client, outputs=client_id)
        demo.load(fn=reset_session, outputs=session_id)

demo.launch()