IAMTFRMZA committed
Commit d439419 · verified · 1 Parent(s): fe49ca3

Create app.py

Files changed (1): app.py +146 -0
app.py ADDED
@@ -0,0 +1,146 @@
import gradio as gr
import os
import json
import uuid
import threading
import time
import re

from openai import OpenAI
from realtime_transcriber import WebSocketClient, connections, WEBSOCKET_URI, WEBSOCKET_HEADERS

# ------------------ Load API Key ------------------
from dotenv import load_dotenv
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ASSISTANT_ID = os.getenv("ASSISTANT_ID")

if not OPENAI_API_KEY or not ASSISTANT_ID:
    raise ValueError("Missing OPENAI_API_KEY or ASSISTANT_ID in environment variables")

client = OpenAI(api_key=OPENAI_API_KEY)
# ------------------ Chat Logic ------------------
session_threads = {}
session_messages = {}

def reset_session():
    session_id = str(uuid.uuid4())
    thread = client.beta.threads.create()
    session_threads[session_id] = thread.id
    session_messages[session_id] = []
    return session_id
def process_chat(message, history, session_id):
    thread_id = session_threads.get(session_id)
    if not thread_id:
        thread_id = client.beta.threads.create().id
        session_threads[session_id] = thread_id

    # Store the user message on the thread
    client.beta.threads.messages.create(
        thread_id=thread_id,
        role="user",
        content=message
    )

    # Run the assistant
    run = client.beta.threads.runs.create(
        thread_id=thread_id,
        assistant_id=ASSISTANT_ID
    )

    # Poll until the run reaches a terminal state; waiting only for
    # "completed" would loop forever if the run fails or expires
    while True:
        run_status = client.beta.threads.runs.retrieve(
            thread_id=thread_id,
            run_id=run.id
        )
        if run_status.status in ("completed", "failed", "cancelled", "expired"):
            break
        time.sleep(1)

    # Retrieve the assistant reply; messages.list() returns newest-first,
    # so the first assistant entry is the latest response
    messages = client.beta.threads.messages.list(thread_id=thread_id)
    for msg in messages.data:
        if msg.role == "assistant":
            assistant_response = msg.content[0].text.value
            break
    else:
        assistant_response = "⚠️ Assistant did not respond."

    # Detect a document-page image URL if the reply embeds one
    image_url = None
    match = re.search(
        r'https://raw\.githubusercontent\.com/AndrewLORTech/surgical-pathology-manual/main/[\w\-/]*\.png',
        assistant_response
    )
    if match:
        image_url = match.group(0)

    return assistant_response, image_url

# ------------------ Transcription Logic ------------------
def create_websocket_client():
    client_id = str(uuid.uuid4())
    connections[client_id] = WebSocketClient(WEBSOCKET_URI, WEBSOCKET_HEADERS, client_id)
    threading.Thread(target=connections[client_id].run, daemon=True).start()
    return client_id

def clear_transcript(client_id):
    if client_id in connections:
        connections[client_id].transcript = ""
    return ""

def send_audio_chunk(audio, client_id):
    if client_id not in connections:
        return "Initializing connection..."
    sr, y = audio
    connections[client_id].enqueue_audio_chunk(sr, y)
    return connections[client_id].transcript

# ------------------ Gradio Interface ------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 Document AI + 🎙️ Voice Assistant")

    session_id = gr.State(value=reset_session())
    client_id = gr.State()

    # ---------- Section 1: Document Image Display ----------
    with gr.Row():
        image_display = gr.Image(label="📄 Document Page (auto-extracted if available)", interactive=False, visible=False)

    # ---------- Section 2: Chat Interface ----------
    session_images = {}  # last image URL extracted per session, for the preview pane

    def chat_fn(message, history, session_id):
        # Run the assistant once per message and cache any extracted image
        # URL so the preview pane can show it without a second API round trip
        response, image_url = process_chat(message, history, session_id)
        session_images[session_id] = image_url
        return response

    with gr.Row():
        chatbot = gr.ChatInterface(
            fn=chat_fn,
            additional_inputs=[session_id],
            render_markdown=True,
            examples=["What does clause 3.2 mean?", "Summarize the timeline from the image."],
            title="💬 Document Assistant",
            retry_btn="🔁 Retry",
            undo_btn="↩️ Undo",
            clear_btn="🗑️ Clear",
        )

    # Show the extracted document page whenever the chat updates
    def update_image_display(session_id):
        image_url = session_images.get(session_id)
        return gr.update(value=image_url, visible=bool(image_url))

    chatbot.chatbot.change(fn=update_image_display, inputs=[session_id], outputs=[image_display])

    # ---------- Section 3: Voice Transcription ----------
    gr.Markdown("## 🎙️ Realtime Voice Transcription")

    with gr.Row():
        transcript_box = gr.Textbox(label="Live Transcript", lines=7, interactive=False, autoscroll=True)

    with gr.Row():
        mic_input = gr.Audio(source="microphone", streaming=True)
        clear_button = gr.Button("Clear Transcript")

    mic_input.stream(fn=send_audio_chunk, inputs=[mic_input, client_id], outputs=transcript_box)
    clear_button.click(fn=clear_transcript, inputs=[client_id], outputs=transcript_box)
    demo.load(fn=create_websocket_client, outputs=client_id)

demo.launch()
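
Note: app.py imports WebSocketClient, connections, WEBSOCKET_URI, and WEBSOCKET_HEADERS from a realtime_transcriber module that this commit does not include. Below is a minimal sketch of the interface app.py appears to rely on, inferred only from the call sites above; the imported names are real, but every body and value here is an assumption, not the committed module:

# realtime_transcriber.py (hypothetical sketch inferred from app.py's usage)
import queue

WEBSOCKET_URI = "wss://api.openai.com/v1/realtime"  # assumed endpoint
WEBSOCKET_HEADERS = {}                              # assumed; would carry auth in practice

connections = {}  # client_id -> WebSocketClient, shared with app.py

class WebSocketClient:
    def __init__(self, uri, headers, client_id):
        self.uri = uri
        self.headers = headers
        self.client_id = client_id
        self.transcript = ""          # read back by send_audio_chunk()
        self._chunks = queue.Queue()  # audio buffered by enqueue_audio_chunk()

    def enqueue_audio_chunk(self, sr, y):
        # Called from Gradio's stream callback with (sample_rate, ndarray)
        self._chunks.put((sr, y))

    def run(self):
        # Runs on a daemon thread: would open the websocket, stream queued
        # audio, and append transcription events to self.transcript
        while True:
            sr, y = self._chunks.get()
            ...  # websocket send/receive loop omitted

The app also expects a .env file (loaded via python-dotenv) defining the two variables it validates at startup, for example:

OPENAI_API_KEY=sk-...
ASSISTANT_ID=asst_...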