IAMTFRMZA committed on
Commit
9850ad3
·
verified ·
1 Parent(s): 7f2459b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -21
app.py CHANGED
@@ -17,7 +17,7 @@ HEADERS = {"Authorization": f"Bearer {OPENAI_API_KEY}", "OpenAI-Beta": "realtime
17
  WS_URI = "wss://api.openai.com/v1/realtime?intent=transcription"
18
  connections = {}
19
 
20
- # ============ WebSocket Client for Voice ============
21
  class WebSocketClient:
22
  def __init__(self, uri, headers, client_id):
23
  self.uri, self.headers, self.client_id = uri, headers, client_id
@@ -45,7 +45,10 @@ class WebSocketClient:
45
  buf = io.BytesIO(); sf.write(buf, int16, sr, format='WAV', subtype='PCM_16')
46
  audio = AudioSegment.from_file(buf, format="wav").set_frame_rate(24000)
47
  out = io.BytesIO(); audio.export(out, format="wav"); out.seek(0)
48
- await self.websocket.send(json.dumps({"type": "input_audio_buffer.append", "audio": base64.b64encode(out.read()).decode()}))
 
 
 
49
 
50
  async def receive_messages(self):
51
  async for msg in self.websocket:
@@ -74,7 +77,7 @@ def clear_transcript(cid):
74
  if cid in connections: connections[cid].transcript = ""
75
  return ""
76
 
77
- # ============ Chat Assistant Logic ============
78
  def handle_chat(user_input, history, thread_id, image_url):
79
  if not OPENAI_API_KEY or not ASSISTANT_ID:
80
  return "❌ Missing secrets!", history, thread_id, image_url
@@ -97,7 +100,10 @@ def handle_chat(user_input, history, thread_id, image_url):
97
  if msg.role == "assistant":
98
  content = msg.content[0].text.value
99
  history.append((user_input, content))
100
- match = re.search(r'https://raw\.githubusercontent\.com/AndrewLORTech/surgical-pathology-manual/main/[\w\-/]*\.png', content)
 
 
 
101
  if match: image_url = match.group(0)
102
  break
103
 
@@ -115,30 +121,37 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
115
  thread_state = gr.State()
116
  image_state = gr.State()
117
  client_id = gr.State()
 
118
 
119
  with gr.Row(equal_height=True):
120
- # LEFT COLUMN — Image + Voice
121
- with gr.Column(scale=1, min_width=400):
122
- image_display = gr.Image(label="🖼️ Document Image", type="filepath", show_download_button=False, show_label=False)
 
 
 
123
  with gr.Row():
124
- voice_input = gr.Audio(label="🎙️ Mic", streaming=True)
125
- voice_transcript = gr.Textbox(label="📝 Transcript", lines=3, interactive=False)
126
- clear_btn = gr.Button("🧹 Clear Transcript")
127
-
128
- # RIGHT COLUMN Chatbot
129
- with gr.Column(scale=1.4, min_width=500):
130
- chat = gr.Chatbot(label="💬 Chat", height=480)
131
- user_prompt = gr.Textbox(placeholder="Ask your question...", show_label=False)
132
- send_btn = gr.Button("Send", variant="primary")
133
-
134
- # HOOKS
135
- send_btn.click(handle_chat,
 
 
 
 
136
  inputs=[user_prompt, chat_state, thread_state, image_state],
137
  outputs=[user_prompt, chat, thread_state, image_state])
138
-
139
  image_state.change(fn=lambda x: x, inputs=image_state, outputs=image_display)
140
  voice_input.stream(fn=send_audio, inputs=[voice_input, client_id], outputs=voice_transcript, stream_every=0.5)
141
  clear_btn.click(fn=clear_transcript, inputs=[client_id], outputs=voice_transcript)
142
- app.load(create_ws, outputs=[client_id])
143
 
144
  app.launch()
 
17
  WS_URI = "wss://api.openai.com/v1/realtime?intent=transcription"
18
  connections = {}
19
 
20
+ # ============ WebSocket Client ============
21
  class WebSocketClient:
22
  def __init__(self, uri, headers, client_id):
23
  self.uri, self.headers, self.client_id = uri, headers, client_id
 
45
  buf = io.BytesIO(); sf.write(buf, int16, sr, format='WAV', subtype='PCM_16')
46
  audio = AudioSegment.from_file(buf, format="wav").set_frame_rate(24000)
47
  out = io.BytesIO(); audio.export(out, format="wav"); out.seek(0)
48
+ await self.websocket.send(json.dumps({
49
+ "type": "input_audio_buffer.append",
50
+ "audio": base64.b64encode(out.read()).decode()
51
+ }))
52
 
53
  async def receive_messages(self):
54
  async for msg in self.websocket:
 
77
  if cid in connections: connections[cid].transcript = ""
78
  return ""
79
 
80
+ # ============ Chat Assistant ============
81
  def handle_chat(user_input, history, thread_id, image_url):
82
  if not OPENAI_API_KEY or not ASSISTANT_ID:
83
  return "❌ Missing secrets!", history, thread_id, image_url
 
100
  if msg.role == "assistant":
101
  content = msg.content[0].text.value
102
  history.append((user_input, content))
103
+ match = re.search(
104
+ r'https://raw\.githubusercontent\.com/AndrewLORTech/surgical-pathology-manual/main/[\w\-/]*\.png',
105
+ content
106
+ )
107
  if match: image_url = match.group(0)
108
  break
109
 
 
121
  thread_state = gr.State()
122
  image_state = gr.State()
123
  client_id = gr.State()
124
+ voice_enabled = gr.State(False)
125
 
126
  with gr.Row(equal_height=True):
127
+ with gr.Column(scale=1):
128
+ image_display = gr.Image(label="🖼️ Document", type="filepath", show_download_button=False)
129
+
130
+ with gr.Column(scale=1.4):
131
+ chat = gr.Chatbot(label="💬 Chat", height=460)
132
+
133
  with gr.Row():
134
+ user_prompt = gr.Textbox(placeholder="Ask your question...", show_label=False, scale=6)
135
+ mic_toggle_btn = gr.Button("🎙️", scale=1)
136
+ send_btn = gr.Button("Send", variant="primary", scale=2)
137
+
138
+ with gr.Accordion("🎤 Voice Transcription", open=False) as voice_section:
139
+ with gr.Row():
140
+ voice_input = gr.Audio(label="Mic", streaming=True)
141
+ voice_transcript = gr.Textbox(label="Transcript", lines=2, interactive=False)
142
+ clear_btn = gr.Button("🧹 Clear Transcript")
143
+
144
+ # FUNCTIONAL CONNECTIONS
145
+ def toggle_voice(curr):
146
+ return not curr, gr.update(visible=not curr)
147
+
148
+ mic_toggle_btn.click(fn=toggle_voice, inputs=voice_enabled, outputs=[voice_enabled, voice_section])
149
+ send_btn.click(fn=handle_chat,
150
  inputs=[user_prompt, chat_state, thread_state, image_state],
151
  outputs=[user_prompt, chat, thread_state, image_state])
 
152
  image_state.change(fn=lambda x: x, inputs=image_state, outputs=image_display)
153
  voice_input.stream(fn=send_audio, inputs=[voice_input, client_id], outputs=voice_transcript, stream_every=0.5)
154
  clear_btn.click(fn=clear_transcript, inputs=[client_id], outputs=voice_transcript)
155
+ app.load(fn=create_ws, outputs=[client_id])
156
 
157
  app.launch()