IAMTFRMZA committed
Commit 1d7fcfa · verified · 1 Parent(s): e24f5bc

Update app.py

Files changed (1): app.py (+31 −10)
app.py CHANGED
@@ -1,4 +1,3 @@
-# top of the file
 import gradio as gr
 import os, time, re, json, base64, asyncio, threading, uuid, io
 import numpy as np
@@ -8,7 +7,7 @@ from openai import OpenAI
 from websockets import connect
 from dotenv import load_dotenv
 
-# Load secrets
+# ============ Load Secrets ============
 load_dotenv()
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 ASSISTANT_ID = os.getenv("ASSISTANT_ID")
@@ -18,12 +17,10 @@ HEADERS = {"Authorization": f"Bearer {OPENAI_API_KEY}", "OpenAI-Beta": "realtime
 WS_URI = "wss://api.openai.com/v1/realtime?intent=transcription"
 connections = {}
 
-# WebSocket Client
+# ============ WebSocket Client ============
 class WebSocketClient:
     def __init__(self, uri, headers, client_id):
-        self.uri = uri
-        self.headers = headers
-        self.client_id = client_id
+        self.uri, self.headers, self.client_id = uri, headers, client_id
         self.websocket = None
         self.queue = asyncio.Queue(maxsize=10)
         self.transcript = ""
@@ -71,7 +68,7 @@ class WebSocketClient:
         if data["type"] == "conversation.item.input_audio_transcription.delta":
             self.transcript += data["delta"]
 
-# Real-time transcription connection manager
+# ============ Connection Manager ============
 def create_ws():
     cid = str(uuid.uuid4())
     client = WebSocketClient(WS_URI, HEADERS, cid)
@@ -106,7 +103,8 @@ def handle_chat(user_input, history, thread_id, image_url):
 
         while True:
             status = client.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run.id)
-            if status.status == "completed": break
+            if status.status == "completed":
+                break
             time.sleep(1)
 
         msgs = client.beta.threads.messages.list(thread_id=thread_id)
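The loop in this hunk only exits on "completed"; a run that ends "failed", "cancelled", or "expired" would keep it spinning. A minimal hardened variant could stop on any terminal status (a sketch only; wait_for_run is a hypothetical helper, not part of this commit):

# Sketch only — wait_for_run is a hypothetical helper, not part of this commit.
import time

TERMINAL_STATES = {"completed", "failed", "cancelled", "expired"}

def wait_for_run(client, thread_id, run_id):
    # Stop polling on any terminal run status, not just success.
    while True:
        status = client.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run_id)
        if status.status in TERMINAL_STATES:
            return status
        time.sleep(1)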
@@ -118,7 +116,8 @@ def handle_chat(user_input, history, thread_id, image_url):
                 r'https://raw\.githubusercontent\.com/AndrewLORTech/surgical-pathology-manual/main/[\w\-/]*\.png',
                 content
             )
-            if match: image_url = match.group(0)
+            if match:
+                image_url = match.group(0)
             break
 
         return "", history, thread_id, image_url
@@ -126,15 +125,23 @@ def handle_chat(user_input, history, thread_id, image_url):
     except Exception as e:
         return f"❌ {e}", history, thread_id, image_url
 
+# ============ Auto-Send Voice Toggle ============
+def maybe_send_transcript(transcript, history, thread_id, image_url, voice_only_enabled):
+    if voice_only_enabled and transcript.strip():
+        return handle_chat(transcript, history, thread_id, image_url)
+    return transcript, history, thread_id, image_url  # Keep transcript for manual sending
+
 # ============ Gradio UI ============
 with gr.Blocks(theme=gr.themes.Soft()) as app:
     gr.Markdown("# 📄 Document AI Assistant")
 
+    # STATES
     chat_state = gr.State([])
     thread_state = gr.State()
     image_state = gr.State()
     client_id = gr.State()
     voice_enabled = gr.State(False)
+    voice_only_state = gr.State(True)
 
     with gr.Row(equal_height=True):
         with gr.Column(scale=1):
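The new maybe_send_transcript helper in this hunk is the core of the commit: with Voice-Only Mode on and a non-empty transcript it forwards the text straight into handle_chat, and otherwise it returns its inputs unchanged so the transcript stays available for manual sending. Roughly, the two paths behave like this (handle_chat is stubbed and the values are made up):

# Behaviour sketch — handle_chat is stubbed, values are made up.
def handle_chat(user_input, history, thread_id, image_url):
    return "", history + [(user_input, "(assistant reply)")], thread_id, image_url

def maybe_send_transcript(transcript, history, thread_id, image_url, voice_only_enabled):
    if voice_only_enabled and transcript.strip():
        return handle_chat(transcript, history, thread_id, image_url)
    return transcript, history, thread_id, image_url

print(maybe_send_transcript("what is a frozen section?", [], "thread_abc", None, True))
# -> ('', [('what is a frozen section?', '(assistant reply)')], 'thread_abc', None)
print(maybe_send_transcript("what is a frozen section?", [], "thread_abc", None, False))
# -> ('what is a frozen section?', [], 'thread_abc', None)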
@@ -153,8 +160,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
             voice_input = gr.Audio(label="Mic", streaming=True)
             voice_transcript = gr.Textbox(label="Transcript", lines=2, interactive=False)
             clear_btn = gr.Button("🧹 Clear Transcript")
+            voice_only_toggle = gr.Checkbox(label="Voice-Only Mode 🎤➡️💬", value=True)
 
-    # Functional bindings
+    # UI Event Bindings
     def toggle_voice(curr):
         return not curr, gr.update(visible=not curr)
@@ -163,8 +171,21 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
         inputs=[user_prompt, chat_state, thread_state, image_state],
         outputs=[user_prompt, chat, thread_state, image_state])
     image_state.change(fn=lambda x: x, inputs=image_state, outputs=image_display)
+
+    # Real-time audio streaming
     voice_input.stream(fn=send_audio, inputs=[voice_input, client_id], outputs=voice_transcript, stream_every=0.5)
     clear_btn.click(fn=clear_transcript, inputs=[client_id], outputs=voice_transcript)
+
+    # Auto-send voice transcript if Voice-Only Mode is enabled
+    voice_input.change(
+        fn=maybe_send_transcript,
+        inputs=[voice_transcript, chat_state, thread_state, image_state, voice_only_state],
+        outputs=[user_prompt, chat, thread_state, image_state]
+    )
+
+    voice_only_toggle.change(fn=lambda x: x, inputs=voice_only_toggle, outputs=voice_only_state)
+
+    # Initialize WebSocket connection
     app.load(fn=create_ws, outputs=[client_id])
 
 app.launch()
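A note on the new wiring: voice_input.change fires when the audio component's value updates, and because voice_transcript is listed first in inputs, maybe_send_transcript receives the textbox's value at that moment; voice_only_toggle.change simply mirrors the checkbox into voice_only_state, the same component→State pattern the app already uses for image_state. That mirroring can be tried in isolation (a standalone sketch, not part of app.py):

# Standalone sketch of the checkbox -> gr.State mirroring used above.
import gradio as gr

with gr.Blocks() as demo:
    flag_state = gr.State(True)
    toggle = gr.Checkbox(label="Voice-Only Mode", value=True)
    readout = gr.Textbox(label="Current state", interactive=False)
    # Copy the checkbox value into the State, then surface it for inspection.
    toggle.change(fn=lambda x: x, inputs=toggle, outputs=flag_state)
    flag_state.change(fn=lambda x: str(x), inputs=flag_state, outputs=readout)

demo.launch()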
 
 