IAMTFRMZA committed
Commit 1d7fcfa · verified · 1 Parent(s): e24f5bc

Update app.py

Files changed (1): app.py (+31 −10)
app.py CHANGED
@@ -1,4 +1,3 @@
-# top of the file
 import gradio as gr
 import os, time, re, json, base64, asyncio, threading, uuid, io
 import numpy as np
@@ -8,7 +7,7 @@ from openai import OpenAI
 from websockets import connect
 from dotenv import load_dotenv
 
-# Load secrets
+# ============ Load Secrets ============
 load_dotenv()
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 ASSISTANT_ID = os.getenv("ASSISTANT_ID")
@@ -18,12 +17,10 @@ HEADERS = {"Authorization": f"Bearer {OPENAI_API_KEY}", "OpenAI-Beta": "realtime
 WS_URI = "wss://api.openai.com/v1/realtime?intent=transcription"
 connections = {}
 
-# WebSocket Client
+# ============ WebSocket Client ============
 class WebSocketClient:
     def __init__(self, uri, headers, client_id):
-        self.uri = uri
-        self.headers = headers
-        self.client_id = client_id
+        self.uri, self.headers, self.client_id = uri, headers, client_id
         self.websocket = None
         self.queue = asyncio.Queue(maxsize=10)
         self.transcript = ""
@@ -71,7 +68,7 @@ class WebSocketClient:
         if data["type"] == "conversation.item.input_audio_transcription.delta":
             self.transcript += data["delta"]
 
-# Real-time transcription connection manager
+# ============ Connection Manager ============
 def create_ws():
     cid = str(uuid.uuid4())
     client = WebSocketClient(WS_URI, HEADERS, cid)
@@ -106,7 +103,8 @@ def handle_chat(user_input, history, thread_id, image_url):
 
         while True:
             status = client.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run.id)
-            if status.status == "completed": break
+            if status.status == "completed":
+                break
             time.sleep(1)
 
         msgs = client.beta.threads.messages.list(thread_id=thread_id)
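The loop in this hunk only exits on "completed"; a run that ends "failed", "cancelled", or "expired" would keep it spinning. A minimal hardened variant could stop on any terminal status (a sketch only; wait_for_run is a hypothetical helper, not part of this commit):

# Sketch only — wait_for_run is a hypothetical helper, not part of this commit.
import time

TERMINAL_STATES = {"completed", "failed", "cancelled", "expired"}

def wait_for_run(client, thread_id, run_id):
    # Stop polling on any terminal run status, not just success.
    while True:
        status = client.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run_id)
        if status.status in TERMINAL_STATES:
            return status
        time.sleep(1)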
@@ -118,7 +116,8 @@ def handle_chat(user_input, history, thread_id, image_url):
                 r'https://raw\.githubusercontent\.com/AndrewLORTech/surgical-pathology-manual/main/[\w\-/]*\.png',
                 content
             )
-            if match: image_url = match.group(0)
+            if match:
+                image_url = match.group(0)
             break
 
         return "", history, thread_id, image_url
@@ -126,15 +125,23 @@ def handle_chat(user_input, history, thread_id, image_url):
     except Exception as e:
         return f"❌ {e}", history, thread_id, image_url
 
+# ============ Auto-Send Voice Toggle ============
+def maybe_send_transcript(transcript, history, thread_id, image_url, voice_only_enabled):
+    if voice_only_enabled and transcript.strip():
+        return handle_chat(transcript, history, thread_id, image_url)
+    return transcript, history, thread_id, image_url  # Keep transcript for manual sending
+
 # ============ Gradio UI ============
 with gr.Blocks(theme=gr.themes.Soft()) as app:
     gr.Markdown("# 📄 Document AI Assistant")
 
+    # STATES
     chat_state = gr.State([])
     thread_state = gr.State()
     image_state = gr.State()
     client_id = gr.State()
     voice_enabled = gr.State(False)
+    voice_only_state = gr.State(True)
 
     with gr.Row(equal_height=True):
         with gr.Column(scale=1):
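The new maybe_send_transcript helper in this hunk is the core of the commit: with Voice-Only Mode on and a non-empty transcript it forwards the text straight into handle_chat, and otherwise it returns its inputs unchanged so the transcript stays available for manual sending. Roughly, the two paths behave like this (handle_chat is stubbed and the values are made up):

# Behaviour sketch — handle_chat is stubbed, values are made up.
def handle_chat(user_input, history, thread_id, image_url):
    return "", history + [(user_input, "(assistant reply)")], thread_id, image_url

def maybe_send_transcript(transcript, history, thread_id, image_url, voice_only_enabled):
    if voice_only_enabled and transcript.strip():
        return handle_chat(transcript, history, thread_id, image_url)
    return transcript, history, thread_id, image_url

print(maybe_send_transcript("what is a frozen section?", [], "thread_abc", None, True))
# -> ('', [('what is a frozen section?', '(assistant reply)')], 'thread_abc', None)
print(maybe_send_transcript("what is a frozen section?", [], "thread_abc", None, False))
# -> ('what is a frozen section?', [], 'thread_abc', None)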
@@ -153,8 +160,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
             voice_input = gr.Audio(label="Mic", streaming=True)
             voice_transcript = gr.Textbox(label="Transcript", lines=2, interactive=False)
             clear_btn = gr.Button("🧹 Clear Transcript")
+            voice_only_toggle = gr.Checkbox(label="Voice-Only Mode 🎤➡️💬", value=True)
 
-    # Functional bindings
+    # UI Event Bindings
     def toggle_voice(curr):
         return not curr, gr.update(visible=not curr)
@@ -163,8 +171,21 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
         inputs=[user_prompt, chat_state, thread_state, image_state],
         outputs=[user_prompt, chat, thread_state, image_state])
     image_state.change(fn=lambda x: x, inputs=image_state, outputs=image_display)
+
+    # Real-time audio streaming
     voice_input.stream(fn=send_audio, inputs=[voice_input, client_id], outputs=voice_transcript, stream_every=0.5)
     clear_btn.click(fn=clear_transcript, inputs=[client_id], outputs=voice_transcript)
+
+    # Auto-send voice transcript if Voice-Only Mode is enabled
+    voice_input.change(
+        fn=maybe_send_transcript,
+        inputs=[voice_transcript, chat_state, thread_state, image_state, voice_only_state],
+        outputs=[user_prompt, chat, thread_state, image_state]
+    )
+
+    voice_only_toggle.change(fn=lambda x: x, inputs=voice_only_toggle, outputs=voice_only_state)
+
+    # Initialize WebSocket connection
     app.load(fn=create_ws, outputs=[client_id])
 
 app.launch()
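A note on the new wiring: voice_input.change fires when the audio component's value updates, and because voice_transcript is listed first in inputs, maybe_send_transcript receives the textbox's value at that moment; voice_only_toggle.change simply mirrors the checkbox into voice_only_state, the same component→State pattern the app already uses for image_state. That mirroring can be tried in isolation (a standalone sketch, not part of app.py):

# Standalone sketch of the checkbox -> gr.State mirroring used above.
import gradio as gr

with gr.Blocks() as demo:
    flag_state = gr.State(True)
    toggle = gr.Checkbox(label="Voice-Only Mode", value=True)
    readout = gr.Textbox(label="Current state", interactive=False)
    # Copy the checkbox value into the State, then surface it for inspection.
    toggle.change(fn=lambda x: x, inputs=toggle, outputs=flag_state)
    flag_state.change(fn=lambda x: str(x), inputs=flag_state, outputs=readout)

demo.launch()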
 
 