IAMTFRMZA committed
Commit e24f5bc · verified · 1 Parent(s): 1c4cc7f

Update app.py

Files changed (1):
  1. app.py +36 -23
app.py CHANGED
@@ -1,13 +1,14 @@
+# top of the file
 import gradio as gr
 import os, time, re, json, base64, asyncio, threading, uuid, io
 import numpy as np
 import soundfile as sf
 from pydub import AudioSegment
 from openai import OpenAI
-from websockets import connect, Data, ClientConnection
+from websockets import connect
 from dotenv import load_dotenv
 
-# ============ Load Secrets ============
+# Load secrets
 load_dotenv()
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 ASSISTANT_ID = os.getenv("ASSISTANT_ID")
@@ -17,34 +18,48 @@ HEADERS = {"Authorization": f"Bearer {OPENAI_API_KEY}", "OpenAI-Beta": "realtime
 WS_URI = "wss://api.openai.com/v1/realtime?intent=transcription"
 connections = {}
 
-# ============ WebSocket Client ============
+# WebSocket Client
 class WebSocketClient:
     def __init__(self, uri, headers, client_id):
-        self.uri, self.headers, self.client_id = uri, headers, client_id
+        self.uri = uri
+        self.headers = headers
+        self.client_id = client_id
         self.websocket = None
         self.queue = asyncio.Queue(maxsize=10)
         self.transcript = ""
+        self.loop = asyncio.new_event_loop()
 
     async def connect(self):
-        self.websocket = await connect(self.uri, additional_headers=self.headers)
-        with open("openai_transcription_settings.json", "r") as f:
-            await self.websocket.send(f.read())
-        await asyncio.gather(self.receive_messages(), self.send_audio_chunks())
+        try:
+            self.websocket = await connect(self.uri, additional_headers=self.headers)
+            with open("openai_transcription_settings.json", "r") as f:
+                await self.websocket.send(f.read())
+            await asyncio.gather(self.receive_messages(), self.send_audio_chunks())
+        except Exception as e:
+            print(f"🔴 WebSocket Connection Failed: {e}")
 
     def run(self):
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-        loop.run_until_complete(self.connect())
+        asyncio.set_event_loop(self.loop)
+        self.loop.run_until_complete(self.connect())
+
+    def enqueue_audio_chunk(self, sr, arr):
+        if not self.queue.full():
+            asyncio.run_coroutine_threadsafe(self.queue.put((sr, arr)), self.loop)
 
     async def send_audio_chunks(self):
         while True:
             sr, arr = await self.queue.get()
-            if arr.ndim > 1: arr = arr.mean(axis=1)
-            arr = (arr / np.max(np.abs(arr))) if np.max(np.abs(arr)) > 0 else arr
+            if arr.ndim > 1:
+                arr = arr.mean(axis=1)
+            if np.max(np.abs(arr)) > 0:
+                arr = arr / np.max(np.abs(arr))
             int16 = (arr * 32767).astype(np.int16)
-            buf = io.BytesIO(); sf.write(buf, int16, sr, format='WAV', subtype='PCM_16')
+            buf = io.BytesIO()
+            sf.write(buf, int16, sr, format='WAV', subtype='PCM_16')
             audio = AudioSegment.from_file(buf, format="wav").set_frame_rate(24000)
-            out = io.BytesIO(); audio.export(out, format="wav"); out.seek(0)
+            out = io.BytesIO()
+            audio.export(out, format="wav")
+            out.seek(0)
             await self.websocket.send(json.dumps({
                 "type": "input_audio_buffer.append",
                 "audio": base64.b64encode(out.read()).decode()
@@ -56,10 +71,7 @@ class WebSocketClient:
             if data["type"] == "conversation.item.input_audio_transcription.delta":
                 self.transcript += data["delta"]
 
-    def enqueue_audio_chunk(self, sr, arr):
-        if not self.queue.full():
-            asyncio.run_coroutine_threadsafe(self.queue.put((sr, arr)), asyncio.get_event_loop())
-
+# Real-time transcription connection manager
 def create_ws():
     cid = str(uuid.uuid4())
     client = WebSocketClient(WS_URI, HEADERS, cid)
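Note: the reworked send_audio_chunks (previous hunk) spells out the audio pipeline step by step: downmix to mono, peak-normalize, encode as 16-bit PCM WAV, resample to the 24 kHz rate this app targets, then base64-encode the bytes into an input_audio_buffer.append event for the transcription websocket. For reference, the same logic as a self-contained helper; the function name prepare_chunk is hypothetical, not from app.py:

import base64, io, json
import numpy as np
import soundfile as sf
from pydub import AudioSegment

def prepare_chunk(sr, arr):
    # Downmix stereo to mono.
    if arr.ndim > 1:
        arr = arr.mean(axis=1)
    # Peak-normalize to [-1, 1], guarding against silent input.
    peak = np.max(np.abs(arr))
    if peak > 0:
        arr = arr / peak
    # Encode as 16-bit PCM WAV in memory.
    int16 = (arr * 32767).astype(np.int16)
    buf = io.BytesIO()
    sf.write(buf, int16, sr, format="WAV", subtype="PCM_16")
    buf.seek(0)
    # Resample to 24 kHz.
    audio = AudioSegment.from_file(buf, format="wav").set_frame_rate(24000)
    out = io.BytesIO()
    audio.export(out, format="wav")
    out.seek(0)
    # Wrap in the event the server consumes.
    return json.dumps({
        "type": "input_audio_buffer.append",
        "audio": base64.b64encode(out.read()).decode(),
    })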
@@ -68,13 +80,15 @@ def create_ws():
     return cid
 
 def send_audio(chunk, cid):
-    if cid not in connections: return "Connecting..."
+    if not cid or cid not in connections:
+        return "Connecting..."
     sr, arr = chunk
     connections[cid].enqueue_audio_chunk(sr, arr)
     return connections[cid].transcript
 
 def clear_transcript(cid):
-    if cid in connections: connections[cid].transcript = ""
+    if cid in connections:
+        connections[cid].transcript = ""
     return ""
 
 # ============ Chat Assistant ============
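Note: send_audio now also guards against a None cid, which can arrive while the socket is still being set up. These handlers consume the (sample_rate, numpy_array) tuples that a streaming gr.Audio input emits. A hedged sketch of how they would plausibly be wired up; the actual bindings are outside this diff, and the names demo, mic, cid_state, and transcript are assumptions:

import gradio as gr

with gr.Blocks() as demo:
    cid_state = gr.State()
    mic = gr.Audio(sources=["microphone"], streaming=True)
    transcript = gr.Textbox(label="Transcript")
    demo.load(fn=create_ws, outputs=cid_state)  # one socket per visitor
    mic.stream(fn=send_audio, inputs=[mic, cid_state], outputs=transcript)

demo.launch()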
@@ -116,7 +130,6 @@ def handle_chat(user_input, history, thread_id, image_url):
 with gr.Blocks(theme=gr.themes.Soft()) as app:
     gr.Markdown("# 📄 Document AI Assistant")
 
-    # STATES
     chat_state = gr.State([])
     thread_state = gr.State()
     image_state = gr.State()
@@ -141,7 +154,7 @@
     voice_transcript = gr.Textbox(label="Transcript", lines=2, interactive=False)
     clear_btn = gr.Button("🧹 Clear Transcript")
 
-    # FUNCTIONAL CONNECTIONS
+    # Functional bindings
     def toggle_voice(curr):
         return not curr, gr.update(visible=not curr)
 
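Note: toggle_voice returns two values, the flipped state and a visibility update, so it is meant to drive a state component and a container from a single click. A hypothetical binding; voice_state, voice_btn, and voice_panel are assumed names, not from this diff:

import gradio as gr

def toggle_voice(curr):
    return not curr, gr.update(visible=not curr)

with gr.Blocks() as demo:
    voice_state = gr.State(False)
    voice_btn = gr.Button("🎙️ Voice")
    with gr.Column(visible=False) as voice_panel:
        gr.Markdown("voice controls go here")
    voice_btn.click(fn=toggle_voice, inputs=voice_state,
                    outputs=[voice_state, voice_panel])

demo.launch()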
 