Update app.py
app.py CHANGED
@@ -1,13 +1,14 @@
+# top of the file
 import gradio as gr
 import os, time, re, json, base64, asyncio, threading, uuid, io
 import numpy as np
 import soundfile as sf
 from pydub import AudioSegment
 from openai import OpenAI
 from websockets import connect
 from dotenv import load_dotenv
 
-#
+# Load secrets
 load_dotenv()
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 ASSISTANT_ID = os.getenv("ASSISTANT_ID")
@@ -17,34 +18,48 @@ HEADERS = {"Authorization": f"Bearer {OPENAI_API_KEY}", "OpenAI-Beta": "realtime
 WS_URI = "wss://api.openai.com/v1/realtime?intent=transcription"
 connections = {}
 
-#
+# WebSocket Client
 class WebSocketClient:
     def __init__(self, uri, headers, client_id):
-        self.uri
+        self.uri = uri
+        self.headers = headers
+        self.client_id = client_id
         self.websocket = None
         self.queue = asyncio.Queue(maxsize=10)
         self.transcript = ""
+        self.loop = asyncio.new_event_loop()
 
     async def connect(self):
+        try:
+            self.websocket = await connect(self.uri, additional_headers=self.headers)
+            with open("openai_transcription_settings.json", "r") as f:
+                await self.websocket.send(f.read())
+            await asyncio.gather(self.receive_messages(), self.send_audio_chunks())
+        except Exception as e:
+            print(f"🔴 WebSocket Connection Failed: {e}")
 
     def run(self):
+        asyncio.set_event_loop(self.loop)
+        self.loop.run_until_complete(self.connect())
+
+    def enqueue_audio_chunk(self, sr, arr):
+        if not self.queue.full():
+            asyncio.run_coroutine_threadsafe(self.queue.put((sr, arr)), self.loop)
 
     async def send_audio_chunks(self):
         while True:
             sr, arr = await self.queue.get()
             if arr.ndim > 1:
+                arr = arr.mean(axis=1)
+            if np.max(np.abs(arr)) > 0:
+                arr = arr / np.max(np.abs(arr))
             int16 = (arr * 32767).astype(np.int16)
             buf = io.BytesIO()
+            sf.write(buf, int16, sr, format='WAV', subtype='PCM_16')
             audio = AudioSegment.from_file(buf, format="wav").set_frame_rate(24000)
             out = io.BytesIO()
+            audio.export(out, format="wav")
+            out.seek(0)
             await self.websocket.send(json.dumps({
                 "type": "input_audio_buffer.append",
                 "audio": base64.b64encode(out.read()).decode()
@@ -56,10 +71,7 @@ class WebSocketClient:
         if data["type"] == "conversation.item.input_audio_transcription.delta":
             self.transcript += data["delta"]
 
-        if not self.queue.full():
-            asyncio.run_coroutine_threadsafe(self.queue.put((sr, arr)), asyncio.get_event_loop())
-
+# Real-time transcription connection manager
 def create_ws():
     cid = str(uuid.uuid4())
     client = WebSocketClient(WS_URI, HEADERS, cid)
@@ -68,13 +80,15 @@ def create_ws():
     return cid
 
 def send_audio(chunk, cid):
-    if cid not in connections:
+    if not cid or cid not in connections:
+        return "Connecting..."
     sr, arr = chunk
     connections[cid].enqueue_audio_chunk(sr, arr)
     return connections[cid].transcript
 
 def clear_transcript(cid):
     if cid in connections:
+        connections[cid].transcript = ""
     return ""
 
 # ============ Chat Assistant ============
@@ -116,7 +130,6 @@ def handle_chat(user_input, history, thread_id, image_url):
 with gr.Blocks(theme=gr.themes.Soft()) as app:
     gr.Markdown("# 📄 Document AI Assistant")
 
-    # STATES
     chat_state = gr.State([])
     thread_state = gr.State()
     image_state = gr.State()
@@ -141,7 +154,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
     voice_transcript = gr.Textbox(label="Transcript", lines=2, interactive=False)
     clear_btn = gr.Button("🧹 Clear Transcript")
 
-    #
+    # Functional bindings
     def toggle_voice(curr):
         return not curr, gr.update(visible=not curr)
 
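The hunks show only the delta-handling branch of receive_messages; its surrounding lines are collapsed in the viewer. A minimal sketch of how that receive loop plausibly looks, assuming the server delivers JSON text frames over the websockets connection (the method name and the two visible lines come from the diff; the iteration pattern is an assumption):

# Sketch only: async iteration over the socket is assumed, not shown in the diff.
async def receive_messages(self):
    async for message in self.websocket:
        data = json.loads(message)
        if data["type"] == "conversation.item.input_audio_transcription.delta":
            self.transcript += data["delta"]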
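The body of create_ws between the WebSocketClient construction and return cid is likewise collapsed. Given that threading is imported and run() installs its own event loop, the collapsed lines presumably register the client and start it on a background thread, roughly:

# Hypothetical completion of create_ws(); the collapsed lines are not visible in the diff.
def create_ws():
    cid = str(uuid.uuid4())
    client = WebSocketClient(WS_URI, HEADERS, cid)
    threading.Thread(target=client.run, daemon=True).start()  # run() blocks on its own loop
    connections[cid] = client
    return cid

That per-client loop is what the new enqueue_audio_chunk relies on: asyncio.run_coroutine_threadsafe(..., self.loop) hands chunks from Gradio's worker thread to the client's loop, whereas the removed version targeted asyncio.get_event_loop() in the calling thread, which does not run a loop there.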
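For reference, the chunk-encoding path added to send_audio_chunks (mono downmix, peak-normalize, 16-bit PCM WAV, resample to 24 kHz, base64) can be rehearsed offline. This standalone snippet applies the same steps to a synthetic sine chunk standing in for microphone input; only the test signal is invented, and the buffer is explicitly rewound before handing it to pydub:

# Offline rehearsal of the encoding steps from send_audio_chunks.
import base64, io
import numpy as np
import soundfile as sf
from pydub import AudioSegment

sr = 48000
t = np.linspace(0, 0.1, int(sr * 0.1), endpoint=False)
arr = 0.5 * np.sin(2 * np.pi * 440 * t)           # mono float chunk, like Gradio yields
if arr.ndim > 1:
    arr = arr.mean(axis=1)                         # downmix stereo to mono
if np.max(np.abs(arr)) > 0:
    arr = arr / np.max(np.abs(arr))                # peak-normalize to [-1, 1]
int16 = (arr * 32767).astype(np.int16)
buf = io.BytesIO()
sf.write(buf, int16, sr, format='WAV', subtype='PCM_16')
buf.seek(0)                                        # rewind before pydub reads the buffer
audio = AudioSegment.from_file(buf, format="wav").set_frame_rate(24000)
out = io.BytesIO()
audio.export(out, format="wav")
out.seek(0)
payload = base64.b64encode(out.read()).decode()    # value of the "audio" field
print(len(payload))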
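The hunks stop before the event bindings, so how create_ws, send_audio, and clear_transcript attach to the UI is not shown. A plausible wiring inside the Blocks context, assuming a streaming microphone component (mic and cid_state are hypothetical names; only voice_transcript and clear_btn appear in the diff):

# Hypothetical bindings; component names other than voice_transcript/clear_btn are invented.
cid_state = gr.State()
mic = gr.Audio(sources=["microphone"], streaming=True, label="Speak")

app.load(create_ws, outputs=[cid_state])                       # one WebSocket client per session
mic.stream(send_audio, [mic, cid_state], [voice_transcript])   # stream chunks, show transcript
clear_btn.click(clear_transcript, [cid_state], [voice_transcript])

Until create_ws has registered the connection, send_audio returns "Connecting...", which simply appears in the transcript box.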