import gradio as gr
import os
import json
import uuid
import threading
import time
import re
from openai import OpenAI
from realtime_transcriber import WebSocketClient, connections, WEBSOCKET_URI, WEBSOCKET_HEADERS
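# `realtime_transcriber` is a local helper module (not shown in this file). Based on how
# it is used below, it provides the WebSocketClient class (with .run(), .transcript, and
# .enqueue_audio_chunk()), a shared `connections` dict keyed by client id, and the
# websocket endpoint/headers used for realtime speech-to-text.
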
# ------------------ Load API Key ------------------
from dotenv import load_dotenv
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ASSISTANT_ID = os.getenv("ASSISTANT_ID")
if not OPENAI_API_KEY or not ASSISTANT_ID:
    raise ValueError("Missing OPENAI_API_KEY or ASSISTANT_ID in environment variables")

client = OpenAI(api_key=OPENAI_API_KEY)
# ------------------ Chat Logic ------------------
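# Maps each Gradio chat session id to the OpenAI Assistants thread that backs it.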
session_threads = {}

def reset_session():
    session_id = str(uuid.uuid4())
    thread = client.beta.threads.create()
    session_threads[session_id] = thread.id
    return session_id

def process_chat(message, history, session_id):
    thread_id = session_threads.get(session_id)
    if not thread_id:
        thread_id = client.beta.threads.create().id
        session_threads[session_id] = thread_id

    client.beta.threads.messages.create(
        thread_id=thread_id,
        role="user",
        content=message
    )

    run = client.beta.threads.runs.create(
        thread_id=thread_id,
        assistant_id=ASSISTANT_ID
    )

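    # Assistant runs are asynchronous: poll the run until it reaches a terminal
    # state before reading the reply from the thread.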
    while True:
        run_status = client.beta.threads.runs.retrieve(
            thread_id=thread_id,
            run_id=run.id
        )
        if run_status.status == "completed":
            break
        if run_status.status in ("failed", "cancelled", "expired"):
            # Bail out instead of polling forever when the run ends unsuccessfully.
            return f"⚠️ Assistant run ended with status: {run_status.status}"
        time.sleep(1)

    # Messages are listed newest-first, so the first assistant message is the latest reply.
    messages = client.beta.threads.messages.list(thread_id=thread_id, order="desc")
    for msg in messages.data:
        if msg.role == "assistant":
            assistant_response = msg.content[0].text.value
            break
    else:
        assistant_response = "⚠️ Assistant did not respond."

    return assistant_response  # ✅ only returning text now

# ------------------ Transcription Logic ------------------
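# Each browser session gets its own websocket client, created on page load and looked
# up afterwards via the client_id stored in Gradio state.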
def create_websocket_client():
    client_id = str(uuid.uuid4())
    connections[client_id] = WebSocketClient(WEBSOCKET_URI, WEBSOCKET_HEADERS, client_id)
    threading.Thread(target=connections[client_id].run, daemon=True).start()
    return client_id

def clear_transcript(client_id):
    if client_id in connections:
        connections[client_id].transcript = ""
    return ""

def send_audio_chunk(audio, client_id):
    if client_id not in connections:
        return "Initializing connection..."
    sr, y = audio
    connections[client_id].enqueue_audio_chunk(sr, y)
    return connections[client_id].transcript

# ------------------ Gradio Interface ------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 Document AI + 🎙️ Voice Assistant")

    # Per-session state: the chat session id and the transcription client id.
    # Both are populated on page load (see the demo.load calls below) so that
    # each visitor gets their own thread and websocket connection.
    session_id = gr.State()
    client_id = gr.State()

    # ---------- Section 1: Chat Interface ----------
    with gr.Row():
        chatbot = gr.ChatInterface(
            fn=process_chat,
            additional_inputs=[session_id],
            examples=[
                ["What does clause 3.2 mean?"],
                ["Summarize the timeline from the image."]
            ],
            title="💬 Document Assistant"
        )

    # ---------- Section 2: Voice Transcription ----------
    gr.Markdown("## 🎙️ Realtime Voice Transcription")
    with gr.Row():
        transcript_box = gr.Textbox(label="Live Transcript", lines=7, interactive=False, autoscroll=True)
    with gr.Row():
        mic_input = gr.Audio(streaming=True)  # ✅ fixed for Hugging Face compatibility
        clear_button = gr.Button("Clear Transcript")

    mic_input.stream(fn=send_audio_chunk, inputs=[mic_input, client_id], outputs=transcript_box)
    clear_button.click(fn=clear_transcript, inputs=[client_id], outputs=transcript_box)

    # Create a fresh chat thread and a dedicated websocket client per page load.
    demo.load(fn=reset_session, outputs=session_id)
    demo.load(fn=create_websocket_client, outputs=client_id)

demo.launch()