import streamlit as st
import os
import time
import re
import requests
import tempfile
from openai import OpenAI
from audio_recorder_streamlit import audio_recorder  # pip package: audio-recorder-streamlit

# ------------------ Page Config ------------------
st.set_page_config(page_title="Document AI Assistant", layout="wide")
st.title("📄 Document AI Assistant")
st.caption("Chat with an AI Assistant about your medical/pathology documents")

# ------------------ Load Secrets ------------------
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
ASSISTANT_ID = os.environ.get("ASSISTANT_ID")

if not OPENAI_API_KEY or not ASSISTANT_ID:
    st.error("❌ Missing secrets. Please set both OPENAI_API_KEY and ASSISTANT_ID in the Hugging Face Space settings.")
    st.stop()

client = OpenAI(api_key=OPENAI_API_KEY)

# ------------------ Session State Init ------------------
for key in ["messages", "thread_id", "image_url", "transcript"]:
    if key not in st.session_state:
        st.session_state[key] = [] if key == "messages" else None

# ------------------ Whisper Transcription ------------------
def transcribe_audio(file_path, api_key):
    """Send a WAV file to the OpenAI transcription endpoint and return the text (or None on failure)."""
    with open(file_path, "rb") as f:
        response = requests.post(
            "https://api.openai.com/v1/audio/transcriptions",
            headers={"Authorization": f"Bearer {api_key}"},
            files={"file": f},
            data={"model": "whisper-1"},
        )
    if response.status_code != 200:
        return None
    return response.json().get("text")

# ------------------ Sidebar & Layout ------------------
st.sidebar.header("🔧 Settings")
if st.sidebar.button("🔄 Clear Chat"):
    st.session_state.messages = []
    st.session_state.thread_id = None
    st.session_state.image_url = None
    st.session_state.transcript = None
    st.rerun()

show_image = st.sidebar.checkbox("📖 Show Document Image", value=True)

col1, col2 = st.columns([1, 2])

# ------------------ Image Panel ------------------
with col1:
    if show_image and st.session_state.image_url:
        st.image(st.session_state.image_url, caption="📑 Extracted Page", use_container_width=True)

# ------------------ Chat + Mic Panel ------------------
with col2:
    # Replay the conversation so far
    for message in st.session_state.messages:
        st.chat_message(message["role"]).write(message["content"])

    st.subheader("🎙️ Ask with Your Voice")
    # energy_threshold=-1.0 starts recording immediately; recording stops after 3 s of silence
    audio_bytes = audio_recorder(pause_threshold=3.0, energy_threshold=-1.0, sample_rate=44100)

    if audio_bytes:
        # Save a temporary WAV file (delete=False so it survives past the `with` block)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
            tmpfile.write(audio_bytes)
            tmp_path = tmpfile.name

        st.audio(tmp_path, format="audio/wav")

        with st.spinner("🧠 Transcribing..."):
            transcript = transcribe_audio(tmp_path, OPENAI_API_KEY)
            if transcript:
                st.success("📝 Transcript: " + transcript)
                st.session_state.transcript = transcript

    # Submit Transcript to Assistant
    if st.session_state.transcript:
        if st.button("✅ Send Transcript to Assistant"):
            user_input = st.session_state.transcript
            st.session_state.transcript = None  # reset
            st.session_state.messages.append({"role": "user", "content": user_input})
            st.chat_message("user").write(user_input)

            try:
                # Create a thread on first use, then reuse it for the rest of the session
                if st.session_state.thread_id is None:
                    thread = client.beta.threads.create()
                    st.session_state.thread_id = thread.id
                thread_id = st.session_state.thread_id

                client.beta.threads.messages.create(thread_id=thread_id, role="user", content=user_input)
                run = client.beta.threads.runs.create(thread_id=thread_id, assistant_id=ASSISTANT_ID)

                # Poll the run until it reaches a terminal state
                with st.spinner("🤖 Assistant is thinking..."):
                    while True:
                        run_status = client.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run.id)
                        if run_status.status == "completed":
                            break
                        if run_status.status in ("failed", "cancelled", "expired"):
                            raise RuntimeError(f"Run ended with status: {run_status.status}")
                        time.sleep(1)

                # messages.list returns newest-first, so the first assistant entry is the latest reply
                messages = client.beta.threads.messages.list(thread_id=thread_id)
                assistant_message = next(
                    (m.content[0].text.value for m in messages.data if m.role == "assistant"),
                    None,
                )

                if assistant_message:
                    st.chat_message("assistant").write(assistant_message)
                    st.session_state.messages.append({"role": "assistant", "content": assistant_message})

                    # Extract GitHub image if available
                    image_match = re.search(
                        r'https://raw\.githubusercontent\.com/AndrewLORTech/surgical-pathology-manual/main/[\w\-/]*\.png',
                        assistant_message
                    )
                    if image_match:
                        st.session_state.image_url = image_match.group(0)

            except Exception as e:
                st.error(f"❌ Error: {str(e)}")

    # Fallback text input — route typed questions through the same transcript flow as voice,
    # so the message is only appended once (by the send handler above)
    if prompt := st.chat_input("💬 Or type your question..."):
        st.session_state.transcript = prompt  # treat like voice input for now
        st.rerun()
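
# ------------------ Dependencies (sketch) ------------------
# A minimal requirements.txt for running this script on a Hugging Face Space, inferred
# from the imports above. Package names (and the absence of version pins) are an
# assumption, not something the app itself specifies:
#
#   streamlit
#   openai
#   requests
#   audio-recorder-streamlit   # provides the `audio_recorder` widget used above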