IAMTFRMZA committed
Commit 81240ab · verified
1 Parent(s): 887c5ba

Update app.py

Files changed (1):
  1. app.py +25 -47
app.py CHANGED
@@ -5,10 +5,9 @@ import re
 import requests
 import tempfile
 import wave
-import av
 import numpy as np
 from openai import OpenAI
-from streamlit_webrtc import webrtc_streamer, WebRtcMode
+from streamlit_audio_recorder import audio_recorder
 
 # ------------------ Page Config ------------------
 st.set_page_config(page_title="Document AI Assistant", layout="wide")
@@ -26,9 +25,9 @@ if not OPENAI_API_KEY or not ASSISTANT_ID:
 client = OpenAI(api_key=OPENAI_API_KEY)
 
 # ------------------ Session State Init ------------------
-for key in ["messages", "thread_id", "image_url", "audio_buffer", "transcript"]:
+for key in ["messages", "thread_id", "image_url", "transcript"]:
     if key not in st.session_state:
-        st.session_state[key] = [] if key == "messages" or key == "audio_buffer" else None
+        st.session_state[key] = [] if key == "messages" else None
 
 # ------------------ Whisper Transcription ------------------
 def transcribe_audio(file_path, api_key):
@@ -41,23 +40,13 @@ def transcribe_audio(file_path, api_key):
     )
     return response.json().get("text", None)
 
-# ------------------ Audio Save Helper ------------------
-def save_wav(frames, path, rate=48000):
-    audio_data = np.concatenate(frames)
-    with wave.open(path, 'wb') as wf:
-        wf.setnchannels(1)
-        wf.setsampwidth(2)
-        wf.setframerate(rate)
-        wf.writeframes(audio_data.tobytes())
-
-# ------------------ Sidebar Controls ------------------
+# ------------------ Sidebar & Layout ------------------
 st.sidebar.header("🔧 Settings")
 if st.sidebar.button("🔄 Clear Chat"):
     st.session_state.messages = []
     st.session_state.thread_id = None
     st.session_state.image_url = None
     st.session_state.transcript = None
-    st.session_state.audio_buffer = []
     st.rerun()
 
 show_image = st.sidebar.checkbox("📖 Show Document Image", value=True)
@@ -68,44 +57,31 @@ with col1:
     if show_image and st.session_state.image_url:
         st.image(st.session_state.image_url, caption="📑 Extracted Page", use_container_width=True)
 
-# ------------------ Chat + Voice Panel ------------------
+# ------------------ Chat + Mic Panel ------------------
 with col2:
     for message in st.session_state.messages:
         st.chat_message(message["role"]).write(message["content"])
 
-    st.subheader("🎙️ Real-time Voice Input")
-    is_recording = st.checkbox("🎤 Start Recording")
+    st.subheader("🎙️ Ask with Your Voice")
 
-    if is_recording:
-        audio_ctx = webrtc_streamer(key="voice", mode=WebRtcMode.SENDONLY)
+    audio_bytes = audio_recorder(pause_threshold=3.0, energy_threshold=-1.0, sample_rate=44100)
+
+    if audio_bytes:
+        # Save temporary WAV file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
+            tmpfile.write(audio_bytes)
+            tmp_path = tmpfile.name
 
-        if audio_ctx.audio_receiver:
-            try:
-                audio_frames = []
-                while True:
-                    result = audio_ctx.audio_receiver.recv()
-                    audio_data = result.to_ndarray()
-                    audio_frames.append(audio_data)
-                    if len(audio_frames) > 30:
-                        break
-
-                tmp_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
-                save_wav(audio_frames, tmp_path)
-                st.audio(tmp_path, format="audio/wav")
-
-                with st.spinner("🧠 Transcribing..."):
-                    transcript = transcribe_audio(tmp_path, OPENAI_API_KEY)
-
-                if transcript:
-                    st.session_state.transcript = transcript
-                    st.success("📝 Transcript: " + transcript)
-                    with open(tmp_path, "rb") as f:
-                        st.download_button("⬇️ Download Audio", f, file_name="recording.wav", mime="audio/wav")
+        st.audio(tmp_path, format="audio/wav")
 
-            except Exception as e:
-                st.error(f"Recording failed: {str(e)}")
+        with st.spinner("🧠 Transcribing..."):
+            transcript = transcribe_audio(tmp_path, OPENAI_API_KEY)
 
-    # Confirm & send transcript
+        if transcript:
+            st.success("📝 Transcript: " + transcript)
+            st.session_state.transcript = transcript
+
+    # Submit Transcript to Assistant
     if st.session_state.transcript:
         if st.button("✅ Send Transcript to Assistant"):
             user_input = st.session_state.transcript
@@ -138,6 +114,7 @@ with col2:
             st.chat_message("assistant").write(assistant_message)
             st.session_state.messages.append({"role": "assistant", "content": assistant_message})
 
+            # Extract GitHub image if available
            image_match = re.search(
                 r'https://raw\.githubusercontent\.com/AndrewLORTech/surgical-pathology-manual/main/[\w\-/]*\.png',
                 assistant_message
@@ -148,8 +125,9 @@ with col2:
         except Exception as e:
             st.error(f"❌ Error: {str(e)}")
 
-    # Text input fallback
+    # Fallback text input
     if prompt := st.chat_input("💬 Or type your question..."):
         st.session_state.messages.append({"role": "user", "content": prompt})
         st.chat_message("user").write(prompt)
-        # Same logic could be duplicated here or modularized
+        st.session_state.transcript = prompt  # Treat like voice input for now
+
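A note on the new dependency: the recorder widget is a community Streamlit component, and the import path differs between the similarly named packages. The PyPI package audio-recorder-streamlit exposes an audio_recorder function that accepts the pause_threshold, energy_threshold, and sample_rate parameters used in this commit; the module name streamlit_audio_recorder in the diff may need adjusting to whichever package is actually installed. A minimal sketch of the new record-and-replay flow, assuming audio-recorder-streamlit:

import tempfile

import streamlit as st
from audio_recorder_streamlit import audio_recorder  # assumption: PyPI package audio-recorder-streamlit

# Returns encoded WAV bytes once the user clicks the mic and stops speaking,
# or None while nothing has been recorded yet.
audio_bytes = audio_recorder(pause_threshold=3.0, sample_rate=44100)

if audio_bytes:
    # delete=False so the file survives the `with` block and can be reopened
    # by path, both for playback here and for the Whisper upload later.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
        tmpfile.write(audio_bytes)
        tmp_path = tmpfile.name
    st.audio(tmp_path, format="audio/wav")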
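The body of transcribe_audio falls outside the hunks shown above; only its signature and the response.json().get("text", None) return are visible. For orientation, a sketch of what such a helper plausibly looks like, assuming a plain requests POST to OpenAI's /v1/audio/transcriptions endpoint with the whisper-1 model (consistent with the visible lines and the requests import, but not confirmed by this commit):

import requests

def transcribe_audio(file_path, api_key):
    # Sketch only: the actual body is not shown in the diff.
    # Uploads the recorded WAV to OpenAI's speech-to-text REST endpoint.
    with open(file_path, "rb") as f:
        response = requests.post(
            "https://api.openai.com/v1/audio/transcriptions",
            headers={"Authorization": f"Bearer {api_key}"},
            files={"file": ("audio.wav", f, "audio/wav")},
            data={"model": "whisper-1"},
        )
    return response.json().get("text", None)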
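The lines that consume image_match are likewise elided between the last two hunks. Since the left column renders st.session_state.image_url, the elided code presumably stores the matched URL for display; a hypothetical two-liner to that effect, not part of the commit:

if image_match:
    # Hypothetical: store the matched PNG URL so the left column can render it.
    st.session_state.image_url = image_match.group(0)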