IAMTFRMZA committed on
Commit 51a57c1 · verified · 1 Parent(s): b1ba257

Update app.py

Files changed (1):
  1. app.py (+39, -22)
app.py CHANGED
@@ -13,6 +13,10 @@ from pydub import AudioSegment
 import time
 import uuid
 
+# =========================
+# Setup & Configuration
+# =========================
+
 class LogColors:
     OK = '\033[94m'
     SUCCESS = '\033[92m'
@@ -31,11 +35,13 @@ WEBSOCKET_HEADERS = {
     "OpenAI-Beta": "realtime=v1"
 }
 
-css = """
-"""
-
+css = ""
 connections = {}
 
+# =========================
+# WebSocket Client Class
+# =========================
+
 class WebSocketClient:
     def __init__(self, uri: str, headers: dict, client_id: str):
         self.uri = uri
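Aside: `connections` is the module-level registry of clients keyed by a per-session id, and `create_new_websocket_connection`, which populates it, is referenced by `demo.load()` at the end of the file but never shown in this diff. A plausible reconstruction for orientation only; the body, `WEBSOCKET_URI`, and the `run()` entry point are assumptions, not code from this commit:

import threading
import uuid

# Hypothetical reconstruction -- create_new_websocket_connection is called by
# demo.load() later in the diff but its body is not part of this commit.
# WEBSOCKET_URI and WebSocketClient.run() are assumed names.
def create_new_websocket_connection():
    client_id = str(uuid.uuid4())
    connections[client_id] = WebSocketClient(WEBSOCKET_URI, WEBSOCKET_HEADERS, client_id)
    threading.Thread(target=connections[client_id].run, daemon=True).start()
    return client_id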
@@ -51,7 +57,6 @@ class WebSocketClient:
         self.websocket = await connect(self.uri, additional_headers=self.headers)
         print(f"{LogColors.SUCCESS}Connected to OpenAI WebSocket{LogColors.ENDC}\n")
 
-        # Send session settings to OpenAI
         with open("openai_transcription_settings.json", "r") as f:
             settings = f.read()
             await self.websocket.send(settings)
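The commit drops the "Send session settings to OpenAI" comment but keeps the behavior: openai_transcription_settings.json is sent verbatim as the first message on the socket. The file's contents are not part of this diff; a plausible shape, following the OpenAI Realtime API's session.update event, is sketched below, with the model and turn-detection values as assumptions:

import json

# Hypothetical contents of openai_transcription_settings.json -- inferred,
# not taken from this commit. Field names follow the Realtime API's
# "session.update" event; the chosen values are assumptions.
settings = {
    "type": "session.update",
    "session": {
        "input_audio_format": "pcm16",  # matches the PCM_16 WAV built below
        "input_audio_transcription": {"model": "whisper-1"},
        "turn_detection": {"type": "server_vad"},
    },
}

with open("openai_transcription_settings.json", "w") as f:
    json.dump(settings, f, indent=2)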
@@ -73,7 +78,7 @@
                 delta = message_object["delta"]
                 self.transcript += delta
             elif message_object["type"] == "conversation.item.input_audio_transcription.completed":
-                self.transcript += ' ' if len(self.transcript) and self.transcript[-1] != ' ' else ''
+                self.transcript += ' ' if self.transcript and self.transcript[-1] != ' ' else ''
             else:
                 print(f"{LogColors.ERROR}Error: {message}{LogColors.ENDC}")
 
@@ -82,23 +87,18 @@
             audio_data = await self.queue.get()
             sample_rate, audio_array = audio_data
             if self.websocket:
-                # Convert to mono if stereo
                 if audio_array.ndim > 1:
                     audio_array = audio_array.mean(axis=1)
-
-                # Convert to float32 and normalize
                 audio_array = audio_array.astype(np.float32)
                 audio_array /= np.max(np.abs(audio_array)) if np.max(np.abs(audio_array)) > 0 else 1.0
-
-                # Convert to 16-bit PCM
                 audio_array_int16 = (audio_array * 32767).astype(np.int16)
-
+
                 audio_buffer = io.BytesIO()
                 sf.write(audio_buffer, audio_array_int16, sample_rate, format='WAV', subtype='PCM_16')
                 audio_buffer.seek(0)
                 audio_segment = AudioSegment.from_file(audio_buffer, format="wav")
                 resampled_audio = audio_segment.set_frame_rate(24000)
-
+
                 output_buffer = io.BytesIO()
                 resampled_audio.export(output_buffer, format="wav")
                 output_buffer.seek(0)
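The hunk above is the whole audio path in one place: downmix stereo to mono, normalize to [-1, 1], quantize to 16-bit PCM, then resample to the 24 kHz the realtime endpoint expects. A standalone sketch of the same pipeline for reference; the helper name prepare_chunk is hypothetical, not a function in app.py:

import io

import numpy as np
import soundfile as sf
from pydub import AudioSegment

# Sketch of the preprocessing performed in the hunk above; prepare_chunk is
# an illustrative name, not code from this commit.
def prepare_chunk(sample_rate: int, audio_array: np.ndarray) -> bytes:
    # Stereo -> mono, matching the commit's mean(axis=1) downmix.
    if audio_array.ndim > 1:
        audio_array = audio_array.mean(axis=1)
    # Normalize to [-1, 1] in float32, guarding against silent chunks.
    audio_array = audio_array.astype(np.float32)
    peak = np.max(np.abs(audio_array))
    audio_array /= peak if peak > 0 else 1.0
    # Quantize to 16-bit PCM and wrap in an in-memory WAV.
    pcm16 = (audio_array * 32767).astype(np.int16)
    buf = io.BytesIO()
    sf.write(buf, pcm16, sample_rate, format="WAV", subtype="PCM_16")
    buf.seek(0)
    # Resample to 24 kHz before sending to the realtime endpoint.
    resampled = AudioSegment.from_file(buf, format="wav").set_frame_rate(24000)
    out = io.BytesIO()
    resampled.export(out, format="wav")
    return out.getvalue()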
@@ -124,6 +124,10 @@
         print(f"{LogColors.WARNING}WebSocket connection closed{LogColors.ENDC}")
 
 
+# =========================
+# Helper Functions
+# =========================
+
 def send_audio_chunk(new_chunk: gr.Audio, client_id: str):
     if client_id not in connections:
         return "Connection is being established, please try again in a few seconds."
@@ -142,21 +146,34 @@ def clear_transcript(client_id):
     connections[client_id].transcript = ""
     return ""
 
-if __name__ == "__main__":
-    with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
-        gr.Markdown(f"# Realtime transcription demo")
+# =========================
+# Gradio UI Sections
+# =========================
+
+with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
+
+    with gr.Tab("💬 Chat Assistant"):
+        gr.Markdown("### Chat Section (Coming Soon)")
+        gr.Textbox(label="Your question")
+        gr.Button("Send")
+
+    with gr.Tab("📄 Document Viewer"):
+        gr.Markdown("### Upload and View Documents")
+        gr.File(label="Upload Document", file_types=[".pdf", ".txt", ".docx"])
+        gr.Textbox(label="Document Preview", lines=10)
+
+    with gr.Tab("🎤 Voice Transcription"):
+        gr.Markdown("### Realtime Voice-to-Text Transcription")
+        with gr.Row():
+            output_textbox = gr.Textbox(label="Transcript", lines=7, interactive=False, autoscroll=True)
         with gr.Row():
-            with gr.Column():
-                output_textbox = gr.Textbox(label="Transcript", value="", lines=7, interactive=False, autoscroll=True)
-            with gr.Row():
             with gr.Column(scale=5):
                 audio_input = gr.Audio(streaming=True, format="wav")
             with gr.Column():
-                clear_button = gr.Button("Clear")
-
+                clear_button = gr.Button("Clear Transcript")
         client_id = gr.State()
         clear_button.click(clear_transcript, inputs=[client_id], outputs=[output_textbox])
-        audio_input.stream(send_audio_chunk, [audio_input, client_id], [output_textbox], stream_every=0.5, concurrency_limit=None)
+        audio_input.stream(send_audio_chunk, [audio_input, client_id], [output_textbox], stream_every=0.5)
         demo.load(create_new_websocket_connection, outputs=[client_id])
 
-    demo.launch()
+demo.launch()
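For context on the wiring above: with streaming=True, Gradio invokes the .stream() callback roughly every stream_every seconds and passes each chunk as a (sample_rate, numpy_array) tuple, which is what send_audio_chunk forwards to the client's queue. A minimal self-contained sketch of that contract; the names here are illustrative, not from app.py:

import gradio as gr
import numpy as np

# Minimal sketch of Gradio's streaming-audio contract as used above;
# on_chunk and its labels are illustrative, not code from this commit.
def on_chunk(chunk: tuple[int, np.ndarray]) -> str:
    sample_rate, samples = chunk  # ~0.5 s of microphone audio per call
    return f"received {samples.shape[0]} samples at {sample_rate} Hz"

with gr.Blocks() as sketch:
    mic = gr.Audio(sources=["microphone"], streaming=True, format="wav")
    status = gr.Textbox(label="Status")
    mic.stream(on_chunk, inputs=[mic], outputs=[status], stream_every=0.5)

sketch.launch()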
 