bcci committed
Commit d1c4428 · verified · 1 Parent(s): d31adca

Update app.py

Files changed (1)
  1. app.py +133 -128
app.py CHANGED
@@ -76,7 +76,6 @@ async def websocket_endpoint(websocket: WebSocket):
 
     try:
         while True:
-            # Wait for the next audio chunk (sent as binary data)
             data = await websocket.receive()
             if data["type"] == "websocket.receive":
                 if data.get("text") == "switch_to_tiny":
@@ -86,36 +85,30 @@ async def websocket_endpoint(websocket: WebSocket):
                     current_model = transcriber_base
                     continue
 
-                # Convert the 16-bit PCM data to float32.
                 chunk = pcm16_to_float32(data["bytes"])
                 speech = np.concatenate((speech, chunk))
-
                 if not recording:
-                    # Retain only the last few chunks when not recording.
                     speech = speech[-lookback_size:]
 
-                # Process VAD on the current chunk.
                 vad_result = vad_iterator(chunk)
                 current_time = time.time()
 
                 if vad_result:
-                    # If VAD signals the start of speech and we're not already recording.
                     if "start" in vad_result and not recording:
                         recording = True
-                        start_time = current_time
-                    # If VAD signals the end of speech.
+                        await websocket.send_json({"type": "status", "message": "speaking_started"})
+
                     if "end" in vad_result and recording:
                         recording = False
                         text = current_model(speech)
                         await websocket.send_json({"type": "final", "transcript": text})
                         caption_cache.append(text)
                         speech = np.empty(0, dtype=np.float32)
-                        # Reset VAD state.
                         vad_iterator.triggered = False
                         vad_iterator.temp_end = 0
                         vad_iterator.current_sample = 0
+                        await websocket.send_json({"type": "status", "message": "speaking_stopped"})
                 elif recording:
-                    # If speech goes on too long, force an end.
                     if (len(speech) / SAMPLING_RATE) > MAX_SPEECH_SECS:
                         recording = False
                         text = current_model(speech)
@@ -125,7 +118,8 @@ async def websocket_endpoint(websocket: WebSocket):
                         vad_iterator.triggered = False
                         vad_iterator.temp_end = 0
                         vad_iterator.current_sample = 0
-                        # Send partial transcription updates periodically.
+                        await websocket.send_json({"type": "status", "message": "speaking_stopped"})
+
                 if (current_time - last_partial_time) > MIN_REFRESH_SECS:
                     text = current_model(speech)
                     if last_output != text:
@@ -133,7 +127,6 @@ async def websocket_endpoint(websocket: WebSocket):
                         await websocket.send_json({"type": "partial", "transcript": text})
                         last_partial_time = current_time
     except WebSocketDisconnect:
-        # If the client disconnects, send any final transcript if available.
         if recording and speech.size:
             text = current_model(speech)
             await websocket.send_json({"type": "final", "transcript": text})
@@ -151,123 +144,135 @@ async def get_home():
     </head>
     <body class="bg-gray-100 p-6">
         <div class="max-w-3xl mx-auto bg-white p-6 rounded-lg shadow-md">
-            <h1 class="text-2xl font-bold mb-4">Realtime Transcription</h1>
-            <button onclick="startTranscription()" class="bg-blue-500 text-white px-4 py-2 rounded mb-4">Start Transcription</button>
-            <select id="modelSelect" onchange="switchModel()" class="bg-gray-200 px-4 py-2 rounded mb-4">
-                <option value="tiny">Tiny Model</option>
-                <option value="base">Base Model</option>
-            </select>
-            <p id="status" class="text-gray-600 mb-4">Click start to begin transcription.</p>
-            <div id="transcription" class="border p-4 rounded mb-4 h-64 overflow-auto"></div>
-            <div id="visualizer" class="border p-4 rounded h-64">
-                <canvas id="audioCanvas" class="w-full h-full"></canvas>
-            </div>
+            <h1 class="text-2xl font-bold mb-4">Realtime Transcription</h1>
+            <button onclick="startTranscription()" class="bg-blue-500 text-white px-4 py-2 rounded mb-4">Start Transcription</button>
+            <select id="modelSelect" onchange="switchModel()" class="bg-gray-200 px-4 py-2 rounded mb-4">
+                <option value="tiny">Tiny Model</option>
+                <option value="base">Base Model</option>
+            </select>
+            <p id="status" class="text-gray-600 mb-4">Click start to begin transcription.</p>
+            <p id="speakingStatus" class="text-gray-600 mb-4"></p>
+            <div id="transcription" class="border p-4 rounded mb-4 h-64 overflow-auto"></div>
+            <div id="visualizer" class="border p-4 rounded h-64">
+                <canvas id="audioCanvas" class="w-full h-full"></canvas>
+            </div>
         </div>
-        <script>
-            let ws;
-            let audioContext;
-            let scriptProcessor;
-            let mediaStream;
-            let currentLine = document.createElement('span');
-            let analyser;
-            let canvas, canvasContext;
-
-            document.getElementById('transcription').appendChild(currentLine);
-            canvas = document.getElementById('audioCanvas');
-            canvasContext = canvas.getContext('2d');
-
-            async function startTranscription() {
-                document.getElementById("status").innerText = "Connecting...";
-                ws = new WebSocket("wss://" + location.host + "/ws/transcribe");
-                ws.binaryType = 'arraybuffer';
-                ws.onopen = async function() {
-                    document.getElementById("status").innerText = "Connected";
-                    try {
-                        mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
-                        audioContext = new AudioContext({ sampleRate: 16000 });
-                        const source = audioContext.createMediaStreamSource(mediaStream);
-                        analyser = audioContext.createAnalyser();
-                        analyser.fftSize = 2048;
-                        const bufferLength = analyser.frequencyBinCount;
-                        const dataArray = new Uint8Array(bufferLength);
-                        source.connect(analyser);
-                        scriptProcessor = audioContext.createScriptProcessor(512, 1, 1);
-                        scriptProcessor.onaudioprocess = function(event) {
-                            const inputData = event.inputBuffer.getChannelData(0);
-                            const pcm16 = floatTo16BitPCM(inputData);
-                            if (ws.readyState === WebSocket.OPEN) {
-                                ws.send(pcm16);
-                            }
-                            analyser.getByteTimeDomainData(dataArray);
-                            canvasContext.fillStyle = 'rgb(200, 200, 200)';
-                            canvasContext.fillRect(0, 0, canvas.width, canvas.height);
-                            canvasContext.lineWidth = 2;
-                            canvasContext.strokeStyle = 'rgb(0, 0, 0)';
-                            canvasContext.beginPath();
-                            let sliceWidth = canvas.width * 1.0 / bufferLength;
-                            let x = 0;
-                            for (let i = 0; i < bufferLength; i++) {
-                                let v = dataArray[i] / 128.0;
-                                let y = v * canvas.height / 2;
-                                if (i === 0) {
-                                    canvasContext.moveTo(x, y);
-                                } else {
-                                    canvasContext.lineTo(x, y);
-                                }
-                                x += sliceWidth;
-                            }
-                            canvasContext.lineTo(canvas.width, canvas.height / 2);
-                            canvasContext.stroke();
-                        };
-                        source.connect(scriptProcessor);
-                        scriptProcessor.connect(audioContext.destination);
-                    } catch (err) {
-                        document.getElementById("status").innerText = "Error: " + err;
-                    }
-                };
-                ws.onmessage = function(event) {
-                    const data = JSON.parse(event.data);
-                    if (data.type === 'partial') {
-                        currentLine.style.color = 'gray';
-                        currentLine.textContent = data.transcript + ' ';
-                    } else if (data.type === 'final') {
-                        currentLine.style.color = 'black';
-                        currentLine.textContent = data.transcript;
-                        currentLine = document.createElement('span');
-                        document.getElementById('transcription').appendChild(document.createElement('br'));
-                        document.getElementById('transcription').appendChild(currentLine);
-                    }
-                };
-                ws.onclose = function() {
-                    if (audioContext && audioContext.state !== 'closed') {
-                        audioContext.close();
-                    }
-                    document.getElementById("status").innerText = "Closed";
-                };
-            }
-
-            function switchModel() {
-                const model = document.getElementById("modelSelect").value;
-                if (ws && ws.readyState === WebSocket.OPEN) {
-                    if (model === "tiny") {
-                        ws.send("switch_to_tiny");
-                    } else if (model === "base") {
-                        ws.send("switch_to_base");
-                    }
-                }
-            }
-
-            function floatTo16BitPCM(input) {
-                const buffer = new ArrayBuffer(input.length * 2);
-                const output = new DataView(buffer);
-                for (let i = 0; i < input.length; i++) {
-                    let s = Math.max(-1, Math.min(1, input[i]));
-                    output.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
-                }
-                return buffer;
-            }
-        </script>
-    </body>
+        <script>
+            let ws;
+            let audioContext;
+            let scriptProcessor;
+            let mediaStream;
+            let currentLine = document.createElement('span');
+            let analyser;
+            let canvas, canvasContext;
+
+            document.getElementById('transcription').appendChild(currentLine);
+            canvas = document.getElementById('audioCanvas');
+            canvasContext = canvas.getContext('2d');
+
+            async function startTranscription() {
+                document.getElementById("status").innerText = "Connecting...";
+                ws = new WebSocket("wss://" + location.host + "/ws/transcribe");
+                ws.binaryType = 'arraybuffer';
+
+                ws.onopen = async function() {
+                    document.getElementById("status").innerText = "Connected";
+                    try {
+                        mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
+                        audioContext = new AudioContext({ sampleRate: 16000 });
+                        const source = audioContext.createMediaStreamSource(mediaStream);
+                        analyser = audioContext.createAnalyser();
+                        analyser.fftSize = 2048;
+                        const bufferLength = analyser.frequencyBinCount;
+                        const dataArray = new Uint8Array(bufferLength);
+                        source.connect(analyser);
+                        scriptProcessor = audioContext.createScriptProcessor(512, 1, 1);
+                        scriptProcessor.onaudioprocess = function(event) {
+                            const inputData = event.inputBuffer.getChannelData(0);
+                            const pcm16 = floatTo16BitPCM(inputData);
+                            if (ws.readyState === WebSocket.OPEN) {
+                                ws.send(pcm16);
+                            }
+                            analyser.getByteTimeDomainData(dataArray);
+                            canvasContext.fillStyle = 'rgb(200, 200, 200)';
+                            canvasContext.fillRect(0, 0, canvas.width, canvas.height);
+                            canvasContext.lineWidth = 2;
+                            canvasContext.strokeStyle = 'rgb(0, 0, 0)';
+                            canvasContext.beginPath();
+                            let sliceWidth = canvas.width * 1.0 / bufferLength;
+                            let x = 0;
+                            for (let i = 0; i < bufferLength; i++) {
+                                let v = dataArray[i] / 128.0;
+                                let y = v * canvas.height / 2;
+                                if (i === 0) {
+                                    canvasContext.moveTo(x, y);
+                                } else {
+                                    canvasContext.lineTo(x, y);
+                                }
+                                x += sliceWidth;
+                            }
+                            canvasContext.lineTo(canvas.width, canvas.height / 2);
+                            canvasContext.stroke();
+                        };
+                        source.connect(scriptProcessor);
+                        scriptProcessor.connect(audioContext.destination);
+                    } catch (err) {
+                        document.getElementById("status").innerText = "Error: " + err;
+                    }
+                };
+
+                ws.onmessage = function(event) {
+                    const data = JSON.parse(event.data);
+                    if (data.type === 'partial') {
+                        currentLine.style.color = 'gray';
+                        currentLine.textContent = data.transcript + ' ';
+                    } else if (data.type === 'final') {
+                        currentLine.style.color = 'black';
+                        currentLine.textContent = data.transcript;
+                        currentLine = document.createElement('span');
+                        document.getElementById('transcription').appendChild(document.createElement('br'));
+                        document.getElementById('transcription').appendChild(currentLine);
+                    } else if (data.type === 'status') {
+                        if (data.message === 'speaking_started') {
+                            document.getElementById("speakingStatus").innerText = "Speaking Started";
+                            document.getElementById("speakingStatus").style.color = "green";
+                        } else if (data.message === 'speaking_stopped') {
+                            document.getElementById("speakingStatus").innerText = "Speaking Stopped";
+                            document.getElementById("speakingStatus").style.color = "red";
+                        }
+                    }
+                };
+
+                ws.onclose = function() {
+                    if (audioContext && audioContext.state !== 'closed') {
+                        audioContext.close();
+                    }
+                    document.getElementById("status").innerText = "Closed";
+                };
+            }
+
+            function switchModel() {
+                const model = document.getElementById("modelSelect").value;
+                if (ws && ws.readyState === WebSocket.OPEN) {
+                    if (model === "tiny") {
+                        ws.send("switch_to_tiny");
+                    } else if (model === "base") {
+                        ws.send("switch_to_base");
+                    }
+                }
+            }
+
+            function floatTo16BitPCM(input) {
+                const buffer = new ArrayBuffer(input.length * 2);
+                const output = new DataView(buffer);
+                for (let i = 0; i < input.length; i++) {
+                    let s = Math.max(-1, Math.min(1, input[i]));
+                    output.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
+                }
+                return buffer;
+            }
+        </script>
+    </body>
     </html>
     """
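
To exercise the new "status" messages without the browser page, a throwaway client can stream PCM frames at the endpoint and print whatever the server sends back. A sketch only: it assumes the app is reachable at localhost:8000 and uses the third-party websockets package, neither of which is part of this commit; silent audio will not trigger the VAD, so real speech is needed to observe speaking_started and speaking_stopped.

    import asyncio
    import json
    import numpy as np
    import websockets

    async def main():
        async with websockets.connect("ws://localhost:8000/ws/transcribe") as ws:
            # Two seconds of silent 16-bit PCM at 16 kHz, sent in 512-sample
            # (1024-byte) chunks to mimic the browser's ScriptProcessor.
            pcm = np.zeros(2 * 16000, dtype=np.int16).tobytes()
            for i in range(0, len(pcm), 1024):
                await ws.send(pcm[i:i + 1024])
            # Drain any status/partial/final JSON messages queued so far.
            try:
                while True:
                    msg = json.loads(await asyncio.wait_for(ws.recv(), timeout=1.0))
                    print(msg)
            except asyncio.TimeoutError:
                pass

    asyncio.run(main())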