Update app.py
app.py (CHANGED)
@@ -76,7 +76,6 @@ async def websocket_endpoint(websocket: WebSocket):
 
     try:
         while True:
-            # Wait for the next audio chunk (sent as binary data)
             data = await websocket.receive()
             if data["type"] == "websocket.receive":
                 if data.get("text") == "switch_to_tiny":
@@ -86,36 +85,30 @@ async def websocket_endpoint(websocket: WebSocket):
                     current_model = transcriber_base
                     continue
 
-                # Convert the 16-bit PCM data to float32.
                 chunk = pcm16_to_float32(data["bytes"])
                 speech = np.concatenate((speech, chunk))
-
                 if not recording:
-                    # Retain only the last few chunks when not recording.
                     speech = speech[-lookback_size:]
 
-                # Process VAD on the current chunk.
                 vad_result = vad_iterator(chunk)
                 current_time = time.time()
 
                 if vad_result:
-                    # If VAD signals the start of speech and we're not already recording.
                     if "start" in vad_result and not recording:
                         recording = True
-
-
+                        await websocket.send_json({"type": "status", "message": "speaking_started"})
+
                     if "end" in vad_result and recording:
                         recording = False
                         text = current_model(speech)
                         await websocket.send_json({"type": "final", "transcript": text})
                         caption_cache.append(text)
                         speech = np.empty(0, dtype=np.float32)
-                        # Reset VAD state.
                         vad_iterator.triggered = False
                         vad_iterator.temp_end = 0
                         vad_iterator.current_sample = 0
+                        await websocket.send_json({"type": "status", "message": "speaking_stopped"})
                 elif recording:
-                    # If speech goes on too long, force an end.
                     if (len(speech) / SAMPLING_RATE) > MAX_SPEECH_SECS:
                         recording = False
                         text = current_model(speech)
@@ -125,7 +118,8 @@ async def websocket_endpoint(websocket: WebSocket):
                         vad_iterator.triggered = False
                         vad_iterator.temp_end = 0
                         vad_iterator.current_sample = 0
-
+                        await websocket.send_json({"type": "status", "message": "speaking_stopped"})
+
                     if (current_time - last_partial_time) > MIN_REFRESH_SECS:
                         text = current_model(speech)
                         if last_output != text:
@@ -133,7 +127,6 @@ async def websocket_endpoint(websocket: WebSocket):
                             await websocket.send_json({"type": "partial", "transcript": text})
                         last_partial_time = current_time
     except WebSocketDisconnect:
-        # If the client disconnects, send any final transcript if available.
         if recording and speech.size:
             text = current_model(speech)
             await websocket.send_json({"type": "final", "transcript": text})
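The handler above calls a pcm16_to_float32 helper that is defined elsewhere in app.py and is not touched by this diff. As a rough sketch of what that conversion typically looks like (an assumption, not the file's actual definition), it is the inverse of the client-side floatTo16BitPCM added further down:

import numpy as np

def pcm16_to_float32(data: bytes) -> np.ndarray:
    # Interpret the raw bytes as little-endian signed 16-bit samples
    # (the format floatTo16BitPCM produces), then scale to [-1.0, 1.0).
    samples = np.frombuffer(data, dtype="<i2")
    return samples.astype(np.float32) / 32768.0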
@@ -151,123 +144,135 @@ async def get_home():
 </head>
 <body class="bg-gray-100 p-6">
     <div class="max-w-3xl mx-auto bg-white p-6 rounded-lg shadow-md">
+        <h1 class="text-2xl font-bold mb-4">Realtime Transcription</h1>
+        <button onclick="startTranscription()" class="bg-blue-500 text-white px-4 py-2 rounded mb-4">Start Transcription</button>
+        <select id="modelSelect" onchange="switchModel()" class="bg-gray-200 px-4 py-2 rounded mb-4">
+            <option value="tiny">Tiny Model</option>
+            <option value="base">Base Model</option>
+        </select>
+        <p id="status" class="text-gray-600 mb-4">Click start to begin transcription.</p>
+        <p id="speakingStatus" class="text-gray-600 mb-4"></p>
+        <div id="transcription" class="border p-4 rounded mb-4 h-64 overflow-auto"></div>
+        <div id="visualizer" class="border p-4 rounded h-64">
+            <canvas id="audioCanvas" class="w-full h-full"></canvas>
+        </div>
     </div>
+    <script>
+        let ws;
+        let audioContext;
+        let scriptProcessor;
+        let mediaStream;
+        let currentLine = document.createElement('span');
+        let analyser;
+        let canvas, canvasContext;
+
+        document.getElementById('transcription').appendChild(currentLine);
+        canvas = document.getElementById('audioCanvas');
+        canvasContext = canvas.getContext('2d');
+
+        async function startTranscription() {
+            document.getElementById("status").innerText = "Connecting...";
+            ws = new WebSocket("wss://" + location.host + "/ws/transcribe");
+            ws.binaryType = 'arraybuffer';
+
+            ws.onopen = async function() {
+                document.getElementById("status").innerText = "Connected";
+                try {
+                    mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
+                    audioContext = new AudioContext({ sampleRate: 16000 });
+                    const source = audioContext.createMediaStreamSource(mediaStream);
+                    analyser = audioContext.createAnalyser();
+                    analyser.fftSize = 2048;
+                    const bufferLength = analyser.frequencyBinCount;
+                    const dataArray = new Uint8Array(bufferLength);
+                    source.connect(analyser);
+                    scriptProcessor = audioContext.createScriptProcessor(512, 1, 1);
+                    scriptProcessor.onaudioprocess = function(event) {
+                        const inputData = event.inputBuffer.getChannelData(0);
+                        const pcm16 = floatTo16BitPCM(inputData);
+                        if (ws.readyState === WebSocket.OPEN) {
+                            ws.send(pcm16);
+                        }
+                        analyser.getByteTimeDomainData(dataArray);
+                        canvasContext.fillStyle = 'rgb(200, 200, 200)';
+                        canvasContext.fillRect(0, 0, canvas.width, canvas.height);
+                        canvasContext.lineWidth = 2;
+                        canvasContext.strokeStyle = 'rgb(0, 0, 0)';
+                        canvasContext.beginPath();
+                        let sliceWidth = canvas.width * 1.0 / bufferLength;
+                        let x = 0;
+                        for (let i = 0; i < bufferLength; i++) {
+                            let v = dataArray[i] / 128.0;
+                            let y = v * canvas.height / 2;
+                            if (i === 0) {
+                                canvasContext.moveTo(x, y);
+                            } else {
+                                canvasContext.lineTo(x, y);
+                            }
+                            x += sliceWidth;
+                        }
+                        canvasContext.lineTo(canvas.width, canvas.height / 2);
+                        canvasContext.stroke();
+                    };
+                    source.connect(scriptProcessor);
+                    scriptProcessor.connect(audioContext.destination);
+                } catch (err) {
+                    document.getElementById("status").innerText = "Error: " + err;
+                }
+            };
+
+            ws.onmessage = function(event) {
+                const data = JSON.parse(event.data);
+                if (data.type === 'partial') {
+                    currentLine.style.color = 'gray';
+                    currentLine.textContent = data.transcript + ' ';
+                } else if (data.type === 'final') {
+                    currentLine.style.color = 'black';
+                    currentLine.textContent = data.transcript;
+                    currentLine = document.createElement('span');
+                    document.getElementById('transcription').appendChild(document.createElement('br'));
+                    document.getElementById('transcription').appendChild(currentLine);
+                } else if (data.type === 'status') {
+                    if (data.message === 'speaking_started') {
+                        document.getElementById("speakingStatus").innerText = "Speaking Started";
+                        document.getElementById("speakingStatus").style.color = "green";
+                    } else if (data.message === 'speaking_stopped') {
+                        document.getElementById("speakingStatus").innerText = "Speaking Stopped";
+                        document.getElementById("speakingStatus").style.color = "red";
+                    }
+                }
+            };
+
+            ws.onclose = function() {
+                if (audioContext && audioContext.state !== 'closed') {
+                    audioContext.close();
+                }
+                document.getElementById("status").innerText = "Closed";
+            };
+        }
+
+        function switchModel() {
+            const model = document.getElementById("modelSelect").value;
+            if (ws && ws.readyState === WebSocket.OPEN) {
+                if (model === "tiny") {
+                    ws.send("switch_to_tiny");
+                } else if (model === "base") {
+                    ws.send("switch_to_base");
+                }
+            }
+        }
+
+        function floatTo16BitPCM(input) {
+            const buffer = new ArrayBuffer(input.length * 2);
+            const output = new DataView(buffer);
+            for (let i = 0; i < input.length; i++) {
+                let s = Math.max(-1, Math.min(1, input[i]));
+                output.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
+            }
+            return buffer;
+        }
+    </script>
+</body>
 </html>
 """
 
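Taken together, the wire protocol after this change is: the client streams little-endian 16-bit PCM chunks as binary WebSocket messages plus plain-text commands ("switch_to_tiny" / "switch_to_base"), and the server replies with JSON messages of type "status", "partial", or "final". A minimal test-client sketch, not part of the commit, assuming the third-party websockets package and a hypothetical local address for the app:

import asyncio
import json

import numpy as np
import websockets  # pip install websockets

async def main():
    # Hypothetical local address; adjust host/port to wherever app.py is served.
    uri = "ws://localhost:8000/ws/transcribe"
    async with websockets.connect(uri) as ws:
        # Same text command the page's model selector sends.
        await ws.send("switch_to_tiny")
        # Stream ~1 s of silence as 512-sample little-endian PCM16 chunks;
        # real microphone audio would trigger status/partial/final replies.
        chunk = np.zeros(512, dtype="<i2").tobytes()
        for _ in range(32):
            await ws.send(chunk)
        try:
            while True:
                msg = json.loads(await asyncio.wait_for(ws.recv(), timeout=2))
                print(msg["type"], msg.get("transcript") or msg.get("message"))
        except asyncio.TimeoutError:
            pass

asyncio.run(main())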