Update app.py
app.py (CHANGED)
@@ -76,7 +76,6 @@ async def websocket_endpoint(websocket: WebSocket):
 
     try:
         while True:
-            # Wait for the next audio chunk (sent as binary data)
             data = await websocket.receive()
             if data["type"] == "websocket.receive":
                 if data.get("text") == "switch_to_tiny":
@@ -86,36 +85,30 @@ async def websocket_endpoint(websocket: WebSocket):
                     current_model = transcriber_base
                     continue
 
-                # Convert the 16-bit PCM data to float32.
                 chunk = pcm16_to_float32(data["bytes"])
                 speech = np.concatenate((speech, chunk))
-
                 if not recording:
-                    # Retain only the last few chunks when not recording.
                     speech = speech[-lookback_size:]
 
-                # Process VAD on the current chunk.
                 vad_result = vad_iterator(chunk)
                 current_time = time.time()
 
                 if vad_result:
-                    # If VAD signals the start of speech and we're not already recording.
                     if "start" in vad_result and not recording:
                         recording = True
-
-
+                        await websocket.send_json({"type": "status", "message": "speaking_started"})
+
                     if "end" in vad_result and recording:
                         recording = False
                         text = current_model(speech)
                         await websocket.send_json({"type": "final", "transcript": text})
                         caption_cache.append(text)
                         speech = np.empty(0, dtype=np.float32)
-                        # Reset VAD state.
                         vad_iterator.triggered = False
                         vad_iterator.temp_end = 0
                         vad_iterator.current_sample = 0
+                        await websocket.send_json({"type": "status", "message": "speaking_stopped"})
                 elif recording:
-                    # If speech goes on too long, force an end.
                     if (len(speech) / SAMPLING_RATE) > MAX_SPEECH_SECS:
                         recording = False
                         text = current_model(speech)
@@ -125,7 +118,8 @@ async def websocket_endpoint(websocket: WebSocket):
                         vad_iterator.triggered = False
                         vad_iterator.temp_end = 0
                         vad_iterator.current_sample = 0
-
+                        await websocket.send_json({"type": "status", "message": "speaking_stopped"})
+
                     if (current_time - last_partial_time) > MIN_REFRESH_SECS:
                         text = current_model(speech)
                         if last_output != text:
@@ -133,7 +127,6 @@ async def websocket_endpoint(websocket: WebSocket):
                             await websocket.send_json({"type": "partial", "transcript": text})
                         last_partial_time = current_time
     except WebSocketDisconnect:
-        # If the client disconnects, send any final transcript if available.
         if recording and speech.size:
             text = current_model(speech)
             await websocket.send_json({"type": "final", "transcript": text})
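The handler above calls a pcm16_to_float32 helper that is defined elsewhere in app.py and is not touched by this diff. As a rough sketch of what that conversion typically looks like (an assumption, not the file's actual definition), it is the inverse of the client-side floatTo16BitPCM added further down:

import numpy as np

def pcm16_to_float32(data: bytes) -> np.ndarray:
    # Interpret the raw bytes as little-endian signed 16-bit samples
    # (the format floatTo16BitPCM produces), then scale to [-1.0, 1.0).
    samples = np.frombuffer(data, dtype="<i2")
    return samples.astype(np.float32) / 32768.0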
@@ -151,123 +144,135 @@ async def get_home():
 </head>
 <body class="bg-gray-100 p-6">
     <div class="max-w-3xl mx-auto bg-white p-6 rounded-lg shadow-md">
+        <h1 class="text-2xl font-bold mb-4">Realtime Transcription</h1>
+        <button onclick="startTranscription()" class="bg-blue-500 text-white px-4 py-2 rounded mb-4">Start Transcription</button>
+        <select id="modelSelect" onchange="switchModel()" class="bg-gray-200 px-4 py-2 rounded mb-4">
+            <option value="tiny">Tiny Model</option>
+            <option value="base">Base Model</option>
+        </select>
+        <p id="status" class="text-gray-600 mb-4">Click start to begin transcription.</p>
+        <p id="speakingStatus" class="text-gray-600 mb-4"></p>
+        <div id="transcription" class="border p-4 rounded mb-4 h-64 overflow-auto"></div>
+        <div id="visualizer" class="border p-4 rounded h-64">
+            <canvas id="audioCanvas" class="w-full h-full"></canvas>
+        </div>
     </div>
+    <script>
+        let ws;
+        let audioContext;
+        let scriptProcessor;
+        let mediaStream;
+        let currentLine = document.createElement('span');
+        let analyser;
+        let canvas, canvasContext;
+
+        document.getElementById('transcription').appendChild(currentLine);
+        canvas = document.getElementById('audioCanvas');
+        canvasContext = canvas.getContext('2d');
+
+        async function startTranscription() {
+            document.getElementById("status").innerText = "Connecting...";
+            ws = new WebSocket("wss://" + location.host + "/ws/transcribe");
+            ws.binaryType = 'arraybuffer';
+
+            ws.onopen = async function() {
+                document.getElementById("status").innerText = "Connected";
+                try {
+                    mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
+                    audioContext = new AudioContext({ sampleRate: 16000 });
+                    const source = audioContext.createMediaStreamSource(mediaStream);
+                    analyser = audioContext.createAnalyser();
+                    analyser.fftSize = 2048;
+                    const bufferLength = analyser.frequencyBinCount;
+                    const dataArray = new Uint8Array(bufferLength);
+                    source.connect(analyser);
+                    scriptProcessor = audioContext.createScriptProcessor(512, 1, 1);
+                    scriptProcessor.onaudioprocess = function(event) {
+                        const inputData = event.inputBuffer.getChannelData(0);
+                        const pcm16 = floatTo16BitPCM(inputData);
+                        if (ws.readyState === WebSocket.OPEN) {
+                            ws.send(pcm16);
+                        }
+                        analyser.getByteTimeDomainData(dataArray);
+                        canvasContext.fillStyle = 'rgb(200, 200, 200)';
+                        canvasContext.fillRect(0, 0, canvas.width, canvas.height);
+                        canvasContext.lineWidth = 2;
+                        canvasContext.strokeStyle = 'rgb(0, 0, 0)';
+                        canvasContext.beginPath();
+                        let sliceWidth = canvas.width * 1.0 / bufferLength;
+                        let x = 0;
+                        for (let i = 0; i < bufferLength; i++) {
+                            let v = dataArray[i] / 128.0;
+                            let y = v * canvas.height / 2;
+                            if (i === 0) {
+                                canvasContext.moveTo(x, y);
+                            } else {
+                                canvasContext.lineTo(x, y);
+                            }
+                            x += sliceWidth;
+                        }
+                        canvasContext.lineTo(canvas.width, canvas.height / 2);
+                        canvasContext.stroke();
+                    };
+                    source.connect(scriptProcessor);
+                    scriptProcessor.connect(audioContext.destination);
+                } catch (err) {
+                    document.getElementById("status").innerText = "Error: " + err;
+                }
+            };
+
+            ws.onmessage = function(event) {
+                const data = JSON.parse(event.data);
+                if (data.type === 'partial') {
+                    currentLine.style.color = 'gray';
+                    currentLine.textContent = data.transcript + ' ';
+                } else if (data.type === 'final') {
+                    currentLine.style.color = 'black';
+                    currentLine.textContent = data.transcript;
+                    currentLine = document.createElement('span');
+                    document.getElementById('transcription').appendChild(document.createElement('br'));
+                    document.getElementById('transcription').appendChild(currentLine);
+                } else if (data.type === 'status') {
+                    if (data.message === 'speaking_started') {
+                        document.getElementById("speakingStatus").innerText = "Speaking Started";
+                        document.getElementById("speakingStatus").style.color = "green";
+                    } else if (data.message === 'speaking_stopped') {
+                        document.getElementById("speakingStatus").innerText = "Speaking Stopped";
+                        document.getElementById("speakingStatus").style.color = "red";
+                    }
+                }
+            };
+
+            ws.onclose = function() {
+                if (audioContext && audioContext.state !== 'closed') {
+                    audioContext.close();
+                }
+                document.getElementById("status").innerText = "Closed";
+            };
+        }
+
+        function switchModel() {
+            const model = document.getElementById("modelSelect").value;
+            if (ws && ws.readyState === WebSocket.OPEN) {
+                if (model === "tiny") {
+                    ws.send("switch_to_tiny");
+                } else if (model === "base") {
+                    ws.send("switch_to_base");
+                }
+            }
+        }
+
+        function floatTo16BitPCM(input) {
+            const buffer = new ArrayBuffer(input.length * 2);
+            const output = new DataView(buffer);
+            for (let i = 0; i < input.length; i++) {
+                let s = Math.max(-1, Math.min(1, input[i]));
+                output.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
+            }
+            return buffer;
+        }
+    </script>
+</body>
 </html>
 """
 
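Taken together, the wire protocol after this change is: the client streams little-endian 16-bit PCM chunks as binary WebSocket messages plus plain-text commands ("switch_to_tiny" / "switch_to_base"), and the server replies with JSON messages of type "status", "partial", or "final". A minimal test-client sketch, not part of the commit, assuming the third-party websockets package and a hypothetical local address for the app:

import asyncio
import json

import numpy as np
import websockets  # pip install websockets

async def main():
    # Hypothetical local address; adjust host/port to wherever app.py is served.
    uri = "ws://localhost:8000/ws/transcribe"
    async with websockets.connect(uri) as ws:
        # Same text command the page's model selector sends.
        await ws.send("switch_to_tiny")
        # Stream ~1 s of silence as 512-sample little-endian PCM16 chunks;
        # real microphone audio would trigger status/partial/final replies.
        chunk = np.zeros(512, dtype="<i2").tobytes()
        for _ in range(32):
            await ws.send(chunk)
        try:
            while True:
                msg = json.loads(await asyncio.wait_for(ws.recv(), timeout=2))
                print(msg["type"], msg.get("transcript") or msg.get("message"))
        except asyncio.TimeoutError:
            pass

asyncio.run(main())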