bcci committed · verified
Commit a4181e3 · 1 Parent(s): c9365be

Update app.py

Files changed (1)
  1. app.py +191 -50
app.py CHANGED
@@ -1,66 +1,207 @@
- from fastapi import FastAPI, UploadFile, File, HTTPException
- from transformers import MoonshineForConditionalGeneration, AutoProcessor
- import torch
- import librosa
- import io
- import os
- from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
- model = load_silero_vad()

  app = FastAPI()

- # Check for GPU availability
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
- torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

- # Load the model and processor
- try:
-     model = MoonshineForConditionalGeneration.from_pretrained('UsefulSensors/moonshine-tiny').to(device).to(torch_dtype)
-     processor = AutoProcessor.from_pretrained('UsefulSensors/moonshine-tiny')
- except Exception as e:
-     print(f"Error loading model or processor: {e}")
-     exit()

- @app.post("/transcribe/")
- async def transcribe_audio(file: UploadFile = File(...)):
      """
-     Transcribes an uploaded audio file.
      """
-     if not file.filename.lower().endswith(('.mp3', '.wav', '.ogg', '.flac', '.m4a')): # add more formats as needed
-         raise HTTPException(status_code=400, detail="Invalid file format. Supported formats: .mp3, .wav, .ogg, .flac, .m4a")
-
-     try:
-         audio_bytes = await file.read()
-         audio_array, sampling_rate = librosa.load(io.BytesIO(audio_bytes), sr=processor.feature_extractor.sampling_rate)
-
-         # speech_timestamps = get_speech_timestamps(
-         #     torch.from_numpy(audio_array),
-         #     model,
-         #     return_seconds=True, # Return speech timestamps in seconds (default is samples)
-         # )

-         # print(speech_timestamps)

-         inputs = processor(
-             audio_array,
-             return_tensors="pt",
-             sampling_rate=processor.feature_extractor.sampling_rate
-         )
-         inputs = inputs.to(device, torch_dtype)

-         token_limit_factor = 6.5 / processor.feature_extractor.sampling_rate
-         seq_lens = inputs.attention_mask.sum(dim=-1)
-         max_length = int((seq_lens * token_limit_factor).max().item())

-         generated_ids = model.generate(**inputs, max_length=max_length)
-         transcription = processor.decode(generated_ids[0], skip_special_tokens=True)
-
-         return {"transcription": transcription}

-     except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Error processing audio: {e}")

  if __name__ == "__main__":
      import uvicorn
-     uvicorn.run(app, host="0.0.0.0", port=7860)
 
+ import time
+ import asyncio
+ import numpy as np
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
+ from fastapi.responses import HTMLResponse
+
+ # Import your model and VAD libraries.
+ from silero_vad import VADIterator, load_silero_vad
+ from moonshine_onnx import MoonshineOnnxModel, load_tokenizer
+
+ # Constants
+ SAMPLING_RATE = 16000
+ CHUNK_SIZE = 512  # Required for Silero VAD at 16kHz.
+ LOOKBACK_CHUNKS = 5
+ MAX_SPEECH_SECS = 15  # Maximum duration for a single transcription segment.
+ MIN_REFRESH_SECS = 0.2  # Minimum interval for sending partial updates.
+
  app = FastAPI()

+ class Transcriber:
+     def __init__(self, model_name: str, rate: int = 16000):
+         if rate != 16000:
+             raise ValueError("Moonshine supports sampling rate 16000 Hz.")
+         self.model = MoonshineOnnxModel(model_name=model_name)
+         self.rate = rate
+         self.tokenizer = load_tokenizer()
+         # Statistics (optional)
+         self.inference_secs = 0
+         self.number_inferences = 0
+         self.speech_secs = 0
+         # Warmup run.
+         self.__call__(np.zeros(int(rate), dtype=np.float32))
+
+     def __call__(self, speech: np.ndarray) -> str:
+         """Returns a transcription of the given speech (a float32 numpy array)."""
+         self.number_inferences += 1
+         self.speech_secs += len(speech) / self.rate
+         start_time = time.time()
+         tokens = self.model.generate(speech[np.newaxis, :].astype(np.float32))
+         text = self.tokenizer.decode_batch(tokens)[0]
+         self.inference_secs += time.time() - start_time
+         return text
+
+ def pcm16_to_float32(pcm_data: bytes) -> np.ndarray:
      """
+     Convert 16-bit PCM bytes into a float32 numpy array with values in [-1, 1].
      """
+     int_data = np.frombuffer(pcm_data, dtype=np.int16)
+     float_data = int_data.astype(np.float32) / 32768.0
+     return float_data
+
+ @app.websocket("/ws/transcribe")
+ async def websocket_endpoint(websocket: WebSocket):
+     await websocket.accept()
+
+     # Initialize models.
+     model_name = "moonshine/tiny"
+     transcriber = Transcriber(model_name=model_name, rate=SAMPLING_RATE)
+     vad_model = load_silero_vad(onnx=True)
+     vad_iterator = VADIterator(
+         model=vad_model,
+         sampling_rate=SAMPLING_RATE,
+         threshold=0.5,
+         min_silence_duration_ms=300,
+     )
+
+     caption_cache = []
+     lookback_size = LOOKBACK_CHUNKS * CHUNK_SIZE
+     speech = np.empty(0, dtype=np.float32)
+     recording = False
+     last_partial_time = time.time()
+
+     try:
+         while True:
+             # Wait for the next audio chunk (sent as binary data).
+             data = await websocket.receive_bytes()
+             # Convert the 16-bit PCM data to float32.
+             chunk = pcm16_to_float32(data)
+             speech = np.concatenate((speech, chunk))
+             if not recording:
+                 # Retain only the last few chunks when not recording.
+                 speech = speech[-lookback_size:]
+
+             # Process VAD on the current chunk.
+             vad_result = vad_iterator(chunk)
+             current_time = time.time()
+             if vad_result:
+                 # If VAD signals the start of speech and we're not already recording.
+                 if "start" in vad_result and not recording:
+                     recording = True
+                     start_time = current_time
+                 # If VAD signals the end of speech.
+                 if "end" in vad_result and recording:
+                     recording = False
+                     text = transcriber(speech)
+                     await websocket.send_json({"type": "final", "transcript": text})
+                     caption_cache.append(text)
+                     speech = np.empty(0, dtype=np.float32)
+                     # Reset VAD state.
+                     vad_iterator.triggered = False
+                     vad_iterator.temp_end = 0
+                     vad_iterator.current_sample = 0
+             elif recording:
+                 # If speech goes on too long, force an end.
+                 if (len(speech) / SAMPLING_RATE) > MAX_SPEECH_SECS:
+                     recording = False
+                     text = transcriber(speech)
+                     await websocket.send_json({"type": "final", "transcript": text})
+                     caption_cache.append(text)
+                     speech = np.empty(0, dtype=np.float32)
+                     vad_iterator.triggered = False
+                     vad_iterator.temp_end = 0
+                     vad_iterator.current_sample = 0
+                 # Send partial transcription updates periodically.
+                 if (current_time - last_partial_time) > MIN_REFRESH_SECS:
+                     text = transcriber(speech)
+                     await websocket.send_json({"type": "partial", "transcript": text})
+                     last_partial_time = current_time
+     except WebSocketDisconnect:
+         # If the client disconnects, send any final transcript if available.
+         if recording and speech.size:
+             text = transcriber(speech)
+             await websocket.send_json({"type": "final", "transcript": text})
+         print("WebSocket disconnected")
+
+ @app.get("/", response_class=HTMLResponse)
+ async def get_home():
+     return """
+     <!DOCTYPE html>
+     <html>
+     <head>
+         <meta charset="UTF-8">
+         <title>AssemblyAI Realtime Transcription</title>
+     </head>
+     <body>
+         <h1>Realtime Transcription</h1>
+         <button onclick="startTranscription()">Start Transcription</button>
+         <p id="status">Click start to begin transcription.</p>
+         <div id="transcription" style="border:1px solid #ccc; padding:10px; margin-top:10px; height:200px; overflow:auto;"></div>
+         <script>
+             let ws;
+             let audioContext;
+             let scriptProcessor;
+             let mediaStream;
+             let currentLine = document.createElement('span');
+             document.getElementById('transcription').appendChild(currentLine);
+             async function startTranscription() {
+                 document.getElementById("status").innerText = "Connecting...";
+                 ws = new WebSocket("wss://" + location.host + "/ws/transcribe");
+                 ws.binaryType = 'arraybuffer';
+                 ws.onopen = async function() {
+                     document.getElementById("status").innerText = "Connected";
+                     try {
+                         mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
+                         audioContext = new AudioContext({ sampleRate: 16000 });
+                         const source = audioContext.createMediaStreamSource(mediaStream);
+                         scriptProcessor = audioContext.createScriptProcessor(1024, 1, 1);
+                         scriptProcessor.onaudioprocess = function(event) {
+                             const inputData = event.inputBuffer.getChannelData(0);
+                             const pcm16 = floatTo16BitPCM(inputData);
+                             if (ws.readyState === WebSocket.OPEN) {
+                                 ws.send(pcm16);
+                             }
+                         };
+                         source.connect(scriptProcessor);
+                         scriptProcessor.connect(audioContext.destination);
+                     } catch (err) {
+                         document.getElementById("status").innerText = "Error: " + err;
+                     }
+                 };
+                 ws.onmessage = function(event) {
+                     const data = JSON.parse(event.data);
+                     if (data.type === 'partial') {
+                         currentLine.style.color = 'gray';
+                         currentLine.textContent = data.transcript + ' ';
+                     } else if (data.type === 'final') {
+                         currentLine.style.color = 'black';
+                         currentLine.textContent = data.transcript;
+                         currentLine = document.createElement('span');
+                         document.getElementById('transcription').appendChild(document.createElement('br'));
+                         document.getElementById('transcription').appendChild(currentLine);
+                     }
+                 };
+                 ws.onclose = function() {
+                     if (audioContext && audioContext.state !== 'closed') {
+                         audioContext.close();
+                     }
+                     document.getElementById("status").innerText = "Closed";
+                 };
+             }
+             function floatTo16BitPCM(input) {
+                 const buffer = new ArrayBuffer(input.length * 2);
+                 const output = new DataView(buffer);
+                 for (let i = 0; i < input.length; i++) {
+                     let s = Math.max(-1, Math.min(1, input[i]));
+                     output.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
+                 }
+                 return buffer;
+             }
+         </script>
+     </body>
+     </html>
+     """

  if __name__ == "__main__":
      import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
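
The new /ws/transcribe endpoint consumes raw 16-bit little-endian PCM at 16 kHz as binary WebSocket messages and replies with JSON objects of the form {"type": "partial" | "final", "transcript": ...}. Below is a minimal local smoke-test sketch, not part of the commit: it assumes the third-party `websockets` package is installed and the server is running on port 8000 as in the `__main__` block, and it sends 512-sample chunks to match the server's CHUNK_SIZE constant. It feeds silence, so the VAD will not fire; with real speech samples you would see partial and final transcripts printed.

# test_ws_client.py - hypothetical smoke test for the /ws/transcribe endpoint.
# Assumes `pip install websockets numpy` and the app running on localhost:8000.
import asyncio
import json

import numpy as np
import websockets


async def main():
    # One second of silence as 16-bit PCM at 16 kHz; real speech audio would
    # trigger the server-side VAD and produce partial/final transcripts.
    pcm = np.zeros(16000, dtype=np.int16)
    async with websockets.connect("ws://localhost:8000/ws/transcribe") as ws:
        # Send 512-sample chunks, mirroring the server's CHUNK_SIZE constant.
        for start in range(0, len(pcm), 512):
            await ws.send(pcm[start:start + 512].tobytes())
        # Print any messages the server sends back, then give up after a pause.
        try:
            while True:
                msg = json.loads(await asyncio.wait_for(ws.recv(), timeout=2.0))
                print(msg["type"], msg["transcript"])
        except asyncio.TimeoutError:
            pass


if __name__ == "__main__":
    asyncio.run(main())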