stt-4

Running

App Files Files Community

bcci committed on Mar 5

Commit

50babed

verified ·

1 Parent(s): 4bbccfd

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -33

app.py CHANGED Viewed

@@ -6,7 +6,13 @@ from fastapi.responses import HTMLResponse
 # Import your model and VAD libraries.
 from silero_vad import VADIterator, load_silero_vad
-from moonshine_onnx import MoonshineOnnxModel, load_tokenizer
 # Constants
 SAMPLING_RATE = 16000
@@ -17,29 +23,29 @@ MIN_REFRESH_SECS = 1  # Minimum interval for sending partial updates.
 app = FastAPI()
-class Transcriber:
-    def __init__(self, model_name: str, rate: int = 16000):
-        if rate != 16000:
-            raise ValueError("Moonshine supports sampling rate 16000 Hz.")
-        self.model = MoonshineOnnxModel(model_name=model_name)
-        self.rate = rate
-        self.tokenizer = load_tokenizer()
-        # Statistics (optional)
-        self.inference_secs = 0
-        self.number_inferences = 0
-        self.speech_secs = 0
-        # Warmup run.
-        self.__call__(np.zeros(int(rate), dtype=np.float32))
-    def __call__(self, speech: np.ndarray) -> str:
-        """Returns a transcription of the given speech (a float32 numpy array)."""
-        self.number_inferences += 1
-        self.speech_secs += len(speech) / self.rate
-        start_time = time.time()
-        tokens = self.model.generate(speech[np.newaxis, :].astype(np.float32))
-        text = self.tokenizer.decode_batch(tokens)[0]
-        self.inference_secs += time.time() - start_time
-        return text
 def pcm16_to_float32(pcm_data: bytes) -> np.ndarray:
     """
@@ -50,10 +56,10 @@ def pcm16_to_float32(pcm_data: bytes) -> np.ndarray:
     return float_data
 # Initialize models.
-model_name_tiny = "moonshine/tiny"
-model_name_base = "moonshine/base"
-transcriber_tiny = Transcriber(model_name=model_name_tiny, rate=SAMPLING_RATE)
-transcriber_base = Transcriber(model_name=model_name_base, rate=SAMPLING_RATE)
 vad_model = load_silero_vad(onnx=True)
 vad_iterator = VADIterator(
     model=vad_model,
@@ -79,10 +85,10 @@ async def websocket_endpoint(websocket: WebSocket):
             data = await websocket.receive()
             if data["type"] == "websocket.receive":
                 if data.get("text") == "switch_to_tiny":
-                    current_model = transcriber_tiny
                     continue
                 elif data.get("text") == "switch_to_base":
-                    current_model = transcriber_base
                     continue
             chunk = pcm16_to_float32(data["bytes"])
@@ -100,7 +106,7 @@ async def websocket_endpoint(websocket: WebSocket):
                 if "end" in vad_result and recording:
                     recording = False
-                    text = current_model(speech)
                     await websocket.send_json({"type": "final", "transcript": text})
                     caption_cache.append(text)
                     speech = np.empty(0, dtype=np.float32)
@@ -111,7 +117,7 @@ async def websocket_endpoint(websocket: WebSocket):
             elif recording:
                 if (len(speech) / SAMPLING_RATE) > MAX_SPEECH_SECS:
                     recording = False
-                    text = current_model(speech)
                     await websocket.send_json({"type": "final", "transcript": text})
                     caption_cache.append(text)
                     speech = np.empty(0, dtype=np.float32)
@@ -121,14 +127,14 @@ async def websocket_endpoint(websocket: WebSocket):
                     await websocket.send_json({"type": "status", "message": "speaking_stopped"})
                 if (current_time - last_partial_time) > MIN_REFRESH_SECS:
-                    text = current_model(speech)
                     if last_output != text:
                         last_output = text
                         await websocket.send_json({"type": "partial", "transcript": text})
                     last_partial_time = current_time
     except WebSocketDisconnect:
         if recording and speech.size:
-            text = current_model(speech)
             await websocket.send_json({"type": "final", "transcript": text})
         print("WebSocket disconnected")

 # Import your model and VAD libraries.
 from silero_vad import VADIterator, load_silero_vad
+from transformers import AutoProcessor, pipeline
+from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
+# Load the ONNX Whisper checkpoint once at import time and wrap it in an
+# automatic-speech-recognition pipeline that is called for every transcription.
+processor = AutoProcessor.from_pretrained("optimum/whisper-tiny.en")
+model = ORTModelForSpeechSeq2Seq.from_pretrained("optimum/whisper-tiny.en")
+speech_recognition = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor)
+# The websocket handler invokes the pipeline as `pipe(...)`; bind that name
+# here so those calls resolve instead of raising NameError. The original
+# `speech_recognition` name is kept for any other caller.
+pipe = speech_recognition
 # Constants
 SAMPLING_RATE = 16000
 app = FastAPI()
+# class Transcriber:
+#     def __init__(self, model_name: str, rate: int = 16000):
+#         if rate != 16000:
+#             raise ValueError("Moonshine supports sampling rate 16000 Hz.")
+#         self.model = MoonshineOnnxModel(model_name=model_name)
+#         self.rate = rate
+#         self.tokenizer = load_tokenizer()
+#         # Statistics (optional)
+#         self.inference_secs = 0
+#         self.number_inferences = 0
+#         self.speech_secs = 0
+#         # Warmup run.
+#         self.__call__(np.zeros(int(rate), dtype=np.float32))
+#     def __call__(self, speech: np.ndarray) -> str:
+#         """Returns a transcription of the given speech (a float32 numpy array)."""
+#         self.number_inferences += 1
+#         self.speech_secs += len(speech) / self.rate
+#         start_time = time.time()
+#         tokens = self.model.generate(speech[np.newaxis, :].astype(np.float32))
+#         text = self.tokenizer.decode_batch(tokens)[0]
+#         self.inference_secs += time.time() - start_time
+#         return text
 def pcm16_to_float32(pcm_data: bytes) -> np.ndarray:
     """
     return float_data
 # Initialize models.
+# model_name_tiny = "moonshine/tiny"
+# model_name_base = "moonshine/base"
+# transcriber_tiny = Transcriber(model_name=model_name_tiny, rate=SAMPLING_RATE)
+# transcriber_base = Transcriber(model_name=model_name_base, rate=SAMPLING_RATE)
 vad_model = load_silero_vad(onnx=True)
 vad_iterator = VADIterator(
     model=vad_model,
             data = await websocket.receive()
             if data["type"] == "websocket.receive":
                 if data.get("text") == "switch_to_tiny":
+                    # current_model = transcriber_tiny
                     continue
                 elif data.get("text") == "switch_to_base":
+                    # current_model = transcriber_base
                     continue
             chunk = pcm16_to_float32(data["bytes"])
                 if "end" in vad_result and recording:
                     recording = False
+                    text = pipe({"sampling_rate": 16000, "raw": speech})["text"]
                     await websocket.send_json({"type": "final", "transcript": text})
                     caption_cache.append(text)
                     speech = np.empty(0, dtype=np.float32)
             elif recording:
                 if (len(speech) / SAMPLING_RATE) > MAX_SPEECH_SECS:
                     recording = False
+                    text = pipe({"sampling_rate": 16000, "raw": speech})["text"]
                     await websocket.send_json({"type": "final", "transcript": text})
                     caption_cache.append(text)
                     speech = np.empty(0, dtype=np.float32)
                     await websocket.send_json({"type": "status", "message": "speaking_stopped"})
                 if (current_time - last_partial_time) > MIN_REFRESH_SECS:
+                    text = pipe({"sampling_rate": 16000, "raw": speech})["text"]
                     if last_output != text:
                         last_output = text
                         await websocket.send_json({"type": "partial", "transcript": text})
                     last_partial_time = current_time
     except WebSocketDisconnect:
         if recording and speech.size:
+            text = pipe({"sampling_rate": 16000, "raw": speech})["text"]
             await websocket.send_json({"type": "final", "transcript": text})
         print("WebSocket disconnected")