# stt.py
# -*- coding: utf-8 -*-
"""
stt.py — a module for Speech-to-Text via pywhispercpp

Classes
-------
SpeechToText
    Encapsulates model loading, recording, saving, and transcription.

Usage (as a script)
-------------------
python -m stt --model tiny.en --duration 5

or in code
-----------
from stt import SpeechToText
stt = SpeechToText()
text = stt.transcribe()
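
or transcribe an existing file (the path shown is a placeholder)
----------------------------------------------------------------
from stt import SpeechToText
stt = SpeechToText()
text = stt.transcribe_file("path/to/recording.wav")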
"""

import os
import tempfile
import time
import datetime
import numpy as np
import sounddevice as sd
from scipy.io.wavfile import write as write_wav
from pywhispercpp.model import Model as Whisper
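# The imports above correspond to these PyPI packages (an assumed mapping):
#   pip install numpy sounddevice scipy pywhispercpp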


class SpeechToText:
    """
    A Speech-to-Text helper built on pywhispercpp's Whisper model.

    Parameters
    ----------
    model_name : str
        Whisper model to load (e.g. "tiny.en", "base", "small.en", etc.).
    sample_rate : int
        Audio sample rate (must match Whisper's 16 kHz).
    record_duration : float
        Default seconds to record when calling `.record_audio()`.
    temp_dir : str
        Directory for temporary WAV files.
    verbose : bool
        Print progress messages if True.
    """

    def __init__(
        self,
        model_name: str = "tiny.en",
        sample_rate: int = 16_000,
        record_duration: float = 5.0,
        temp_dir: str = None,
        verbose: bool = True,
    ):
        self.model_name = model_name
        self.sample_rate = sample_rate
        self.record_duration = record_duration
        self.temp_dir = temp_dir or tempfile.gettempdir()
        self.verbose = verbose

        # load Whisper model
        if self.verbose:
            print(f"[STT] Loading Whisper model '{self.model_name}'...")
            t0 = time.time()
        self._model = Whisper(model=self.model_name)
        if self.verbose:
            print(f"[STT] Model loaded in {time.time() - t0:.2f}s")

    def record_audio(self, duration: float = None) -> np.ndarray:
        """
        Record from the default mic for `duration` seconds, return float32 mono waveform.
        """
        duration = duration or self.record_duration
        if self.verbose:
            print(f"[STT] Recording for {duration}s at {self.sample_rate}Hz...")
        frames = sd.rec(
            int(duration * self.sample_rate),
            samplerate=self.sample_rate,
            channels=1,
            dtype="int16",
        )
        sd.wait()
        if self.verbose:
            print("[STT] Recording finished.")
        # convert to float32 in [-1, 1]
        return (frames.astype(np.float32) / 32768.0).flatten()

    def save_wav(self, audio: np.ndarray, filename: str = None) -> str:
        """
        Save float32 waveform `audio` to an int16 WAV at `filename`.
        If filename is None, create one in temp_dir.
        Returns the path.
        """
        filename = filename or os.path.join(
            self.temp_dir,
            f"stt_{datetime.datetime.now():%Y%m%d_%H%M%S}.wav"
        )
        # avoid makedirs("") when the filename has no directory component
        target_dir = os.path.dirname(filename)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        # convert back to int16, clipping to the valid range to avoid overflow
        int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
        write_wav(filename, self.sample_rate, int16)
        if self.verbose:
            print(f"[STT] Saved WAV to {filename}")
        return filename

    def transcribe_file(self, wav_path: str, n_threads: int = 4, cleanup: bool = True) -> str:
        """
        Transcribe the existing WAV file at `wav_path` and return the text.
        If `cleanup` is True and the file lives in `temp_dir`, it is removed afterwards.
        """
        if not os.path.isfile(wav_path):
            raise FileNotFoundError(f"No such file: {wav_path}")
        if self.verbose:
            print(f"[STT] Transcribing file {wav_path}…")
            t0 = time.time()

        # pywhispercpp returns a list of segments, each carrying a `.text` attribute
        result = self._model.transcribe(wav_path, n_threads=n_threads)

        # clean up intermediate WAVs created in our temp_dir
        if cleanup and wav_path.startswith(self.temp_dir):
            try:
                os.remove(wav_path)
            except OSError:
                pass

        # collect text
        if isinstance(result, list):
            text = "".join(seg.text for seg in result)
        else:
            # fall back to the string representation if a single object is returned
            text = str(result)

        if self.verbose:
            print(f"[STT] Transcription complete ({time.time() - t0:.2f}s).")
        return text.strip()

    def transcribe(
        self,
        duration: float = None,
        save_temp: bool = False,
        n_threads: int = 4,
    ) -> str:
        """
        Record, save to a WAV, and transcribe in one call.
        The intermediate WAV is kept only when `save_temp` is True.
        Returns the transcribed text.
        """
        audio = self.record_audio(duration)
        wav_path = self.save_wav(audio)
        return self.transcribe_file(wav_path, n_threads=n_threads, cleanup=not save_temp)


# Optional: make module runnable as a script
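# Example invocations (illustrative; flags are defined by the parser below):
#   python -m stt --model base --duration 10
#   python -m stt --model tiny.en --duration 5 --no-save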
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="STT using pywhispercpp")
    parser.add_argument(
        "--model", "-m",
        default="small.en",
        help="Whisper model name (e.g. tiny.en, base, small.en)",
    )
    parser.add_argument(
        "--duration", "-d",
        type=float,
        default=5.0,
        help="Seconds to record",
    )
    parser.add_argument(
        "--no-save", action="store_true",
        help="Do not save the recorded WAV",
    )
    args = parser.parse_args()

    stt = SpeechToText(
        model_name=args.model,
        record_duration=args.duration,
        verbose=True
    )
    text = stt.transcribe(save_temp=not args.no_save)
    print("\n=== Transcription ===")
    print(text)