LiamKhoaLe committed
Commit aeade20 · Parent: f4c538b

Migrate to Whisper large-v3 transcription; switch the UI from hold-to-record to click-to-record.

Files changed (5):
  1. Dockerfile +2 -1
  2. app.py +16 -16
  3. requirements.txt +8 -9
  4. statics/index.html +3 -3
  5. statics/script.js +22 -19
Dockerfile CHANGED
@@ -11,7 +11,8 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
 WORKDIR /app
 
 # Install system dependencies
-RUN apt-get update && apt-get install -y ffmpeg \
+RUN apt-get update && apt-get install -y \
+    ffmpeg libsndfile1 \
     ca-certificates curl dnsutils gcc openssl && \
     rm -rf /var/lib/apt/lists/*
 
app.py CHANGED
@@ -17,7 +17,8 @@ from google.genai import types
 # ASR
 import numpy as np
 from pydub import AudioSegment
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
+import torch
+from transformers import pipeline
 
 # Misc
 from PIL import Image
@@ -28,7 +29,7 @@ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 if not GEMINI_API_KEY:
     raise RuntimeError("❌ GEMINI_API_KEY must be set as env var")
 
-ASR_MODEL_ID = "openai/whisper-small.en"
+ASR_MODEL_ID = "openai/whisper-large-v3"  # was whisper-small.en
 ASR_LANGUAGE = "en"
 SAMPLE_RATE = 16_000
 
@@ -73,7 +74,7 @@ check_system_resources()
 ##############################################################################
 
 # Global ASR (lazy-loaded)
-processor = model = None
+asr_pipe = None
 
 def build_prompt(question: str) -> str:
     return (
@@ -87,14 +88,15 @@ def memory_mb() -> float:
 
 @app.on_event("startup")
 async def load_models():
-    global processor, model
-    cache = Path("model_cache"); cache.mkdir(exist_ok=True)
-    processor = WhisperProcessor.from_pretrained(ASR_MODEL_ID, cache_dir=cache)
-    model = WhisperForConditionalGeneration.from_pretrained(ASR_MODEL_ID, cache_dir=cache)
-    forced = processor.get_decoder_prompt_ids(task="transcribe", language="english")
-    model.config.forced_decoder_ids = forced
-    model.to("cpu").eval()
-    logger.info("[MODEL] 🔊 Whisper loaded ✔")
+    global asr_pipe
+    device = 0 if torch.cuda.is_available() else "cpu"
+    asr_pipe = pipeline(
+        task="automatic-speech-recognition",
+        model=ASR_MODEL_ID,
+        chunk_length_s=30,
+        device=device,
+    )
+    logger.info("[MODEL] 🔊 Whisper-v3 pipeline loaded ✔")
 
 @app.get("/")
 async def root() -> FileResponse:  # serve SPA
@@ -146,11 +148,9 @@ async def voice_transcribe(file: UploadFile = File(...)):
         tmp.write(await file.read()); tmp_path = tmp.name
     # Audio processing and transcription
     try:
-        seg = AudioSegment.from_file(tmp_path).set_frame_rate(SAMPLE_RATE).set_channels(1)
-        audio = np.array(seg.get_array_of_samples()).astype(np.float32) / (2 ** 15)
-        inputs = processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt")
-        ids = model.generate(inputs.input_features.to(model.device))
-        question = processor.decode(ids[0], skip_special_tokens=True).strip()
+        # Directly pass file path to the pipeline
+        result = asr_pipe(tmp_path, batch_size=8, generate_kwargs={"task": "transcribe"})
+        question = result["text"].strip()
         if not question:
             raise ValueError("No speech detected")
         logger.info(f"[MODEL] Transcribed text: {question}")
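
For context outside the diff, the new transcription path is just the Hugging Face transformers ASR pipeline fed a file path. A minimal standalone sketch under the same settings (the audio file name is a placeholder, not a file from this repo):

# Sketch of the whisper-large-v3 pipeline as configured above; "sample.wav" is a placeholder.
import torch
from transformers import pipeline

ASR_MODEL_ID = "openai/whisper-large-v3"

asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_ID,
    chunk_length_s=30,                 # long-form audio is processed in 30 s chunks
    device=0 if torch.cuda.is_available() else "cpu",
)

result = asr_pipe("sample.wav", generate_kwargs={"task": "transcribe"})
print(result["text"].strip())

Passing a path (rather than a decoded numpy array) lets ffmpeg handle decoding and resampling inside the pipeline, which is why the pydub/numpy preprocessing could be dropped from the endpoint.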
requirements.txt CHANGED
@@ -1,24 +1,23 @@
 # Core server
 fastapi
 uvicorn[standard]
-aiofiles             # Static files
+aiofiles
 
-# Voice-to-text (Whisper via Transformers)
-transformers         # For whisper
-torch                # Just to run transformer so don't remove
+# Voice-to-text (Whisper v3 via Transformers)
+transformers
+torch
 accelerate
+ffmpeg-python
 
 # Audio & Image
 pydub
-ffmpeg-python
-openai-whisper       # pulls tiny-en / small-en
 pillow
 
 # Gemini Flash 2.5
 google-genai
 
 # Utilities
-psutil               # Lightweight health logging
-python-multipart     # File uploads
+psutil
+python-multipart
 huggingface_hub
-python-dotenv
+python-dotenv
statics/index.html CHANGED
@@ -13,9 +13,9 @@
 <body>
   <main class="container">
     <h1>Interview Q&amp;A Assistant</h1>
-    <p class="subtitle">🎙&nbsp;Hold the button, ask your interview question, release to get an answer.<br>
-    📸&nbsp;Or upload a screenshot of the question.</p>
-    <button id="record-button" class="record-btn">🎤 Hold&nbsp;to&nbsp;Ask</button>
+    <p class="subtitle">🎙 Click <strong>Start Recording</strong>, ask your question, then click <strong>Stop</strong> to transcribe.<br>
+    📸 Or upload a screenshot of the question.</p>
+    <button id="record-button" class="record-btn">🎤 Start Recording</button>
     <button id="screenshot-button" class="screenshot-btn">📸 Drop&nbsp;your&nbsp;Screenshot</button>
     <input id="file-input" type="file" accept="image/*" hidden />
     <section class="output-section">
statics/script.js CHANGED
@@ -22,20 +22,22 @@ function typeEffect(el, text, speed = 30) {
 /* ─────────────────── Abort-controller wrapper ───────────────── */
 let currentController = null;
 function fetchWithAbort(url, opts = {}) {
-  if (currentController) currentController.abort(); // cancel previous req
+  if (currentController) currentController.abort(); // cancel previous req
   currentController = new AbortController();
   return fetch(url, { ...opts, signal: currentController.signal });
 }
 
 /* ─────────────────── Audio recording setup ─────────────────── */
-let mediaRecorder, chunks = [];
+let mediaRecorder, chunks = [], isRecording = false;
+
 async function initMedia() {
   const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
   mediaRecorder = new MediaRecorder(stream);
-
   mediaRecorder.ondataavailable = e => chunks.push(e.data);
 
   mediaRecorder.onstop = async () => {
+    recordBtn.textContent = "🎤 Start Recording";
+    isRecording = false;
     const audioBlob = new Blob(chunks, { type: "audio/wav" });
     chunks = [];
 
@@ -56,7 +58,20 @@ async function initMedia() {
   };
 }
 
-/* ─────────────── Screenshot / image-question upload ─────────── */
+/* ─────────────── Click-to-record UX ─────────────── */
+recordBtn.addEventListener("click", () => {
+  if (!mediaRecorder) return;
+  if (isRecording) {
+    mediaRecorder.stop();
+  } else {
+    chunks = [];
+    mediaRecorder.start();
+    recordBtn.textContent = "🎤 Stop Recording";
+    isRecording = true;
+  }
+});
+
+/* ─────────────── Screenshot upload ─────────────── */
 fileInput.addEventListener("change", async (e) => {
   const file = e.target.files[0];
   if (!file) return;
@@ -79,15 +94,7 @@ fileInput.addEventListener("change", async (e) => {
 });
 screenshotBtn.addEventListener("click", () => fileInput.click());
 
-/* ─────────────────── Hold-to-record UX ─────────────────────── */
-function bindRecordBtn() {
-  recordBtn.addEventListener("mousedown", () => mediaRecorder.start());
-  recordBtn.addEventListener("mouseup", () => mediaRecorder.stop());
-  recordBtn.addEventListener("touchstart", e => { e.preventDefault(); mediaRecorder.start(); });
-  recordBtn.addEventListener("touchend", e => { e.preventDefault(); mediaRecorder.stop(); });
-}
-
-/* ─────────────────── Editable question block ───────────────── */
+/* ─────────────── Editable question block ─────────────── */
 function enableEdit() {
   questionEl.contentEditable = "true";
   questionEl.classList.add("editing");
@@ -121,10 +128,9 @@ questionEl.addEventListener("keydown", (e) => {
   }
 });
 
-/* ─────────────────────── helpers ───────────────────────────── */
+/* ─────────────── render helpers ─────────────── */
 function displayQa(data) {
   let qHtml = "", aHtml = "";
-  // Parse and bind JSON now as Q&A can be an array with more than 1 component(s)
   const qaList = Array.isArray(data) ? data : [data];
   qaList.forEach((item, idx) => {
     const q = item.question || "[no question]";
@@ -133,17 +139,14 @@ function displayQa(data) {
     aHtml += `<strong>Q${idx + 1}:</strong> ${DOMPurify.sanitize(marked.parseInline(q))}<br>`;
     aHtml += `<strong>A${idx + 1}:</strong> ${DOMPurify.sanitize(marked.parse(a))}<hr>`;
   });
-  // Type effect with trimming element
   typeEffect(questionEl, qHtml.trim());
   setTimeout(() => { answerEl.innerHTML = aHtml.trim(); }, 400);
 }
 
-
-/* ─────────────────────── bootstrap ─────────────────────────── */
+/* ─────────────── init ─────────────── */
 window.addEventListener("DOMContentLoaded", async () => {
   try {
     await initMedia();
-    bindRecordBtn();
   } catch {
     alert("Microphone permission is required.");
   }
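
Taken together, the front end now records on a click toggle and uploads the blob, and the backend transcribes it with the whisper-large-v3 pipeline. A hypothetical Python client exercising the same handler (the /voice_transcribe route path and port are assumptions; only the voice_transcribe signature appears in the diff):

# Hypothetical client; the URL, port, and route path are assumed, not taken from the diff.
import requests

URL = "http://localhost:7860/voice_transcribe"

with open("question.wav", "rb") as f:      # placeholder recording
    resp = requests.post(URL, files={"file": ("question.wav", f, "audio/wav")})

resp.raise_for_status()
print(resp.json())                         # payload returned by the endpoint (transcribed question and answer)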