LiamKhoaLe committed
Commit aeade20 · Parent: f4c538b

Migrate to Whisper large-v3 transcription; switch the UI from hold-to-record to click-to-record.

Files changed (5):
  1. Dockerfile +2 -1
  2. app.py +16 -16
  3. requirements.txt +8 -9
  4. statics/index.html +3 -3
  5. statics/script.js +22 -19
Dockerfile CHANGED
@@ -11,7 +11,8 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
 WORKDIR /app
 
 # Install system dependencies
-RUN apt-get update && apt-get install -y ffmpeg \
+RUN apt-get update && apt-get install -y \
+    ffmpeg libsndfile1 \
     ca-certificates curl dnsutils gcc openssl && \
     rm -rf /var/lib/apt/lists/*
 
app.py CHANGED
@@ -17,7 +17,8 @@ from google.genai import types
 # ASR
 import numpy as np
 from pydub import AudioSegment
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
+import torch
+from transformers import pipeline
 
 # Misc
 from PIL import Image
@@ -28,7 +29,7 @@ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 if not GEMINI_API_KEY:
     raise RuntimeError("❌ GEMINI_API_KEY must be set as env var")
 
-ASR_MODEL_ID = "openai/whisper-small.en"
+ASR_MODEL_ID = "openai/whisper-large-v3"  # was whisper-small.en
 ASR_LANGUAGE = "en"
 SAMPLE_RATE = 16_000
 
@@ -73,7 +74,7 @@ check_system_resources()
 ##############################################################################
 
 # Global ASR (lazy-loaded)
-processor = model = None
+asr_pipe = None
 
 def build_prompt(question: str) -> str:
     return (
@@ -87,14 +88,15 @@ def memory_mb() -> float:
 
 @app.on_event("startup")
 async def load_models():
-    global processor, model
-    cache = Path("model_cache"); cache.mkdir(exist_ok=True)
-    processor = WhisperProcessor.from_pretrained(ASR_MODEL_ID, cache_dir=cache)
-    model = WhisperForConditionalGeneration.from_pretrained(ASR_MODEL_ID, cache_dir=cache)
-    forced = processor.get_decoder_prompt_ids(task="transcribe", language="english")
-    model.config.forced_decoder_ids = forced
-    model.to("cpu").eval()
-    logger.info("[MODEL] 🔊 Whisper loaded ✔")
+    global asr_pipe
+    device = 0 if torch.cuda.is_available() else "cpu"
+    asr_pipe = pipeline(
+        task="automatic-speech-recognition",
+        model=ASR_MODEL_ID,
+        chunk_length_s=30,
+        device=device,
+    )
+    logger.info("[MODEL] 🔊 Whisper-v3 pipeline loaded ✔")
 
 @app.get("/")
 async def root() -> FileResponse:  # serve SPA
@@ -146,11 +148,9 @@ async def voice_transcribe(file: UploadFile = File(...)):
         tmp.write(await file.read()); tmp_path = tmp.name
     # Audio processing and transcription
     try:
-        seg = AudioSegment.from_file(tmp_path).set_frame_rate(SAMPLE_RATE).set_channels(1)
-        audio = np.array(seg.get_array_of_samples()).astype(np.float32) / (2 ** 15)
-        inputs = processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt")
-        ids = model.generate(inputs.input_features.to(model.device))
-        question = processor.decode(ids[0], skip_special_tokens=True).strip()
+        # Directly pass file path to the pipeline
+        result = asr_pipe(tmp_path, batch_size=8, generate_kwargs={"task": "transcribe"})
+        question = result["text"].strip()
         if not question:
             raise ValueError("No speech detected")
         logger.info(f"[MODEL] Transcribed text: {question}")
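
For context outside the diff, the new transcription path is just the Hugging Face transformers ASR pipeline fed a file path. A minimal standalone sketch under the same settings (the audio file name is a placeholder, not a file from this repo):

# Sketch of the whisper-large-v3 pipeline as configured above; "sample.wav" is a placeholder.
import torch
from transformers import pipeline

ASR_MODEL_ID = "openai/whisper-large-v3"

asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_ID,
    chunk_length_s=30,                 # long-form audio is processed in 30 s chunks
    device=0 if torch.cuda.is_available() else "cpu",
)

result = asr_pipe("sample.wav", generate_kwargs={"task": "transcribe"})
print(result["text"].strip())

Passing a path (rather than a decoded numpy array) lets ffmpeg handle decoding and resampling inside the pipeline, which is why the pydub/numpy preprocessing could be dropped from the endpoint.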
requirements.txt CHANGED
@@ -1,24 +1,23 @@
 # Core server
 fastapi
 uvicorn[standard]
-aiofiles             # Static files
+aiofiles
 
-# Voice-to-text (Whisper via Transformers)
-transformers         # For whisper
-torch                # Just to run transformer so don't remove
+# Voice-to-text (Whisper v3 via Transformers)
+transformers
+torch
 accelerate
+ffmpeg-python
 
 # Audio & Image
 pydub
-ffmpeg-python
-openai-whisper       # pulls tiny-en / small-en
 pillow
 
 # Gemini Flash 2.5
 google-genai
 
 # Utilities
-psutil               # Lightweight health logging
-python-multipart     # File uploads
+psutil
+python-multipart
 huggingface_hub
-python-dotenv
+python-dotenv
statics/index.html CHANGED
@@ -13,9 +13,9 @@
 <body>
   <main class="container">
     <h1>Interview Q&amp;A Assistant</h1>
-    <p class="subtitle">🎙&nbsp;Hold the button, ask your interview question, release to get an answer.<br>
-    📸&nbsp;Or upload a screenshot of the question.</p>
-    <button id="record-button" class="record-btn">🎤 Hold&nbsp;to&nbsp;Ask</button>
+    <p class="subtitle">🎙 Click <strong>Start Recording</strong>, ask your question, then click <strong>Stop</strong> to transcribe.<br>
+    📸 Or upload a screenshot of the question.</p>
+    <button id="record-button" class="record-btn">🎤 Start Recording</button>
     <button id="screenshot-button" class="screenshot-btn">📸 Drop&nbsp;your&nbsp;Screenshot</button>
     <input id="file-input" type="file" accept="image/*" hidden />
     <section class="output-section">
statics/script.js CHANGED
@@ -22,20 +22,22 @@ function typeEffect(el, text, speed = 30) {
 /* ─────────────────── Abort-controller wrapper ───────────────── */
 let currentController = null;
 function fetchWithAbort(url, opts = {}) {
-  if (currentController) currentController.abort(); // cancel previous req
+  if (currentController) currentController.abort(); // cancel previous req
   currentController = new AbortController();
   return fetch(url, { ...opts, signal: currentController.signal });
 }
 
 /* ─────────────────── Audio recording setup ─────────────────── */
-let mediaRecorder, chunks = [];
+let mediaRecorder, chunks = [], isRecording = false;
+
 async function initMedia() {
   const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
   mediaRecorder = new MediaRecorder(stream);
-
   mediaRecorder.ondataavailable = e => chunks.push(e.data);
 
   mediaRecorder.onstop = async () => {
+    recordBtn.textContent = "🎤 Start Recording";
+    isRecording = false;
     const audioBlob = new Blob(chunks, { type: "audio/wav" });
     chunks = [];
 
@@ -56,7 +58,20 @@ async function initMedia() {
   };
 }
 
-/* ─────────────── Screenshot / image-question upload ─────────── */
+/* ─────────────── Click-to-record UX ─────────────── */
+recordBtn.addEventListener("click", () => {
+  if (!mediaRecorder) return;
+  if (isRecording) {
+    mediaRecorder.stop();
+  } else {
+    chunks = [];
+    mediaRecorder.start();
+    recordBtn.textContent = "🎤 Stop Recording";
+    isRecording = true;
+  }
+});
+
+/* ─────────────── Screenshot upload ─────────────── */
 fileInput.addEventListener("change", async (e) => {
   const file = e.target.files[0];
   if (!file) return;
@@ -79,15 +94,7 @@ fileInput.addEventListener("change", async (e) => {
 });
 screenshotBtn.addEventListener("click", () => fileInput.click());
 
-/* ─────────────────── Hold-to-record UX ─────────────────────── */
-function bindRecordBtn() {
-  recordBtn.addEventListener("mousedown", () => mediaRecorder.start());
-  recordBtn.addEventListener("mouseup", () => mediaRecorder.stop());
-  recordBtn.addEventListener("touchstart", e => { e.preventDefault(); mediaRecorder.start(); });
-  recordBtn.addEventListener("touchend", e => { e.preventDefault(); mediaRecorder.stop(); });
-}
-
-/* ─────────────────── Editable question block ───────────────── */
+/* ─────────────── Editable question block ─────────────── */
 function enableEdit() {
   questionEl.contentEditable = "true";
   questionEl.classList.add("editing");
@@ -121,10 +128,9 @@ questionEl.addEventListener("keydown", (e) => {
   }
 });
 
-/* ─────────────────────── helpers ───────────────────────────── */
+/* ─────────────── render helpers ─────────────── */
 function displayQa(data) {
   let qHtml = "", aHtml = "";
-  // Parse and bind JSON now as Q&A can be an array with more than 1 component(s)
   const qaList = Array.isArray(data) ? data : [data];
   qaList.forEach((item, idx) => {
     const q = item.question || "[no question]";
@@ -133,17 +139,14 @@ function displayQa(data) {
     aHtml += `<strong>Q${idx + 1}:</strong> ${DOMPurify.sanitize(marked.parseInline(q))}<br>`;
     aHtml += `<strong>A${idx + 1}:</strong> ${DOMPurify.sanitize(marked.parse(a))}<hr>`;
   });
-  // Type effect with trimming element
   typeEffect(questionEl, qHtml.trim());
   setTimeout(() => { answerEl.innerHTML = aHtml.trim(); }, 400);
 }
 
-
-/* ─────────────────────── bootstrap ─────────────────────────── */
+/* ─────────────── init ─────────────── */
 window.addEventListener("DOMContentLoaded", async () => {
   try {
     await initMedia();
-    bindRecordBtn();
   } catch {
     alert("Microphone permission is required.");
   }
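
Taken together, the front end now records on a click toggle and uploads the blob, and the backend transcribes it with the whisper-large-v3 pipeline. A hypothetical Python client exercising the same handler (the /voice_transcribe route path and port are assumptions; only the voice_transcribe signature appears in the diff):

# Hypothetical client; the URL, port, and route path are assumed, not taken from the diff.
import requests

URL = "http://localhost:7860/voice_transcribe"

with open("question.wav", "rb") as f:      # placeholder recording
    resp = requests.post(URL, files={"file": ("question.wav", f, "audio/wav")})

resp.raise_for_status()
print(resp.json())                         # payload returned by the endpoint (transcribed question and answer)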