LiamKhoaLe committed on
Commit 6e2027d · 1 Parent(s): abebb93

Rm torchaudio, use resampy

Files changed (2)
  1. app.py +16 -4
  2. requirements.txt +3 -3
app.py CHANGED
@@ -1,5 +1,6 @@
 # Access site: https://binkhoale1812-interview-ai.hf.space/
 import os
+os.environ["NUMBA_DISABLE_CACHE"] = "1"
 import tempfile
 from pathlib import Path
 from typing import Dict
@@ -12,12 +13,13 @@ from fastapi.staticfiles import StaticFiles
 
 # AI + LLM
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
-import torch
-os.environ["NUMBA_DISABLE_CACHE"] = "1"
-import librosa
 from google import genai
 from google.genai import types
 
+# Audio Transcribe
+from scipy.io import wavfile
+import resampy
+import numpy as np
 
 ############################################
 # ── Configuration ────────────────────────
@@ -132,7 +134,17 @@ async def voice_transcribe(file: UploadFile = File(...)): # noqa: B008
         tmp_path = tmp.name
     try:
         # ── 1. Transcribe
-        speech, sample_rate = librosa.load(tmp_path, sr=16000)
+        # Load WAV
+        sample_rate, data = wavfile.read(tmp_path)
+        # Convert to float32 if needed
+        if data.dtype != np.float32:
+            data = data.astype(np.float32) / np.iinfo(data.dtype).max
+        # Resample to 16 kHz for Whisper
+        if sample_rate != 16000:
+            data = resampy.resample(data, sample_rate, 16000)
+            sample_rate = 16000
+        # Obtain speech and process to tensor
+        speech = data
         inputs = processor(speech, sampling_rate=sample_rate, return_tensors="pt")
         input_features = inputs["input_features"].to("cpu")
         generated_ids = model.generate(input_features)
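
For reference, a minimal standalone sketch of the new scipy + resampy loading path introduced above. The helper name load_audio_16k, the float-WAV guard, and the stereo down-mix are illustrative additions and not part of this commit, which reads the temp file inline and assumes mono integer PCM:

import numpy as np
import resampy
from scipy.io import wavfile

def load_audio_16k(path: str) -> np.ndarray:
    # Hypothetical helper mirroring the inline code in voice_transcribe above.
    sample_rate, data = wavfile.read(path)
    if np.issubdtype(data.dtype, np.integer):
        # Integer PCM -> float32 in [-1, 1]
        data = data.astype(np.float32) / np.iinfo(data.dtype).max
    else:
        # Float WAVs are already in [-1, 1] (guard added here as an assumption)
        data = data.astype(np.float32)
    if data.ndim > 1:
        # Down-mix multi-channel audio to mono (assumption; not in the commit)
        data = data.mean(axis=1)
    if sample_rate != 16000:
        # Whisper expects 16 kHz input
        data = resampy.resample(data, sample_rate, 16000)
    return data

# Usage mirroring the endpoint: feed the array to the Whisper processor
# speech = load_audio_16k(tmp_path)
# inputs = processor(speech, sampling_rate=16000, return_tensors="pt")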
requirements.txt CHANGED
@@ -4,11 +4,11 @@ uvicorn[standard]
 aiofiles          # Static file serving
 python-multipart  # File uploads
 
-# Voice‑to‑text (Whisper via 🤗 Transformers)
+# Voice‑to‑text (Whisper via Transformers)
 transformers==4.38.2  # ensure recent enough
-torch
 huggingface_hub
-librosa
+scipy
+resampy
 
 # Gemini Flash 2.5
 google-genai
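
As a quick sanity check after installing the updated requirements, the swapped-in dependencies can be exercised directly; note that resampy depends on numba, which is presumably why app.py now sets NUMBA_DISABLE_CACHE=1 before any other imports. A small sketch, not part of the commit:

import numpy as np
import resampy

# One second of a 440 Hz tone at 44.1 kHz, resampled to Whisper's 16 kHz.
tone = np.sin(2 * np.pi * 440 * np.arange(44100) / 44100).astype(np.float32)
out = resampy.resample(tone, 44100, 16000)
print(out.dtype, out.shape)  # float32, roughly 16000 samples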