Spaces:
Running
Running
Commit
·
6e2027d
1
Parent(s):
abebb93
Rm torchaudio, use resampy
Browse files
- app.py +16 -4
- requirements.txt +3 -3
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
# Access site: https://binkhoale1812-interview-ai.hf.space/
|
2 |
import os
|
|
|
3 |
import tempfile
|
4 |
from pathlib import Path
|
5 |
from typing import Dict
|
@@ -12,12 +13,13 @@ from fastapi.staticfiles import StaticFiles
|
|
12 |
|
13 |
# AI + LLM
|
14 |
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
15 |
-
import torch
|
16 |
-
os.environ["NUMBA_DISABLE_CACHE"] = "1"
|
17 |
-
import librosa
|
18 |
from google import genai
|
19 |
from google.genai import types
|
20 |
|
|
|
|
|
|
|
|
|
21 |
|
22 |
############################################
|
23 |
# ── Configuration ────────────────────────
|
@@ -132,7 +134,17 @@ async def voice_transcribe(file: UploadFile = File(...)): # noqa: B008
|
|
132 |
tmp_path = tmp.name
|
133 |
try:
|
134 |
# ── 1. Transcribe
|
135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
inputs = processor(speech, sampling_rate=sample_rate, return_tensors="pt")
|
137 |
input_features = inputs["input_features"].to("cpu")
|
138 |
generated_ids = model.generate(input_features)
|
|
|
1 |
# Access site: https://binkhoale1812-interview-ai.hf.space/
|
2 |
import os
|
3 |
+
os.environ["NUMBA_DISABLE_CACHE"] = "1"
|
4 |
import tempfile
|
5 |
from pathlib import Path
|
6 |
from typing import Dict
|
|
|
13 |
|
14 |
# AI + LLM
|
15 |
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
|
|
|
|
|
|
16 |
from google import genai
|
17 |
from google.genai import types
|
18 |
|
19 |
+
# Audio Transcribe
|
20 |
+
from scipy.io import wavfile
|
21 |
+
import resampy
|
22 |
+
import numpy as np
|
23 |
|
24 |
############################################
|
25 |
# ── Configuration ────────────────────────
|
|
|
134 |
tmp_path = tmp.name
|
135 |
try:
|
136 |
# ── 1. Transcribe
|
137 |
+
# Load WAV
|
138 |
+
sample_rate, data = wavfile.read(tmp_path)
|
139 |
+
# Convert to float32 if needed
|
140 |
+
if data.dtype != np.float32:
|
141 |
+
data = data.astype(np.float32) / np.iinfo(data.dtype).max
|
142 |
+
# Resample to 16 kHz for Whisper
|
143 |
+
if sample_rate != 16000:
|
144 |
+
data = resampy.resample(data, sample_rate, 16000)
|
145 |
+
sample_rate = 16000
|
146 |
+
# Obtain speech and process to tensor
|
147 |
+
speech = data
|
148 |
inputs = processor(speech, sampling_rate=sample_rate, return_tensors="pt")
|
149 |
input_features = inputs["input_features"].to("cpu")
|
150 |
generated_ids = model.generate(input_features)
|
requirements.txt
CHANGED
@@ -4,11 +4,11 @@ uvicorn[standard]
|
|
4 |
aiofiles # Static file serving
|
5 |
python-multipart # File uploads
|
6 |
|
7 |
-
# Voice‑to‑text (Whisper via
|
8 |
transformers==4.38.2 # ensure recent enough
|
9 |
-
torch
|
10 |
huggingface_hub
|
11 |
-
|
|
|
12 |
|
13 |
# Gemini Flash 2.5
|
14 |
google-genai
|
|
|
4 |
aiofiles # Static file serving
|
5 |
python-multipart # File uploads
|
6 |
|
7 |
+
# Voice‑to‑text (Whisper via Transformers)
|
8 |
transformers==4.38.2 # ensure recent enough
|
|
|
9 |
huggingface_hub
|
10 |
+
scipy
|
11 |
+
resampy
|
12 |
|
13 |
# Gemini Flash 2.5
|
14 |
google-genai
|