Commit 218573f
Parent(s): f1e695a
Change to torchaudio

Files changed:
- app.py (+3 -1)
- requirements.txt (+1 -0)
app.py
CHANGED
@@ -13,6 +13,7 @@ from fastapi.staticfiles import StaticFiles
 # AI + LLM
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import torch
+import torchaudio
 import soundfile as sf
 from google import genai
 from google.genai import types
@@ -130,7 +131,8 @@ async def voice_transcribe(file: UploadFile = File(...)): # noqa: B008
         tmp_path = tmp.name
     try:
         # ── 1. Transcribe
-
+        waveform, sample_rate = torchaudio.load(tmp_path)
+        speech = waveform[0].numpy()  # Convert to numpy for WhisperProcessor
         inputs = processor(speech, sampling_rate=sample_rate, return_tensors="pt")
         input_features = inputs["input_features"].to("cpu")
         generated_ids = model.generate(input_features)
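For context, the new transcription path reads roughly as sketched below. This is a minimal standalone sketch, not the Space's actual code: the checkpoint name (openai/whisper-small), the example file path, and the 16 kHz resampling guard are assumptions added here, since torchaudio.load returns audio at the file's native sample rate while Whisper's feature extractor expects 16 kHz.

# Minimal sketch of the torchaudio-based transcription flow; the checkpoint name,
# input path, and resampling step are assumptions, not part of the commit.
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration

processor = WhisperProcessor.from_pretrained("openai/whisper-small")             # assumed checkpoint
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")  # assumed checkpoint

waveform, sample_rate = torchaudio.load("upload.wav")  # hypothetical input file
if sample_rate != 16000:
    # Whisper's feature extractor expects 16 kHz audio; resample anything else first.
    waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    sample_rate = 16000
speech = waveform[0].numpy()  # first channel, as numpy for WhisperProcessor

inputs = processor(speech, sampling_rate=sample_rate, return_tensors="pt")
input_features = inputs["input_features"].to("cpu")
generated_ids = model.generate(input_features)
text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(text)

Note that the hunk above passes sampling_rate straight from torchaudio.load to the processor, so the resampling guard in this sketch is only a suggestion for uploads that are not already 16 kHz.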
requirements.txt
CHANGED
@@ -8,6 +8,7 @@ python-multipart # File uploads
 soundfile
 transformers==4.38.2 # ensure recent enough
 torch
+torchaudio>=2.1.0
 huggingface_hub
 
 # Gemini Flash 2.5
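As a quick way to confirm a rebuilt Space actually picked up the new pin, a hypothetical check (not part of the commit) could be:

# Hypothetical sanity check that the installed torchaudio satisfies the >=2.1.0 pin.
import torchaudio
from packaging.version import Version

installed = Version(torchaudio.__version__.split("+")[0])  # strip local tags such as "+cpu"
assert installed >= Version("2.1.0"), f"torchaudio {installed} is older than the pinned 2.1.0"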