Spaces:
Running
Running
File size: 3,078 Bytes
9aaf513 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
import functools
import numpy as np
from faster_whisper.vad import VadOptions
from fastapi import (
File,
UploadFile,
)
from fastapi import APIRouter, BackgroundTasks, Depends, Response, status
from typing import List, Dict
from datetime import datetime
from modules.vad.silero_vad import SileroVAD
from modules.whisper.data_classes import VadParams
from backend.common.audio import read_audio
from backend.common.models import QueueResponse
from backend.db.task.dao import add_task_to_db, update_task_status_in_db
from backend.db.task.models import TaskStatus, TaskType
vad_router = APIRouter(prefix="/vad", tags=["Voice Activity Detection"])
@functools.lru_cache
def get_vad_model() -> SileroVAD:
    """Return a process-wide SileroVAD instance with its model loaded.

    Decorating a zero-argument function with `functools.lru_cache` memoizes
    the first call, so construction and `update_model()` (which loads the
    model weights) happen exactly once per process.
    """
    model = SileroVAD()
    model.update_model()
    return model
def run_vad(
    audio: np.ndarray,
    params: VadOptions,
    identifier: str,
) -> List[Dict]:
    """Run Silero VAD over `audio` and persist task progress in the DB.

    Intended to be executed as a FastAPI background task: it marks the task
    IN_PROGRESS, runs the cached VAD model, then records the result and
    elapsed wall-clock time on completion.

    Args:
        audio: Audio samples as a numpy array.
        params: faster-whisper `VadOptions` controlling detection thresholds.
        identifier: Task UUID used as the DB key for status updates.

    Returns:
        The list of speech-chunk dicts produced by the VAD model.

    Raises:
        Any exception raised by the VAD run is re-raised after the task is
        marked failed, so the background-task runner still sees the error.
    """
    update_task_status_in_db(
        identifier=identifier,
        update_data={
            "uuid": identifier,
            "status": TaskStatus.IN_PROGRESS,
            "updated_at": datetime.utcnow()
        }
    )

    start_time = datetime.utcnow()
    try:
        audio, speech_chunks = get_vad_model().run(
            audio=audio,
            vad_parameters=params
        )
    except Exception as e:
        # Without this, a crash inside the model would leave the task stuck
        # in IN_PROGRESS forever with no visible failure record.
        # NOTE(review): assumes TaskStatus.FAILED and an "error" field exist
        # on the task model — confirm against backend.db.task.models.
        update_task_status_in_db(
            identifier=identifier,
            update_data={
                "uuid": identifier,
                "status": TaskStatus.FAILED,
                "updated_at": datetime.utcnow(),
                "error": str(e)
            }
        )
        raise
    elapsed_time = (datetime.utcnow() - start_time).total_seconds()

    update_task_status_in_db(
        identifier=identifier,
        update_data={
            "uuid": identifier,
            "status": TaskStatus.COMPLETED,
            "updated_at": datetime.utcnow(),
            "result": speech_chunks,
            "duration": elapsed_time
        }
    )
    return speech_chunks
@vad_router.post(
    "/",
    response_model=QueueResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Voice Activity Detection",
    description="Detect voice parts in the provided audio or video file to generate a timeline of speech segments.",
)
async def vad(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(..., description="Audio or video file to detect voices."),
    params: VadParams = Depends()
) -> QueueResponse:
    """Queue a VAD task for the uploaded file and return its identifier.

    Decodes the upload, records a QUEUED task row, and schedules `run_vad`
    to execute after the HTTP response has been sent.
    """
    # NOTE(review): `file` is declared as UploadFile, so the ndarray branch
    # looks unreachable through FastAPI — kept for behavioral parity.
    if isinstance(file, np.ndarray):
        audio, info = file, None
    else:
        audio, info = await read_audio(file=file)

    vad_options = VadOptions(
        threshold=params.threshold,
        min_speech_duration_ms=params.min_speech_duration_ms,
        max_speech_duration_s=params.max_speech_duration_s,
        min_silence_duration_ms=params.min_silence_duration_ms,
        speech_pad_ms=params.speech_pad_ms
    )

    identifier = add_task_to_db(
        status=TaskStatus.QUEUED,
        file_name=file.filename,
        audio_duration=info.duration if info else None,
        task_type=TaskType.VAD,
        task_params=params.model_dump(),
    )

    background_tasks.add_task(run_vad, audio=audio, params=vad_options, identifier=identifier)

    return QueueResponse(identifier=identifier, status=TaskStatus.QUEUED, message="VAD task has queued")
|