Spaces:

soiz1
/

Whisper-WebUI

Running

File size: 2,797 Bytes

9aaf513

from modules.whisper.whisper_factory import WhisperFactory
from modules.whisper.data_classes import *
from modules.utils.subtitle_manager import read_file
from modules.utils.paths import WEBUI_DIR
from test_config import *

import requests
import pytest
import gradio as gr
import os


def _first_cue_wer(reference: str, subtitle_path):
    """Score the first subtitle cue of a generated file against *reference*.

    The file at *subtitle_path* is loaded with ``read_file`` and split on
    newlines. The third line (index 2) is used as the hypothesis; for SRT
    output this is expected to be the text of the first cue, following the
    cue index and timestamp lines. Commas and periods are removed before
    scoring so punctuation differences are not counted as word errors.

    Returns:
        Whatever ``calculate_wer(reference, hypothesis)`` returns.
    """
    lines = read_file(subtitle_path).split("\n")
    # Fail with a readable message rather than a bare IndexError when the
    # generated subtitle file is empty or truncated.
    assert len(lines) > 2, (
        f"Subtitle file {subtitle_path!r} has only {len(lines)} line(s); "
        "expected at least 3"
    )
    hypothesis = lines[2].strip().replace(",", "").replace(".", "")
    return calculate_wer(reference, hypothesis)


@pytest.mark.parametrize(
    "whisper_type,vad_filter,bgm_separation,diarization",
    [
        (WhisperImpl.WHISPER.value, False, False, False),
        (WhisperImpl.FASTER_WHISPER.value, False, False, False),
        (WhisperImpl.INSANELY_FAST_WHISPER.value, False, False, False),
    ],
)
def test_transcribe(
    whisper_type: str,
    vad_filter: bool,
    bgm_separation: bool,
    diarization: bool,
):
    """Exercise file, YouTube and microphone transcription for one backend.

    An inferencer is created for ``whisper_type`` and ``TEST_FILE_PATH`` is
    transcribed through ``transcribe_file`` and ``transcribe_mic``; in both
    cases the WER of the first subtitle cue against the expected answer must
    be below 0.1. The YouTube path is only smoke-tested (non-empty string
    result and an existing output file) and is skipped entirely when
    ``is_pytube_detected_bot()`` returns a truthy value.

    Args:
        whisper_type: Value of a ``WhisperImpl`` member selecting the backend.
        vad_filter: Forwarded to ``VadParams(vad_filter=...)``.
        bgm_separation: Forwarded to ``BGMSeparationParams(is_separate_bgm=...)``.
        diarization: Forwarded to ``DiarizationParams(is_diarize=...)``; when
            true, the expected text is prefixed with ``SPEAKER_00|``.
    """
    audio_path = TEST_FILE_PATH

    answer = TEST_ANSWER
    if diarization:
        answer = "SPEAKER_00|"+TEST_ANSWER

    whisper_inferencer = WhisperFactory.create_whisper_inference(
        whisper_type=whisper_type,
    )
    print(
        f"""Whisper Device : {whisper_inferencer.device}\n"""
        f"""BGM Separation Device: {whisper_inferencer.music_separator.device}\n"""
        f"""Diarization Device: {whisper_inferencer.diarizer.device}"""
    )

    # The pipeline parameters are flattened to a list because the
    # transcribe_* entry points below receive them as trailing positionals.
    hparams = TranscriptionPipelineParams(
        whisper=WhisperParams(
            model_size=TEST_WHISPER_MODEL,
            compute_type=whisper_inferencer.current_compute_type
        ),
        vad=VadParams(
            vad_filter=vad_filter
        ),
        bgm_separation=BGMSeparationParams(
            is_separate_bgm=bgm_separation,
            enable_offload=True
        ),
        diarization=DiarizationParams(
            is_diarize=diarization
        ),
    ).to_list()

    # --- File transcription -------------------------------------------------
    # NOTE(review): the three None arguments and the False flag are passed
    # positionally; their meaning is defined by transcribe_file's signature,
    # which is not visible from this file.
    _, file_paths = whisper_inferencer.transcribe_file(
        [audio_path],
        None,
        None,
        None,
        "SRT",
        False,
        gr.Progress(),
        *hparams,
    )
    assert file_paths, "transcribe_file returned no output file paths"
    file_wer = _first_cue_wer(answer, file_paths[0])
    assert file_wer < 0.1, f"File transcription WER is too high, it's {file_wer}"

    # --- YouTube transcription (smoke test only) ----------------------------
    if not is_pytube_detected_bot():
        subtitle_str, file_path = whisper_inferencer.transcribe_youtube(
            TEST_YOUTUBE_URL,
            "SRT",
            False,
            gr.Progress(),
            *hparams,
        )
        assert isinstance(subtitle_str, str) and subtitle_str
        assert os.path.exists(file_path)

    # --- Microphone transcription -------------------------------------------
    _, file_path = whisper_inferencer.transcribe_mic(
        audio_path,
        "SRT",
        False,
        gr.Progress(),
        *hparams,
    )
    wer = _first_cue_wer(answer, file_path)
    assert wer < 0.1, f"WER is too high, it's {wer}"