""" Evaluator module. Provides functions to evaluate a given model on a dataset sample using the Faster Whisper model, and generate HTML visualization blocks of the word alignment. """ import concurrent.futures import gc import io import queue import threading from typing import Dict, Generator, List import soundfile as sf from hebrew import Hebrew from tqdm import tqdm from transformers.models.whisper.english_normalizer import BasicTextNormalizer from visual_eval.visualization import render_visualize_jiwer_result_html class HebrewTextNormalizer(BasicTextNormalizer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) superfluous_chars_to_remove = "\u061c" # Arabic letter mark superfluous_chars_to_remove += ( "\u200b\u200c\u200d" # Zero-width space, non-joiner, joiner ) superfluous_chars_to_remove += "\u200e\u200f" # LTR and RTL marks superfluous_chars_to_remove += ( "\u202a\u202b\u202c\u202d\u202e" # LTR/RTL embedding, pop, override ) superfluous_chars_to_remove += "\u2066\u2067\u2068\u2069" # Isolate controls superfluous_chars_to_remove += "\ufeff" # Zero-width no-break space self.superfluous_hebrew_unicode_symbols_translator = str.maketrans( {ord(c): None for c in superfluous_chars_to_remove} ) self.quotes_translator = str.maketrans({ord(c): None for c in "\"'"}) def __remove_niqqud(self, text: str) -> str: return Hebrew(text).no_niqqud().string def __remove_superfluous_hebrew_unicode_symbols(self, text: str) -> str: return text.translate(self.superfluous_hebrew_unicode_symbols_translator) def __remove_quotes(self, text: str) -> str: return text.translate(self.quotes_translator) def __call__(self, text): text = self.__remove_niqqud(text) text = self.__remove_superfluous_hebrew_unicode_symbols(text) text = self.__remove_quotes(text) text = super().__call__(text) return text