# Yoad
# First commit with actual logic
# 2f5cf2f
"""
Evaluator module.
Provides functions to evaluate a given model on a dataset sample using the Faster Whisper model,
and generate HTML visualization blocks of the word alignment.
"""
import concurrent.futures
import gc
import io
import queue
import threading
from typing import Dict, Generator, List
import soundfile as sf
from hebrew import Hebrew
from tqdm import tqdm
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
from visual_eval.visualization import render_visualize_jiwer_result_html
class HebrewTextNormalizer(BasicTextNormalizer):
    """Normalizer for Hebrew text layered on top of ``BasicTextNormalizer``.

    Calling an instance applies, in order: niqqud removal (through the
    ``hebrew`` package), deletion of a fixed set of invisible Unicode
    formatting characters, deletion of ASCII single and double quotes, and
    finally the parent normalizer's own ``__call__``.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the parent normalizer and build the deletion tables.

        All positional and keyword arguments are forwarded unchanged to
        ``BasicTextNormalizer.__init__``.
        """
        super().__init__(*args, **kwargs)

        invisible_chars = "".join(
            (
                "\u061c",  # Arabic letter mark
                "\u200b\u200c\u200d",  # Zero-width space, non-joiner, joiner
                "\u200e\u200f",  # LTR and RTL marks
                "\u202a\u202b\u202c\u202d\u202e",  # LTR/RTL embedding, pop, override
                "\u2066\u2067\u2068\u2069",  # Isolate controls
                "\ufeff",  # Zero-width no-break space
            )
        )

        # The three-argument form of str.maketrans maps every character of
        # its last argument to None, i.e. str.translate deletes it.
        self.superfluous_hebrew_unicode_symbols_translator = str.maketrans(
            "", "", invisible_chars
        )
        self.quotes_translator = str.maketrans("", "", "\"'")

    def __remove_niqqud(self, text: str) -> str:
        """Return the string produced by ``Hebrew(text).no_niqqud()``."""
        without_niqqud = Hebrew(text).no_niqqud()
        return without_niqqud.string

    def __remove_superfluous_hebrew_unicode_symbols(self, text: str) -> str:
        """Delete the invisible formatting characters listed in ``__init__``."""
        deletion_table = self.superfluous_hebrew_unicode_symbols_translator
        return text.translate(deletion_table)

    def __remove_quotes(self, text: str) -> str:
        """Delete every ASCII double quote and single quote from ``text``."""
        deletion_table = self.quotes_translator
        return text.translate(deletion_table)

    def __call__(self, text):
        """Run the Hebrew-specific cleanup steps, then the parent normalizer.

        Args:
            text: The string to normalize.

        Returns:
            Whatever the parent ``__call__`` returns for the cleaned text.
        """
        # Order matters: the Hebrew-specific cleanup runs before the parent
        # normalizer sees the text.
        cleanup_steps = (
            self.__remove_niqqud,
            self.__remove_superfluous_hebrew_unicode_symbols,
            self.__remove_quotes,
        )
        for step in cleanup_steps:
            text = step(text)
        return super().__call__(text)