Spaces:
Running
Running
""" | |
Evaluator module.
Provides functions to evaluate a given model on a dataset sample using the Faster Whisper model,
and to generate HTML visualization blocks of the word alignment.
""" | |
import concurrent.futures | |
import gc | |
import io | |
import queue | |
import threading | |
from typing import Dict, Generator, List | |
import soundfile as sf | |
from hebrew import Hebrew | |
from tqdm import tqdm | |
from transformers.models.whisper.english_normalizer import BasicTextNormalizer | |
from visual_eval.visualization import render_visualize_jiwer_result_html | |
class HebrewTextNormalizer(BasicTextNormalizer):
    """Text normalizer for Hebrew transcripts, layered on ``BasicTextNormalizer``.

    Calling an instance strips niqqud, invisible Unicode formatting
    characters and ASCII quote characters from the text, in that order, and
    then hands the result to the parent normalizer.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the parent normalizer and build the two deletion tables.

        All positional and keyword arguments are forwarded unchanged to
        ``BasicTextNormalizer.__init__``.
        """
        super().__init__(*args, **kwargs)

        # Invisible formatting / directionality characters to delete.
        invisible_marks = "".join(
            (
                "\u061c",  # Arabic letter mark
                "\u200b\u200c\u200d",  # Zero-width space, non-joiner, joiner
                "\u200e\u200f",  # LTR and RTL marks
                "\u202a\u202b\u202c\u202d\u202e",  # LTR/RTL embedding, pop, override
                "\u2066\u2067\u2068\u2069",  # Isolate controls
                "\ufeff",  # Zero-width no-break space
            )
        )

        # The three-argument form of ``str.maketrans`` maps every character
        # of its last argument to ``None``, i.e. deletion under ``translate``.
        self.superfluous_hebrew_unicode_symbols_translator = str.maketrans(
            "", "", invisible_marks
        )
        # Quotes are deleted outright (not replaced), so the text on either
        # side of a quote character ends up joined together.
        self.quotes_translator = str.maketrans("", "", "\"'")

    def __remove_niqqud(self, text: str) -> str:
        """Return ``text`` with niqqud removed, via ``Hebrew.no_niqqud``."""
        stripped = Hebrew(text).no_niqqud()
        return stripped.string

    def __remove_superfluous_hebrew_unicode_symbols(self, text: str) -> str:
        """Return ``text`` without the invisible formatting characters."""
        table = self.superfluous_hebrew_unicode_symbols_translator
        return text.translate(table)

    def __remove_quotes(self, text: str) -> str:
        """Return ``text`` with every ``"`` and ``'`` character deleted."""
        table = self.quotes_translator
        return text.translate(table)

    def __call__(self, text):
        """Normalize ``text`` and return the normalized string.

        The Hebrew-specific clean-up steps run first; the parent class's
        normalization is applied last, to the already cleaned text.
        """
        cleanup_steps = (
            self.__remove_niqqud,
            self.__remove_superfluous_hebrew_unicode_symbols,
            self.__remove_quotes,
        )
        for step in cleanup_steps:
            text = step(text)
        return super().__call__(text)