File size: 2,059 Bytes
2f5cf2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
"""
Evaluator module.
Provides functions to evaluate a given model on a dataset sample using the Faster Whisper model,
and generate HTML visualization blocks of the word alignment.
"""

import concurrent.futures
import gc
import io
import queue
import threading
from typing import Dict, Generator, List

import soundfile as sf
from hebrew import Hebrew
from tqdm import tqdm
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

from visual_eval.visualization import render_visualize_jiwer_result_html


class HebrewTextNormalizer(BasicTextNormalizer):
    """Text normalizer for Hebrew layered on top of ``BasicTextNormalizer``.

    Calling an instance strips niqqud, invisible directional / zero-width
    formatting characters and ASCII quote characters from the text, and then
    hands the result to the parent normalizer.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the parent normalizer and build the deletion tables.

        All positional and keyword arguments are forwarded unchanged to
        ``BasicTextNormalizer.__init__``.
        """
        super().__init__(*args, **kwargs)

        invisible_chars = "".join(
            (
                "\u061c",  # Arabic letter mark
                "\u200b\u200c\u200d",  # Zero-width space, non-joiner, joiner
                "\u200e\u200f",  # LTR and RTL marks
                "\u202a\u202b\u202c\u202d\u202e",  # LTR/RTL embedding, pop, override
                "\u2066\u2067\u2068\u2069",  # Isolate controls
                "\ufeff",  # Zero-width no-break space
            )
        )

        # The three-argument form of str.maketrans maps every character of its
        # third argument to None, i.e. str.translate deletes it.
        self.superfluous_hebrew_unicode_symbols_translator = str.maketrans(
            "", "", invisible_chars
        )
        self.quotes_translator = str.maketrans("", "", "\"'")

    def __remove_niqqud(self, text: str) -> str:
        """Return ``text`` with niqqud removed via the ``Hebrew`` helper."""
        stripped = Hebrew(text).no_niqqud()
        return stripped.string

    def __remove_superfluous_hebrew_unicode_symbols(self, text: str) -> str:
        """Return ``text`` without the invisible formatting characters."""
        table = self.superfluous_hebrew_unicode_symbols_translator
        return text.translate(table)

    def __remove_quotes(self, text: str) -> str:
        """Return ``text`` without ASCII double and single quote characters."""
        table = self.quotes_translator
        return text.translate(table)

    def __call__(self, text):
        """Normalize ``text``.

        The Hebrew-specific clean-ups run first (niqqud, then invisible
        formatting characters, then quotes); the cleaned string is then passed
        to the parent class' ``__call__`` and its result is returned.
        """
        without_niqqud = self.__remove_niqqud(text)
        without_invisibles = self.__remove_superfluous_hebrew_unicode_symbols(
            without_niqqud
        )
        without_quotes = self.__remove_quotes(without_invisibles)
        return super().__call__(without_quotes)