#!/usr/bin/python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
# Tests for LaserTokenizer
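# Note: the fixtures and encoders below download LASER models into per-test
# temporary directories, so network access is required. Run with, e.g.,
# `pytest test_laser_tokenizer.py`.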

import os
import warnings
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import List

import numpy as np
import pytest

from laser_encoders import (
LaserEncoderPipeline,
initialize_encoder,
initialize_tokenizer,
)


@pytest.fixture
def tokenizer(tmp_path: Path):
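    """LaserTokenizer backed by the laser2 SentencePiece model, downloaded into tmp_path."""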
tokenizer_instance = initialize_tokenizer(model_dir=tmp_path, laser="laser2")
return tokenizer_instance


@pytest.fixture
def input_text() -> str:
return "This is a test sentence."


@pytest.fixture
def test_readme_params() -> dict:
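    """Reference inputs and expected embedding values mirroring the README example."""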
return {
"lang": "igbo",
"input_sentences": ["nnọọ, kedu ka ị mere"],
"expected_embedding_shape": (1, 1024),
"expected_array": [
0.3807628,
-0.27941525,
-0.17819545,
0.44144684,
-0.38985375,
0.04719935,
0.20238206,
-0.03934783,
0.0118901,
0.28986093,
],
}


def test_tokenize(tokenizer, input_text: str):
expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
assert tokenizer.tokenize(input_text) == expected_output


def test_tokenizer_call_method(tokenizer, input_text: str):
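    # The tokenizer is callable directly, on a single string or on a list of strings.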
single_string = "This is a test sentence."
expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
assert tokenizer(single_string) == expected_output

    list_of_strings = ["This is a test sentence.", "This is another test sentence."]
expected_output = [
"▁this ▁is ▁a ▁test ▁sent ence .",
"▁this ▁is ▁another ▁test ▁sent ence .",
]
assert tokenizer(list_of_strings) == expected_output


def test_normalization(tokenizer):
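    # Default tokenization lowercases the input and keeps repeated punctuation intact.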
test_data = "Hello!!! How are you??? I'm doing great."
expected_output = "▁hel lo !!! ▁how ▁are ▁you ??? ▁i ' m ▁do ing ▁great ."
assert tokenizer.tokenize(test_data) == expected_output


def test_descape(tokenizer):
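    # With descape enabled, escaped entities such as "&lt;" and "&amp;" are
    # converted back to "<" and "&" before tokenization.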
test_data = "I <3 Apple & Carrots!"
expected_output = "▁i ▁<3 ▁app le ▁& ▁car ro ts !"
tokenizer.descape = True
assert tokenizer.tokenize(test_data) == expected_output


def test_lowercase(tokenizer):
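    # Setting lower_case to False preserves the original casing of the input.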
test_data = "THIS OUTPUT MUST BE UPPERCASE"
expected_output = "▁TH IS ▁ OU TP UT ▁ MU ST ▁BE ▁ UP PER CA SE"
tokenizer.lower_case = False
assert tokenizer.tokenize(test_data) == expected_output


def test_is_printable(tokenizer):
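    # Non-printable characters (here "\x1f") do not appear in the tokenized output.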
test_data = "Hello, \tWorld! ABC\x1f123"
expected_output = "▁hel lo , ▁world ! ▁ab c ▁12 3"
assert tokenizer.tokenize(test_data) == expected_output


def test_tokenize_file(tokenizer, input_text: str):
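    # tokenize_file reads a plain-text file and writes the tokenized text to out_fname.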
with TemporaryDirectory() as temp_dir:
input_file = os.path.join(temp_dir, "input.txt")
output_file = os.path.join(temp_dir, "output.txt")
with open(input_file, "w") as file:
file.write(input_text)
tokenizer.tokenize_file(
inp_fname=Path(input_file),
out_fname=Path(output_file),
)
with open(output_file, "r") as file:
output = file.read().strip()
expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
assert output == expected_output


def test_tokenize_file_overwrite(tokenizer, input_text: str):
with TemporaryDirectory() as temp_dir:
input_file = os.path.join(temp_dir, "input.txt")
output_file = os.path.join(temp_dir, "output.txt")
with open(input_file, "w") as file:
file.write(input_text)
with open(output_file, "w") as file:
file.write("Existing output")
# Test when over_write is False
tokenizer.over_write = False
tokenizer.tokenize_file(
inp_fname=Path(input_file),
out_fname=Path(output_file),
)
with open(output_file, "r") as file:
output = file.read().strip()
assert output == "Existing output"
# Test when over_write is True
tokenizer.over_write = True
tokenizer.tokenize_file(
inp_fname=Path(input_file),
out_fname=Path(output_file),
)
with open(output_file, "r") as file:
output = file.read().strip()
expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
assert output == expected_output


@pytest.mark.parametrize(
"laser, expected_array, lang",
[
(
"laser2",
[
1.042462512850761414e-02,
6.325428839772939682e-03,
-3.032622225873637944e-05,
9.033476933836936951e-03,
2.937933895736932755e-04,
4.489220678806304932e-03,
2.334521152079105377e-03,
-9.427300537936389446e-04,
-1.571535394759848714e-04,
2.095808042213320732e-03,
],
None,
),
(
"laser3",
[
3.038274645805358887e-01,
4.151830971240997314e-01,
-2.458990514278411865e-01,
3.153458833694458008e-01,
-5.153598189353942871e-01,
-6.035178527235984802e-02,
2.210616767406463623e-01,
-2.701394855976104736e-01,
-4.902199506759643555e-01,
-3.126966953277587891e-02,
],
"zul_Latn",
),
],
)
def test_sentence_encoder(
tmp_path: Path,
tokenizer,
laser: str,
expected_array: List,
lang: str,
input_text: str,
):
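    # laser2 is a single multilingual model (lang=None), while laser3 requires a
    # language code such as "zul_Latn". Only the first 10 embedding dimensions
    # are compared against the reference values.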
sentence_encoder = initialize_encoder(model_dir=tmp_path, laser=laser, lang=lang)
tokenized_text = tokenizer.tokenize(input_text)
sentence_embedding = sentence_encoder.encode_sentences([tokenized_text])
assert isinstance(sentence_embedding, np.ndarray)
assert sentence_embedding.shape == (1, 1024)
assert np.allclose(expected_array, sentence_embedding[:, :10], atol=1e-3)


def test_laser_encoder_pipeline(tmp_path: Path, test_readme_params: dict):
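    """End-to-end check: LaserEncoderPipeline tokenizes and encodes in a single call."""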
lang = test_readme_params["lang"]
input_sentences = test_readme_params["input_sentences"]
expected_embedding_shape = test_readme_params["expected_embedding_shape"]
expected_array = test_readme_params["expected_array"]
encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
embeddings = encoder.encode_sentences(input_sentences)
assert isinstance(embeddings, np.ndarray)
assert embeddings.shape == expected_embedding_shape
assert np.allclose(expected_array, embeddings[:, :10], atol=1e-3)


def test_separate_initialization_and_encoding(
tmp_path, tokenizer, test_readme_params: dict
):
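    # Tokenizing and encoding in separate steps should reproduce the README embedding.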
lang = test_readme_params["lang"]
input_sentences = test_readme_params["input_sentences"]
expected_embedding_shape = test_readme_params["expected_embedding_shape"]
expected_array = test_readme_params["expected_array"]
tokenized_sentence = tokenizer.tokenize(input_sentences[0])
sentence_encoder = initialize_encoder(model_dir=tmp_path, lang=lang)
# Encode tokenized sentences into embeddings
embeddings = sentence_encoder.encode_sentences([tokenized_sentence])
assert isinstance(embeddings, np.ndarray)
assert embeddings.shape == expected_embedding_shape
assert np.allclose(expected_array, embeddings[:, :10], atol=1e-3)


def test_encoder_normalization(tmp_path: Path, test_readme_params: dict):
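    # normalize_embeddings=True should yield unit-norm sentence vectors.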
lang = test_readme_params["lang"]
input_sentences = test_readme_params["input_sentences"]
encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
normalized_embeddings = encoder.encode_sentences(
input_sentences, normalize_embeddings=True
)
norm = np.linalg.norm(normalized_embeddings[0])
assert np.allclose(norm, 1.0, atol=1e-3)


def test_encoder_default_behaviour(tmp_path: Path, test_readme_params: dict):
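    # By default embeddings are not normalized, so the default output matches
    # normalize_embeddings=False.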
lang = test_readme_params["lang"]
input_sentences = test_readme_params["input_sentences"]
encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
default_embeddings = encoder.encode_sentences(input_sentences)
non_normalized_embeddings = encoder.encode_sentences(
input_sentences, normalize_embeddings=False
)
assert np.allclose(default_embeddings, non_normalized_embeddings)


def test_encoder_non_normalization(tmp_path: Path, test_readme_params: dict):
lang = test_readme_params["lang"]
input_sentences = test_readme_params["input_sentences"]
encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
non_normalized_embeddings = encoder.encode_sentences(
input_sentences, normalize_embeddings=False
)
norm = np.linalg.norm(non_normalized_embeddings[0])
assert not np.isclose(norm, 1)


def test_optional_lang_with_laser2(tmp_path: Path):
with pytest.warns(
UserWarning,
match="The 'lang' parameter is optional when using 'laser2'. It will be ignored.",
):
encoder = LaserEncoderPipeline(lang="en", laser="laser2", model_dir=tmp_path)


def test_required_lang_with_laser3(tmp_path: Path):
with pytest.raises(
ValueError, match="For 'laser3', the 'lang' parameter is required."
):
encoder = LaserEncoderPipeline(laser="laser3", model_dir=tmp_path)


def test_missing_lang_and_laser(tmp_path: Path):
with pytest.raises(
ValueError, match="Either 'laser' or 'lang' should be provided."
):
encoder = LaserEncoderPipeline(model_dir=tmp_path)