#!/usr/bin/python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
# Tests for LaserTokenizer
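# Note: the fixtures and encoders below download LASER models into per-test
# temporary directories, so network access is required. Run with, e.g.,
# `pytest test_laser_tokenizer.py`.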

import os
import warnings
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import List

import numpy as np
import pytest

from laser_encoders import (
LaserEncoderPipeline,
initialize_encoder,
initialize_tokenizer,
)


@pytest.fixture
def tokenizer(tmp_path: Path):
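    """LaserTokenizer backed by the laser2 SentencePiece model, downloaded into tmp_path."""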
tokenizer_instance = initialize_tokenizer(model_dir=tmp_path, laser="laser2")
return tokenizer_instance


@pytest.fixture
def input_text() -> str:
return "This is a test sentence."


@pytest.fixture
def test_readme_params() -> dict:
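    """Reference inputs and expected embedding values mirroring the README example."""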
return {
"lang": "igbo",
"input_sentences": ["nnọọ, kedu ka ị mere"],
"expected_embedding_shape": (1, 1024),
"expected_array": [
0.3807628,
-0.27941525,
-0.17819545,
0.44144684,
-0.38985375,
0.04719935,
0.20238206,
-0.03934783,
0.0118901,
0.28986093,
],
}


def test_tokenize(tokenizer, input_text: str):
expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
assert tokenizer.tokenize(input_text) == expected_output


def test_tokenizer_call_method(tokenizer, input_text: str):
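    # The tokenizer is callable directly, on a single string or on a list of strings.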
single_string = "This is a test sentence."
expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
assert tokenizer(single_string) == expected_output

    list_of_strings = ["This is a test sentence.", "This is another test sentence."]
expected_output = [
"▁this ▁is ▁a ▁test ▁sent ence .",
"▁this ▁is ▁another ▁test ▁sent ence .",
]
assert tokenizer(list_of_strings) == expected_output


def test_normalization(tokenizer):
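    # Default tokenization lowercases the input and keeps repeated punctuation intact.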
test_data = "Hello!!! How are you??? I'm doing great."
expected_output = "▁hel lo !!! ▁how ▁are ▁you ??? ▁i ' m ▁do ing ▁great ."
assert tokenizer.tokenize(test_data) == expected_output


def test_descape(tokenizer):
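    # With descape enabled, escaped entities such as "&lt;" and "&amp;" are
    # converted back to "<" and "&" before tokenization.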
test_data = "I <3 Apple & Carrots!"
expected_output = "▁i ▁<3 ▁app le ▁& ▁car ro ts !"
tokenizer.descape = True
assert tokenizer.tokenize(test_data) == expected_output


def test_lowercase(tokenizer):
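    # Setting lower_case to False preserves the original casing of the input.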
test_data = "THIS OUTPUT MUST BE UPPERCASE"
expected_output = "▁TH IS ▁ OU TP UT ▁ MU ST ▁BE ▁ UP PER CA SE"
tokenizer.lower_case = False
assert tokenizer.tokenize(test_data) == expected_output


def test_is_printable(tokenizer):
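    # Non-printable characters (here "\x1f") do not appear in the tokenized output.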
test_data = "Hello, \tWorld! ABC\x1f123"
expected_output = "▁hel lo , ▁world ! ▁ab c ▁12 3"
assert tokenizer.tokenize(test_data) == expected_output


def test_tokenize_file(tokenizer, input_text: str):
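    # tokenize_file reads a plain-text file and writes the tokenized text to out_fname.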
with TemporaryDirectory() as temp_dir:
input_file = os.path.join(temp_dir, "input.txt")
output_file = os.path.join(temp_dir, "output.txt")
with open(input_file, "w") as file:
file.write(input_text)
tokenizer.tokenize_file(
inp_fname=Path(input_file),
out_fname=Path(output_file),
)
with open(output_file, "r") as file:
output = file.read().strip()
expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
assert output == expected_output


def test_tokenize_file_overwrite(tokenizer, input_text: str):
with TemporaryDirectory() as temp_dir:
input_file = os.path.join(temp_dir, "input.txt")
output_file = os.path.join(temp_dir, "output.txt")
with open(input_file, "w") as file:
file.write(input_text)
with open(output_file, "w") as file:
file.write("Existing output")
# Test when over_write is False
tokenizer.over_write = False
tokenizer.tokenize_file(
inp_fname=Path(input_file),
out_fname=Path(output_file),
)
with open(output_file, "r") as file:
output = file.read().strip()
assert output == "Existing output"
# Test when over_write is True
tokenizer.over_write = True
tokenizer.tokenize_file(
inp_fname=Path(input_file),
out_fname=Path(output_file),
)
with open(output_file, "r") as file:
output = file.read().strip()
expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
assert output == expected_output


@pytest.mark.parametrize(
"laser, expected_array, lang",
[
(
"laser2",
[
1.042462512850761414e-02,
6.325428839772939682e-03,
-3.032622225873637944e-05,
9.033476933836936951e-03,
2.937933895736932755e-04,
4.489220678806304932e-03,
2.334521152079105377e-03,
-9.427300537936389446e-04,
-1.571535394759848714e-04,
2.095808042213320732e-03,
],
None,
),
(
"laser3",
[
3.038274645805358887e-01,
4.151830971240997314e-01,
-2.458990514278411865e-01,
3.153458833694458008e-01,
-5.153598189353942871e-01,
-6.035178527235984802e-02,
2.210616767406463623e-01,
-2.701394855976104736e-01,
-4.902199506759643555e-01,
-3.126966953277587891e-02,
],
"zul_Latn",
),
],
)
def test_sentence_encoder(
tmp_path: Path,
tokenizer,
laser: str,
expected_array: List,
lang: str,
input_text: str,
):
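    # laser2 is a single multilingual model (lang=None), while laser3 requires a
    # language code such as "zul_Latn". Only the first 10 embedding dimensions
    # are compared against the reference values.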
sentence_encoder = initialize_encoder(model_dir=tmp_path, laser=laser, lang=lang)
tokenized_text = tokenizer.tokenize(input_text)
sentence_embedding = sentence_encoder.encode_sentences([tokenized_text])
assert isinstance(sentence_embedding, np.ndarray)
assert sentence_embedding.shape == (1, 1024)
assert np.allclose(expected_array, sentence_embedding[:, :10], atol=1e-3)


def test_laser_encoder_pipeline(tmp_path: Path, test_readme_params: dict):
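    """End-to-end check: LaserEncoderPipeline tokenizes and encodes in a single call."""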
lang = test_readme_params["lang"]
input_sentences = test_readme_params["input_sentences"]
expected_embedding_shape = test_readme_params["expected_embedding_shape"]
expected_array = test_readme_params["expected_array"]
encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
embeddings = encoder.encode_sentences(input_sentences)
assert isinstance(embeddings, np.ndarray)
assert embeddings.shape == expected_embedding_shape
assert np.allclose(expected_array, embeddings[:, :10], atol=1e-3)


def test_separate_initialization_and_encoding(
tmp_path, tokenizer, test_readme_params: dict
):
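    # Tokenizing and encoding in separate steps should reproduce the README embedding.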
lang = test_readme_params["lang"]
input_sentences = test_readme_params["input_sentences"]
expected_embedding_shape = test_readme_params["expected_embedding_shape"]
expected_array = test_readme_params["expected_array"]
tokenized_sentence = tokenizer.tokenize(input_sentences[0])
sentence_encoder = initialize_encoder(model_dir=tmp_path, lang=lang)
# Encode tokenized sentences into embeddings
embeddings = sentence_encoder.encode_sentences([tokenized_sentence])
assert isinstance(embeddings, np.ndarray)
assert embeddings.shape == expected_embedding_shape
assert np.allclose(expected_array, embeddings[:, :10], atol=1e-3)


def test_encoder_normalization(tmp_path: Path, test_readme_params: dict):
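    # normalize_embeddings=True should yield unit-norm sentence vectors.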
lang = test_readme_params["lang"]
input_sentences = test_readme_params["input_sentences"]
encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
normalized_embeddings = encoder.encode_sentences(
input_sentences, normalize_embeddings=True
)
norm = np.linalg.norm(normalized_embeddings[0])
assert np.allclose(norm, 1.0, atol=1e-3)


def test_encoder_default_behaviour(tmp_path: Path, test_readme_params: dict):
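    # By default embeddings are not normalized, so the default output matches
    # normalize_embeddings=False.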
lang = test_readme_params["lang"]
input_sentences = test_readme_params["input_sentences"]
encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
default_embeddings = encoder.encode_sentences(input_sentences)
non_normalized_embeddings = encoder.encode_sentences(
input_sentences, normalize_embeddings=False
)
assert np.allclose(default_embeddings, non_normalized_embeddings)


def test_encoder_non_normalization(tmp_path: Path, test_readme_params: dict):
lang = test_readme_params["lang"]
input_sentences = test_readme_params["input_sentences"]
encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
non_normalized_embeddings = encoder.encode_sentences(
input_sentences, normalize_embeddings=False
)
norm = np.linalg.norm(non_normalized_embeddings[0])
assert not np.isclose(norm, 1)


def test_optional_lang_with_laser2(tmp_path: Path):
with pytest.warns(
UserWarning,
match="The 'lang' parameter is optional when using 'laser2'. It will be ignored.",
):
encoder = LaserEncoderPipeline(lang="en", laser="laser2", model_dir=tmp_path)


def test_required_lang_with_laser3(tmp_path: Path):
with pytest.raises(
ValueError, match="For 'laser3', the 'lang' parameter is required."
):
encoder = LaserEncoderPipeline(laser="laser3", model_dir=tmp_path)


def test_missing_lang_and_laser(tmp_path: Path):
with pytest.raises(
ValueError, match="Either 'laser' or 'lang' should be provided."
):
encoder = LaserEncoderPipeline(model_dir=tmp_path)