# Spaces: Sleeping  (stray non-code text; kept as a comment so the module parses)
import os | |
import tempfile | |
import pytest | |
from laser_encoders.download_models import LaserModelDownloader | |
from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE | |
from laser_encoders.laser_tokenizer import initialize_tokenizer | |
from laser_encoders.models import initialize_encoder | |
def test_validate_language_models_and_tokenize_laser3(lang):
    """Download the LASER3 model for ``lang`` and smoke-test it.

    The model is downloaded into a fresh temporary directory. For the
    Kashmiri / Central Kanuri names listed below, the downloader is expected
    to raise ``ValueError`` instead. For every other language the encoder and
    tokenizer are initialized from the downloaded files and a sample sentence
    is tokenized; the test passes if none of these steps raises.

    NOTE(review): ``lang`` must be supplied by a fixture or parametrization
    that is not visible in this block -- confirm it is wired up.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        print(f"Created temporary directory for {lang}", tmp_dir)
        downloader = LaserModelDownloader(model_dir=tmp_dir)

        if lang in ("kashmiri", "kas", "central kanuri", "knc"):
            # pytest.raises already enforces the exception type. The previous
            # extra check looked for the text "ValueError" in str(exc), which
            # is the exception *message*, not its class name.
            with pytest.raises(ValueError):
                downloader.download_laser3(lang)
            print(f"{lang} language model raised a ValueError as expected.")
            return

        downloader.download_laser3(lang)

        # Smoke checks: construction and tokenization must simply not raise.
        initialize_encoder(lang, model_dir=tmp_dir)
        tokenizer = initialize_tokenizer(lang, model_dir=tmp_dir)

        # Test tokenization with a sample sentence
        tokenizer.tokenize("This is a sample sentence.")
        print(f"{lang} model validated successfully")
def test_validate_language_models_and_tokenize_laser2(lang):
    """Fetch the LASER2 files into a scratch directory and smoke-test ``lang``.

    After the download, the encoder and tokenizer for ``lang`` are built from
    the scratch directory and one sample sentence is tokenized. Nothing is
    asserted on the results; the test passes when no step raises.

    NOTE(review): ``lang`` must be supplied by a fixture or parametrization
    that is not visible in this block -- confirm it is wired up.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        print(f"Created temporary directory for {lang}", scratch_dir)
        LaserModelDownloader(model_dir=scratch_dir).download_laser2()

        # Both components are initialized from the files just downloaded.
        laser2_encoder = initialize_encoder(lang, model_dir=scratch_dir)
        laser2_tokenizer = initialize_tokenizer(lang, model_dir=scratch_dir)

        # Test tokenization with a sample sentence
        sample_tokens = laser2_tokenizer.tokenize("This is a sample sentence.")
        print(f"{lang} model validated successfully")
class MockLaserModelDownloader(LaserModelDownloader):
    """Downloader stand-in that only checks for files already on disk.

    Neither ``download_*`` method fetches anything: each verifies that the
    expected model file(s) exist under ``model_dir`` and raises
    ``FileNotFoundError`` for the first one that is missing.
    """

    def __init__(self, model_dir):
        # The parent initializer is not called; only the directory is recorded.
        self.model_dir = model_dir

    def _ensure_present(self, file_name):
        """Raise ``FileNotFoundError`` unless ``file_name`` exists in ``model_dir``."""
        location = os.path.join(self.model_dir, file_name)
        if os.path.exists(location):
            return
        raise FileNotFoundError(f"Could not find {location}.")

    def download_laser3(self, lang):
        """Check that the LASER3 checkpoint for ``lang`` is present.

        ``lang`` is first resolved through ``get_language_code`` (not defined
        in this class; presumably inherited from the parent) against
        ``LASER3_LANGUAGE``.
        """
        code = self.get_language_code(LASER3_LANGUAGE, lang)
        self._ensure_present(f"laser3-{code}.v1.pt")

    def download_laser2(self):
        """Check that the three LASER2 files are present."""
        for required in ("laser2.pt", "laser2.spm", "laser2.cvocab"):
            self._ensure_present(required)
# Directory the mock-downloader tests pass as ``model_dir``; the mock only
# checks that model files already exist there. Set the LASER_CACHE_DIR
# environment variable to point at a different location without editing this
# file; the default is the original hard-coded path.
CACHE_DIR = os.environ.get("LASER_CACHE_DIR", "/home/user/.cache/models")
# This uses the mock downloader
def test_validate_language_models_and_tokenize_mock_laser3(lang):
    """Smoke-test the LASER3 model for ``lang`` using files in ``CACHE_DIR``.

    The mock downloader only verifies that the expected checkpoint exists in
    ``CACHE_DIR``. If it is missing, the test fails with the downloader's
    message. Otherwise the encoder and tokenizer are initialized from
    ``CACHE_DIR`` and a sample sentence is tokenized; nothing is asserted on
    the results.

    NOTE(review): ``lang`` must be supplied by a fixture or parametrization
    that is not visible in this block -- confirm it is wired up.
    """
    downloader = MockLaserModelDownloader(model_dir=CACHE_DIR)
    try:
        downloader.download_laser3(lang)
    except FileNotFoundError as e:
        # pytest has no ``error`` attribute, so the previous
        # ``raise pytest.error(...)`` died with AttributeError and hid the
        # real message. pytest.fail raises the failure itself.
        pytest.fail(str(e))

    # Smoke checks: construction and tokenization must simply not raise.
    initialize_encoder(lang, model_dir=CACHE_DIR)
    tokenizer = initialize_tokenizer(lang, model_dir=CACHE_DIR)
    tokenizer.tokenize("This is a sample sentence.")
    print(f"{lang} model validated successfully")
# This uses the mock downloader
def test_validate_language_models_and_tokenize_mock_laser2(lang):
    """Smoke-test the LASER2 model for ``lang`` using files in ``CACHE_DIR``.

    The mock downloader only verifies that the LASER2 files exist in
    ``CACHE_DIR``. If one is missing, the test fails with the downloader's
    message. Otherwise the encoder and tokenizer are initialized from
    ``CACHE_DIR`` and a sample sentence is tokenized; nothing is asserted on
    the results.

    NOTE(review): ``lang`` must be supplied by a fixture or parametrization
    that is not visible in this block -- confirm it is wired up.
    """
    downloader = MockLaserModelDownloader(model_dir=CACHE_DIR)
    try:
        downloader.download_laser2()
    except FileNotFoundError as e:
        # pytest has no ``error`` attribute, so the previous
        # ``raise pytest.error(...)`` died with AttributeError and hid the
        # real message. pytest.fail raises the failure itself.
        pytest.fail(str(e))

    # Smoke checks: construction and tokenization must simply not raise.
    initialize_encoder(lang, model_dir=CACHE_DIR)
    tokenizer = initialize_tokenizer(lang, model_dir=CACHE_DIR)
    tokenizer.tokenize("This is a sample sentence.")
    print(f"{lang} model validated successfully")