Spaces:
Sleeping
Sleeping
KuangDW
commited on
Commit
·
05d3571
1
Parent(s):
bbf3202
Add laser2.spm using Git LFS
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- app.py +2 -4
- laser/.github/workflows/lint_and_tests.yml +32 -0
- laser/.gitignore +15 -0
- laser/CODE_OF_CONDUCT.md +5 -0
- laser/CONTRIBUTING.md +37 -0
- laser/LICENSE +30 -0
- laser/README.md +159 -0
- laser/docker/Dockerfile +38 -0
- laser/docker/README.md +82 -0
- laser/docker/app.py +64 -0
- laser/docker/decode.py +7 -0
- laser/install_external_tools.sh +200 -0
- laser/install_models.sh +48 -0
- laser/laser2.cvocab +0 -0
- laser/laser2.spm +3 -0
- laser/laser_encoders/README.md +149 -0
- laser/laser_encoders/__init__.py +16 -0
- laser/laser_encoders/download_models.py +154 -0
- laser/laser_encoders/language_list.py +564 -0
- laser/laser_encoders/laser_tokenizer.py +179 -0
- laser/laser_encoders/models.py +426 -0
- laser/laser_encoders/test_laser_tokenizer.py +310 -0
- laser/laser_encoders/test_models_initialization.py +57 -0
- laser/laser_encoders/validate_models.py +108 -0
- laser/pyproject.toml +69 -0
- laser/remove_external_tools.sh +26 -0
- laser/source/embed.py +362 -0
- laser/source/eval.py +381 -0
- laser/source/lib/indexing.py +258 -0
- laser/source/lib/romanize_lc.py +51 -0
- laser/source/lib/text_processing.py +272 -0
- laser/source/mine_bitexts.py +302 -0
- laser/source/nli.py +371 -0
- laser/source/paraphrase.py +285 -0
- laser/source/pxsim.py +251 -0
- laser/source/sent_classif.py +273 -0
- laser/source/similarity_search.py +113 -0
- laser/source/xsim.py +165 -0
- laser/tasks/CCMatrix/MatrixMine.pdf +0 -0
- laser/tasks/CCMatrix/README.md +39 -0
- laser/tasks/CCMatrix/dl_cc_matrix.py +338 -0
- laser/tasks/SentimentAnalysis/README.md +34 -0
- laser/tasks/SentimentAnalysis/SentimentAnalysis.ipynb +0 -0
- laser/tasks/WikiMatrix/README.md +93 -0
- laser/tasks/WikiMatrix/WikiMatrix-bleu.pdf +0 -0
- laser/tasks/WikiMatrix/WikiMatrix-sizes.pdf +0 -0
- laser/tasks/WikiMatrix/extract.py +81 -0
- laser/tasks/WikiMatrix/list_of_bitexts.txt +1620 -0
- laser/tasks/bucc/README.md +94 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
laser/laser2.spm filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -16,9 +16,7 @@ import pkg_resources
|
|
16 |
import sys
|
17 |
|
18 |
login(token=os.environ.get("LA_NAME"))
|
19 |
-
|
20 |
-
laser_path = snapshot_download(repo_id="KuangDW/laser", use_auth_token=laser_token)
|
21 |
-
os.environ["LASER"] = laser_path
|
22 |
|
23 |
def check_and_install(package, required_version):
|
24 |
try:
|
@@ -326,7 +324,7 @@ with gr.Blocks(title="Test-Time Machine Translation with Plan2Align") as demo:
|
|
326 |
|
327 |
gr.Examples(
|
328 |
examples=[
|
329 |
-
["
|
330 |
["台北101曾經是世界最高的建築物,它不僅是台灣的地標,也象徵著經濟成就和創新精神。", "Chinese", "Russian", 2, 0.7, 1, ["Original", "Plan2Align"]],
|
331 |
["阿里山日出和森林鐵路是台灣最著名的自然景觀之一,每年吸引數十萬遊客前來欣賞雲海和壯麗的日出。", "Chinese", "German", 2, 0.7, 1, ["Original", "Plan2Align"]],
|
332 |
["珍珠奶茶,這款源自台灣的獨特飲品,不僅在台灣本地深受喜愛,更以其獨特的風味和口感,在全球掀起了一股熱潮,成為了一種跨越文化、風靡全球的時尚飲品。", "Chinese", "Japanese", 3, 0.7, 3, ["Original", "Plan2Align"]],
|
|
|
16 |
import sys
|
17 |
|
18 |
login(token=os.environ.get("LA_NAME"))
|
19 |
+
os.environ["LASER"] = "laser"
|
|
|
|
|
20 |
|
21 |
def check_and_install(package, required_version):
|
22 |
try:
|
|
|
324 |
|
325 |
gr.Examples(
|
326 |
examples=[
|
327 |
+
["台灣夜市文化豐富多彩,從士林夜市到饒河街夜市,提供各種美食、遊戲和購物體驗,吸引了無數遊客。", "Chinese", "English", 2, 0.7, 1, ["Original", "Plan2Align"]],
|
328 |
["台北101曾經是世界最高的建築物,它不僅是台灣的地標,也象徵著經濟成就和創新精神。", "Chinese", "Russian", 2, 0.7, 1, ["Original", "Plan2Align"]],
|
329 |
["阿里山日出和森林鐵路是台灣最著名的自然景觀之一,每年吸引數十萬遊客前來欣賞雲海和壯麗的日出。", "Chinese", "German", 2, 0.7, 1, ["Original", "Plan2Align"]],
|
330 |
["珍珠奶茶,這款源自台灣的獨特飲品,不僅在台灣本地深受喜愛,更以其獨特的風味和口感,在全球掀起了一股熱潮,成為了一種跨越文化、風靡全球的時尚飲品。", "Chinese", "Japanese", 3, 0.7, 3, ["Original", "Plan2Align"]],
|
laser/.github/workflows/lint_and_tests.yml
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: lint_and_tests
|
2 |
+
|
3 |
+
on: [push, pull_request]
|
4 |
+
|
5 |
+
jobs:
|
6 |
+
build:
|
7 |
+
strategy:
|
8 |
+
max-parallel: 1
|
9 |
+
matrix:
|
10 |
+
platform: [ubuntu-latest]
|
11 |
+
python-version: [3.8]
|
12 |
+
|
13 |
+
runs-on: ${{ matrix.platform }}
|
14 |
+
|
15 |
+
steps:
|
16 |
+
- uses: actions/checkout@v2
|
17 |
+
|
18 |
+
- name: Install dependencies
|
19 |
+
run: |
|
20 |
+
python --version
|
21 |
+
python -m pip install --upgrade 'pip>=23.2.1'
|
22 |
+
python -m pip show pip
|
23 |
+
python -m pip install -e '.[dev]'
|
24 |
+
|
25 |
+
- name: isort
|
26 |
+
run: cd laser_encoders && isort --check --diff .
|
27 |
+
|
28 |
+
- name: black
|
29 |
+
run: cd laser_encoders && black --check --diff .
|
30 |
+
|
31 |
+
- name: pytest
|
32 |
+
run: pytest laser_encoders
|
laser/.gitignore
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
source/__pycache__
|
2 |
+
source/lib/__pycache__
|
3 |
+
models
|
4 |
+
tools-external
|
5 |
+
tasks/mldoc/MLDoc
|
6 |
+
embed
|
7 |
+
tasks/bucc/downloaded
|
8 |
+
tasks/similarity/dev/
|
9 |
+
tasks/xnli/XNLI-1.0*
|
10 |
+
tasks/xnli/multinli_1.0*
|
11 |
+
.??*swp
|
12 |
+
.idea
|
13 |
+
__pycache__
|
14 |
+
nllb
|
15 |
+
dist
|
laser/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Code of Conduct
|
2 |
+
|
3 |
+
Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
|
4 |
+
Please read the [full text](https://code.fb.com/codeofconduct)
|
5 |
+
so that you can understand what actions will and will not be tolerated.
|
laser/CONTRIBUTING.md
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Contributing to LASER
|
2 |
+
We want to make contributing to this project as easy and transparent as
|
3 |
+
possible.
|
4 |
+
|
5 |
+
## Our Development Process
|
6 |
+
Minor changes and improvements will be released on an ongoing basis.
|
7 |
+
|
8 |
+
## Pull Requests
|
9 |
+
We actively welcome your pull requests.
|
10 |
+
|
11 |
+
1. Fork the repo and create your branch from `master`.
|
12 |
+
2. If you've added code that should be tested, add tests.
|
13 |
+
3. If you've changed APIs, update the documentation.
|
14 |
+
4. Ensure the test suite passes.
|
15 |
+
5. Make sure your code lints.
|
16 |
+
6. If you haven't already, complete the Contributor License Agreement ("CLA").
|
17 |
+
|
18 |
+
## Contributor License Agreement ("CLA")
|
19 |
+
In order to accept your pull request, we need you to submit a CLA. You only need
|
20 |
+
to do this once to work on any of Facebook's open source projects.
|
21 |
+
|
22 |
+
Complete your CLA here: <https://code.facebook.com/cla>
|
23 |
+
|
24 |
+
## Issues
|
25 |
+
We use GitHub issues to track public bugs. Please ensure your description is
|
26 |
+
clear and has sufficient instructions to be able to reproduce the issue.
|
27 |
+
|
28 |
+
## Coding Style
|
29 |
+
* 4 spaces for indentation rather than tabs
|
30 |
+
* 80 character line length
|
31 |
+
* PEP8 formatting
|
32 |
+
|
33 |
+
## License
|
34 |
+
By contributing to LASER, you agree that your contributions will be licensed
|
35 |
+
under the LICENSE file in the root directory of this source tree.
|
36 |
+
|
37 |
+
|
laser/LICENSE
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
BSD License
|
2 |
+
|
3 |
+
For Language-Agnostic SEntence Representations (LASER) software
|
4 |
+
|
5 |
+
Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
|
6 |
+
|
7 |
+
Redistribution and use in source and binary forms, with or without modification,
|
8 |
+
are permitted provided that the following conditions are met:
|
9 |
+
|
10 |
+
* Redistributions of source code must retain the above copyright notice, this
|
11 |
+
list of conditions and the following disclaimer.
|
12 |
+
|
13 |
+
* Redistributions in binary form must reproduce the above copyright notice,
|
14 |
+
this list of conditions and the following disclaimer in the documentation
|
15 |
+
and/or other materials provided with the distribution.
|
16 |
+
|
17 |
+
* Neither the name Facebook nor the names of its contributors may be used to
|
18 |
+
endorse or promote products derived from this software without specific
|
19 |
+
prior written permission.
|
20 |
+
|
21 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
22 |
+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
23 |
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
24 |
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
|
25 |
+
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
26 |
+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
27 |
+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
28 |
+
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
29 |
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
30 |
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
laser/README.md
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# LASER Language-Agnostic SEntence Representations
|
2 |
+
|
3 |
+
LASER is a library to calculate and use multilingual sentence embeddings.
|
4 |
+
|
5 |
+
**NEWS**
|
6 |
+
* 2023/11/30 Released [**P-xSIM**](tasks/pxsim), a dual approach extension to multilingual similarity search (xSIM)
|
7 |
+
* 2023/11/16 Released [**laser_encoders**](laser_encoders), a pip-installable package supporting LASER-2 and LASER-3 models
|
8 |
+
* 2023/06/26 [**xSIM++**](https://arxiv.org/abs/2306.12907) evaluation pipeline and data [**released**](tasks/xsimplusplus/README.md)
|
9 |
+
* 2022/07/06 Updated LASER models with support for over 200 languages are [**now available**](nllb/README.md)
|
10 |
+
* 2022/07/06 Multilingual similarity search (**xSIM**) evaluation pipeline [**released**](tasks/xsim/README.md)
|
11 |
+
* 2022/05/03 [**Librivox S2S is available**](tasks/librivox-s2s): Speech-to-Speech translations automatically mined in Librivox [9]
|
12 |
+
* 2019/11/08 [**CCMatrix is available**](tasks/CCMatrix): Mining billions of high-quality parallel sentences on the WEB [8]
|
13 |
+
* 2019/07/31 Gilles Bodard and Jérémy Rapin provided a [**Docker environment**](docker) to use LASER
|
14 |
+
* 2019/07/11 [**WikiMatrix is available**](tasks/WikiMatrix): bitext extraction for 1620 language pairs in WikiPedia [7]
|
15 |
+
* 2019/03/18 switch to BSD license
|
16 |
+
* 2019/02/13 The code to perform bitext mining is [**now available**](tasks/bucc)
|
17 |
+
|
18 |
+
**CURRENT VERSION:**
|
19 |
+
* We now provide updated LASER models which support over 200 languages. Please see [here](nllb/README.md) for more details including how to download the models and perform inference.
|
20 |
+
|
21 |
+
According to our experience, the sentence encoder also supports code-switching, i.e.
|
22 |
+
the same sentences can contain words in several different languages.
|
23 |
+
|
24 |
+
We have also some evidence that the encoder can generalize to other
|
25 |
+
languages which have not been seen during training, but which are in
|
26 |
+
a language family which is covered by other languages.
|
27 |
+
|
28 |
+
A detailed description of how the multilingual sentence embeddings are trained can
|
29 |
+
be found [here](https://arxiv.org/abs/2205.12654), together with an experimental evaluation.
|
30 |
+
|
31 |
+
## The core sentence embedding package: `laser_encoders`
|
32 |
+
We provide a package `laser_encoders` with minimal dependencies.
|
33 |
+
It supports LASER-2 (a single encoder for the languages listed [below](#supported-languages))
|
34 |
+
and LASER-3 (147 language-specific encoders described [here](nllb/README.md)).
|
35 |
+
|
36 |
+
The package can be installed simply with `pip install laser_encoders` and used as below:
|
37 |
+
|
38 |
+
```python
|
39 |
+
from laser_encoders import LaserEncoderPipeline
|
40 |
+
encoder = LaserEncoderPipeline(lang="eng_Latn")
|
41 |
+
embeddings = encoder.encode_sentences(["Hi!", "This is a sentence encoder."])
|
42 |
+
print(embeddings.shape) # (2, 1024)
|
43 |
+
```
|
44 |
+
|
45 |
+
The laser_encoders [readme file](laser_encoders) provides more examples of its installation and usage.
|
46 |
+
|
47 |
+
## The full LASER kit
|
48 |
+
Apart from the `laser_encoders`, we provide support for LASER-1 (the original multilingual encoder)
|
49 |
+
and for various LASER applications listed below.
|
50 |
+
|
51 |
+
### Dependencies
|
52 |
+
* Python >= 3.7
|
53 |
+
* [PyTorch 1.0](http://pytorch.org/)
|
54 |
+
* [NumPy](http://www.numpy.org/), tested with 1.15.4
|
55 |
+
* [Cython](https://pypi.org/project/Cython/), needed by Python wrapper of FastBPE, tested with 0.29.6
|
56 |
+
* [Faiss](https://github.com/facebookresearch/faiss), for fast similarity search and bitext mining
|
57 |
+
* [transliterate 1.10.2](https://pypi.org/project/transliterate) (`pip install transliterate`)
|
58 |
+
* [jieba 0.39](https://pypi.org/project/jieba/), Chinese segmenter (`pip install jieba`)
|
59 |
+
* [mecab 0.996](https://pypi.org/project/JapaneseTokenizer/), Japanese segmenter
|
60 |
+
* tokenization from the Moses encoder (installed automatically)
|
61 |
+
* [FastBPE](https://github.com/glample/fastBPE), fast C++ implementation of byte-pair encoding (installed automatically)
|
62 |
+
* [Fairseq](https://github.com/pytorch/fairseq), sequence modeling toolkit (`pip install fairseq==0.12.1`)
|
63 |
+
* [tabulate](https://pypi.org/project/tabulate), pretty-print tabular data (`pip install tabulate`)
|
64 |
+
* [pandas](https://pypi.org/project/pandas), data analysis toolkit (`pip install pandas`)
|
65 |
+
* [Sentencepiece](https://github.com/google/sentencepiece), subword tokenization (installed automatically)
|
66 |
+
|
67 |
+
### Installation
|
68 |
+
* install the `laser_encoders` package by e.g. `pip install -e .` for installing it in the editable mode
|
69 |
+
* set the environment variable 'LASER' to the root of the installation, e.g.
|
70 |
+
`export LASER="${HOME}/projects/laser"`
|
71 |
+
* download encoders from Amazon s3 by e.g. `bash ./nllb/download_models.sh`
|
72 |
+
* download third party software by `bash ./install_external_tools.sh`
|
73 |
+
* download the data used in the example tasks (see description for each task)
|
74 |
+
|
75 |
+
## Applications
|
76 |
+
|
77 |
+
We showcase several applications of multilingual sentence embeddings
|
78 |
+
with code to reproduce our results (in the directory "tasks").
|
79 |
+
|
80 |
+
* [**Cross-lingual document classification**](tasks/mldoc) using the
|
81 |
+
[*MLDoc*](https://github.com/facebookresearch/MLDoc) corpus [2,6]
|
82 |
+
* [**WikiMatrix**](tasks/WikiMatrix)
|
83 |
+
Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia [7]
|
84 |
+
* [**Bitext mining**](tasks/bucc) using the
|
85 |
+
[*BUCC*](https://comparable.limsi.fr/bucc2018/bucc2018-task.html) corpus [3,5]
|
86 |
+
* [**Cross-lingual NLI**](tasks/xnli)
|
87 |
+
using the [*XNLI*](https://www.nyu.edu/projects/bowman/xnli/) corpus [4,5,6]
|
88 |
+
* [**Multilingual similarity search**](tasks/similarity) [1,6]
|
89 |
+
* [**Sentence embedding of text files**](tasks/embed)
|
90 |
+
example how to calculate sentence embeddings for arbitrary text files in any of the supported language.
|
91 |
+
|
92 |
+
**For all tasks, we use exactly the same multilingual encoder, without any task specific optimization or fine-tuning.**
|
93 |
+
|
94 |
+
## License
|
95 |
+
|
96 |
+
LASER is BSD-licensed, as found in the [`LICENSE`](LICENSE) file in the root directory of this source tree.
|
97 |
+
|
98 |
+
## Supported languages
|
99 |
+
|
100 |
+
The original LASER model was trained on the following languages:
|
101 |
+
|
102 |
+
Afrikaans, Albanian, Amharic, Arabic, Armenian, Aymara, Azerbaijani, Basque, Belarusian, Bengali,
|
103 |
+
Berber languages, Bosnian, Breton, Bulgarian, Burmese, Catalan, Central/Kadazan Dusun, Central Khmer,
|
104 |
+
Chavacano, Chinese, Coastal Kadazan, Cornish, Croatian, Czech, Danish, Dutch, Eastern Mari, English,
|
105 |
+
Esperanto, Estonian, Finnish, French, Galician, Georgian, German, Greek, Hausa, Hebrew, Hindi,
|
106 |
+
Hungarian, Icelandic, Ido, Indonesian, Interlingua, Interlingue, Irish, Italian, Japanese, Kabyle,
|
107 |
+
Kazakh, Korean, Kurdish, Latvian, Latin, Lingua Franca Nova, Lithuanian, Low German/Saxon,
|
108 |
+
Macedonian, Malagasy, Malay, Malayalam, Maldivian (Divehi), Marathi, Norwegian (Bokmål), Occitan,
|
109 |
+
Persian (Farsi), Polish, Portuguese, Romanian, Russian, Serbian, Sindhi, Sinhala, Slovak, Slovenian,
|
110 |
+
Somali, Spanish, Swahili, Swedish, Tagalog, Tajik, Tamil, Tatar, Telugu, Thai, Turkish, Uighur,
|
111 |
+
Ukrainian, Urdu, Uzbek, Vietnamese, Wu Chinese and Yue Chinese.
|
112 |
+
|
113 |
+
We have also observed that the model seems to generalize well to other (minority) languages or dialects, e.g.
|
114 |
+
|
115 |
+
Asturian, Egyptian Arabic, Faroese, Kashubian, North Moluccan Malay, Nynorsk Norwegian, Piedmontese, Sorbian, Swabian,
|
116 |
+
Swiss German or Western Frisian.
|
117 |
+
|
118 |
+
### LASER3
|
119 |
+
|
120 |
+
Updated LASER models referred to as *[LASER3](nllb/README.md)* supplement the above list with support for 147 languages. The full list of supported languages can be seen [here](nllb/README.md#list-of-available-laser3-encoders).
|
121 |
+
|
122 |
+
## References
|
123 |
+
|
124 |
+
[1] Holger Schwenk and Matthijs Douze,
|
125 |
+
[*Learning Joint Multilingual Sentence Representations with Neural Machine Translation*](https://aclanthology.info/papers/W17-2619/w17-2619),
|
126 |
+
ACL workshop on Representation Learning for NLP, 2017
|
127 |
+
|
128 |
+
[2] Holger Schwenk and Xian Li,
|
129 |
+
[*A Corpus for Multilingual Document Classification in Eight Languages*](http://www.lrec-conf.org/proceedings/lrec2018/pdf/658.pdf),
|
130 |
+
LREC, pages 3548-3551, 2018.
|
131 |
+
|
132 |
+
[3] Holger Schwenk,
|
133 |
+
[*Filtering and Mining Parallel Data in a Joint Multilingual Space*](http://aclweb.org/anthology/P18-2037)
|
134 |
+
ACL, July 2018
|
135 |
+
|
136 |
+
[4] Alexis Conneau, Guillaume Lample, Ruty Rinott, Adina Williams, Samuel R. Bowman, Holger Schwenk and Veselin Stoyanov,
|
137 |
+
[*XNLI: Cross-lingual Sentence Understanding through Inference*](https://aclweb.org/anthology/D18-1269),
|
138 |
+
EMNLP, 2018.
|
139 |
+
|
140 |
+
[5] Mikel Artetxe and Holger Schwenk,
|
141 |
+
[*Margin-based Parallel Corpus Mining with Multilingual Sentence Embeddings*](https://arxiv.org/abs/1811.01136)
|
142 |
+
arXiv, Nov 3 2018.
|
143 |
+
|
144 |
+
[6] Mikel Artetxe and Holger Schwenk,
|
145 |
+
[*Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond*](https://arxiv.org/abs/1812.10464)
|
146 |
+
arXiv, Dec 26 2018.
|
147 |
+
|
148 |
+
[7] Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman,
|
149 |
+
[*WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia*](https://arxiv.org/abs/1907.05791)
|
150 |
+
arXiv, July 11 2019.
|
151 |
+
|
152 |
+
[8] Holger Schwenk, Guillaume Wenzek, Sergey Edunov, Edouard Grave and Armand Joulin
|
153 |
+
[*CCMatrix: Mining Billions of High-Quality Parallel Sentences on the WEB*](https://arxiv.org/abs/1911.04944)
|
154 |
+
|
155 |
+
[9] Paul-Ambroise Duquenne, Hongyu Gong, Holger Schwenk,
|
156 |
+
[*Multimodal and Multilingual Embeddings for Large-Scale Speech Mining,*](https://papers.nips.cc/paper/2021/hash/8466f9ace6a9acbe71f75762ffc890f1-Abstract.html), NeurIPS 2021, pages 15748-15761.
|
157 |
+
|
158 |
+
[10] Kevin Heffernan, Onur Celebi, and Holger Schwenk,
|
159 |
+
[*Bitext Mining Using Distilled Sentence Representations for Low-Resource Languages*](https://arxiv.org/abs/2205.12654)
|
laser/docker/Dockerfile
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM continuumio/miniconda3
|
2 |
+
|
3 |
+
MAINTAINER Gilles Bodart <[email protected]>
|
4 |
+
|
5 |
+
# Install build-essential (compiler and development tools)
|
6 |
+
RUN apt-get update && \
|
7 |
+
apt-get install -y build-essential && \
|
8 |
+
rm -rf /var/lib/apt/lists/*
|
9 |
+
|
10 |
+
RUN conda create -n env python=3.8
|
11 |
+
RUN echo "source activate env" > ~/.bashrc
|
12 |
+
ENV PATH /opt/conda/envs/env/bin:$PATH
|
13 |
+
|
14 |
+
# Set the working directory to /app
|
15 |
+
WORKDIR /app
|
16 |
+
|
17 |
+
# Copy the local laser-encoders repository
|
18 |
+
COPY laser_encoders /app/laser_encoders
|
19 |
+
COPY pyproject.toml /app/pyproject.toml
|
20 |
+
|
21 |
+
RUN pip install --upgrade pip
|
22 |
+
RUN pip install -e .
|
23 |
+
RUN pip install Flask==2.3.3 Requests==2.31.0
|
24 |
+
|
25 |
+
# Define the argument for language
|
26 |
+
ARG langs="eng_Latn"
|
27 |
+
|
28 |
+
# Download language models for each specified language
|
29 |
+
RUN for lang in $langs; do \
|
30 |
+
python -m laser_encoders.download_models --lang=$lang; \
|
31 |
+
done
|
32 |
+
|
33 |
+
# Open the port 80
|
34 |
+
EXPOSE 80
|
35 |
+
|
36 |
+
COPY docker/app.py /app/app.py
|
37 |
+
|
38 |
+
CMD ["/bin/bash"]
|
laser/docker/README.md
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## LASER Docker Image
|
2 |
+
|
3 |
+
This image provides a convenient way to run LASER in a Docker container.
|
4 |
+
|
5 |
+
### Building the image
|
6 |
+
To build the image, run the following command from the root of the LASER directory:
|
7 |
+
|
8 |
+
```
|
9 |
+
docker build --tag laser -f docker/Dockerfile .
|
10 |
+
```
|
11 |
+
### Specifying Languages with `langs` Argument
|
12 |
+
|
13 |
+
You can pre-download the encoders and tokenizers for specific languages by using the `langs` build argument. This argument accepts a space-separated list of language codes. For example, to build an image with models for English and French, use the following command:
|
14 |
+
```
|
15 |
+
docker build --build-arg langs="eng_Latn fra_Latn" -t laser -f docker/Dockerfile .
|
16 |
+
```
|
17 |
+
If the `langs` argument is not specified during the build process, the image will default to building with English (`eng_Latn`). It's important to note that in this default case where English is selected, the LASER2 model, which supports 92 languages, is used. For a comprehensive list of LASER2 supported languages, refer to `LASER2_LANGUAGES_LIST` in [`language_list.py`](https://github.com/facebookresearch/LASER/blob/main/laser_encoders/language_list.py).
|
18 |
+
|
19 |
+
|
20 |
+
### Running the Image
|
21 |
+
Once the image is built, you can run it with the following command:
|
22 |
+
|
23 |
+
```
|
24 |
+
docker run -it laser
|
25 |
+
```
|
26 |
+
**Note:** If you want to expose a local port to the REST server on top of the embed task, you can do so by executing the following command instead of the last command:
|
27 |
+
|
28 |
+
```
|
29 |
+
docker run -it -p [CHANGEME_LOCAL_PORT]:80 laser python app.py
|
30 |
+
```
|
31 |
+
This will override the command line entrypoint of the Docker container.
|
32 |
+
|
33 |
+
Example:
|
34 |
+
|
35 |
+
```
|
36 |
+
docker run -it -p 8081:80 laser python app.py
|
37 |
+
```
|
38 |
+
|
39 |
+
This Flask server will serve a REST Api that can be use by calling your server with this URL :
|
40 |
+
|
41 |
+
```
|
42 |
+
http://127.0.0.1:[CHANGEME_LOCAL_PORT]/vectorize?q=[YOUR_SENTENCE_URL_ENCODED]&lang=[LANGUAGE]
|
43 |
+
```
|
44 |
+
|
45 |
+
Example:
|
46 |
+
|
47 |
+
```
|
48 |
+
http://127.0.0.1:8081/vectorize?q=ki%20lo%20'orukọ%20ẹ&lang=yor
|
49 |
+
```
|
50 |
+
|
51 |
+
Sample response:
|
52 |
+
```
|
53 |
+
{
|
54 |
+
"content": "ki lo 'orukọ ẹ",
|
55 |
+
"embedding": [
|
56 |
+
[
|
57 |
+
-0.10241681337356567,
|
58 |
+
0.11120740324258804,
|
59 |
+
-0.26641348004341125,
|
60 |
+
-0.055699944496154785,
|
61 |
+
....
|
62 |
+
....
|
63 |
+
....
|
64 |
+
-0.034048307687044144,
|
65 |
+
0.11005636304616928,
|
66 |
+
-0.3238321840763092,
|
67 |
+
-0.060631975531578064,
|
68 |
+
-0.19269055128097534,
|
69 |
+
]
|
70 |
+
}
|
71 |
+
```
|
72 |
+
|
73 |
+
Here is an example of how you can send requests to it with python:
|
74 |
+
|
75 |
+
```python
|
76 |
+
import requests
|
77 |
+
import numpy as np
|
78 |
+
url = "http://127.0.0.1:[CHANGEME_LOCAL_PORT]/vectorize"
|
79 |
+
params = {"q": "Hey, how are you?\nI'm OK and you?", "lang": "en"}
|
80 |
+
resp = requests.get(url=url, params=params).json()
|
81 |
+
print(resp["embedding"])
|
82 |
+
```
|
laser/docker/app.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import os
|
4 |
+
import socket
|
5 |
+
|
6 |
+
from flask import Flask, jsonify, request
|
7 |
+
|
8 |
+
from laser_encoders import LaserEncoderPipeline
|
9 |
+
from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE
|
10 |
+
|
11 |
+
app = Flask(__name__)
|
12 |
+
|
13 |
+
# Global cache for encoders
|
14 |
+
encoder_cache = {}
|
15 |
+
|
16 |
+
laser2_encoder = None
|
17 |
+
|
18 |
+
|
19 |
+
@app.route("/")
|
20 |
+
def root():
|
21 |
+
print("/")
|
22 |
+
html = "<h3>Hello {name}!</h3>" "<b>Hostname:</b> {hostname}<br/>"
|
23 |
+
return html.format(name=os.getenv("LASER", "world"), hostname=socket.gethostname())
|
24 |
+
|
25 |
+
|
26 |
+
@app.route("/vectorize", methods=["GET"])
|
27 |
+
def vectorize():
|
28 |
+
content = request.args.get("q")
|
29 |
+
lang = request.args.get(
|
30 |
+
"lang", "eng"
|
31 |
+
) # Default to English if 'lang' is not provided
|
32 |
+
|
33 |
+
if content is None:
|
34 |
+
return jsonify({"error": "Missing input content"}), 400
|
35 |
+
|
36 |
+
try:
|
37 |
+
global laser2_encoder
|
38 |
+
if lang in LASER2_LANGUAGE: # Checks for both 3-letter code or 8-letter code
|
39 |
+
if not laser2_encoder:
|
40 |
+
laser2_encoder = LaserEncoderPipeline(lang=lang)
|
41 |
+
encoder = laser2_encoder
|
42 |
+
else:
|
43 |
+
lang_code = LASER3_LANGUAGE.get(
|
44 |
+
lang, lang
|
45 |
+
) # Use language code as key to prevent multiple entries for same language
|
46 |
+
if lang_code not in encoder_cache:
|
47 |
+
encoder_cache[lang_code] = LaserEncoderPipeline(lang=lang_code)
|
48 |
+
encoder = encoder_cache[lang_code]
|
49 |
+
|
50 |
+
embeddings = encoder.encode_sentences([content])
|
51 |
+
embeddings_list = embeddings.tolist()
|
52 |
+
body = {"content": content, "embedding": embeddings_list}
|
53 |
+
return jsonify(body), 200
|
54 |
+
|
55 |
+
except ValueError as e:
|
56 |
+
# Check if the exception is due to an unsupported language
|
57 |
+
if "unsupported language" in str(e).lower():
|
58 |
+
return jsonify({"error": f"Language '{lang}' is not supported."}), 400
|
59 |
+
else:
|
60 |
+
return jsonify({"error": str(e)}), 400
|
61 |
+
|
62 |
+
|
63 |
+
if __name__ == "__main__":
|
64 |
+
app.run(debug=True, port=80, host="0.0.0.0")
|
laser/docker/decode.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import sys
|
3 |
+
|
4 |
+
dim = 1024
|
5 |
+
X = np.fromfile(sys.argv[1], dtype=np.float32, count=-1)
|
6 |
+
X.resize(X.shape[0] // dim, dim)
|
7 |
+
print(X)
|
laser/install_external_tools.sh
ADDED
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the BSD-style license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
# LASER Language-Agnostic SEntence Representations
|
9 |
+
# is a toolkit to calculate multilingual sentence embeddings
|
10 |
+
# and to use them for document classification, bitext filtering
|
11 |
+
# and mining
|
12 |
+
#
|
13 |
+
#-------------------------------------------------------
|
14 |
+
#
|
15 |
+
# This bash script installs third party software
|
16 |
+
#
|
17 |
+
|
18 |
+
if [ -z ${LASER} ] ; then
|
19 |
+
echo "Please set the environment variable 'LASER'"
|
20 |
+
exit
|
21 |
+
fi
|
22 |
+
|
23 |
+
###################################################################
|
24 |
+
#
|
25 |
+
# Generic helper functions
|
26 |
+
#
|
27 |
+
###################################################################
|
28 |
+
|
29 |
+
MKDIR () {
|
30 |
+
dname=$1
|
31 |
+
if [ ! -d ${dname} ] ; then
|
32 |
+
echo " - creating directory ${dname}"
|
33 |
+
mkdir -p ${dname}
|
34 |
+
fi
|
35 |
+
}
|
36 |
+
|
37 |
+
|
38 |
+
bdir="${LASER}"
|
39 |
+
tools_ext="${bdir}/tools-external"
|
40 |
+
MKDIR $tools_ext
|
41 |
+
|
42 |
+
###################################################################
|
43 |
+
#
|
44 |
+
# Tokenization tools from Moses
|
45 |
+
# It is important to use the official release V4 and not the current one
|
46 |
+
# to obtain the same results than the published ones.
|
47 |
+
# (the behavior of the tokenizer for end-of-sentence abbreviations has changed)
|
48 |
+
#
|
49 |
+
###################################################################
|
50 |
+
|
51 |
+
InstallMosesTools () {
|
52 |
+
moses_git="https://raw.githubusercontent.com/moses-smt/mosesdecoder/RELEASE-4.0/scripts"
|
53 |
+
moses_files=("tokenizer/tokenizer.perl" "tokenizer/detokenizer.perl" \
|
54 |
+
"tokenizer/normalize-punctuation.perl" \
|
55 |
+
"tokenizer/remove-non-printing-char.perl" \
|
56 |
+
"tokenizer/deescape-special-chars.perl" \
|
57 |
+
"tokenizer/lowercase.perl" \
|
58 |
+
"tokenizer/basic-protected-patterns" \
|
59 |
+
)
|
60 |
+
|
61 |
+
wdir="${tools_ext}/moses-tokenizer/tokenizer"
|
62 |
+
MKDIR ${wdir}
|
63 |
+
cd ${wdir}
|
64 |
+
|
65 |
+
for f in ${moses_files[@]} ; do
|
66 |
+
if [ ! -f `basename ${f}` ] ; then
|
67 |
+
echo " - download ${f}"
|
68 |
+
wget -q ${moses_git}/${f}
|
69 |
+
fi
|
70 |
+
done
|
71 |
+
chmod 755 *perl
|
72 |
+
|
73 |
+
# download non-breaking prefixes per language
|
74 |
+
moses_non_breakings="share/nonbreaking_prefixes/nonbreaking_prefix"
|
75 |
+
moses_non_breaking_langs=( \
|
76 |
+
"ca" "cs" "de" "el" "en" "es" "fi" "fr" "ga" "hu" "is" \
|
77 |
+
"it" "lt" "lv" "nl" "pl" "pt" "ro" "ru" "sk" "sl" "sv" \
|
78 |
+
"ta" "yue" "zh" )
|
79 |
+
wdir="${tools_ext}/moses-tokenizer/share/nonbreaking_prefixes"
|
80 |
+
MKDIR ${wdir}
|
81 |
+
cd ${wdir}
|
82 |
+
|
83 |
+
for l in ${moses_non_breaking_langs[@]} ; do
|
84 |
+
f="${moses_non_breakings}.${l}"
|
85 |
+
if [ ! -f `basename ${f}` ] ; then
|
86 |
+
echo " - download ${f}"
|
87 |
+
wget -q ${moses_git}/${f}
|
88 |
+
fi
|
89 |
+
done
|
90 |
+
}
|
91 |
+
|
92 |
+
|
93 |
+
###################################################################
|
94 |
+
#
|
95 |
+
# FAST BPE
|
96 |
+
#
|
97 |
+
###################################################################
|
98 |
+
|
99 |
+
InstallFastBPE () {
|
100 |
+
cd ${tools_ext}
|
101 |
+
if [ ! -x fastBPE/fast ] ; then
|
102 |
+
echo " - download fastBPE software from github"
|
103 |
+
wget https://github.com/glample/fastBPE/archive/master.zip
|
104 |
+
unzip master.zip
|
105 |
+
/bin/rm master.zip
|
106 |
+
mv fastBPE-master fastBPE
|
107 |
+
cd fastBPE
|
108 |
+
echo " - compiling"
|
109 |
+
g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast
|
110 |
+
if [ $? -eq 1 ] ; then
|
111 |
+
echo "ERROR: compilation failed, please install manually"; exit
|
112 |
+
fi
|
113 |
+
python setup.py install
|
114 |
+
fi
|
115 |
+
}
|
116 |
+
|
117 |
+
###################################################################
|
118 |
+
#
|
119 |
+
# SENTENCEPIECE
|
120 |
+
#
|
121 |
+
###################################################################
|
122 |
+
|
123 |
+
InstallSentencePiece () {
|
124 |
+
cd ${tools_ext}
|
125 |
+
if [ ! -d sentencepiece-master ] ; then
|
126 |
+
echo " - download sentencepiece from github"
|
127 |
+
wget https://github.com/google/sentencepiece/archive/master.zip
|
128 |
+
unzip master.zip
|
129 |
+
/bin/rm master.zip
|
130 |
+
if [ ! -s /usr/local/bin/spm_encode ] ; then
|
131 |
+
echo " - building code "
|
132 |
+
cd sentencepiece-master
|
133 |
+
mkdir build
|
134 |
+
cd build
|
135 |
+
cmake ..
|
136 |
+
make -j 10
|
137 |
+
fi
|
138 |
+
fi
|
139 |
+
}
|
140 |
+
|
141 |
+
|
142 |
+
###################################################################
|
143 |
+
#
|
144 |
+
# Install Japanese tokenizer Mecab
|
145 |
+
# We do not use automatic installation with "pip" but directly add the soruce directory
|
146 |
+
#
|
147 |
+
###################################################################
|
148 |
+
|
149 |
+
InstallMecab () {
|
150 |
+
cd ${tools_ext}
|
151 |
+
if [ ! -x mecab/mecab/bin/mecab ] ; then
|
152 |
+
echo " - download mecab from github"
|
153 |
+
wget https://github.com/taku910/mecab/archive/master.zip
|
154 |
+
unzip master.zip
|
155 |
+
#/bin/rm master.zip
|
156 |
+
if [ ! -s mecab/bin/mecab ] ; then
|
157 |
+
mkdir mecab
|
158 |
+
cd mecab-master/mecab
|
159 |
+
echo " - installing code"
|
160 |
+
./configure --prefix ${tools_ext}/mecab && make && make install
|
161 |
+
if [ $? -q 1 ] ; then
|
162 |
+
echo "ERROR: installation failed, please install manually"; exit
|
163 |
+
fi
|
164 |
+
fi
|
165 |
+
if [ ! -d mecab/lib/mecab/dic/ipadic ] ; then
|
166 |
+
cd ${tools_ext}/mecab-master/mecab-ipadic
|
167 |
+
echo " - installing dictionaries"
|
168 |
+
./configure --prefix ${tools_ext}/mecab --with-mecab-config=${tools_ext}/mecab/bin/mecab-config \
|
169 |
+
&& make && make install
|
170 |
+
if [ $? -eq 1 ] ; then
|
171 |
+
echo "ERROR: compilation failed, please install manually"; exit
|
172 |
+
fi
|
173 |
+
fi
|
174 |
+
fi
|
175 |
+
}
|
176 |
+
|
177 |
+
|
178 |
+
###################################################################
|
179 |
+
#
|
180 |
+
# main
|
181 |
+
#
|
182 |
+
###################################################################
|
183 |
+
|
184 |
+
echo "Installing the laser_encoders package in editable mode"
|
185 |
+
|
186 |
+
pip install -e .
|
187 |
+
|
188 |
+
echo "Installing external tools"
|
189 |
+
|
190 |
+
InstallMosesTools
|
191 |
+
InstallFastBPE
|
192 |
+
InstallSentencePiece
|
193 |
+
|
194 |
+
#InstallMecab
|
195 |
+
echo ""
|
196 |
+
echo "automatic installation of the Japanese tokenizer mecab may be tricky"
|
197 |
+
echo "Please install it manually from https://github.com/taku910/mecab"
|
198 |
+
echo ""
|
199 |
+
echo "The installation directory should be ${LASER}/tools-external/mecab"
|
200 |
+
echo ""
|
laser/install_models.sh
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the BSD-style license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
# LASER Language-Agnostic SEntence Representations
|
9 |
+
# is a toolkit to calculate multilingual sentence embeddings
|
10 |
+
# and to use them for document classification, bitext filtering
|
11 |
+
# and mining
|
12 |
+
#
|
13 |
+
#-------------------------------------------------------
|
14 |
+
#
|
15 |
+
# This bash script installs sentence encoders from Amazon s3
|
16 |
+
#
|
17 |
+
|
18 |
+
if [ -z ${LASER} ] ; then
|
19 |
+
echo "Please set the environment variable 'LASER'"
|
20 |
+
exit
|
21 |
+
fi
|
22 |
+
|
23 |
+
mdir="${LASER}/models"
|
24 |
+
|
25 |
+
# available encoders
|
26 |
+
s3="https://dl.fbaipublicfiles.com/laser/models"
|
27 |
+
networks=("bilstm.eparl21.2018-11-19.pt" \
|
28 |
+
"eparl21.fcodes" "eparl21.fvocab" \
|
29 |
+
"bilstm.93langs.2018-12-26.pt" \
|
30 |
+
"93langs.fcodes" "93langs.fvocab")
|
31 |
+
|
32 |
+
|
33 |
+
echo "Downloading networks"
|
34 |
+
|
35 |
+
if [ ! -d ${mdir} ] ; then
|
36 |
+
echo " - creating directory ${mdir}"
|
37 |
+
mkdir -p ${mdir}
|
38 |
+
fi
|
39 |
+
|
40 |
+
cd ${mdir}
|
41 |
+
for f in ${networks[@]} ; do
|
42 |
+
if [ -f ${f} ] ; then
|
43 |
+
echo " - ${mdir}/${f} already downloaded"
|
44 |
+
else
|
45 |
+
echo " - ${f}"
|
46 |
+
wget -q ${s3}/${f}
|
47 |
+
fi
|
48 |
+
done
|
laser/laser2.cvocab
ADDED
The diff for this file is too large to render.
See raw diff
|
|
laser/laser2.spm
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1f7ef5da4408b94a096ff72d31d90f8ba438b4ab772764eb50c3db5e201fb384
|
3 |
+
size 1008139
|
laser/laser_encoders/README.md
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# LASER encoders
|
2 |
+
|
3 |
+
LASER Language-Agnostic SEntence Representations Toolkit
|
4 |
+
|
5 |
+
laser_encoders is the official Python package for the Facebook [LASER](https://github.com/facebookresearch/LASER) library. It provides a simple and convenient way to use LASER embeddings in Python. It allows you to calculate multilingual sentence embeddings using the LASER toolkit. These embeddings can be utilized for various natural language processing tasks, including document classification, bitext filtering, and mining.
|
6 |
+
|
7 |
+
## Dependencies
|
8 |
+
|
9 |
+
- Python `>= 3.8`
|
10 |
+
- [PyTorch `>= 1.10.0`](http://pytorch.org/)
|
11 |
+
- sacremoses `>=0.1.0`
|
12 |
+
- sentencepiece `>=0.1.99`
|
13 |
+
- numpy `>=1.21.3`
|
14 |
+
- fairseq `>=0.12.2`
|
15 |
+
|
16 |
+
You can find a full list of requirements [here](https://github.com/facebookresearch/LASER/blob/main/pyproject.toml)
|
17 |
+
|
18 |
+
## Installation
|
19 |
+
|
20 |
+
You can install `laser_encoders` package from PyPI:
|
21 |
+
|
22 |
+
```sh
|
23 |
+
pip install laser_encoders
|
24 |
+
```
|
25 |
+
|
26 |
+
Alternatively, you can install it from a local clone of this repository, in editable mode:
|
27 |
+
```sh
|
28 |
+
pip install . -e
|
29 |
+
```
|
30 |
+
|
31 |
+
## Usage
|
32 |
+
|
33 |
+
Here's a simple example on how to obtain embeddings for sentences using the `LaserEncoderPipeline`:
|
34 |
+
|
35 |
+
>**Note:** By default, the models will be downloaded to the `~/.cache/laser_encoders` directory. To specify a different download location, you can provide the argument `model_dir=path/to/model/directory`
|
36 |
+
|
37 |
+
```py
|
38 |
+
from laser_encoders import LaserEncoderPipeline
|
39 |
+
|
40 |
+
# Initialize the LASER encoder pipeline
|
41 |
+
encoder = LaserEncoderPipeline(lang="igbo")
|
42 |
+
|
43 |
+
# Encode sentences into embeddings
|
44 |
+
embeddings = encoder.encode_sentences(["nnọọ, kedu ka ị mere"])
|
45 |
+
# If you want the output embeddings to be L2-normalized, set normalize_embeddings to True
|
46 |
+
normalized_embeddings = encoder.encode_sentences(["nnọọ, kedu ka ị mere"], normalize_embeddings=True)
|
47 |
+
|
48 |
+
```
|
49 |
+
|
50 |
+
If you prefer more control over the tokenization and encoding process, you can initialize the tokenizer and encoder separately:
|
51 |
+
```py
|
52 |
+
from laser_encoders import initialize_encoder, initialize_tokenizer
|
53 |
+
|
54 |
+
# Initialize the LASER tokenizer
|
55 |
+
tokenizer = initialize_tokenizer(lang="igbo")
|
56 |
+
tokenized_sentence = tokenizer.tokenize("nnọọ, kedu ka ị mere")
|
57 |
+
|
58 |
+
# Initialize the LASER sentence encoder
|
59 |
+
encoder = initialize_encoder(lang="igbo")
|
60 |
+
|
61 |
+
# Encode tokenized sentences into embeddings
|
62 |
+
embeddings = encoder.encode_sentences([tokenized_sentence])
|
63 |
+
```
|
64 |
+
>By default, the `spm` flag is set to `True` when initializing the encoder, ensuring the accompanying spm model is downloaded.
|
65 |
+
|
66 |
+
**Supported Languages:** You can specify any language from the [FLORES200](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200) dataset. This includes both languages identified by their full codes (like "ibo_Latn") and simpler alternatives (like "igbo").
|
67 |
+
|
68 |
+
## Downloading the pre-trained models
|
69 |
+
|
70 |
+
If you prefer to download the models individually, you can use the following command:
|
71 |
+
|
72 |
+
```sh
|
73 |
+
python -m laser_encoders.download_models --lang=your_prefered_language # e.g., --lang="igbo""
|
74 |
+
```
|
75 |
+
|
76 |
+
By default, the downloaded models will be stored in the `~/.cache/laser_encoders` directory. To specify a different download location, utilize the following command:
|
77 |
+
|
78 |
+
```sh
|
79 |
+
python -m laser_encoders.download_models --model-dir=path/to/model/directory
|
80 |
+
```
|
81 |
+
|
82 |
+
> For a comprehensive list of available arguments, you can use the `--help` command with the download_models script.
|
83 |
+
|
84 |
+
Once you have successfully downloaded the models, you can utilize the `SentenceEncoder` to tokenize and encode your text in your desired language. Here's an example of how you can achieve this:
|
85 |
+
|
86 |
+
```py
|
87 |
+
from laser_encoders.models import SentenceEncoder
|
88 |
+
from pathlib import Path
|
89 |
+
|
90 |
+
encoder = SentenceEncoder(model_path=path/to/downloaded/model, spm_model=Path(path/to/spm_model), spm_vocab=path/to/cvocab)
|
91 |
+
embeddings = encoder("This is a test sentence.")
|
92 |
+
```
|
93 |
+
If you want to perform tokenization seperately, you can do this below:
|
94 |
+
```py
|
95 |
+
from laser_encoders.laser_tokenizer import LaserTokenizer
|
96 |
+
|
97 |
+
tokenizer = LaserTokenizer(spm_model=Path(path/to/spm_model))
|
98 |
+
|
99 |
+
tokenized_sentence = tokenizer.tokenize("This is a test sentence.")
|
100 |
+
|
101 |
+
encoder = SentenceEncoder(model_path=path/to/downloaded/model, spm_vocab=path/to/cvocab)
|
102 |
+
embeddings = encoder.encode_sentences([tokenized_sentence])
|
103 |
+
```
|
104 |
+
|
105 |
+
For tokenizing a file instead of a string, you can use the following:
|
106 |
+
|
107 |
+
```py
|
108 |
+
tokenized_sentence = tokenizer.tokenize_file(inp_fname=Path(path/to/input_file.txt), out_fname=Path(path/to/output_file.txt))
|
109 |
+
```
|
110 |
+
|
111 |
+
### Now you can use these embeddings for downstream tasks
|
112 |
+
|
113 |
+
For more advanced usage and options, please refer to the official LASER repository documentation.
|
114 |
+
|
115 |
+
## LASER Versions and Associated Packages
|
116 |
+
|
117 |
+
For users familiar with the earlier version of LASER, you might have encountered the [`laserembeddings`](https://pypi.org/project/laserembeddings/) package. This package primarily dealt with LASER-1 model embeddings.
|
118 |
+
|
119 |
+
For the latest LASER-2,3 models, use the newly introduced `laser_encoders` package, which offers better performance and support for a wider range of languages.
|
120 |
+
|
121 |
+
|
122 |
+
## Contributing
|
123 |
+
|
124 |
+
We welcome contributions from the developer community to enhance and improve laser_encoders. If you'd like to contribute, you can:
|
125 |
+
|
126 |
+
1. Submit bug reports or feature requests through GitHub issues.
|
127 |
+
1. Fork the repository, make changes, and submit pull requests for review.
|
128 |
+
|
129 |
+
Please follow our [Contribution Guidelines](https://github.com/facebookresearch/LASER/blob/main/CONTRIBUTING.md) to ensure a smooth process.
|
130 |
+
|
131 |
+
### Code of Conduct
|
132 |
+
|
133 |
+
We expect all contributors to adhere to our [Code of Conduct](https://github.com/facebookresearch/LASER/blob/main/CODE_OF_CONDUCT.md).
|
134 |
+
|
135 |
+
### Contributors
|
136 |
+
|
137 |
+
The following people have contributed to this project:
|
138 |
+
|
139 |
+
- [Victor Joseph](https://github.com/CaptainVee)
|
140 |
+
- [Paul Okewunmi](https://github.com/Paulooh007)
|
141 |
+
- [Siddharth Singh Rana](https://github.com/NIXBLACK11)
|
142 |
+
- [David Dale](https://github.com/avidale/)
|
143 |
+
- [Holger Schwenk](https://github.com/hoschwenk)
|
144 |
+
- [Kevin Heffernan](https://github.com/heffernankevin)
|
145 |
+
|
146 |
+
### License
|
147 |
+
|
148 |
+
This package is released under the [LASER](https://github.com/facebookresearch/LASER/blob/main/LICENSE) BSD License.
|
149 |
+
|
laser/laser_encoders/__init__.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the BSD-style license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
# LASER Language-Agnostic SEntence Representations
|
9 |
+
# is a toolkit to calculate multilingual sentence embeddings
|
10 |
+
# and to use them for document classification, bitext filtering
|
11 |
+
# and mining
|
12 |
+
#
|
13 |
+
# -------------------------------------------------------
|
14 |
+
|
15 |
+
from laser_encoders.laser_tokenizer import initialize_tokenizer
|
16 |
+
from laser_encoders.models import LaserEncoderPipeline, initialize_encoder
|
laser/laser_encoders/download_models.py
ADDED
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the BSD-style license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
# LASER Language-Agnostic SEntence Representations
|
9 |
+
# is a toolkit to calculate multilingual sentence embeddings
|
10 |
+
# and to use them for document classification, bitext filtering
|
11 |
+
# and mining
|
12 |
+
#
|
13 |
+
# -------------------------------------------------------
|
14 |
+
#
|
15 |
+
# This python script installs NLLB LASER2 and LASER3 sentence encoders from Amazon s3
|
16 |
+
|
17 |
+
import argparse
|
18 |
+
import logging
|
19 |
+
import os
|
20 |
+
import shutil
|
21 |
+
import sys
|
22 |
+
import tempfile
|
23 |
+
from pathlib import Path
|
24 |
+
|
25 |
+
import requests
|
26 |
+
from tqdm import tqdm
|
27 |
+
|
28 |
+
from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE, SPM_LANGUAGE
|
29 |
+
|
30 |
+
logging.basicConfig(
|
31 |
+
stream=sys.stdout,
|
32 |
+
level=logging.INFO,
|
33 |
+
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
|
34 |
+
)
|
35 |
+
logger = logging.getLogger(__name__)
|
36 |
+
|
37 |
+
|
38 |
+
class LaserModelDownloader:
|
39 |
+
def __init__(self, model_dir: str = None):
|
40 |
+
if model_dir is None:
|
41 |
+
model_dir = os.path.expanduser("~/.cache/laser_encoders")
|
42 |
+
os.makedirs(model_dir, exist_ok=True)
|
43 |
+
|
44 |
+
self.model_dir = Path(model_dir)
|
45 |
+
self.base_url = "https://dl.fbaipublicfiles.com/nllb/laser"
|
46 |
+
|
47 |
+
def download(self, filename: str):
|
48 |
+
# Because on windows os.path.join will use "\" insted of "/", so link would be:
|
49 |
+
# https://dl.fbaipublicfiles.com/nllb/laser\laser2.pt instead of https://dl.fbaipublicfiles.com/nllb/laser/laser2.pt
|
50 |
+
# which results in a failed download.
|
51 |
+
url = f"{self.base_url}/{filename}"
|
52 |
+
local_file_path = os.path.join(self.model_dir, filename)
|
53 |
+
|
54 |
+
if os.path.exists(local_file_path):
|
55 |
+
logger.info(f" - {filename} already downloaded")
|
56 |
+
else:
|
57 |
+
logger.info(f" - Downloading {filename}")
|
58 |
+
|
59 |
+
tf = tempfile.NamedTemporaryFile(delete=False)
|
60 |
+
temp_file_path = tf.name
|
61 |
+
|
62 |
+
with tf:
|
63 |
+
response = requests.get(url, stream=True)
|
64 |
+
total_size = int(response.headers.get("Content-Length", 0))
|
65 |
+
progress_bar = tqdm(total=total_size, unit_scale=True, unit="B")
|
66 |
+
|
67 |
+
for chunk in response.iter_content(chunk_size=1024):
|
68 |
+
tf.write(chunk)
|
69 |
+
progress_bar.update(len(chunk))
|
70 |
+
progress_bar.close()
|
71 |
+
|
72 |
+
shutil.move(temp_file_path, local_file_path)
|
73 |
+
|
74 |
+
def get_language_code(self, language_list: dict, lang: str) -> str:
|
75 |
+
try:
|
76 |
+
lang_3_4 = language_list[lang]
|
77 |
+
if isinstance(lang_3_4, list):
|
78 |
+
options = ", ".join(f"'{opt}'" for opt in lang_3_4)
|
79 |
+
raise ValueError(
|
80 |
+
f"Language '{lang}' has multiple options: {options}. Please specify using the 'lang' argument."
|
81 |
+
)
|
82 |
+
return lang_3_4
|
83 |
+
except KeyError:
|
84 |
+
raise ValueError(
|
85 |
+
f"language name: {lang} not found in language list. Specify a supported language name"
|
86 |
+
)
|
87 |
+
|
88 |
+
def download_laser2(self):
|
89 |
+
self.download("laser2.pt")
|
90 |
+
self.download("laser2.spm")
|
91 |
+
self.download("laser2.cvocab")
|
92 |
+
|
93 |
+
def download_laser3(self, lang: str, spm: bool = False):
|
94 |
+
result = self.get_language_code(LASER3_LANGUAGE, lang)
|
95 |
+
|
96 |
+
if isinstance(result, list):
|
97 |
+
raise ValueError(
|
98 |
+
f"There are script-specific models available for {lang}. Please choose one from the following: {result}"
|
99 |
+
)
|
100 |
+
|
101 |
+
lang = result
|
102 |
+
self.download(f"laser3-{lang}.v1.pt")
|
103 |
+
if spm:
|
104 |
+
if lang in SPM_LANGUAGE:
|
105 |
+
self.download(f"laser3-{lang}.v1.spm")
|
106 |
+
self.download(f"laser3-{lang}.v1.cvocab")
|
107 |
+
else:
|
108 |
+
self.download(f"laser2.spm")
|
109 |
+
self.download(f"laser2.cvocab")
|
110 |
+
|
111 |
+
def main(self, args):
|
112 |
+
if args.laser:
|
113 |
+
if args.laser == "laser2":
|
114 |
+
self.download_laser2()
|
115 |
+
elif args.laser == "laser3":
|
116 |
+
self.download_laser3(lang=args.lang, spm=args.spm)
|
117 |
+
else:
|
118 |
+
raise ValueError(
|
119 |
+
f"Unsupported laser model: {args.laser}. Choose either laser2 or laser3."
|
120 |
+
)
|
121 |
+
else:
|
122 |
+
if args.lang in LASER3_LANGUAGE:
|
123 |
+
self.download_laser3(lang=args.lang, spm=args.spm)
|
124 |
+
elif args.lang in LASER2_LANGUAGE:
|
125 |
+
self.download_laser2()
|
126 |
+
else:
|
127 |
+
raise ValueError(
|
128 |
+
f"Unsupported language name: {args.lang}. Please specify a supported language name using --lang."
|
129 |
+
)
|
130 |
+
|
131 |
+
|
132 |
+
if __name__ == "__main__":
|
133 |
+
parser = argparse.ArgumentParser(description="LASER: Download Laser models")
|
134 |
+
parser.add_argument(
|
135 |
+
"--laser",
|
136 |
+
type=str,
|
137 |
+
help="Laser model to download",
|
138 |
+
)
|
139 |
+
parser.add_argument(
|
140 |
+
"--lang",
|
141 |
+
type=str,
|
142 |
+
help="The language name in FLORES200 format",
|
143 |
+
)
|
144 |
+
parser.add_argument(
|
145 |
+
"--spm",
|
146 |
+
action="store_false",
|
147 |
+
help="Do not download the SPM model?",
|
148 |
+
)
|
149 |
+
parser.add_argument(
|
150 |
+
"--model-dir", type=str, help="The directory to download the models to"
|
151 |
+
)
|
152 |
+
args = parser.parse_args()
|
153 |
+
downloader = LaserModelDownloader(args.model_dir)
|
154 |
+
downloader.main(args)
|
laser/laser_encoders/language_list.py
ADDED
@@ -0,0 +1,564 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# -------------------------------------------------------
# Language mapping to handle different language codes and names


def build_language_names_dict(language_list: list, language_names: dict) -> dict:
    """
    Build a dictionary mapping language names to their corresponding language codes.

    Parameters:
    - language_list (list): A list of language codes.
    - language_names (dict): A dictionary mapping language codes to language names.

    Returns:
    - dict: A dictionary mapping language names to their corresponding language codes.
    """
    result_dict = {}

    for lang_code in language_list:
        if lang_code not in language_names:
            raise ValueError(
                f"Language code '{lang_code}' not found in the provided language_names dictionary."
            )

        names_list = language_names[lang_code]

        # Ensure names_list is always a list
        if not isinstance(names_list, list):
            names_list = [names_list]

        for name in names_list:
            if name not in result_dict:
                result_dict[name] = []
            result_dict[name].append(lang_code)

    # Remove single-element lists and convert them to the element itself
    for key in result_dict:
        if len(result_dict[key]) == 1:
            result_dict[key] = result_dict[key][0]

    return result_dict


SPM_LANGUAGE = [
    "amh_Ethi",
    "ayr_Latn",
    "azj_Latn",
    "bak_Cyrl",
    "bel_Cyrl",
    "bod_Tibt",
    "ckb_Arab",
    "crh_Latn",
    "dik_Latn",
    "dzo_Tibt",
    "fur_Latn",
    "fuv_Latn",
    "grn_Latn",
    "kab_Latn",
    "kac_Latn",
    "kaz_Cyrl",
    "kir_Cyrl",
    "kmr_Latn",
    "lij_Latn",
    "lim_Latn",
    "lmo_Latn",
    "ltg_Latn",
    "mya_Mymr",
    "pbt_Arab",
    "pes_Arab",
    "prs_Arab",
    "sat_Beng",
    "scn_Latn",
    "srd_Latn",
    "szl_Latn",
    "taq_Latn",
    "tgk_Cyrl",
    "tir_Ethi",
    "tzm_Tfng",
    "vec_Latn",
]


##################################
###### LANGUAGE NAMES ############
##################################

LANGUAGE_NAMES = {
    "ace_Arab": ["acehnese", "ace", "ace_Arab"],
    "ace_Latn": ["acehnese", "ace", "ace_Latn"],
    "acm_Arab": ["mesopotamian arabic", "acm", "acm_Arab"],
    "acq_Arab": ["ta’izzi-adeni arabic", "acq", "acq_Arab"],
    "aeb_Arab": ["tunisian arabic", "aeb", "aeb_Arab"],
    "afr_Latn": ["afrikaans", "afr", "afr_Latn"],
    "ajp_Arab": ["south levantine arabic", "ajp", "ajp_Arab"],
    "aka_Latn": ["akan", "aka", "aka_Latn"],
    "amh_Ethi": ["amharic", "amh", "amh_Ethi"],
    "apc_Arab": ["north levantine arabic", "apc", "apc_Arab"],
    "arb_Arab": ["modern standard arabic", "arb", "arb_Arab"],
    "arb_Latn": ["modern standard arabic", "arb", "arb_Latn"],
    "ars_Arab": ["najdi arabic", "ars", "ars_Arab"],
    "ary_Arab": ["moroccan arabic", "ary", "ary_Arab"],
    "arz_Arab": ["egyptian arabic", "arz", "arz_Arab"],
    "asm_Beng": ["assamese", "asm", "asm_Beng"],
    "ast_Latn": ["asturian", "ast", "ast_Latn"],
    "awa_Deva": ["awadhi", "awa", "awa_Deva"],
    "ayr_Latn": ["central aymara", "ayr", "ayr_Latn"],
    "azb_Arab": ["south azerbaijani", "azb", "azb_Arab"],
    "azj_Latn": ["north azerbaijani", "azj", "azj_Latn"],
    "bak_Cyrl": ["bashkir", "bak", "bak_Cyrl"],
    "bam_Latn": ["bambara", "bam", "bam_Latn"],
    "ban_Latn": ["balinese", "ban", "ban_Latn"],
    "bel_Cyrl": ["belarusian", "bel", "bel_Cyrl"],
    "bem_Latn": ["bemba", "bem", "bem_Latn"],
    "ben_Beng": ["bengali", "ben", "ben_Beng"],
    "bho_Deva": ["bhojpuri", "bho", "bho_Deva"],
    "bjn_Arab": ["banjar", "bjn", "bjn_Arab"],
    "bjn_Latn": ["banjar", "bjn", "bjn_Latn"],
    "bod_Tibt": ["standard tibetan", "bod", "bod_Tibt"],
    "bos_Latn": ["bosnian", "bos", "bos_Latn"],
    "bug_Latn": ["buginese", "bug", "bug_Latn"],
    "bul_Cyrl": ["bulgarian", "bul", "bul_Cyrl"],
    "cat_Latn": ["catalan", "cat", "cat_Latn"],
    "ceb_Latn": ["cebuano", "ceb", "ceb_Latn"],
    "ces_Latn": ["czech", "ces", "ces_Latn"],
    "cjk_Latn": ["chokwe", "cjk", "cjk_Latn"],
    "ckb_Arab": ["central kurdish", "ckb", "ckb_Arab"],
    "crh_Latn": ["crimean tatar", "crh", "crh_Latn"],
    "cym_Latn": ["welsh", "cym", "cym_Latn"],
    "dan_Latn": ["danish", "dan", "dan_Latn"],
    "deu_Latn": ["german", "deu", "deu_Latn"],
    "dik_Latn": ["southwestern dinka", "dik", "dik_Latn"],
    "dyu_Latn": ["dyula", "dyu", "dyu_Latn"],
    "dzo_Tibt": ["dzongkha", "dzo", "dzo_Tibt"],
    "ell_Grek": ["greek", "ell", "ell_Grek"],
    "eng_Latn": ["english", "eng", "eng_Latn"],
    "epo_Latn": ["esperanto", "epo", "epo_Latn"],
    "est_Latn": ["estonian", "est", "est_Latn"],
    "eus_Latn": ["basque", "eus", "eus_Latn"],
    "ewe_Latn": ["ewe", "ewe_Latn"],
    "fao_Latn": ["faroese", "fao", "fao_Latn"],
    "fij_Latn": ["fijian", "fij", "fij_Latn"],
    "fin_Latn": ["finnish", "fin", "fin_Latn"],
    "fon_Latn": ["fon", "fon_Latn"],
    "fra_Latn": ["french", "fra", "fra_Latn"],
    "fur_Latn": ["friulian", "fur", "fur_Latn"],
    "fuv_Latn": ["nigerian fulfulde", "fuv", "fuv_Latn"],
    "gla_Latn": ["scottish gaelic", "gla", "gla_Latn"],
    "gle_Latn": ["irish", "gle", "gle_Latn"],
    "glg_Latn": ["galician", "glg", "glg_Latn"],
    "grn_Latn": ["guarani", "grn", "grn_Latn"],
    "guj_Gujr": ["gujarati", "guj", "guj_Gujr"],
    "hat_Latn": ["haitian creole", "hat", "hat_Latn"],
    "hau_Latn": ["hausa", "hau", "hau_Latn"],
    "heb_Hebr": ["hebrew", "heb", "heb_Hebr"],
    "hin_Deva": ["hindi", "hin", "hin_Deva"],
    "hne_Deva": ["chhattisgarhi", "hne", "hne_Deva"],
    "hrv_Latn": ["croatian", "hrv", "hrv_Latn"],
    "hun_Latn": ["hungarian", "hun", "hun_Latn"],
    "hye_Armn": ["armenian", "hye", "hye_Armn"],
    "ibo_Latn": ["igbo", "ibo", "ibo_Latn"],
    "ilo_Latn": ["ilocano", "ilo", "ilo_Latn"],
    "ind_Latn": ["indonesian", "ind", "ind_Latn"],
    "isl_Latn": ["icelandic", "isl", "isl_Latn"],
    "ita_Latn": ["italian", "ita", "ita_Latn"],
    "jav_Latn": ["javanese", "jav", "jav_Latn"],
    "jpn_Jpan": ["japanese", "jpn", "jpn_Jpan"],
    "kab_Latn": ["kabyle", "kab", "kab_Latn"],
    "kac_Latn": ["jingpho", "kac", "kac_Latn"],
    "kam_Latn": ["kamba", "kam", "kam_Latn"],
    "kan_Knda": ["kannada", "kan", "kan_Knda"],
    "kas_Arab": ["kashmiri", "kas", "kas_Arab"],
    "kas_Deva": ["kashmiri", "kas", "kas_Deva"],
    "kat_Geor": ["georgian", "kat", "kat_Geor"],
    "knc_Arab": ["central kanuri", "knc", "knc_Arab"],
    "knc_Latn": ["central kanuri", "knc", "knc_Latn"],
    "kaz_Cyrl": ["kazakh", "kaz", "kaz_Cyrl"],
    "kbp_Latn": ["kabiyè", "kbp", "kbp_Latn"],
    "kea_Latn": ["kabuverdianu", "kea", "kea_Latn"],
    "khm_Khmr": ["khmer", "khm", "khm_Khmr"],
    "kik_Latn": ["kikuyu", "kik", "kik_Latn"],
    "kin_Latn": ["kinyarwanda", "kin", "kin_Latn"],
    "kir_Cyrl": ["kyrgyz", "kir", "kir_Cyrl"],
    "kmb_Latn": ["kimbundu", "kmb", "kmb_Latn"],
    "kmr_Latn": ["northern kurdish", "kmr", "kmr_Latn"],
    "kon_Latn": ["kikongo", "kon", "kon_Latn"],
    "kor_Hang": ["korean", "kor", "kor_Hang"],
    "lao_Laoo": ["lao", "lao_Laoo"],
    "lij_Latn": ["ligurian", "lij", "lij_Latn"],
    "lim_Latn": ["limburgish", "lim", "lim_Latn"],
    "lin_Latn": ["lingala", "lin", "lin_Latn"],
    "lit_Latn": ["lithuanian", "lit", "lit_Latn"],
    "lmo_Latn": ["lombard", "lmo", "lmo_Latn"],
    "ltg_Latn": ["latgalian", "ltg", "ltg_Latn"],
    "ltz_Latn": ["luxembourgish", "ltz", "ltz_Latn"],
    "lua_Latn": ["luba-kasai", "lua", "lua_Latn"],
    "lug_Latn": ["ganda", "lug", "lug_Latn"],
    "luo_Latn": ["luo", "luo_Latn"],
    "lus_Latn": ["mizo", "lus", "lus_Latn"],
    "lvs_Latn": ["standard latvian", "lvs", "lvs_Latn"],
    "mag_Deva": ["magahi", "mag", "mag_Deva"],
    "mai_Deva": ["maithili", "mai", "mai_Deva"],
    "mal_Mlym": ["malayalam", "mal", "mal_Mlym"],
    "mar_Deva": ["marathi", "mar", "mar_Deva"],
    "min_Arab": ["minangkabau", "min", "min_Arab"],
    "min_Latn": ["minangkabau", "min", "min_Latn"],
    "mkd_Cyrl": ["macedonian", "mkd", "mkd_Cyrl"],
    "plt_Latn": ["plateau malagasy", "plt", "plt_Latn"],
    "mlt_Latn": ["maltese", "mlt", "mlt_Latn"],
    "mni_Beng": ["meitei", "mni", "mni_Beng"],
    "khk_Cyrl": ["halh mongolian", "khk", "khk_Cyrl"],
    "mos_Latn": ["mossi", "mos", "mos_Latn"],
    "mri_Latn": ["maori", "mri", "mri_Latn"],
    "mya_Mymr": ["burmese", "mya", "mya_Mymr"],
    "nld_Latn": ["dutch", "nld", "nld_Latn"],
    "nno_Latn": ["norwegian nynorsk", "nno", "nno_Latn"],
    "nob_Latn": ["norwegian bokmål", "nob", "nob_Latn"],
    "npi_Deva": ["nepali", "npi", "npi_Deva"],
    "nso_Latn": ["northern sotho", "nso", "nso_Latn"],
    "nus_Latn": ["nuer", "nus", "nus_Latn"],
    "nya_Latn": ["nyanja", "nya", "nya_Latn"],
    "oci_Latn": ["occitan", "oci", "oci_Latn"],
    "gaz_Latn": ["west central oromo", "gaz", "gaz_Latn"],
    "ory_Orya": ["odia", "ory", "ory_Orya"],
    "pag_Latn": ["pangasinan", "pag", "pag_Latn"],
    "pan_Guru": ["eastern panjabi", "pan", "pan_Guru"],
    "pap_Latn": ["papiamento", "pap", "pap_Latn"],
    "pes_Arab": ["western persian", "pes", "pes_Arab"],
    "pol_Latn": ["polish", "pol", "pol_Latn"],
    "por_Latn": ["portuguese", "por", "por_Latn"],
    "prs_Arab": ["dari", "prs", "prs_Arab"],
    "pbt_Arab": ["southern pashto", "pbt", "pbt_Arab"],
    "quy_Latn": ["ayacucho quechua", "quy", "quy_Latn"],
    "ron_Latn": ["romanian", "ron", "ron_Latn"],
    "run_Latn": ["rundi", "run", "run_Latn"],
    "rus_Cyrl": ["russian", "rus", "rus_Cyrl"],
    "sag_Latn": ["sango", "sag", "sag_Latn"],
    "san_Deva": ["sanskrit", "san", "san_Deva"],
    "sat_Olck": ["santali", "sat", "sat_Olck"],
    "scn_Latn": ["sicilian", "scn", "scn_Latn"],
    "shn_Mymr": ["shan", "shn", "shn_Mymr"],
    "sin_Sinh": ["sinhala", "sin", "sin_Sinh"],
    "slk_Latn": ["slovak", "slk", "slk_Latn"],
    "slv_Latn": ["slovenian", "slv", "slv_Latn"],
    "smo_Latn": ["samoan", "smo", "smo_Latn"],
    "sna_Latn": ["shona", "sna", "sna_Latn"],
    "snd_Arab": ["sindhi", "snd", "snd_Arab"],
    "som_Latn": ["somali", "som", "som_Latn"],
    "sot_Latn": ["southern sotho", "sot", "sot_Latn"],
    "spa_Latn": ["spanish", "spa", "spa_Latn"],
    "als_Latn": ["tosk albanian", "als", "als_Latn"],
    "srd_Latn": ["sardinian", "srd", "srd_Latn"],
    "srp_Cyrl": ["serbian", "srp", "srp_Cyrl"],
    "ssw_Latn": ["swati", "ssw", "ssw_Latn"],
    "sun_Latn": ["sundanese", "sun", "sun_Latn"],
    "swe_Latn": ["swedish", "swe", "swe_Latn"],
    "swh_Latn": ["swahili", "swh", "swh_Latn"],
    "szl_Latn": ["silesian", "szl", "szl_Latn"],
    "tam_Taml": ["tamil", "tam", "tam_Taml"],
    "tat_Cyrl": ["tatar", "tat", "tat_Cyrl"],
    "tel_Telu": ["telugu", "tel", "tel_Telu"],
    "tgk_Cyrl": ["tajik", "tgk", "tgk_Cyrl"],
    "tgl_Latn": ["tagalog", "tgl", "tgl_Latn"],
    "tha_Thai": ["thai", "tha", "tha_Thai"],
    "tir_Ethi": ["tigrinya", "tir", "tir_Ethi"],
    "taq_Latn": ["tamasheq", "taq", "taq_Latn"],
    "taq_Tfng": ["tamasheq", "taq", "taq_Tfng"],
    "tpi_Latn": ["tok pisin", "tpi", "tpi_Latn"],
    "tsn_Latn": ["tswana", "tsn", "tsn_Latn"],
    "tso_Latn": ["tsonga", "tso", "tso_Latn"],
    "tuk_Latn": ["turkmen", "tuk", "tuk_Latn"],
    "tum_Latn": ["tumbuka", "tum", "tum_Latn"],
    "tur_Latn": ["turkish", "tur", "tur_Latn"],
    "twi_Latn": ["twi", "twi_Latn"],
    "tzm_Tfng": ["central atlas tamazight", "tzm", "tzm_Tfng"],
    "uig_Arab": ["uyghur", "uig", "uig_Arab"],
    "ukr_Cyrl": ["ukrainian", "ukr", "ukr_Cyrl"],
    "umb_Latn": ["umbundu", "umb", "umb_Latn"],
    "urd_Arab": ["urdu", "urd", "urd_Arab"],
    "uzn_Latn": ["northern uzbek", "uzn", "uzn_Latn"],
    "vec_Latn": ["venetian", "vec", "vec_Latn"],
    "vie_Latn": ["vietnamese", "vie", "vie_Latn"],
    "war_Latn": ["waray", "war", "war_Latn"],
    "wol_Latn": ["wolof", "wol", "wol_Latn"],
    "xho_Latn": ["xhosa", "xho", "xho_Latn"],
    "ydd_Hebr": ["eastern yiddish", "ydd", "ydd_Hebr"],
    "yor_Latn": ["yoruba", "yor", "yor_Latn"],
    "yue_Hant": ["yue chinese", "yue", "yue_Hant"],
    "zho_Hans": ["chinese", "zho", "zho_Hans"],
    "zho_Hant": ["chinese", "zho", "zho_Hant"],
    "zsm_Latn": ["standard malay", "zsm", "zsm_Latn"],
    "zul_Latn": ["zulu", "zul", "zul_Latn"],
    "diq_Latn": ["southern zaza", "diq", "diq_Latn"],
    "sat_Beng": ["santali", "sat", "sat_Beng"],
}

##################################
###### LASER 3 ###################
##################################

LASER3_LANGUAGES_LIST = [
    "ace_Latn",
    "aka_Latn",
    "als_Latn",
    "amh_Ethi",
    "asm_Beng",
    "awa_Deva",
    "ayr_Latn",
    "azb_Arab",
    "azj_Latn",
    "bak_Cyrl",
    "bam_Latn",
    "ban_Latn",
    "bel_Cyrl",
    "bem_Latn",
    "ben_Beng",
    "bho_Deva",
    "bjn_Latn",
    "bod_Tibt",
    "bug_Latn",
    "ceb_Latn",
    "cjk_Latn",
    "ckb_Arab",
    "crh_Latn",
    "cym_Latn",
    "dik_Latn",
    "diq_Latn",
    "dyu_Latn",
    "dzo_Tibt",
    "ewe_Latn",
    "fao_Latn",
    "fij_Latn",
    "fon_Latn",
    "fur_Latn",
    "fuv_Latn",
    "gaz_Latn",
    "gla_Latn",
    "gle_Latn",
    "grn_Latn",
    "guj_Gujr",
    "hat_Latn",
    "hau_Latn",
    "hin_Deva",
    "hne_Deva",
    "hye_Armn",
    "ibo_Latn",
    "ilo_Latn",
    "ind_Latn",
    "jav_Latn",
    "kab_Latn",
    "kac_Latn",
    "kam_Latn",
    "kan_Knda",
    "kas_Arab",
    "kas_Deva",
    "kat_Geor",
    "kaz_Cyrl",
    "kbp_Latn",
    "kea_Latn",
    "khk_Cyrl",
    "khm_Khmr",
    "kik_Latn",
    "kin_Latn",
    "kir_Cyrl",
    "kmb_Latn",
    "kmr_Latn",
    "knc_Arab",
    "knc_Latn",
    "kon_Latn",
    "lao_Laoo",
    "lij_Latn",
    "lim_Latn",
    "lin_Latn",
    "lmo_Latn",
    "ltg_Latn",
    "ltz_Latn",
    "lua_Latn",
    "lug_Latn",
    "luo_Latn",
    "lus_Latn",
    "mag_Deva",
    "mai_Deva",
    "mal_Mlym",
    "mar_Deva",
    "min_Latn",
    "mlt_Latn",
    "mni_Beng",
    "mos_Latn",
    "mri_Latn",
    "mya_Mymr",
    "npi_Deva",
    "nso_Latn",
    "nus_Latn",
    "nya_Latn",
    "ory_Orya",
    "pag_Latn",
    "pan_Guru",
    "pap_Latn",
    "pbt_Arab",
    "pes_Arab",
    "plt_Latn",
    "prs_Arab",
    "quy_Latn",
    "run_Latn",
    "sag_Latn",
    "san_Deva",
    "sat_Beng",
    "scn_Latn",
    "shn_Mymr",
    "sin_Sinh",
    "smo_Latn",
    "sna_Latn",
    "snd_Arab",
    "som_Latn",
    "sot_Latn",
    "srd_Latn",
    "ssw_Latn",
    "sun_Latn",
    "swh_Latn",
    "szl_Latn",
    "tam_Taml",
    "taq_Latn",
    "tat_Cyrl",
    "tel_Telu",
    "tgk_Cyrl",
    "tgl_Latn",
    "tha_Thai",
    "tir_Ethi",
    "tpi_Latn",
    "tsn_Latn",
    "tso_Latn",
    "tuk_Latn",
    "tum_Latn",
    "tur_Latn",
    "twi_Latn",
    "tzm_Tfng",
    "uig_Arab",
    "umb_Latn",
    "urd_Arab",
    "uzn_Latn",
    "vec_Latn",
    "war_Latn",
    "wol_Latn",
    "xho_Latn",
    "ydd_Hebr",
    "yor_Latn",
    "zsm_Latn",
    "zul_Latn",
]


LASER3_LANGUAGE = build_language_names_dict(LASER3_LANGUAGES_LIST, LANGUAGE_NAMES)

##################################
###### LASER 2 ###################
##################################

LASER2_LANGUAGES_LIST = [
    "acm_Arab",
    "acq_Arab",
    "aeb_Arab",
    "afr_Latn",
    "ajp_Arab",
    "amh_Ethi",
    "apc_Arab",
    "arb_Arab",
    "arb_Latn",
    "ars_Arab",
    "ary_Arab",
    "arz_Arab",
    "ayr_Latn",
    "azb_Arab",
    "azj_Latn",
    "bel_Cyrl",
    "ben_Beng",
    "bos_Latn",
    "bul_Cyrl",
    "cat_Latn",
    "ces_Latn",
    "ckb_Arab",
    "crh_Latn",
    "dan_Latn",
    "deu_Latn",
    "ell_Grek",
    "eng_Latn",
    "epo_Latn",
    "est_Latn",
    "eus_Latn",
    "fin_Latn",
    "fra_Latn",
    "gle_Latn",
    "glg_Latn",
    "hau_Latn",
    "heb_Hebr",
    "hin_Deva",
    "hrv_Latn",
    "hun_Latn",
    "hye_Armn",
    "ind_Latn",
    "isl_Latn",
    "ita_Latn",
    "jpn_Jpan",
    "kab_Latn",
    "kat_Geor",
    "kaz_Cyrl",
    "khm_Khmr",
    "kmr_Latn",
    "kor_Hang",
    "lit_Latn",
    "lvs_Latn",
    "mal_Mlym",
    "mar_Deva",
    "mkd_Cyrl",
    "plt_Latn",
    "mya_Mymr",
    "nld_Latn",
    "nob_Latn",
    "oci_Latn",
    "pes_Arab",
    "pol_Latn",
    "por_Latn",
    "ron_Latn",
    "rus_Cyrl",
    "sin_Sinh",
    "slk_Latn",
    "slv_Latn",
    "snd_Arab",
    "som_Latn",
    "spa_Latn",
    "als_Latn",
    "srp_Cyrl",
    "swe_Latn",
    "swh_Latn",
    "tam_Taml",
    "tat_Cyrl",
    "tel_Telu",
    "tgk_Cyrl",
    "tgl_Latn",
    "tha_Thai",
    "tur_Latn",
    "uig_Arab",
    "ukr_Cyrl",
    "urd_Arab",
    "uzn_Latn",
    "vie_Latn",
    "yue_Hant",
    "yue_Hant",
    "zho_Hans",
    "zho_Hant",
    "zsm_Latn",
]


LASER2_LANGUAGE = build_language_names_dict(LASER2_LANGUAGES_LIST, LANGUAGE_NAMES)
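As a small illustration (not part of the commit), the lookup tables built above map either a FLORES200 code, a short code, or a plain-English name to the FLORES200 code(s); values collapse to a single string when the name is unambiguous and stay as a list otherwise. A minimal sketch based on the entries defined in this file:

# Illustrative sketch only: behaviour of the generated lookup dictionaries.
from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE

print(LASER3_LANGUAGE["zulu"])       # -> "zul_Latn" (single matching code)
print(LASER3_LANGUAGE["kashmiri"])   # -> ["kas_Arab", "kas_Deva"] (two scripts)
print("english" in LASER2_LANGUAGE)  # -> True (eng_Latn is a LASER2 language)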
laser/laser_encoders/laser_tokenizer.py
ADDED
@@ -0,0 +1,179 @@
#!/usr/bin/python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
#
# Helper functions for tokenization

import gzip
import logging
import os
import re
import sys
from pathlib import Path
from typing import IO, List

import sentencepiece as spm
from sacremoses import MosesDetokenizer, MosesPunctNormalizer
from unicategories import categories

from laser_encoders.download_models import LaserModelDownloader
from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE, SPM_LANGUAGE

SPACE_NORMALIZER = re.compile(r"\s+")
NON_PRINT_CHARS = set(c for c in categories["C"].characters())

logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
)
logger = logging.getLogger("preprocess")


class LaserTokenizer:
    def __init__(
        self,
        spm_model: Path,
        lang: str = "en",
        lower_case: bool = True,
        descape: bool = False,
        verbose: bool = False,
        over_write: bool = False,
        normalize_punct: bool = True,
    ):
        self.spm_model = spm_model
        self.lang = lang
        self.lower_case = lower_case
        self.descape = descape
        self.verbose = verbose
        self.over_write = over_write
        self.normalize_punct = normalize_punct

        assert spm_model.exists(), f"spm model file: {spm_model} does not exist"
        self.moses_punct_normalizer = MosesPunctNormalizer(self.lang, perl_parity=True)
        # add parity with MOSES release-4.0
        self.moses_punct_normalizer.substitutions[21] = ("‘", r'"')
        self.moses_punct_normalizer.substitutions[22] = ("‚", r'"')
        self.moses_detokenizer = MosesDetokenizer()
        self.spm_encoder = spm.SentencePieceProcessor(model_file=str(self.spm_model))

    def open(self, file: Path, mode: str, encoding="utf-8") -> IO:
        return (
            gzip.open(file, mode, encoding=encoding)
            if file.name.endswith(".gz")
            else open(file, mode, encoding=encoding)
        )

    def log(self, message: str) -> None:
        if self.verbose:
            logger.info(message)

    def tokenize(self, text: str) -> str:
        # Preprocessing
        sentence_text = "".join([c if c not in NON_PRINT_CHARS else " " for c in text])
        if self.normalize_punct:
            sentence_text = self.moses_punct_normalizer.normalize(sentence_text)
        if self.descape:
            sentence_text = self.moses_detokenizer.unescape_xml(text=sentence_text)
        if self.lower_case:
            sentence_text = sentence_text.lower()

        # SentencePiece encoding
        encoded_text = " ".join(self.spm_encoder.encode(sentence_text, out_type=str))
        return encoded_text

    def tokenize_file(self, inp_fname: Path, out_fname: Path) -> None:
        if not self.over_write and out_fname.exists():
            self.log(f"tokenized file {out_fname.name} already exists")
            return
        else:
            self.log(
                f"tokenizing {inp_fname.name}"
                + f"{' (de-escaped)' if self.descape else ''}"
                + f"{' (lower-cased)' if self.lower_case else ' (cased)'} "
                + f"(punctuation-normalization lang: {self.lang})"
            )

            with self.open(inp_fname, "rt") as file_in, open(
                out_fname, "w"
            ) as file_out:
                for line in file_in:
                    tokens = self.tokenize(line.strip())
                    file_out.write(tokens + "\n")

    def __call__(self, text_or_batch):
        if isinstance(text_or_batch, str):
            return self.tokenize(text_or_batch)
        else:
            return self.tokenize_batch(text_or_batch)

    def tokenize_batch(self, batch: List[str]) -> List[List[str]]:
        return [self.tokenize(text) for text in batch]

    def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
        return [self.spm_encoder.DecodeIds(ids) for ids in ids]

    def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
        ids = []

        for token in tokens:
            # Apply the same tokenization logic as in _tokenize method
            tokens = SPACE_NORMALIZER.sub(" ", token).strip().split()

            # Initialize an empty tensor for this token's IDs
            token_ids = []

            for i, token in enumerate(tokens):
                token_id = self.spm_encoder.PieceToId(token)
                if token_id == 0:  # Handle out-of-vocabulary tokens
                    token_id = self.spm_encoder.PieceToId("<unk>")
                token_ids.append(token_id)

            # Append token IDs to the final IDs tensor
            ids.extend(token_ids)

        return ids


def initialize_tokenizer(lang: str = None, model_dir: str = None, laser: str = None):
    downloader = LaserModelDownloader(model_dir)
    if laser is not None:
        if laser == "laser3":
            lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
            if lang in SPM_LANGUAGE:
                filename = f"laser3-{lang}.v1.spm"
            else:
                filename = "laser2.spm"
        elif laser == "laser2":
            filename = "laser2.spm"
        else:
            raise ValueError(
                f"Unsupported laser model: {laser}. Choose either laser2 or laser3."
            )
    else:
        if lang in LASER3_LANGUAGE:
            lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
            if lang in SPM_LANGUAGE:
                filename = f"laser3-{lang}.v1.spm"
            else:
                filename = "laser2.spm"
        elif lang in LASER2_LANGUAGE:
            filename = "laser2.spm"
        else:
            raise ValueError(
                f"Unsupported language name: {lang}. Please specify a supported language name."
            )

    downloader.download(filename)
    model_path = os.path.join(downloader.model_dir, filename)
    return LaserTokenizer(spm_model=Path(model_path))
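As a quick illustration (not part of the commit), initialize_tokenizer downloads the appropriate SPM model and returns a ready LaserTokenizer; a minimal sketch mirroring the test file later in this commit (the "./laser_models" directory is hypothetical):

# Illustrative sketch only: tokenize a sentence with the laser2 SPM model.
from laser_encoders.laser_tokenizer import initialize_tokenizer

tokenizer = initialize_tokenizer(laser="laser2", model_dir="./laser_models")
print(tokenizer.tokenize("This is a test sentence."))
# expected, per the tests below: "▁this ▁is ▁a ▁test ▁sent ence ."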
laser/laser_encoders/models.py
ADDED
@@ -0,0 +1,426 @@
#!/usr/bin/python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------


import logging
import os
import re
import sys
import warnings
from collections import namedtuple
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
from fairseq.data.dictionary import Dictionary
from fairseq.models.transformer import Embedding, TransformerEncoder
from fairseq.modules import LayerNorm

from laser_encoders.download_models import LaserModelDownloader
from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE
from laser_encoders.laser_tokenizer import LaserTokenizer, initialize_tokenizer

SPACE_NORMALIZER = re.compile(r"\s+")
Batch = namedtuple("Batch", "srcs tokens lengths")

logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
)
logger = logging.getLogger("embed")


class SentenceEncoder:
    def __init__(
        self,
        model_path,
        max_sentences=None,
        max_tokens=None,
        spm_vocab=None,
        spm_model=None,
        cpu=False,
        fp16=False,
        verbose=False,
        sort_kind="quicksort",
    ):
        if verbose:
            logger.info(f"loading encoder: {model_path}")
        self.spm_model = spm_model
        if self.spm_model:
            self.tokenizer = LaserTokenizer(spm_model=Path(self.spm_model))

        self.use_cuda = torch.cuda.is_available() and not cpu
        self.max_sentences = max_sentences
        self.max_tokens = max_tokens
        if self.max_tokens is None and self.max_sentences is None:
            self.max_sentences = 1

        state_dict = torch.load(model_path)
        if "params" in state_dict:
            self.encoder = LaserLstmEncoder(**state_dict["params"])
            self.encoder.load_state_dict(state_dict["model"])
            self.dictionary = state_dict["dictionary"]
            self.prepend_bos = False
            self.left_padding = False
        else:
            self.encoder = LaserTransformerEncoder(state_dict, spm_vocab)
            self.dictionary = self.encoder.dictionary.indices
            self.prepend_bos = state_dict["cfg"]["model"].prepend_bos
            self.left_padding = state_dict["cfg"]["model"].left_pad_source
        del state_dict
        self.bos_index = self.dictionary["<s>"] = 0
        self.pad_index = self.dictionary["<pad>"] = 1
        self.eos_index = self.dictionary["</s>"] = 2
        self.unk_index = self.dictionary["<unk>"] = 3

        if fp16:
            self.encoder.half()
        if self.use_cuda:
            if verbose:
                logger.info("transfer encoder to GPU")
            self.encoder.cuda()
        self.encoder.eval()
        self.sort_kind = sort_kind

    def __call__(self, text_or_batch):
        if self.spm_model:
            text_or_batch = self.tokenizer(text_or_batch)
            if isinstance(text_or_batch, str):
                text_or_batch = [text_or_batch]
            return self.encode_sentences(text_or_batch)
        else:
            raise ValueError(
                "Either initialize the encoder with an spm_model or pre-tokenize and use the encode_sentences method."
            )

    def _process_batch(self, batch):
        tokens = batch.tokens
        lengths = batch.lengths
        if self.use_cuda:
            tokens = tokens.cuda()
            lengths = lengths.cuda()

        with torch.no_grad():
            sentemb = self.encoder(tokens, lengths)["sentemb"]
        embeddings = sentemb.detach().cpu().numpy()
        return embeddings

    def _tokenize(self, line):
        tokens = SPACE_NORMALIZER.sub(" ", line).strip().split()
        ntokens = len(tokens)
        if self.prepend_bos:
            ids = torch.LongTensor(ntokens + 2)
            ids[0] = self.bos_index
            for i, token in enumerate(tokens):
                ids[i + 1] = self.dictionary.get(token, self.unk_index)
            ids[ntokens + 1] = self.eos_index
        else:
            ids = torch.LongTensor(ntokens + 1)
            for i, token in enumerate(tokens):
                ids[i] = self.dictionary.get(token, self.unk_index)
            ids[ntokens] = self.eos_index
        return ids

    def _make_batches(self, lines):
        tokens = [self._tokenize(line) for line in lines]
        lengths = np.array([t.numel() for t in tokens])
        indices = np.argsort(-lengths, kind=self.sort_kind)

        def batch(tokens, lengths, indices):
            toks = tokens[0].new_full((len(tokens), tokens[0].shape[0]), self.pad_index)
            if not self.left_padding:
                for i in range(len(tokens)):
                    toks[i, : tokens[i].shape[0]] = tokens[i]
            else:
                for i in range(len(tokens)):
                    toks[i, -tokens[i].shape[0] :] = tokens[i]
            return (
                Batch(srcs=None, tokens=toks, lengths=torch.LongTensor(lengths)),
                indices,
            )

        batch_tokens, batch_lengths, batch_indices = [], [], []
        ntokens = nsentences = 0
        for i in indices:
            if nsentences > 0 and (
                (self.max_tokens is not None and ntokens + lengths[i] > self.max_tokens)
                or (self.max_sentences is not None and nsentences == self.max_sentences)
            ):
                yield batch(batch_tokens, batch_lengths, batch_indices)
                ntokens = nsentences = 0
                batch_tokens, batch_lengths, batch_indices = [], [], []
            batch_tokens.append(tokens[i])
            batch_lengths.append(lengths[i])
            batch_indices.append(i)
            ntokens += tokens[i].shape[0]
            nsentences += 1
        if nsentences > 0:
            yield batch(batch_tokens, batch_lengths, batch_indices)

    def encode_sentences(self, sentences, normalize_embeddings=False):
        indices = []
        results = []
        for batch, batch_indices in self._make_batches(sentences):
            indices.extend(batch_indices)
            encoded_batch = self._process_batch(batch)
            if normalize_embeddings:
                # Perform L2 normalization on the embeddings
                norms = np.linalg.norm(encoded_batch, axis=1, keepdims=True)
                encoded_batch = encoded_batch / norms
            results.append(encoded_batch)
        return np.vstack(results)[np.argsort(indices, kind=self.sort_kind)]


class LaserTransformerEncoder(TransformerEncoder):
    def __init__(self, state_dict, vocab_path):
        self.dictionary = Dictionary.load(vocab_path)
        if any(
            k in state_dict["model"]
            for k in ["encoder.layer_norm.weight", "layer_norm.weight"]
        ):
            self.dictionary.add_symbol("<mask>")
        cfg = state_dict["cfg"]["model"]
        self.sentemb_criterion = cfg.sentemb_criterion
        self.pad_idx = self.dictionary.pad_index
        self.bos_idx = self.dictionary.bos_index
        embed_tokens = Embedding(
            len(self.dictionary),
            cfg.encoder_embed_dim,
            self.pad_idx,
        )
        super().__init__(cfg, self.dictionary, embed_tokens)
        if "decoder.version" in state_dict["model"]:
            self._remove_decoder_layers(state_dict)
        if "layer_norm.weight" in state_dict["model"]:
            self.layer_norm = LayerNorm(cfg.encoder_embed_dim)
        self.load_state_dict(state_dict["model"])

    def _remove_decoder_layers(self, state_dict):
        for key in list(state_dict["model"].keys()):
            if not key.startswith(
                (
                    "encoder.layer_norm",
                    "encoder.layers",
                    "encoder.embed",
                    "encoder.version",
                )
            ):
                del state_dict["model"][key]
            else:
                renamed_key = key.replace("encoder.", "")
                state_dict["model"][renamed_key] = state_dict["model"].pop(key)

    def forward(self, src_tokens, src_lengths):
        encoder_out = super().forward(src_tokens, src_lengths)
        if isinstance(encoder_out, dict):
            x = encoder_out["encoder_out"][0]  # T x B x C
        else:
            x = encoder_out[0]
        if self.sentemb_criterion == "cls":
            cls_indices = src_tokens.eq(self.bos_idx).t()
            sentemb = x[cls_indices, :]
        else:
            padding_mask = src_tokens.eq(self.pad_idx).t().unsqueeze(-1)
            if padding_mask.any():
                x = x.float().masked_fill_(padding_mask, float("-inf")).type_as(x)
            sentemb = x.max(dim=0)[0]
        return {"sentemb": sentemb}


class LaserLstmEncoder(nn.Module):
    def __init__(
        self,
        num_embeddings,
        padding_idx,
        embed_dim=320,
        hidden_size=512,
        num_layers=1,
        bidirectional=False,
        left_pad=True,
        padding_value=0.0,
    ):
        super().__init__()

        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.hidden_size = hidden_size

        self.padding_idx = padding_idx
        self.embed_tokens = nn.Embedding(
            num_embeddings, embed_dim, padding_idx=self.padding_idx
        )

        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=bidirectional,
        )
        self.left_pad = left_pad
        self.padding_value = padding_value

        self.output_units = hidden_size
        if bidirectional:
            self.output_units *= 2

    def forward(self, src_tokens, src_lengths):
        bsz, seqlen = src_tokens.size()

        # embed tokens
        x = self.embed_tokens(src_tokens)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # pack embedded source tokens into a PackedSequence
        packed_x = nn.utils.rnn.pack_padded_sequence(x, src_lengths.data.tolist())

        # apply LSTM
        if self.bidirectional:
            state_size = 2 * self.num_layers, bsz, self.hidden_size
        else:
            state_size = self.num_layers, bsz, self.hidden_size
        h0 = x.data.new(*state_size).zero_()
        c0 = x.data.new(*state_size).zero_()
        packed_outs, (final_hiddens, final_cells) = self.lstm(packed_x, (h0, c0))

        # unpack outputs and apply dropout
        x, _ = nn.utils.rnn.pad_packed_sequence(
            packed_outs, padding_value=self.padding_value
        )
        assert list(x.size()) == [seqlen, bsz, self.output_units]

        if self.bidirectional:

            def combine_bidir(outs):
                return torch.cat(
                    [
                        torch.cat([outs[2 * i], outs[2 * i + 1]], dim=0).view(
                            1, bsz, self.output_units
                        )
                        for i in range(self.num_layers)
                    ],
                    dim=0,
                )

            final_hiddens = combine_bidir(final_hiddens)
            final_cells = combine_bidir(final_cells)

        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()

        # Set padded outputs to -inf so they are not selected by max-pooling
        padding_mask = src_tokens.eq(self.padding_idx).t().unsqueeze(-1)
        if padding_mask.any():
            x = x.float().masked_fill_(padding_mask, float("-inf")).type_as(x)

        # Build the sentence embedding by max-pooling over the encoder outputs
        sentemb = x.max(dim=0)[0]

        return {
            "sentemb": sentemb,
            "encoder_out": (x, final_hiddens, final_cells),
            "encoder_padding_mask": encoder_padding_mask
            if encoder_padding_mask.any()
            else None,
        }


def initialize_encoder(
    lang: str = None,
    model_dir: str = None,
    spm: bool = True,
    laser: str = None,
):
    downloader = LaserModelDownloader(model_dir)
    if laser is not None:
        if laser == "laser3":
            lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
            downloader.download_laser3(lang=lang, spm=spm)
            file_path = f"laser3-{lang}.v1"
        elif laser == "laser2":
            downloader.download_laser2()
            file_path = "laser2"
        else:
            raise ValueError(
                f"Unsupported laser model: {laser}. Choose either laser2 or laser3."
            )
    else:
        if lang in LASER3_LANGUAGE:
            lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
            downloader.download_laser3(lang=lang, spm=spm)
            file_path = f"laser3-{lang}.v1"
        elif lang in LASER2_LANGUAGE:
            downloader.download_laser2()
            file_path = "laser2"
        else:
            raise ValueError(
                f"Unsupported language name: {lang}. Please specify a supported language name."
            )

    model_dir = downloader.model_dir
    model_path = os.path.join(model_dir, f"{file_path}.pt")
    spm_vocab = os.path.join(model_dir, f"{file_path}.cvocab")

    if not os.path.exists(spm_vocab):
        # if there is no cvocab for the laser3 lang use laser2 cvocab
        spm_vocab = os.path.join(model_dir, "laser2.cvocab")

    return SentenceEncoder(model_path=model_path, spm_vocab=spm_vocab, spm_model=None)


class LaserEncoderPipeline:
    def __init__(
        self,
        lang: str = None,
        model_dir: str = None,
        spm: bool = True,
        laser: str = None,
    ):

        if laser == "laser2" and lang is not None:
            warnings.warn(
                "Warning: The 'lang' parameter is optional when using 'laser2'. It will be ignored."
            )

        if laser == "laser3" and lang is None:
            raise ValueError("For 'laser3', the 'lang' parameter is required.")

        if laser is None and lang is None:
            raise ValueError("Either 'laser' or 'lang' should be provided.")

        self.tokenizer = initialize_tokenizer(
            lang=lang, model_dir=model_dir, laser=laser
        )
        self.encoder = initialize_encoder(
            lang=lang, model_dir=model_dir, spm=spm, laser=laser
        )

    def encode_sentences(
        self, sentences: list, normalize_embeddings: bool = False
    ) -> list:
        """
        Tokenizes and encodes a list of sentences.

        Args:
        - sentences (list of str): List of sentences to tokenize and encode.

        Returns:
        - List of embeddings for each sentence.
        """
        tokenized_sentences = [
            self.tokenizer.tokenize(sentence) for sentence in sentences
        ]
        return self.encoder.encode_sentences(tokenized_sentences, normalize_embeddings)
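As a quick illustration (not part of the commit), LaserEncoderPipeline bundles the tokenizer and encoder for end-to-end embedding; a minimal sketch mirroring the test fixture later in this commit (the "./laser_models" directory is hypothetical):

# Illustrative sketch only: embed an Igbo sentence with the pipeline defined above.
from laser_encoders import LaserEncoderPipeline

encoder = LaserEncoderPipeline(lang="igbo", model_dir="./laser_models")
embeddings = encoder.encode_sentences(["nnọọ, kedu ka ị mere"], normalize_embeddings=True)
print(embeddings.shape)  # (1, 1024): one 1024-dimensional vector per input sentence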
laser/laser_encoders/test_laser_tokenizer.py
ADDED
@@ -0,0 +1,310 @@
1 |
+
#!/usr/bin/python3
|
2 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the BSD-style license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
# LASER Language-Agnostic SEntence Representations
|
9 |
+
# is a toolkit to calculate multilingual sentence embeddings
|
10 |
+
# and to use them for document classification, bitext filtering
|
11 |
+
# and mining
|
12 |
+
#
|
13 |
+
# --------------------------------------------------------
|
14 |
+
# Tests for LaserTokenizer
|
15 |
+
|
16 |
+
import os
|
17 |
+
import warnings
|
18 |
+
from pathlib import Path
|
19 |
+
from tempfile import TemporaryDirectory
|
20 |
+
from typing import List
|
21 |
+
|
22 |
+
import numpy as np
|
23 |
+
import pytest
|
24 |
+
|
25 |
+
from laser_encoders import (
|
26 |
+
LaserEncoderPipeline,
|
27 |
+
initialize_encoder,
|
28 |
+
initialize_tokenizer,
|
29 |
+
)
|
30 |
+
|
31 |
+
|
32 |
+
@pytest.fixture
|
33 |
+
def tokenizer(tmp_path: Path):
|
34 |
+
tokenizer_instance = initialize_tokenizer(model_dir=tmp_path, laser="laser2")
|
35 |
+
return tokenizer_instance
|
36 |
+
|
37 |
+
|
38 |
+
@pytest.fixture
|
39 |
+
def input_text() -> str:
|
40 |
+
return "This is a test sentence."
|
41 |
+
|
42 |
+
|
43 |
+
@pytest.fixture
|
44 |
+
def test_readme_params() -> dict:
|
45 |
+
return {
|
46 |
+
"lang": "igbo",
|
47 |
+
"input_sentences": ["nnọọ, kedu ka ị mere"],
|
48 |
+
"expected_embedding_shape": (1, 1024),
|
49 |
+
"expected_array": [
|
50 |
+
0.3807628,
|
51 |
+
-0.27941525,
|
52 |
+
-0.17819545,
|
53 |
+
0.44144684,
|
54 |
+
-0.38985375,
|
55 |
+
0.04719935,
|
56 |
+
0.20238206,
|
57 |
+
-0.03934783,
|
58 |
+
0.0118901,
|
59 |
+
0.28986093,
|
60 |
+
],
|
61 |
+
}
|
62 |
+
|
63 |
+
|
64 |
+
def test_tokenize(tokenizer, input_text: str):
|
65 |
+
expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
|
66 |
+
assert tokenizer.tokenize(input_text) == expected_output
|
67 |
+
|
68 |
+
|
69 |
+
def test_tokenizer_call_method(tokenizer, input_text: str):
|
70 |
+
single_string = "This is a test sentence."
|
71 |
+
expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
|
72 |
+
assert tokenizer(single_string) == expected_output
|
73 |
+
|
74 |
+
list_of_strings = ["This is a test sentence.", "This is another test sentence."]
|
75 |
+
expected_output = [
|
76 |
+
"▁this ▁is ▁a ▁test ▁sent ence .",
|
77 |
+
"▁this ▁is ▁another ▁test ▁sent ence .",
|
78 |
+
]
|
79 |
+
assert tokenizer(list_of_strings) == expected_output
|
80 |
+
|
81 |
+
|
82 |
+
def test_normalization(tokenizer):
|
83 |
+
test_data = "Hello!!! How are you??? I'm doing great."
|
84 |
+
expected_output = "▁hel lo !!! ▁how ▁are ▁you ??? ▁i ' m ▁do ing ▁great ."
|
85 |
+
assert tokenizer.tokenize(test_data) == expected_output
|
86 |
+
|
87 |
+
|
88 |
+
def test_descape(tokenizer):
|
89 |
+
test_data = "I <3 Apple & Carrots!"
|
90 |
+
expected_output = "▁i ▁<3 ▁app le ▁& ▁car ro ts !"
|
91 |
+
tokenizer.descape = True
|
92 |
+
assert tokenizer.tokenize(test_data) == expected_output
|
93 |
+
|
94 |
+
|
95 |
+
def test_lowercase(tokenizer):
|
96 |
+
test_data = "THIS OUTPUT MUST BE UPPERCASE"
|
97 |
+
expected_output = "▁TH IS ▁ OU TP UT ▁ MU ST ▁BE ▁ UP PER CA SE"
|
98 |
+
tokenizer.lower_case = False
|
99 |
+
assert tokenizer.tokenize(test_data) == expected_output
|
100 |
+
|
101 |
+
|
102 |
+
def test_is_printable(tokenizer):
|
103 |
+
test_data = "Hello, \tWorld! ABC\x1f123"
|
104 |
+
expected_output = "▁hel lo , ▁world ! ▁ab c ▁12 3"
|
105 |
+
assert tokenizer.tokenize(test_data) == expected_output
|
106 |
+
|
107 |
+
|
108 |
+
def test_tokenize_file(tokenizer, input_text: str):
|
109 |
+
with TemporaryDirectory() as temp_dir:
|
110 |
+
input_file = os.path.join(temp_dir, "input.txt")
|
111 |
+
output_file = os.path.join(temp_dir, "output.txt")
|
112 |
+
|
113 |
+
with open(input_file, "w") as file:
|
114 |
+
file.write(input_text)
|
115 |
+
|
116 |
+
tokenizer.tokenize_file(
|
117 |
+
inp_fname=Path(input_file),
|
118 |
+
out_fname=Path(output_file),
|
119 |
+
)
|
120 |
+
|
121 |
+
with open(output_file, "r") as file:
|
122 |
+
output = file.read().strip()
|
123 |
+
|
124 |
+
expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
|
125 |
+
assert output == expected_output
|
126 |
+
|
127 |
+
|
128 |
+
def test_tokenize_file_overwrite(tokenizer, input_text: str):
|
129 |
+
with TemporaryDirectory() as temp_dir:
|
130 |
+
input_file = os.path.join(temp_dir, "input.txt")
|
131 |
+
output_file = os.path.join(temp_dir, "output.txt")
|
132 |
+
|
133 |
+
with open(input_file, "w") as file:
|
134 |
+
file.write(input_text)
|
135 |
+
|
136 |
+
with open(output_file, "w") as file:
|
137 |
+
file.write("Existing output")
|
138 |
+
|
139 |
+
# Test when over_write is False
|
140 |
+
tokenizer.over_write = False
|
141 |
+
tokenizer.tokenize_file(
|
142 |
+
inp_fname=Path(input_file),
|
143 |
+
out_fname=Path(output_file),
|
144 |
+
)
|
145 |
+
|
146 |
+
with open(output_file, "r") as file:
|
147 |
+
output = file.read().strip()
|
148 |
+
|
149 |
+
assert output == "Existing output"
|
150 |
+
|
151 |
+
# Test when over_write is True
|
152 |
+
tokenizer.over_write = True
|
153 |
+
tokenizer.tokenize_file(
|
154 |
+
inp_fname=Path(input_file),
|
155 |
+
out_fname=Path(output_file),
|
156 |
+
)
|
157 |
+
|
158 |
+
with open(output_file, "r") as file:
|
159 |
+
output = file.read().strip()
|
160 |
+
|
161 |
+
expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
|
162 |
+
assert output == expected_output
|
163 |
+
|
164 |
+
|
165 |
+
@pytest.mark.parametrize(
|
166 |
+
"laser, expected_array, lang",
|
167 |
+
[
|
168 |
+
(
|
169 |
+
"laser2",
|
170 |
+
[
|
171 |
+
1.042462512850761414e-02,
|
172 |
+
6.325428839772939682e-03,
|
173 |
+
-3.032622225873637944e-05,
|
174 |
+
9.033476933836936951e-03,
|
175 |
+
2.937933895736932755e-04,
|
176 |
+
4.489220678806304932e-03,
|
177 |
+
2.334521152079105377e-03,
|
178 |
+
-9.427300537936389446e-04,
|
179 |
+
-1.571535394759848714e-04,
|
180 |
+
2.095808042213320732e-03,
|
181 |
+
],
|
182 |
+
None,
|
183 |
+
),
|
184 |
+
(
|
185 |
+
"laser3",
|
186 |
+
[
|
187 |
+
3.038274645805358887e-01,
|
188 |
+
4.151830971240997314e-01,
|
189 |
+
-2.458990514278411865e-01,
|
190 |
+
3.153458833694458008e-01,
|
191 |
+
-5.153598189353942871e-01,
|
192 |
+
-6.035178527235984802e-02,
|
193 |
+
2.210616767406463623e-01,
|
194 |
+
-2.701394855976104736e-01,
|
195 |
+
-4.902199506759643555e-01,
|
196 |
+
-3.126966953277587891e-02,
|
197 |
+
],
|
198 |
+
"zul_Latn",
|
199 |
+
),
|
200 |
+
],
|
201 |
+
)
|
202 |
+
def test_sentence_encoder(
|
203 |
+
tmp_path: Path,
|
204 |
+
tokenizer,
|
205 |
+
laser: str,
|
206 |
+
expected_array: List,
|
207 |
+
lang: str,
|
208 |
+
input_text: str,
|
209 |
+
):
|
210 |
+
sentence_encoder = initialize_encoder(model_dir=tmp_path, laser=laser, lang=lang)
|
211 |
+
tokenized_text = tokenizer.tokenize(input_text)
|
212 |
+
sentence_embedding = sentence_encoder.encode_sentences([tokenized_text])
|
213 |
+
|
214 |
+
assert isinstance(sentence_embedding, np.ndarray)
|
215 |
+
assert sentence_embedding.shape == (1, 1024)
|
216 |
+
assert np.allclose(expected_array, sentence_embedding[:, :10], atol=1e-3)
|
217 |
+
|
218 |
+
|
219 |
+
def test_laser_encoder_pipeline(tmp_path: Path, test_readme_params: dict):
|
220 |
+
lang = test_readme_params["lang"]
|
221 |
+
input_sentences = test_readme_params["input_sentences"]
|
222 |
+
expected_embedding_shape = test_readme_params["expected_embedding_shape"]
|
223 |
+
expected_array = test_readme_params["expected_array"]
|
224 |
+
|
225 |
+
encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
|
226 |
+
embeddings = encoder.encode_sentences(input_sentences)
|
227 |
+
|
228 |
+
assert isinstance(embeddings, np.ndarray)
|
229 |
+
assert embeddings.shape == expected_embedding_shape
|
230 |
+
assert np.allclose(expected_array, embeddings[:, :10], atol=1e-3)
|
231 |
+
|
232 |
+
|
233 |
+
def test_separate_initialization_and_encoding(
|
234 |
+
tmp_path, tokenizer, test_readme_params: dict
|
235 |
+
):
|
236 |
+
lang = test_readme_params["lang"]
|
237 |
+
input_sentences = test_readme_params["input_sentences"]
|
238 |
+
expected_embedding_shape = test_readme_params["expected_embedding_shape"]
|
239 |
+
expected_array = test_readme_params["expected_array"]
|
240 |
+
|
241 |
+
tokenized_sentence = tokenizer.tokenize(input_sentences[0])
|
242 |
+
sentence_encoder = initialize_encoder(model_dir=tmp_path, lang=lang)
|
243 |
+
|
244 |
+
# Encode tokenized sentences into embeddings
|
245 |
+
embeddings = sentence_encoder.encode_sentences([tokenized_sentence])
|
246 |
+
|
247 |
+
assert isinstance(embeddings, np.ndarray)
|
248 |
+
assert embeddings.shape == expected_embedding_shape
|
249 |
+
assert np.allclose(expected_array, embeddings[:, :10], atol=1e-3)
|
250 |
+
|
251 |
+
|
252 |
+
def test_encoder_normalization(tmp_path: Path, test_readme_params: dict):
|
253 |
+
lang = test_readme_params["lang"]
|
254 |
+
input_sentences = test_readme_params["input_sentences"]
|
255 |
+
|
256 |
+
encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
|
257 |
+
normalized_embeddings = encoder.encode_sentences(
|
258 |
+
input_sentences, normalize_embeddings=True
|
259 |
+
)
|
260 |
+
norm = np.linalg.norm(normalized_embeddings[0])
|
261 |
+
|
262 |
+
assert np.allclose(norm, 1.0, atol=1e-3)
|
263 |
+
|
264 |
+
|
265 |
+
def test_encoder_default_behaviour(tmp_path: Path, test_readme_params: dict):
|
266 |
+
lang = test_readme_params["lang"]
|
267 |
+
input_sentences = test_readme_params["input_sentences"]
|
268 |
+
|
269 |
+
encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
|
270 |
+
default_embeddings = encoder.encode_sentences(input_sentences)
|
271 |
+
non_normalized_embeddings = encoder.encode_sentences(
|
272 |
+
input_sentences, normalize_embeddings=False
|
273 |
+
)
|
274 |
+
|
275 |
+
assert np.allclose(default_embeddings, non_normalized_embeddings)
|
276 |
+
|
277 |
+
|
278 |
+
def test_encoder_non_normalization(tmp_path: Path, test_readme_params: dict):
|
279 |
+
lang = test_readme_params["lang"]
|
280 |
+
input_sentences = test_readme_params["input_sentences"]
|
281 |
+
|
282 |
+
encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
|
283 |
+
non_normalized_embeddings = encoder.encode_sentences(
|
284 |
+
input_sentences, normalize_embeddings=False
|
285 |
+
)
|
286 |
+
norm = np.linalg.norm(non_normalized_embeddings[0])
|
287 |
+
|
288 |
+
assert not np.isclose(norm, 1)
|
289 |
+
|
290 |
+
|
291 |
+
def test_optional_lang_with_laser2(tmp_path: Path):
|
292 |
+
with pytest.warns(
|
293 |
+
UserWarning,
|
294 |
+
match="The 'lang' parameter is optional when using 'laser2'. It will be ignored.",
|
295 |
+
):
|
296 |
+
encoder = LaserEncoderPipeline(lang="en", laser="laser2", model_dir=tmp_path)
|
297 |
+
|
298 |
+
|
299 |
+
def test_required_lang_with_laser3(tmp_path: Path):
|
300 |
+
with pytest.raises(
|
301 |
+
ValueError, match="For 'laser3', the 'lang' parameter is required."
|
302 |
+
):
|
303 |
+
encoder = LaserEncoderPipeline(laser="laser3", model_dir=tmp_path)
|
304 |
+
|
305 |
+
|
306 |
+
def test_missing_lang_and_laser(tmp_path: Path):
|
307 |
+
with pytest.raises(
|
308 |
+
ValueError, match="Either 'laser' or 'lang' should be provided."
|
309 |
+
):
|
310 |
+
encoder = LaserEncoderPipeline(model_dir=tmp_path)
|
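For reference, a minimal usage sketch of the pipeline exercised by the tests above; the language and cache directory are illustrative placeholders, and model files are downloaded on first use:

# Minimal sketch of the LaserEncoderPipeline API covered by the tests above.
# Assumes the laser_encoders package is installed; lang/model_dir are placeholders.
import numpy as np
from laser_encoders import LaserEncoderPipeline

encoder = LaserEncoderPipeline(lang="english", model_dir="./laser_models")
embeddings = encoder.encode_sentences(
    ["sample sentence one", "sample sentence two"], normalize_embeddings=True
)
assert isinstance(embeddings, np.ndarray)
assert embeddings.shape == (2, 1024)   # 1024-dimensional LASER embeddings
print(np.linalg.norm(embeddings[0]))   # ~1.0 because of normalization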
laser/laser_encoders/test_models_initialization.py
ADDED
@@ -0,0 +1,57 @@
1 |
+
import os
|
2 |
+
import tempfile
|
3 |
+
|
4 |
+
import pytest
|
5 |
+
|
6 |
+
from laser_encoders.download_models import LaserModelDownloader
|
7 |
+
from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE
|
8 |
+
from laser_encoders.laser_tokenizer import initialize_tokenizer
|
9 |
+
from laser_encoders.models import initialize_encoder
|
10 |
+
|
11 |
+
|
12 |
+
def test_validate_achnese_models_and_tokenize_laser3(lang="acehnese"):
|
13 |
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
14 |
+
print(f"Created temporary directory for {lang}", tmp_dir)
|
15 |
+
|
16 |
+
downloader = LaserModelDownloader(model_dir=tmp_dir)
|
17 |
+
downloader.download_laser3(lang)
|
18 |
+
encoder = initialize_encoder(lang, model_dir=tmp_dir)
|
19 |
+
tokenizer = initialize_tokenizer(lang, model_dir=tmp_dir)
|
20 |
+
|
21 |
+
# Test tokenization with a sample sentence
|
22 |
+
tokenized = tokenizer.tokenize("This is a sample sentence.")
|
23 |
+
|
24 |
+
print(f"{lang} model validated successfully")
|
25 |
+
|
26 |
+
|
27 |
+
def test_validate_english_models_and_tokenize_laser2(lang="english"):
|
28 |
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
29 |
+
print(f"Created temporary directory for {lang}", tmp_dir)
|
30 |
+
|
31 |
+
downloader = LaserModelDownloader(model_dir=tmp_dir)
|
32 |
+
downloader.download_laser2()
|
33 |
+
|
34 |
+
encoder = initialize_encoder(lang, model_dir=tmp_dir)
|
35 |
+
tokenizer = initialize_tokenizer(lang, model_dir=tmp_dir)
|
36 |
+
|
37 |
+
# Test tokenization with a sample sentence
|
38 |
+
tokenized = tokenizer.tokenize("This is a sample sentence.")
|
39 |
+
|
40 |
+
print(f"{lang} model validated successfully")
|
41 |
+
|
42 |
+
|
43 |
+
def test_validate_kashmiri_models_and_tokenize_laser3(lang="kas"):
|
44 |
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
45 |
+
print(f"Created temporary directory for {lang}", tmp_dir)
|
46 |
+
|
47 |
+
downloader = LaserModelDownloader(model_dir=tmp_dir)
|
48 |
+
with pytest.raises(ValueError):
|
49 |
+
downloader.download_laser3(lang)
|
50 |
+
|
51 |
+
encoder = initialize_encoder(lang, model_dir=tmp_dir)
|
52 |
+
tokenizer = initialize_tokenizer(lang, model_dir=tmp_dir)
|
53 |
+
|
54 |
+
# Test tokenization with a sample sentence
|
55 |
+
tokenized = tokenizer.tokenize("This is a sample sentence.")
|
56 |
+
|
57 |
+
print(f"{lang} model validated successfully")
|
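The same download-then-initialize flow used in these tests, sketched outside of pytest; the cache directory is a hypothetical placeholder and the language mirrors the first test above:

# Sketch of the manual download + initialization flow exercised by the tests above.
from laser_encoders.download_models import LaserModelDownloader
from laser_encoders.laser_tokenizer import initialize_tokenizer
from laser_encoders.models import initialize_encoder

model_dir = "./laser_models"   # hypothetical cache directory
lang = "acehnese"

downloader = LaserModelDownloader(model_dir=model_dir)
downloader.download_laser3(lang)   # fetch the LASER3 encoder for this language

encoder = initialize_encoder(lang, model_dir=model_dir)
tokenizer = initialize_tokenizer(lang, model_dir=model_dir)

tokens = tokenizer.tokenize("This is a sample sentence.")
embedding = encoder.encode_sentences([tokens])   # numpy array of shape (1, 1024)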
laser/laser_encoders/validate_models.py
ADDED
@@ -0,0 +1,108 @@
1 |
+
import os
|
2 |
+
import tempfile
|
3 |
+
|
4 |
+
import pytest
|
5 |
+
|
6 |
+
from laser_encoders.download_models import LaserModelDownloader
|
7 |
+
from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE
|
8 |
+
from laser_encoders.laser_tokenizer import initialize_tokenizer
|
9 |
+
from laser_encoders.models import initialize_encoder
|
10 |
+
|
11 |
+
|
12 |
+
@pytest.mark.slow
|
13 |
+
@pytest.mark.parametrize("lang", LASER3_LANGUAGE)
|
14 |
+
def test_validate_language_models_and_tokenize_laser3(lang):
|
15 |
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
16 |
+
print(f"Created temporary directory for {lang}", tmp_dir)
|
17 |
+
|
18 |
+
downloader = LaserModelDownloader(model_dir=tmp_dir)
|
19 |
+
if lang in ["kashmiri", "kas", "central kanuri", "knc"]:
|
20 |
+
with pytest.raises(ValueError) as excinfo:
|
21 |
+
downloader.download_laser3(lang)
|
22 |
+
assert "ValueError" in str(excinfo.value)
|
23 |
+
print(f"{lang} language model raised a ValueError as expected.")
|
24 |
+
else:
|
25 |
+
downloader.download_laser3(lang)
|
26 |
+
encoder = initialize_encoder(lang, model_dir=tmp_dir)
|
27 |
+
tokenizer = initialize_tokenizer(lang, model_dir=tmp_dir)
|
28 |
+
|
29 |
+
# Test tokenization with a sample sentence
|
30 |
+
tokenized = tokenizer.tokenize("This is a sample sentence.")
|
31 |
+
|
32 |
+
print(f"{lang} model validated successfully")
|
33 |
+
|
34 |
+
|
35 |
+
@pytest.mark.slow
|
36 |
+
@pytest.mark.parametrize("lang", LASER2_LANGUAGE)
|
37 |
+
def test_validate_language_models_and_tokenize_laser2(lang):
|
38 |
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
39 |
+
print(f"Created temporary directory for {lang}", tmp_dir)
|
40 |
+
|
41 |
+
downloader = LaserModelDownloader(model_dir=tmp_dir)
|
42 |
+
downloader.download_laser2()
|
43 |
+
|
44 |
+
encoder = initialize_encoder(lang, model_dir=tmp_dir)
|
45 |
+
tokenizer = initialize_tokenizer(lang, model_dir=tmp_dir)
|
46 |
+
|
47 |
+
# Test tokenization with a sample sentence
|
48 |
+
tokenized = tokenizer.tokenize("This is a sample sentence.")
|
49 |
+
|
50 |
+
print(f"{lang} model validated successfully")
|
51 |
+
|
52 |
+
|
53 |
+
class MockLaserModelDownloader(LaserModelDownloader):
|
54 |
+
def __init__(self, model_dir):
|
55 |
+
self.model_dir = model_dir
|
56 |
+
|
57 |
+
def download_laser3(self, lang):
|
58 |
+
lang = self.get_language_code(LASER3_LANGUAGE, lang)
|
59 |
+
file_path = os.path.join(self.model_dir, f"laser3-{lang}.v1.pt")
|
60 |
+
if not os.path.exists(file_path):
|
61 |
+
raise FileNotFoundError(f"Could not find {file_path}.")
|
62 |
+
|
63 |
+
def download_laser2(self):
|
64 |
+
files = ["laser2.pt", "laser2.spm", "laser2.cvocab"]
|
65 |
+
for file_name in files:
|
66 |
+
file_path = os.path.join(self.model_dir, file_name)
|
67 |
+
if not os.path.exists(file_path):
|
68 |
+
raise FileNotFoundError(f"Could not find {file_path}.")
|
69 |
+
|
70 |
+
|
71 |
+
CACHE_DIR = "/home/user/.cache/models" # Change this to the desired cache directory
|
72 |
+
|
73 |
+
# This uses the mock downloader
|
74 |
+
@pytest.mark.slow
|
75 |
+
@pytest.mark.parametrize("lang", LASER3_LANGUAGE)
|
76 |
+
def test_validate_language_models_and_tokenize_mock_laser3(lang):
|
77 |
+
downloader = MockLaserModelDownloader(model_dir=CACHE_DIR)
|
78 |
+
|
79 |
+
try:
|
80 |
+
downloader.download_laser3(lang)
|
81 |
+
except FileNotFoundError as e:
|
82 |
+
pytest.fail(str(e))
|
83 |
+
|
84 |
+
encoder = initialize_encoder(lang, model_dir=CACHE_DIR)
|
85 |
+
tokenizer = initialize_tokenizer(lang, model_dir=CACHE_DIR)
|
86 |
+
|
87 |
+
tokenized = tokenizer.tokenize("This is a sample sentence.")
|
88 |
+
|
89 |
+
print(f"{lang} model validated successfully")
|
90 |
+
|
91 |
+
|
92 |
+
# This uses the mock downloader
|
93 |
+
@pytest.mark.slow
|
94 |
+
@pytest.mark.parametrize("lang", LASER2_LANGUAGE)
|
95 |
+
def test_validate_language_models_and_tokenize_mock_laser2(lang):
|
96 |
+
downloader = MockLaserModelDownloader(model_dir=CACHE_DIR)
|
97 |
+
|
98 |
+
try:
|
99 |
+
downloader.download_laser2()
|
100 |
+
except FileNotFoundError as e:
|
101 |
+
pytest.fail(str(e))
|
102 |
+
|
103 |
+
encoder = initialize_encoder(lang, model_dir=CACHE_DIR)
|
104 |
+
tokenizer = initialize_tokenizer(lang, model_dir=CACHE_DIR)
|
105 |
+
|
106 |
+
tokenized = tokenizer.tokenize("This is a sample sentence.")
|
107 |
+
|
108 |
+
print(f"{lang} model validated successfully")
|
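These parametrized checks are marked slow; a sketch for selecting only them programmatically, assuming pytest is installed and the command is run from the repository root:

# Sketch: run only the slow, fully parametrized validation tests in this file.
import sys
import pytest

sys.exit(pytest.main(["-m", "slow", "laser_encoders/validate_models.py", "-v"]))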
laser/pyproject.toml
ADDED
@@ -0,0 +1,69 @@
1 |
+
[build-system]
|
2 |
+
requires = ["flit_core >=3.2,<4", "setuptools"]
|
3 |
+
build-backend = "flit_core.buildapi"
|
4 |
+
|
5 |
+
[project]
|
6 |
+
name = "laser_encoders"
|
7 |
+
version = "0.0.2"
|
8 |
+
authors = [{name = "Facebook AI Research"}]
|
9 |
+
description = "LASER Language-Agnostic SEntence Representations is a toolkit to calculate multilingual sentence embeddings and to use them for document classification, bitext filtering and mining"
|
10 |
+
readme = "laser_encoders/README.md"
|
11 |
+
requires-python = ">=3.8"
|
12 |
+
|
13 |
+
dependencies = [
|
14 |
+
'sacremoses==0.1.0',
|
15 |
+
'unicategories>=0.1.2',
|
16 |
+
'sentencepiece>=0.1.99',
|
17 |
+
'numpy>=1.21.3',
|
18 |
+
'torch>=1.10.0',
|
19 |
+
'fairseq>=0.12.2',
|
20 |
+
]
|
21 |
+
|
22 |
+
classifiers=[
|
23 |
+
"License :: OSI Approved :: BSD License",
|
24 |
+
"Topic :: Scientific/Engineering",
|
25 |
+
"Development Status :: 4 - Beta",
|
26 |
+
]
|
27 |
+
|
28 |
+
[project.urls]
|
29 |
+
"Homepage" = "https://github.com/facebookresearch/LASER"
|
30 |
+
"Bug Tracker" = "https://github.com/facebookresearch/LASER/issues"
|
31 |
+
|
32 |
+
[project.optional-dependencies]
|
33 |
+
dev = [
|
34 |
+
# Test
|
35 |
+
"pytest>=4.3.0",
|
36 |
+
# Format
|
37 |
+
"black==22.3.0",
|
38 |
+
"isort>=5.10.1",
|
39 |
+
# Linters
|
40 |
+
"mypy>=0.782",
|
41 |
+
"pylint>=2.8.0",
|
42 |
+
# Release
|
43 |
+
"flit>=3.5.1"
|
44 |
+
]
|
45 |
+
|
46 |
+
[tool.black]
|
47 |
+
# Black defaults are great !
|
48 |
+
|
49 |
+
[tool.isort]
|
50 |
+
profile = "black"
|
51 |
+
skip_gitignore = true
|
52 |
+
skip_glob = ["website/*", "*.pyx"]
|
53 |
+
|
54 |
+
[tool.mypy]
|
55 |
+
python_version = "3.8"
|
56 |
+
show_error_codes = true
|
57 |
+
check_untyped_defs = true
|
58 |
+
|
59 |
+
ignore_missing_imports = true
|
60 |
+
|
61 |
+
files = [
|
62 |
+
"laser_encoders/"
|
63 |
+
]
|
64 |
+
|
65 |
+
[tool.pytest.ini_options]
|
66 |
+
testpaths = ["laser_encoders"]
|
67 |
+
python_files = [
|
68 |
+
"test_*.py",
|
69 |
+
]
|
laser/remove_external_tools.sh
ADDED
@@ -0,0 +1,26 @@
1 |
+
#!/bin/bash
|
2 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the BSD-style license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
# LASER Language-Agnostic SEntence Representations
|
9 |
+
# is a toolkit to calculate multilingual sentence embeddings
|
10 |
+
# and to use them for document classification, bitext filtering
|
11 |
+
# and mining
|
12 |
+
#
|
13 |
+
#-------------------------------------------------------
|
14 |
+
#
|
15 |
+
# This bash script removes all installed third party software
|
16 |
+
#
|
17 |
+
|
18 |
+
if [ -z ${LASER+x} ] ; then
|
19 |
+
echo "Please set the environment variable 'LASER'"
|
20 |
+
exit
|
21 |
+
fi
|
22 |
+
|
23 |
+
bdir="${LASER}"
|
24 |
+
tools_ext="${bdir}/tools-external"
|
25 |
+
|
26 |
+
/bin/rm -rf ${tools_ext}
|
laser/source/embed.py
ADDED
@@ -0,0 +1,362 @@
1 |
+
#!/usr/bin/python3
|
2 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the BSD-style license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
# LASER Language-Agnostic SEntence Representations
|
9 |
+
# is a toolkit to calculate multilingual sentence embeddings
|
10 |
+
# and to use them for document classification, bitext filtering
|
11 |
+
# and mining
|
12 |
+
#
|
13 |
+
# --------------------------------------------------------
|
14 |
+
#
|
15 |
+
# Tool to embed a text file
|
16 |
+
# The functions can also be imported into other Python code
|
17 |
+
|
18 |
+
|
19 |
+
import argparse
|
20 |
+
import logging
|
21 |
+
import os
|
22 |
+
import re
|
23 |
+
import sys
|
24 |
+
import tempfile
|
25 |
+
import time
|
26 |
+
from collections import namedtuple
|
27 |
+
from pathlib import Path
|
28 |
+
from subprocess import run
|
29 |
+
from typing import Optional, Union
|
30 |
+
|
31 |
+
assert os.environ.get("LASER"), "Please set the environment variable LASER"
|
32 |
+
LASER = os.environ["LASER"]
|
33 |
+
sys.path.append(LASER)
|
34 |
+
|
35 |
+
import numpy as np
|
36 |
+
from lib.text_processing import BPEfastApply, SPMApply, Token
|
37 |
+
from laser_encoders.models import SentenceEncoder
|
38 |
+
|
39 |
+
SPACE_NORMALIZER = re.compile(r"\s+")
|
40 |
+
Batch = namedtuple("Batch", "srcs tokens lengths")
|
41 |
+
|
42 |
+
logging.basicConfig(
|
43 |
+
stream=sys.stdout,
|
44 |
+
level=logging.INFO,
|
45 |
+
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
|
46 |
+
)
|
47 |
+
logger = logging.getLogger("embed")
|
48 |
+
|
49 |
+
|
50 |
+
def buffered_read(fp, buffer_size):
|
51 |
+
buffer = []
|
52 |
+
for src_str in fp:
|
53 |
+
buffer.append(src_str.strip())
|
54 |
+
if len(buffer) >= buffer_size:
|
55 |
+
yield buffer
|
56 |
+
buffer = []
|
57 |
+
|
58 |
+
if len(buffer) > 0:
|
59 |
+
yield buffer
|
60 |
+
|
61 |
+
|
62 |
+
class HuggingFaceEncoder:
|
63 |
+
def __init__(self, encoder_name: str, verbose=False):
|
64 |
+
from sentence_transformers import SentenceTransformer
|
65 |
+
|
66 |
+
encoder = f"sentence-transformers/{encoder_name}"
|
67 |
+
if verbose:
|
68 |
+
logger.info(f"loading HuggingFace encoder: {encoder}")
|
69 |
+
self.encoder = SentenceTransformer(encoder)
|
70 |
+
|
71 |
+
def encode_sentences(self, sentences):
|
72 |
+
return self.encoder.encode(sentences)
|
73 |
+
|
74 |
+
|
75 |
+
def load_model(
|
76 |
+
encoder: str,
|
77 |
+
spm_model: str,
|
78 |
+
bpe_codes: str,
|
79 |
+
hugging_face=False,
|
80 |
+
verbose=False,
|
81 |
+
**encoder_kwargs,
|
82 |
+
) -> Union[SentenceEncoder, HuggingFaceEncoder]:
|
83 |
+
if hugging_face:
|
84 |
+
return HuggingFaceEncoder(encoder, verbose=verbose)
|
85 |
+
if spm_model:
|
86 |
+
spm_vocab = str(Path(spm_model).with_suffix(".cvocab"))
|
87 |
+
if verbose:
|
88 |
+
logger.info(f"spm_model: {spm_model}")
|
89 |
+
logger.info(f"spm_cvocab: {spm_vocab}")
|
90 |
+
else:
|
91 |
+
spm_vocab = None
|
92 |
+
return SentenceEncoder(
|
93 |
+
encoder, spm_vocab=spm_vocab, verbose=verbose, **encoder_kwargs
|
94 |
+
)
|
95 |
+
|
96 |
+
|
97 |
+
def EncodeLoad(args):
|
98 |
+
args.buffer_size = max(args.buffer_size, 1)
|
99 |
+
assert (
|
100 |
+
not args.max_sentences or args.max_sentences <= args.buffer_size
|
101 |
+
), "--max-sentences/--batch-size cannot be larger than --buffer-size"
|
102 |
+
|
103 |
+
print(" - loading encoder", args.encoder)
|
104 |
+
return SentenceEncoder(
|
105 |
+
args.encoder,
|
106 |
+
max_sentences=args.max_sentences,
|
107 |
+
max_tokens=args.max_tokens,
|
108 |
+
cpu=args.cpu,
|
109 |
+
verbose=args.verbose,
|
110 |
+
)
|
111 |
+
|
112 |
+
|
113 |
+
def EncodeTime(t):
|
114 |
+
t = int(time.time() - t)
|
115 |
+
if t < 1000:
|
116 |
+
return "{:d}s".format(t)
|
117 |
+
else:
|
118 |
+
return "{:d}m{:d}s".format(t // 60, t % 60)
|
119 |
+
|
120 |
+
|
121 |
+
# Encode sentences (existing file pointers)
|
122 |
+
def EncodeFilep(
|
123 |
+
encoder, inp_file, out_file, buffer_size=10000, fp16=False, verbose=False
|
124 |
+
):
|
125 |
+
n = 0
|
126 |
+
t = time.time()
|
127 |
+
for sentences in buffered_read(inp_file, buffer_size):
|
128 |
+
encoded = encoder.encode_sentences(sentences)
|
129 |
+
if fp16:
|
130 |
+
encoded = encoded.astype(np.float16)
|
131 |
+
encoded.tofile(out_file)
|
132 |
+
n += len(sentences)
|
133 |
+
if verbose and n % 10000 == 0:
|
134 |
+
logger.info("encoded {:d} sentences".format(n))
|
135 |
+
if verbose:
|
136 |
+
logger.info(f"encoded {n} sentences in {EncodeTime(t)}")
|
137 |
+
|
138 |
+
|
139 |
+
# Encode sentences (file names)
|
140 |
+
def EncodeFile(
|
141 |
+
encoder,
|
142 |
+
inp_fname,
|
143 |
+
out_fname,
|
144 |
+
buffer_size=10000,
|
145 |
+
fp16=False,
|
146 |
+
verbose=False,
|
147 |
+
over_write=False,
|
148 |
+
inp_encoding="utf-8",
|
149 |
+
):
|
150 |
+
# TODO :handle over write
|
151 |
+
if not os.path.isfile(out_fname):
|
152 |
+
if verbose:
|
153 |
+
logger.info(
|
154 |
+
"encoding {} to {}".format(
|
155 |
+
inp_fname if len(inp_fname) > 0 else "stdin",
|
156 |
+
out_fname,
|
157 |
+
)
|
158 |
+
)
|
159 |
+
fin = (
|
160 |
+
open(inp_fname, "r", encoding=inp_encoding, errors="surrogateescape")
|
161 |
+
if len(inp_fname) > 0
|
162 |
+
else sys.stdin
|
163 |
+
)
|
164 |
+
fout = open(out_fname, mode="wb")
|
165 |
+
EncodeFilep(
|
166 |
+
encoder, fin, fout, buffer_size=buffer_size, fp16=fp16, verbose=verbose
|
167 |
+
)
|
168 |
+
fin.close()
|
169 |
+
fout.close()
|
170 |
+
elif not over_write and verbose:
|
171 |
+
logger.info("encoder: {} exists already".format(os.path.basename(out_fname)))
|
172 |
+
|
173 |
+
|
174 |
+
# Load existing embeddings
|
175 |
+
def EmbedLoad(fname, dim=1024, verbose=False, fp16=False):
|
176 |
+
x = np.fromfile(fname, dtype=(np.float16 if fp16 else np.float32), count=-1)
|
177 |
+
x.resize(x.shape[0] // dim, dim)
|
178 |
+
if verbose:
|
179 |
+
print(" - Embeddings: {:s}, {:d}x{:d}".format(fname, x.shape[0], dim))
|
180 |
+
return x
|
181 |
+
|
182 |
+
|
183 |
+
# Get memory mapped embeddings
|
184 |
+
def EmbedMmap(fname, dim=1024, dtype=np.float32, verbose=False):
|
185 |
+
nbex = int(os.path.getsize(fname) / dim / np.dtype(dtype).itemsize)
|
186 |
+
E = np.memmap(fname, mode="r", dtype=dtype, shape=(nbex, dim))
|
187 |
+
if verbose:
|
188 |
+
print(" - embeddings on disk: {:s} {:d} x {:d}".format(fname, nbex, dim))
|
189 |
+
return E
|
190 |
+
|
191 |
+
|
192 |
+
def embed_sentences(
|
193 |
+
ifname: str,
|
194 |
+
output: str,
|
195 |
+
encoder: Union[SentenceEncoder, HuggingFaceEncoder] = None,
|
196 |
+
encoder_path: str = None,
|
197 |
+
hugging_face=False,
|
198 |
+
token_lang: Optional[str] = "--",
|
199 |
+
bpe_codes: Optional[str] = None,
|
200 |
+
spm_lang: Optional[str] = "en",
|
201 |
+
spm_model: Optional[str] = None,
|
202 |
+
verbose: bool = False,
|
203 |
+
buffer_size: int = 10000,
|
204 |
+
max_tokens: int = 12000,
|
205 |
+
max_sentences: Optional[int] = None,
|
206 |
+
cpu: bool = False,
|
207 |
+
fp16: bool = False,
|
208 |
+
sort_kind: str = "quicksort",
|
209 |
+
):
|
210 |
+
assert encoder or encoder_path, "Provide initialised encoder or encoder_path"
|
211 |
+
buffer_size = max(buffer_size, 1)
|
212 |
+
assert (
|
213 |
+
not max_sentences or max_sentences <= buffer_size
|
214 |
+
), "--max-sentences/--batch-size cannot be larger than --buffer-size"
|
215 |
+
|
216 |
+
assert not (bpe_codes and spm_model), "Cannot specify both spm and bpe"
|
217 |
+
|
218 |
+
if encoder_path:
|
219 |
+
encoder = load_model(
|
220 |
+
encoder_path,
|
221 |
+
spm_model,
|
222 |
+
bpe_codes,
|
223 |
+
verbose=verbose,
|
224 |
+
hugging_face=hugging_face,
|
225 |
+
max_sentences=max_sentences,
|
226 |
+
max_tokens=max_tokens,
|
227 |
+
sort_kind=sort_kind,
|
228 |
+
cpu=cpu,
|
229 |
+
)
|
230 |
+
if not ifname:
|
231 |
+
ifname = "" # default to stdin
|
232 |
+
with tempfile.TemporaryDirectory() as tmpdir:
|
233 |
+
if token_lang != "--":
|
234 |
+
tok_fname = os.path.join(tmpdir, "tok")
|
235 |
+
Token(
|
236 |
+
ifname,
|
237 |
+
tok_fname,
|
238 |
+
lang=token_lang,
|
239 |
+
romanize=True if token_lang == "el" else False,
|
240 |
+
lower_case=True,
|
241 |
+
gzip=False,
|
242 |
+
verbose=verbose,
|
243 |
+
over_write=False,
|
244 |
+
)
|
245 |
+
ifname = tok_fname
|
246 |
+
|
247 |
+
if bpe_codes:
|
248 |
+
if ifname == "": # stdin
|
249 |
+
ifname = os.path.join(tmpdir, "no_tok")
|
250 |
+
run(f"cat > {ifname}", shell=True)
|
251 |
+
bpe_fname = os.path.join(tmpdir, "bpe")
|
252 |
+
BPEfastApply(
|
253 |
+
ifname, bpe_fname, bpe_codes, verbose=verbose, over_write=False
|
254 |
+
)
|
255 |
+
ifname = bpe_fname
|
256 |
+
|
257 |
+
if spm_model:
|
258 |
+
spm_fname = os.path.join(tmpdir, "spm")
|
259 |
+
SPMApply(
|
260 |
+
ifname,
|
261 |
+
spm_fname,
|
262 |
+
spm_model,
|
263 |
+
lang=spm_lang,
|
264 |
+
lower_case=True,
|
265 |
+
verbose=verbose,
|
266 |
+
over_write=False,
|
267 |
+
)
|
268 |
+
ifname = spm_fname
|
269 |
+
|
270 |
+
EncodeFile(
|
271 |
+
encoder,
|
272 |
+
ifname,
|
273 |
+
output,
|
274 |
+
verbose=verbose,
|
275 |
+
over_write=False,
|
276 |
+
buffer_size=buffer_size,
|
277 |
+
fp16=fp16,
|
278 |
+
)
|
279 |
+
|
280 |
+
|
281 |
+
if __name__ == "__main__":
|
282 |
+
parser = argparse.ArgumentParser(description="LASER: Embed sentences")
|
283 |
+
parser.add_argument(
|
284 |
+
"-i",
|
285 |
+
"--input",
|
286 |
+
type=str,
|
287 |
+
default=None,
|
288 |
+
help="Input text file",
|
289 |
+
)
|
290 |
+
parser.add_argument("--encoder", type=str, required=True, help="encoder to be used")
|
291 |
+
parser.add_argument(
|
292 |
+
"--token-lang",
|
293 |
+
type=str,
|
294 |
+
default="--",
|
295 |
+
help="Perform tokenization with given language ('--' for no tokenization)",
|
296 |
+
)
|
297 |
+
parser.add_argument(
|
298 |
+
"--bpe-codes", type=str, default=None, help="Apply BPE using specified codes"
|
299 |
+
)
|
300 |
+
parser.add_argument(
|
301 |
+
"--spm-lang", type=str, default="en", help="Apply SPM using specified language"
|
302 |
+
)
|
303 |
+
parser.add_argument(
|
304 |
+
"--spm-model", type=str, default=None, help="Apply SPM using specified model"
|
305 |
+
)
|
306 |
+
parser.add_argument("-v", "--verbose", action="store_true", help="Detailed output")
|
307 |
+
|
308 |
+
parser.add_argument(
|
309 |
+
"-o", "--output", required=True, help="Output sentence embeddings"
|
310 |
+
)
|
311 |
+
parser.add_argument(
|
312 |
+
"--buffer-size", type=int, default=10000, help="Buffer size (sentences)"
|
313 |
+
)
|
314 |
+
parser.add_argument(
|
315 |
+
"--max-tokens",
|
316 |
+
type=int,
|
317 |
+
default=12000,
|
318 |
+
help="Maximum number of tokens to process in a batch",
|
319 |
+
)
|
320 |
+
parser.add_argument(
|
321 |
+
"--max-sentences",
|
322 |
+
type=int,
|
323 |
+
default=None,
|
324 |
+
help="Maximum number of sentences to process in a batch",
|
325 |
+
)
|
326 |
+
parser.add_argument(
|
327 |
+
"--fp16",
|
328 |
+
action="store_true",
|
329 |
+
help="Store embedding matrices in fp16 instead of fp32",
|
330 |
+
)
|
331 |
+
parser.add_argument("--cpu", action="store_true", help="Use CPU instead of GPU")
|
332 |
+
parser.add_argument(
|
333 |
+
"--sort-kind",
|
334 |
+
type=str,
|
335 |
+
default="quicksort",
|
336 |
+
choices=["quicksort", "mergesort"],
|
337 |
+
help="Algorithm used to sort batch by length",
|
338 |
+
)
|
339 |
+
parser.add_argument(
|
340 |
+
"--use-hugging-face",
|
341 |
+
action="store_true",
|
342 |
+
help="Use a HuggingFace sentence transformer",
|
343 |
+
)
|
344 |
+
|
345 |
+
args = parser.parse_args()
|
346 |
+
embed_sentences(
|
347 |
+
ifname=args.input,
|
348 |
+
encoder_path=args.encoder,
|
349 |
+
token_lang=args.token_lang,
|
350 |
+
bpe_codes=args.bpe_codes,
|
351 |
+
spm_lang=args.spm_lang,
|
352 |
+
hugging_face=args.use_hugging_face,
|
353 |
+
spm_model=args.spm_model,
|
354 |
+
verbose=args.verbose,
|
355 |
+
output=args.output,
|
356 |
+
buffer_size=args.buffer_size,
|
357 |
+
max_tokens=args.max_tokens,
|
358 |
+
max_sentences=args.max_sentences,
|
359 |
+
cpu=args.cpu,
|
360 |
+
fp16=args.fp16,
|
361 |
+
sort_kind=args.sort_kind,
|
362 |
+
)
|
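A sketch of calling embed_sentences from Python instead of the command-line entry point; the input/output paths and model locations are illustrative, laser/source is assumed to be on PYTHONPATH, and LASER must point at the checkout before the import:

# Sketch: programmatic use of embed_sentences (paths are illustrative).
import os
os.environ.setdefault("LASER", "/path/to/LASER")   # hypothetical checkout location

from embed import embed_sentences

embed_sentences(
    ifname="sentences.txt",   # one sentence per line
    output="sentences.emb",   # raw float32 output, 1024 floats per sentence
    encoder_path=os.path.join(os.environ["LASER"], "laser2.pt"),
    spm_model=os.path.join(os.environ["LASER"], "laser2.spm"),
    verbose=True,
)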
laser/source/eval.py
ADDED
@@ -0,0 +1,381 @@
1 |
+
#!/usr/bin/python3
|
2 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the BSD-style license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
# LASER Language-Agnostic SEntence Representations
|
9 |
+
# is a toolkit to calculate multilingual sentence embeddings
|
10 |
+
# and to use them for document classification, bitext filtering
|
11 |
+
# and mining
|
12 |
+
#
|
13 |
+
# --------------------------------------------------------
|
14 |
+
#
|
15 |
+
# Tool to calculate multilingual similarity error rate
|
16 |
+
# on various predefined test sets
|
17 |
+
|
18 |
+
|
19 |
+
import os
|
20 |
+
import argparse
|
21 |
+
import pandas
|
22 |
+
import tempfile
|
23 |
+
import numpy as np
|
24 |
+
from pathlib import Path
|
25 |
+
import itertools
|
26 |
+
import logging
|
27 |
+
import sys
|
28 |
+
from typing import List, Tuple, Dict
|
29 |
+
from tabulate import tabulate
|
30 |
+
from collections import defaultdict
|
31 |
+
from xsim import xSIM
|
32 |
+
from embed import embed_sentences, load_model
|
33 |
+
|
34 |
+
logging.basicConfig(
|
35 |
+
stream=sys.stdout,
|
36 |
+
level=logging.INFO,
|
37 |
+
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
|
38 |
+
)
|
39 |
+
logger = logging.getLogger("eval")
|
40 |
+
|
41 |
+
|
42 |
+
class Eval:
|
43 |
+
def __init__(self, args):
|
44 |
+
self.base_dir = args.base_dir
|
45 |
+
self.corpus = args.corpus
|
46 |
+
self.split = args.corpus_part
|
47 |
+
self.min_sents = args.min_sents
|
48 |
+
self.index_comparison = args.index_comparison
|
49 |
+
self.emb_dimension = args.embedding_dimension
|
50 |
+
self.encoder_args = {
|
51 |
+
k: v
|
52 |
+
for k, v in args._get_kwargs()
|
53 |
+
if k in ["max_sentences", "max_tokens", "cpu", "sort_kind", "verbose"]
|
54 |
+
}
|
55 |
+
self.src_bpe_codes = args.src_bpe_codes
|
56 |
+
self.tgt_bpe_codes = args.tgt_bpe_codes
|
57 |
+
self.src_spm_model = args.src_spm_model
|
58 |
+
self.tgt_spm_model = args.tgt_spm_model
|
59 |
+
logger.info("loading src encoder")
|
60 |
+
self.src_encoder = load_model(
|
61 |
+
args.src_encoder,
|
62 |
+
self.src_spm_model,
|
63 |
+
self.src_bpe_codes,
|
64 |
+
hugging_face=args.use_hugging_face,
|
65 |
+
**self.encoder_args,
|
66 |
+
)
|
67 |
+
if args.tgt_encoder:
|
68 |
+
logger.info("loading tgt encoder")
|
69 |
+
self.tgt_encoder = load_model(
|
70 |
+
args.tgt_encoder,
|
71 |
+
self.tgt_spm_model,
|
72 |
+
self.tgt_bpe_codes,
|
73 |
+
hugging_face=args.use_hugging_face,
|
74 |
+
**self.encoder_args,
|
75 |
+
)
|
76 |
+
else:
|
77 |
+
logger.info("encoding tgt using src encoder")
|
78 |
+
self.tgt_encoder = self.src_encoder
|
79 |
+
self.tgt_bpe_codes = self.src_bpe_codes
|
80 |
+
self.tgt_spm_model = self.src_spm_model
|
81 |
+
self.nway = args.nway
|
82 |
+
self.buffer_size = args.buffer_size
|
83 |
+
self.fp16 = args.fp16
|
84 |
+
self.margin = args.margin
|
85 |
+
|
86 |
+
def _embed(
|
87 |
+
self, tmpdir, langs, encoder, spm_model, bpe_codes, tgt_aug_langs=[]
|
88 |
+
) -> List[List[str]]:
|
89 |
+
emb_data = []
|
90 |
+
for lang in langs:
|
91 |
+
augjson = None
|
92 |
+
fname = f"{lang}.{self.split}"
|
93 |
+
infile = self.base_dir / self.corpus / self.split / fname
|
94 |
+
assert infile.exists(), f"{infile} does not exist"
|
95 |
+
outfile = tmpdir / fname
|
96 |
+
if lang in tgt_aug_langs:
|
97 |
+
fname = f"{lang}_augmented.{self.split}"
|
98 |
+
fjname = f"{lang}_errtype.{self.split}.json"
|
99 |
+
augment_dir = self.base_dir / self.corpus / (self.split + "_augmented")
|
100 |
+
augjson = augment_dir / fjname
|
101 |
+
auginfile = augment_dir / fname
|
102 |
+
assert augjson.exists(), f"{augjson} does not exist"
|
103 |
+
assert auginfile.exists(), f"{auginfile} does not exist"
|
104 |
+
combined_infile = tmpdir / f"combined_{lang}"
|
105 |
+
with open(combined_infile, "w") as newfile:
|
106 |
+
for f in [infile, auginfile]:
|
107 |
+
with open(f) as fin:
|
108 |
+
newfile.write(fin.read())
|
109 |
+
infile = combined_infile
|
110 |
+
embed_sentences(
|
111 |
+
str(infile),
|
112 |
+
str(outfile),
|
113 |
+
encoder=encoder,
|
114 |
+
spm_model=spm_model,
|
115 |
+
bpe_codes=bpe_codes,
|
116 |
+
token_lang=lang if bpe_codes else "--",
|
117 |
+
buffer_size=self.buffer_size,
|
118 |
+
fp16=self.fp16,
|
119 |
+
**self.encoder_args,
|
120 |
+
)
|
121 |
+
assert (
|
122 |
+
os.path.isfile(outfile) and os.path.getsize(outfile) > 0
|
123 |
+
), f"Error encoding {infile}"
|
124 |
+
emb_data.append([lang, infile, outfile, augjson])
|
125 |
+
return emb_data
|
126 |
+
|
127 |
+
def _xsim(
|
128 |
+
self, src_emb, src_lang, tgt_emb, tgt_lang, tgt_txt, augjson=None
|
129 |
+
) -> Tuple[int, int, Dict[str, int]]:
|
130 |
+
return xSIM(
|
131 |
+
src_emb,
|
132 |
+
tgt_emb,
|
133 |
+
margin=self.margin,
|
134 |
+
dim=self.emb_dimension,
|
135 |
+
fp16=self.fp16,
|
136 |
+
eval_text=tgt_txt if not self.index_comparison else None,
|
137 |
+
augmented_json=augjson,
|
138 |
+
)
|
139 |
+
|
140 |
+
def calc_xsim(
|
141 |
+
self, embdir, src_langs, tgt_langs, tgt_aug_langs, err_sum=0, totl_nbex=0
|
142 |
+
) -> None:
|
143 |
+
outputs = []
|
144 |
+
src_emb_data = self._embed(
|
145 |
+
embdir,
|
146 |
+
src_langs,
|
147 |
+
self.src_encoder,
|
148 |
+
self.src_spm_model,
|
149 |
+
self.src_bpe_codes,
|
150 |
+
)
|
151 |
+
tgt_emb_data = self._embed(
|
152 |
+
embdir,
|
153 |
+
tgt_langs,
|
154 |
+
self.tgt_encoder,
|
155 |
+
self.tgt_spm_model,
|
156 |
+
self.tgt_bpe_codes,
|
157 |
+
tgt_aug_langs,
|
158 |
+
)
|
159 |
+
aug_df = defaultdict(lambda: defaultdict())
|
160 |
+
combs = list(itertools.product(src_emb_data, tgt_emb_data))
|
161 |
+
for (src_lang, _, src_emb, _), (tgt_lang, tgt_txt, tgt_emb, augjson) in combs:
|
162 |
+
if src_lang == tgt_lang:
|
163 |
+
continue
|
164 |
+
err, nbex, aug_report = self._xsim(
|
165 |
+
src_emb, src_lang, tgt_emb, tgt_lang, tgt_txt, augjson
|
166 |
+
)
|
167 |
+
result = round(100 * err / nbex, 2)
|
168 |
+
if tgt_lang in tgt_aug_langs:
|
169 |
+
aug_df[tgt_lang][src_lang] = aug_report
|
170 |
+
if nbex < self.min_sents:
|
171 |
+
result = "skipped"
|
172 |
+
else:
|
173 |
+
err_sum += err
|
174 |
+
totl_nbex += nbex
|
175 |
+
outputs.append(
|
176 |
+
[self.corpus, f"{src_lang}-{tgt_lang}", f"{result}", f"{nbex}"]
|
177 |
+
)
|
178 |
+
outputs.append(
|
179 |
+
[
|
180 |
+
self.corpus,
|
181 |
+
"average",
|
182 |
+
f"{round(100 * err_sum / totl_nbex, 2)}",
|
183 |
+
f"{len(combs)}",
|
184 |
+
]
|
185 |
+
)
|
186 |
+
print(
|
187 |
+
tabulate(
|
188 |
+
outputs,
|
189 |
+
tablefmt="psql",
|
190 |
+
headers=[
|
191 |
+
"dataset",
|
192 |
+
"src-tgt",
|
193 |
+
"xsim" + ("(++)" if tgt_aug_langs else ""),
|
194 |
+
"nbex",
|
195 |
+
],
|
196 |
+
)
|
197 |
+
)
|
198 |
+
for tgt_aug_lang in tgt_aug_langs:
|
199 |
+
df = pandas.DataFrame.from_dict(aug_df[tgt_aug_lang]).fillna(0).T
|
200 |
+
print(
|
201 |
+
f"\nAbsolute error under augmented transformations for: {tgt_aug_lang}"
|
202 |
+
)
|
203 |
+
print(f"{tabulate(df, df.columns, floatfmt='.2f', tablefmt='grid')}")
|
204 |
+
|
205 |
+
def calc_xsim_nway(self, embdir, langs) -> None:
|
206 |
+
err_matrix = np.zeros((len(langs), len(langs)))
|
207 |
+
emb_data = self._embed(
|
208 |
+
embdir,
|
209 |
+
langs,
|
210 |
+
self.src_encoder,
|
211 |
+
self.src_spm_model,
|
212 |
+
self.src_bpe_codes,
|
213 |
+
)
|
214 |
+
for i1, (src_lang, _, src_emb, _) in enumerate(emb_data):
|
215 |
+
for i2, (tgt_lang, tgt_txt, tgt_emb, _) in enumerate(emb_data):
|
216 |
+
if src_lang == tgt_lang:
|
217 |
+
err_matrix[i1, i2] = 0
|
218 |
+
else:
|
219 |
+
err, nbex, _ = self._xsim(
|
220 |
+
src_emb, src_lang, tgt_emb, tgt_lang, tgt_txt
|
221 |
+
)
|
222 |
+
err_matrix[i1, i2] = 100 * err / nbex
|
223 |
+
df = pandas.DataFrame(err_matrix, columns=langs, index=langs)
|
224 |
+
df.loc["avg"] = df.sum() / float(df.shape[0] - 1) # exclude diagonal in average
|
225 |
+
print(f"\n{tabulate(df, langs, floatfmt='.2f', tablefmt='grid')}\n\n")
|
226 |
+
print(f"Global average: {df.loc['avg'].mean():.2f}")
|
227 |
+
|
228 |
+
|
229 |
+
def run_eval(args) -> None:
|
230 |
+
evaluation = Eval(args)
|
231 |
+
tmp_dir = None
|
232 |
+
if args.embed_dir:
|
233 |
+
os.makedirs(args.embed_dir, exist_ok=True)
|
234 |
+
embed_dir = args.embed_dir
|
235 |
+
else:
|
236 |
+
tmp_dir = tempfile.TemporaryDirectory()
|
237 |
+
embed_dir = Path(tmp_dir.name)
|
238 |
+
src_langs = sorted(args.src_langs.split(","))
|
239 |
+
tgt_aug_langs = sorted(args.tgt_aug_langs.split(",")) if args.tgt_aug_langs else []
|
240 |
+
if evaluation.nway:
|
241 |
+
evaluation.calc_xsim_nway(embed_dir, src_langs)
|
242 |
+
else:
|
243 |
+
assert (
|
244 |
+
args.tgt_langs
|
245 |
+
), "Please provide tgt langs when not performing n-way comparison"
|
246 |
+
tgt_langs = sorted(args.tgt_langs.split(","))
|
247 |
+
evaluation.calc_xsim(embed_dir, src_langs, tgt_langs, tgt_aug_langs)
|
248 |
+
if tmp_dir:
|
249 |
+
tmp_dir.cleanup() # remove temporary directory
|
250 |
+
|
251 |
+
|
252 |
+
if __name__ == "__main__":
|
253 |
+
parser = argparse.ArgumentParser(
|
254 |
+
description="LASER: multilingual similarity error evaluation"
|
255 |
+
)
|
256 |
+
parser.add_argument(
|
257 |
+
"--base-dir",
|
258 |
+
type=Path,
|
259 |
+
default=None,
|
260 |
+
help="Base directory for evaluation files",
|
261 |
+
required=True,
|
262 |
+
)
|
263 |
+
parser.add_argument(
|
264 |
+
"--corpus",
|
265 |
+
type=str,
|
266 |
+
default=None,
|
267 |
+
help="Name of evaluation corpus",
|
268 |
+
required=True,
|
269 |
+
)
|
270 |
+
parser.add_argument(
|
271 |
+
"--corpus-part",
|
272 |
+
type=str,
|
273 |
+
default=None,
|
274 |
+
help="Specify split of the corpus to use e.g., dev",
|
275 |
+
required=True,
|
276 |
+
)
|
277 |
+
parser.add_argument(
|
278 |
+
"--margin",
|
279 |
+
type=str,
|
280 |
+
default=None,
|
281 |
+
help="Margin for xSIM calculation. See: https://aclanthology.org/P19-1309",
|
282 |
+
)
|
283 |
+
parser.add_argument(
|
284 |
+
"--min-sents",
|
285 |
+
type=int,
|
286 |
+
default=100,
|
287 |
+
help="Only use test sets which have at least N sentences",
|
288 |
+
)
|
289 |
+
parser.add_argument(
|
290 |
+
"--nway", action="store_true", help="Test N-way for corpora which support it"
|
291 |
+
)
|
292 |
+
parser.add_argument(
|
293 |
+
"--embed-dir",
|
294 |
+
type=Path,
|
295 |
+
default=None,
|
296 |
+
help="Store/load embeddings from specified directory (default temporary)",
|
297 |
+
)
|
298 |
+
parser.add_argument(
|
299 |
+
"--index-comparison",
|
300 |
+
action="store_true",
|
301 |
+
help="Use index comparison instead of texts (not recommended when test data contains duplicates)",
|
302 |
+
)
|
303 |
+
parser.add_argument("--src-spm-model", type=str, default=None)
|
304 |
+
parser.add_argument("--tgt-spm-model", type=str, default=None)
|
305 |
+
parser.add_argument(
|
306 |
+
"--src-bpe-codes",
|
307 |
+
type=str,
|
308 |
+
default=None,
|
309 |
+
help="Path to bpe codes for src model",
|
310 |
+
)
|
311 |
+
parser.add_argument(
|
312 |
+
"--tgt-bpe-codes",
|
313 |
+
type=str,
|
314 |
+
default=None,
|
315 |
+
help="Path to bpe codes for tgt model",
|
316 |
+
)
|
317 |
+
parser.add_argument("--src-encoder", type=str, default=None, required=True)
|
318 |
+
parser.add_argument("--tgt-encoder", type=str, default=None)
|
319 |
+
parser.add_argument(
|
320 |
+
"--buffer-size", type=int, default=100, help="Buffer size (sentences)"
|
321 |
+
)
|
322 |
+
parser.add_argument(
|
323 |
+
"--max-tokens",
|
324 |
+
type=int,
|
325 |
+
default=12000,
|
326 |
+
help="Maximum number of tokens to process in a batch",
|
327 |
+
)
|
328 |
+
parser.add_argument(
|
329 |
+
"--max-sentences",
|
330 |
+
type=int,
|
331 |
+
default=None,
|
332 |
+
help="Maximum number of sentences to process in a batch",
|
333 |
+
)
|
334 |
+
parser.add_argument("--cpu", action="store_true", help="Use CPU instead of GPU")
|
335 |
+
|
336 |
+
parser.add_argument(
|
337 |
+
"--src-langs",
|
338 |
+
type=str,
|
339 |
+
default=None,
|
340 |
+
help="Source-side languages for evaluation",
|
341 |
+
required=True,
|
342 |
+
)
|
343 |
+
parser.add_argument(
|
344 |
+
"--tgt-langs",
|
345 |
+
type=str,
|
346 |
+
default=None,
|
347 |
+
help="Target-side languages for evaluation",
|
348 |
+
)
|
349 |
+
parser.add_argument(
|
350 |
+
"--tgt-aug-langs",
|
351 |
+
type=str,
|
352 |
+
default=None,
|
353 |
+
help="languages with augmented data",
|
354 |
+
required=False,
|
355 |
+
)
|
356 |
+
parser.add_argument(
|
357 |
+
"--fp16",
|
358 |
+
action="store_true",
|
359 |
+
help="Store embedding matrices in fp16 instead of fp32",
|
360 |
+
)
|
361 |
+
parser.add_argument(
|
362 |
+
"--sort-kind",
|
363 |
+
type=str,
|
364 |
+
default="quicksort",
|
365 |
+
choices=["quicksort", "mergesort"],
|
366 |
+
help="Algorithm used to sort batch by length",
|
367 |
+
)
|
368 |
+
parser.add_argument(
|
369 |
+
"--use-hugging-face",
|
370 |
+
action="store_true",
|
371 |
+
help="Use a HuggingFace sentence transformer",
|
372 |
+
)
|
373 |
+
parser.add_argument(
|
374 |
+
"--embedding-dimension",
|
375 |
+
type=int,
|
376 |
+
default=1024,
|
377 |
+
help="Embedding dimension for encoders",
|
378 |
+
)
|
379 |
+
parser.add_argument("-v", "--verbose", action="store_true", help="Detailed output")
|
380 |
+
args = parser.parse_args()
|
381 |
+
run_eval(args)
|
laser/source/lib/indexing.py
ADDED
@@ -0,0 +1,258 @@
1 |
+
#!/usr/bin/python3
|
2 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the BSD-style license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
# LASER Language-Agnostic SEntence Representations
|
9 |
+
# is a toolkit to calculate multilingual sentence embeddings
|
10 |
+
# and to use them for document classification, bitext filtering
|
11 |
+
# and mining
|
12 |
+
#
|
13 |
+
# --------------------------------------------------------
|
14 |
+
#
|
15 |
+
# tools for indexing and search with FAISS
|
16 |
+
|
17 |
+
import faiss
|
18 |
+
import os.path
|
19 |
+
import sys
|
20 |
+
import numpy as np
|
21 |
+
|
22 |
+
#-------------------------------------------------------------
|
23 |
+
# Get list of fnames:
|
24 |
+
# - we loop over the list of given languages
|
25 |
+
# - for each language, we also check if there are split files .%03d
|
26 |
+
|
27 |
+
def SplitFnames(par_fname, langs):
|
28 |
+
fnames = []
|
29 |
+
for l in langs:
|
30 |
+
fname = par_fname + '.' + l
|
31 |
+
if os.path.isfile(fname):
|
32 |
+
fnames.append(fname)
|
33 |
+
for i in range(1000):
|
34 |
+
fname = par_fname + '.' + l + '.{:03d}'.format(i)
|
35 |
+
if os.path.isfile(fname):
|
36 |
+
fnames.append(fname)
|
37 |
+
if len(fnames) == 0:
|
38 |
+
print("ERROR: no embeddings found in {:s}*".format(par_fname))
|
39 |
+
sys.exit(1)
|
40 |
+
return fnames
|
41 |
+
|
42 |
+
def SplitOpen(par_fname, langs, dim, dtype, verbose=False):
|
43 |
+
M = []
|
44 |
+
nf = 0
|
45 |
+
nc = 0
|
46 |
+
print('Reading sentence embeddings')
|
47 |
+
print(' - memory mapped files {:s}'.format(par_fname))
|
48 |
+
for fname in SplitFnames(par_fname, langs):
|
49 |
+
n = int(os.path.getsize(fname) / dim / np.dtype(dtype).itemsize)
|
50 |
+
if verbose:
|
51 |
+
print(' - {:s}: {:d} x {:d}'.format(fname, n, dim))
|
52 |
+
Mi = np.memmap(fname, mode='r', dtype=dtype, shape=(n, dim))
|
53 |
+
nc += n
|
54 |
+
nf += 1
|
55 |
+
M.append(Mi)
|
56 |
+
print(' - total of {:d} files: {:d} x {:d}'.format(nf, nc, dim))
|
57 |
+
return M
|
58 |
+
|
59 |
+
def SplitAccess(M, idx):
|
60 |
+
i = idx
|
61 |
+
for Mi in M:
|
62 |
+
n = Mi.shape[0]
|
63 |
+
if i < n:
|
64 |
+
return Mi[i,:]
|
65 |
+
i -= n
|
66 |
+
print('ERROR: index {:d} is too large for the memory-mapped files'.format(idx))
|
67 |
+
sys.exit(1)
|
68 |
+
|
69 |
+
|
70 |
+
###############################################################################
|
71 |
+
# create a FAISS index on the given data
|
72 |
+
|
73 |
+
def IndexCreate(dname, idx_type,
|
74 |
+
verbose=False, normalize=True, save_index=False, dim=1024):
|
75 |
+
|
76 |
+
assert idx_type == 'FlatL2', 'only FlatL2 index is currently supported'
|
77 |
+
x = np.fromfile(dname, dtype=np.float32, count=-1)
|
78 |
+
nbex = x.shape[0] // dim
|
79 |
+
print(' - embedding: {:s} {:d} examples of dim {:d}'
|
80 |
+
.format(dname, nbex, dim))
|
81 |
+
x.resize(nbex, dim)
|
82 |
+
print(' - creating FAISS index')
|
83 |
+
idx = faiss.IndexFlatL2(dim)
|
84 |
+
if normalize:
|
85 |
+
faiss.normalize_L2(x)
|
86 |
+
idx.add(x)
|
87 |
+
if save_index:
|
88 |
+
iname = 'TODO'
|
89 |
+
print(' - saving index into ' + iname)
|
90 |
+
faiss.write_index(idx, iname)
|
91 |
+
return x, idx
|
92 |
+
|
93 |
+
|
94 |
+
###############################################################################
|
95 |
+
# search closest vector for all language pairs and calculate error rate
|
96 |
+
|
97 |
+
def IndexSearchMultiple(data, idx, langs, verbose=False, texts=None, print_errors=False):
|
98 |
+
nl = len(data)
|
99 |
+
nbex = data[0].shape[0]
|
100 |
+
err = np.zeros((nl, nl)).astype(float)
|
101 |
+
ref = np.linspace(0, nbex-1, nbex).astype(int) # [0, nbex)
|
102 |
+
if verbose:
|
103 |
+
if texts is None:
|
104 |
+
print('Calculating similarity error (indices):')
|
105 |
+
else:
|
106 |
+
print('Calculating similarity error (textual):')
|
107 |
+
for i1 in range(nl):
|
108 |
+
for i2 in range(nl):
|
109 |
+
if i1 != i2:
|
110 |
+
D, I = idx[i2].search(data[i1], 1)
|
111 |
+
if texts: # do textual comparison
|
112 |
+
e1 = 0
|
113 |
+
for p in range(I.shape[0]):
|
114 |
+
if texts[i2][p] != texts[i2][I[p,0]]:
|
115 |
+
e1 += 1
|
116 |
+
if print_errors:
|
117 |
+
print('Error {:s}\n {:s}'
|
118 |
+
.format(texts[i2][p].strip(), texts[i2][I[p,0]].strip()))
|
119 |
+
err[i1, i2] = e1 / nbex
|
120 |
+
else: # do index based comparison
|
121 |
+
err[i1, i2] \
|
122 |
+
= (nbex - np.equal(I.reshape(nbex), ref)
|
123 |
+
.astype(int).sum()) / nbex
|
124 |
+
if verbose:
|
125 |
+
print(' - similarity error {:s}/{:s}: {:5.2f}%'
|
126 |
+
.format(langs[i1], langs[i2],
|
127 |
+
100.0 * err[i1, i2]))
|
128 |
+
return err
|
129 |
+
|
130 |
+
|
131 |
+
###############################################################################
|
132 |
+
# print confusion matrix
|
133 |
+
|
134 |
+
def IndexPrintConfusionMatrix(err, langs):
|
135 |
+
nl = len(langs)
|
136 |
+
assert nl == err.shape[0], 'size of error matrix does not match'
|
137 |
+
print('Confusion matrix:')
|
138 |
+
print('{:8s}'.format('langs'), end='')
|
139 |
+
for i2 in range(nl):
|
140 |
+
print('{:8s} '.format(langs[i2]), end='')
|
141 |
+
print('{:8s}'.format('avg'))
|
142 |
+
for i1 in range(nl):
|
143 |
+
print('{:3s}'.format(langs[i1]), end='')
|
144 |
+
for i2 in range(nl):
|
145 |
+
print('{:8.2f}%'.format(100 * err[i1, i2]), end='')
|
146 |
+
print('{:8.2f}%'.format(100 * err[i1, :].sum() / (nl-1)))
|
147 |
+
|
148 |
+
print('avg', end='')
|
149 |
+
for i2 in range(nl):
|
150 |
+
print('{:8.2f}%'.format(100 * err[:, i2].sum() / (nl-1)), end='')
|
151 |
+
|
152 |
+
# global average
|
153 |
+
print('{:8.2f}%'.format(100 * err.sum() / (nl-1) / nl))
|
154 |
+
|
155 |
+
|
156 |
+
###############################################################################
|
157 |
+
# Load a FAISS index
|
158 |
+
|
159 |
+
def IndexLoad(idx_name, nprobe, gpu=False):
|
160 |
+
print('Reading FAISS index')
|
161 |
+
print(' - index: {:s}'.format(idx_name))
|
162 |
+
index = faiss.read_index(idx_name)
|
163 |
+
print(' - found {:d} sentences of dim {:d}'.format(index.ntotal, index.d))
|
164 |
+
print(' - setting nbprobe to {:d}'.format(nprobe))
|
165 |
+
if gpu:
|
166 |
+
print(' - transfer index to %d GPUs ' % faiss.get_num_gpus())
|
167 |
+
#co = faiss.GpuMultipleClonerOptions()
|
168 |
+
#co.shard = True
|
169 |
+
index = faiss.index_cpu_to_all_gpus(index) # co=co
|
170 |
+
faiss.GpuParameterSpace().set_index_parameter(index, 'nprobe', nprobe)
|
171 |
+
return index
|
172 |
+
|
173 |
+
|
174 |
+
###############################################################################
|
175 |
+
# Opens a text file with the sentences corresponding to the indices used
|
176 |
+
# by a FAISS index
|
177 |
+
# We also need the reference files with the byte offsets to the beginning
|
178 |
+
# of each sentence
|
179 |
+
# optionally: array with number of words per sentence
|
180 |
+
# All arrays are memory mapped
|
181 |
+
|
182 |
+
def IndexTextOpen(txt_fname):
|
183 |
+
print('Reading text corpus')
|
184 |
+
print(' - texts: {:s}'.format(txt_fname))
|
185 |
+
txt_mmap = np.memmap(txt_fname, mode='r', dtype=np.uint8)
|
186 |
+
fname = txt_fname.replace('.txt', '.ref.bin32')
|
187 |
+
if os.path.isfile(fname):
|
188 |
+
print(' - sentence start offsets (32 bit): {}'.format(fname))
|
189 |
+
ref_mmap = np.memmap(fname, mode='r', dtype=np.uint32)
|
190 |
+
else:
|
191 |
+
fname = txt_fname.replace('.txt', '.ref.bin64')
|
192 |
+
if os.path.isfile(fname):
|
193 |
+
print(' - sentence start offsets (64 bit): {}'.format(fname))
|
194 |
+
ref_mmap = np.memmap(fname, mode='r', dtype=np.uint64)
|
195 |
+
else:
|
196 |
+
print('ERROR: no file with sentence start offsets found')
|
197 |
+
sys.exit(1)
|
198 |
+
print(' - found {:d} sentences'.format(ref_mmap.shape[0]))
|
199 |
+
|
200 |
+
nbw_mmap = None
|
201 |
+
fname = txt_fname.replace('.txt', '.nw.bin8')
|
202 |
+
if os.path.isfile(fname):
|
203 |
+
print(' - word counts: {:s}'.format(fname))
|
204 |
+
nbw_mmap = np.memmap(fname, mode='r', dtype=np.uint8)
|
205 |
+
|
206 |
+
M = None
|
207 |
+
fname = txt_fname.replace('.txt', '.meta')
|
208 |
+
if os.path.isfile(fname):
|
209 |
+
M = []
|
210 |
+
n = 0
|
211 |
+
print(' - metafile: {:s}'.format(fname))
|
212 |
+
with open(fname, 'r') as fp:
|
213 |
+
for line in fp:
|
214 |
+
fields = line.strip().split()
|
215 |
+
if len(fields) != 2:
|
216 |
+
print('ERROR: format error in meta file')
|
217 |
+
sys.exit(1)
|
218 |
+
n += int(fields[1])
|
219 |
+
M.append({'lang': fields[0], 'n': n})
|
220 |
+
print(' - found {:d} languages:'.format(len(M)), end='')
|
221 |
+
for L in M:
|
222 |
+
print(' {:s}'.format(L['lang']), end='')
|
223 |
+
print('')
|
224 |
+
|
225 |
+
return txt_mmap, ref_mmap, nbw_mmap, M
|
226 |
+
|
227 |
+
|
228 |
+
###############################################################################
|
229 |
+
# Return the text for the given index
|
230 |
+
|
231 |
+
def IndexTextQuery(txt_mmap, ref_mmap, idx):
|
232 |
+
p = int(ref_mmap[idx]) # get starting byte position
|
233 |
+
i = 0
|
234 |
+
dim = 10000 # max sentence length in bytes
|
235 |
+
b = bytearray(dim)
|
236 |
+
# find EOL
|
237 |
+
while txt_mmap[p+i] != 10 and i < dim:
|
238 |
+
b[i] = txt_mmap[p+i]
|
239 |
+
i += 1
|
240 |
+
|
241 |
+
return b[0:i].decode('utf-8')
|
242 |
+
|
243 |
+
|
244 |
+
###############################################################################
|
245 |
+
# Search the [k] nearest vectors of [x] in the given index
|
246 |
+
# and return the text lines
|
247 |
+
|
248 |
+
def IndexSearchKNN(index, x, T, R, kmax=1, Dmax=1.0, dedup=True):
|
249 |
+
D, I = index.search(x, kmax)
|
250 |
+
prev = {} # for deduplication
|
251 |
+
res = []
|
252 |
+
for n in range(x.shape[0]):
|
253 |
+
for i in range(kmax):
|
254 |
+
txt = IndexTextQuery(T, R, I[n, i])
|
255 |
+
if (dedup and txt not in prev) and D[n, i] <= Dmax:
|
256 |
+
prev[txt] = 1
|
257 |
+
res.append([txt, D[n, i]])
|
258 |
+
return res
|
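A self-contained sketch of the FlatL2 helpers above on toy data; the file names, sizes, and pseudo-languages are made up, and laser/source/lib is assumed to be on PYTHONPATH:

# Sketch: exercise IndexCreate / IndexSearchMultiple on toy embeddings.
# Requires faiss and numpy; file names and dimensions are illustrative.
import numpy as np
from indexing import IndexCreate, IndexSearchMultiple, IndexPrintConfusionMatrix

dim, nbex = 1024, 32
rng = np.random.RandomState(0)
base = rng.rand(nbex, dim).astype(np.float32)

data, idx = [], []
for lang in ("xx", "yy"):
    # two pseudo-languages with nearly identical embeddings -> error rate ~0
    emb = base + 0.01 * rng.rand(nbex, dim).astype(np.float32)
    emb.tofile(f"toy.{lang}.emb")
    x, index = IndexCreate(f"toy.{lang}.emb", "FlatL2", dim=dim)
    data.append(x)
    idx.append(index)

err = IndexSearchMultiple(data, idx, ["xx", "yy"], verbose=True)
IndexPrintConfusionMatrix(err, ["xx", "yy"])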
laser/source/lib/romanize_lc.py
ADDED
@@ -0,0 +1,51 @@
1 |
+
#!/usr/bin/python
|
2 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the BSD-style license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
# LASER Language-Agnostic SEntence Representations
|
9 |
+
# is a toolkit to calculate multilingual sentence embeddings
|
10 |
+
# and to use them for document classification, bitext filtering
|
11 |
+
# and mining
|
12 |
+
#
|
13 |
+
# --------------------------------------------------------
|
14 |
+
#
|
15 |
+
# Romanize and lower case text
|
16 |
+
|
17 |
+
import os
|
18 |
+
import sys
|
19 |
+
import argparse
|
20 |
+
from transliterate import translit, get_available_language_codes
|
21 |
+
|
22 |
+
parser = argparse.ArgumentParser(
|
23 |
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
24 |
+
description="Calculate multilingual sentence encodings")
|
25 |
+
parser.add_argument(
|
26 |
+
'--input', '-i', type=argparse.FileType('r', encoding='UTF-8'),
|
27 |
+
default=sys.stdin,
|
28 |
+
metavar='PATH',
|
29 |
+
help="Input text file (default: standard input).")
|
30 |
+
parser.add_argument(
|
31 |
+
'--output', '-o', type=argparse.FileType('w', encoding='UTF-8'),
|
32 |
+
default=sys.stdout,
|
33 |
+
metavar='PATH',
|
34 |
+
help="Output text file (default: standard output).")
|
35 |
+
parser.add_argument(
|
36 |
+
'--language', '-l', type=str,
|
37 |
+
metavar='STR', default="none",
|
38 |
+
help="perform transliteration into Roman characters"
|
39 |
+
" from the specified language (default none)")
|
40 |
+
parser.add_argument(
|
41 |
+
'--preserve-case', '-C', action='store_true',
|
42 |
+
help="Preserve case of input texts (default is all lower case)")
|
43 |
+
|
44 |
+
args = parser.parse_args()
|
45 |
+
|
46 |
+
for line in args.input:
|
47 |
+
if args.language != "none":
|
48 |
+
line = translit(line, args.language, reversed=True)
|
49 |
+
if not args.preserve_case:
|
50 |
+
line = line.lower()
|
51 |
+
args.output.write(line)
|
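For context, the loop above is a thin wrapper around the transliterate package, where reversed=True converts into Latin script. A tiny sketch, using Greek since that is the case embed.py romanizes:

# Sketch of the translit call wrapped by romanize_lc.py.
# "el" (Greek) is one of the language packs bundled with transliterate.
from transliterate import translit

line = "Καλημέρα κόσμε"
romanized = translit(line, "el", reversed=True)   # Greek script -> Latin script
print(romanized.lower())                          # lower-cased, matching the script's default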
laser/source/lib/text_processing.py
ADDED
@@ -0,0 +1,272 @@
+#!/usr/bin/python
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# LASER  Language-Agnostic SEntence Representations
+# is a toolkit to calculate multilingual sentence embeddings
+# and to use them for document classification, bitext filtering
+# and mining
+#
+# --------------------------------------------------------
+#
+# Helper functions for tokenization and BPE
+
+import os
+import sys
+import logging
+from pathlib import Path
+import numpy as np
+from subprocess import run, check_output, CalledProcessError, DEVNULL
+
+logging.basicConfig(
+    stream=sys.stdout,
+    level=logging.INFO,
+    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s")
+logger = logging.getLogger("preprocess")
+
+# get environment
+assert os.environ.get('LASER'), 'Please set the environment variable LASER'
+LASER = os.environ['LASER']
+
+FASTBPE = LASER + '/tools-external/fastBPE/fast'
+MOSES_BDIR = LASER + '/tools-external/moses-tokenizer/tokenizer/'
+MOSES_TOKENIZER = MOSES_BDIR + 'tokenizer.perl -q -no-escape -threads 20 -l '
+MOSES_LC = MOSES_BDIR + 'lowercase.perl'
+NORM_PUNC = MOSES_BDIR + 'normalize-punctuation.perl -l '
+DESCAPE = MOSES_BDIR + 'deescape-special-chars.perl'
+REM_NON_PRINT_CHAR = MOSES_BDIR + 'remove-non-printing-char.perl'
+SPM_DIR = LASER + '/tools-external/sentencepiece-master/build/src/'
+SPM = 'LD_LIBRARY_PATH=' + SPM_DIR + ' ' + SPM_DIR + '/spm_encode --output_format=piece'
+
+# Romanization (and lower casing)
+ROMAN_LC = 'python3 ' + LASER + '/source/lib/romanize_lc.py -l '
+
+# Mecab tokenizer for Japanese
+MECAB = LASER + '/tools-external/mecab'
+
+
+
+
+###############################################################################
+#
+# Tokenize a line of text
+#
+###############################################################################
+
+def TokenLine(line, lang='en', lower_case=True, romanize=False):
+    assert lower_case, 'lower case is needed by all the models'
+    roman = lang if romanize else 'none'
+    tok = check_output(
+        REM_NON_PRINT_CHAR
+        + '|' + NORM_PUNC + lang
+        + '|' + DESCAPE
+        + '|' + MOSES_TOKENIZER + lang
+        + ('| python3 -m jieba -d ' if lang == 'zh' else '')
+        + ('|' + MECAB + '/bin/mecab -O wakati -b 50000 ' if lang == 'ja' else '')
+        + '|' + ROMAN_LC + roman,
+        input=line,
+        encoding='UTF-8',
+        shell=True)
+    return tok.strip()
+
+
+###############################################################################
+#
+# Tokenize a file
+#
+###############################################################################
+
+def Token(inp_fname, out_fname, lang='en',
+          lower_case=True, romanize=False, descape=False,
+          verbose=False, over_write=False, gzip=False):
+    assert lower_case, 'lower case is needed by all the models'
+    assert not over_write, 'over-write is not yet implemented'
+    if not os.path.isfile(out_fname):
+        cat = 'zcat ' if gzip else 'cat '
+        roman = lang if romanize else 'none'
+        # handle some iso3 language codes
+        if lang in ('cmn', 'wuu', 'yue'):
+            lang = 'zh'
+        if lang in ('jpn',):
+            lang = 'ja'
+        if verbose:
+            logger.info('tokenizing {} in language {} {} {} {}'
+                        .format(os.path.basename(inp_fname), lang,
+                                '(gzip)' if gzip else '',
+                                '(de-escaped)' if descape else '',
+                                '(romanized)' if romanize else ''))
+        run(cat + inp_fname
+            + '|' + REM_NON_PRINT_CHAR
+            + '|' + NORM_PUNC + lang
+            + ('|' + DESCAPE if descape else '')
+            + '|' + MOSES_TOKENIZER + lang
+            + ('| python3 -m jieba -d ' if lang == 'zh' else '')
+            + ('|' + MECAB + '/bin/mecab -O wakati -b 50000 ' if lang == 'ja' else '')
+            + '|' + ROMAN_LC + roman
+            + '>' + out_fname,
+            env=dict(os.environ, LD_LIBRARY_PATH=MECAB + '/lib'),
+            shell=True)
+    elif not over_write and verbose:
+        logger.info('tokenized file {} exists already'
+                    .format(os.path.basename(out_fname)))
+
+
+###############################################################################
+#
+# Apply SPM on a whole file
+#
+###############################################################################
+
+def SPMApply(inp_fname, out_fname, spm_model, lang='en',
+             lower_case=True, descape=False,
+             verbose=False, over_write=False, gzip=False):
+    assert lower_case, 'lower case is needed by all the models'
+    if not os.path.isfile(out_fname):
+        cat = 'zcat ' if gzip else 'cat '
+        if verbose:
+            logger.info('SPM processing {} {} {}'
+                        .format(os.path.basename(inp_fname),
+                                '(gzip)' if gzip else '',
+                                '(de-escaped)' if descape else ''))
+
+        assert os.path.isfile(spm_model), f'SPM model {spm_model} not found'
+        command = (cat + inp_fname
+                   + '|' + REM_NON_PRINT_CHAR
+                   + '|' + NORM_PUNC + lang
+                   + ('|' + DESCAPE if descape else '')
+                   + '|' + ROMAN_LC + 'none'
+                   + '|' + SPM + " --model=" + spm_model
+                   + ' > ' + out_fname)
+        try:
+            run(["/bin/bash", "-o", "pipefail", "-c", command], check=True, capture_output=True)
+        except CalledProcessError as e:
+            logger.error(e.stderr.decode().strip())
+            sys.exit(1)
+
+    elif not over_write and verbose:
+        logger.info('SPM encoded file {} exists already'
+                    .format(os.path.basename(out_fname)))
+
+
+###############################################################################
+#
+# Apply FastBPE on a whole file
+#
+###############################################################################
+
+def BPEfastApply(inp_fname, out_fname, bpe_codes,
+                 verbose=False, over_write=False):
+    if not os.path.isfile(out_fname):
+        if verbose:
+            logger.info('fastBPE: processing {}'
+                        .format(os.path.basename(inp_fname)))
+        bpe_vocab = bpe_codes.replace('fcodes', 'fvocab')
+        assert os.path.isfile(bpe_vocab), f'fastBPE: vocab file {bpe_vocab} not found'
+        run(FASTBPE + ' applybpe '
+            + out_fname + ' ' + inp_fname
+            + ' ' + bpe_codes
+            + ' ' + bpe_vocab, shell=True, stderr=DEVNULL)
+    elif not over_write and verbose:
+        logger.info('fastBPE: {} exists already'
+                    .format(os.path.basename(out_fname)))
+
+
+###############################################################################
+#
+# Split long lines into multiple sentences at "."
+#
+###############################################################################
+
+def SplitLines(ifname, of_txt, of_sid):
+    if os.path.isfile(of_txt):
+        print(' - SplitLines: {} already exists'.format(of_txt))
+        return
+    nl = 0
+    nl_sp = 0
+    maxw = 0
+    maxw_sp = 0
+    fp_sid = open(of_sid, 'w')
+    fp_txt = open(of_txt, 'w')
+    with open(ifname, 'r') as ifp:
+        for line in ifp:
+            print('{:d}'.format(nl), file=fp_sid)  # store current sentence ID
+            nw = 0
+            words = line.strip().split()
+            maxw = max(maxw, len(words))
+            for i, word in enumerate(words):
+                if word == '.' and i != len(words)-1:
+                    if nw > 0:
+                        print(' {}'.format(word), file=fp_txt)
+                    else:
+                        print('{}'.format(word), file=fp_txt)
+                    # store current sentence ID
+                    print('{:d}'.format(nl), file=fp_sid)
+                    nl_sp += 1
+                    maxw_sp = max(maxw_sp, nw+1)
+                    nw = 0
+                else:
+                    if nw > 0:
+                        print(' {}'.format(word), end='', file=fp_txt)
+                    else:
+                        print('{}'.format(word), end='', file=fp_txt)
+                    nw += 1
+            if nw > 0:
+                # handle remainder of sentence
+                print('', file=fp_txt)
+                nl_sp += 1
+                maxw_sp = max(maxw_sp, nw+1)
+            nl += 1
+    print(' - Split sentences: {}'.format(ifname))
+    print(' - lines/max words: {:d}/{:d} -> {:d}/{:d}'
+          .format(nl, maxw, nl_sp, maxw_sp))
+    fp_sid.close()
+    fp_txt.close()
+
+
+###############################################################################
+#
+# Join embeddings of previously split lines (average)
+#
+###############################################################################
+
+def JoinEmbed(if_embed, sid_fname, of_embed, dim=1024):
+    if os.path.isfile(of_embed):
+        print(' - JoinEmbed: {} already exists'.format(of_embed))
+        return
+    # read the input embeddings
+    em_in = np.fromfile(if_embed, dtype=np.float32, count=-1).reshape(-1, dim)
+    ninp = em_in.shape[0]
+    print(' - Combine embeddings:')
+    print('   input:  {:s} {:d} sentences'.format(if_embed, ninp))
+
+    # get all sentence IDs
+    sid = np.empty(ninp, dtype=np.int32)
+    i = 0
+    with open(sid_fname, 'r') as fp_sid:
+        for line in fp_sid:
+            sid[i] = int(line)
+            i += 1
+    nout = sid.max() + 1
+    print('   IDs:    {:s}, {:d} sentences'.format(sid_fname, nout))
+
+    # combining
+    em_out = np.zeros((nout, dim), dtype=np.float32)
+    cnt = np.zeros(nout, dtype=np.int32)
+    for i in range(ninp):
+        idx = sid[i]
+        em_out[idx] += em_in[i]  # cumulate sentence vectors
+        cnt[idx] += 1
+
+    if (cnt == 0).astype(int).sum() > 0:
+        print('ERROR: missing lines')
+        sys.exit(1)
+
+    # normalize
+    for i in range(nout):
+        em_out[i] /= cnt[i]
+
+    print('   output: {:s}'.format(of_embed))
+    em_out.tofile(of_embed)
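A minimal usage sketch for these helpers (illustration only, not part of the commit; it assumes the LASER environment variable points at a checkout with the external Moses and SentencePiece tools installed, and the file names are hypothetical):

    import os, sys
    os.environ.setdefault('LASER', '/path/to/LASER')      # must be set before import
    sys.path.append(os.environ['LASER'] + '/source')
    from lib.text_processing import Token, SPMApply

    # Moses tokenization + lower-casing of a raw text file
    Token('input.txt', 'input.tok', lang='en', verbose=True)
    # SentencePiece segmentation, e.g. for the laser2 model shipped in this repo
    SPMApply('input.txt', 'input.spm',
             spm_model=os.environ['LASER'] + '/laser2.spm', lang='en', verbose=True)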
laser/source/mine_bitexts.py
ADDED
@@ -0,0 +1,302 @@
+#!/usr/bin/python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# LASER  Language-Agnostic SEntence Representations
+# is a toolkit to calculate multilingual sentence embeddings
+# and to use them for document classification, bitext filtering
+# and mining
+#
+# --------------------------------------------------------
+#
+# Tool to search, score or mine bitexts
+# The functions can also be imported into another Python code
+
+import os
+import sys
+import faiss
+import argparse
+import torch
+import numpy as np
+
+# get environment
+assert os.environ.get('LASER'), 'Please set the environment variable LASER'
+LASER = os.environ['LASER']
+
+sys.path.append(LASER + '/source')
+sys.path.append(LASER + '/source/tools')
+from embed import SentenceEncoder, EncodeLoad, EncodeFile, EmbedLoad
+from lib.text_processing import Token, BPEfastApply
+
+
+###############################################################################
+#
+# Load texts and remove duplicates
+#
+###############################################################################
+
+def TextLoadUnify(fname, args):
+    if args.verbose:
+        print(' - loading texts {:s}: '.format(fname), end='')
+    fin = open(fname, encoding=args.encoding, errors='surrogateescape')
+    inds = []
+    sents = []
+    sent2ind = {}
+    n = 0
+    nu = 0
+    for line in fin:
+        new_ind = len(sent2ind)
+        inds.append(sent2ind.setdefault(line, new_ind))
+        if args.unify:
+            if inds[-1] == new_ind:
+                sents.append(line[:-1])
+                nu += 1
+        else:
+            sents.append(line[:-1])
+            nu += 1
+        n += 1
+    if args.verbose:
+        print('{:d} lines, {:d} unique'.format(n, nu))
+    del sent2ind
+    return inds, sents
+
+
+###############################################################################
+#
+# Wrapper for knn on CPU/GPU
+#
+###############################################################################
+
+def knn(x, y, k, use_gpu):
+    return knnGPU(x, y, k) if use_gpu else knnCPU(x, y, k)
+
+
+###############################################################################
+#
+# Perform knn on GPU
+#
+###############################################################################
+
+def knnGPU(x, y, k, mem=5*1024*1024*1024):
+    dim = x.shape[1]
+    batch_size = mem // (dim*4)
+    sim = np.zeros((x.shape[0], k), dtype=np.float32)
+    ind = np.zeros((x.shape[0], k), dtype=np.int64)
+    for xfrom in range(0, x.shape[0], batch_size):
+        xto = min(xfrom + batch_size, x.shape[0])
+        bsims, binds = [], []
+        for yfrom in range(0, y.shape[0], batch_size):
+            yto = min(yfrom + batch_size, y.shape[0])
+            # print('{}-{}  ->  {}-{}'.format(xfrom, xto, yfrom, yto))
+            idx = faiss.IndexFlatIP(dim)
+            idx = faiss.index_cpu_to_all_gpus(idx)
+            idx.add(y[yfrom:yto])
+            bsim, bind = idx.search(x[xfrom:xto], min(k, yto-yfrom))
+            bsims.append(bsim)
+            binds.append(bind + yfrom)
+            del idx
+        bsims = np.concatenate(bsims, axis=1)
+        binds = np.concatenate(binds, axis=1)
+        aux = np.argsort(-bsims, axis=1)
+        for i in range(xfrom, xto):
+            for j in range(k):
+                sim[i, j] = bsims[i-xfrom, aux[i-xfrom, j]]
+                ind[i, j] = binds[i-xfrom, aux[i-xfrom, j]]
+    return sim, ind
+
+
+###############################################################################
+#
+# Perform knn on CPU
+#
+###############################################################################
+
+def knnCPU(x, y, k):
+    dim = x.shape[1]
+    idx = faiss.IndexFlatIP(dim)
+    idx.add(y)
+    sim, ind = idx.search(x, k)
+    return sim, ind
+
+
+###############################################################################
+#
+# Scoring
+#
+###############################################################################
+
+def score(x, y, fwd_mean, bwd_mean, margin):
+    return margin(x.dot(y), (fwd_mean + bwd_mean) / 2)
+
+
+def score_candidates(x, y, candidate_inds, fwd_mean, bwd_mean, margin, verbose=False):
+    if verbose:
+        print(' - scoring {:d} candidates'.format(x.shape[0]))
+    scores = np.zeros(candidate_inds.shape)
+    for i in range(scores.shape[0]):
+        for j in range(scores.shape[1]):
+            k = candidate_inds[i, j]
+            scores[i, j] = score(x[i], y[k], fwd_mean[i], bwd_mean[k], margin)
+    return scores
+
+
+###############################################################################
+#
+# Main
+#
+###############################################################################
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='LASER: Mine bitext')
+    parser.add_argument('src',
+                        help='Source language corpus')
+    parser.add_argument('trg',
+                        help='Target language corpus')
+    parser.add_argument('--encoding', default='utf-8',
+                        help='Character encoding for input/output')
+    parser.add_argument('--src-lang', required=True,
+                        help='Source language id')
+    parser.add_argument('--trg-lang', required=True,
+                        help='Target language id')
+    parser.add_argument('--output', required=True,
+                        help='Output file')
+    parser.add_argument('--threshold', type=float, default=0,
+                        help='Threshold on extracted bitexts')
+
+    # mining params
+    parser.add_argument('--mode',
+                        choices=['search', 'score', 'mine'], required=True,
+                        help='Execution mode')
+    parser.add_argument('-k', '--neighborhood',
+                        type=int, default=4,
+                        help='Neighborhood size')
+    parser.add_argument('--margin',
+                        choices=['absolute', 'distance', 'ratio'], default='ratio',
+                        help='Margin function')
+    parser.add_argument('--retrieval',
+                        choices=['fwd', 'bwd', 'max', 'intersect'], default='max',
+                        help='Retrieval strategy')
+    parser.add_argument('--unify', action='store_true',
+                        help='Unify texts')
+    parser.add_argument('--gpu', action='store_true',
+                        help='Run knn on all available GPUs')
+    parser.add_argument('--verbose', action='store_true',
+                        help='Detailed output')
+
+    # embeddings
+    parser.add_argument('--src-embeddings', required=True,
+                        help='Precomputed source sentence embeddings')
+    parser.add_argument('--trg-embeddings', required=True,
+                        help='Precomputed target sentence embeddings')
+    parser.add_argument('--dim', type=int, default=1024,
+                        help='Embedding dimensionality')
+    parser.add_argument('--fp16', action='store_true',
+                        help='Load precomputed embeddings in float16 format')
+    args = parser.parse_args()
+
+    print('LASER: tool to search, score or mine bitexts')
+    use_gpu = torch.cuda.is_available() and args.gpu
+    if use_gpu:
+        print(' - knn will run on all available GPUs (recommended)')
+    else:
+        print(' - knn will run on CPU (slow)')
+
+    src_inds, src_sents = TextLoadUnify(args.src, args)
+    trg_inds, trg_sents = TextLoadUnify(args.trg, args)
+
+    def unique_embeddings(emb, ind, verbose=False):
+        aux = {j: i for i, j in enumerate(ind)}
+        if verbose:
+            print(' - unify embeddings: {:d} -> {:d}'.format(len(emb), len(aux)))
+        return emb[[aux[i] for i in range(len(aux))]]
+
+    # load the embeddings and store as np.float32 (required for FAISS)
+    x = EmbedLoad(args.src_embeddings, args.dim, verbose=args.verbose, fp16=args.fp16).astype(np.float32)
+    if args.unify:
+        x = unique_embeddings(x, src_inds, args.verbose)
+    faiss.normalize_L2(x)
+    y = EmbedLoad(args.trg_embeddings, args.dim, verbose=args.verbose, fp16=args.fp16).astype(np.float32)
+    if args.unify:
+        y = unique_embeddings(y, trg_inds, args.verbose)
+    faiss.normalize_L2(y)
+
+    # calculate knn in both directions
+    if args.retrieval != 'bwd':
+        if args.verbose:
+            print(' - perform {:d}-nn source against target'.format(args.neighborhood))
+        x2y_sim, x2y_ind = knn(x, y, min(y.shape[0], args.neighborhood), use_gpu)
+        x2y_mean = x2y_sim.mean(axis=1)
+
+    if args.retrieval != 'fwd':
+        if args.verbose:
+            print(' - perform {:d}-nn target against source'.format(args.neighborhood))
+        y2x_sim, y2x_ind = knn(y, x, min(x.shape[0], args.neighborhood), use_gpu)
+        y2x_mean = y2x_sim.mean(axis=1)
+
+    # margin function
+    if args.margin == 'absolute':
+        margin = lambda a, b: a
+    elif args.margin == 'distance':
+        margin = lambda a, b: a - b
+    else:  # args.margin == 'ratio':
+        margin = lambda a, b: a / b
+
+    fout = open(args.output, mode='w', encoding=args.encoding, errors='surrogateescape')
+
+    if args.mode == 'search':
+        if args.verbose:
+            print(' - Searching for closest sentences in target')
+            print(' - writing alignments to {:s}'.format(args.output))
+        scores = score_candidates(x, y, x2y_ind, x2y_mean, y2x_mean, margin, args.verbose)
+        best = x2y_ind[np.arange(x.shape[0]), scores.argmax(axis=1)]
+
+        nbex = x.shape[0]
+        ref = np.linspace(0, nbex-1, nbex).astype(int)  # [0, nbex)
+        err = nbex - np.equal(best.reshape(nbex), ref).astype(int).sum()
+        print(' - errors: {:d}={:.2f}%'.format(err, 100*err/nbex))
+        for i in src_inds:
+            print(trg_sents[best[i]], file=fout)
+
+    elif args.mode == 'score':
+        for i, j in zip(src_inds, trg_inds):
+            s = score(x[i], y[j], x2y_mean[i], y2x_mean[j], margin)
+            print(s, src_sents[i], trg_sents[j], sep='\t', file=fout)
+
+    elif args.mode == 'mine':
+        if args.verbose:
+            print(' - mining for parallel data')
+        fwd_scores = score_candidates(x, y, x2y_ind, x2y_mean, y2x_mean, margin, args.verbose)
+        bwd_scores = score_candidates(y, x, y2x_ind, y2x_mean, x2y_mean, margin, args.verbose)
+        fwd_best = x2y_ind[np.arange(x.shape[0]), fwd_scores.argmax(axis=1)]
+        bwd_best = y2x_ind[np.arange(y.shape[0]), bwd_scores.argmax(axis=1)]
+        if args.verbose:
+            print(' - writing alignments to {:s}'.format(args.output))
+            if args.threshold > 0:
+                print(' - with threshold of {:f}'.format(args.threshold))
+        if args.retrieval == 'fwd':
+            for i, j in enumerate(fwd_best):
+                print(fwd_scores[i].max(), src_sents[i], trg_sents[j], sep='\t', file=fout)
+        if args.retrieval == 'bwd':
+            for j, i in enumerate(bwd_best):
+                print(bwd_scores[j].max(), src_sents[i], trg_sents[j], sep='\t', file=fout)
+        if args.retrieval == 'intersect':
+            for i, j in enumerate(fwd_best):
+                if bwd_best[j] == i:
+                    print(fwd_scores[i].max(), src_sents[i], trg_sents[j], sep='\t', file=fout)
+        if args.retrieval == 'max':
+            indices = np.stack((np.concatenate((np.arange(x.shape[0]), bwd_best)),
+                                np.concatenate((fwd_best, np.arange(y.shape[0])))), axis=1)
+            scores = np.concatenate((fwd_scores.max(axis=1), bwd_scores.max(axis=1)))
+            seen_src, seen_trg = set(), set()
+            for i in np.argsort(-scores):
+                src_ind, trg_ind = indices[i]
+                if not src_ind in seen_src and not trg_ind in seen_trg:
+                    seen_src.add(src_ind)
+                    seen_trg.add(trg_ind)
+                    if scores[i] > args.threshold:
+                        print(scores[i], src_sents[src_ind], trg_sents[trg_ind], sep='\t', file=fout)
+
+    fout.close()
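For readers unfamiliar with the margin criterion implemented above, here is a small self-contained sketch of the ratio margin on random toy embeddings (illustration only, not part of the commit):

    import numpy as np
    import faiss

    dim, k = 32, 4
    x = np.random.rand(5, dim).astype(np.float32)   # "source" embeddings
    y = np.random.rand(7, dim).astype(np.float32)   # "target" embeddings
    faiss.normalize_L2(x)
    faiss.normalize_L2(y)

    def knn(a, b, k):
        idx = faiss.IndexFlatIP(b.shape[1])         # inner product == cosine after L2-norm
        idx.add(b)
        return idx.search(a, k)

    x2y_sim, x2y_ind = knn(x, y, k)
    y2x_sim, _ = knn(y, x, k)
    x2y_mean, y2x_mean = x2y_sim.mean(axis=1), y2x_sim.mean(axis=1)

    # ratio margin: cosine divided by the average similarity of the two k-nn neighborhoods
    for i in range(x.shape[0]):
        j = x2y_ind[i, 0]
        ratio = x2y_sim[i, 0] / ((x2y_mean[i] + y2x_mean[j]) / 2)
        print(i, j, ratio)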
laser/source/nli.py
ADDED
@@ -0,0 +1,371 @@
+#!/usr/bin/python
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# LASER  Language-Agnostic SEntence Representations
+# is a toolkit to calculate multilingual sentence embeddings
+# and to use them for document classification, bitext filtering
+# and mining
+#
+# --------------------------------------------------------
+#
+# Classifier for NLI (e.g. XNLI) on top of LASER sentence embeddings
+
+
+import os
+import sys
+import copy
+import argparse
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import torch.utils.data as data_utils
+import numpy as np
+import faiss
+
+
+################################################
+
+def LoadDataNLI(fn1, fn2, fn_lbl,
+                dim=1024, bsize=32,
+                fraction=1.0,
+                shuffle=False, quiet=False):
+    x = np.fromfile(fn1, dtype=np.float32, count=-1)
+    x.resize(x.shape[0] // dim, dim)
+    faiss.normalize_L2(x)
+
+    y = np.fromfile(fn2, dtype=np.float32, count=-1)
+    y.resize(y.shape[0] // dim, dim)
+    faiss.normalize_L2(y)
+
+    lbl = np.loadtxt(fn_lbl, dtype=np.int32)
+    lbl.reshape(lbl.shape[0], 1)
+
+    if not quiet:
+        print(' - read {:d}x{:d} elements in {:s}'.format(x.shape[0], x.shape[1], fn1))
+        print(' - read {:d}x{:d} elements in {:s}'.format(y.shape[0], y.shape[1], fn2))
+        print(' - read {:d} labels [{:d},{:d}] in {:s}'
+              .format(lbl.shape[0], lbl.min(), lbl.max(), fn_lbl))
+
+    if fraction < 1.0:
+        N = int(x.shape[0] * fraction)
+        if not quiet:
+            print(' - using only the first {:d} examples'.format(N))
+        x = x[:N][:]
+        y = y[:N][:]
+        lbl = lbl[:N][:]
+
+    if not quiet:
+        print(' - combine premises and hyps')
+    nli = np.concatenate((x, y, np.absolute(x - y), np.multiply(x, y)), axis=1)
+
+    D = data_utils.TensorDataset(torch.from_numpy(nli), torch.from_numpy(lbl))
+    loader = data_utils.DataLoader(D, batch_size=bsize, shuffle=shuffle)
+    return loader
+
+
+################################################
+
+class Net(nn.Module):
+    def __init__(self, fname='',
+                 idim=4*1024, odim=2, nhid=None,
+                 dropout=0.0, gpu=0, activation='TANH'):
+        super(Net, self).__init__()
+        self.gpu = gpu
+        if os.path.isfile(fname):
+            print(' - loading mlp from {}'.format(fname))
+            loaded = torch.load(fname)
+            self.mlp = loaded.mlp
+        else:
+            modules = []
+            print(' - mlp {:d}'.format(idim), end='')
+            if len(nhid) > 0:
+                if dropout > 0:
+                    modules.append(nn.Dropout(p=dropout))
+                nprev = idim
+                for nh in nhid:
+                    if nh > 0:
+                        modules.append(nn.Linear(nprev, nh))
+                        nprev = nh
+                        if activation == 'TANH':
+                            modules.append(nn.Tanh())
+                            print('-{:d}t'.format(nh), end='')
+                        elif activation == 'RELU':
+                            modules.append(nn.ReLU())
+                            print('-{:d}r'.format(nh), end='')
+                        else:
+                            raise Exception(f'Unrecognised activation {activation}')
+                        if dropout > 0:
+                            modules.append(nn.Dropout(p=dropout))
+                modules.append(nn.Linear(nprev, odim))
+                print('-{:d}, dropout={:.1f}'.format(odim, dropout))
+            else:
+                modules.append(nn.Linear(idim, odim))
+                print(' - mlp {:d}-{:d}'.format(idim, odim))
+            self.mlp = nn.Sequential(*modules)
+
+        if self.gpu >= 0:
+            self.mlp = self.mlp.cuda()
+
+    def forward(self, x):
+        return self.mlp(x)
+
+    def TestCorpus(self, dset, name=' Dev', nlbl=3, out_fname=None):
+        correct = 0
+        total = 0
+        self.mlp.train(mode=False)
+        corr = np.zeros(nlbl, dtype=np.int32)
+        if out_fname:
+            fp = open(out_fname, 'w')
+            fp.write('# outputs target_class predicted_class\n')
+        for data in dset:
+            X, Y = data
+            Y = Y.long()
+            if self.gpu >= 0:
+                X = X.cuda()
+                Y = Y.cuda()
+            outputs = self.mlp(X)
+            _, predicted = torch.max(outputs.data, 1)
+            total += Y.size(0)
+            correct += (predicted == Y).int().sum()
+            for i in range(nlbl):
+                corr[i] += (predicted == i).int().sum()
+            if out_fname:
+                for b in range(outputs.shape[0]):
+                    for i in range(nlbl):
+                        fp.write('{:f} '.format(outputs[b][i]))
+                    fp.write('{:d} {:d}\n'
+                             .format(predicted[b], Y[b]))
+
+        print(' | {:4s}: {:5.2f}%'
+              .format(name, 100.0 * correct.float() / total), end='')
+        # print(' | loss {:6.4f}'.format(loss/total), end='')
+        print(' | classes:', end='')
+        for i in range(nlbl):
+            print(' {:5.2f}'.format(100.0 * corr[i] / total), end='')
+
+        if out_fname:
+            fp.close()
+
+        return correct, total
+
+
+################################################
+
+parser = argparse.ArgumentParser(
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    description='Classifier for NLI')
+
+# Data
+parser.add_argument(
+    '--base-dir', '-b', type=str, required=True, metavar='PATH',
+    help='Directory with all the data files')
+parser.add_argument(
+    '--load', '-l', type=str, required=False, metavar='PATH', default='',
+    help='Load network from file before training or for testing')
+parser.add_argument(
+    '--save', '-s', type=str, required=False, metavar='PATH', default='',
+    help='File in which to save best network')
+parser.add_argument(
+    '--train', '-t', type=str, required=True, metavar='STR',
+    help='Name of training corpus')
+parser.add_argument(
+    '--train-labels', '-T', type=str, required=True, metavar='STR',
+    help='Name of training corpus (labels)')
+parser.add_argument(
+    '--dev', '-d', type=str, required=True, metavar='STR',
+    help='Name of development corpus')
+parser.add_argument(
+    '--dev-labels', '-D', type=str, required=True, metavar='STR',
+    help='Name of development corpus (labels)')
+parser.add_argument(
+    '--test', '-e', type=str, default=None,
+    help='Name of test corpus without language extension')
+parser.add_argument(
+    '--test-labels', '-E', type=str, default=None,
+    help='Name of test corpus without language extension (labels)')
+parser.add_argument(
+    '--lang', '-L', nargs='+', default=None,
+    help='List of languages to test on')
+parser.add_argument(
+    '--cross-lingual', '-x', action='store_true',
+    help='Also test on premise and hypothesis in different languages')
+parser.add_argument(
+    '--parts', '-p', type=str, nargs='+', default=['prem', 'hyp'],
+    help='Name of the two input parts to compare')
+parser.add_argument(
+    '--fraction', '-f', type=float, default=1.0,
+    help='Fraction of training examples to use (from the beginning)')
+parser.add_argument(
+    '--save-outputs', type=str, default=None,
+    help='File name to save classifier outputs ("l1-l2.txt" will be added)')
+
+# network definition
+parser.add_argument(
+    '--dim', '-m', type=int, default=1024,
+    help='dimension of sentence embeddings')
+parser.add_argument(
+    '--nhid', '-n', type=int, default=0, nargs='+',
+    help='List of hidden layer(s) dimensions')
+parser.add_argument(
+    '--dropout', '-o', type=float, default=0.0, metavar='FLOAT',
+    help='Value of dropout')
+parser.add_argument(
+    '--nepoch', '-N', type=int, default=100, metavar='INT',
+    help='Number of epochs')
+parser.add_argument(
+    '--bsize', '-B', type=int, default=128, metavar='INT',
+    help='Batch size')
+parser.add_argument(
+    '--seed', '-S', type=int, default=123456789, metavar='INT',
+    help='Initial random seed')
+parser.add_argument(
+    '--lr', type=float, default=0.001, metavar='FLOAT',
+    help='Learning rate')
+parser.add_argument(
+    '--activation', '-a', type=str, default='TANH', metavar='STR',
+    help='NonLinearity to use in hidden layers')
+parser.add_argument(
+    '--gpu', '-g', type=int, default=-1, metavar='INT',
+    help='GPU id (-1 for CPU)')
+args = parser.parse_args()
+
+train_loader = LoadDataNLI(os.path.join(args.base_dir, args.train % args.parts[0]),
+                           os.path.join(args.base_dir, args.train % args.parts[1]),
+                           os.path.join(args.base_dir, args.train_labels),
+                           dim=args.dim, bsize=args.bsize, shuffle=True, fraction=args.fraction)
+
+dev_loader = LoadDataNLI(os.path.join(args.base_dir, args.dev % args.parts[0]),
+                         os.path.join(args.base_dir, args.dev % args.parts[1]),
+                         os.path.join(args.base_dir, args.dev_labels),
+                         dim=args.dim, bsize=args.bsize, shuffle=False)
+
+# set GPU and random seed
+np.random.seed(args.seed)
+torch.manual_seed(args.seed)
+if args.gpu < 0:
+    print(' - running on cpu')
+else:
+    print(' - running on gpu {:d}'.format(args.gpu))
+    torch.cuda.set_device(args.gpu)
+    torch.cuda.manual_seed(args.seed)
+print(' - setting seed to {:d}'.format(args.seed))
+print(' - lrate is {:f} and bsize {:d}'.format(args.lr, args.bsize))
+
+# create network
+net = Net(fname=args.load,
+          idim=4*args.dim, odim=3, nhid=args.nhid,
+          dropout=args.dropout, gpu=args.gpu,
+          activation=args.activation)
+if args.gpu >= 0:
+    criterion = nn.CrossEntropyLoss().cuda()
+else:
+    criterion = nn.CrossEntropyLoss()
+
+optimizer = optim.Adam(net.parameters(), lr=args.lr)
+
+corr_best = 0
+# loop multiple times over the dataset
+for epoch in range(args.nepoch):
+
+    loss_epoch = 0.0
+    print('Ep {:4d}'.format(epoch), end='')
+    # for inputs, labels in train_loader:
+    for i, data in enumerate(train_loader, 0):
+        # get the inputs
+        inputs, labels = data
+        labels = labels.long()
+        if args.gpu >= 0:
+            inputs = inputs.cuda()
+            labels = labels.cuda()
+
+        # zero the parameter gradients
+        optimizer.zero_grad()
+
+        # forward + backward + optimize
+        net.train(mode=True)
+        outputs = net(inputs)
+        loss = criterion(outputs, labels)
+        loss.backward()
+        optimizer.step()
+        loss_epoch += loss.item()
+
+    print(' | loss {:e}'.format(loss_epoch), end='')
+
+    corr, nbex = net.TestCorpus(dev_loader, 'Dev')
+    if corr >= corr_best:
+        print(' | saved')
+        corr_best = corr
+        net_best = copy.deepcopy(net)
+    else:
+        print('')
+
+
+if 'net_best' in globals():
+    if args.save != '':
+        torch.save(net_best.cpu(), args.save)
+    print('Best Dev: {:d} = {:5.2f}%'
+          .format(corr_best, 100.0 * corr_best.float() / nbex))
+
+    if args.gpu >= 0:
+        net_best = net_best.cuda()
+
+# test on (several) languages
+if args.test is None:
+    sys.exit()
+
+print('Testing on {}'.format(args.test))
+if not args.cross_lingual:
+    for l in args.lang:
+        test_loader = LoadDataNLI(os.path.join(args.base_dir, args.test % args.parts[0] + '.' + l),
+                                  os.path.join(args.base_dir, args.test % args.parts[1] + '.' + l),
+                                  os.path.join(args.base_dir, args.test_labels + '.' + l),
+                                  dim=args.dim, bsize=args.bsize, shuffle=False, quiet=True)
+        print('Ep best | Eval Test lang {:s}'.format(l), end='')
+        ofname = args.save_outputs + '.{:s}-{:s}'.format(l, l) + '.txt' if args.save_outputs else None
+        net_best.TestCorpus(test_loader, 'Test', out_fname=ofname)
+        print('')
+else:  # cross-lingual
+    err = np.empty((len(args.lang), len(args.lang)), dtype=np.float32)
+    i1 = 0
+    for l1 in args.lang:
+        i2 = 0
+        for l2 in args.lang:
+            test_loader = LoadDataNLI(os.path.join(args.base_dir, args.test % args.parts[0] + '.' + l1),
+                                      os.path.join(args.base_dir, args.test % args.parts[1] + '.' + l2),
+                                      os.path.join(args.base_dir, args.test_labels + '.' + l2),
+                                      dim=args.dim, bsize=args.bsize, shuffle=False, quiet=True)
+            print('Ep best | Eval Test {:s}-{:s}'.format(l1, l2), end='')
+            ofname = args.save_outputs + '.{:s}-{:s}'.format(l1, l2) + '.txt' if args.save_outputs else None
+            p, n = net_best.TestCorpus(test_loader, 'Test',
+                                       out_fname=ofname)
+            err[i1, i2] = 100.0 * float(p) / n
+            i2 += 1
+            print('')
+        i1 += 1
+
+    print('\nAccuracy matrix:')
+    print('    ', end='')
+    for i2 in range(err.shape[1]):
+        print('  {:4s} '.format(args.lang[i2]), end='')
+
+    print('  avg')
+    for i1 in range(err.shape[0]):
+        print('{:4s}'.format(args.lang[i1]), end='')
+        for i2 in range(err.shape[1]):
+            print(' {:5.2f}'.format(err[i1, i2]), end='')
+        print(' {:5.2f}'.format(np.average(err[i1, :])))
+    print('avg ', end='')
+    for i2 in range(err.shape[1]):
+        print(' {:5.2f}'.format(np.average(err[:, i2])), end='')
+    print(' {:5.2f}'.format(np.average(err)))
+
+    if err.shape[0] == err.shape[1]:
+        s = 0
+        # TODO: we assume the first lang is English
+        for i1 in range(1, err.shape[0]):
+            s += err[i1, i1]
+        print('xnli-xx: {:5.2f}'.format(s/(err.shape[0]-1)))
laser/source/paraphrase.py
ADDED
@@ -0,0 +1,285 @@
+#!/usr/bin/python
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# LASER  Language-Agnostic SEntence Representations
+# is a toolkit to calculate multilingual sentence embeddings
+# and to use them for document classification, bitext filtering
+# and mining
+#
+# --------------------------------------------------------
+#
+# Python tool to search for paraphrases in FAISS index
+
+import re
+import sys
+import os.path
+import tempfile
+import argparse
+import faiss
+import time
+import pdb
+import numpy as np
+from collections import namedtuple
+
+# get environment
+assert os.environ.get('LASER'), 'Please set the environment variable LASER'
+LASER = os.environ['LASER']
+
+sys.path.append(LASER + '/source/lib')
+from indexing import IndexLoad, IndexTextOpen, IndexTextQuery, SplitOpen, SplitAccess
+from embed import SentenceEncoder, EncodeLoad, EncodeFile, EncodeTime
+from text_processing import Token, BPEfastApply
+
+SPACE_NORMALIZER = re.compile(r"\s+")
+Batch = namedtuple('Batch', 'srcs tokens lengths')
+
+# calculate L2 distance between [x]
+# and the vectors referenced in idxs
+# x should be already normalized
+def IndexDistL2(X, E, D, I, thresh=1.0, dtype=np.float32, sort=True):
+    nb, nK = I.shape
+    dim = X.shape[1]
+    dist_l2 = np.empty((nb, nK), dtype=np.float32)
+    y = np.empty((1, dim), dtype=dtype)
+    for i in range(nb):
+        for k in range(nK):
+            if D[i, k] <= thresh:
+                # get embedding from disk
+                np.copyto(y, SplitAccess(E, I[i, k]))
+                faiss.normalize_L2(y)
+                dist_l2[i, k] = 1.0 - np.dot(X[i], y[0])
+            else:
+                # exclude sentences which already have a huge FAISS distance
+                # (getting embeddings from disk is very time consuming)
+                dist_l2[i, k] = 1.0
+
+        if sort:
+            # re-sort according to L2
+            idxs = np.argsort(dist_l2[i], axis=0)
+            dist_l2[i] = dist_l2[i][idxs]
+            I[i] = I[i][idxs]
+
+    return dist_l2, I
+
+###############################################################################
+#
+# Apply an absolute threshold on the distance
+#
+###############################################################################
+
+def MarginAbs(em, ofp, params, args, stats):
+    D, I = params.idx.search(em, args.kmax)
+    thresh = args.threshold_faiss
+    if args.embed:
+        D, I = IndexDistL2(em, params.E, D, I, args.threshold_faiss)
+        thresh = args.threshold_L2
+
+    for n in range(D.shape[0]):
+
+        prev = {}  # for deduplication
+        for i in range(args.kmax):
+            txt = IndexTextQuery(params.T, params.R, I[n, i])
+            if (args.dedup and txt not in prev) and D[n, i] <= thresh:
+                prev[txt] = 1
+                ofp.write('{:d}\t{:7.5f}\t{}\n'
+                          .format(stats.nbs, D[n, i], txt))
+                stats.nbp += 1
+
+        # display source sentence if requested
+        if (args.include_source == 'matches' and len(prev) > 0):
+            ofp.write('{:d}\t{:6.1f}\t{}\n'
+                      .format(stats.nbs, 0.0, sentences[n].replace('@@ ', '')))
+        if args.include_source == 'always':
+            ofp.write('{:d}\t{:6.1f}\t{}\n'
+                      .format(stats.nbs, 0.0, sentences[n].replace('@@ ', '')))
+        stats.nbs += 1
+
+
+###############################################################################
+#
+# Apply a threshold on the ratio between distance and average
+#
+###############################################################################
+
+def MarginRatio(em, ofp, params, args, stats):
+    D, I = params.idx.search(em, args.margin_k)
+    thresh = args.threshold
+    if args.embed:
+        D, I = IndexDistL2(em, params.E, D, I, args.threshold_faiss)
+        thresh = args.threshold_L2
+
+    Mean = D.mean(axis=1)
+    for n in range(D.shape[0]):
+        if D[n, 0] / Mean[n] <= args.threshold:
+            if args.include_source == 'matches':
+                ofp.write('{:d}\t{:6.1f}\t{}\n'
+                          .format(stats.nbs, 0.0, sentences[n].replace('@@ ', '')))
+            txt = IndexTextQuery(params.T, params.R, I[n, 0])
+            ofp.write('{:d}\t{:7.5f}\t{}\n'.format(stats.nbs, D[n, 0], txt))
+            stats.nbp += 1
+
+        stats.nbs += 1
+
+    if args.include_source == 'always':
+        ofp.write('{:d}\t{:6.1f}\t{}\n'
+                  .format(stats.nbs, 0.0, sentences[n].replace('@@ ', '')))
+
+
+###############################################################################
+
+def MarginDist(em, ofp, params, args, stats):
+    print('ERROR: MarginDist not implemented')
+    sys.exit(1)
+
+
+###############################################################################
+
+def buffered_read(fp, buffer_size):
+    buffer = []
+    for src_str in fp:
+        buffer.append(src_str.strip())
+        if len(buffer) >= buffer_size:
+            yield buffer
+            buffer = []
+
+    if len(buffer) > 0:
+        yield buffer
+
+
+###############################################################################
+
+parser = argparse.ArgumentParser('LASER: paraphrase tool')
+
+parser.add_argument('--encoder', type=str, required=True,
+                    help='encoder to be used')
+parser.add_argument('--encoding', default='utf-8',
+                    help='Character encoding for input/output')
+parser.add_argument('--token-lang', type=str, default='--',
+                    help="Language of tokenizer ('--' for no tokenization)")
+parser.add_argument('--bpe-codes', type=str, default=None, required=True,
+                    help='BPE codes')
+parser.add_argument('--buffer-size', type=int, default=100,
+                    help='Buffer size (sentences)')
+parser.add_argument('--max-tokens', type=int, default=12000,
+                    help='Maximum number of tokens to process in a batch')
+parser.add_argument('--max-sentences', type=int, default=None,
+                    help='Maximum number of sentences to process in a batch')
+parser.add_argument('--cpu', action='store_true',
+                    help='Use CPU instead of GPU')
+
+parser.add_argument('--index', type=str, required=True,
+                    help='FAISS index')
+parser.add_argument('--nprobe', type=int, default=128,
+                    help='FAISS: value of nprobe')
+parser.add_argument('--text', type=str, required=True,
+                    help='File with indexed texts')
+parser.add_argument(
+    '--dim', type=int, default=1024,
+    help='Dimension of specified sentence embeddings')
+parser.add_argument(
+    '--embed', type=str, default=None,
+    help='Sentence embeddings, true L2 distance will be calculated when specified')
+
+parser.add_argument('-i', '--input', type=str, required=True,
+                    help='Input text file')
+parser.add_argument('-p', '--output', type=str, default='--',
+                    help='Output paraphrases')
+parser.add_argument('--kmax', type=int, default=10,
+                    help='Max value of distance or margin of each paraphrase')
+parser.add_argument('--dedup', type=int, default=1,
+                    help='Deduplicate list of paraphrases')
+parser.add_argument('--include-source', default='never',
+                    choices=['never', 'matches', 'always'],
+                    help='Include source sentence in the list of paraphrases')
+parser.add_argument('--margin',
+                    choices=['absolute', 'distance', 'ratio'],
+                    default='ratio', help='Margin function')
+parser.add_argument('-T', '--threshold-margin', type=float, default=0.9,
+                    help='Threshold on margin')
+parser.add_argument('--threshold-faiss', type=float, default=0.4,
+                    help='Threshold on FAISS distance')
+parser.add_argument('--threshold-L2', type=float, default=0.2,
+                    help='Threshold on L2 distance')
+parser.add_argument('--margin-k', type=int, default=4,
+                    help='Number of nearest neighbors for margin calculation')
+
+parser.add_argument('--verbose', action='store_true',
+                    help='Detailed output')
+
+
+print('\nLASER: paraphrase tool')
+args = parser.parse_args()
+
+# index,
+# memory mapped texts, references and word counts
+# encoder
+params = namedtuple('params', 'idx T R W M E enc')
+
+# open text and reference file
+params.T, params.R, params.W, params.M = IndexTextOpen(args.text)
+
+# Open on-disk embeddings for L2 distances
+if args.embed:
+    params.E = SplitOpen(args.embed, ['en'],
+                         args.dim, np.float32, verbose=False)
+
+# load FAISS index
+params.idx = IndexLoad(args.index, args.nprobe)
+
+# load sentence encoder
+params.enc = EncodeLoad(args)
+
+
+margin_methods = {'absolute': MarginAbs,
+                  'distance': MarginDist,
+                  'ratio': MarginRatio}
+
+with tempfile.TemporaryDirectory() as tmpdir:
+    ifile = args.input
+    if args.token_lang != '--':
+        ifile = os.path.join(tmpdir, 'tok')
+        Token(args.input,
+              ifile,
+              lang=args.token_lang,
+              romanize=True if args.token_lang == 'el' else False,
+              lower_case=True, gzip=False,
+              verbose=args.verbose, over_write=False)
+
+    if args.bpe_codes:
+        bpe_file = os.path.join(tmpdir, 'bpe')
+        BPEfastApply(ifile,
+                     bpe_file,
+                     args.bpe_codes,
+                     verbose=args.verbose, over_write=False)
+        ifile = bpe_file
+
+    print(' - processing (batch size is {:d})'.format(args.buffer_size))
+    ifp = open(ifile, 'r', encoding=args.encoding, errors='surrogateescape')
+    if args.output == '--':
+        ofp = sys.stdout
+    else:
+        ofp = open(args.output, 'w', encoding=args.encoding, errors='surrogateescape')
+    stats = namedtuple('stats', 'ns np')
+    stats.nbs = 0
+    stats.nbp = 0
+    t = time.time()
+    for sentences in buffered_read(ifp, args.buffer_size):
+        embed = params.enc.encode_sentences(sentences)
+        faiss.normalize_L2(embed)
+        # call function for selected margin method
+        margin_methods.get(args.margin)(embed, ofp, params, args, stats)
+        if stats.nbs % 1000 == 0:
+            print('\r - {:d} sentences {:d} paraphrases'
+                  .format(stats.nbs, stats.nbp), end='')
+
+    ifp.close()
+    if args.output != '--':
+        ofp.close()
+    print('\r - {:d} sentences {:d} paraphrases'
+          .format(stats.nbs, stats.nbp), end='')
+    EncodeTime(t)
laser/source/pxsim.py
ADDED
@@ -0,0 +1,251 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# LASER  Language-Agnostic SEntence Representations
+# is a toolkit to calculate multilingual sentence embeddings
+# and to use them for various tasks such as document classification,
+# and bitext filtering
+#
+# --------------------------------------------------------
+#
+# Tool to calculate the dual approach multilingual similarity error rate (P-xSIM)
+
+import typing as tp
+from pathlib import Path
+
+import faiss
+import numpy as np
+import torch
+from scipy.special import softmax
+from sklearn.metrics.pairwise import cosine_similarity
+from stopes.eval.auto_pcp.audio_comparator import Comparator, get_model_pred
+from xsim import Margin, score_margin
+
+
+def get_neighbors(
+    x: np.ndarray, y: np.ndarray, k: int, margin: str
+) -> tp.Tuple[np.ndarray, np.ndarray, int]:
+    x_copy = x.astype(np.float32).copy()
+    y_copy = y.astype(np.float32).copy()
+    nbex, dim = x.shape
+    # create index
+    idx_x = faiss.IndexFlatIP(dim)
+    idx_y = faiss.IndexFlatIP(dim)
+    # L2 normalization needed for cosine distance
+    faiss.normalize_L2(x_copy)
+    faiss.normalize_L2(y_copy)
+    idx_x.add(x_copy)
+    idx_y.add(y_copy)
+    if margin == Margin.ABSOLUTE.value:
+        scores, indices = idx_y.search(x_copy, k)
+    else:
+        # return cosine similarity and indices of k closest neighbors
+        Cos_xy, Idx_xy = idx_y.search(x_copy, k)
+        Cos_yx, Idx_yx = idx_x.search(y_copy, k)
+
+        # average cosines
+        Avg_xy = Cos_xy.mean(axis=1)
+        Avg_yx = Cos_yx.mean(axis=1)
+
+        scores = score_margin(Cos_xy, Idx_xy, Avg_xy, Avg_yx, margin, k)
+        indices = Idx_xy
+    return scores, indices, nbex
+
+
+def get_cosine_scores(src_emb: np.ndarray, neighbor_embs: np.ndarray) -> np.ndarray:
+    assert src_emb.shape[0] == neighbor_embs.shape[1]
+    src_embs = np.repeat(
+        np.expand_dims(src_emb, axis=0), neighbor_embs.shape[0], axis=0
+    )
+    cosine_scores = cosine_similarity(src_embs, neighbor_embs).diagonal()
+    return cosine_scores
+
+
+def get_comparator_scores(
+    src_emb: np.ndarray,
+    neighbor_embs: np.ndarray,
+    comparator_model: tp.Any,
+    symmetrize_comparator: bool,
+) -> np.ndarray:
+    src_embs = np.repeat(
+        np.expand_dims(src_emb, axis=0), neighbor_embs.shape[0], axis=0
+    )
+    a = torch.from_numpy(src_embs).unsqueeze(1)  # restore depth dim
+    b = torch.from_numpy(neighbor_embs).unsqueeze(1)
+    res = get_comparator_preds(a, b, comparator_model, symmetrize_comparator)
+    scores_softmax = softmax(res)
+    return np.array(scores_softmax)
+
+
+def get_comparator_preds(
+    src_emb: np.ndarray, tgt_emb: np.ndarray, model: tp.Any, symmetrize: bool
+):
+    preds = (
+        get_model_pred(
+            model,
+            src=src_emb[:, 0],
+            mt=tgt_emb[:, 0],
+            use_gpu=model.use_gpu,
+            batch_size=1,
+        )[:, 0]
+        .cpu()
+        .numpy()
+    )
+    if symmetrize:
+        preds2 = (
+            get_model_pred(
+                model,
+                src=tgt_emb[:, 0],
+                mt=src_emb[:, 0],
+                use_gpu=model.use_gpu,
+                batch_size=1,
+            )[:, 0]
+            .cpu()
+            .numpy()
+        )
+        preds = (preds2 + preds) / 2
+    return preds
+
+
+def get_blended_predictions(
+    alpha: float,
+    nbex: int,
+    margin_scores: np.ndarray,
+    x_aux: np.ndarray,
+    y_aux: np.ndarray,
+    neighbor_indices: np.ndarray,
+    comparator_model: tp.Optional[tp.Any] = None,
+    symmetrize_comparator: bool = False,
+) -> list[int]:
+    predictions = []
+    for src_index in range(nbex):
+        neighbors = neighbor_indices[src_index]
+        neighbor_embs = y_aux[neighbors].astype(np.float32)
+        src_emb = x_aux[src_index].astype(np.float32)
+        aux_scores = (
+            get_comparator_scores(
+                src_emb, neighbor_embs, comparator_model, symmetrize_comparator
+            )
+            if comparator_model
+            else get_cosine_scores(src_emb, neighbor_embs)
+        )
+        assert margin_scores[src_index].shape == aux_scores.shape
+        blended_scores = alpha * margin_scores[src_index] + (1 - alpha) * aux_scores
+        blended_neighbor_idx = blended_scores.argmax()
+        predictions.append(neighbors[blended_neighbor_idx])
+    return predictions
+
+
+def PxSIM(
+    x: np.ndarray,
+    y: np.ndarray,
+    x_aux: np.ndarray,
+    y_aux: np.ndarray,
+    alpha: float,
+    margin: str = Margin.RATIO.value,
+    k: int = 16,
+    comparator_path: tp.Optional[Path] = None,
+    symmetrize_comparator: bool = False,
+) -> tp.Tuple[int, int, list[int]]:
+    """
+    Parameters
+    ----------
+    x : np.ndarray
+        source-side embedding array
+    y : np.ndarray
+        target-side embedding array
+    x_aux : np.ndarray
+        source-side embedding array using auxiliary model
+    y_aux : np.ndarray
+        target-side embedding array using auxiliary model
+    alpha : float
+        parameter to weight blended score
+    margin : str
+        margin scoring function (e.g. ratio, absolute, distance)
+    k : int
+        number of neighbors in k-nn search
+    comparator_path : Path
+        path to AutoPCP model config
+    symmetrize_comparator : bool
+        whether to symmetrize the comparator predictions
+
+    Returns
+    -------
+    err : int
+        Number of errors
+    nbex : int
+        Number of examples
+    preds : list[int]
+        List of (index-based) predictions
+    """
+    assert Margin.has_value(margin), f"Margin type: {margin}, is not supported."
+    comparator_model = Comparator.load(comparator_path) if comparator_path else None
|
186 |
+
# get margin-based nearest neighbors
|
187 |
+
margin_scores, neighbor_indices, nbex = get_neighbors(x, y, k=k, margin=margin)
|
188 |
+
preds = get_blended_predictions(
|
189 |
+
alpha,
|
190 |
+
nbex,
|
191 |
+
margin_scores,
|
192 |
+
x_aux,
|
193 |
+
y_aux,
|
194 |
+
neighbor_indices,
|
195 |
+
comparator_model,
|
196 |
+
symmetrize_comparator,
|
197 |
+
)
|
198 |
+
err = sum([idx != pred for idx, pred in enumerate(preds)])
|
199 |
+
print(f"P-xSIM error: {100 * (err / nbex):.2f}")
|
200 |
+
return err, nbex, preds
|
201 |
+
|
202 |
+
|
203 |
+
def load_embeddings(
|
204 |
+
infile: Path, dim: int, fp16: bool = False, numpy_header: bool = False
|
205 |
+
) -> np.ndarray:
|
206 |
+
assert infile.exists(), f"file: {infile} does not exist."
|
207 |
+
if numpy_header:
|
208 |
+
return np.load(infile)
|
209 |
+
emb = np.fromfile(infile, dtype=np.float16 if fp16 else np.float32)
|
210 |
+
num_examples = emb.shape[0] // dim
|
211 |
+
emb.resize(num_examples, dim)
|
212 |
+
if fp16:
|
213 |
+
emb = emb.astype(np.float32) # faiss currently only supports fp32
|
214 |
+
return emb
|
215 |
+
|
216 |
+
|
217 |
+
def run(
|
218 |
+
src_emb: Path,
|
219 |
+
tgt_emb: Path,
|
220 |
+
src_aux_emb: Path,
|
221 |
+
tgt_aux_emb: Path,
|
222 |
+
alpha: float,
|
223 |
+
margin: str = Margin.RATIO.value,
|
224 |
+
k: int = 16,
|
225 |
+
emb_fp16: bool = False,
|
226 |
+
aux_emb_fp16: bool = False,
|
227 |
+
emb_dim: int = 1024,
|
228 |
+
aux_emb_dim: int = 1024,
|
229 |
+
numpy_header: bool = False,
|
230 |
+
comparator_path: tp.Optional[Path] = None,
|
231 |
+
symmetrize_comparator: bool = False,
|
232 |
+
prediction_savepath: tp.Optional[Path] = None,
|
233 |
+
) -> None:
|
234 |
+
x = load_embeddings(src_emb, emb_dim, emb_fp16, numpy_header)
|
235 |
+
y = load_embeddings(tgt_emb, emb_dim, emb_fp16, numpy_header)
|
236 |
+
x_aux = load_embeddings(src_aux_emb, aux_emb_dim, aux_emb_fp16, numpy_header)
|
237 |
+
y_aux = load_embeddings(tgt_aux_emb, aux_emb_dim, aux_emb_fp16, numpy_header)
|
238 |
+
assert (x.shape == y.shape) and (x_aux.shape == y_aux.shape)
|
239 |
+
_, _, preds = PxSIM(
|
240 |
+
x, y, x_aux, y_aux, alpha, margin, k, comparator_path, symmetrize_comparator
|
241 |
+
)
|
242 |
+
if prediction_savepath:
|
243 |
+
with open(prediction_savepath, "w") as outf:
|
244 |
+
for pred in preds:
|
245 |
+
print(pred, file=outf)
|
246 |
+
|
247 |
+
|
248 |
+
if __name__ == "__main__":
|
249 |
+
import func_argparse
|
250 |
+
|
251 |
+
func_argparse.main()
|
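
The `run()` entry point above reads embeddings from disk via `load_embeddings`; for illustration only, a minimal sketch of calling `PxSIM` directly on in-memory arrays (the random data, the shapes, `alpha=0.8` and `k=4` are assumptions, and the `stopes`/`faiss`/`torch` dependencies must be importable):

```python
# Illustrative sketch only, not part of the repository: P-xSIM on toy arrays.
# Without a comparator model, the auxiliary score falls back to cosine similarity.
import numpy as np
from pxsim import PxSIM  # assumes pxsim.py and its dependencies are on the path

rng = np.random.default_rng(0)
n, dim, aux_dim = 100, 1024, 768  # sizes are arbitrary assumptions
x = rng.standard_normal((n, dim), dtype=np.float32)              # primary source embeddings
y = x + 0.01 * rng.standard_normal((n, dim), dtype=np.float32)   # roughly aligned target side
x_aux = rng.standard_normal((n, aux_dim), dtype=np.float32)      # auxiliary-model embeddings
y_aux = x_aux + 0.01 * rng.standard_normal((n, aux_dim), dtype=np.float32)

err, nbex, preds = PxSIM(x, y, x_aux, y_aux, alpha=0.8, margin="ratio", k=4)
print(err, nbex, preds[:5])
```
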
laser/source/sent_classif.py
ADDED
@@ -0,0 +1,273 @@
#!/usr/bin/python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
#
# Simple MLP classifier for sentence embeddings


import argparse
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils


################################################

def LoadData(bdir, dfn, lfn, dim=1024, bsize=32, shuffle=False, quiet=False):
    x = np.fromfile(bdir + dfn, dtype=np.float32, count=-1)
    x.resize(x.shape[0] // dim, dim)

    lbl = np.loadtxt(bdir + lfn, dtype=np.int32)
    lbl.reshape(lbl.shape[0], 1)
    if not quiet:
        print(' - read {:d}x{:d} elements in {:s}'.format(x.shape[0], x.shape[1], dfn))
        print(' - read {:d} labels [{:d},{:d}] in {:s}'
              .format(lbl.shape[0], lbl.min(), lbl.max(), lfn))

    D = data_utils.TensorDataset(torch.from_numpy(x), torch.from_numpy(lbl))
    loader = data_utils.DataLoader(D, batch_size=bsize, shuffle=shuffle)
    return loader


################################################

class Net(nn.Module):
    def __init__(self, idim=1024, odim=2, nhid=None,
                 dropout=0.0, gpu=0, activation='TANH'):
        super(Net, self).__init__()
        self.gpu = gpu
        modules = []

        print(' - mlp {:d}'.format(idim), end='')
        if len(nhid) > 0:
            if dropout > 0:
                modules.append(nn.Dropout(p=dropout))
            nprev = idim
            for nh in nhid:
                if nh > 0:
                    modules.append(nn.Linear(nprev, nh))
                    nprev = nh
                    if activation == 'TANH':
                        modules.append(nn.Tanh())
                        print('-{:d}t'.format(nh), end='')
                    elif activation == 'RELU':
                        modules.append(nn.ReLU())
                        print('-{:d}r'.format(nh), end='')
                    else:
                        raise Exception(f'Unrecognized activation {activation}')
                    if dropout > 0:
                        modules.append(nn.Dropout(p=dropout))
            modules.append(nn.Linear(nprev, odim))
            print('-{:d}, dropout={:.1f}'.format(odim, dropout))
        else:
            modules.append(nn.Linear(idim, odim))
            print(' - mlp {:d}-{:d}'.format(idim, odim))
        self.mlp = nn.Sequential(*modules)
        # Softmax is included in CrossEntropyLoss!

        if self.gpu >= 0:
            self.mlp = self.mlp.cuda()

    def forward(self, x):
        return self.mlp(x)

    def TestCorpus(self, dset, name=' Dev', nlbl=4):
        correct = 0
        total = 0
        self.mlp.train(mode=False)
        corr = np.zeros(nlbl, dtype=np.int32)
        for data in dset:
            X, Y = data
            Y = Y.long()
            if self.gpu >= 0:
                X = X.cuda()
                Y = Y.cuda()
            outputs = self.mlp(X)
            _, predicted = torch.max(outputs.data, 1)
            total += Y.size(0)
            correct += (predicted == Y).int().sum()
            for i in range(nlbl):
                corr[i] += (predicted == i).int().sum()

        print(' | {:4s}: {:5.2f}%'
              .format(name, 100.0 * correct.float() / total), end='')
        print(' | classes:', end='')
        for i in range(nlbl):
            print(' {:5.2f}'.format(100.0 * corr[i] / total), end='')

        return correct, total


################################################

parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description="Simple sentence classifier")

# Data
parser.add_argument(
    '--base-dir', '-b', type=str, required=True, metavar='PATH',
    help="Directory with all the data files")
parser.add_argument(
    '--save', '-s', type=str, required=False, metavar='PATH', default="",
    help="File in which to save best network")
parser.add_argument(
    '--train', '-t', type=str, required=True, metavar='STR',
    help="Name of training corpus")
parser.add_argument(
    '--train-labels', '-T', type=str, required=True, metavar='STR',
    help="Name of training corpus (labels)")
parser.add_argument(
    '--dev', '-d', type=str, required=True, metavar='STR',
    help="Name of development corpus")
parser.add_argument(
    '--dev-labels', '-D', type=str, required=True, metavar='STR',
    help="Name of development corpus (labels)")
parser.add_argument(
    '--test', '-e', type=str, required=True, metavar='STR',
    help="Name of test corpus without language extension")
parser.add_argument(
    '--test-labels', '-E', type=str, required=True, metavar='STR',
    help="Name of test corpus without language extension (labels)")
parser.add_argument(
    '--lang', '-L', nargs='+', default=None,
    help="List of languages to test on")

# network definition
parser.add_argument(
    "--dim", "-m", type=int, default=1024,
    help="Dimension of sentence embeddings")
parser.add_argument(
    '--nhid', '-n', type=int, default=[0], nargs='+',
    help="List of hidden layer(s) dimensions")
parser.add_argument(
    "--nb-classes", "-c", type=int, default=2,
    help="Number of output classes")
parser.add_argument(
    '--dropout', '-o', type=float, default=0.0, metavar='FLOAT',
    help="Value of dropout")
parser.add_argument(
    '--nepoch', '-N', type=int, default=100, metavar='INT',
    help="Number of epochs")
parser.add_argument(
    '--bsize', '-B', type=int, default=128, metavar='INT',
    help="Batch size")
parser.add_argument(
    '--seed', '-S', type=int, default=123456789, metavar='INT',
    help="Initial random seed")
parser.add_argument(
    '--lr', type=float, default=0.001, metavar='FLOAT',
    help='Learning rate')
parser.add_argument(
    '--wdecay', type=float, default=0.0, metavar='FLOAT',
    help='Weight decay')
parser.add_argument(
    '--gpu', '-g', type=int, default=-1, metavar='INT',
    help="GPU id (-1 for CPU)")
args = parser.parse_args()

print(' - base directory: {}'.format(args.base_dir))
args.base_dir = args.base_dir + "/"

train_loader = LoadData(args.base_dir, args.train, args.train_labels,
                        dim=args.dim, bsize=args.bsize, shuffle=True)

dev_loader = LoadData(args.base_dir, args.dev, args.dev_labels,
                      dim=args.dim, bsize=args.bsize, shuffle=False)

# set GPU and random seed
torch.cuda.set_device(args.gpu)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
print(" - setting seed to %d" % args.seed)

# create network
net = Net(idim=args.dim, odim=args.nb_classes,
          nhid=args.nhid, dropout=args.dropout, gpu=args.gpu)
if args.gpu >= 0:
    criterion = nn.CrossEntropyLoss().cuda()
else:
    criterion = nn.CrossEntropyLoss()

# optimizer = optim.Adam(net.parameters(), weight_decay=0.0)
# default: pytorch/optim/adam.py
#   Py0.4: lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False
#   Py1.0: lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False
optimizer = optim.Adam(net.parameters(),
                       lr=args.lr,
                       weight_decay=args.wdecay,
                       betas=(0.9, 0.999),
                       eps=1e-8,
                       amsgrad=False)

corr_best = 0
# loop multiple times over the dataset
for epoch in range(args.nepoch):

    loss_epoch = 0.0
    print('Ep {:4d}'.format(epoch), end='')
    # for inputs, labels in train_loader:
    for i, data in enumerate(train_loader, 0):
        # get the inputs
        inputs, labels = data
        labels = labels.long()
        if args.gpu >= 0:
            inputs = inputs.cuda()
            labels = labels.cuda()

        # zero the parameter gradients
        net.zero_grad()

        # forward + backward + optimize
        net.train(mode=True)
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        loss_epoch += loss.item()

    print(' | loss {:e}'.format(loss_epoch), end='')

    corr, nbex = net.TestCorpus(dev_loader, 'Dev')
    if corr >= corr_best:
        print(' | saved')
        corr_best = corr
        net_best = copy.deepcopy(net)
    else:
        print('')


if 'net_best' in globals():
    if args.save != '':
        torch.save(net_best.cpu(), args.save)
    print('Best Dev: {:d} = {:5.2f}%'
          .format(corr_best, 100.0 * corr_best.float() / nbex))

if args.gpu >= 0:
    net_best = net_best.cuda()

# test on (several) languages
for l in args.lang:
    test_loader = LoadData(args.base_dir, args.test + '.' + l,
                           args.test_labels + '.' + l,
                           dim=args.dim, bsize=args.bsize,
                           shuffle=False, quiet=True)
    print('Ep best | Eval Test lang {:s}'.format(l), end='')
    net_best.TestCorpus(test_loader, 'Test')
    print('')
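
`LoadData` expects a flat float32 binary file of embeddings and a text file with one integer label per line; a minimal sketch of producing toy inputs in that format (file names and values are made up):

```python
# Illustrative sketch only: write toy inputs in the format LoadData() reads
# (raw float32 embeddings via np.fromfile, integer labels via np.loadtxt).
import numpy as np

dim = 1024
emb = np.random.rand(200, dim).astype(np.float32)   # 200 toy sentence embeddings
labels = np.random.randint(0, 2, size=200)          # toy binary labels

emb.tofile("train.enc")                    # flat float32 binary, no header
np.savetxt("train.lbl", labels, fmt="%d")  # one label per line
```
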
laser/source/similarity_search.py
ADDED
@@ -0,0 +1,113 @@
#!/usr/bin/python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
#
# Quora Q&A paraphrase detection

import os
import sys
import argparse
import faiss
import numpy as np

# get environment
assert os.environ.get('LASER'), 'Please set the environment variable LASER'
LASER = os.environ['LASER']

sys.path.append(LASER + '/source')
sys.path.append(LASER + '/source/lib')
from embed import SentenceEncoder, EncodeLoad, EncodeFile
from text_processing import Token, BPEfastApply
from indexing import IndexCreate, IndexSearchMultiple, IndexPrintConfusionMatrix

###############################################################################

parser = argparse.ArgumentParser('LASER: similarity search')
parser.add_argument('--base-dir', type=str, default='.',
                    help='Base directory for all data files')
parser.add_argument('--data', type=str, required=True,
                    help='Directory and basename of input data (language name will be added)')
parser.add_argument('--output', type=str, required=True,
                    help='Directory and basename of created data (language name will be added)')
parser.add_argument('--textual', action='store_true',
                    help='Use textual comparison instead of indices')
parser.add_argument(
    '--lang', '-l', nargs='+', required=True,
    help="List of languages to test on")

# preprocessing
parser.add_argument('--bpe-codes', type=str, required=True,
                    help='Fast BPE codes and vocabulary')
parser.add_argument('--verbose', action='store_true',
                    help='Detailed output')

# options for encoder
parser.add_argument('--encoder', type=str, required=True,
                    help='encoder to be used')
parser.add_argument('--buffer-size', type=int, default=100,
                    help='Buffer size (sentences)')
parser.add_argument('--max-tokens', type=int, default=12000,
                    help='Maximum number of tokens to process in a batch')
parser.add_argument('--max-sentences', type=int, default=None,
                    help='Maximum number of sentences to process in a batch')
parser.add_argument('--cpu', action='store_true',
                    help='Use CPU instead of GPU')

args = parser.parse_args()

print('LASER: similarity search')

print('\nProcessing:')
all_texts = []
if args.textual:
    print(' - using textual comparison')
    for l in args.lang:
        with open(os.path.join(args.base_dir, args.data + '.' + l),
                  encoding='utf-8', errors='surrogateescape') as f:
            texts = f.readlines()
            print(' - {:s}: {:d} lines'.format(args.data + '.' + l, len(texts)))
            all_texts.append(texts)

enc = EncodeLoad(args)

out_dir = os.path.dirname(args.output)
if not os.path.exists(out_dir):
    print(' - creating directory {}'.format(out_dir))
    os.mkdir(out_dir)

all_data = []
all_index = []
for l in args.lang:
    Token(os.path.join(args.base_dir, args.data + '.' + l),
          os.path.join(args.base_dir, args.output + '.tok.' + l),
          lang=l,
          romanize=True if l == 'el' else False,
          lower_case=True,
          verbose=args.verbose, over_write=False)
    BPEfastApply(os.path.join(args.base_dir, args.output + '.tok.' + l),
                 os.path.join(args.base_dir, args.output + '.bpe.' + l),
                 args.bpe_codes,
                 verbose=args.verbose, over_write=False)
    EncodeFile(enc,
               os.path.join(args.base_dir, args.output + '.bpe.' + l),
               os.path.join(args.base_dir, args.output + '.enc.' + l),
               verbose=args.verbose, over_write=False)
    d, idx = IndexCreate(os.path.join(args.base_dir, args.output + '.enc.' + l),
                         'FlatL2',
                         verbose=args.verbose, save_index=False)
    all_data.append(d)
    all_index.append(idx)

err = IndexSearchMultiple(all_data, all_index, args.lang, texts=all_texts,
                          verbose=False, print_errors=False)
IndexPrintConfusionMatrix(err, args.lang)
laser/source/xsim.py
ADDED
@@ -0,0 +1,165 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
#
# Tool to calculate multilingual similarity error rate (xSIM)

import faiss
import numpy as np
import typing as tp
import os
import json
from enum import Enum


class Margin(Enum):
    RATIO = "ratio"
    DISTANCE = "distance"
    ABSOLUTE = "absolute"

    @classmethod
    def has_value(cls, value):
        return value in cls._value2member_map_


def xSIM(
    x: tp.Union[str, np.ndarray],
    y: tp.Union[str, np.ndarray],
    margin: str = Margin.RATIO.value,
    k: int = 4,
    dim: int = 1024,
    fp16: bool = False,
    eval_text: str = None,
    augmented_json: str = None,
) -> tp.Tuple[int, int, tp.Dict[str, int]]:
    assert Margin.has_value(margin), f"Margin type: {margin}, is not supported."
    if not isinstance(x, np.ndarray):
        x = _load_embeddings(x, dim, fp16)
    if not isinstance(y, np.ndarray):
        y = _load_embeddings(y, dim, fp16)
    # calculate xSIM error
    return calculate_error(x, y, margin, k, eval_text, augmented_json)


def _load_embeddings(infile: str, dim: int, fp16: bool = False) -> np.ndarray:
    assert os.path.isfile(infile), f"file: {infile} does not exist."
    emb = np.fromfile(infile, dtype=np.float16 if fp16 else np.float32)
    num_examples = emb.shape[0] // dim
    emb.resize(num_examples, dim)
    if fp16:
        emb = emb.astype(np.float32)  # faiss currently only supports fp32
    return emb


def score_margin(
    Dxy: np.ndarray,
    Ixy: np.ndarray,
    Ax: np.ndarray,
    Ay: np.ndarray,
    margin: str,
    k: int,
) -> np.ndarray:
    nbex = Dxy.shape[0]
    scores = np.zeros((nbex, k))
    for i in range(nbex):
        for j in range(k):
            jj = Ixy[i, j]
            a = Dxy[i, j]
            b = (Ax[i] + Ay[jj]) / 2
            if margin == Margin.RATIO.value:
                scores[i, j] = a / b
            else:  # distance margin
                scores[i, j] = a - b
    return scores


def _score_knn(x: np.ndarray, y: np.ndarray, k: int, margin: str) -> np.ndarray:
    nbex, dim = x.shape
    # create index
    idx_x = faiss.IndexFlatIP(dim)
    idx_y = faiss.IndexFlatIP(dim)
    # L2 normalization needed for cosine distance
    faiss.normalize_L2(x)
    faiss.normalize_L2(y)
    idx_x.add(x)
    idx_y.add(y)
    if margin == Margin.ABSOLUTE.value:
        scores, indices = idx_y.search(x, 1)
    else:
        # return cosine similarity and indices of k closest neighbors
        Cos_xy, Idx_xy = idx_y.search(x, k)
        Cos_yx, Idx_yx = idx_x.search(y, k)

        # average cosines
        Avg_xy = Cos_xy.mean(axis=1)
        Avg_yx = Cos_yx.mean(axis=1)

        scores = score_margin(Cos_xy, Idx_xy, Avg_xy, Avg_yx, margin, k)

        # find best
        best = scores.argmax(axis=1)
        indices = np.zeros((nbex, 1), dtype=np.int32)
        for i in range(nbex):
            indices[i] = Idx_xy[i, best[i]]
    return indices


def get_transform(augmented_json, closest_neighbor, src):
    if (
        closest_neighbor in augmented_json
        and augmented_json[closest_neighbor]["src"] == src
    ):
        return augmented_json[closest_neighbor]["errtype"]
    return "Misaligned"


def calculate_error(
    x: np.ndarray,
    y: np.ndarray,
    margin: str = None,
    k: int = 4,
    eval_text: str = None,
    augmented_json: str = None,
) -> tp.Tuple[int, int, tp.Dict[str, int]]:
    if augmented_json:
        with open(augmented_json) as f:
            augmented_json = json.load(f)
        assert (
            x.shape[0] < y.shape[0]
        ), f"Shape mismatch: source {x.shape[0]} >= target {y.shape[0]}"
    else:
        assert (
            x.shape == y.shape
        ), f"number of source {x.shape} / target {y.shape} shapes mismatch"
    nbex = x.shape[0]
    augmented_report = {}

    # for each x calculate the highest scoring neighbor from y
    closest_neighbor = _score_knn(x, y, k, margin)

    if eval_text:  # calc textual error
        lines = open(eval_text, encoding="utf-8", errors="surrogateescape").readlines()
        err = 0
        for ex in range(nbex):
            if lines[ex] != lines[closest_neighbor[ex, 0]]:
                err += 1
                if augmented_json:
                    transform = get_transform(
                        augmented_json,
                        lines[closest_neighbor[ex, 0]].strip(),
                        lines[ex].strip(),
                    )
                    augmented_report[transform] = augmented_report.get(transform, 0) + 1
    else:  # calc index error
        ref = np.linspace(0, nbex - 1, nbex).astype(int)  # [0, nbex)
        err = nbex - np.equal(closest_neighbor.reshape(nbex), ref).astype(int).sum()
    return err, nbex, augmented_report
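
For illustration, `xSIM` can also be called directly on NumPy arrays rather than embedding files; with an identical source and target side the index-based error should be zero (toy data, not from the repository):

```python
# Illustrative sketch only: index-based xSIM on toy in-memory embeddings.
import numpy as np
from xsim import xSIM

rng = np.random.default_rng(0)
x = rng.standard_normal((50, 1024)).astype(np.float32)
y = x.copy()  # a perfectly "aligned" target side

err, nbex, report = xSIM(x, y, margin="ratio", k=4)
print(f"xSIM error: {100 * err / nbex:.2f}%")  # expected: 0.00%
```
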
laser/tasks/CCMatrix/MatrixMine.pdf
ADDED
Binary file (39.8 kB).
laser/tasks/CCMatrix/README.md
ADDED
@@ -0,0 +1,39 @@
# CCMatrix: Mining Billions of High-Quality Parallel Sentences on the WEB

## Parallel data

We show that margin-based bitext mining in LASER's multilingual sentence space can be applied to monolingual corpora of billions of sentences to produce high quality aligned translation data. We use thirty-two snapshots of a curated common crawl corpus [1] totaling 69 billion unique sentences. Using one unified approach for 80 languages, we were able to mine 10.8 billion parallel sentences, out of which only 2.9 billion are aligned with English.

## Download

We open-source our scripts in this directory so that others may reproduce the data, evaluation and results reported in the CCMatrix paper.
```
pip3 install cc_net
python3 dl_cc_matrix.py
```

Please cite references [2] and [3] if you use this data.


## Evaluation

We have assessed the quality of our mined data with bilingual and multilingual models.

* Bilingual models [2]: To evaluate the quality of the mined bitexts, we train NMT systems for most of the language pairs and evaluate them on TED, WMT and WAT test sets. Using our mined bitexts only and no human translated parallel data, we achieve a new state-of-the-art for a single system on the WMT'19 test set for translation between English and German, Russian and Chinese, as well as German/French. In particular, our English/German system outperforms the best single one by close to 4 BLEU points and is almost on par with the best WMT'19 evaluation system, which uses system combination and back-translation. We also achieve excellent results for distant language pairs like Russian/Japanese, outperforming the best submission at the 2019 Workshop on Asian Translation (WAT).

* Multilingual models [3]: CCMatrix data is used to train M2M-100, a large-scale Many-to-Many multilingual translation model. The thousands of directions we mine produce training data for direct translations without relying solely on English data. We mine using a novel strategy which exploits language groupings and bridge languages to avoid mining every possible direction while maintaining good accuracy. By training on this data and scaling model capacity through model parallelism and language-specific parameters, M2M-100 outperforms English-centric multilingual models trained on data where either the source or target language is English. The system improves over 10 BLEU on average compared to an English-centric baseline when translating directly between non-English directions. M2M-100 is competitive with bilingual models from WMT and improves over existing publicly available multilingual translation systems. To download the data, follow our instructions above. To download the models and reproduce the training, click [*here*](https://github.com/pytorch/fairseq/tree/master/examples/m2m_100).

Please note that additional data filtering was applied before training the M2M-100 model, see [3] for details.
Also, we have improved mining against English, which leads to more bitexts, in particular for mid- and low-resource languages.
This new data was not used for M2M-100.

## References

[1] Guillaume Wenzek, Marie-Anne Lachaux, Alexis Conneau, Vishrav Chaudhary, Francisco Guzmán, Armand Joulin and Edouard Grave,
    [*CCNet: Extracting High Quality Monolingual Datasets from Web Crawl Data*](https://arxiv.org/abs/1911.00359)

[2] Holger Schwenk, Guillaume Wenzek, Sergey Edunov, Edouard Grave and Armand Joulin,
    [*CCMatrix: Mining Billions of High-Quality Parallel Sentences on the WEB*](https://arxiv.org/abs/1911.04944)

[3] Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, and Armand Joulin,
    *Beyond English-Centric Multilingual Machine Translation*
laser/tasks/CCMatrix/dl_cc_matrix.py
ADDED
@@ -0,0 +1,338 @@
import contextlib
import gzip
import logging
import re
import subprocess
import tempfile
from collections import defaultdict
from pathlib import Path
from typing import Callable, Dict, Iterable, List, NamedTuple, Type

from cc_net.jsonql import open_remote_file, open_write
from cc_net.process_wet_file import CCSegmentsReader
from typing import Sequence
import functools
import multiprocessing

BUFFER_SIZE = "32G"
SORT_PARALLEL = 8

KNOWN_VERSIONS = ["v1.0.0", "v1.0.beta", "v1.0.alpha"]


class NormalizedBitextPtr(NamedTuple):
    lang_pair: str
    line_no: int
    segment: str
    digest: str
    ptr_start: int
    ptr_end: int
    score: float


class Bitext(NamedTuple):
    lang_pair: str
    line_no: int
    score: float
    text: str


class SimpleBitext(NamedTuple):
    line_no: int
    score: float
    text: str


WEB_PAT = re.compile(r"https?:[^ \n]* ")
WEB_REPL = "WEB "

WEB2_PAT = re.compile(r"https?:[^ \n]*\n")
WEB2_REPL = "WEB\n"


def clean_content(raw_content: str) -> str:
    # We need to clean all the content, because otherwise there is no way for
    # the user to know if we need to clean it or not.
    par = raw_content
    par = par.replace("</s>", ". ")
    par = par.replace("\t", " ")
    par = re.sub(WEB_PAT, WEB_REPL, par, count=0)
    par = re.sub(WEB2_PAT, WEB2_REPL, par, count=0)
    return par


def get_typed_parser(cls: Type) -> Callable:
    types = cls.__annotations__.values()

    def parser(line: str) -> NamedTuple:
        parts = line.rstrip("\n").split("\t")
        assert len(parts) == len(
            types
        ), f"Size mismatch: expected the following columns {cls.__annotations__}, got: {parts}"
        return cls(*(t(p) for t, p in zip(types, parts)))

    return parser


def open_read(file: Path) -> Iterable[str]:
    if file.suffix == ".gz":
        reader = gzip.open(file, "rt")
    else:
        reader = open(file, "rt")
    with reader as f:
        for line in f:
            yield line


def dl(outdir: Path = Path("data"), version: str = KNOWN_VERSIONS[0], parallelism: int = 8):
    """
    Download bitext pointers from FAIR dataset and extract corresponding CC snippets.
    - version: Specific version to download
    - outdir: Directory where the data should go. Files will be in {outdir}/{version}/raw/
    """
    assert version in KNOWN_VERSIONS, f"Unknown version {version}, chose from {KNOWN_VERSIONS}"
    metadata_dir = f"https://dl.fbaipublicfiles.com/laser/CCMatrix/{version}"
    file_list = [l.strip() for l in open_remote_file(metadata_dir + "/list.txt")]
    outdir.mkdir(exist_ok=True)
    outdir = outdir / version / "raw"
    outdir.mkdir(exist_ok=True, parents=True)

    dlf = functools.partial(dl_file, metadata_dir, outdir)
    # list(map(dlf, file_list))
    with multiprocessing.Pool(parallelism) as pool:
        pool.map(dlf, file_list)


def get_documents(segment: str) -> Dict[str, str]:
    return {d["digest"]: d["raw_content"] for d in CCSegmentsReader([segment])}


def dl_file(metadata_dir: str, outdir: Path, file: str):
    metadata = "/".join((metadata_dir, file))
    parser = get_typed_parser(NormalizedBitextPtr)
    found_bitext, missed_bitext, skipped_line = 0, 0, 0
    segment = ""
    segment_downloads: Dict[str, int] = defaultdict(int)
    raw_documents: Dict[str, str] = {}
    cleaned_documents: Dict[str, str] = {}

    outfile = outdir / file
    if outfile.exists():
        return
    o = FileWriterWithTmp(outfile)
    for i, line in enumerate(open_remote_file(metadata)):
        try:
            bitext: NormalizedBitextPtr = parser(line)
            # Add some more asserts in case the line is invalid but still parses
            assert bitext.segment.startswith("crawl-data/")
            assert bitext.digest.startswith("sha1:")
        except AssertionError:
            logging.error(f"Skipping line {i}: {line}")
            skipped_line += 1
            continue

        if not segment or bitext.segment != segment:
            segment = bitext.segment
            segment_downloads[segment] += 1
            # Load segment in RAM, purge document cache
            raw_documents = get_documents(segment)
            cleaned_documents = {}

        raw_doc = raw_documents.get(bitext.digest)
        if raw_doc is None:
            logging.error(f"Document not found: {bitext.digest} in {segment}")
            missed_bitext += 1
            continue

        clean_doc = cleaned_documents.get(bitext.digest)
        if clean_doc is None:
            clean_doc = clean_content(raw_doc)
            cleaned_documents[bitext.digest] = clean_doc

        text = clean_doc[bitext.ptr_start : bitext.ptr_end]
        score = getattr(bitext, "score", 0.0)
        bt = Bitext(bitext.lang_pair, bitext.line_no, score, text)
        print(*bt, sep="\t", file=o)
        found_bitext += 1  # count the bitext actually written out

    o.close(True)
    logging.info(f"Found {found_bitext} sentences, missed {missed_bitext} sentences.")
    if skipped_line > 0:
        logging.error(f"Skipped {skipped_line} unparsable lines")
    expected_dl = len(segment_downloads)
    actual_dl = sum(segment_downloads.values())

    if actual_dl != expected_dl:
        logging.error(
            f"Some segments were downloaded twice. Total dl: {actual_dl}, distinct dl: {expected_dl}"
        )


def _tmp(file: Path) -> Path:
    tmp_dir = file.parent
    prefix = file.name.split(".", 1)[0] + "."
    suffix = ".tmp." + file.name[len(prefix) :]
    _, tmp_path = tempfile.mkstemp(dir=tmp_dir, prefix=prefix, suffix=suffix)
    return Path(tmp_path)


class FileWriterWithTmp:
    def __init__(self, file: Path):
        self.file = file
        self.tmp_file = _tmp(file)
        # We don't want to make FileWriterWithTmp a ContextManager
        self.handle = open_write(self.tmp_file).__enter__()

    def write(self, data) -> int:
        return self.handle.write(data)

    def close(self, success: bool = False):
        self.handle.close()
        if success:
            self.tmp_file.rename(self.file)


def transpose_file(outdir: Path, file: Path) -> None:
    sentinel_file = file.with_suffix(".transposed")
    if sentinel_file.exists():
        return
    outputs: Dict[str, FileWriterWithTmp] = {}
    parser = get_typed_parser(Bitext)
    success = False
    try:
        for line in open_read(file):
            bt: Bitext = parser(line)
            lang_pair = bt.lang_pair
            if bt.lang_pair not in outputs:
                assert (
                    "/" in lang_pair
                ), f"Invalid lang pair '{lang_pair}' should be 'src-trg/src' or 'src-trg/trg'"
                (outdir / f"{lang_pair}").mkdir(exist_ok=True, parents=True)
                o = FileWriterWithTmp(outdir / f"{lang_pair}_{file.name}")
                outputs[lang_pair] = o
            simple_bt = SimpleBitext(bt.line_no, bt.score, bt.text)
            print(*simple_bt, sep="\t", file=outputs[lang_pair])
        success = True
    finally:
        for o in outputs.values():
            o.close(success)
        if success:
            sentinel_file.write_text("\n".join(str(o.file) for o in outputs.values()))
            # file.unlink()


def sort_files(outdir: Path, lang_pair_dir: Path, lang: str) -> Path:
    out = outdir / lang_pair_dir.name / f"{lang}.txt"
    if out.exists():
        return out

    files: List[Path] = []
    for f in lang_pair_dir.iterdir():
        if not f.suffix == ".gz":
            continue
        if f.name.split("_")[0] != lang:
            continue
        files.append(f)

    print(f"Found {len(files)} files for lang '{lang}' in {lang_pair_dir}: {files}")
    assert len(files) > 0

    (outdir / lang_pair_dir.name).mkdir(exist_ok=True, parents=True)
    tmp_out = _tmp(out)

    unzipped_files = []
    for f in files:
        subprocess.check_call(["gunzip", "-k", str(f)])
        unzipped_files.append(str(f)[:-3])

    sort_cmd = [
        "sort",
        "-nk1",
        f"--parallel={SORT_PARALLEL}",
        f"--buffer-size={BUFFER_SIZE}",
        "--output",
        str(tmp_out),
    ] + unzipped_files
    subprocess.check_call(sort_cmd)
    tmp_out.rename(out)
    return out


def finalize(
    outdir: Path = Path("data"), version: str = KNOWN_VERSIONS[0], pairs: Sequence[str] = []
) -> None:
    """From the downloaded raw text files, extract the bitexts, sorted by language pair.
    Assumes 'dl' has been run with the same outdir and version before.

    - version: Specific version to download
    - outdir: Directory where the data should go. Files will be in {outdir}/{version}/bitext/
    - pairs: List of language pairs you are interested in. Defaults to all.
    """
    raw_dir = outdir / version / "raw"
    if not raw_dir.is_dir():
        cmd = f"python {__file__} dl --outdir {outdir} --version {version}"
        assert raw_dir.is_dir(), f"Dir not found {raw_dir}. Did you run following command?\n{cmd}"

    raw_files = list(raw_dir.glob("*.gz"))
    split_dir = outdir / version / "split_by_lang"
    split_dir.mkdir(exist_ok=True, parents=True)
    tr = functools.partial(transpose_file, split_dir)
    with multiprocessing.Pool() as pool:
        pool.map(tr, raw_files)

    bitext_dir = outdir / version / "bitext"
    bitext_dir.mkdir(exist_ok=True, parents=True)
    if pairs:
        pair_dirs = []
        for pair in pairs:
            assert (
                len(pair.split("-")) == 2
            ), f"Invalid pair '{pair}', should be 'src-trg'"
            pair_dir = split_dir / pair
            assert (
                pair_dir.is_dir()
            ), f"Dir {pair_dir} not found for lang pair '{pair}'. Is the pair valid ?"
            pair_dirs.append(pair_dir)
    else:
        pair_dirs = [d for d in split_dir.iterdir() if d.is_dir()]

    for pair_dir in pair_dirs:
        src, trg = pair_dir.name.split("-")
        src_file = sort_files(bitext_dir, pair_dir, src)
        trg_file = sort_files(bitext_dir, pair_dir, trg)
        validate(src_file, trg_file)


def validate(src_file: Path, trg_file: Path) -> None:
    """Checks that the segments in the given batch are valid."""
    lines_src, lines_trg, found_pairs = 0, 0, 0
    parser = get_typed_parser(SimpleBitext)
    with open(src_file) as src_f, open(trg_file) as trg_f:
        src_l = src_f.readline()
        trg_l = trg_f.readline()
        while src_l and trg_l:
            src: SimpleBitext = parser(src_l)
            trg: SimpleBitext = parser(trg_l)
            if src.line_no <= trg.line_no:
                lines_src += 1
                src_l = src_f.readline()
            if trg.line_no <= src.line_no:
                lines_trg += 1
                trg_l = trg_f.readline()
            if trg.line_no == src.line_no:
                found_pairs += 1

    if found_pairs == lines_src and found_pairs == lines_trg:
        logging.info(
            f"Validated {src_file} and {trg_file}. Found {found_pairs} bitexts."
        )
    else:
        logging.error(
            f"Validated {src_file} and {trg_file}. "
            f"Found {found_pairs} bitexts, from {lines_src} in {src_file} and {lines_trg} in {trg_file}"
        )


if __name__ == "__main__":
    import func_argparse

    func_argparse.main(dl, finalize)
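
Since the script is driven by `func_argparse`, `dl` and `finalize` are exposed as subcommands (the `finalize` docstring shows the corresponding `python dl_cc_matrix.py dl ...` invocation); equivalently, the two steps can be called from Python. A sketch under the assumption that `cc_net` is installed and that the `de-en` pair was mined:

```python
# Illustrative sketch only: running the two pipeline steps from Python.
from pathlib import Path
from dl_cc_matrix import dl, finalize

dl(outdir=Path("data"), version="v1.0.0", parallelism=8)          # download pointers + CC snippets
finalize(outdir=Path("data"), version="v1.0.0", pairs=["de-en"])  # split by pair, sort and validate
```
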
laser/tasks/SentimentAnalysis/README.md
ADDED
@@ -0,0 +1,34 @@
# Laser Encoder: Sentiment Analysis

## Overview

This project demonstrates the application of the Laser Encoder tool for creating sentence embeddings in the context of sentiment analysis. The Laser Encoder is used to encode text data, and a sentiment analysis model is trained to predict the sentiment of the text.

## Getting Started

To run the notebook in Google Colab, click the "Open in Colab" button below:

[Open in Colab](https://colab.research.google.com/github/NIXBLACK11/LASER-fork/blob/Sentiment-analysis-laser/tasks/SentimentAnalysis/SentimentAnalysis.ipynb)

Also, check out the Hugging Face Space with the button below:

[Hugging Face Space](https://huggingface.co/spaces/NIXBLACK/SentimentAnalysis_LASER_)


## Example Usage

Run the example notebook: execute the provided Jupyter Notebook SentimentAnalysis.ipynb

```
jupyter notebook SentimentAnalysis.ipynb
```

## Customization

- Modify the model architecture, hyperparameters, and training settings in the neural network model section based on your requirements.
- Customize the sentiment mapping and handling of unknown sentiments in the data preparation section.

## Additional Notes
- Feel free to experiment with different models, embeddings, and hyperparameters to optimize performance.
- Ensure that the dimensions of embeddings and model inputs are compatible.
- Adapt the code based on your specific dataset and use case.
laser/tasks/SentimentAnalysis/SentimentAnalysis.ipynb
ADDED
The diff for this file is too large to render.
laser/tasks/WikiMatrix/README.md
ADDED
@@ -0,0 +1,93 @@
# WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia

The goal of this project is to mine for parallel sentences in the textual content of Wikipedia for all possible language pairs.


## Mined data
* 85 different languages, 1620 language pairs
* 134M parallel sentences, out of which 34M are aligned with English
* this [*table shows the amount of mined parallel sentences for most of the language pairs*](WikiMatrix-sizes.pdf)
* the mined bitexts are stored on AWS and can be downloaded with the following command:
```bash
wget https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.en-fr.tsv.gz
```
Replace "en-fr" with the ISO codes of the desired language pair.
The language pair must be in alphabetical order, e.g. "de-en" and not "en-de".
The list of available bitexts and their sizes are given in the file [*list_of_bitexts.txt*](list_of_bitexts.txt).
Please do **not loop over all files** since AWS implements some [*limitations*](https://dl.fbaipublicfiles.com/README) to avoid abuse.

Use this command if you want to download all 1620 language pairs in one tar file (but this is 65GB!):
```bash
wget https://dl.fbaipublicfiles.com/laser/WikiMatrix/WikiMatrix.v1.1620_language_pairs.tar
```

## Approach

We use LASER's bitext mining approach and encoder for 93 languages [2,3].
We do not use the inter-language links provided by Wikipedia,
but search over all Wikipedia articles of each language. We approach the
computational challenge to mine in almost 600 million sentences by using fast
indexing and similarity search with [*FAISS*](https://github.com/facebookresearch/faiss).
Prior to mining parallel sentences, we perform
sentence segmentation, deduplication and language identification.
Please see reference [1] for details.


## Data extraction and threshold optimization
We provide a tool to extract parallel texts from the TSV files:
```bash
python3 extract.py \
    --tsv WikiMatrix.en-fr.tsv.gz \
    --bitext WikiMatrix.en-fr.txt \
    --src-lang en --trg-lang fr \
    --threshold 1.04
```
One can specify the threshold on the margin score.
The higher the value, the more likely the sentences are mutual translations, but the less data one will get.
**A value of 1.04 seems to be a good choice for most language pairs.** Please see the analysis in the paper for
more information [1].

## Evaluation
To assess the quality of the mined bitexts, we trained neural MT systems on all language pairs
for which we were able to mine at least 25k parallel sentences (with a margin threshold of 1.04).
We trained systems in both directions, source to target and target to source, and report BLEU scores
on the [*TED test*](https://github.com/neulab/word-embeddings-for-nmt) set proposed in [4].
This totals 1886 different NMT systems.
This [*table shows the BLEU scores for the most frequent language pairs*](WikiMatrix-bleu.pdf).
We achieve BLEU scores over 30 for several language pairs.

The goal is not to build state-of-the-art systems for each language pair, but
to get an indication of the quality of the automatically mined data. These
BLEU scores should of course be appreciated in the context of the sizes of the
mined corpora.

Obviously, we cannot exclude that the
provided data contains some wrong alignments even though the margin is large.
Finally, we would like to point out that we run our approach on all available
languages in Wikipedia, independently of the quality of LASER's sentence
embeddings for each one.


## License

The mined data is distributed under the Creative Commons Attribution-ShareAlike license.

Please cite reference [1] if you use this data.

## References

[1] Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman,
    [*WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia*](https://arxiv.org/abs/1907.05791)
    arXiv, July 11 2019.

[2] Mikel Artetxe and Holger Schwenk,
    [*Margin-based Parallel Corpus Mining with Multilingual Sentence Embeddings*](https://arxiv.org/abs/1811.01136)
    arXiv, Nov 3 2018.

[3] Mikel Artetxe and Holger Schwenk,
    [*Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond*](https://arxiv.org/abs/1812.10464)
    arXiv, Dec 26 2018.

[4] Ye Qi, Devendra Sachan, Matthieu Felix, Sarguna Padmanabhan and Graham Neubig,
    [*When and Why Are Pre-Trained Word Embeddings Useful for Neural Machine Translation?*](https://www.aclweb.org/anthology/papers/N/N18/N18-2084/)
    NAACL, pages 529-535, 2018.
laser/tasks/WikiMatrix/WikiMatrix-bleu.pdf
ADDED
Binary file (54.3 kB).
laser/tasks/WikiMatrix/WikiMatrix-sizes.pdf
ADDED
Binary file (60.3 kB).
laser/tasks/WikiMatrix/extract.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
#
# Tool to extract a subset of the mined bitexts from a tsv.gz file

import os
import sys
import gzip
import argparse

###############################################################################
#
# Main
#
###############################################################################

parser = argparse.ArgumentParser(description='Tool to extract bitext from the WikiMatrix')
parser.add_argument('--encoding', default='utf-8',
                    help='character encoding for input/output')
parser.add_argument('--tsv', type=str, required=True,
                    help='File with mined bitexts')
parser.add_argument('--bitext', type=str, required=True,
                    help='Prefix for the output text files (language code is appended)')
parser.add_argument('--src-lang', type=str, required=True,
                    help='Source language')
parser.add_argument('--trg-lang', type=str, required=True,
                    help='Target language')
parser.add_argument('--threshold', type=float, default=1.05,
                    help='Threshold on margin score')
parser.add_argument('--nb-sents', type=int, default=999999999,
                    help='Maximal number of sentences')
parser.add_argument('--nb-words-src', type=int, default=999999999,
                    help='Maximal number of total words in the source language')
parser.add_argument('--nb-words-trg', type=int, default=999999999,
                    help='Maximal number of total words in the target language')
args = parser.parse_args()

print('Tool to extract bitext from the WikiMatrix')

nl = 0
nw_src = 0
nw_trg = 0
print('Processing {}'.format(args.tsv))
with gzip.open(args.tsv, 'rt', encoding=args.encoding) as tsv:
    with open(args.bitext + '.' + args.src_lang, 'wt', encoding=args.encoding) as fsrc:
        with open(args.bitext + '.' + args.trg_lang, 'wt', encoding=args.encoding) as ftrg:
            while nl < args.nb_sents:
                line = tsv.readline()
                if not line:
                    break
                # each line is: margin score <TAB> source sentence <TAB> target sentence
                fields = line.split('\t')
                cur_src = len(fields[1].split())
                cur_trg = len(fields[2].split())
                if float(fields[0]) < args.threshold:
                    break
                if nw_src + cur_src > args.nb_words_src:
                    break
                if nw_trg + cur_trg > args.nb_words_trg:
                    break
                fsrc.write(fields[1].strip() + '\n')
                ftrg.write(fields[2].strip() + '\n')
                nw_src += cur_src
                nw_trg += cur_trg
                nl += 1
                if nl % 100000 == 0:
                    print('\r - {:d} lines read'.format(nl), end='')

print('\r - wrote {:d} lines'.format(nl))
print(' - with {:d} source and {:d} target words'.format(nw_src, nw_trg))
print(' - last threshold is {:.4f}'.format(float(fields[0])))
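A usage sketch (not part of the commit): calling extract.py on one of the released archives.
The flags are exactly those defined by the argparse block above; the gzip-compressed file name
and the output prefix are hypothetical examples, and --threshold 1.04 mirrors the margin
threshold quoted in the README. Note that the loop above stops at the first line whose margin
falls below the threshold, so it assumes the TSV is sorted by decreasing margin score.

# Hypothetical example: extract the German-French bitext above margin 1.04.
# File names are placeholders; extract.py writes WikiMatrix.de-fr.txt.de / .fr.
import subprocess

subprocess.run([
    'python3', 'extract.py',
    '--tsv', 'WikiMatrix.de-fr.tsv.gz',   # gzip-compressed mined bitext
    '--bitext', 'WikiMatrix.de-fr.txt',   # output prefix
    '--src-lang', 'de',
    '--trg-lang', 'fr',
    '--threshold', '1.04',
], check=True)
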
laser/tasks/WikiMatrix/list_of_bitexts.txt
ADDED
@@ -0,0 +1,1620 @@
1 |
+
WikiMatrix.an-ca.tsv 24616
|
2 |
+
WikiMatrix.an-de.tsv 12887
|
3 |
+
WikiMatrix.an-en.tsv 23313
|
4 |
+
WikiMatrix.an-es.tsv 33723
|
5 |
+
WikiMatrix.an-fr.tsv 16726
|
6 |
+
WikiMatrix.an-gl.tsv 15209
|
7 |
+
WikiMatrix.an-it.tsv 13203
|
8 |
+
WikiMatrix.an-pl.tsv 10456
|
9 |
+
WikiMatrix.an-pt.tsv 14850
|
10 |
+
WikiMatrix.an-ru.tsv 11579
|
11 |
+
WikiMatrix.ar-arz.tsv 29316
|
12 |
+
WikiMatrix.ar-az.tsv 17543
|
13 |
+
WikiMatrix.ar-ba.tsv 15093
|
14 |
+
WikiMatrix.ar-be.tsv 11720
|
15 |
+
WikiMatrix.ar-bg.tsv 54919
|
16 |
+
WikiMatrix.ar-bn.tsv 40997
|
17 |
+
WikiMatrix.ar-br.tsv 10707
|
18 |
+
WikiMatrix.ar-bs.tsv 34137
|
19 |
+
WikiMatrix.ar-ca.tsv 94324
|
20 |
+
WikiMatrix.ar-ceb.tsv 11056
|
21 |
+
WikiMatrix.ar-cs.tsv 67131
|
22 |
+
WikiMatrix.ar-da.tsv 53021
|
23 |
+
WikiMatrix.ar-de.tsv 99258
|
24 |
+
WikiMatrix.ar-el.tsv 66961
|
25 |
+
WikiMatrix.ar-en.tsv 999762
|
26 |
+
WikiMatrix.ar-eo.tsv 37130
|
27 |
+
WikiMatrix.ar-es.tsv 174557
|
28 |
+
WikiMatrix.ar-et.tsv 40659
|
29 |
+
WikiMatrix.ar-eu.tsv 24853
|
30 |
+
WikiMatrix.ar-fa.tsv 58545
|
31 |
+
WikiMatrix.ar-fi.tsv 53052
|
32 |
+
WikiMatrix.ar-fr.tsv 163549
|
33 |
+
WikiMatrix.ar-gl.tsv 50528
|
34 |
+
WikiMatrix.ar-he.tsv 68302
|
35 |
+
WikiMatrix.ar-hi.tsv 38318
|
36 |
+
WikiMatrix.ar-hr.tsv 38853
|
37 |
+
WikiMatrix.ar-hu.tsv 60661
|
38 |
+
WikiMatrix.ar-id.tsv 90815
|
39 |
+
WikiMatrix.ar-is.tsv 18271
|
40 |
+
WikiMatrix.ar-it.tsv 123838
|
41 |
+
WikiMatrix.ar-ja.tsv 83059
|
42 |
+
WikiMatrix.ar-kk.tsv 11688
|
43 |
+
WikiMatrix.ar-ko.tsv 48869
|
44 |
+
WikiMatrix.ar-lt.tsv 33495
|
45 |
+
WikiMatrix.ar-mk.tsv 52154
|
46 |
+
WikiMatrix.ar-ml.tsv 32012
|
47 |
+
WikiMatrix.ar-mr.tsv 32462
|
48 |
+
WikiMatrix.ar-nds.tsv 11783
|
49 |
+
WikiMatrix.ar-ne.tsv 12129
|
50 |
+
WikiMatrix.ar-nl.tsv 73006
|
51 |
+
WikiMatrix.ar-no.tsv 58790
|
52 |
+
WikiMatrix.ar-pl.tsv 74295
|
53 |
+
WikiMatrix.ar-pt.tsv 157441
|
54 |
+
WikiMatrix.ar-ro.tsv 71258
|
55 |
+
WikiMatrix.ar-ru.tsv 125312
|
56 |
+
WikiMatrix.ar-sh.tsv 35310
|
57 |
+
WikiMatrix.ar-si.tsv 32607
|
58 |
+
WikiMatrix.ar-sk.tsv 32135
|
59 |
+
WikiMatrix.ar-sl.tsv 39982
|
60 |
+
WikiMatrix.ar-sq.tsv 30042
|
61 |
+
WikiMatrix.ar-sr.tsv 49502
|
62 |
+
WikiMatrix.ar-sv.tsv 58224
|
63 |
+
WikiMatrix.ar-sw.tsv 13755
|
64 |
+
WikiMatrix.ar-ta.tsv 27250
|
65 |
+
WikiMatrix.ar-te.tsv 27072
|
66 |
+
WikiMatrix.ar-tl.tsv 18350
|
67 |
+
WikiMatrix.ar-tr.tsv 69844
|
68 |
+
WikiMatrix.ar-tt.tsv 14074
|
69 |
+
WikiMatrix.ar-uk.tsv 70551
|
70 |
+
WikiMatrix.ar-vi.tsv 93890
|
71 |
+
WikiMatrix.arz-de.tsv 11796
|
72 |
+
WikiMatrix.arz-en.tsv 18231
|
73 |
+
WikiMatrix.arz-es.tsv 12571
|
74 |
+
WikiMatrix.arz-fr.tsv 12047
|
75 |
+
WikiMatrix.ar-zh.tsv 86236
|
76 |
+
WikiMatrix.arz-it.tsv 10384
|
77 |
+
WikiMatrix.arz-pt.tsv 10887
|
78 |
+
WikiMatrix.arz-ru.tsv 12163
|
79 |
+
WikiMatrix.as-de.tsv 11031
|
80 |
+
WikiMatrix.as-es.tsv 11054
|
81 |
+
WikiMatrix.as-fr.tsv 12092
|
82 |
+
WikiMatrix.as-it.tsv 10844
|
83 |
+
WikiMatrix.azb-fr.tsv 10355
|
84 |
+
WikiMatrix.az-bg.tsv 14096
|
85 |
+
WikiMatrix.az-ca.tsv 15416
|
86 |
+
WikiMatrix.az-cs.tsv 17994
|
87 |
+
WikiMatrix.az-da.tsv 11999
|
88 |
+
WikiMatrix.az-de.tsv 34736
|
89 |
+
WikiMatrix.az-el.tsv 10594
|
90 |
+
WikiMatrix.az-en.tsv 71276
|
91 |
+
WikiMatrix.az-es.tsv 31334
|
92 |
+
WikiMatrix.az-et.tsv 10537
|
93 |
+
WikiMatrix.az-fa.tsv 16093
|
94 |
+
WikiMatrix.az-fi.tsv 14393
|
95 |
+
WikiMatrix.az-fr.tsv 29949
|
96 |
+
WikiMatrix.az-gl.tsv 10514
|
97 |
+
WikiMatrix.az-he.tsv 12252
|
98 |
+
WikiMatrix.az-hr.tsv 10638
|
99 |
+
WikiMatrix.az-hu.tsv 15439
|
100 |
+
WikiMatrix.az-id.tsv 17049
|
101 |
+
WikiMatrix.az-it.tsv 25780
|
102 |
+
WikiMatrix.az-ja.tsv 23140
|
103 |
+
WikiMatrix.az-ko.tsv 11806
|
104 |
+
WikiMatrix.az-nl.tsv 22641
|
105 |
+
WikiMatrix.az-no.tsv 13992
|
106 |
+
WikiMatrix.az-pl.tsv 22609
|
107 |
+
WikiMatrix.az-pt.tsv 25337
|
108 |
+
WikiMatrix.az-ro.tsv 14801
|
109 |
+
WikiMatrix.az-ru.tsv 47130
|
110 |
+
WikiMatrix.az-sr.tsv 10553
|
111 |
+
WikiMatrix.az-sv.tsv 19240
|
112 |
+
WikiMatrix.az-ta.tsv 11980
|
113 |
+
WikiMatrix.az-tr.tsv 42846
|
114 |
+
WikiMatrix.az-uk.tsv 19756
|
115 |
+
WikiMatrix.az-vi.tsv 10485
|
116 |
+
WikiMatrix.az-zh.tsv 19175
|
117 |
+
WikiMatrix.ba-bg.tsv 14238
|
118 |
+
WikiMatrix.ba-ca.tsv 17290
|
119 |
+
WikiMatrix.ba-cs.tsv 16981
|
120 |
+
WikiMatrix.ba-da.tsv 13015
|
121 |
+
WikiMatrix.ba-de.tsv 27046
|
122 |
+
WikiMatrix.ba-el.tsv 10653
|
123 |
+
WikiMatrix.ba-en.tsv 28176
|
124 |
+
WikiMatrix.ba-es.tsv 28201
|
125 |
+
WikiMatrix.ba-fi.tsv 12213
|
126 |
+
WikiMatrix.ba-fr.tsv 29638
|
127 |
+
WikiMatrix.ba-gl.tsv 12390
|
128 |
+
WikiMatrix.ba-hr.tsv 10663
|
129 |
+
WikiMatrix.ba-hu.tsv 12223
|
130 |
+
WikiMatrix.ba-id.tsv 12203
|
131 |
+
WikiMatrix.ba-it.tsv 26652
|
132 |
+
WikiMatrix.ba-ja.tsv 13782
|
133 |
+
WikiMatrix.ba-nl.tsv 21836
|
134 |
+
WikiMatrix.ba-no.tsv 15233
|
135 |
+
WikiMatrix.ba-pl.tsv 19390
|
136 |
+
WikiMatrix.ba-pt.tsv 24870
|
137 |
+
WikiMatrix.bar-de.tsv 41990
|
138 |
+
WikiMatrix.bar-en.tsv 16990
|
139 |
+
WikiMatrix.bar-es.tsv 12506
|
140 |
+
WikiMatrix.bar-fr.tsv 12036
|
141 |
+
WikiMatrix.bar-it.tsv 10450
|
142 |
+
WikiMatrix.ba-ro.tsv 15499
|
143 |
+
WikiMatrix.bar-pt.tsv 10377
|
144 |
+
WikiMatrix.bar-ru.tsv 10220
|
145 |
+
WikiMatrix.ba-ru.tsv 42893
|
146 |
+
WikiMatrix.ba-sh.tsv 10485
|
147 |
+
WikiMatrix.ba-sk.tsv 10764
|
148 |
+
WikiMatrix.ba-sl.tsv 10349
|
149 |
+
WikiMatrix.ba-sr.tsv 10182
|
150 |
+
WikiMatrix.ba-sv.tsv 20346
|
151 |
+
WikiMatrix.ba-tr.tsv 11183
|
152 |
+
WikiMatrix.ba-uk.tsv 15915
|
153 |
+
WikiMatrix.ba-zh.tsv 10468
|
154 |
+
WikiMatrix.be-bg.tsv 16061
|
155 |
+
WikiMatrix.be-ca.tsv 16344
|
156 |
+
WikiMatrix.be-cs.tsv 14343
|
157 |
+
WikiMatrix.be-de.tsv 20671
|
158 |
+
WikiMatrix.be-en.tsv 33927
|
159 |
+
WikiMatrix.be-es.tsv 28153
|
160 |
+
WikiMatrix.be-fi.tsv 10959
|
161 |
+
WikiMatrix.be-fr.tsv 24250
|
162 |
+
WikiMatrix.be-he.tsv 10710
|
163 |
+
WikiMatrix.be-hu.tsv 11940
|
164 |
+
WikiMatrix.be-it.tsv 24351
|
165 |
+
WikiMatrix.be-ja.tsv 12032
|
166 |
+
WikiMatrix.be-nl.tsv 14188
|
167 |
+
WikiMatrix.be-no.tsv 10997
|
168 |
+
WikiMatrix.be-pl.tsv 19438
|
169 |
+
WikiMatrix.be-pt.tsv 23580
|
170 |
+
WikiMatrix.be-ro.tsv 13182
|
171 |
+
WikiMatrix.be-ru.tsv 161215
|
172 |
+
WikiMatrix.be-sr.tsv 10206
|
173 |
+
WikiMatrix.be-sv.tsv 16161
|
174 |
+
WikiMatrix.be-uk.tsv 80801
|
175 |
+
WikiMatrix.bg-bn.tsv 38072
|
176 |
+
WikiMatrix.bg-bs.tsv 34760
|
177 |
+
WikiMatrix.bg-ca.tsv 76189
|
178 |
+
WikiMatrix.bg-ceb.tsv 11166
|
179 |
+
WikiMatrix.bg-cs.tsv 79005
|
180 |
+
WikiMatrix.bg-da.tsv 53738
|
181 |
+
WikiMatrix.bg-de.tsv 132146
|
182 |
+
WikiMatrix.bg-el.tsv 62768
|
183 |
+
WikiMatrix.bg-en.tsv 357969
|
184 |
+
WikiMatrix.bg-eo.tsv 40884
|
185 |
+
WikiMatrix.bg-es.tsv 122534
|
186 |
+
WikiMatrix.bg-et.tsv 43393
|
187 |
+
WikiMatrix.bg-eu.tsv 25564
|
188 |
+
WikiMatrix.bg-fa.tsv 37158
|
189 |
+
WikiMatrix.bg-fi.tsv 61847
|
190 |
+
WikiMatrix.bg-fr.tsv 117264
|
191 |
+
WikiMatrix.bg-gl.tsv 43273
|
192 |
+
WikiMatrix.bg-he.tsv 58167
|
193 |
+
WikiMatrix.bg-hi.tsv 30349
|
194 |
+
WikiMatrix.bg-hr.tsv 47877
|
195 |
+
WikiMatrix.bg-hu.tsv 68595
|
196 |
+
WikiMatrix.bg-id.tsv 60639
|
197 |
+
WikiMatrix.bg-is.tsv 17659
|
198 |
+
WikiMatrix.bg-it.tsv 102305
|
199 |
+
WikiMatrix.bg-ja.tsv 71117
|
200 |
+
WikiMatrix.bg-kk.tsv 11542
|
201 |
+
WikiMatrix.bg-ko.tsv 38280
|
202 |
+
WikiMatrix.bg-lt.tsv 42406
|
203 |
+
WikiMatrix.bg-mk.tsv 86038
|
204 |
+
WikiMatrix.bg-ml.tsv 29348
|
205 |
+
WikiMatrix.bg-mr.tsv 35898
|
206 |
+
WikiMatrix.bg-nds.tsv 11308
|
207 |
+
WikiMatrix.bg-ne.tsv 13616
|
208 |
+
WikiMatrix.bg-nl.tsv 84025
|
209 |
+
WikiMatrix.bg-no.tsv 58964
|
210 |
+
WikiMatrix.bg-pl.tsv 96090
|
211 |
+
WikiMatrix.bg-pt.tsv 114067
|
212 |
+
WikiMatrix.bg-ro.tsv 69902
|
213 |
+
WikiMatrix.bg-ru.tsv 270073
|
214 |
+
WikiMatrix.bg-sh.tsv 41845
|
215 |
+
WikiMatrix.bg-si.tsv 31112
|
216 |
+
WikiMatrix.bg-sk.tsv 43375
|
217 |
+
WikiMatrix.bg-sl.tsv 46673
|
218 |
+
WikiMatrix.bg-sq.tsv 26037
|
219 |
+
WikiMatrix.bg-sr.tsv 65281
|
220 |
+
WikiMatrix.bg-sv.tsv 63135
|
221 |
+
WikiMatrix.bg-sw.tsv 12945
|
222 |
+
WikiMatrix.bg-ta.tsv 21462
|
223 |
+
WikiMatrix.bg-te.tsv 23487
|
224 |
+
WikiMatrix.bg-tl.tsv 21198
|
225 |
+
WikiMatrix.bg-tr.tsv 56592
|
226 |
+
WikiMatrix.bg-tt.tsv 12088
|
227 |
+
WikiMatrix.bg-uk.tsv 126154
|
228 |
+
WikiMatrix.bg-vi.tsv 60738
|
229 |
+
WikiMatrix.bg-zh.tsv 60373
|
230 |
+
WikiMatrix.bn-bs.tsv 21448
|
231 |
+
WikiMatrix.bn-ca.tsv 41891
|
232 |
+
WikiMatrix.bn-cs.tsv 47405
|
233 |
+
WikiMatrix.bn-da.tsv 33723
|
234 |
+
WikiMatrix.bn-de.tsv 70350
|
235 |
+
WikiMatrix.bn-el.tsv 36202
|
236 |
+
WikiMatrix.bn-en.tsv 280567
|
237 |
+
WikiMatrix.bn-eo.tsv 27166
|
238 |
+
WikiMatrix.bn-es.tsv 81824
|
239 |
+
WikiMatrix.bn-et.tsv 26968
|
240 |
+
WikiMatrix.bn-eu.tsv 14912
|
241 |
+
WikiMatrix.bn-fa.tsv 20952
|
242 |
+
WikiMatrix.bn-fi.tsv 37517
|
243 |
+
WikiMatrix.bn-fr.tsv 68784
|
244 |
+
WikiMatrix.bn-gl.tsv 27666
|
245 |
+
WikiMatrix.bn-he.tsv 34274
|
246 |
+
WikiMatrix.bn-hi.tsv 21240
|
247 |
+
WikiMatrix.bn-hr.tsv 23924
|
248 |
+
WikiMatrix.bn-hu.tsv 41219
|
249 |
+
WikiMatrix.bn-id.tsv 36553
|
250 |
+
WikiMatrix.bn-it.tsv 64222
|
251 |
+
WikiMatrix.bn-ja.tsv 38462
|
252 |
+
WikiMatrix.bn-ko.tsv 20915
|
253 |
+
WikiMatrix.bn-lt.tsv 21523
|
254 |
+
WikiMatrix.bn-mk.tsv 23173
|
255 |
+
WikiMatrix.bn-nl.tsv 50217
|
256 |
+
WikiMatrix.bn-no.tsv 35729
|
257 |
+
WikiMatrix.bn-pl.tsv 52856
|
258 |
+
WikiMatrix.bn-pt.tsv 76354
|
259 |
+
WikiMatrix.bn-ro.tsv 46700
|
260 |
+
WikiMatrix.bn-ru.tsv 62512
|
261 |
+
WikiMatrix.bn-sh.tsv 20767
|
262 |
+
WikiMatrix.bn-sk.tsv 25064
|
263 |
+
WikiMatrix.bn-sl.tsv 26700
|
264 |
+
WikiMatrix.bn-sq.tsv 17724
|
265 |
+
WikiMatrix.bn-sr.tsv 25613
|
266 |
+
WikiMatrix.bn-sv.tsv 54274
|
267 |
+
WikiMatrix.bn-ta.tsv 12734
|
268 |
+
WikiMatrix.bn-tr.tsv 33161
|
269 |
+
WikiMatrix.bn-uk.tsv 37701
|
270 |
+
WikiMatrix.bn-vi.tsv 31080
|
271 |
+
WikiMatrix.bn-zh.tsv 31604
|
272 |
+
WikiMatrix.br-de.tsv 20925
|
273 |
+
WikiMatrix.br-en.tsv 16902
|
274 |
+
WikiMatrix.br-es.tsv 22492
|
275 |
+
WikiMatrix.br-fr.tsv 23892
|
276 |
+
WikiMatrix.br-it.tsv 22410
|
277 |
+
WikiMatrix.br-pt.tsv 19806
|
278 |
+
WikiMatrix.br-ru.tsv 16104
|
279 |
+
WikiMatrix.br-uk.tsv 11428
|
280 |
+
WikiMatrix.bs-ca.tsv 44601
|
281 |
+
WikiMatrix.bs-cs.tsv 43380
|
282 |
+
WikiMatrix.bs-da.tsv 32718
|
283 |
+
WikiMatrix.bs-de.tsv 71019
|
284 |
+
WikiMatrix.bs-el.tsv 33881
|
285 |
+
WikiMatrix.bs-en.tsv 210690
|
286 |
+
WikiMatrix.bs-eo.tsv 24088
|
287 |
+
WikiMatrix.bs-es.tsv 70064
|
288 |
+
WikiMatrix.bs-et.tsv 25631
|
289 |
+
WikiMatrix.bs-eu.tsv 16473
|
290 |
+
WikiMatrix.bs-fa.tsv 20287
|
291 |
+
WikiMatrix.bs-fi.tsv 36106
|
292 |
+
WikiMatrix.bs-fr.tsv 60013
|
293 |
+
WikiMatrix.bs-gl.tsv 32509
|
294 |
+
WikiMatrix.bs-he.tsv 30165
|
295 |
+
WikiMatrix.bs-hi.tsv 16693
|
296 |
+
WikiMatrix.bs-hr.tsv 164225
|
297 |
+
WikiMatrix.bs-hu.tsv 39139
|
298 |
+
WikiMatrix.bs-id.tsv 38865
|
299 |
+
WikiMatrix.bs-is.tsv 11489
|
300 |
+
WikiMatrix.bs-it.tsv 52824
|
301 |
+
WikiMatrix.bs-ja.tsv 36882
|
302 |
+
WikiMatrix.bs-ko.tsv 22710
|
303 |
+
WikiMatrix.bs-lt.tsv 23114
|
304 |
+
WikiMatrix.bs-mk.tsv 39333
|
305 |
+
WikiMatrix.bs-ml.tsv 19148
|
306 |
+
WikiMatrix.bs-mr.tsv 20082
|
307 |
+
WikiMatrix.bs-nl.tsv 45271
|
308 |
+
WikiMatrix.bs-no.tsv 36061
|
309 |
+
WikiMatrix.bs-pl.tsv 48283
|
310 |
+
WikiMatrix.bs-pt.tsv 62118
|
311 |
+
WikiMatrix.bs-ro.tsv 37605
|
312 |
+
WikiMatrix.bs-ru.tsv 59540
|
313 |
+
WikiMatrix.bs-sh.tsv 178354
|
314 |
+
WikiMatrix.bs-si.tsv 16269
|
315 |
+
WikiMatrix.bs-sk.tsv 25108
|
316 |
+
WikiMatrix.bs-sl.tsv 34165
|
317 |
+
WikiMatrix.bs-sq.tsv 19923
|
318 |
+
WikiMatrix.bs-sr.tsv 130890
|
319 |
+
WikiMatrix.bs-sv.tsv 38600
|
320 |
+
WikiMatrix.bs-ta.tsv 15962
|
321 |
+
WikiMatrix.bs-te.tsv 12974
|
322 |
+
WikiMatrix.bs-tl.tsv 13894
|
323 |
+
WikiMatrix.bs-tr.tsv 33212
|
324 |
+
WikiMatrix.bs-uk.tsv 39682
|
325 |
+
WikiMatrix.bs-vi.tsv 38866
|
326 |
+
WikiMatrix.bs-zh.tsv 31707
|
327 |
+
WikiMatrix.ca-ceb.tsv 14847
|
328 |
+
WikiMatrix.ca-cs.tsv 100782
|
329 |
+
WikiMatrix.ca-da.tsv 86539
|
330 |
+
WikiMatrix.ca-de.tsv 180321
|
331 |
+
WikiMatrix.ca-el.tsv 90118
|
332 |
+
WikiMatrix.ca-en.tsv 1205908
|
333 |
+
WikiMatrix.ca-eo.tsv 81716
|
334 |
+
WikiMatrix.ca-es.tsv 1580036
|
335 |
+
WikiMatrix.ca-et.tsv 54756
|
336 |
+
WikiMatrix.ca-eu.tsv 77232
|
337 |
+
WikiMatrix.ca-fa.tsv 44064
|
338 |
+
WikiMatrix.ca-fi.tsv 83094
|
339 |
+
WikiMatrix.ca-fo.tsv 13082
|
340 |
+
WikiMatrix.ca-fr.tsv 490870
|
341 |
+
WikiMatrix.ca-fy.tsv 13000
|
342 |
+
WikiMatrix.ca-gl.tsv 268445
|
343 |
+
WikiMatrix.ca-he.tsv 84339
|
344 |
+
WikiMatrix.ca-hi.tsv 37348
|
345 |
+
WikiMatrix.ca-hr.tsv 57726
|
346 |
+
WikiMatrix.ca-hu.tsv 92229
|
347 |
+
WikiMatrix.ca-id.tsv 107262
|
348 |
+
WikiMatrix.ca-is.tsv 23961
|
349 |
+
WikiMatrix.ca-it.tsv 316207
|
350 |
+
WikiMatrix.ca-ja.tsv 103898
|
351 |
+
WikiMatrix.ca-ka.tsv 11585
|
352 |
+
WikiMatrix.ca-kk.tsv 12931
|
353 |
+
WikiMatrix.ca-ko.tsv 52062
|
354 |
+
WikiMatrix.ca-la.tsv 12936
|
355 |
+
WikiMatrix.ca-lb.tsv 12167
|
356 |
+
WikiMatrix.ca-lt.tsv 45454
|
357 |
+
WikiMatrix.ca-mk.tsv 61863
|
358 |
+
WikiMatrix.ca-ml.tsv 45785
|
359 |
+
WikiMatrix.ca-mr.tsv 56224
|
360 |
+
WikiMatrix.ca-nds.tsv 16849
|
361 |
+
WikiMatrix.ca-ne.tsv 17559
|
362 |
+
WikiMatrix.ca-nl.tsv 144699
|
363 |
+
WikiMatrix.ca-no.tsv 102814
|
364 |
+
WikiMatrix.ca-oc.tsv 57688
|
365 |
+
WikiMatrix.ca-pl.tsv 121144
|
366 |
+
WikiMatrix.ca-pt.tsv 358872
|
367 |
+
WikiMatrix.ca-ro.tsv 110611
|
368 |
+
WikiMatrix.ca-ru.tsv 169694
|
369 |
+
WikiMatrix.ca-sh.tsv 52130
|
370 |
+
WikiMatrix.ca-si.tsv 52526
|
371 |
+
WikiMatrix.ca-sk.tsv 50258
|
372 |
+
WikiMatrix.ca-sl.tsv 57635
|
373 |
+
WikiMatrix.ca-sq.tsv 34778
|
374 |
+
WikiMatrix.ca-sr.tsv 67675
|
375 |
+
WikiMatrix.ca-sv.tsv 102757
|
376 |
+
WikiMatrix.ca-sw.tsv 14172
|
377 |
+
WikiMatrix.ca-ta.tsv 30492
|
378 |
+
WikiMatrix.ca-te.tsv 35458
|
379 |
+
WikiMatrix.ca-tl.tsv 31806
|
380 |
+
WikiMatrix.ca-tr.tsv 77056
|
381 |
+
WikiMatrix.ca-tt.tsv 16252
|
382 |
+
WikiMatrix.ca-uk.tsv 98316
|
383 |
+
WikiMatrix.ca-vi.tsv 106890
|
384 |
+
WikiMatrix.ca-zh.tsv 90642
|
385 |
+
WikiMatrix.ceb-cs.tsv 13961
|
386 |
+
WikiMatrix.ceb-de.tsv 22557
|
387 |
+
WikiMatrix.ceb-en.tsv 29061
|
388 |
+
WikiMatrix.ceb-es.tsv 27593
|
389 |
+
WikiMatrix.ceb-fi.tsv 10552
|
390 |
+
WikiMatrix.ceb-fr.tsv 24359
|
391 |
+
WikiMatrix.ceb-hu.tsv 12546
|
392 |
+
WikiMatrix.ceb-it.tsv 24544
|
393 |
+
WikiMatrix.ceb-ja.tsv 14628
|
394 |
+
WikiMatrix.ceb-nl.tsv 15981
|
395 |
+
WikiMatrix.ceb-no.tsv 10617
|
396 |
+
WikiMatrix.ceb-pl.tsv 17744
|
397 |
+
WikiMatrix.ceb-pt.tsv 20982
|
398 |
+
WikiMatrix.ceb-ro.tsv 11740
|
399 |
+
WikiMatrix.ceb-ru.tsv 21786
|
400 |
+
WikiMatrix.ceb-sv.tsv 55991
|
401 |
+
WikiMatrix.ceb-uk.tsv 12630
|
402 |
+
WikiMatrix.cs-da.tsv 75869
|
403 |
+
WikiMatrix.cs-de.tsv 233859
|
404 |
+
WikiMatrix.cs-el.tsv 70243
|
405 |
+
WikiMatrix.cs-en.tsv 519194
|
406 |
+
WikiMatrix.cs-eo.tsv 75647
|
407 |
+
WikiMatrix.cs-es.tsv 181522
|
408 |
+
WikiMatrix.cs-et.tsv 62499
|
409 |
+
WikiMatrix.cs-eu.tsv 36854
|
410 |
+
WikiMatrix.cs-fa.tsv 45233
|
411 |
+
WikiMatrix.cs-fi.tsv 95910
|
412 |
+
WikiMatrix.cs-fr.tsv 185766
|
413 |
+
WikiMatrix.cs-fy.tsv 10155
|
414 |
+
WikiMatrix.cs-gl.tsv 54156
|
415 |
+
WikiMatrix.cs-he.tsv 72677
|
416 |
+
WikiMatrix.cs-hi.tsv 38939
|
417 |
+
WikiMatrix.cs-hr.tsv 63902
|
418 |
+
WikiMatrix.cs-hu.tsv 105871
|
419 |
+
WikiMatrix.cs-id.tsv 78669
|
420 |
+
WikiMatrix.cs-is.tsv 23143
|
421 |
+
WikiMatrix.cs-it.tsv 161101
|
422 |
+
WikiMatrix.cs-ja.tsv 105593
|
423 |
+
WikiMatrix.cs-ka.tsv 10280
|
424 |
+
WikiMatrix.cs-kk.tsv 15269
|
425 |
+
WikiMatrix.cs-ko.tsv 53009
|
426 |
+
WikiMatrix.cs-la.tsv 11106
|
427 |
+
WikiMatrix.cs-lt.tsv 55863
|
428 |
+
WikiMatrix.cs-mk.tsv 51965
|
429 |
+
WikiMatrix.cs-ml.tsv 36217
|
430 |
+
WikiMatrix.cs-mr.tsv 41772
|
431 |
+
WikiMatrix.cs-nds.tsv 14694
|
432 |
+
WikiMatrix.cs-ne.tsv 15583
|
433 |
+
WikiMatrix.cs-nl.tsv 139344
|
434 |
+
WikiMatrix.cs-no.tsv 86494
|
435 |
+
WikiMatrix.cs-oc.tsv 11347
|
436 |
+
WikiMatrix.cs-pl.tsv 176644
|
437 |
+
WikiMatrix.cs-pt.tsv 153498
|
438 |
+
WikiMatrix.cs-ro.tsv 82650
|
439 |
+
WikiMatrix.cs-ru.tsv 186997
|
440 |
+
WikiMatrix.cs-sh.tsv 50524
|
441 |
+
WikiMatrix.cs-si.tsv 37450
|
442 |
+
WikiMatrix.cs-sk.tsv 474501
|
443 |
+
WikiMatrix.cs-sl.tsv 64723
|
444 |
+
WikiMatrix.cs-sq.tsv 30247
|
445 |
+
WikiMatrix.cs-sr.tsv 63977
|
446 |
+
WikiMatrix.cs-sv.tsv 97411
|
447 |
+
WikiMatrix.cs-sw.tsv 15456
|
448 |
+
WikiMatrix.cs-ta.tsv 31623
|
449 |
+
WikiMatrix.cs-te.tsv 34268
|
450 |
+
WikiMatrix.cs-tl.tsv 25877
|
451 |
+
WikiMatrix.cs-tr.tsv 75298
|
452 |
+
WikiMatrix.cs-tt.tsv 14187
|
453 |
+
WikiMatrix.cs-uk.tsv 104982
|
454 |
+
WikiMatrix.cs-vi.tsv 74800
|
455 |
+
WikiMatrix.cs-zh.tsv 80380
|
456 |
+
WikiMatrix.da-de.tsv 180346
|
457 |
+
WikiMatrix.da-el.tsv 54103
|
458 |
+
WikiMatrix.da-en.tsv 436051
|
459 |
+
WikiMatrix.da-eo.tsv 39229
|
460 |
+
WikiMatrix.da-es.tsv 140600
|
461 |
+
WikiMatrix.da-et.tsv 45476
|
462 |
+
WikiMatrix.da-eu.tsv 26471
|
463 |
+
WikiMatrix.da-fa.tsv 29956
|
464 |
+
WikiMatrix.da-fi.tsv 75305
|
465 |
+
WikiMatrix.da-fo.tsv 12572
|
466 |
+
WikiMatrix.da-fr.tsv 142489
|
467 |
+
WikiMatrix.da-gl.tsv 44177
|
468 |
+
WikiMatrix.da-he.tsv 55865
|
469 |
+
WikiMatrix.da-hi.tsv 25361
|
470 |
+
WikiMatrix.da-hr.tsv 43287
|
471 |
+
WikiMatrix.da-hu.tsv 69597
|
472 |
+
WikiMatrix.da-id.tsv 63913
|
473 |
+
WikiMatrix.da-is.tsv 20952
|
474 |
+
WikiMatrix.da-it.tsv 115905
|
475 |
+
WikiMatrix.da-ja.tsv 76251
|
476 |
+
WikiMatrix.da-ko.tsv 37016
|
477 |
+
WikiMatrix.da-lt.tsv 35446
|
478 |
+
WikiMatrix.da-mk.tsv 39837
|
479 |
+
WikiMatrix.da-ml.tsv 30210
|
480 |
+
WikiMatrix.da-mr.tsv 35952
|
481 |
+
WikiMatrix.da-nds.tsv 11399
|
482 |
+
WikiMatrix.da-ne.tsv 12258
|
483 |
+
WikiMatrix.da-nl.tsv 110077
|
484 |
+
WikiMatrix.da-no.tsv 303266
|
485 |
+
WikiMatrix.da-pl.tsv 89734
|
486 |
+
WikiMatrix.da-pt.tsv 123217
|
487 |
+
WikiMatrix.da-ro.tsv 70268
|
488 |
+
WikiMatrix.da-ru.tsv 109086
|
489 |
+
WikiMatrix.da-sh.tsv 37811
|
490 |
+
WikiMatrix.da-si.tsv 32338
|
491 |
+
WikiMatrix.da-sk.tsv 39731
|
492 |
+
WikiMatrix.da-sl.tsv 40166
|
493 |
+
WikiMatrix.da-sq.tsv 23038
|
494 |
+
WikiMatrix.da-sr.tsv 43677
|
495 |
+
WikiMatrix.da-sv.tsv 168311
|
496 |
+
WikiMatrix.da-sw.tsv 11561
|
497 |
+
WikiMatrix.da-ta.tsv 20656
|
498 |
+
WikiMatrix.da-te.tsv 21459
|
499 |
+
WikiMatrix.da-tl.tsv 23770
|
500 |
+
WikiMatrix.da-tr.tsv 55021
|
501 |
+
WikiMatrix.da-tt.tsv 11511
|
502 |
+
WikiMatrix.da-uk.tsv 62966
|
503 |
+
WikiMatrix.da-vi.tsv 68811
|
504 |
+
WikiMatrix.da-zh.tsv 57975
|
505 |
+
WikiMatrix.de-el.tsv 95377
|
506 |
+
WikiMatrix.de-en.tsv 1573437
|
507 |
+
WikiMatrix.de-eo.tsv 186502
|
508 |
+
WikiMatrix.de-es.tsv 418724
|
509 |
+
WikiMatrix.de-et.tsv 106627
|
510 |
+
WikiMatrix.de-eu.tsv 53517
|
511 |
+
WikiMatrix.de-fa.tsv 66193
|
512 |
+
WikiMatrix.de-fi.tsv 163341
|
513 |
+
WikiMatrix.de-fo.tsv 14842
|
514 |
+
WikiMatrix.de-fr.tsv 626166
|
515 |
+
WikiMatrix.de-fy.tsv 16523
|
516 |
+
WikiMatrix.de-gl.tsv 80842
|
517 |
+
WikiMatrix.de-gom.tsv 10721
|
518 |
+
WikiMatrix.de-he.tsv 109703
|
519 |
+
WikiMatrix.de-hi.tsv 57760
|
520 |
+
WikiMatrix.de-hr.tsv 87640
|
521 |
+
WikiMatrix.de-hu.tsv 192730
|
522 |
+
WikiMatrix.de-hy.tsv 11529
|
523 |
+
WikiMatrix.de-id.tsv 107890
|
524 |
+
WikiMatrix.de-is.tsv 34569
|
525 |
+
WikiMatrix.de-it.tsv 388342
|
526 |
+
WikiMatrix.de-ja.tsv 217547
|
527 |
+
WikiMatrix.de-ka.tsv 15369
|
528 |
+
WikiMatrix.de-kk.tsv 23972
|
529 |
+
WikiMatrix.de-ko.tsv 82280
|
530 |
+
WikiMatrix.de-la.tsv 17846
|
531 |
+
WikiMatrix.de-lb.tsv 26924
|
532 |
+
WikiMatrix.de-lt.tsv 78962
|
533 |
+
WikiMatrix.de-mk.tsv 64773
|
534 |
+
WikiMatrix.de-ml.tsv 51618
|
535 |
+
WikiMatrix.de-mr.tsv 58672
|
536 |
+
WikiMatrix.de-nds.tsv 75590
|
537 |
+
WikiMatrix.de-ne.tsv 21897
|
538 |
+
WikiMatrix.de-nl.tsv 472831
|
539 |
+
WikiMatrix.de-no.tsv 207477
|
540 |
+
WikiMatrix.de-oc.tsv 17152
|
541 |
+
WikiMatrix.de-pl.tsv 285039
|
542 |
+
WikiMatrix.de-pt.tsv 294059
|
543 |
+
WikiMatrix.de-rm.tsv 10576
|
544 |
+
WikiMatrix.de-ro.tsv 129013
|
545 |
+
WikiMatrix.de-ru.tsv 368206
|
546 |
+
WikiMatrix.de-sh.tsv 68373
|
547 |
+
WikiMatrix.de-si.tsv 50991
|
548 |
+
WikiMatrix.de-sk.tsv 94959
|
549 |
+
WikiMatrix.de-sl.tsv 106666
|
550 |
+
WikiMatrix.de-sq.tsv 51177
|
551 |
+
WikiMatrix.de-sr.tsv 81479
|
552 |
+
WikiMatrix.de-sv.tsv 216938
|
553 |
+
WikiMatrix.de-sw.tsv 20702
|
554 |
+
WikiMatrix.de-ta.tsv 58600
|
555 |
+
WikiMatrix.de-te.tsv 57957
|
556 |
+
WikiMatrix.de-tg.tsv 11121
|
557 |
+
WikiMatrix.de-tl.tsv 32893
|
558 |
+
WikiMatrix.de-tr.tsv 127051
|
559 |
+
WikiMatrix.de-tt.tsv 23087
|
560 |
+
WikiMatrix.de-uk.tsv 165076
|
561 |
+
WikiMatrix.de-vi.tsv 107022
|
562 |
+
WikiMatrix.de-wuu.tsv 11173
|
563 |
+
WikiMatrix.de-zh.tsv 134077
|
564 |
+
WikiMatrix.el-en.tsv 620801
|
565 |
+
WikiMatrix.el-eo.tsv 39852
|
566 |
+
WikiMatrix.el-es.tsv 145191
|
567 |
+
WikiMatrix.el-et.tsv 41026
|
568 |
+
WikiMatrix.el-eu.tsv 23862
|
569 |
+
WikiMatrix.el-fa.tsv 35116
|
570 |
+
WikiMatrix.el-fi.tsv 55435
|
571 |
+
WikiMatrix.el-fr.tsv 137073
|
572 |
+
WikiMatrix.el-gl.tsv 48685
|
573 |
+
WikiMatrix.el-he.tsv 56833
|
574 |
+
WikiMatrix.el-hi.tsv 26307
|
575 |
+
WikiMatrix.el-hr.tsv 43565
|
576 |
+
WikiMatrix.el-hu.tsv 64636
|
577 |
+
WikiMatrix.el-id.tsv 73368
|
578 |
+
WikiMatrix.el-is.tsv 15794
|
579 |
+
WikiMatrix.el-it.tsv 119290
|
580 |
+
WikiMatrix.el-ja.tsv 69478
|
581 |
+
WikiMatrix.el-ko.tsv 35634
|
582 |
+
WikiMatrix.el-lt.tsv 34372
|
583 |
+
WikiMatrix.el-mk.tsv 52936
|
584 |
+
WikiMatrix.el-ml.tsv 27124
|
585 |
+
WikiMatrix.el-mr.tsv 32288
|
586 |
+
WikiMatrix.el-nl.tsv 76721
|
587 |
+
WikiMatrix.el-no.tsv 60863
|
588 |
+
WikiMatrix.el-pl.tsv 77338
|
589 |
+
WikiMatrix.el-pt.tsv 144004
|
590 |
+
WikiMatrix.el-ro.tsv 78731
|
591 |
+
WikiMatrix.el-ru.tsv 114815
|
592 |
+
WikiMatrix.el-sh.tsv 38130
|
593 |
+
WikiMatrix.el-si.tsv 31562
|
594 |
+
WikiMatrix.el-sk.tsv 35679
|
595 |
+
WikiMatrix.el-sl.tsv 46819
|
596 |
+
WikiMatrix.el-sq.tsv 28074
|
597 |
+
WikiMatrix.el-sr.tsv 52918
|
598 |
+
WikiMatrix.el-sv.tsv 62158
|
599 |
+
WikiMatrix.el-sw.tsv 11271
|
600 |
+
WikiMatrix.el-ta.tsv 16938
|
601 |
+
WikiMatrix.el-te.tsv 18789
|
602 |
+
WikiMatrix.el-tl.tsv 20861
|
603 |
+
WikiMatrix.el-tr.tsv 56445
|
604 |
+
WikiMatrix.el-uk.tsv 68884
|
605 |
+
WikiMatrix.el-vi.tsv 75576
|
606 |
+
WikiMatrix.el-zh.tsv 62957
|
607 |
+
WikiMatrix.en-eo.tsv 298200
|
608 |
+
WikiMatrix.en-es.tsv 3377911
|
609 |
+
WikiMatrix.en-et.tsv 243869
|
610 |
+
WikiMatrix.en-eu.tsv 119479
|
611 |
+
WikiMatrix.en-fa.tsv 303805
|
612 |
+
WikiMatrix.en-fi.tsv 375723
|
613 |
+
WikiMatrix.en-fo.tsv 32317
|
614 |
+
WikiMatrix.en-fr.tsv 2757883
|
615 |
+
WikiMatrix.en-fy.tsv 32249
|
616 |
+
WikiMatrix.en-gl.tsv 446151
|
617 |
+
WikiMatrix.en-he.tsv 545744
|
618 |
+
WikiMatrix.en-hi.tsv 231459
|
619 |
+
WikiMatrix.en-hr.tsv 259498
|
620 |
+
WikiMatrix.en-hu.tsv 488318
|
621 |
+
WikiMatrix.en-id.tsv 1019170
|
622 |
+
WikiMatrix.en-io.tsv 11209
|
623 |
+
WikiMatrix.en-is.tsv 85991
|
624 |
+
WikiMatrix.en-it.tsv 2126083
|
625 |
+
WikiMatrix.en-ja.tsv 851706
|
626 |
+
WikiMatrix.en-jv.tsv 13048
|
627 |
+
WikiMatrix.en-ka.tsv 12807
|
628 |
+
WikiMatrix.en-kk.tsv 20053
|
629 |
+
WikiMatrix.en-ko.tsv 306900
|
630 |
+
WikiMatrix.en-la.tsv 32280
|
631 |
+
WikiMatrix.en-lb.tsv 22281
|
632 |
+
WikiMatrix.en-lmo.tsv 10434
|
633 |
+
WikiMatrix.en-lt.tsv 157525
|
634 |
+
WikiMatrix.en-mg.tsv 13959
|
635 |
+
WikiMatrix.en-mk.tsv 395394
|
636 |
+
WikiMatrix.en-ml.tsv 71508
|
637 |
+
WikiMatrix.en-mr.tsv 124308
|
638 |
+
WikiMatrix.en-mwl.tsv 10443
|
639 |
+
WikiMatrix.en-nds_nl.tsv 10550
|
640 |
+
WikiMatrix.en-nds.tsv 43401
|
641 |
+
WikiMatrix.en-ne.tsv 15015
|
642 |
+
WikiMatrix.en-nl.tsv 796507
|
643 |
+
WikiMatrix.en-no.tsv 636472
|
644 |
+
WikiMatrix.en-oc.tsv 37331
|
645 |
+
WikiMatrix.en-pl.tsv 668646
|
646 |
+
WikiMatrix.en-pt.tsv 2461557
|
647 |
+
WikiMatrix.en-ro.tsv 631485
|
648 |
+
WikiMatrix.en-ru.tsv 1661908
|
649 |
+
WikiMatrix.en-sh.tsv 224146
|
650 |
+
WikiMatrix.en-simple.tsv 599340
|
651 |
+
WikiMatrix.en-si.tsv 115045
|
652 |
+
WikiMatrix.en-sk.tsv 178984
|
653 |
+
WikiMatrix.en-sl.tsv 318027
|
654 |
+
WikiMatrix.en-sq.tsv 180111
|
655 |
+
WikiMatrix.en-sr.tsv 395568
|
656 |
+
WikiMatrix.en-sv.tsv 546288
|
657 |
+
WikiMatrix.en-sw.tsv 51386
|
658 |
+
WikiMatrix.en-ta.tsv 95161
|
659 |
+
WikiMatrix.en-te.tsv 91910
|
660 |
+
WikiMatrix.en-tg.tsv 15002
|
661 |
+
WikiMatrix.en-tl.tsv 75446
|
662 |
+
WikiMatrix.en-tr.tsv 477735
|
663 |
+
WikiMatrix.en-tt.tsv 32153
|
664 |
+
WikiMatrix.en-ug.tsv 10698
|
665 |
+
WikiMatrix.en-uk.tsv 681114
|
666 |
+
WikiMatrix.en-vi.tsv 1073751
|
667 |
+
WikiMatrix.en-wuu.tsv 17675
|
668 |
+
WikiMatrix.en-zh.tsv 786511
|
669 |
+
WikiMatrix.eo-es.tsv 149827
|
670 |
+
WikiMatrix.eo-et.tsv 31921
|
671 |
+
WikiMatrix.eo-eu.tsv 25283
|
672 |
+
WikiMatrix.eo-fa.tsv 23234
|
673 |
+
WikiMatrix.eo-fi.tsv 46112
|
674 |
+
WikiMatrix.eo-fr.tsv 134088
|
675 |
+
WikiMatrix.eo-gl.tsv 46309
|
676 |
+
WikiMatrix.eo-he.tsv 39004
|
677 |
+
WikiMatrix.eo-hi.tsv 22778
|
678 |
+
WikiMatrix.eo-hr.tsv 29259
|
679 |
+
WikiMatrix.eo-hu.tsv 57398
|
680 |
+
WikiMatrix.eo-id.tsv 46010
|
681 |
+
WikiMatrix.eo-is.tsv 15379
|
682 |
+
WikiMatrix.eo-it.tsv 101947
|
683 |
+
WikiMatrix.eo-ja.tsv 48733
|
684 |
+
WikiMatrix.eo-ko.tsv 26463
|
685 |
+
WikiMatrix.eo-lt.tsv 28059
|
686 |
+
WikiMatrix.eo-mk.tsv 30254
|
687 |
+
WikiMatrix.eo-ml.tsv 28437
|
688 |
+
WikiMatrix.eo-mr.tsv 28622
|
689 |
+
WikiMatrix.eo-nds.tsv 11812
|
690 |
+
WikiMatrix.eo-nl.tsv 81182
|
691 |
+
WikiMatrix.eo-no.tsv 47185
|
692 |
+
WikiMatrix.eo-pl.tsv 77317
|
693 |
+
WikiMatrix.eo-pt.tsv 91599
|
694 |
+
WikiMatrix.eo-ro.tsv 43594
|
695 |
+
WikiMatrix.eo-ru.tsv 81964
|
696 |
+
WikiMatrix.eo-sh.tsv 26394
|
697 |
+
WikiMatrix.eo-si.tsv 28638
|
698 |
+
WikiMatrix.eo-sk.tsv 41405
|
699 |
+
WikiMatrix.eo-sl.tsv 32362
|
700 |
+
WikiMatrix.eo-sq.tsv 19844
|
701 |
+
WikiMatrix.eo-sr.tsv 36234
|
702 |
+
WikiMatrix.eo-sv.tsv 53442
|
703 |
+
WikiMatrix.eo-ta.tsv 16284
|
704 |
+
WikiMatrix.eo-te.tsv 19804
|
705 |
+
WikiMatrix.eo-tl.tsv 17779
|
706 |
+
WikiMatrix.eo-tr.tsv 37653
|
707 |
+
WikiMatrix.eo-uk.tsv 50410
|
708 |
+
WikiMatrix.eo-vi.tsv 42253
|
709 |
+
WikiMatrix.eo-zh.tsv 39852
|
710 |
+
WikiMatrix.es-et.tsv 89252
|
711 |
+
WikiMatrix.es-eu.tsv 154280
|
712 |
+
WikiMatrix.es-fa.tsv 83056
|
713 |
+
WikiMatrix.es-fi.tsv 155486
|
714 |
+
WikiMatrix.es-fo.tsv 21382
|
715 |
+
WikiMatrix.es-fr.tsv 905760
|
716 |
+
WikiMatrix.es-fy.tsv 21959
|
717 |
+
WikiMatrix.es-gl.tsv 610824
|
718 |
+
WikiMatrix.es-gom.tsv 13914
|
719 |
+
WikiMatrix.es-he.tsv 153353
|
720 |
+
WikiMatrix.es-hi.tsv 71866
|
721 |
+
WikiMatrix.es-hr.tsv 94295
|
722 |
+
WikiMatrix.es-hu.tsv 167286
|
723 |
+
WikiMatrix.es-hy.tsv 13124
|
724 |
+
WikiMatrix.es-id.tsv 198191
|
725 |
+
WikiMatrix.es-is.tsv 42377
|
726 |
+
WikiMatrix.es-it.tsv 671298
|
727 |
+
WikiMatrix.es-ja.tsv 219260
|
728 |
+
WikiMatrix.es-jv.tsv 12254
|
729 |
+
WikiMatrix.es-ka.tsv 16433
|
730 |
+
WikiMatrix.es-kk.tsv 26257
|
731 |
+
WikiMatrix.es-ko.tsv 108385
|
732 |
+
WikiMatrix.es-la.tsv 20803
|
733 |
+
WikiMatrix.es-lb.tsv 19884
|
734 |
+
WikiMatrix.es-lt.tsv 76193
|
735 |
+
WikiMatrix.es-mk.tsv 92702
|
736 |
+
WikiMatrix.es-ml.tsv 65508
|
737 |
+
WikiMatrix.es-mr.tsv 98088
|
738 |
+
WikiMatrix.es-nds.tsv 28568
|
739 |
+
WikiMatrix.es-ne.tsv 25483
|
740 |
+
WikiMatrix.es-nl.tsv 272587
|
741 |
+
WikiMatrix.es-no.tsv 181719
|
742 |
+
WikiMatrix.es-oc.tsv 35804
|
743 |
+
WikiMatrix.es-pl.tsv 235464
|
744 |
+
WikiMatrix.es-pt.tsv 923724
|
745 |
+
WikiMatrix.es-ro.tsv 183489
|
746 |
+
WikiMatrix.es-ru.tsv 393314
|
747 |
+
WikiMatrix.es-sh.tsv 81086
|
748 |
+
WikiMatrix.es-si.tsv 84161
|
749 |
+
WikiMatrix.es-sk.tsv 81589
|
750 |
+
WikiMatrix.es-sl.tsv 93744
|
751 |
+
WikiMatrix.es-sq.tsv 53815
|
752 |
+
WikiMatrix.es-sr.tsv 107044
|
753 |
+
WikiMatrix.es-sv.tsv 181152
|
754 |
+
WikiMatrix.es-sw.tsv 21991
|
755 |
+
WikiMatrix.es-ta.tsv 57223
|
756 |
+
WikiMatrix.es-te.tsv 71668
|
757 |
+
WikiMatrix.es-tl.tsv 48392
|
758 |
+
WikiMatrix.es-tr.tsv 147352
|
759 |
+
WikiMatrix.es-tt.tsv 26290
|
760 |
+
WikiMatrix.es-uk.tsv 187294
|
761 |
+
WikiMatrix.es-vi.tsv 206705
|
762 |
+
WikiMatrix.es-wuu.tsv 12873
|
763 |
+
WikiMatrix.es-zh.tsv 174315
|
764 |
+
WikiMatrix.et-eu.tsv 22986
|
765 |
+
WikiMatrix.et-fa.tsv 24256
|
766 |
+
WikiMatrix.et-fi.tsv 70662
|
767 |
+
WikiMatrix.et-fr.tsv 85947
|
768 |
+
WikiMatrix.et-gl.tsv 32333
|
769 |
+
WikiMatrix.et-he.tsv 39824
|
770 |
+
WikiMatrix.et-hi.tsv 20988
|
771 |
+
WikiMatrix.et-hr.tsv 33532
|
772 |
+
WikiMatrix.et-hu.tsv 56432
|
773 |
+
WikiMatrix.et-id.tsv 41272
|
774 |
+
WikiMatrix.et-is.tsv 14970
|
775 |
+
WikiMatrix.et-it.tsv 75461
|
776 |
+
WikiMatrix.et-ja.tsv 57643
|
777 |
+
WikiMatrix.et-ko.tsv 29213
|
778 |
+
WikiMatrix.et-lt.tsv 35847
|
779 |
+
WikiMatrix.et-mk.tsv 32911
|
780 |
+
WikiMatrix.et-ml.tsv 20233
|
781 |
+
WikiMatrix.et-mr.tsv 21549
|
782 |
+
WikiMatrix.et-nl.tsv 72505
|
783 |
+
WikiMatrix.et-no.tsv 49810
|
784 |
+
WikiMatrix.et-pl.tsv 73151
|
785 |
+
WikiMatrix.et-pt.tsv 76955
|
786 |
+
WikiMatrix.et-ro.tsv 48427
|
787 |
+
WikiMatrix.et-ru.tsv 96345
|
788 |
+
WikiMatrix.et-sh.tsv 27195
|
789 |
+
WikiMatrix.et-si.tsv 19538
|
790 |
+
WikiMatrix.et-sk.tsv 34194
|
791 |
+
WikiMatrix.et-sl.tsv 35300
|
792 |
+
WikiMatrix.et-sq.tsv 18948
|
793 |
+
WikiMatrix.et-sr.tsv 34016
|
794 |
+
WikiMatrix.et-sv.tsv 58124
|
795 |
+
WikiMatrix.et-ta.tsv 16587
|
796 |
+
WikiMatrix.et-te.tsv 16967
|
797 |
+
WikiMatrix.et-tl.tsv 15617
|
798 |
+
WikiMatrix.et-tr.tsv 43264
|
799 |
+
WikiMatrix.et-uk.tsv 56089
|
800 |
+
WikiMatrix.et-vi.tsv 40281
|
801 |
+
WikiMatrix.et-zh.tsv 44047
|
802 |
+
WikiMatrix.eu-fa.tsv 14476
|
803 |
+
WikiMatrix.eu-fi.tsv 33576
|
804 |
+
WikiMatrix.eu-fr.tsv 65731
|
805 |
+
WikiMatrix.eu-gl.tsv 43100
|
806 |
+
WikiMatrix.eu-he.tsv 25498
|
807 |
+
WikiMatrix.eu-hi.tsv 13049
|
808 |
+
WikiMatrix.eu-hr.tsv 21394
|
809 |
+
WikiMatrix.eu-hu.tsv 35098
|
810 |
+
WikiMatrix.eu-id.tsv 27036
|
811 |
+
WikiMatrix.eu-is.tsv 10055
|
812 |
+
WikiMatrix.eu-it.tsv 54958
|
813 |
+
WikiMatrix.eu-ja.tsv 33986
|
814 |
+
WikiMatrix.eu-ko.tsv 18156
|
815 |
+
WikiMatrix.eu-lt.tsv 19463
|
816 |
+
WikiMatrix.eu-mk.tsv 19208
|
817 |
+
WikiMatrix.eu-ml.tsv 11113
|
818 |
+
WikiMatrix.eu-mr.tsv 10301
|
819 |
+
WikiMatrix.eu-nl.tsv 44131
|
820 |
+
WikiMatrix.eu-no.tsv 29644
|
821 |
+
WikiMatrix.eu-pl.tsv 43382
|
822 |
+
WikiMatrix.eu-pt.tsv 58821
|
823 |
+
WikiMatrix.eu-ro.tsv 30397
|
824 |
+
WikiMatrix.eu-ru.tsv 47206
|
825 |
+
WikiMatrix.eu-sh.tsv 19346
|
826 |
+
WikiMatrix.eu-sk.tsv 20316
|
827 |
+
WikiMatrix.eu-sl.tsv 20626
|
828 |
+
WikiMatrix.eu-sq.tsv 12941
|
829 |
+
WikiMatrix.eu-sr.tsv 21433
|
830 |
+
WikiMatrix.eu-sv.tsv 38206
|
831 |
+
WikiMatrix.eu-ta.tsv 13885
|
832 |
+
WikiMatrix.eu-te.tsv 11444
|
833 |
+
WikiMatrix.eu-tr.tsv 29185
|
834 |
+
WikiMatrix.eu-uk.tsv 30006
|
835 |
+
WikiMatrix.eu-vi.tsv 25722
|
836 |
+
WikiMatrix.eu-zh.tsv 23990
|
837 |
+
WikiMatrix.fa-fi.tsv 34069
|
838 |
+
WikiMatrix.fa-fr.tsv 71278
|
839 |
+
WikiMatrix.fa-gl.tsv 25353
|
840 |
+
WikiMatrix.fa-he.tsv 36955
|
841 |
+
WikiMatrix.fa-hi.tsv 20557
|
842 |
+
WikiMatrix.fa-hr.tsv 24987
|
843 |
+
WikiMatrix.fa-hu.tsv 39139
|
844 |
+
WikiMatrix.fa-id.tsv 46991
|
845 |
+
WikiMatrix.fa-it.tsv 64468
|
846 |
+
WikiMatrix.fa-ja.tsv 46942
|
847 |
+
WikiMatrix.fa-ko.tsv 26572
|
848 |
+
WikiMatrix.fa-lt.tsv 20032
|
849 |
+
WikiMatrix.fa-mk.tsv 27555
|
850 |
+
WikiMatrix.fa-ml.tsv 11083
|
851 |
+
WikiMatrix.fa-mr.tsv 10684
|
852 |
+
WikiMatrix.fa-nl.tsv 49211
|
853 |
+
WikiMatrix.fa-no.tsv 32827
|
854 |
+
WikiMatrix.fa-pl.tsv 50792
|
855 |
+
WikiMatrix.fa-pt.tsv 77606
|
856 |
+
WikiMatrix.fa-ro.tsv 40515
|
857 |
+
WikiMatrix.fa-ru.tsv 72954
|
858 |
+
WikiMatrix.fa-sh.tsv 21729
|
859 |
+
WikiMatrix.fa-sk.tsv 21717
|
860 |
+
WikiMatrix.fa-sl.tsv 24549
|
861 |
+
WikiMatrix.fa-sq.tsv 17644
|
862 |
+
WikiMatrix.fa-sr.tsv 30075
|
863 |
+
WikiMatrix.fa-sv.tsv 42447
|
864 |
+
WikiMatrix.fa-ta.tsv 21879
|
865 |
+
WikiMatrix.fa-te.tsv 12711
|
866 |
+
WikiMatrix.fa-tr.tsv 42681
|
867 |
+
WikiMatrix.fa-uk.tsv 41735
|
868 |
+
WikiMatrix.fa-vi.tsv 38848
|
869 |
+
WikiMatrix.fa-zh.tsv 42042
|
870 |
+
WikiMatrix.fi-fr.tsv 156225
|
871 |
+
WikiMatrix.fi-gl.tsv 47377
|
872 |
+
WikiMatrix.fi-he.tsv 64406
|
873 |
+
WikiMatrix.fi-hi.tsv 28707
|
874 |
+
WikiMatrix.fi-hr.tsv 48618
|
875 |
+
WikiMatrix.fi-hu.tsv 90196
|
876 |
+
WikiMatrix.fi-id.tsv 63983
|
877 |
+
WikiMatrix.fi-is.tsv 22671
|
878 |
+
WikiMatrix.fi-it.tsv 131193
|
879 |
+
WikiMatrix.fi-ja.tsv 87559
|
880 |
+
WikiMatrix.fi-ko.tsv 43152
|
881 |
+
WikiMatrix.fi-lt.tsv 47157
|
882 |
+
WikiMatrix.fi-mk.tsv 40253
|
883 |
+
WikiMatrix.fi-ml.tsv 29127
|
884 |
+
WikiMatrix.fi-mr.tsv 30489
|
885 |
+
WikiMatrix.fi-nds.tsv 12120
|
886 |
+
WikiMatrix.fi-ne.tsv 10944
|
887 |
+
WikiMatrix.fi-nl.tsv 126003
|
888 |
+
WikiMatrix.fi-no.tsv 86413
|
889 |
+
WikiMatrix.fi-oc.tsv 10219
|
890 |
+
WikiMatrix.fi-pl.tsv 119130
|
891 |
+
WikiMatrix.fi-pt.tsv 131186
|
892 |
+
WikiMatrix.fi-ro.tsv 69926
|
893 |
+
WikiMatrix.fi-ru.tsv 139383
|
894 |
+
WikiMatrix.fi-sh.tsv 39988
|
895 |
+
WikiMatrix.fi-si.tsv 27125
|
896 |
+
WikiMatrix.fi-sk.tsv 50645
|
897 |
+
WikiMatrix.fi-sl.tsv 46789
|
898 |
+
WikiMatrix.fi-sq.tsv 25032
|
899 |
+
WikiMatrix.fi-sr.tsv 46945
|
900 |
+
WikiMatrix.fi-sv.tsv 126098
|
901 |
+
WikiMatrix.fi-sw.tsv 12603
|
902 |
+
WikiMatrix.fi-ta.tsv 23818
|
903 |
+
WikiMatrix.fi-te.tsv 24903
|
904 |
+
WikiMatrix.fi-tl.tsv 21521
|
905 |
+
WikiMatrix.fi-tr.tsv 72100
|
906 |
+
WikiMatrix.fi-tt.tsv 10236
|
907 |
+
WikiMatrix.fi-uk.tsv 76304
|
908 |
+
WikiMatrix.fi-vi.tsv 60265
|
909 |
+
WikiMatrix.fi-zh.tsv 64244
|
910 |
+
WikiMatrix.fo-fr.tsv 18125
|
911 |
+
WikiMatrix.fo-it.tsv 15116
|
912 |
+
WikiMatrix.fo-nl.tsv 11341
|
913 |
+
WikiMatrix.fo-pl.tsv 11846
|
914 |
+
WikiMatrix.fo-pt.tsv 17485
|
915 |
+
WikiMatrix.fo-ru.tsv 13640
|
916 |
+
WikiMatrix.fo-sv.tsv 12903
|
917 |
+
WikiMatrix.fr-fy.tsv 18384
|
918 |
+
WikiMatrix.fr-gl.tsv 154872
|
919 |
+
WikiMatrix.fr-gom.tsv 13233
|
920 |
+
WikiMatrix.fr-he.tsv 136974
|
921 |
+
WikiMatrix.fr-hi.tsv 60717
|
922 |
+
WikiMatrix.fr-hr.tsv 85047
|
923 |
+
WikiMatrix.fr-hu.tsv 164733
|
924 |
+
WikiMatrix.fr-hy.tsv 12458
|
925 |
+
WikiMatrix.fr-id.tsv 161857
|
926 |
+
WikiMatrix.fr-is.tsv 38273
|
927 |
+
WikiMatrix.fr-it.tsv 744432
|
928 |
+
WikiMatrix.fr-ja.tsv 214852
|
929 |
+
WikiMatrix.fr-jv.tsv 10933
|
930 |
+
WikiMatrix.fr-ka.tsv 17291
|
931 |
+
WikiMatrix.fr-kk.tsv 24401
|
932 |
+
WikiMatrix.fr-ko.tsv 89109
|
933 |
+
WikiMatrix.fr-la.tsv 18936
|
934 |
+
WikiMatrix.fr-lb.tsv 18459
|
935 |
+
WikiMatrix.fr-lt.tsv 71060
|
936 |
+
WikiMatrix.fr-mg.tsv 12043
|
937 |
+
WikiMatrix.fr-mk.tsv 83969
|
938 |
+
WikiMatrix.fr-ml.tsv 62719
|
939 |
+
WikiMatrix.fr-mr.tsv 83646
|
940 |
+
WikiMatrix.fr-nds.tsv 25658
|
941 |
+
WikiMatrix.fr-ne.tsv 25868
|
942 |
+
WikiMatrix.fr-nl.tsv 331777
|
943 |
+
WikiMatrix.fr-no.tsv 166978
|
944 |
+
WikiMatrix.fr-oc.tsv 124226
|
945 |
+
WikiMatrix.fr-pl.tsv 255763
|
946 |
+
WikiMatrix.fr-pt.tsv 558861
|
947 |
+
WikiMatrix.fr-ro.tsv 206443
|
948 |
+
WikiMatrix.fr-ru.tsv 410005
|
949 |
+
WikiMatrix.fr-sh.tsv 72887
|
950 |
+
WikiMatrix.fr-si.tsv 74448
|
951 |
+
WikiMatrix.fr-sk.tsv 83657
|
952 |
+
WikiMatrix.fr-sl.tsv 86073
|
953 |
+
WikiMatrix.fr-sq.tsv 48654
|
954 |
+
WikiMatrix.fr-sr.tsv 92133
|
955 |
+
WikiMatrix.fr-sv.tsv 186370
|
956 |
+
WikiMatrix.fr-sw.tsv 19908
|
957 |
+
WikiMatrix.fr-ta.tsv 56336
|
958 |
+
WikiMatrix.fr-te.tsv 65809
|
959 |
+
WikiMatrix.fr-tl.tsv 42182
|
960 |
+
WikiMatrix.fr-tr.tsv 130472
|
961 |
+
WikiMatrix.fr-tt.tsv 26231
|
962 |
+
WikiMatrix.fr-uk.tsv 170063
|
963 |
+
WikiMatrix.fr-vi.tsv 165937
|
964 |
+
WikiMatrix.fr-wuu.tsv 11999
|
965 |
+
WikiMatrix.fr-zh.tsv 157013
|
966 |
+
WikiMatrix.fy-it.tsv 17275
|
967 |
+
WikiMatrix.fy-nl.tsv 38648
|
968 |
+
WikiMatrix.fy-pl.tsv 12437
|
969 |
+
WikiMatrix.fy-pt.tsv 18487
|
970 |
+
WikiMatrix.fy-ru.tsv 14073
|
971 |
+
WikiMatrix.fy-sv.tsv 13136
|
972 |
+
WikiMatrix.gl-he.tsv 41858
|
973 |
+
WikiMatrix.gl-hi.tsv 21454
|
974 |
+
WikiMatrix.gl-hr.tsv 33940
|
975 |
+
WikiMatrix.gl-hu.tsv 50347
|
976 |
+
WikiMatrix.gl-id.tsv 56200
|
977 |
+
WikiMatrix.gl-is.tsv 14870
|
978 |
+
WikiMatrix.gl-it.tsv 120462
|
979 |
+
WikiMatrix.gl-ja.tsv 50922
|
980 |
+
WikiMatrix.gl-ko.tsv 28478
|
981 |
+
WikiMatrix.gl-lt.tsv 27669
|
982 |
+
WikiMatrix.gl-mk.tsv 35727
|
983 |
+
WikiMatrix.gl-ml.tsv 29945
|
984 |
+
WikiMatrix.gl-mr.tsv 39026
|
985 |
+
WikiMatrix.gl-nds.tsv 10043
|
986 |
+
WikiMatrix.gl-ne.tsv 11932
|
987 |
+
WikiMatrix.gl-nl.tsv 66259
|
988 |
+
WikiMatrix.gl-no.tsv 52272
|
989 |
+
WikiMatrix.gl-oc.tsv 17008
|
990 |
+
WikiMatrix.gl-pl.tsv 65374
|
991 |
+
WikiMatrix.gl-pt.tsv 227507
|
992 |
+
WikiMatrix.gl-ro.tsv 56079
|
993 |
+
WikiMatrix.gl-ru.tsv 84460
|
994 |
+
WikiMatrix.gl-sh.tsv 30941
|
995 |
+
WikiMatrix.gl-si.tsv 36721
|
996 |
+
WikiMatrix.gl-sk.tsv 29118
|
997 |
+
WikiMatrix.gl-sl.tsv 33881
|
998 |
+
WikiMatrix.gl-sq.tsv 20614
|
999 |
+
WikiMatrix.gl-sr.tsv 39519
|
1000 |
+
WikiMatrix.gl-sv.tsv 54302
|
1001 |
+
WikiMatrix.gl-ta.tsv 15445
|
1002 |
+
WikiMatrix.gl-te.tsv 17166
|
1003 |
+
WikiMatrix.gl-tl.tsv 22377
|
1004 |
+
WikiMatrix.gl-tr.tsv 43313
|
1005 |
+
WikiMatrix.gl-tt.tsv 12039
|
1006 |
+
WikiMatrix.gl-uk.tsv 51273
|
1007 |
+
WikiMatrix.gl-vi.tsv 58599
|
1008 |
+
WikiMatrix.gl-zh.tsv 46609
|
1009 |
+
WikiMatrix.gom-it.tsv 13099
|
1010 |
+
WikiMatrix.gom-pt.tsv 11983
|
1011 |
+
WikiMatrix.gom-ru.tsv 10566
|
1012 |
+
WikiMatrix.he-hi.tsv 28427
|
1013 |
+
WikiMatrix.he-hr.tsv 41487
|
1014 |
+
WikiMatrix.he-hu.tsv 65954
|
1015 |
+
WikiMatrix.he-id.tsv 63296
|
1016 |
+
WikiMatrix.he-is.tsv 17590
|
1017 |
+
WikiMatrix.he-it.tsv 121221
|
1018 |
+
WikiMatrix.he-ja.tsv 82041
|
1019 |
+
WikiMatrix.he-ko.tsv 43724
|
1020 |
+
WikiMatrix.he-lt.tsv 35179
|
1021 |
+
WikiMatrix.he-mk.tsv 42893
|
1022 |
+
WikiMatrix.he-ml.tsv 26296
|
1023 |
+
WikiMatrix.he-mr.tsv 25941
|
1024 |
+
WikiMatrix.he-nl.tsv 86933
|
1025 |
+
WikiMatrix.he-no.tsv 64090
|
1026 |
+
WikiMatrix.he-pl.tsv 84210
|
1027 |
+
WikiMatrix.he-pt.tsv 133567
|
1028 |
+
WikiMatrix.he-ro.tsv 67831
|
1029 |
+
WikiMatrix.he-ru.tsv 131378
|
1030 |
+
WikiMatrix.he-sh.tsv 35352
|
1031 |
+
WikiMatrix.he-si.tsv 21382
|
1032 |
+
WikiMatrix.he-sk.tsv 36947
|
1033 |
+
WikiMatrix.he-sl.tsv 38755
|
1034 |
+
WikiMatrix.he-sq.tsv 23046
|
1035 |
+
WikiMatrix.he-sr.tsv 45889
|
1036 |
+
WikiMatrix.he-sv.tsv 67852
|
1037 |
+
WikiMatrix.he-sw.tsv 10339
|
1038 |
+
WikiMatrix.he-ta.tsv 21839
|
1039 |
+
WikiMatrix.he-te.tsv 25488
|
1040 |
+
WikiMatrix.he-tl.tsv 13968
|
1041 |
+
WikiMatrix.he-tr.tsv 54841
|
1042 |
+
WikiMatrix.he-uk.tsv 73310
|
1043 |
+
WikiMatrix.he-vi.tsv 66128
|
1044 |
+
WikiMatrix.he-zh.tsv 62796
|
1045 |
+
WikiMatrix.hi-hr.tsv 21019
|
1046 |
+
WikiMatrix.hi-hu.tsv 33900
|
1047 |
+
WikiMatrix.hi-id.tsv 31354
|
1048 |
+
WikiMatrix.hi-it.tsv 56025
|
1049 |
+
WikiMatrix.hi-ja.tsv 35864
|
1050 |
+
WikiMatrix.hi-ko.tsv 18367
|
1051 |
+
WikiMatrix.hi-lt.tsv 16614
|
1052 |
+
WikiMatrix.hi-mk.tsv 24869
|
1053 |
+
WikiMatrix.hi-mr.tsv 11686
|
1054 |
+
WikiMatrix.hi-ne.tsv 12315
|
1055 |
+
WikiMatrix.hi-nl.tsv 40620
|
1056 |
+
WikiMatrix.hi-no.tsv 27952
|
1057 |
+
WikiMatrix.hi-pl.tsv 44014
|
1058 |
+
WikiMatrix.hi-pt.tsv 63743
|
1059 |
+
WikiMatrix.hi-ro.tsv 35158
|
1060 |
+
WikiMatrix.hi-ru.tsv 56751
|
1061 |
+
WikiMatrix.hi-sh.tsv 17960
|
1062 |
+
WikiMatrix.hi-sk.tsv 18987
|
1063 |
+
WikiMatrix.hi-sl.tsv 21600
|
1064 |
+
WikiMatrix.hi-sq.tsv 14770
|
1065 |
+
WikiMatrix.hi-sr.tsv 22522
|
1066 |
+
WikiMatrix.hi-sv.tsv 40738
|
1067 |
+
WikiMatrix.hi-ta.tsv 13224
|
1068 |
+
WikiMatrix.hi-te.tsv 18147
|
1069 |
+
WikiMatrix.hi-tr.tsv 29786
|
1070 |
+
WikiMatrix.hi-uk.tsv 33725
|
1071 |
+
WikiMatrix.hi-vi.tsv 26293
|
1072 |
+
WikiMatrix.hi-zh.tsv 30167
|
1073 |
+
WikiMatrix.hr-hu.tsv 58438
|
1074 |
+
WikiMatrix.hr-id.tsv 47104
|
1075 |
+
WikiMatrix.hr-is.tsv 14241
|
1076 |
+
WikiMatrix.hr-it.tsv 80194
|
1077 |
+
WikiMatrix.hr-ja.tsv 48151
|
1078 |
+
WikiMatrix.hr-ko.tsv 27662
|
1079 |
+
WikiMatrix.hr-lt.tsv 31432
|
1080 |
+
WikiMatrix.hr-mk.tsv 52353
|
1081 |
+
WikiMatrix.hr-ml.tsv 24061
|
1082 |
+
WikiMatrix.hr-mr.tsv 24490
|
1083 |
+
WikiMatrix.hr-ne.tsv 10741
|
1084 |
+
WikiMatrix.hr-nl.tsv 65007
|
1085 |
+
WikiMatrix.hr-no.tsv 48269
|
1086 |
+
WikiMatrix.hr-pl.tsv 71529
|
1087 |
+
WikiMatrix.hr-pt.tsv 85373
|
1088 |
+
WikiMatrix.hr-ro.tsv 51221
|
1089 |
+
WikiMatrix.hr-ru.tsv 85888
|
1090 |
+
WikiMatrix.hr-sh.tsv 666685
|
1091 |
+
WikiMatrix.hr-si.tsv 19842
|
1092 |
+
WikiMatrix.hr-sk.tsv 35635
|
1093 |
+
WikiMatrix.hr-sl.tsv 53346
|
1094 |
+
WikiMatrix.hr-sq.tsv 21471
|
1095 |
+
WikiMatrix.hr-sr.tsv 205175
|
1096 |
+
WikiMatrix.hr-sv.tsv 56793
|
1097 |
+
WikiMatrix.hr-ta.tsv 16692
|
1098 |
+
WikiMatrix.hr-te.tsv 16411
|
1099 |
+
WikiMatrix.hr-tl.tsv 17463
|
1100 |
+
WikiMatrix.hr-tr.tsv 42175
|
1101 |
+
WikiMatrix.hr-uk.tsv 55749
|
1102 |
+
WikiMatrix.hr-vi.tsv 46750
|
1103 |
+
WikiMatrix.hr-zh.tsv 42053
|
1104 |
+
WikiMatrix.hu-id.tsv 70813
|
1105 |
+
WikiMatrix.hu-is.tsv 20377
|
1106 |
+
WikiMatrix.hu-it.tsv 146012
|
1107 |
+
WikiMatrix.hu-ja.tsv 99686
|
1108 |
+
WikiMatrix.hu-kk.tsv 11558
|
1109 |
+
WikiMatrix.hu-ko.tsv 49720
|
1110 |
+
WikiMatrix.hu-lt.tsv 48514
|
1111 |
+
WikiMatrix.hu-mk.tsv 47880
|
1112 |
+
WikiMatrix.hu-ml.tsv 27146
|
1113 |
+
WikiMatrix.hu-mr.tsv 28805
|
1114 |
+
WikiMatrix.hu-nds.tsv 12598
|
1115 |
+
WikiMatrix.hu-ne.tsv 10988
|
1116 |
+
WikiMatrix.hu-nl.tsv 121366
|
1117 |
+
WikiMatrix.hu-no.tsv 75452
|
1118 |
+
WikiMatrix.hu-oc.tsv 10104
|
1119 |
+
WikiMatrix.hu-pl.tsv 126850
|
1120 |
+
WikiMatrix.hu-pt.tsv 148377
|
1121 |
+
WikiMatrix.hu-ro.tsv 87958
|
1122 |
+
WikiMatrix.hu-ru.tsv 149514
|
1123 |
+
WikiMatrix.hu-sh.tsv 46865
|
1124 |
+
WikiMatrix.hu-si.tsv 26089
|
1125 |
+
WikiMatrix.hu-sk.tsv 56197
|
1126 |
+
WikiMatrix.hu-sl.tsv 55097
|
1127 |
+
WikiMatrix.hu-sq.tsv 27366
|
1128 |
+
WikiMatrix.hu-sr.tsv 53429
|
1129 |
+
WikiMatrix.hu-sv.tsv 88872
|
1130 |
+
WikiMatrix.hu-sw.tsv 13743
|
1131 |
+
WikiMatrix.hu-ta.tsv 29256
|
1132 |
+
WikiMatrix.hu-te.tsv 30768
|
1133 |
+
WikiMatrix.hu-tl.tsv 20518
|
1134 |
+
WikiMatrix.hu-tr.tsv 75715
|
1135 |
+
WikiMatrix.hu-uk.tsv 83066
|
1136 |
+
WikiMatrix.hu-vi.tsv 74351
|
1137 |
+
WikiMatrix.hu-zh.tsv 75242
|
1138 |
+
WikiMatrix.hy-it.tsv 12210
|
1139 |
+
WikiMatrix.hy-pt.tsv 11393
|
1140 |
+
WikiMatrix.hy-ru.tsv 12074
|
1141 |
+
WikiMatrix.id-is.tsv 16944
|
1142 |
+
WikiMatrix.id-it.tsv 146885
|
1143 |
+
WikiMatrix.id-ja.tsv 77397
|
1144 |
+
WikiMatrix.id-jv.tsv 19595
|
1145 |
+
WikiMatrix.id-ko.tsv 45970
|
1146 |
+
WikiMatrix.id-lt.tsv 33551
|
1147 |
+
WikiMatrix.id-mk.tsv 55991
|
1148 |
+
WikiMatrix.id-ml.tsv 25693
|
1149 |
+
WikiMatrix.id-mr.tsv 23390
|
1150 |
+
WikiMatrix.id-ne.tsv 10057
|
1151 |
+
WikiMatrix.id-nl.tsv 101197
|
1152 |
+
WikiMatrix.id-no.tsv 83641
|
1153 |
+
WikiMatrix.id-pl.tsv 93486
|
1154 |
+
WikiMatrix.id-pt.tsv 204470
|
1155 |
+
WikiMatrix.id-ro.tsv 94439
|
1156 |
+
WikiMatrix.id-ru.tsv 127410
|
1157 |
+
WikiMatrix.id-sh.tsv 43738
|
1158 |
+
WikiMatrix.id-si.tsv 23134
|
1159 |
+
WikiMatrix.id-sk.tsv 37954
|
1160 |
+
WikiMatrix.id-sl.tsv 46656
|
1161 |
+
WikiMatrix.id-sq.tsv 32624
|
1162 |
+
WikiMatrix.id-sr.tsv 56109
|
1163 |
+
WikiMatrix.id-sv.tsv 79193
|
1164 |
+
WikiMatrix.id-sw.tsv 13829
|
1165 |
+
WikiMatrix.id-ta.tsv 24647
|
1166 |
+
WikiMatrix.id-te.tsv 19049
|
1167 |
+
WikiMatrix.id-tl.tsv 21284
|
1168 |
+
WikiMatrix.id-tr.tsv 79176
|
1169 |
+
WikiMatrix.id-tt.tsv 11627
|
1170 |
+
WikiMatrix.id-uk.tsv 73379
|
1171 |
+
WikiMatrix.id-vi.tsv 146746
|
1172 |
+
WikiMatrix.id-zh.tsv 83566
|
1173 |
+
WikiMatrix.is-it.tsv 31787
|
1174 |
+
WikiMatrix.is-ja.tsv 18848
|
1175 |
+
WikiMatrix.is-lt.tsv 12041
|
1176 |
+
WikiMatrix.is-mk.tsv 12532
|
WikiMatrix.is-nl.tsv 27334
WikiMatrix.is-no.tsv 22321
WikiMatrix.is-pl.tsv 27453
WikiMatrix.is-pt.tsv 35263
WikiMatrix.is-ro.tsv 20255
WikiMatrix.is-ru.tsv 30010
WikiMatrix.is-sh.tsv 13271
WikiMatrix.is-sk.tsv 13204
WikiMatrix.is-sl.tsv 13405
WikiMatrix.is-sr.tsv 13764
WikiMatrix.is-sv.tsv 28017
WikiMatrix.is-tr.tsv 16153
WikiMatrix.is-uk.tsv 18889
WikiMatrix.is-vi.tsv 16523
WikiMatrix.is-zh.tsv 14873
WikiMatrix.it-ja.tsv 179031
WikiMatrix.it-jv.tsv 11246
WikiMatrix.it-ka.tsv 16256
WikiMatrix.it-kk.tsv 24825
WikiMatrix.it-ko.tsv 83911
WikiMatrix.it-la.tsv 17036
WikiMatrix.it-lb.tsv 15844
WikiMatrix.it-lmo.tsv 11595
WikiMatrix.it-lt.tsv 62439
WikiMatrix.it-mk.tsv 73015
WikiMatrix.it-ml.tsv 58237
WikiMatrix.it-mr.tsv 78773
WikiMatrix.it-nds.tsv 22202
WikiMatrix.it-ne.tsv 24633
WikiMatrix.it-nl.tsv 240569
WikiMatrix.it-no.tsv 150403
WikiMatrix.it-oc.tsv 20093
WikiMatrix.it-pl.tsv 219293
WikiMatrix.it-pt.tsv 480108
WikiMatrix.it-ro.tsv 161759
WikiMatrix.it-ru.tsv 303974
WikiMatrix.it-scn.tsv 11231
WikiMatrix.it-sh.tsv 67153
WikiMatrix.it-si.tsv 68652
WikiMatrix.it-sk.tsv 72794
WikiMatrix.it-sl.tsv 81545
WikiMatrix.it-sq.tsv 48707
WikiMatrix.it-sr.tsv 83320
WikiMatrix.it-sv.tsv 153800
WikiMatrix.it-sw.tsv 19586
WikiMatrix.it-ta.tsv 44891
WikiMatrix.it-te.tsv 58221
WikiMatrix.it-tl.tsv 41245
WikiMatrix.it-tr.tsv 112630
WikiMatrix.it-tt.tsv 23566
WikiMatrix.it-uk.tsv 144863
WikiMatrix.it-vi.tsv 143644
WikiMatrix.it-wuu.tsv 10484
WikiMatrix.it-zh.tsv 137288
WikiMatrix.ja-kk.tsv 14270
WikiMatrix.ja-ko.tsv 222118
WikiMatrix.ja-lt.tsv 47361
WikiMatrix.ja-mk.tsv 48010
WikiMatrix.ja-ml.tsv 21616
WikiMatrix.ja-mr.tsv 23173
WikiMatrix.ja-nds.tsv 11228
WikiMatrix.ja-nl.tsv 123955
WikiMatrix.ja-no.tsv 81283
WikiMatrix.ja-pl.tsv 128372
WikiMatrix.ja-pt.tsv 175188
WikiMatrix.ja-ro.tsv 79395
WikiMatrix.ja-ru.tsv 196556
WikiMatrix.ja-sh.tsv 40636
WikiMatrix.ja-si.tsv 19798
WikiMatrix.ja-sk.tsv 48948
WikiMatrix.ja-sl.tsv 50219
WikiMatrix.ja-sq.tsv 28281
WikiMatrix.ja-sr.tsv 51763
WikiMatrix.ja-sv.tsv 96872
WikiMatrix.ja-sw.tsv 12391
WikiMatrix.ja-ta.tsv 37201
WikiMatrix.ja-te.tsv 31809
WikiMatrix.ja-tl.tsv 12366
WikiMatrix.ja-tr.tsv 84255
WikiMatrix.ja-tt.tsv 12937
WikiMatrix.ja-uk.tsv 92317
WikiMatrix.ja-vi.tsv 75798
WikiMatrix.ja-zh.tsv 267409
WikiMatrix.jv-pt.tsv 11226
WikiMatrix.ka-nl.tsv 12120
WikiMatrix.ka-pl.tsv 11605
WikiMatrix.ka-pt.tsv 14003
WikiMatrix.ka-ru.tsv 13330
WikiMatrix.ka-sv.tsv 12345
WikiMatrix.kk-nl.tsv 18071
WikiMatrix.kk-no.tsv 11301
WikiMatrix.kk-pl.tsv 17893
WikiMatrix.kk-pt.tsv 22150
WikiMatrix.kk-ro.tsv 12467
WikiMatrix.kk-ru.tsv 32807
WikiMatrix.kk-sv.tsv 16574
WikiMatrix.kk-tr.tsv 10081
WikiMatrix.kk-uk.tsv 14581
WikiMatrix.ko-lt.tsv 23324
WikiMatrix.ko-mk.tsv 26857
WikiMatrix.ko-ml.tsv 10118
WikiMatrix.ko-mr.tsv 10568
WikiMatrix.ko-nl.tsv 56609
WikiMatrix.ko-no.tsv 41716
WikiMatrix.ko-pl.tsv 63894
WikiMatrix.ko-pt.tsv 93224
WikiMatrix.ko-ro.tsv 47054
WikiMatrix.ko-ru.tsv 89951
WikiMatrix.ko-sh.tsv 23213
WikiMatrix.ko-sk.tsv 25644
WikiMatrix.ko-sl.tsv 26403
WikiMatrix.ko-sq.tsv 17929
WikiMatrix.ko-sr.tsv 29639
WikiMatrix.ko-sv.tsv 51718
WikiMatrix.ko-ta.tsv 17059
WikiMatrix.ko-te.tsv 13610
WikiMatrix.ko-tr.tsv 47497
WikiMatrix.ko-uk.tsv 48954
WikiMatrix.ko-vi.tsv 49283
WikiMatrix.ko-zh.tsv 57932
WikiMatrix.la-nl.tsv 12202
WikiMatrix.la-pl.tsv 13391
WikiMatrix.la-pt.tsv 18561
WikiMatrix.la-ro.tsv 10267
WikiMatrix.la-ru.tsv 14815
WikiMatrix.la-sv.tsv 13396
WikiMatrix.lb-nl.tsv 11163
WikiMatrix.lb-pl.tsv 11378
WikiMatrix.lb-pt.tsv 16576
WikiMatrix.lb-ru.tsv 11807
WikiMatrix.lb-sv.tsv 12339
WikiMatrix.lt-mk.tsv 28117
WikiMatrix.lt-ml.tsv 16474
WikiMatrix.lt-mr.tsv 16648
WikiMatrix.lt-nl.tsv 57966
WikiMatrix.lt-no.tsv 39216
WikiMatrix.lt-pl.tsv 70315
WikiMatrix.lt-pt.tsv 64976
WikiMatrix.lt-ro.tsv 39152
WikiMatrix.lt-ru.tsv 107783
WikiMatrix.lt-sh.tsv 25495
WikiMatrix.lt-si.tsv 15384
WikiMatrix.lt-sk.tsv 30843
WikiMatrix.lt-sl.tsv 30859
WikiMatrix.lt-sq.tsv 16358
WikiMatrix.lt-sr.tsv 29967
WikiMatrix.lt-sv.tsv 46008
WikiMatrix.lt-ta.tsv 13005
WikiMatrix.lt-te.tsv 11731
WikiMatrix.lt-tl.tsv 12904
WikiMatrix.lt-tr.tsv 36776
WikiMatrix.lt-uk.tsv 57413
WikiMatrix.lt-vi.tsv 33170
WikiMatrix.lt-zh.tsv 35895
WikiMatrix.mk-ml.tsv 21457
WikiMatrix.mk-mr.tsv 22675
WikiMatrix.mk-nl.tsv 53320
WikiMatrix.mk-no.tsv 46342
WikiMatrix.mk-pl.tsv 56928
WikiMatrix.mk-pt.tsv 93291
WikiMatrix.mk-ro.tsv 56342
WikiMatrix.mk-ru.tsv 88000
WikiMatrix.mk-sh.tsv 52825
WikiMatrix.mk-si.tsv 19587
WikiMatrix.mk-sk.tsv 29821
WikiMatrix.mk-sl.tsv 39973
WikiMatrix.mk-sq.tsv 25078
WikiMatrix.mk-sr.tsv 106377
WikiMatrix.mk-sv.tsv 48072
WikiMatrix.mk-ta.tsv 14353
WikiMatrix.mk-te.tsv 15254
WikiMatrix.mk-tl.tsv 16689
WikiMatrix.mk-tr.tsv 43430
WikiMatrix.mk-uk.tsv 57515
WikiMatrix.mk-vi.tsv 57549
WikiMatrix.mk-zh.tsv 45671
WikiMatrix.ml-nl.tsv 41804
WikiMatrix.ml-no.tsv 32249
WikiMatrix.ml-pl.tsv 41517
WikiMatrix.ml-pt.tsv 58378
WikiMatrix.ml-ro.tsv 35368
WikiMatrix.ml-ru.tsv 46205
WikiMatrix.ml-sh.tsv 21975
WikiMatrix.ml-sk.tsv 22420
WikiMatrix.ml-sl.tsv 21686
WikiMatrix.ml-sq.tsv 13707
WikiMatrix.ml-sr.tsv 20165
WikiMatrix.ml-sv.tsv 44814
WikiMatrix.ml-tr.tsv 16597
WikiMatrix.ml-uk.tsv 26706
WikiMatrix.ml-vi.tsv 15688
WikiMatrix.ml-zh.tsv 17523
WikiMatrix.mr-nl.tsv 46456
WikiMatrix.mr-no.tsv 35123
WikiMatrix.mr-pl.tsv 47091
WikiMatrix.mr-pt.tsv 86686
WikiMatrix.mr-ro.tsv 47259
WikiMatrix.mr-ru.tsv 50400
WikiMatrix.mr-sh.tsv 22428
WikiMatrix.mr-sk.tsv 25169
WikiMatrix.mr-sl.tsv 25021
WikiMatrix.mr-sq.tsv 13098
WikiMatrix.mr-sr.tsv 19078
WikiMatrix.mr-sv.tsv 56338
WikiMatrix.mr-tr.tsv 17343
WikiMatrix.mr-uk.tsv 26221
WikiMatrix.mr-vi.tsv 14772
WikiMatrix.mr-zh.tsv 17442
WikiMatrix.mwl-pt.tsv 34539
WikiMatrix.nds_nl-nl.tsv 15316
WikiMatrix.nds-nl.tsv 19081
WikiMatrix.nds-no.tsv 12797
WikiMatrix.nds-pl.tsv 18216
WikiMatrix.nds-pt.tsv 22939
WikiMatrix.nds-ro.tsv 13008
WikiMatrix.nds-ru.tsv 20062
WikiMatrix.nds-sv.tsv 18542
WikiMatrix.nds-uk.tsv 11947
WikiMatrix.ne-nl.tsv 17856
WikiMatrix.ne-no.tsv 13954
WikiMatrix.ne-pl.tsv 17302
WikiMatrix.ne-pt.tsv 21399
WikiMatrix.ne-ro.tsv 14108
WikiMatrix.ne-ru.tsv 19225
WikiMatrix.ne-sh.tsv 10471
WikiMatrix.ne-sk.tsv 10400
WikiMatrix.ne-sl.tsv 10418
WikiMatrix.ne-sv.tsv 17951
WikiMatrix.ne-uk.tsv 11500
WikiMatrix.nl-no.tsv 133308
WikiMatrix.nl-oc.tsv 13488
WikiMatrix.nl-pl.tsv 177117
WikiMatrix.nl-pt.tsv 218472
WikiMatrix.nl-ro.tsv 96776
WikiMatrix.nl-ru.tsv 199345
WikiMatrix.nl-sh.tsv 53430
WikiMatrix.nl-si.tsv 42365
WikiMatrix.nl-sk.tsv 66565
WikiMatrix.nl-sl.tsv 64687
WikiMatrix.nl-sq.tsv 34902
WikiMatrix.nl-sr.tsv 61780
WikiMatrix.nl-sv.tsv 151735
WikiMatrix.nl-sw.tsv 16582
WikiMatrix.nl-ta.tsv 37639
WikiMatrix.nl-te.tsv 35569
WikiMatrix.nl-tl.tsv 29776
WikiMatrix.nl-tr.tsv 90968
WikiMatrix.nl-tt.tsv 18420
WikiMatrix.nl-uk.tsv 104378
WikiMatrix.nl-vi.tsv 84022
WikiMatrix.nl-zh.tsv 88818
WikiMatrix.no-pl.tsv 103674
WikiMatrix.no-pt.tsv 161215
WikiMatrix.no-ro.tsv 74943
WikiMatrix.no-ru.tsv 121486
WikiMatrix.no-sh.tsv 42996
WikiMatrix.no-si.tsv 28739
WikiMatrix.no-sk.tsv 43781
WikiMatrix.no-sl.tsv 51732
WikiMatrix.no-sq.tsv 26679
WikiMatrix.no-sr.tsv 47744
WikiMatrix.no-sv.tsv 270882
WikiMatrix.no-sw.tsv 12710
WikiMatrix.no-ta.tsv 24589
WikiMatrix.no-te.tsv 23501
WikiMatrix.no-tl.tsv 24491
WikiMatrix.no-tr.tsv 61772
WikiMatrix.no-tt.tsv 13155
WikiMatrix.no-uk.tsv 69895
WikiMatrix.no-vi.tsv 79750
WikiMatrix.no-zh.tsv 63206
WikiMatrix.oc-pl.tsv 13703
WikiMatrix.oc-pt.tsv 24424
WikiMatrix.oc-ro.tsv 11840
WikiMatrix.oc-ru.tsv 14902
WikiMatrix.oc-sv.tsv 12596
WikiMatrix.pl-pt.tsv 200506
WikiMatrix.pl-ro.tsv 97037
WikiMatrix.pl-ru.tsv 285946
WikiMatrix.pl-sh.tsv 56752
WikiMatrix.pl-si.tsv 40941
WikiMatrix.pl-sk.tsv 81071
WikiMatrix.pl-sl.tsv 68333
WikiMatrix.pl-sq.tsv 35947
WikiMatrix.pl-sr.tsv 69550
WikiMatrix.pl-sv.tsv 121793
WikiMatrix.pl-sw.tsv 16928
WikiMatrix.pl-ta.tsv 39892
WikiMatrix.pl-te.tsv 42060
WikiMatrix.pl-tl.tsv 28804
WikiMatrix.pl-tr.tsv 92945
WikiMatrix.pl-tt.tsv 16386
WikiMatrix.pl-uk.tsv 172368
WikiMatrix.pl-vi.tsv 84550
WikiMatrix.pl-zh.tsv 92708
WikiMatrix.pt-ro.tsv 177269
WikiMatrix.pt-ru.tsv 312869
WikiMatrix.pt-sh.tsv 74080
WikiMatrix.pt-si.tsv 76114
WikiMatrix.pt-sk.tsv 71181
WikiMatrix.pt-sl.tsv 85307
WikiMatrix.pt-sq.tsv 47867
WikiMatrix.pt-sr.tsv 101375
WikiMatrix.pt-sv.tsv 155481
WikiMatrix.pt-sw.tsv 20692
WikiMatrix.pt-ta.tsv 42380
WikiMatrix.pt-te.tsv 54636
WikiMatrix.pt-tl.tsv 45927
WikiMatrix.pt-tr.tsv 140579
WikiMatrix.pt-tt.tsv 23174
WikiMatrix.pt-uk.tsv 156140
WikiMatrix.pt-vi.tsv 213119
WikiMatrix.pt-wuu.tsv 11129
WikiMatrix.pt-zh.tsv 165205
WikiMatrix.ro-ru.tsv 136407
WikiMatrix.ro-sh.tsv 44686
WikiMatrix.ro-si.tsv 43266
WikiMatrix.ro-sk.tsv 42561
WikiMatrix.ro-sl.tsv 49716
WikiMatrix.ro-sq.tsv 30941
WikiMatrix.ro-sr.tsv 58682
WikiMatrix.ro-sv.tsv 75782
WikiMatrix.ro-sw.tsv 15025
WikiMatrix.ro-ta.tsv 23098
WikiMatrix.ro-te.tsv 27477
WikiMatrix.ro-tl.tsv 29061
WikiMatrix.ro-tr.tsv 72180
WikiMatrix.ro-tt.tsv 13876
WikiMatrix.ro-uk.tsv 82153
WikiMatrix.ro-vi.tsv 96125
WikiMatrix.ro-zh.tsv 74790
WikiMatrix.ru-sh.tsv 70262
WikiMatrix.ru-si.tsv 42594
WikiMatrix.ru-sk.tsv 85656
WikiMatrix.ru-sl.tsv 78858
WikiMatrix.ru-sq.tsv 44661
WikiMatrix.ru-sr.tsv 114775
WikiMatrix.ru-sv.tsv 140222
WikiMatrix.ru-sw.tsv 17943
WikiMatrix.ru-ta.tsv 54465
WikiMatrix.ru-te.tsv 55768
WikiMatrix.ru-tg.tsv 10759
WikiMatrix.ru-tl.tsv 29214
WikiMatrix.ru-tr.tsv 119345
WikiMatrix.ru-tt.tsv 25244
WikiMatrix.ru-uk.tsv 2486905
WikiMatrix.ru-vi.tsv 122026
WikiMatrix.ru-wuu.tsv 10421
WikiMatrix.ru-zh.tsv 148733
WikiMatrix.sh-si.tsv 17999
WikiMatrix.sh-sk.tsv 27941
WikiMatrix.sh-sl.tsv 46667
WikiMatrix.sh-sq.tsv 19045
WikiMatrix.sh-sr.tsv 373728
WikiMatrix.sh-sv.tsv 46389
WikiMatrix.sh-ta.tsv 14229
WikiMatrix.sh-te.tsv 13914
WikiMatrix.sh-tl.tsv 17012
WikiMatrix.sh-tr.tsv 35108
WikiMatrix.sh-uk.tsv 45971
WikiMatrix.sh-vi.tsv 42484
WikiMatrix.sh-zh.tsv 36099
WikiMatrix.si-sk.tsv 22131
WikiMatrix.si-sl.tsv 22809
WikiMatrix.si-sq.tsv 10145
WikiMatrix.si-sr.tsv 15895
WikiMatrix.si-sv.tsv 48372
WikiMatrix.si-tr.tsv 15421
WikiMatrix.si-uk.tsv 21209
WikiMatrix.si-vi.tsv 14999
WikiMatrix.si-zh.tsv 16002
WikiMatrix.sk-sl.tsv 36507
WikiMatrix.sk-sq.tsv 17211
WikiMatrix.sk-sr.tsv 34375
WikiMatrix.sk-sv.tsv 51536
WikiMatrix.sk-ta.tsv 14594
WikiMatrix.sk-te.tsv 15627
WikiMatrix.sk-tl.tsv 16713
WikiMatrix.sk-tr.tsv 37685
WikiMatrix.sk-uk.tsv 51350
WikiMatrix.sk-vi.tsv 38667
WikiMatrix.sk-zh.tsv 38556
WikiMatrix.sl-sq.tsv 19695
WikiMatrix.sl-sr.tsv 47119
WikiMatrix.sl-sv.tsv 50838
WikiMatrix.sl-ta.tsv 15526
WikiMatrix.sl-te.tsv 16081
WikiMatrix.sl-tl.tsv 17840
WikiMatrix.sl-tr.tsv 39624
WikiMatrix.sl-uk.tsv 50320
WikiMatrix.sl-vi.tsv 48297
WikiMatrix.sl-zh.tsv 42036
WikiMatrix.sq-sr.tsv 25103
WikiMatrix.sq-sv.tsv 31183
WikiMatrix.sq-ta.tsv 13707
WikiMatrix.sq-te.tsv 10575
WikiMatrix.sq-tl.tsv 10943
WikiMatrix.sq-tr.tsv 27534
WikiMatrix.sq-uk.tsv 29077
WikiMatrix.sq-vi.tsv 30454
WikiMatrix.sq-zh.tsv 24128
WikiMatrix.sr-sv.tsv 51675
WikiMatrix.sr-ta.tsv 19095
WikiMatrix.sr-te.tsv 18178
WikiMatrix.sr-tl.tsv 14064
WikiMatrix.sr-tr.tsv 43382
WikiMatrix.sr-uk.tsv 71932
WikiMatrix.sr-vi.tsv 56213
WikiMatrix.sr-zh.tsv 45291
WikiMatrix.sv-sw.tsv 16864
WikiMatrix.sv-ta.tsv 33629
WikiMatrix.sv-te.tsv 39137
WikiMatrix.sv-tl.tsv 35027
WikiMatrix.sv-tr.tsv 72959
WikiMatrix.sv-tt.tsv 17518
WikiMatrix.sv-uk.tsv 82027
WikiMatrix.sv-vi.tsv 74202
WikiMatrix.sv-zh.tsv 73747
WikiMatrix.sw-tr.tsv 12260
WikiMatrix.sw-uk.tsv 12284
WikiMatrix.sw-vi.tsv 10822
WikiMatrix.sw-zh.tsv 11233
WikiMatrix.ta-tr.tsv 29056
WikiMatrix.ta-uk.tsv 30604
WikiMatrix.ta-vi.tsv 19365
WikiMatrix.ta-zh.tsv 27184
WikiMatrix.te-tr.tsv 21596
WikiMatrix.te-uk.tsv 30800
WikiMatrix.te-vi.tsv 16788
WikiMatrix.te-zh.tsv 20912
WikiMatrix.tl-tr.tsv 12260
WikiMatrix.tl-uk.tsv 16560
WikiMatrix.tl-vi.tsv 17399
WikiMatrix.tl-zh.tsv 10492
WikiMatrix.tr-tt.tsv 10644
WikiMatrix.tr-uk.tsv 67753
WikiMatrix.tr-vi.tsv 77062
WikiMatrix.tr-zh.tsv 69162
WikiMatrix.tt-uk.tsv 11500
WikiMatrix.tt-zh.tsv 10587
WikiMatrix.uk-vi.tsv 73104
WikiMatrix.uk-zh.tsv 72752
WikiMatrix.vi-zh.tsv 89445
WikiMatrix.wuu-zh.tsv 43747
laser/tasks/bucc/README.md
ADDED
@@ -0,0 +1,94 @@
# LASER: application to bitext mining

This code shows how to use the multilingual sentence embeddings to mine
for parallel data in (huge) collections of monolingual data.

The underlying idea is pretty simple:
* embed the sentences of the two languages into the joint sentence space
* calculate all pairwise distances between the sentences.
  This is of complexity O(N\*M) and can be done very efficiently with
  the FAISS library [2]
* all sentence pairs with a distance below a threshold
  are considered parallel
* this approach can be further improved using a margin criterion [3]
  (a toy sketch of this scoring is given below)

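To make the margin scoring concrete, here is a minimal, self-contained sketch, assuming random vectors in place of real LASER embeddings; the neighborhood size `k` and the margin threshold are illustrative only, and the actual mining in this repository is driven by `bucc.sh`.

```python
# Toy sketch of margin-based candidate mining with FAISS.
# Random vectors stand in for LASER sentence embeddings (dim 1024).
import numpy as np
import faiss

rng = np.random.default_rng(0)
dim, n_src, n_trg, k = 1024, 1000, 1200, 4

src = rng.standard_normal((n_src, dim)).astype("float32")
trg = rng.standard_normal((n_trg, dim)).astype("float32")
faiss.normalize_L2(src)   # with unit vectors, inner product = cosine similarity
faiss.normalize_L2(trg)

def knn(queries, database, k):
    """Exact k-NN search by inner product (cosine on normalized vectors)."""
    index = faiss.IndexFlatIP(database.shape[1])
    index.add(database)
    sims, ids = index.search(queries, k)
    return sims, ids

fwd_sims, fwd_ids = knn(src, trg, k)   # source -> target neighborhoods
bwd_sims, _       = knn(trg, src, k)   # target -> source neighborhoods

# Ratio margin in the spirit of [3]: cos(x, y) divided by the average
# similarity of x and y to their respective k nearest neighbors.
src_avg = fwd_sims.mean(axis=1)
trg_avg = bwd_sims.mean(axis=1)

margin_threshold = 1.06                # illustrative; tuned on held-out data in practice
for i in range(n_src):
    j = fwd_ids[i, 0]                  # best target candidate for source sentence i
    margin = fwd_sims[i, 0] / ((src_avg[i] + trg_avg[j]) / 2)
    if margin >= margin_threshold:
        print(f"candidate pair: src {i} <-> trg {j} (margin {margin:.3f})")
```

With real embeddings the forward and backward directions are typically combined and duplicate candidates removed before applying the threshold.
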
Here, we apply this idea to the data provided by the shared task of the BUCC
[Workshop on Building and Using Comparable Corpora](https://comparable.limsi.fr/bucc2018/bucc2018-task.html).

The same approach can be scaled up to huge collections of monolingual texts
(several billion sentences) using more advanced features of the FAISS toolkit.

## Installation

* Please first download the BUCC shared task data
  [here](https://comparable.limsi.fr/bucc2017/cgi-bin/download-data-2018.cgi)
  and install it in the directory "downloaded"
* then run the script
```bash
./bucc.sh
```

## Results

Thresholds are optimized for the F-score on the training corpus.
These results differ slightly from those published in [4] due to the switch from PyTorch 0.4 to 1.0.

| Languages | Threshold | Precision | Recall | F-score |
|-----------|-----------|-----------|--------|---------|
| fr-en     | 1.088131  | 91.52     | 93.32  | 92.41   |
| de-en     | 1.092056  | 95.65     | 95.19  | 95.42   |
| ru-en     | 1.093404  | 90.60     | 94.04  | 92.29   |
| zh-en     | 1.085999  | 91.99     | 91.31  | 91.65   |

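The threshold optimization itself is a simple sweep. The following hypothetical sketch (made-up scores and gold alignments, not the BUCC data) picks the threshold that maximizes F1 on a training set.

```python
# Hypothetical sketch of threshold selection: try every candidate score as a
# cutoff and keep the one with the best F1 against the gold training pairs.
def f1(precision, recall):
    return 0.0 if precision + recall == 0 else 2 * precision * recall / (precision + recall)

def best_threshold(scored_pairs, gold_pairs):
    """scored_pairs: list of ((src_id, trg_id), score); gold_pairs: set of (src_id, trg_id)."""
    best = (-1.0, None)
    for threshold in sorted({score for _, score in scored_pairs}):
        predicted = {pair for pair, score in scored_pairs if score >= threshold}
        if not predicted:
            continue
        precision = len(predicted & gold_pairs) / len(predicted)
        recall = len(predicted & gold_pairs) / len(gold_pairs)
        best = max(best, (f1(precision, recall), threshold))
    return best  # (best F1, chosen threshold)

# Example with illustrative scores and gold alignments:
scores = [(("s1", "t1"), 1.12), (("s2", "t9"), 1.01), (("s3", "t3"), 1.09)]
gold = {("s1", "t1"), ("s3", "t3")}
print(best_threshold(scores, gold))   # -> (1.0, 1.09)
```
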
Results on the official test set are scored by the organizers of the BUCC workshop.

Below, we compare our approach to the [official results of the 2018 edition
of the BUCC workshop](http://lrec-conf.org/workshops/lrec2018/W8/pdf/12_W8.pdf) [1].
More details on our approach are provided in [2,3,4].

| System                 | fr-en | de-en | ru-en | zh-en |
|------------------------|-------|-------|-------|-------|
| Azpeitia et al. '17    | 79.5  | 83.7  | -     | -     |
| Azpeitia et al. '18    | 81.5  | 85.5  | 81.3  | 77.5  |
| Bouamor and Sajjad '18 | 76.0  | -     | -     | -     |
| Chongman et al. '18    | -     | -     | -     | 56    |
| LASER [3]              | 75.8  | 76.9  | -     | -     |
| LASER [4]              | 93.1  | 96.2  | 92.3  | 92.7  |

All numbers are F1-scores on the test set.

## Bonus

To showcase the highly multilingual aspect of LASER's sentence embeddings,
we also mine for bitexts for language pairs which do not include English, e.g.
French-German, Russian-French or Chinese-Russian.
This is also performed by the script bucc.sh.

Below are the numbers of extracted parallel sentences for each language pair.

| src/trg | French | German | Russian | Chinese |
|---------|--------|--------|---------|---------|
| French  | n/a    | 2795   | 3327    | 387     |
| German  | 2795   | n/a    | 3661    | 466     |
| Russian | 3327   | 3661   | n/a     | 664     |
| Chinese | 387    | 466    | 664     | n/a     |

## References

[1] Pierre Zweigenbaum, Serge Sharoff and Reinhard Rapp,
[*Overview of the Third BUCC Shared Task: Spotting Parallel Sentences in Comparable Corpora*](http://lrec-conf.org/workshops/lrec2018/W8/pdf/12_W8.pdf),
LREC, 2018.

[2] Holger Schwenk,
[*Filtering and Mining Parallel Data in a Joint Multilingual Space*](https://arxiv.org/abs/1805.09822),
ACL, July 2018.

[3] Mikel Artetxe and Holger Schwenk,
[*Margin-based Parallel Corpus Mining with Multilingual Sentence Embeddings*](https://arxiv.org/abs/1811.01136),
arXiv, 3 Nov 2018.

[4] Mikel Artetxe and Holger Schwenk,
[*Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond*](https://arxiv.org/abs/1812.10464),
arXiv, 26 Dec 2018.