KuangDW committed
Commit 05d3571 · 1 Parent(s): bbf3202

Add laser2.spm using Git LFS

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +1 -0
  2. app.py +2 -4
  3. laser/.github/workflows/lint_and_tests.yml +32 -0
  4. laser/.gitignore +15 -0
  5. laser/CODE_OF_CONDUCT.md +5 -0
  6. laser/CONTRIBUTING.md +37 -0
  7. laser/LICENSE +30 -0
  8. laser/README.md +159 -0
  9. laser/docker/Dockerfile +38 -0
  10. laser/docker/README.md +82 -0
  11. laser/docker/app.py +64 -0
  12. laser/docker/decode.py +7 -0
  13. laser/install_external_tools.sh +200 -0
  14. laser/install_models.sh +48 -0
  15. laser/laser2.cvocab +0 -0
  16. laser/laser2.spm +3 -0
  17. laser/laser_encoders/README.md +149 -0
  18. laser/laser_encoders/__init__.py +16 -0
  19. laser/laser_encoders/download_models.py +154 -0
  20. laser/laser_encoders/language_list.py +564 -0
  21. laser/laser_encoders/laser_tokenizer.py +179 -0
  22. laser/laser_encoders/models.py +426 -0
  23. laser/laser_encoders/test_laser_tokenizer.py +310 -0
  24. laser/laser_encoders/test_models_initialization.py +57 -0
  25. laser/laser_encoders/validate_models.py +108 -0
  26. laser/pyproject.toml +69 -0
  27. laser/remove_external_tools.sh +26 -0
  28. laser/source/embed.py +362 -0
  29. laser/source/eval.py +381 -0
  30. laser/source/lib/indexing.py +258 -0
  31. laser/source/lib/romanize_lc.py +51 -0
  32. laser/source/lib/text_processing.py +272 -0
  33. laser/source/mine_bitexts.py +302 -0
  34. laser/source/nli.py +371 -0
  35. laser/source/paraphrase.py +285 -0
  36. laser/source/pxsim.py +251 -0
  37. laser/source/sent_classif.py +273 -0
  38. laser/source/similarity_search.py +113 -0
  39. laser/source/xsim.py +165 -0
  40. laser/tasks/CCMatrix/MatrixMine.pdf +0 -0
  41. laser/tasks/CCMatrix/README.md +39 -0
  42. laser/tasks/CCMatrix/dl_cc_matrix.py +338 -0
  43. laser/tasks/SentimentAnalysis/README.md +34 -0
  44. laser/tasks/SentimentAnalysis/SentimentAnalysis.ipynb +0 -0
  45. laser/tasks/WikiMatrix/README.md +93 -0
  46. laser/tasks/WikiMatrix/WikiMatrix-bleu.pdf +0 -0
  47. laser/tasks/WikiMatrix/WikiMatrix-sizes.pdf +0 -0
  48. laser/tasks/WikiMatrix/extract.py +81 -0
  49. laser/tasks/WikiMatrix/list_of_bitexts.txt +1620 -0
  50. laser/tasks/bucc/README.md +94 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ laser/laser2.spm filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -16,9 +16,7 @@ import pkg_resources
  import sys

  login(token=os.environ.get("LA_NAME"))
- laser_token = os.environ.get("ENC")
- laser_path = snapshot_download(repo_id="KuangDW/laser", use_auth_token=laser_token)
- os.environ["LASER"] = laser_path
+ os.environ["LASER"] = "laser"

  def check_and_install(package, required_version):
      try:
@@ -326,7 +324,7 @@ with gr.Blocks(title="Test-Time Machine Translation with Plan2Align") as demo:

  gr.Examples(
      examples=[
-         ["台灣夜市文化豐富多彩...", "Chinese", "English", 2, 0.7, 1, ["Original", "Plan2Align"]],
+         ["台灣夜市文化豐富多彩,從士林夜市到饒河街夜市,提供各種美食、遊戲和購物體驗,吸引了無數遊客。", "Chinese", "English", 2, 0.7, 1, ["Original", "Plan2Align"]],
          ["台北101曾經是世界最高的建築物,它不僅是台灣的地標,也象徵著經濟成就和創新精神。", "Chinese", "Russian", 2, 0.7, 1, ["Original", "Plan2Align"]],
          ["阿里山日出和森林鐵路是台灣最著名的自然景觀之一,每年吸引數十萬遊客前來欣賞雲海和壯麗的日出。", "Chinese", "German", 2, 0.7, 1, ["Original", "Plan2Align"]],
          ["珍珠奶茶,這款源自台灣的獨特飲品,不僅在台灣本地深受喜愛,更以其獨特的風味和口感,在全球掀起了一股熱潮,成為了一種跨越文化、風靡全球的時尚飲品。", "Chinese", "Japanese", 3, 0.7, 3, ["Original", "Plan2Align"]],
laser/.github/workflows/lint_and_tests.yml ADDED
@@ -0,0 +1,32 @@
1
+ name: lint_and_tests
2
+
3
+ on: [push, pull_request]
4
+
5
+ jobs:
6
+ build:
7
+ strategy:
8
+ max-parallel: 1
9
+ matrix:
10
+ platform: [ubuntu-latest]
11
+ python-version: [3.8]
12
+
13
+ runs-on: ${{ matrix.platform }}
14
+
15
+ steps:
16
+ - uses: actions/checkout@v2
17
+
18
+ - name: Install dependencies
19
+ run: |
20
+ python --version
21
+ python -m pip install --upgrade 'pip>=23.2.1'
22
+ python -m pip show pip
23
+ python -m pip install -e '.[dev]'
24
+
25
+ - name: isort
26
+ run: cd laser_encoders && isort --check --diff .
27
+
28
+ - name: black
29
+ run: cd laser_encoders && black --check --diff .
30
+
31
+ - name: pytest
32
+ run: pytest laser_encoders
laser/.gitignore ADDED
@@ -0,0 +1,15 @@
1
+ source/__pycache__
2
+ source/lib/__pycache__
3
+ models
4
+ tools-external
5
+ tasks/mldoc/MLDoc
6
+ embed
7
+ tasks/bucc/downloaded
8
+ tasks/similarity/dev/
9
+ tasks/xnli/XNLI-1.0*
10
+ tasks/xnli/multinli_1.0*
11
+ .??*swp
12
+ .idea
13
+ __pycache__
14
+ nllb
15
+ dist
laser/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,5 @@
1
+ # Code of Conduct
2
+
3
+ Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
4
+ Please read the [full text](https://code.fb.com/codeofconduct)
5
+ so that you can understand what actions will and will not be tolerated.
laser/CONTRIBUTING.md ADDED
@@ -0,0 +1,37 @@
1
+ # Contributing to LASER
2
+ We want to make contributing to this project as easy and transparent as
3
+ possible.
4
+
5
+ ## Our Development Process
6
+ Minor changes and improvements will be released on an ongoing basis.
7
+
8
+ ## Pull Requests
9
+ We actively welcome your pull requests.
10
+
11
+ 1. Fork the repo and create your branch from `master`.
12
+ 2. If you've added code that should be tested, add tests.
13
+ 3. If you've changed APIs, update the documentation.
14
+ 4. Ensure the test suite passes.
15
+ 5. Make sure your code lints.
16
+ 6. If you haven't already, complete the Contributor License Agreement ("CLA").
17
+
18
+ ## Contributor License Agreement ("CLA")
19
+ In order to accept your pull request, we need you to submit a CLA. You only need
20
+ to do this once to work on any of Facebook's open source projects.
21
+
22
+ Complete your CLA here: <https://code.facebook.com/cla>
23
+
24
+ ## Issues
25
+ We use GitHub issues to track public bugs. Please ensure your description is
26
+ clear and has sufficient instructions to be able to reproduce the issue.
27
+
28
+ ## Coding Style
29
+ * 4 spaces for indentation rather than tabs
30
+ * 80 character line length
31
+ * PEP8 formatting
32
+
33
+ ## License
34
+ By contributing to LASER, you agree that your contributions will be licensed
35
+ under the LICENSE file in the root directory of this source tree.
36
+
37
+
laser/LICENSE ADDED
@@ -0,0 +1,30 @@
1
+ BSD License
2
+
3
+ For Language-Agnostic SEntence Representations (LASER) software
4
+
5
+ Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
6
+
7
+ Redistribution and use in source and binary forms, with or without modification,
8
+ are permitted provided that the following conditions are met:
9
+
10
+ * Redistributions of source code must retain the above copyright notice, this
11
+ list of conditions and the following disclaimer.
12
+
13
+ * Redistributions in binary form must reproduce the above copyright notice,
14
+ this list of conditions and the following disclaimer in the documentation
15
+ and/or other materials provided with the distribution.
16
+
17
+ * Neither the name Facebook nor the names of its contributors may be used to
18
+ endorse or promote products derived from this software without specific
19
+ prior written permission.
20
+
21
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
25
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
28
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
laser/README.md ADDED
@@ -0,0 +1,159 @@
1
+ # LASER Language-Agnostic SEntence Representations
2
+
3
+ LASER is a library to calculate and use multilingual sentence embeddings.
4
+
5
+ **NEWS**
6
+ * 2023/11/30 Released [**P-xSIM**](tasks/pxsim), a dual approach extension to multilingual similarity search (xSIM)
7
+ * 2023/11/16 Released [**laser_encoders**](laser_encoders), a pip-installable package supporting LASER-2 and LASER-3 models
8
+ * 2023/06/26 [**xSIM++**](https://arxiv.org/abs/2306.12907) evaluation pipeline and data [**released**](tasks/xsimplusplus/README.md)
9
+ * 2022/07/06 Updated LASER models with support for over 200 languages are [**now available**](nllb/README.md)
10
+ * 2022/07/06 Multilingual similarity search (**xSIM**) evaluation pipeline [**released**](tasks/xsim/README.md)
11
+ * 2022/05/03 [**Librivox S2S is available**](tasks/librivox-s2s): Speech-to-Speech translations automatically mined in Librivox [9]
12
+ * 2019/11/08 [**CCMatrix is available**](tasks/CCMatrix): Mining billions of high-quality parallel sentences on the WEB [8]
13
+ * 2019/07/31 Gilles Bodard and Jérémy Rapin provided a [**Docker environment**](docker) to use LASER
14
+ * 2019/07/11 [**WikiMatrix is available**](tasks/WikiMatrix): bitext extraction for 1620 language pairs in WikiPedia [7]
15
+ * 2019/03/18 switch to BSD license
16
+ * 2019/02/13 The code to perform bitext mining is [**now available**](tasks/bucc)
17
+
18
+ **CURRENT VERSION:**
19
+ * We now provide updated LASER models which support over 200 languages. Please see [here](nllb/README.md) for more details including how to download the models and perform inference.
20
+
21
+ According to our experience, the sentence encoder also supports code-switching, i.e.
22
+ the same sentences can contain words in several different languages.
23
+
24
+ We have also some evidence that the encoder can generalize to other
25
+ languages which have not been seen during training, but which are in
26
+ a language family which is covered by other languages.
27
+
28
+ A detailed description of how the multilingual sentence embeddings are trained can
29
+ be found [here](https://arxiv.org/abs/2205.12654), together with an experimental evaluation.
30
+
31
+ ## The core sentence embedding package: `laser_encoders`
32
+ We provide a package `laser_encoders` with minimal dependencies.
33
+ It supports LASER-2 (a single encoder for the languages listed [below](#supported-languages))
34
+ and LASER-3 (147 language-specific encoders described [here](nllb/README.md)).
35
+
36
+ The package can be installed simply with `pip install laser_encoders` and used as below:
37
+
38
+ ```python
39
+ from laser_encoders import LaserEncoderPipeline
40
+ encoder = LaserEncoderPipeline(lang="eng_Latn")
41
+ embeddings = encoder.encode_sentences(["Hi!", "This is a sentence encoder."])
42
+ print(embeddings.shape) # (2, 1024)
43
+ ```
44
+
45
+ The laser_encoders [readme file](laser_encoders) provides more examples of its installation and usage.
46
+
47
+ ## The full LASER kit
48
+ Apart from the `laser_encoders`, we provide support for LASER-1 (the original multilingual encoder)
49
+ and for various LASER applications listed below.
50
+
51
+ ### Dependencies
52
+ * Python >= 3.7
53
+ * [PyTorch 1.0](http://pytorch.org/)
54
+ * [NumPy](http://www.numpy.org/), tested with 1.15.4
55
+ * [Cython](https://pypi.org/project/Cython/), needed by Python wrapper of FastBPE, tested with 0.29.6
56
+ * [Faiss](https://github.com/facebookresearch/faiss), for fast similarity search and bitext mining
57
+ * [transliterate 1.10.2](https://pypi.org/project/transliterate) (`pip install transliterate`)
58
+ * [jieba 0.39](https://pypi.org/project/jieba/), Chinese segmenter (`pip install jieba`)
59
+ * [mecab 0.996](https://pypi.org/project/JapaneseTokenizer/), Japanese segmenter
60
+ * tokenization from the Moses encoder (installed automatically)
61
+ * [FastBPE](https://github.com/glample/fastBPE), fast C++ implementation of byte-pair encoding (installed automatically)
62
+ * [Fairseq](https://github.com/pytorch/fairseq), sequence modeling toolkit (`pip install fairseq==0.12.1`)
63
+ * [tabulate](https://pypi.org/project/tabulate), pretty-print tabular data (`pip install tabulate`)
64
+ * [pandas](https://pypi.org/project/pandas), data analysis toolkit (`pip install pandas`)
65
+ * [Sentencepiece](https://github.com/google/sentencepiece), subword tokenization (installed automatically)
66
+
67
+ ### Installation
68
+ * install the `laser_encoders` package by e.g. `pip install -e .` for installing it in the editable mode
69
+ * set the environment variable 'LASER' to the root of the installation, e.g.
70
+ `export LASER="${HOME}/projects/laser"`
71
+ * download encoders from Amazon s3 by e.g. `bash ./nllb/download_models.sh`
72
+ * download third party software by `bash ./install_external_tools.sh`
73
+ * download the data used in the example tasks (see description for each task)
74
+
75
+ ## Applications
76
+
77
+ We showcase several applications of multilingual sentence embeddings
78
+ with code to reproduce our results (in the directory "tasks").
79
+
80
+ * [**Cross-lingual document classification**](tasks/mldoc) using the
81
+ [*MLDoc*](https://github.com/facebookresearch/MLDoc) corpus [2,6]
82
+ * [**WikiMatrix**](tasks/WikiMatrix)
83
+ Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia [7]
84
+ * [**Bitext mining**](tasks/bucc) using the
85
+ [*BUCC*](https://comparable.limsi.fr/bucc2018/bucc2018-task.html) corpus [3,5]
86
+ * [**Cross-lingual NLI**](tasks/xnli)
87
+ using the [*XNLI*](https://www.nyu.edu/projects/bowman/xnli/) corpus [4,5,6]
88
+ * [**Multilingual similarity search**](tasks/similarity) [1,6]
89
+ * [**Sentence embedding of text files**](tasks/embed)
90
+ an example of how to calculate sentence embeddings for arbitrary text files in any of the supported languages.
91
+
92
+ **For all tasks, we use exactly the same multilingual encoder, without any task specific optimization or fine-tuning.**
93
+
94
+ ## License
95
+
96
+ LASER is BSD-licensed, as found in the [`LICENSE`](LICENSE) file in the root directory of this source tree.
97
+
98
+ ## Supported languages
99
+
100
+ The original LASER model was trained on the following languages:
101
+
102
+ Afrikaans, Albanian, Amharic, Arabic, Armenian, Aymara, Azerbaijani, Basque, Belarusian, Bengali,
103
+ Berber languages, Bosnian, Breton, Bulgarian, Burmese, Catalan, Central/Kadazan Dusun, Central Khmer,
104
+ Chavacano, Chinese, Coastal Kadazan, Cornish, Croatian, Czech, Danish, Dutch, Eastern Mari, English,
105
+ Esperanto, Estonian, Finnish, French, Galician, Georgian, German, Greek, Hausa, Hebrew, Hindi,
106
+ Hungarian, Icelandic, Ido, Indonesian, Interlingua, Interlingue, Irish, Italian, Japanese, Kabyle,
107
+ Kazakh, Korean, Kurdish, Latvian, Latin, Lingua Franca Nova, Lithuanian, Low German/Saxon,
108
+ Macedonian, Malagasy, Malay, Malayalam, Maldivian (Divehi), Marathi, Norwegian (Bokmål), Occitan,
109
+ Persian (Farsi), Polish, Portuguese, Romanian, Russian, Serbian, Sindhi, Sinhala, Slovak, Slovenian,
110
+ Somali, Spanish, Swahili, Swedish, Tagalog, Tajik, Tamil, Tatar, Telugu, Thai, Turkish, Uighur,
111
+ Ukrainian, Urdu, Uzbek, Vietnamese, Wu Chinese and Yue Chinese.
112
+
113
+ We have also observed that the model seems to generalize well to other (minority) languages or dialects, e.g.
114
+
115
+ Asturian, Egyptian Arabic, Faroese, Kashubian, North Moluccan Malay, Nynorsk Norwegian, Piedmontese, Sorbian, Swabian,
116
+ Swiss German or Western Frisian.
117
+
118
+ ### LASER3
119
+
120
+ Updated LASER models referred to as *[LASER3](nllb/README.md)* supplement the above list with support for 147 languages. The full list of supported languages can be seen [here](nllb/README.md#list-of-available-laser3-encoders).
121
+
122
+ ## References
123
+
124
+ [1] Holger Schwenk and Matthijs Douze,
125
+ [*Learning Joint Multilingual Sentence Representations with Neural Machine Translation*](https://aclanthology.info/papers/W17-2619/w17-2619),
126
+ ACL workshop on Representation Learning for NLP, 2017
127
+
128
+ [2] Holger Schwenk and Xian Li,
129
+ [*A Corpus for Multilingual Document Classification in Eight Languages*](http://www.lrec-conf.org/proceedings/lrec2018/pdf/658.pdf),
130
+ LREC, pages 3548-3551, 2018.
131
+
132
+ [3] Holger Schwenk,
133
+ [*Filtering and Mining Parallel Data in a Joint Multilingual Space*](http://aclweb.org/anthology/P18-2037)
134
+ ACL, July 2018
135
+
136
+ [4] Alexis Conneau, Guillaume Lample, Ruty Rinott, Adina Williams, Samuel R. Bowman, Holger Schwenk and Veselin Stoyanov,
137
+ [*XNLI: Cross-lingual Sentence Understanding through Inference*](https://aclweb.org/anthology/D18-1269),
138
+ EMNLP, 2018.
139
+
140
+ [5] Mikel Artetxe and Holger Schwenk,
141
+ [*Margin-based Parallel Corpus Mining with Multilingual Sentence Embeddings*](https://arxiv.org/abs/1811.01136)
142
+ arXiv, Nov 3 2018.
143
+
144
+ [6] Mikel Artetxe and Holger Schwenk,
145
+ [*Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond*](https://arxiv.org/abs/1812.10464)
146
+ arXiv, Dec 26 2018.
147
+
148
+ [7] Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman,
149
+ [*WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia*](https://arxiv.org/abs/1907.05791)
150
+ arXiv, July 11 2019.
151
+
152
+ [8] Holger Schwenk, Guillaume Wenzek, Sergey Edunov, Edouard Grave and Armand Joulin
153
+ [*CCMatrix: Mining Billions of High-Quality Parallel Sentences on the WEB*](https://arxiv.org/abs/1911.04944)
154
+
155
+ [9] Paul-Ambroise Duquenne, Hongyu Gong, Holger Schwenk,
156
+ [*Multimodal and Multilingual Embeddings for Large-Scale Speech Mining,*](https://papers.nips.cc/paper/2021/hash/8466f9ace6a9acbe71f75762ffc890f1-Abstract.html), NeurIPS 2021, pages 15748-15761.
157
+
158
+ [10] Kevin Heffernan, Onur Celebi, and Holger Schwenk,
159
+ [*Bitext Mining Using Distilled Sentence Representations for Low-Resource Languages*](https://arxiv.org/abs/2205.12654)
laser/docker/Dockerfile ADDED
@@ -0,0 +1,38 @@
1
+ FROM continuumio/miniconda3
2
+
3
+ MAINTAINER Gilles Bodart <[email protected]>
4
+
5
+ # Install build-essential (compiler and development tools)
6
+ RUN apt-get update && \
7
+ apt-get install -y build-essential && \
8
+ rm -rf /var/lib/apt/lists/*
9
+
10
+ RUN conda create -n env python=3.8
11
+ RUN echo "source activate env" > ~/.bashrc
12
+ ENV PATH /opt/conda/envs/env/bin:$PATH
13
+
14
+ # Set the working directory to /app
15
+ WORKDIR /app
16
+
17
+ # Copy the local laser-encoders repository
18
+ COPY laser_encoders /app/laser_encoders
19
+ COPY pyproject.toml /app/pyproject.toml
20
+
21
+ RUN pip install --upgrade pip
22
+ RUN pip install -e .
23
+ RUN pip install Flask==2.3.3 Requests==2.31.0
24
+
25
+ # Define the argument for language
26
+ ARG langs="eng_Latn"
27
+
28
+ # Download language models for each specified language
29
+ RUN for lang in $langs; do \
30
+ python -m laser_encoders.download_models --lang=$lang; \
31
+ done
32
+
33
+ # Open the port 80
34
+ EXPOSE 80
35
+
36
+ COPY docker/app.py /app/app.py
37
+
38
+ CMD ["/bin/bash"]
laser/docker/README.md ADDED
@@ -0,0 +1,82 @@
1
+ ## LASER Docker Image
2
+
3
+ This image provides a convenient way to run LASER in a Docker container.
4
+
5
+ ### Building the image
6
+ To build the image, run the following command from the root of the LASER directory:
7
+
8
+ ```
9
+ docker build --tag laser -f docker/Dockerfile .
10
+ ```
11
+ ### Specifying Languages with `langs` Argument
12
+
13
+ You can pre-download the encoders and tokenizers for specific languages by using the `langs` build argument. This argument accepts a space-separated list of language codes. For example, to build an image with models for English and French, use the following command:
14
+ ```
15
+ docker build --build-arg langs="eng_Latn fra_Latn" -t laser -f docker/Dockerfile .
16
+ ```
17
+ If the `langs` argument is not specified during the build process, the image will default to building with English (`eng_Latn`). It's important to note that in this default case where English is selected, the LASER2 model, which supports 92 languages, is used. For a comprehensive list of LASER2 supported languages, refer to `LASER2_LANGUAGES_LIST` in [`language_list.py`](https://github.com/facebookresearch/LASER/blob/main/laser_encoders/language_list.py).
18
+
19
+
20
+ ### Running the Image
21
+ Once the image is built, you can run it with the following command:
22
+
23
+ ```
24
+ docker run -it laser
25
+ ```
26
+ **Note:** If you want to expose a local port to the REST server on top of the embed task, you can do so by executing the following command instead of the last command:
27
+
28
+ ```
29
+ docker run -it -p [CHANGEME_LOCAL_PORT]:80 laser python app.py
30
+ ```
31
+ This will override the command line entrypoint of the Docker container.
32
+
33
+ Example:
34
+
35
+ ```
36
+ docker run -it -p 8081:80 laser python app.py
37
+ ```
38
+
39
+ This Flask server exposes a REST API that can be used by calling your server with this URL:
40
+
41
+ ```
42
+ http://127.0.0.1:[CHANGEME_LOCAL_PORT]/vectorize?q=[YOUR_SENTENCE_URL_ENCODED]&lang=[LANGUAGE]
43
+ ```
44
+
45
+ Example:
46
+
47
+ ```
48
+ http://127.0.0.1:8081/vectorize?q=ki%20lo%20'orukọ%20ẹ&lang=yor
49
+ ```
50
+
51
+ Sample response:
52
+ ```
53
+ {
54
+ "content": "ki lo 'orukọ ẹ",
55
+ "embedding": [
56
+ [
57
+ -0.10241681337356567,
58
+ 0.11120740324258804,
59
+ -0.26641348004341125,
60
+ -0.055699944496154785,
61
+ ....
62
+ ....
63
+ ....
64
+ -0.034048307687044144,
65
+ 0.11005636304616928,
66
+ -0.3238321840763092,
67
+ -0.060631975531578064,
68
+ -0.19269055128097534,
69
+ ]
70
+ }
71
+ ```
72
+
73
+ Here is an example of how you can send requests to it with python:
74
+
75
+ ```python
76
+ import requests
77
+ import numpy as np
78
+ url = "http://127.0.0.1:[CHANGEME_LOCAL_PORT]/vectorize"
79
+ params = {"q": "Hey, how are you?\nI'm OK and you?", "lang": "en"}
80
+ resp = requests.get(url=url, params=params).json()
81
+ print(resp["embedding"])
82
+ ```
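The request example above already imports NumPy but never uses it; in practice the returned `embedding` field (a list with one 1024-dimensional vector per input line) is most useful as an array. A short follow-up sketch, assuming the server from this README is running on the hypothetical local port 8081:

```python
import numpy as np
import requests

# Hypothetical port mapping from `docker run -it -p 8081:80 laser python app.py`.
url = "http://127.0.0.1:8081/vectorize"
resp = requests.get(url, params={"q": "Hey, how are you?", "lang": "en"}).json()

embedding = np.array(resp["embedding"], dtype=np.float32)
print(embedding.shape)  # expected: (1, 1024) for a single input sentence
```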
laser/docker/app.py ADDED
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ import os
4
+ import socket
5
+
6
+ from flask import Flask, jsonify, request
7
+
8
+ from laser_encoders import LaserEncoderPipeline
9
+ from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE
10
+
11
+ app = Flask(__name__)
12
+
13
+ # Global cache for encoders
14
+ encoder_cache = {}
15
+
16
+ laser2_encoder = None
17
+
18
+
19
+ @app.route("/")
20
+ def root():
21
+ print("/")
22
+ html = "<h3>Hello {name}!</h3>" "<b>Hostname:</b> {hostname}<br/>"
23
+ return html.format(name=os.getenv("LASER", "world"), hostname=socket.gethostname())
24
+
25
+
26
+ @app.route("/vectorize", methods=["GET"])
27
+ def vectorize():
28
+ content = request.args.get("q")
29
+ lang = request.args.get(
30
+ "lang", "eng"
31
+ ) # Default to English if 'lang' is not provided
32
+
33
+ if content is None:
34
+ return jsonify({"error": "Missing input content"}), 400
35
+
36
+ try:
37
+ global laser2_encoder
38
+ if lang in LASER2_LANGUAGE: # Checks for both 3-letter code or 8-letter code
39
+ if not laser2_encoder:
40
+ laser2_encoder = LaserEncoderPipeline(lang=lang)
41
+ encoder = laser2_encoder
42
+ else:
43
+ lang_code = LASER3_LANGUAGE.get(
44
+ lang, lang
45
+ ) # Use language code as key to prevent multiple entries for same language
46
+ if lang_code not in encoder_cache:
47
+ encoder_cache[lang_code] = LaserEncoderPipeline(lang=lang_code)
48
+ encoder = encoder_cache[lang_code]
49
+
50
+ embeddings = encoder.encode_sentences([content])
51
+ embeddings_list = embeddings.tolist()
52
+ body = {"content": content, "embedding": embeddings_list}
53
+ return jsonify(body), 200
54
+
55
+ except ValueError as e:
56
+ # Check if the exception is due to an unsupported language
57
+ if "unsupported language" in str(e).lower():
58
+ return jsonify({"error": f"Language '{lang}' is not supported."}), 400
59
+ else:
60
+ return jsonify({"error": str(e)}), 400
61
+
62
+
63
+ if __name__ == "__main__":
64
+ app.run(debug=True, port=80, host="0.0.0.0")
laser/docker/decode.py ADDED
@@ -0,0 +1,7 @@
1
+ import numpy as np
2
+ import sys
3
+
4
+ dim = 1024
5
+ X = np.fromfile(sys.argv[1], dtype=np.float32, count=-1)
6
+ X.resize(X.shape[0] // dim, dim)
7
+ print(X)
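decode.py reads a raw binary file of float32 values and reshapes it into an (N, 1024) matrix, the raw layout used for LASER sentence embeddings on disk. A round-trip sketch of that format, using a hypothetical file name:

```python
import numpy as np

dim = 1024

# Write two dummy embeddings in the raw float32 layout decode.py expects (hypothetical path).
fake = np.random.rand(2, dim).astype(np.float32)
fake.tofile("embeddings.bin")

# Read them back exactly as decode.py does.
X = np.fromfile("embeddings.bin", dtype=np.float32, count=-1)
X.resize(X.shape[0] // dim, dim)
assert X.shape == (2, dim)
```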
laser/install_external_tools.sh ADDED
@@ -0,0 +1,200 @@
1
+ #!/bin/bash
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # LASER Language-Agnostic SEntence Representations
9
+ # is a toolkit to calculate multilingual sentence embeddings
10
+ # and to use them for document classification, bitext filtering
11
+ # and mining
12
+ #
13
+ #-------------------------------------------------------
14
+ #
15
+ # This bash script installs third party software
16
+ #
17
+
18
+ if [ -z ${LASER} ] ; then
19
+ echo "Please set the environment variable 'LASER'"
20
+ exit
21
+ fi
22
+
23
+ ###################################################################
24
+ #
25
+ # Generic helper functions
26
+ #
27
+ ###################################################################
28
+
29
+ MKDIR () {
30
+ dname=$1
31
+ if [ ! -d ${dname} ] ; then
32
+ echo " - creating directory ${dname}"
33
+ mkdir -p ${dname}
34
+ fi
35
+ }
36
+
37
+
38
+ bdir="${LASER}"
39
+ tools_ext="${bdir}/tools-external"
40
+ MKDIR $tools_ext
41
+
42
+ ###################################################################
43
+ #
44
+ # Tokenization tools from Moses
45
+ # It is important to use the official release V4 and not the current one
46
+ # to obtain the same results as the published ones.
47
+ # (the behavior of the tokenizer for end-of-sentence abbreviations has changed)
48
+ #
49
+ ###################################################################
50
+
51
+ InstallMosesTools () {
52
+ moses_git="https://raw.githubusercontent.com/moses-smt/mosesdecoder/RELEASE-4.0/scripts"
53
+ moses_files=("tokenizer/tokenizer.perl" "tokenizer/detokenizer.perl" \
54
+ "tokenizer/normalize-punctuation.perl" \
55
+ "tokenizer/remove-non-printing-char.perl" \
56
+ "tokenizer/deescape-special-chars.perl" \
57
+ "tokenizer/lowercase.perl" \
58
+ "tokenizer/basic-protected-patterns" \
59
+ )
60
+
61
+ wdir="${tools_ext}/moses-tokenizer/tokenizer"
62
+ MKDIR ${wdir}
63
+ cd ${wdir}
64
+
65
+ for f in ${moses_files[@]} ; do
66
+ if [ ! -f `basename ${f}` ] ; then
67
+ echo " - download ${f}"
68
+ wget -q ${moses_git}/${f}
69
+ fi
70
+ done
71
+ chmod 755 *perl
72
+
73
+ # download non-breaking prefixes per language
74
+ moses_non_breakings="share/nonbreaking_prefixes/nonbreaking_prefix"
75
+ moses_non_breaking_langs=( \
76
+ "ca" "cs" "de" "el" "en" "es" "fi" "fr" "ga" "hu" "is" \
77
+ "it" "lt" "lv" "nl" "pl" "pt" "ro" "ru" "sk" "sl" "sv" \
78
+ "ta" "yue" "zh" )
79
+ wdir="${tools_ext}/moses-tokenizer/share/nonbreaking_prefixes"
80
+ MKDIR ${wdir}
81
+ cd ${wdir}
82
+
83
+ for l in ${moses_non_breaking_langs[@]} ; do
84
+ f="${moses_non_breakings}.${l}"
85
+ if [ ! -f `basename ${f}` ] ; then
86
+ echo " - download ${f}"
87
+ wget -q ${moses_git}/${f}
88
+ fi
89
+ done
90
+ }
91
+
92
+
93
+ ###################################################################
94
+ #
95
+ # FAST BPE
96
+ #
97
+ ###################################################################
98
+
99
+ InstallFastBPE () {
100
+ cd ${tools_ext}
101
+ if [ ! -x fastBPE/fast ] ; then
102
+ echo " - download fastBPE software from github"
103
+ wget https://github.com/glample/fastBPE/archive/master.zip
104
+ unzip master.zip
105
+ /bin/rm master.zip
106
+ mv fastBPE-master fastBPE
107
+ cd fastBPE
108
+ echo " - compiling"
109
+ g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast
110
+ if [ $? -eq 1 ] ; then
111
+ echo "ERROR: compilation failed, please install manually"; exit
112
+ fi
113
+ python setup.py install
114
+ fi
115
+ }
116
+
117
+ ###################################################################
118
+ #
119
+ # SENTENCEPIECE
120
+ #
121
+ ###################################################################
122
+
123
+ InstallSentencePiece () {
124
+ cd ${tools_ext}
125
+ if [ ! -d sentencepiece-master ] ; then
126
+ echo " - download sentencepiece from github"
127
+ wget https://github.com/google/sentencepiece/archive/master.zip
128
+ unzip master.zip
129
+ /bin/rm master.zip
130
+ if [ ! -s /usr/local/bin/spm_encode ] ; then
131
+ echo " - building code "
132
+ cd sentencepiece-master
133
+ mkdir build
134
+ cd build
135
+ cmake ..
136
+ make -j 10
137
+ fi
138
+ fi
139
+ }
140
+
141
+
142
+ ###################################################################
143
+ #
144
+ # Install Japanese tokenizer Mecab
145
+ # We do not use automatic installation with "pip" but directly add the source directory
146
+ #
147
+ ###################################################################
148
+
149
+ InstallMecab () {
150
+ cd ${tools_ext}
151
+ if [ ! -x mecab/mecab/bin/mecab ] ; then
152
+ echo " - download mecab from github"
153
+ wget https://github.com/taku910/mecab/archive/master.zip
154
+ unzip master.zip
155
+ #/bin/rm master.zip
156
+ if [ ! -s mecab/bin/mecab ] ; then
157
+ mkdir mecab
158
+ cd mecab-master/mecab
159
+ echo " - installing code"
160
+ ./configure --prefix ${tools_ext}/mecab && make && make install
161
+ if [ $? -eq 1 ] ; then
162
+ echo "ERROR: installation failed, please install manually"; exit
163
+ fi
164
+ fi
165
+ if [ ! -d mecab/lib/mecab/dic/ipadic ] ; then
166
+ cd ${tools_ext}/mecab-master/mecab-ipadic
167
+ echo " - installing dictionaries"
168
+ ./configure --prefix ${tools_ext}/mecab --with-mecab-config=${tools_ext}/mecab/bin/mecab-config \
169
+ && make && make install
170
+ if [ $? -eq 1 ] ; then
171
+ echo "ERROR: compilation failed, please install manually"; exit
172
+ fi
173
+ fi
174
+ fi
175
+ }
176
+
177
+
178
+ ###################################################################
179
+ #
180
+ # main
181
+ #
182
+ ###################################################################
183
+
184
+ echo "Installing the laser_encoders package in editable mode"
185
+
186
+ pip install -e .
187
+
188
+ echo "Installing external tools"
189
+
190
+ InstallMosesTools
191
+ InstallFastBPE
192
+ InstallSentencePiece
193
+
194
+ #InstallMecab
195
+ echo ""
196
+ echo "automatic installation of the Japanese tokenizer mecab may be tricky"
197
+ echo "Please install it manually from https://github.com/taku910/mecab"
198
+ echo ""
199
+ echo "The installation directory should be ${LASER}/tools-external/mecab"
200
+ echo ""
laser/install_models.sh ADDED
@@ -0,0 +1,48 @@
1
+ #!/bin/bash
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # LASER Language-Agnostic SEntence Representations
9
+ # is a toolkit to calculate multilingual sentence embeddings
10
+ # and to use them for document classification, bitext filtering
11
+ # and mining
12
+ #
13
+ #-------------------------------------------------------
14
+ #
15
+ # This bash script installs sentence encoders from Amazon s3
16
+ #
17
+
18
+ if [ -z ${LASER} ] ; then
19
+ echo "Please set the environment variable 'LASER'"
20
+ exit
21
+ fi
22
+
23
+ mdir="${LASER}/models"
24
+
25
+ # available encoders
26
+ s3="https://dl.fbaipublicfiles.com/laser/models"
27
+ networks=("bilstm.eparl21.2018-11-19.pt" \
28
+ "eparl21.fcodes" "eparl21.fvocab" \
29
+ "bilstm.93langs.2018-12-26.pt" \
30
+ "93langs.fcodes" "93langs.fvocab")
31
+
32
+
33
+ echo "Downloading networks"
34
+
35
+ if [ ! -d ${mdir} ] ; then
36
+ echo " - creating directory ${mdir}"
37
+ mkdir -p ${mdir}
38
+ fi
39
+
40
+ cd ${mdir}
41
+ for f in ${networks[@]} ; do
42
+ if [ -f ${f} ] ; then
43
+ echo " - ${mdir}/${f} already downloaded"
44
+ else
45
+ echo " - ${f}"
46
+ wget -q ${s3}/${f}
47
+ fi
48
+ done
laser/laser2.cvocab ADDED
The diff for this file is too large to render.
 
laser/laser2.spm ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f7ef5da4408b94a096ff72d31d90f8ba438b4ab772764eb50c3db5e201fb384
3
+ size 1008139
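Because `laser2.spm` is tracked with Git LFS (see the `.gitattributes` change above), what is committed here is only the LFS pointer: a spec version, the SHA-256 of the real file, and its size in bytes. After `git lfs pull`, the actual SentencePiece model replaces the pointer; a quick integrity check against the pointer fields (a sketch, assuming the file has been pulled to `laser/laser2.spm`):

```python
import hashlib
from pathlib import Path

data = Path("laser/laser2.spm").read_bytes()

# Compare against the oid and size recorded in the LFS pointer above.
print(len(data))                         # expected: 1008139
print(hashlib.sha256(data).hexdigest())  # expected: 1f7ef5da4408b94a096ff72d31d90f8ba438b4ab772764eb50c3db5e201fb384
```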
laser/laser_encoders/README.md ADDED
@@ -0,0 +1,149 @@
1
+ # LASER encoders
2
+
3
+ LASER Language-Agnostic SEntence Representations Toolkit
4
+
5
+ laser_encoders is the official Python package for the Facebook [LASER](https://github.com/facebookresearch/LASER) library. It provides a simple and convenient way to use LASER embeddings in Python. It allows you to calculate multilingual sentence embeddings using the LASER toolkit. These embeddings can be utilized for various natural language processing tasks, including document classification, bitext filtering, and mining.
6
+
7
+ ## Dependencies
8
+
9
+ - Python `>= 3.8`
10
+ - [PyTorch `>= 1.10.0`](http://pytorch.org/)
11
+ - sacremoses `>=0.1.0`
12
+ - sentencepiece `>=0.1.99`
13
+ - numpy `>=1.21.3`
14
+ - fairseq `>=0.12.2`
15
+
16
+ You can find a full list of requirements [here](https://github.com/facebookresearch/LASER/blob/main/pyproject.toml)
17
+
18
+ ## Installation
19
+
20
+ You can install `laser_encoders` package from PyPI:
21
+
22
+ ```sh
23
+ pip install laser_encoders
24
+ ```
25
+
26
+ Alternatively, you can install it from a local clone of this repository, in editable mode:
27
+ ```sh
28
+ pip install -e .
29
+ ```
30
+
31
+ ## Usage
32
+
33
+ Here's a simple example on how to obtain embeddings for sentences using the `LaserEncoderPipeline`:
34
+
35
+ >**Note:** By default, the models will be downloaded to the `~/.cache/laser_encoders` directory. To specify a different download location, you can provide the argument `model_dir=path/to/model/directory`
36
+
37
+ ```py
38
+ from laser_encoders import LaserEncoderPipeline
39
+
40
+ # Initialize the LASER encoder pipeline
41
+ encoder = LaserEncoderPipeline(lang="igbo")
42
+
43
+ # Encode sentences into embeddings
44
+ embeddings = encoder.encode_sentences(["nnọọ, kedu ka ị mere"])
45
+ # If you want the output embeddings to be L2-normalized, set normalize_embeddings to True
46
+ normalized_embeddings = encoder.encode_sentences(["nnọọ, kedu ka ị mere"], normalize_embeddings=True)
47
+
48
+ ```
49
+
50
+ If you prefer more control over the tokenization and encoding process, you can initialize the tokenizer and encoder separately:
51
+ ```py
52
+ from laser_encoders import initialize_encoder, initialize_tokenizer
53
+
54
+ # Initialize the LASER tokenizer
55
+ tokenizer = initialize_tokenizer(lang="igbo")
56
+ tokenized_sentence = tokenizer.tokenize("nnọọ, kedu ka ị mere")
57
+
58
+ # Initialize the LASER sentence encoder
59
+ encoder = initialize_encoder(lang="igbo")
60
+
61
+ # Encode tokenized sentences into embeddings
62
+ embeddings = encoder.encode_sentences([tokenized_sentence])
63
+ ```
64
+ >By default, the `spm` flag is set to `True` when initializing the encoder, ensuring the accompanying spm model is downloaded.
65
+
66
+ **Supported Languages:** You can specify any language from the [FLORES200](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200) dataset. This includes both languages identified by their full codes (like "ibo_Latn") and simpler alternatives (like "igbo").
67
+
68
+ ## Downloading the pre-trained models
69
+
70
+ If you prefer to download the models individually, you can use the following command:
71
+
72
+ ```sh
73
+ python -m laser_encoders.download_models --lang=your_preferred_language # e.g., --lang="igbo"
74
+ ```
75
+
76
+ By default, the downloaded models will be stored in the `~/.cache/laser_encoders` directory. To specify a different download location, utilize the following command:
77
+
78
+ ```sh
79
+ python -m laser_encoders.download_models --model-dir=path/to/model/directory
80
+ ```
81
+
82
+ > For a comprehensive list of available arguments, you can use the `--help` command with the download_models script.
83
+
84
+ Once you have successfully downloaded the models, you can utilize the `SentenceEncoder` to tokenize and encode your text in your desired language. Here's an example of how you can achieve this:
85
+
86
+ ```py
87
+ from laser_encoders.models import SentenceEncoder
88
+ from pathlib import Path
89
+
90
+ encoder = SentenceEncoder(model_path=path/to/downloaded/model, spm_model=Path(path/to/spm_model), spm_vocab=path/to/cvocab)
91
+ embeddings = encoder("This is a test sentence.")
92
+ ```
93
+ If you want to perform tokenization separately, you can do it as shown below:
94
+ ```py
95
+ from laser_encoders.laser_tokenizer import LaserTokenizer
96
+
97
+ tokenizer = LaserTokenizer(spm_model=Path(path/to/spm_model))
98
+
99
+ tokenized_sentence = tokenizer.tokenize("This is a test sentence.")
100
+
101
+ encoder = SentenceEncoder(model_path=path/to/downloaded/model, spm_vocab=path/to/cvocab)
102
+ embeddings = encoder.encode_sentences([tokenized_sentence])
103
+ ```
104
+
105
+ For tokenizing a file instead of a string, you can use the following:
106
+
107
+ ```py
108
+ tokenized_sentence = tokenizer.tokenize_file(inp_fname=Path(path/to/input_file.txt), out_fname=Path(path/to/output_file.txt))
109
+ ```
110
+
111
+ ### Now you can use these embeddings for downstream tasks
112
+
113
+ For more advanced usage and options, please refer to the official LASER repository documentation.
114
+
115
+ ## LASER Versions and Associated Packages
116
+
117
+ For users familiar with the earlier version of LASER, you might have encountered the [`laserembeddings`](https://pypi.org/project/laserembeddings/) package. This package primarily dealt with LASER-1 model embeddings.
118
+
119
+ For the latest LASER-2,3 models, use the newly introduced `laser_encoders` package, which offers better performance and support for a wider range of languages.
120
+
121
+
122
+ ## Contributing
123
+
124
+ We welcome contributions from the developer community to enhance and improve laser_encoders. If you'd like to contribute, you can:
125
+
126
+ 1. Submit bug reports or feature requests through GitHub issues.
127
+ 1. Fork the repository, make changes, and submit pull requests for review.
128
+
129
+ Please follow our [Contribution Guidelines](https://github.com/facebookresearch/LASER/blob/main/CONTRIBUTING.md) to ensure a smooth process.
130
+
131
+ ### Code of Conduct
132
+
133
+ We expect all contributors to adhere to our [Code of Conduct](https://github.com/facebookresearch/LASER/blob/main/CODE_OF_CONDUCT.md).
134
+
135
+ ### Contributors
136
+
137
+ The following people have contributed to this project:
138
+
139
+ - [Victor Joseph](https://github.com/CaptainVee)
140
+ - [Paul Okewunmi](https://github.com/Paulooh007)
141
+ - [Siddharth Singh Rana](https://github.com/NIXBLACK11)
142
+ - [David Dale](https://github.com/avidale/)
143
+ - [Holger Schwenk](https://github.com/hoschwenk)
144
+ - [Kevin Heffernan](https://github.com/heffernankevin)
145
+
146
+ ### License
147
+
148
+ This package is released under the [LASER](https://github.com/facebookresearch/LASER/blob/main/LICENSE) BSD License.
149
+
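One detail worth underlining from the README above: the `lang` argument accepts either a plain language name or its FLORES-200 code, and both resolve to the same encoder. A short sketch, assuming `laser_encoders` is installed and the models can be downloaded:

```python
from laser_encoders import LaserEncoderPipeline

# "igbo" and its FLORES-200 code "ibo_Latn" select the same LASER3 encoder.
encoder_by_name = LaserEncoderPipeline(lang="igbo")
encoder_by_code = LaserEncoderPipeline(lang="ibo_Latn")

embeddings = encoder_by_code.encode_sentences(["nnọọ, kedu ka ị mere"])
print(embeddings.shape)  # (1, 1024)
```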
laser/laser_encoders/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ #!/bin/bash
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # LASER Language-Agnostic SEntence Representations
9
+ # is a toolkit to calculate multilingual sentence embeddings
10
+ # and to use them for document classification, bitext filtering
11
+ # and mining
12
+ #
13
+ # -------------------------------------------------------
14
+
15
+ from laser_encoders.laser_tokenizer import initialize_tokenizer
16
+ from laser_encoders.models import LaserEncoderPipeline, initialize_encoder
laser/laser_encoders/download_models.py ADDED
@@ -0,0 +1,154 @@
1
+ #!/bin/bash
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # LASER Language-Agnostic SEntence Representations
9
+ # is a toolkit to calculate multilingual sentence embeddings
10
+ # and to use them for document classification, bitext filtering
11
+ # and mining
12
+ #
13
+ # -------------------------------------------------------
14
+ #
15
+ # This python script installs NLLB LASER2 and LASER3 sentence encoders from Amazon s3
16
+
17
+ import argparse
18
+ import logging
19
+ import os
20
+ import shutil
21
+ import sys
22
+ import tempfile
23
+ from pathlib import Path
24
+
25
+ import requests
26
+ from tqdm import tqdm
27
+
28
+ from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE, SPM_LANGUAGE
29
+
30
+ logging.basicConfig(
31
+ stream=sys.stdout,
32
+ level=logging.INFO,
33
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
34
+ )
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
+ class LaserModelDownloader:
39
+ def __init__(self, model_dir: str = None):
40
+ if model_dir is None:
41
+ model_dir = os.path.expanduser("~/.cache/laser_encoders")
42
+ os.makedirs(model_dir, exist_ok=True)
43
+
44
+ self.model_dir = Path(model_dir)
45
+ self.base_url = "https://dl.fbaipublicfiles.com/nllb/laser"
46
+
47
+ def download(self, filename: str):
48
+ # Because on windows os.path.join will use "\" insted of "/", so link would be:
49
+ # https://dl.fbaipublicfiles.com/nllb/laser\laser2.pt instead of https://dl.fbaipublicfiles.com/nllb/laser/laser2.pt
50
+ # which results in a failed download.
51
+ url = f"{self.base_url}/{filename}"
52
+ local_file_path = os.path.join(self.model_dir, filename)
53
+
54
+ if os.path.exists(local_file_path):
55
+ logger.info(f" - {filename} already downloaded")
56
+ else:
57
+ logger.info(f" - Downloading {filename}")
58
+
59
+ tf = tempfile.NamedTemporaryFile(delete=False)
60
+ temp_file_path = tf.name
61
+
62
+ with tf:
63
+ response = requests.get(url, stream=True)
64
+ total_size = int(response.headers.get("Content-Length", 0))
65
+ progress_bar = tqdm(total=total_size, unit_scale=True, unit="B")
66
+
67
+ for chunk in response.iter_content(chunk_size=1024):
68
+ tf.write(chunk)
69
+ progress_bar.update(len(chunk))
70
+ progress_bar.close()
71
+
72
+ shutil.move(temp_file_path, local_file_path)
73
+
74
+ def get_language_code(self, language_list: dict, lang: str) -> str:
75
+ try:
76
+ lang_3_4 = language_list[lang]
77
+ if isinstance(lang_3_4, list):
78
+ options = ", ".join(f"'{opt}'" for opt in lang_3_4)
79
+ raise ValueError(
80
+ f"Language '{lang}' has multiple options: {options}. Please specify using the 'lang' argument."
81
+ )
82
+ return lang_3_4
83
+ except KeyError:
84
+ raise ValueError(
85
+ f"language name: {lang} not found in language list. Specify a supported language name"
86
+ )
87
+
88
+ def download_laser2(self):
89
+ self.download("laser2.pt")
90
+ self.download("laser2.spm")
91
+ self.download("laser2.cvocab")
92
+
93
+ def download_laser3(self, lang: str, spm: bool = False):
94
+ result = self.get_language_code(LASER3_LANGUAGE, lang)
95
+
96
+ if isinstance(result, list):
97
+ raise ValueError(
98
+ f"There are script-specific models available for {lang}. Please choose one from the following: {result}"
99
+ )
100
+
101
+ lang = result
102
+ self.download(f"laser3-{lang}.v1.pt")
103
+ if spm:
104
+ if lang in SPM_LANGUAGE:
105
+ self.download(f"laser3-{lang}.v1.spm")
106
+ self.download(f"laser3-{lang}.v1.cvocab")
107
+ else:
108
+ self.download(f"laser2.spm")
109
+ self.download(f"laser2.cvocab")
110
+
111
+ def main(self, args):
112
+ if args.laser:
113
+ if args.laser == "laser2":
114
+ self.download_laser2()
115
+ elif args.laser == "laser3":
116
+ self.download_laser3(lang=args.lang, spm=args.spm)
117
+ else:
118
+ raise ValueError(
119
+ f"Unsupported laser model: {args.laser}. Choose either laser2 or laser3."
120
+ )
121
+ else:
122
+ if args.lang in LASER3_LANGUAGE:
123
+ self.download_laser3(lang=args.lang, spm=args.spm)
124
+ elif args.lang in LASER2_LANGUAGE:
125
+ self.download_laser2()
126
+ else:
127
+ raise ValueError(
128
+ f"Unsupported language name: {args.lang}. Please specify a supported language name using --lang."
129
+ )
130
+
131
+
132
+ if __name__ == "__main__":
133
+ parser = argparse.ArgumentParser(description="LASER: Download Laser models")
134
+ parser.add_argument(
135
+ "--laser",
136
+ type=str,
137
+ help="Laser model to download",
138
+ )
139
+ parser.add_argument(
140
+ "--lang",
141
+ type=str,
142
+ help="The language name in FLORES200 format",
143
+ )
144
+ parser.add_argument(
145
+ "--spm",
146
+ action="store_false",
147
+ help="Do not download the SPM model?",
148
+ )
149
+ parser.add_argument(
150
+ "--model-dir", type=str, help="The directory to download the models to"
151
+ )
152
+ args = parser.parse_args()
153
+ downloader = LaserModelDownloader(args.model_dir)
154
+ downloader.main(args)
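Besides the command-line entry point above, `LaserModelDownloader` can also be driven directly from Python. A minimal sketch based on the methods defined in this file:

```python
from laser_encoders.download_models import LaserModelDownloader

downloader = LaserModelDownloader()   # defaults to ~/.cache/laser_encoders
downloader.download_laser2()          # laser2.pt, laser2.spm, laser2.cvocab

# LASER3 models are per-language; spm=True also fetches the matching SentencePiece files.
downloader.download_laser3(lang="ibo_Latn", spm=True)
```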
laser/laser_encoders/language_list.py ADDED
@@ -0,0 +1,564 @@
1
+ #!/bin/bash
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # LASER Language-Agnostic SEntence Representations
9
+ # is a toolkit to calculate multilingual sentence embeddings
10
+ # and to use them for document classification, bitext filtering
11
+ # and mining
12
+ #
13
+ # -------------------------------------------------------
14
+ # Language mapping to handle different language codes and names
15
+
16
+
17
+ def build_language_names_dict(language_list: list, language_names: dict) -> dict:
18
+ """
19
+ Build a dictionary mapping language names to their corresponding language codes.
20
+
21
+ Parameters:
22
+ - language_list (list): A list of language codes.
23
+ - language_names (dict): A dictionary mapping language codes to language names.
24
+
25
+ Returns:
26
+ - dict: A dictionary mapping language names to their corresponding language codes.
27
+ """
28
+ result_dict = {}
29
+
30
+ for lang_code in language_list:
31
+ if lang_code not in language_names:
32
+ raise ValueError(
33
+ f"Language code '{lang_code}' not found in the provided language_names dictionary."
34
+ )
35
+
36
+ names_list = language_names[lang_code]
37
+
38
+ # Ensure names_list is always a list
39
+ if not isinstance(names_list, list):
40
+ names_list = [names_list]
41
+
42
+ for name in names_list:
43
+ if name not in result_dict:
44
+ result_dict[name] = []
45
+ result_dict[name].append(lang_code)
46
+
47
+ # Remove single-element lists and convert them to the element itself
48
+ for key in result_dict:
49
+ if len(result_dict[key]) == 1:
50
+ result_dict[key] = result_dict[key][0]
51
+
52
+ return result_dict
53
+
54
+
55
+ SPM_LANGUAGE = [
56
+ "amh_Ethi",
57
+ "ayr_Latn",
58
+ "azj_Latn",
59
+ "bak_Cyrl",
60
+ "bel_Cyrl",
61
+ "bod_Tibt",
62
+ "ckb_Arab",
63
+ "crh_Latn",
64
+ "dik_Latn",
65
+ "dzo_Tibt",
66
+ "fur_Latn",
67
+ "fuv_Latn",
68
+ "grn_Latn",
69
+ "kab_Latn",
70
+ "kac_Latn",
71
+ "kaz_Cyrl",
72
+ "kir_Cyrl",
73
+ "kmr_Latn",
74
+ "lij_Latn",
75
+ "lim_Latn",
76
+ "lmo_Latn",
77
+ "ltg_Latn",
78
+ "mya_Mymr",
79
+ "pbt_Arab",
80
+ "pes_Arab",
81
+ "prs_Arab",
82
+ "sat_Beng",
83
+ "scn_Latn",
84
+ "srd_Latn",
85
+ "szl_Latn",
86
+ "taq_Latn",
87
+ "tgk_Cyrl",
88
+ "tir_Ethi",
89
+ "tzm_Tfng",
90
+ "vec_Latn",
91
+ ]
92
+
93
+
94
+ ##################################
95
+ ###### LANGUAGE NAMES ############
96
+ ##################################
97
+
98
+ LANGUAGE_NAMES = {
99
+ "ace_Arab": ["acehnese", "ace", "ace_Arab"],
100
+ "ace_Latn": ["acehnese", "ace", "ace_Latn"],
101
+ "acm_Arab": ["mesopotamian arabic", "acm", "acm_Arab"],
102
+ "acq_Arab": ["ta’izzi-adeni arabic", "acq", "acq_Arab"],
103
+ "aeb_Arab": ["tunisian arabic", "aeb", "aeb_Arab"],
104
+ "afr_Latn": ["afrikaans", "afr", "afr_Latn"],
105
+ "ajp_Arab": ["south levantine arabic", "ajp", "ajp_Arab"],
106
+ "aka_Latn": ["akan", "aka", "aka_Latn"],
107
+ "amh_Ethi": ["amharic", "amh", "amh_Ethi"],
108
+ "apc_Arab": ["north levantine arabic", "apc", "apc_Arab"],
109
+ "arb_Arab": ["modern standard arabic", "arb", "arb_Arab"],
110
+ "arb_Latn": ["modern standard arabic", "arb", "arb_Latn"],
111
+ "ars_Arab": ["najdi arabic", "ars", "ars_Arab"],
112
+ "ary_Arab": ["moroccan arabic", "ary", "ary_Arab"],
113
+ "arz_Arab": ["egyptian arabic", "arz", "arz_Arab"],
114
+ "asm_Beng": ["assamese", "asm", "asm_Beng"],
115
+ "ast_Latn": ["asturian", "ast", "ast_Latn"],
116
+ "awa_Deva": ["awadhi", "awa", "awa_Deva"],
117
+ "ayr_Latn": ["central aymara", "ayr", "ayr_Latn"],
118
+ "azb_Arab": ["south azerbaijani", "azb", "azb_Arab"],
119
+ "azj_Latn": ["north azerbaijani", "azj", "azj_Latn"],
120
+ "bak_Cyrl": ["bashkir", "bak", "bak_Cyrl"],
121
+ "bam_Latn": ["bambara", "bam", "bam_Latn"],
122
+ "ban_Latn": ["balinese", "ban", "ban_Latn"],
123
+ "bel_Cyrl": ["belarusian", "bel", "bel_Cyrl"],
124
+ "bem_Latn": ["bemba", "bem", "bem_Latn"],
125
+ "ben_Beng": ["bengali", "ben", "ben_Beng"],
126
+ "bho_Deva": ["bhojpuri", "bho", "bho_Deva"],
127
+ "bjn_Arab": ["banjar", "bjn", "bjn_Arab"],
128
+ "bjn_Latn": ["banjar", "bjn", "bjn_Latn"],
129
+ "bod_Tibt": ["standard tibetan", "bod", "bod_Tibt"],
130
+ "bos_Latn": ["bosnian", "bos", "bos_Latn"],
131
+ "bug_Latn": ["buginese", "bug", "bug_Latn"],
132
+ "bul_Cyrl": ["bulgarian", "bul", "bul_Cyrl"],
133
+ "cat_Latn": ["catalan", "cat", "cat_Latn"],
134
+ "ceb_Latn": ["cebuano", "ceb", "ceb_Latn"],
135
+ "ces_Latn": ["czech", "ces", "ces_Latn"],
136
+ "cjk_Latn": ["chokwe", "cjk", "cjk_Latn"],
137
+ "ckb_Arab": ["central kurdish", "ckb", "ckb_Arab"],
138
+ "crh_Latn": ["crimean tatar", "crh", "crh_Latn"],
139
+ "cym_Latn": ["welsh", "cym", "cym_Latn"],
140
+ "dan_Latn": ["danish", "dan", "dan_Latn"],
141
+ "deu_Latn": ["german", "deu", "deu_Latn"],
142
+ "dik_Latn": ["southwestern dinka", "dik", "dik_Latn"],
143
+ "dyu_Latn": ["dyula", "dyu", "dyu_Latn"],
144
+ "dzo_Tibt": ["dzongkha", "dzo", "dzo_Tibt"],
145
+ "ell_Grek": ["greek", "ell", "ell_Grek"],
146
+ "eng_Latn": ["english", "eng", "eng_Latn"],
147
+ "epo_Latn": ["esperanto", "epo", "epo_Latn"],
148
+ "est_Latn": ["estonian", "est", "est_Latn"],
149
+ "eus_Latn": ["basque", "eus", "eus_Latn"],
150
+ "ewe_Latn": ["ewe", "ewe_Latn"],
151
+ "fao_Latn": ["faroese", "fao", "fao_Latn"],
152
+ "fij_Latn": ["fijian", "fij", "fij_Latn"],
153
+ "fin_Latn": ["finnish", "fin", "fin_Latn"],
154
+ "fon_Latn": ["fon", "fon_Latn"],
155
+ "fra_Latn": ["french", "fra", "fra_Latn"],
156
+ "fur_Latn": ["friulian", "fur", "fur_Latn"],
157
+ "fuv_Latn": ["nigerian fulfulde", "fuv", "fuv_Latn"],
158
+ "gla_Latn": ["scottish gaelic", "gla", "gla_Latn"],
159
+ "gle_Latn": ["irish", "gle", "gle_Latn"],
160
+ "glg_Latn": ["galician", "glg", "glg_Latn"],
161
+ "grn_Latn": ["guarani", "grn", "grn_Latn"],
162
+ "guj_Gujr": ["gujarati", "guj", "guj_Gujr"],
163
+ "hat_Latn": ["haitian creole", "hat", "hat_Latn"],
164
+ "hau_Latn": ["hausa", "hau", "hau_Latn"],
165
+ "heb_Hebr": ["hebrew", "heb", "heb_Hebr"],
166
+ "hin_Deva": ["hindi", "hin", "hin_Deva"],
167
+ "hne_Deva": ["chhattisgarhi", "hne", "hne_Deva"],
168
+ "hrv_Latn": ["croatian", "hrv", "hrv_Latn"],
169
+ "hun_Latn": ["hungarian", "hun", "hun_Latn"],
170
+ "hye_Armn": ["armenian", "hye", "hye_Armn"],
171
+ "ibo_Latn": ["igbo", "ibo", "ibo_Latn"],
172
+ "ilo_Latn": ["ilocano", "ilo", "ilo_Latn"],
173
+ "ind_Latn": ["indonesian", "ind", "ind_Latn"],
174
+ "isl_Latn": ["icelandic", "isl", "isl_Latn"],
175
+ "ita_Latn": ["italian", "ita", "ita_Latn"],
176
+ "jav_Latn": ["javanese", "jav", "jav_Latn"],
177
+ "jpn_Jpan": ["japanese", "jpn", "jpn_Jpan"],
178
+ "kab_Latn": ["kabyle", "kab", "kab_Latn"],
179
+ "kac_Latn": ["jingpho", "kac", "kac_Latn"],
180
+ "kam_Latn": ["kamba", "kam", "kam_Latn"],
181
+ "kan_Knda": ["kannada", "kan", "kan_Knda"],
182
+ "kas_Arab": ["kashmiri", "kas", "kas_Arab"],
183
+ "kas_Deva": ["kashmiri", "kas", "kas_Deva"],
184
+ "kat_Geor": ["georgian", "kat", "kat_Geor"],
185
+ "knc_Arab": ["central kanuri", "knc", "knc_Arab"],
186
+ "knc_Latn": ["central kanuri", "knc", "knc_Latn"],
187
+ "kaz_Cyrl": ["kazakh", "kaz", "kaz_Cyrl"],
188
+ "kbp_Latn": ["kabiyè", "kbp", "kbp_Latn"],
189
+ "kea_Latn": ["kabuverdianu", "kea", "kea_Latn"],
190
+ "khm_Khmr": ["khmer", "khm", "khm_Khmr"],
191
+ "kik_Latn": ["kikuyu", "kik", "kik_Latn"],
192
+ "kin_Latn": ["kinyarwanda", "kin", "kin_Latn"],
193
+ "kir_Cyrl": ["kyrgyz", "kir", "kir_Cyrl"],
194
+ "kmb_Latn": ["kimbundu", "kmb", "kmb_Latn"],
195
+ "kmr_Latn": ["northern kurdish", "kmr", "kmr_Latn"],
196
+ "kon_Latn": ["kikongo", "kon", "kon_Latn"],
197
+ "kor_Hang": ["korean", "kor", "kor_Hang"],
198
+ "lao_Laoo": ["lao", "lao_Laoo"],
199
+ "lij_Latn": ["ligurian", "lij", "lij_Latn"],
200
+ "lim_Latn": ["limburgish", "lim", "lim_Latn"],
201
+ "lin_Latn": ["lingala", "lin", "lin_Latn"],
202
+ "lit_Latn": ["lithuanian", "lit", "lit_Latn"],
203
+ "lmo_Latn": ["lombard", "lmo", "lmo_Latn"],
204
+ "ltg_Latn": ["latgalian", "ltg", "ltg_Latn"],
205
+ "ltz_Latn": ["luxembourgish", "ltz", "ltz_Latn"],
206
+ "lua_Latn": ["luba-kasai", "lua", "lua_Latn"],
207
+ "lug_Latn": ["ganda", "lug", "lug_Latn"],
208
+ "luo_Latn": ["luo", "luo_Latn"],
209
+ "lus_Latn": ["mizo", "lus", "lus_Latn"],
210
+ "lvs_Latn": ["standard latvian", "lvs", "lvs_Latn"],
211
+ "mag_Deva": ["magahi", "mag", "mag_Deva"],
212
+ "mai_Deva": ["maithili", "mai", "mai_Deva"],
213
+ "mal_Mlym": ["malayalam", "mal", "mal_Mlym"],
214
+ "mar_Deva": ["marathi", "mar", "mar_Deva"],
215
+ "min_Arab": ["minangkabau", "min", "min_Arab"],
216
+ "min_Latn": ["minangkabau", "min", "min_Latn"],
217
+ "mkd_Cyrl": ["macedonian", "mkd", "mkd_Cyrl"],
218
+ "plt_Latn": ["plateau malagasy", "plt", "plt_Latn"],
219
+ "mlt_Latn": ["maltese", "mlt", "mlt_Latn"],
220
+ "mni_Beng": ["meitei", "mni", "mni_Beng"],
221
+ "khk_Cyrl": ["halh mongolian", "khk", "khk_Cyrl"],
222
+ "mos_Latn": ["mossi", "mos", "mos_Latn"],
223
+ "mri_Latn": ["maori", "mri", "mri_Latn"],
224
+ "mya_Mymr": ["burmese", "mya", "mya_Mymr"],
225
+ "nld_Latn": ["dutch", "nld", "nld_Latn"],
226
+ "nno_Latn": ["norwegian nynorsk", "nno", "nno_Latn"],
227
+ "nob_Latn": ["norwegian bokmål", "nob", "nob_Latn"],
228
+ "npi_Deva": ["nepali", "npi", "npi_Deva"],
229
+ "nso_Latn": ["northern sotho", "nso", "nso_Latn"],
230
+ "nus_Latn": ["nuer", "nus", "nus_Latn"],
231
+ "nya_Latn": ["nyanja", "nya", "nya_Latn"],
232
+ "oci_Latn": ["occitan", "oci", "oci_Latn"],
233
+ "gaz_Latn": ["west central oromo", "gaz", "gaz_Latn"],
234
+ "ory_Orya": ["odia", "ory", "ory_Orya"],
235
+ "pag_Latn": ["pangasinan", "pag", "pag_Latn"],
236
+ "pan_Guru": ["eastern panjabi", "pan", "pan_Guru"],
237
+ "pap_Latn": ["papiamento", "pap", "pap_Latn"],
238
+ "pes_Arab": ["western persian", "pes", "pes_Arab"],
239
+ "pol_Latn": ["polish", "pol", "pol_Latn"],
240
+ "por_Latn": ["portuguese", "por", "por_Latn"],
241
+ "prs_Arab": ["dari", "prs", "prs_Arab"],
242
+ "pbt_Arab": ["southern pashto", "pbt", "pbt_Arab"],
243
+ "quy_Latn": ["ayacucho quechua", "quy", "quy_Latn"],
244
+ "ron_Latn": ["romanian", "ron", "ron_Latn"],
245
+ "run_Latn": ["rundi", "run", "run_Latn"],
246
+ "rus_Cyrl": ["russian", "rus", "rus_Cyrl"],
247
+ "sag_Latn": ["sango", "sag", "sag_Latn"],
248
+ "san_Deva": ["sanskrit", "san", "san_Deva"],
249
+ "sat_Olck": ["santali", "sat", "sat_Olck"],
250
+ "scn_Latn": ["sicilian", "scn", "scn_Latn"],
251
+ "shn_Mymr": ["shan", "shn", "shn_Mymr"],
252
+ "sin_Sinh": ["sinhala", "sin", "sin_Sinh"],
253
+ "slk_Latn": ["slovak", "slk", "slk_Latn"],
254
+ "slv_Latn": ["slovenian", "slv", "slv_Latn"],
255
+ "smo_Latn": ["samoan", "smo", "smo_Latn"],
256
+ "sna_Latn": ["shona", "sna", "sna_Latn"],
257
+ "snd_Arab": ["sindhi", "snd", "snd_Arab"],
258
+ "som_Latn": ["somali", "som", "som_Latn"],
259
+ "sot_Latn": ["southern sotho", "sot", "sot_Latn"],
260
+ "spa_Latn": ["spanish", "spa", "spa_Latn"],
261
+ "als_Latn": ["tosk albanian", "als", "als_Latn"],
262
+ "srd_Latn": ["sardinian", "srd", "srd_Latn"],
263
+ "srp_Cyrl": ["serbian", "srp", "srp_Cyrl"],
264
+ "ssw_Latn": ["swati", "ssw", "ssw_Latn"],
265
+ "sun_Latn": ["sundanese", "sun", "sun_Latn"],
266
+ "swe_Latn": ["swedish", "swe", "swe_Latn"],
267
+ "swh_Latn": ["swahili", "swh", "swh_Latn"],
268
+ "szl_Latn": ["silesian", "szl", "szl_Latn"],
269
+ "tam_Taml": ["tamil", "tam", "tam_Taml"],
270
+ "tat_Cyrl": ["tatar", "tat", "tat_Cyrl"],
271
+ "tel_Telu": ["telugu", "tel", "tel_Telu"],
272
+ "tgk_Cyrl": ["tajik", "tgk", "tgk_Cyrl"],
273
+ "tgl_Latn": ["tagalog", "tgl", "tgl_Latn"],
274
+ "tha_Thai": ["thai", "tha", "tha_Thai"],
275
+ "tir_Ethi": ["tigrinya", "tir", "tir_Ethi"],
276
+ "taq_Latn": ["tamasheq", "taq", "taq_Latn"],
277
+ "taq_Tfng": ["tamasheq", "taq", "taq_Tfng"],
278
+ "tpi_Latn": ["tok pisin", "tpi", "tpi_Latn"],
279
+ "tsn_Latn": ["tswana", "tsn", "tsn_Latn"],
280
+ "tso_Latn": ["tsonga", "tso", "tso_Latn"],
281
+ "tuk_Latn": ["turkmen", "tuk", "tuk_Latn"],
282
+ "tum_Latn": ["tumbuka", "tum", "tum_Latn"],
283
+ "tur_Latn": ["turkish", "tur", "tur_Latn"],
284
+ "twi_Latn": ["twi", "twi_Latn"],
285
+ "tzm_Tfng": ["central atlas tamazight", "tzm", "tzm_Tfng"],
286
+ "uig_Arab": ["uyghur", "uig", "uig_Arab"],
287
+ "ukr_Cyrl": ["ukrainian", "ukr", "ukr_Cyrl"],
288
+ "umb_Latn": ["umbundu", "umb", "umb_Latn"],
289
+ "urd_Arab": ["urdu", "urd", "urd_Arab"],
290
+ "uzn_Latn": ["northern uzbek", "uzn", "uzn_Latn"],
291
+ "vec_Latn": ["venetian", "vec", "vec_Latn"],
292
+ "vie_Latn": ["vietnamese", "vie", "vie_Latn"],
293
+ "war_Latn": ["waray", "war", "war_Latn"],
294
+ "wol_Latn": ["wolof", "wol", "wol_Latn"],
295
+ "xho_Latn": ["xhosa", "xho", "xho_Latn"],
296
+ "ydd_Hebr": ["eastern yiddish", "ydd", "ydd_Hebr"],
297
+ "yor_Latn": ["yoruba", "yor", "yor_Latn"],
298
+ "yue_Hant": ["yue chinese", "yue", "yue_Hant"],
299
+ "zho_Hans": ["chinese", "zho", "zho_Hans"],
300
+ "zho_Hant": ["chinese", "zho", "zho_Hant"],
301
+ "zsm_Latn": ["standard malay", "zsm", "zsm_Latn"],
302
+ "zul_Latn": ["zulu", "zul", "zul_Latn"],
303
+ "diq_Latn": ["southern zaza", "diq", "diq_Latn"],
304
+ "sat_Beng": ["santali", "sat", "sat_Beng"],
305
+ }
306
+
307
+ ##################################
308
+ ###### LASER 3 ###################
309
+ ##################################
310
+
311
+ LASER3_LANGUAGES_LIST = [
312
+ "ace_Latn",
313
+ "aka_Latn",
314
+ "als_Latn",
315
+ "amh_Ethi",
316
+ "asm_Beng",
317
+ "awa_Deva",
318
+ "ayr_Latn",
319
+ "azb_Arab",
320
+ "azj_Latn",
321
+ "bak_Cyrl",
322
+ "bam_Latn",
323
+ "ban_Latn",
324
+ "bel_Cyrl",
325
+ "bem_Latn",
326
+ "ben_Beng",
327
+ "bho_Deva",
328
+ "bjn_Latn",
329
+ "bod_Tibt",
330
+ "bug_Latn",
331
+ "ceb_Latn",
332
+ "cjk_Latn",
333
+ "ckb_Arab",
334
+ "crh_Latn",
335
+ "cym_Latn",
336
+ "dik_Latn",
337
+ "diq_Latn",
338
+ "dyu_Latn",
339
+ "dzo_Tibt",
340
+ "ewe_Latn",
341
+ "fao_Latn",
342
+ "fij_Latn",
343
+ "fon_Latn",
344
+ "fur_Latn",
345
+ "fuv_Latn",
346
+ "gaz_Latn",
347
+ "gla_Latn",
348
+ "gle_Latn",
349
+ "grn_Latn",
350
+ "guj_Gujr",
351
+ "hat_Latn",
352
+ "hau_Latn",
353
+ "hin_Deva",
354
+ "hne_Deva",
355
+ "hye_Armn",
356
+ "ibo_Latn",
357
+ "ilo_Latn",
358
+ "ind_Latn",
359
+ "jav_Latn",
360
+ "kab_Latn",
361
+ "kac_Latn",
362
+ "kam_Latn",
363
+ "kan_Knda",
364
+ "kas_Arab",
365
+ "kas_Deva",
366
+ "kat_Geor",
367
+ "kaz_Cyrl",
368
+ "kbp_Latn",
369
+ "kea_Latn",
370
+ "khk_Cyrl",
371
+ "khm_Khmr",
372
+ "kik_Latn",
373
+ "kin_Latn",
374
+ "kir_Cyrl",
375
+ "kmb_Latn",
376
+ "kmr_Latn",
377
+ "knc_Arab",
378
+ "knc_Latn",
379
+ "kon_Latn",
380
+ "lao_Laoo",
381
+ "lij_Latn",
382
+ "lim_Latn",
383
+ "lin_Latn",
384
+ "lmo_Latn",
385
+ "ltg_Latn",
386
+ "ltz_Latn",
387
+ "lua_Latn",
388
+ "lug_Latn",
389
+ "luo_Latn",
390
+ "lus_Latn",
391
+ "mag_Deva",
392
+ "mai_Deva",
393
+ "mal_Mlym",
394
+ "mar_Deva",
395
+ "min_Latn",
396
+ "mlt_Latn",
397
+ "mni_Beng",
398
+ "mos_Latn",
399
+ "mri_Latn",
400
+ "mya_Mymr",
401
+ "npi_Deva",
402
+ "nso_Latn",
403
+ "nus_Latn",
404
+ "nya_Latn",
405
+ "ory_Orya",
406
+ "pag_Latn",
407
+ "pan_Guru",
408
+ "pap_Latn",
409
+ "pbt_Arab",
410
+ "pes_Arab",
411
+ "plt_Latn",
412
+ "prs_Arab",
413
+ "quy_Latn",
414
+ "run_Latn",
415
+ "sag_Latn",
416
+ "san_Deva",
417
+ "sat_Beng",
418
+ "scn_Latn",
419
+ "shn_Mymr",
420
+ "sin_Sinh",
421
+ "smo_Latn",
422
+ "sna_Latn",
423
+ "snd_Arab",
424
+ "som_Latn",
425
+ "sot_Latn",
426
+ "srd_Latn",
427
+ "ssw_Latn",
428
+ "sun_Latn",
429
+ "swh_Latn",
430
+ "szl_Latn",
431
+ "tam_Taml",
432
+ "taq_Latn",
433
+ "tat_Cyrl",
434
+ "tel_Telu",
435
+ "tgk_Cyrl",
436
+ "tgl_Latn",
437
+ "tha_Thai",
438
+ "tir_Ethi",
439
+ "tpi_Latn",
440
+ "tsn_Latn",
441
+ "tso_Latn",
442
+ "tuk_Latn",
443
+ "tum_Latn",
444
+ "tur_Latn",
445
+ "twi_Latn",
446
+ "tzm_Tfng",
447
+ "uig_Arab",
448
+ "umb_Latn",
449
+ "urd_Arab",
450
+ "uzn_Latn",
451
+ "vec_Latn",
452
+ "war_Latn",
453
+ "wol_Latn",
454
+ "xho_Latn",
455
+ "ydd_Hebr",
456
+ "yor_Latn",
457
+ "zsm_Latn",
458
+ "zul_Latn",
459
+ ]
460
+
461
+
462
+ LASER3_LANGUAGE = build_language_names_dict(LASER3_LANGUAGES_LIST, LANGUAGE_NAMES)
463
+
464
+ ##################################
465
+ ###### LASER 2 ###################
466
+ ##################################
467
+
468
+ LASER2_LANGUAGES_LIST = [
469
+ "acm_Arab",
470
+ "acq_Arab",
471
+ "aeb_Arab",
472
+ "afr_Latn",
473
+ "ajp_Arab",
474
+ "amh_Ethi",
475
+ "apc_Arab",
476
+ "arb_Arab",
477
+ "arb_Latn",
478
+ "ars_Arab",
479
+ "ary_Arab",
480
+ "arz_Arab",
481
+ "ayr_Latn",
482
+ "azb_Arab",
483
+ "azj_Latn",
484
+ "bel_Cyrl",
485
+ "ben_Beng",
486
+ "bos_Latn",
487
+ "bul_Cyrl",
488
+ "cat_Latn",
489
+ "ces_Latn",
490
+ "ckb_Arab",
491
+ "crh_Latn",
492
+ "dan_Latn",
493
+ "deu_Latn",
494
+ "ell_Grek",
495
+ "eng_Latn",
496
+ "epo_Latn",
497
+ "est_Latn",
498
+ "eus_Latn",
499
+ "fin_Latn",
500
+ "fra_Latn",
501
+ "gle_Latn",
502
+ "glg_Latn",
503
+ "hau_Latn",
504
+ "heb_Hebr",
505
+ "hin_Deva",
506
+ "hrv_Latn",
507
+ "hun_Latn",
508
+ "hye_Armn",
509
+ "ind_Latn",
510
+ "isl_Latn",
511
+ "ita_Latn",
512
+ "jpn_Jpan",
513
+ "kab_Latn",
514
+ "kat_Geor",
515
+ "kaz_Cyrl",
516
+ "khm_Khmr",
517
+ "kmr_Latn",
518
+ "kor_Hang",
519
+ "lit_Latn",
520
+ "lvs_Latn",
521
+ "mal_Mlym",
522
+ "mar_Deva",
523
+ "mkd_Cyrl",
524
+ "plt_Latn",
525
+ "mya_Mymr",
526
+ "nld_Latn",
527
+ "nob_Latn",
528
+ "oci_Latn",
529
+ "pes_Arab",
530
+ "pol_Latn",
531
+ "por_Latn",
532
+ "ron_Latn",
533
+ "rus_Cyrl",
534
+ "sin_Sinh",
535
+ "slk_Latn",
536
+ "slv_Latn",
537
+ "snd_Arab",
538
+ "som_Latn",
539
+ "spa_Latn",
540
+ "als_Latn",
541
+ "srp_Cyrl",
542
+ "swe_Latn",
543
+ "swh_Latn",
544
+ "tam_Taml",
545
+ "tat_Cyrl",
546
+ "tel_Telu",
547
+ "tgk_Cyrl",
548
+ "tgl_Latn",
549
+ "tha_Thai",
550
+ "tur_Latn",
551
+ "uig_Arab",
552
+ "ukr_Cyrl",
553
+ "urd_Arab",
554
+ "uzn_Latn",
555
+ "vie_Latn",
556
+ "yue_Hant",
557
+ "zho_Hans",
559
+ "zho_Hant",
560
+ "zsm_Latn",
561
+ ]
562
+
563
+
564
+ LASER2_LANGUAGE = build_language_names_dict(LASER2_LANGUAGES_LIST, LANGUAGE_NAMES)
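For orientation, here is a minimal sketch of how these lookup tables might be queried (assuming `build_language_names_dict`, referenced above, keys each dictionary by every name/code variant listed in LANGUAGE_NAMES):

# Hedged usage sketch; the expected values follow from the entries above.
from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE

print("english" in LASER2_LANGUAGE)  # True: eng_Latn appears in LASER2_LANGUAGES_LIST
print("igbo" in LASER3_LANGUAGE)     # True: ibo_Latn appears in LASER3_LANGUAGES_LIST
print("klingon" in LASER3_LANGUAGE)  # False: not a supported language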
laser/laser_encoders/laser_tokenizer.py ADDED
@@ -0,0 +1,179 @@
1
+ #!/usr/bin/python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # LASER Language-Agnostic SEntence Representations
9
+ # is a toolkit to calculate multilingual sentence embeddings
10
+ # and to use them for document classification, bitext filtering
11
+ # and mining
12
+ #
13
+ # --------------------------------------------------------
14
+ #
15
+ # Helper functions for tokenization
16
+
17
+ import gzip
18
+ import logging
19
+ import os
20
+ import re
21
+ import sys
22
+ from pathlib import Path
23
+ from typing import IO, List
24
+
25
+ import sentencepiece as spm
26
+ from sacremoses import MosesDetokenizer, MosesPunctNormalizer
27
+ from unicategories import categories
28
+
29
+ from laser_encoders.download_models import LaserModelDownloader
30
+ from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE, SPM_LANGUAGE
31
+
32
+ SPACE_NORMALIZER = re.compile(r"\s+")
33
+ NON_PRINT_CHARS = set(c for c in categories["C"].characters())
34
+
35
+ logging.basicConfig(
36
+ stream=sys.stdout,
37
+ level=logging.INFO,
38
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
39
+ )
40
+ logger = logging.getLogger("preprocess")
41
+
42
+
43
+ class LaserTokenizer:
44
+ def __init__(
45
+ self,
46
+ spm_model: Path,
47
+ lang: str = "en",
48
+ lower_case: bool = True,
49
+ descape: bool = False,
50
+ verbose: bool = False,
51
+ over_write: bool = False,
52
+ normalize_punct: bool = True,
53
+ ):
54
+ self.spm_model = spm_model
55
+ self.lang = lang
56
+ self.lower_case = lower_case
57
+ self.descape = descape
58
+ self.verbose = verbose
59
+ self.over_write = over_write
60
+ self.normalize_punct = normalize_punct
61
+
62
+ assert spm_model.exists(), f"spm model file: {spm_model} does not exist"
63
+ self.moses_punct_normalizer = MosesPunctNormalizer(self.lang, perl_parity=True)
64
+ # add parity with MOSES release-4.0
65
+ self.moses_punct_normalizer.substitutions[21] = ("‘", r'"')
66
+ self.moses_punct_normalizer.substitutions[22] = ("‚", r'"')
67
+ self.moses_detokenizer = MosesDetokenizer()
68
+ self.spm_encoder = spm.SentencePieceProcessor(model_file=str(self.spm_model))
69
+
70
+ def open(self, file: Path, mode: str, encoding="utf-8") -> IO:
71
+ return (
72
+ gzip.open(file, mode, encoding=encoding)
73
+ if file.name.endswith(".gz")
74
+ else open(file, mode, encoding=encoding)
75
+ )
76
+
77
+ def log(self, message: str) -> None:
78
+ if self.verbose:
79
+ logger.info(message)
80
+
81
+ def tokenize(self, text: str) -> str:
82
+ # Preprocessing
83
+ sentence_text = "".join([c if c not in NON_PRINT_CHARS else " " for c in text])
84
+ if self.normalize_punct:
85
+ sentence_text = self.moses_punct_normalizer.normalize(sentence_text)
86
+ if self.descape:
87
+ sentence_text = self.moses_detokenizer.unescape_xml(text=sentence_text)
88
+ if self.lower_case:
89
+ sentence_text = sentence_text.lower()
90
+
91
+ # SentencePiece encoding
92
+ encoded_text = " ".join(self.spm_encoder.encode(sentence_text, out_type=str))
93
+ return encoded_text
94
+
95
+ def tokenize_file(self, inp_fname: Path, out_fname: Path) -> None:
96
+ if not self.over_write and out_fname.exists():
97
+ self.log(f"tokenized file {out_fname.name} already exists")
98
+ return
99
+ else:
100
+ self.log(
101
+ f"tokenizing {inp_fname.name}"
102
+ + f"{' (de-escaped)' if self.descape else ''}"
103
+ + f"{' (lower-cased)' if self.lower_case else ' (cased)'} "
104
+ + f"(punctuation-normalization lang: {self.lang})"
105
+ )
106
+
107
+ with self.open(inp_fname, "rt") as file_in, open(
108
+ out_fname, "w"
109
+ ) as file_out:
110
+ for line in file_in:
111
+ tokens = self.tokenize(line.strip())
112
+ file_out.write(tokens + "\n")
113
+
114
+ def __call__(self, text_or_batch):
115
+ if isinstance(text_or_batch, str):
116
+ return self.tokenize(text_or_batch)
117
+ else:
118
+ return self.tokenize_batch(text_or_batch)
119
+
120
+ def tokenize_batch(self, batch: List[str]) -> List[List[str]]:
121
+ return [self.tokenize(text) for text in batch]
122
+
123
+ def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
124
+ return [self.spm_encoder.DecodeIds(token_id) for token_id in ids]
125
+
126
+ def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
127
+ ids = []
128
+
129
+ for token in tokens:
130
+ # Apply the same whitespace normalization as in the _tokenize method
131
+ pieces = SPACE_NORMALIZER.sub(" ", token).strip().split()
132
+ 
133
+ # Collect the ids for this token's pieces
134
+ token_ids = []
135
+ 
136
+ for piece in pieces:
137
+ piece_id = self.spm_encoder.PieceToId(piece)
138
+ if piece_id == 0:  # Handle out-of-vocabulary pieces
139
+ piece_id = self.spm_encoder.PieceToId("<unk>")
140
+ token_ids.append(piece_id)
141
+ 
142
+ # Append this token's piece ids to the final list
143
+ ids.extend(token_ids)
144
+
145
+ return ids
146
+
147
+
148
+ def initialize_tokenizer(lang: str = None, model_dir: str = None, laser: str = None):
149
+ downloader = LaserModelDownloader(model_dir)
150
+ if laser is not None:
151
+ if laser == "laser3":
152
+ lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
153
+ if lang in SPM_LANGUAGE:
154
+ filename = f"laser3-{lang}.v1.spm"
155
+ else:
156
+ filename = "laser2.spm"
157
+ elif laser == "laser2":
158
+ filename = "laser2.spm"
159
+ else:
160
+ raise ValueError(
161
+ f"Unsupported laser model: {laser}. Choose either laser2 or laser3."
162
+ )
163
+ else:
164
+ if lang in LASER3_LANGUAGE:
165
+ lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
166
+ if lang in SPM_LANGUAGE:
167
+ filename = f"laser3-{lang}.v1.spm"
168
+ else:
169
+ filename = "laser2.spm"
170
+ elif lang in LASER2_LANGUAGE:
171
+ filename = "laser2.spm"
172
+ else:
173
+ raise ValueError(
174
+ f"Unsupported language name: {lang}. Please specify a supported language name."
175
+ )
176
+
177
+ downloader.download(filename)
178
+ model_path = os.path.join(downloader.model_dir, filename)
179
+ return LaserTokenizer(spm_model=Path(model_path))
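A brief usage sketch for the tokenizer above, assuming the package is installed; `initialize_tokenizer` downloads `laser2.spm` (or a language-specific `laser3-*.spm`) into `model_dir` on first use, and the directory path below is only an example:

from laser_encoders.laser_tokenizer import initialize_tokenizer

tokenizer = initialize_tokenizer(lang="english", model_dir="./laser_models")  # "english" resolves via LASER2_LANGUAGE
print(tokenizer.tokenize("Hello world!"))           # one sentence -> space-joined SentencePiece pieces
print(tokenizer(["Hello world!", "Another one."]))  # __call__ also accepts a batch and returns a list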
laser/laser_encoders/models.py ADDED
@@ -0,0 +1,426 @@
1
+ #!/usr/bin/python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # LASER Language-Agnostic SEntence Representations
9
+ # is a toolkit to calculate multilingual sentence embeddings
10
+ # and to use them for document classification, bitext filtering
11
+ # and mining
12
+ #
13
+ # --------------------------------------------------------
14
+
15
+
16
+ import logging
17
+ import os
18
+ import re
19
+ import sys
20
+ import warnings
21
+ from collections import namedtuple
22
+ from pathlib import Path
23
+
24
+ import numpy as np
25
+ import torch
26
+ import torch.nn as nn
27
+ from fairseq.data.dictionary import Dictionary
28
+ from fairseq.models.transformer import Embedding, TransformerEncoder
29
+ from fairseq.modules import LayerNorm
30
+
31
+ from laser_encoders.download_models import LaserModelDownloader
32
+ from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE
33
+ from laser_encoders.laser_tokenizer import LaserTokenizer, initialize_tokenizer
34
+
35
+ SPACE_NORMALIZER = re.compile(r"\s+")
36
+ Batch = namedtuple("Batch", "srcs tokens lengths")
37
+
38
+ logging.basicConfig(
39
+ stream=sys.stdout,
40
+ level=logging.INFO,
41
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
42
+ )
43
+ logger = logging.getLogger("embed")
44
+
45
+
46
+ class SentenceEncoder:
47
+ def __init__(
48
+ self,
49
+ model_path,
50
+ max_sentences=None,
51
+ max_tokens=None,
52
+ spm_vocab=None,
53
+ spm_model=None,
54
+ cpu=False,
55
+ fp16=False,
56
+ verbose=False,
57
+ sort_kind="quicksort",
58
+ ):
59
+ if verbose:
60
+ logger.info(f"loading encoder: {model_path}")
61
+ self.spm_model = spm_model
62
+ if self.spm_model:
63
+ self.tokenizer = LaserTokenizer(spm_model=Path(self.spm_model))
64
+
65
+ self.use_cuda = torch.cuda.is_available() and not cpu
66
+ self.max_sentences = max_sentences
67
+ self.max_tokens = max_tokens
68
+ if self.max_tokens is None and self.max_sentences is None:
69
+ self.max_sentences = 1
70
+
71
+ state_dict = torch.load(model_path)
72
+ if "params" in state_dict:
73
+ self.encoder = LaserLstmEncoder(**state_dict["params"])
74
+ self.encoder.load_state_dict(state_dict["model"])
75
+ self.dictionary = state_dict["dictionary"]
76
+ self.prepend_bos = False
77
+ self.left_padding = False
78
+ else:
79
+ self.encoder = LaserTransformerEncoder(state_dict, spm_vocab)
80
+ self.dictionary = self.encoder.dictionary.indices
81
+ self.prepend_bos = state_dict["cfg"]["model"].prepend_bos
82
+ self.left_padding = state_dict["cfg"]["model"].left_pad_source
83
+ del state_dict
84
+ self.bos_index = self.dictionary["<s>"] = 0
85
+ self.pad_index = self.dictionary["<pad>"] = 1
86
+ self.eos_index = self.dictionary["</s>"] = 2
87
+ self.unk_index = self.dictionary["<unk>"] = 3
88
+
89
+ if fp16:
90
+ self.encoder.half()
91
+ if self.use_cuda:
92
+ if verbose:
93
+ logger.info("transfer encoder to GPU")
94
+ self.encoder.cuda()
95
+ self.encoder.eval()
96
+ self.sort_kind = sort_kind
97
+
98
+ def __call__(self, text_or_batch):
99
+ if self.spm_model:
100
+ text_or_batch = self.tokenizer(text_or_batch)
101
+ if isinstance(text_or_batch, str):
102
+ text_or_batch = [text_or_batch]
103
+ return self.encode_sentences(text_or_batch)
104
+ else:
105
+ raise ValueError(
106
+ "Either initialize the encoder with an spm_model or pre-tokenize and use the encode_sentences method."
107
+ )
108
+
109
+ def _process_batch(self, batch):
110
+ tokens = batch.tokens
111
+ lengths = batch.lengths
112
+ if self.use_cuda:
113
+ tokens = tokens.cuda()
114
+ lengths = lengths.cuda()
115
+
116
+ with torch.no_grad():
117
+ sentemb = self.encoder(tokens, lengths)["sentemb"]
118
+ embeddings = sentemb.detach().cpu().numpy()
119
+ return embeddings
120
+
121
+ def _tokenize(self, line):
122
+ tokens = SPACE_NORMALIZER.sub(" ", line).strip().split()
123
+ ntokens = len(tokens)
124
+ if self.prepend_bos:
125
+ ids = torch.LongTensor(ntokens + 2)
126
+ ids[0] = self.bos_index
127
+ for i, token in enumerate(tokens):
128
+ ids[i + 1] = self.dictionary.get(token, self.unk_index)
129
+ ids[ntokens + 1] = self.eos_index
130
+ else:
131
+ ids = torch.LongTensor(ntokens + 1)
132
+ for i, token in enumerate(tokens):
133
+ ids[i] = self.dictionary.get(token, self.unk_index)
134
+ ids[ntokens] = self.eos_index
135
+ return ids
136
+
137
+ def _make_batches(self, lines):
138
+ tokens = [self._tokenize(line) for line in lines]
139
+ lengths = np.array([t.numel() for t in tokens])
140
+ indices = np.argsort(-lengths, kind=self.sort_kind)
141
+
142
+ def batch(tokens, lengths, indices):
143
+ toks = tokens[0].new_full((len(tokens), tokens[0].shape[0]), self.pad_index)
144
+ if not self.left_padding:
145
+ for i in range(len(tokens)):
146
+ toks[i, : tokens[i].shape[0]] = tokens[i]
147
+ else:
148
+ for i in range(len(tokens)):
149
+ toks[i, -tokens[i].shape[0] :] = tokens[i]
150
+ return (
151
+ Batch(srcs=None, tokens=toks, lengths=torch.LongTensor(lengths)),
152
+ indices,
153
+ )
154
+
155
+ batch_tokens, batch_lengths, batch_indices = [], [], []
156
+ ntokens = nsentences = 0
157
+ for i in indices:
158
+ if nsentences > 0 and (
159
+ (self.max_tokens is not None and ntokens + lengths[i] > self.max_tokens)
160
+ or (self.max_sentences is not None and nsentences == self.max_sentences)
161
+ ):
162
+ yield batch(batch_tokens, batch_lengths, batch_indices)
163
+ ntokens = nsentences = 0
164
+ batch_tokens, batch_lengths, batch_indices = [], [], []
165
+ batch_tokens.append(tokens[i])
166
+ batch_lengths.append(lengths[i])
167
+ batch_indices.append(i)
168
+ ntokens += tokens[i].shape[0]
169
+ nsentences += 1
170
+ if nsentences > 0:
171
+ yield batch(batch_tokens, batch_lengths, batch_indices)
172
+
173
+ def encode_sentences(self, sentences, normalize_embeddings=False):
174
+ indices = []
175
+ results = []
176
+ for batch, batch_indices in self._make_batches(sentences):
177
+ indices.extend(batch_indices)
178
+ encoded_batch = self._process_batch(batch)
179
+ if normalize_embeddings:
180
+ # Perform L2 normalization on the embeddings
181
+ norms = np.linalg.norm(encoded_batch, axis=1, keepdims=True)
182
+ encoded_batch = encoded_batch / norms
183
+ results.append(encoded_batch)
184
+ return np.vstack(results)[np.argsort(indices, kind=self.sort_kind)]
185
+
186
+
187
+ class LaserTransformerEncoder(TransformerEncoder):
188
+ def __init__(self, state_dict, vocab_path):
189
+ self.dictionary = Dictionary.load(vocab_path)
190
+ if any(
191
+ k in state_dict["model"]
192
+ for k in ["encoder.layer_norm.weight", "layer_norm.weight"]
193
+ ):
194
+ self.dictionary.add_symbol("<mask>")
195
+ cfg = state_dict["cfg"]["model"]
196
+ self.sentemb_criterion = cfg.sentemb_criterion
197
+ self.pad_idx = self.dictionary.pad_index
198
+ self.bos_idx = self.dictionary.bos_index
199
+ embed_tokens = Embedding(
200
+ len(self.dictionary),
201
+ cfg.encoder_embed_dim,
202
+ self.pad_idx,
203
+ )
204
+ super().__init__(cfg, self.dictionary, embed_tokens)
205
+ if "decoder.version" in state_dict["model"]:
206
+ self._remove_decoder_layers(state_dict)
207
+ if "layer_norm.weight" in state_dict["model"]:
208
+ self.layer_norm = LayerNorm(cfg.encoder_embed_dim)
209
+ self.load_state_dict(state_dict["model"])
210
+
211
+ def _remove_decoder_layers(self, state_dict):
212
+ for key in list(state_dict["model"].keys()):
213
+ if not key.startswith(
214
+ (
215
+ "encoder.layer_norm",
216
+ "encoder.layers",
217
+ "encoder.embed",
218
+ "encoder.version",
219
+ )
220
+ ):
221
+ del state_dict["model"][key]
222
+ else:
223
+ renamed_key = key.replace("encoder.", "")
224
+ state_dict["model"][renamed_key] = state_dict["model"].pop(key)
225
+
226
+ def forward(self, src_tokens, src_lengths):
227
+ encoder_out = super().forward(src_tokens, src_lengths)
228
+ if isinstance(encoder_out, dict):
229
+ x = encoder_out["encoder_out"][0] # T x B x C
230
+ else:
231
+ x = encoder_out[0]
232
+ if self.sentemb_criterion == "cls":
233
+ cls_indices = src_tokens.eq(self.bos_idx).t()
234
+ sentemb = x[cls_indices, :]
235
+ else:
236
+ padding_mask = src_tokens.eq(self.pad_idx).t().unsqueeze(-1)
237
+ if padding_mask.any():
238
+ x = x.float().masked_fill_(padding_mask, float("-inf")).type_as(x)
239
+ sentemb = x.max(dim=0)[0]
240
+ return {"sentemb": sentemb}
241
+
242
+
243
+ class LaserLstmEncoder(nn.Module):
244
+ def __init__(
245
+ self,
246
+ num_embeddings,
247
+ padding_idx,
248
+ embed_dim=320,
249
+ hidden_size=512,
250
+ num_layers=1,
251
+ bidirectional=False,
252
+ left_pad=True,
253
+ padding_value=0.0,
254
+ ):
255
+ super().__init__()
256
+
257
+ self.num_layers = num_layers
258
+ self.bidirectional = bidirectional
259
+ self.hidden_size = hidden_size
260
+
261
+ self.padding_idx = padding_idx
262
+ self.embed_tokens = nn.Embedding(
263
+ num_embeddings, embed_dim, padding_idx=self.padding_idx
264
+ )
265
+
266
+ self.lstm = nn.LSTM(
267
+ input_size=embed_dim,
268
+ hidden_size=hidden_size,
269
+ num_layers=num_layers,
270
+ bidirectional=bidirectional,
271
+ )
272
+ self.left_pad = left_pad
273
+ self.padding_value = padding_value
274
+
275
+ self.output_units = hidden_size
276
+ if bidirectional:
277
+ self.output_units *= 2
278
+
279
+ def forward(self, src_tokens, src_lengths):
280
+ bsz, seqlen = src_tokens.size()
281
+
282
+ # embed tokens
283
+ x = self.embed_tokens(src_tokens)
284
+
285
+ # B x T x C -> T x B x C
286
+ x = x.transpose(0, 1)
287
+
288
+ # pack embedded source tokens into a PackedSequence
289
+ packed_x = nn.utils.rnn.pack_padded_sequence(x, src_lengths.data.tolist())
290
+
291
+ # apply LSTM
292
+ if self.bidirectional:
293
+ state_size = 2 * self.num_layers, bsz, self.hidden_size
294
+ else:
295
+ state_size = self.num_layers, bsz, self.hidden_size
296
+ h0 = x.data.new(*state_size).zero_()
297
+ c0 = x.data.new(*state_size).zero_()
298
+ packed_outs, (final_hiddens, final_cells) = self.lstm(packed_x, (h0, c0))
299
+
300
+ # unpack outputs and apply dropout
301
+ x, _ = nn.utils.rnn.pad_packed_sequence(
302
+ packed_outs, padding_value=self.padding_value
303
+ )
304
+ assert list(x.size()) == [seqlen, bsz, self.output_units]
305
+
306
+ if self.bidirectional:
307
+
308
+ def combine_bidir(outs):
309
+ return torch.cat(
310
+ [
311
+ torch.cat([outs[2 * i], outs[2 * i + 1]], dim=0).view(
312
+ 1, bsz, self.output_units
313
+ )
314
+ for i in range(self.num_layers)
315
+ ],
316
+ dim=0,
317
+ )
318
+
319
+ final_hiddens = combine_bidir(final_hiddens)
320
+ final_cells = combine_bidir(final_cells)
321
+
322
+ encoder_padding_mask = src_tokens.eq(self.padding_idx).t()
323
+
324
+ # Set padded outputs to -inf so they are not selected by max-pooling
325
+ padding_mask = src_tokens.eq(self.padding_idx).t().unsqueeze(-1)
326
+ if padding_mask.any():
327
+ x = x.float().masked_fill_(padding_mask, float("-inf")).type_as(x)
328
+
329
+ # Build the sentence embedding by max-pooling over the encoder outputs
330
+ sentemb = x.max(dim=0)[0]
331
+
332
+ return {
333
+ "sentemb": sentemb,
334
+ "encoder_out": (x, final_hiddens, final_cells),
335
+ "encoder_padding_mask": encoder_padding_mask
336
+ if encoder_padding_mask.any()
337
+ else None,
338
+ }
339
+
340
+
341
+ def initialize_encoder(
342
+ lang: str = None,
343
+ model_dir: str = None,
344
+ spm: bool = True,
345
+ laser: str = None,
346
+ ):
347
+ downloader = LaserModelDownloader(model_dir)
348
+ if laser is not None:
349
+ if laser == "laser3":
350
+ lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
351
+ downloader.download_laser3(lang=lang, spm=spm)
352
+ file_path = f"laser3-{lang}.v1"
353
+ elif laser == "laser2":
354
+ downloader.download_laser2()
355
+ file_path = "laser2"
356
+ else:
357
+ raise ValueError(
358
+ f"Unsupported laser model: {laser}. Choose either laser2 or laser3."
359
+ )
360
+ else:
361
+ if lang in LASER3_LANGUAGE:
362
+ lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
363
+ downloader.download_laser3(lang=lang, spm=spm)
364
+ file_path = f"laser3-{lang}.v1"
365
+ elif lang in LASER2_LANGUAGE:
366
+ downloader.download_laser2()
367
+ file_path = "laser2"
368
+ else:
369
+ raise ValueError(
370
+ f"Unsupported language name: {lang}. Please specify a supported language name."
371
+ )
372
+
373
+ model_dir = downloader.model_dir
374
+ model_path = os.path.join(model_dir, f"{file_path}.pt")
375
+ spm_vocab = os.path.join(model_dir, f"{file_path}.cvocab")
376
+
377
+ if not os.path.exists(spm_vocab):
378
+ # if there is no cvocab for the laser3 lang use laser2 cvocab
379
+ spm_vocab = os.path.join(model_dir, "laser2.cvocab")
380
+
381
+ return SentenceEncoder(model_path=model_path, spm_vocab=spm_vocab, spm_model=None)
382
+
383
+
384
+ class LaserEncoderPipeline:
385
+ def __init__(
386
+ self,
387
+ lang: str = None,
388
+ model_dir: str = None,
389
+ spm: bool = True,
390
+ laser: str = None,
391
+ ):
392
+
393
+ if laser == "laser2" and lang is not None:
394
+ warnings.warn(
395
+ "Warning: The 'lang' parameter is optional when using 'laser2'. It will be ignored."
396
+ )
397
+
398
+ if laser == "laser3" and lang is None:
399
+ raise ValueError("For 'laser3', the 'lang' parameter is required.")
400
+
401
+ if laser is None and lang is None:
402
+ raise ValueError("Either 'laser' or 'lang' should be provided.")
403
+
404
+ self.tokenizer = initialize_tokenizer(
405
+ lang=lang, model_dir=model_dir, laser=laser
406
+ )
407
+ self.encoder = initialize_encoder(
408
+ lang=lang, model_dir=model_dir, spm=spm, laser=laser
409
+ )
410
+
411
+ def encode_sentences(
412
+ self, sentences: list, normalize_embeddings: bool = False
413
+ ) -> list:
414
+ """
415
+ Tokenizes and encodes a list of sentences.
416
+
417
+ Args:
418
+ - sentences (list of str): List of sentences to tokenize and encode.
419
+
420
+ Returns:
421
+ - List of embeddings for each sentence.
422
+ """
423
+ tokenized_sentences = [
424
+ self.tokenizer.tokenize(sentence) for sentence in sentences
425
+ ]
426
+ return self.encoder.encode_sentences(tokenized_sentences, normalize_embeddings)
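A short end-to-end sketch of the pipeline class above; the language, sample sentence, and 1024-dimensional output mirror the tests added later in this commit, and the model is downloaded automatically on first use:

import numpy as np
from laser_encoders.models import LaserEncoderPipeline

encoder = LaserEncoderPipeline(lang="igbo")  # resolves to the LASER3 ibo_Latn encoder
embeddings = encoder.encode_sentences(["nnọọ, kedu ka ị mere"], normalize_embeddings=True)
print(embeddings.shape)               # (1, 1024)
print(np.linalg.norm(embeddings[0]))  # ~1.0 due to L2 normalization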
laser/laser_encoders/test_laser_tokenizer.py ADDED
@@ -0,0 +1,310 @@
1
+ #!/usr/bin/python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # LASER Language-Agnostic SEntence Representations
9
+ # is a toolkit to calculate multilingual sentence embeddings
10
+ # and to use them for document classification, bitext filtering
11
+ # and mining
12
+ #
13
+ # --------------------------------------------------------
14
+ # Tests for LaserTokenizer
15
+
16
+ import os
17
+ import warnings
18
+ from pathlib import Path
19
+ from tempfile import TemporaryDirectory
20
+ from typing import List
21
+
22
+ import numpy as np
23
+ import pytest
24
+
25
+ from laser_encoders import (
26
+ LaserEncoderPipeline,
27
+ initialize_encoder,
28
+ initialize_tokenizer,
29
+ )
30
+
31
+
32
+ @pytest.fixture
33
+ def tokenizer(tmp_path: Path):
34
+ tokenizer_instance = initialize_tokenizer(model_dir=tmp_path, laser="laser2")
35
+ return tokenizer_instance
36
+
37
+
38
+ @pytest.fixture
39
+ def input_text() -> str:
40
+ return "This is a test sentence."
41
+
42
+
43
+ @pytest.fixture
44
+ def test_readme_params() -> dict:
45
+ return {
46
+ "lang": "igbo",
47
+ "input_sentences": ["nnọọ, kedu ka ị mere"],
48
+ "expected_embedding_shape": (1, 1024),
49
+ "expected_array": [
50
+ 0.3807628,
51
+ -0.27941525,
52
+ -0.17819545,
53
+ 0.44144684,
54
+ -0.38985375,
55
+ 0.04719935,
56
+ 0.20238206,
57
+ -0.03934783,
58
+ 0.0118901,
59
+ 0.28986093,
60
+ ],
61
+ }
62
+
63
+
64
+ def test_tokenize(tokenizer, input_text: str):
65
+ expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
66
+ assert tokenizer.tokenize(input_text) == expected_output
67
+
68
+
69
+ def test_tokenizer_call_method(tokenizer, input_text: str):
70
+ single_string = "This is a test sentence."
71
+ expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
72
+ assert tokenizer(single_string) == expected_output
73
+
74
+ list_of_strings = ["This is a test sentence.", "This is another test sentence."]
75
+ expected_output = [
76
+ "▁this ▁is ▁a ▁test ▁sent ence .",
77
+ "▁this ▁is ▁another ▁test ▁sent ence .",
78
+ ]
79
+ assert tokenizer(list_of_strings) == expected_output
80
+
81
+
82
+ def test_normalization(tokenizer):
83
+ test_data = "Hello!!! How are you??? I'm doing great."
84
+ expected_output = "▁hel lo !!! ▁how ▁are ▁you ??? ▁i ' m ▁do ing ▁great ."
85
+ assert tokenizer.tokenize(test_data) == expected_output
86
+
87
+
88
+ def test_descape(tokenizer):
89
+ test_data = "I &lt;3 Apple &amp; Carrots!"
90
+ expected_output = "▁i ▁<3 ▁app le ▁& ▁car ro ts !"
91
+ tokenizer.descape = True
92
+ assert tokenizer.tokenize(test_data) == expected_output
93
+
94
+
95
+ def test_lowercase(tokenizer):
96
+ test_data = "THIS OUTPUT MUST BE UPPERCASE"
97
+ expected_output = "▁TH IS ▁ OU TP UT ▁ MU ST ▁BE ▁ UP PER CA SE"
98
+ tokenizer.lower_case = False
99
+ assert tokenizer.tokenize(test_data) == expected_output
100
+
101
+
102
+ def test_is_printable(tokenizer):
103
+ test_data = "Hello, \tWorld! ABC\x1f123"
104
+ expected_output = "▁hel lo , ▁world ! ▁ab c ▁12 3"
105
+ assert tokenizer.tokenize(test_data) == expected_output
106
+
107
+
108
+ def test_tokenize_file(tokenizer, input_text: str):
109
+ with TemporaryDirectory() as temp_dir:
110
+ input_file = os.path.join(temp_dir, "input.txt")
111
+ output_file = os.path.join(temp_dir, "output.txt")
112
+
113
+ with open(input_file, "w") as file:
114
+ file.write(input_text)
115
+
116
+ tokenizer.tokenize_file(
117
+ inp_fname=Path(input_file),
118
+ out_fname=Path(output_file),
119
+ )
120
+
121
+ with open(output_file, "r") as file:
122
+ output = file.read().strip()
123
+
124
+ expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
125
+ assert output == expected_output
126
+
127
+
128
+ def test_tokenize_file_overwrite(tokenizer, input_text: str):
129
+ with TemporaryDirectory() as temp_dir:
130
+ input_file = os.path.join(temp_dir, "input.txt")
131
+ output_file = os.path.join(temp_dir, "output.txt")
132
+
133
+ with open(input_file, "w") as file:
134
+ file.write(input_text)
135
+
136
+ with open(output_file, "w") as file:
137
+ file.write("Existing output")
138
+
139
+ # Test when over_write is False
140
+ tokenizer.over_write = False
141
+ tokenizer.tokenize_file(
142
+ inp_fname=Path(input_file),
143
+ out_fname=Path(output_file),
144
+ )
145
+
146
+ with open(output_file, "r") as file:
147
+ output = file.read().strip()
148
+
149
+ assert output == "Existing output"
150
+
151
+ # Test when over_write is True
152
+ tokenizer.over_write = True
153
+ tokenizer.tokenize_file(
154
+ inp_fname=Path(input_file),
155
+ out_fname=Path(output_file),
156
+ )
157
+
158
+ with open(output_file, "r") as file:
159
+ output = file.read().strip()
160
+
161
+ expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
162
+ assert output == expected_output
163
+
164
+
165
+ @pytest.mark.parametrize(
166
+ "laser, expected_array, lang",
167
+ [
168
+ (
169
+ "laser2",
170
+ [
171
+ 1.042462512850761414e-02,
172
+ 6.325428839772939682e-03,
173
+ -3.032622225873637944e-05,
174
+ 9.033476933836936951e-03,
175
+ 2.937933895736932755e-04,
176
+ 4.489220678806304932e-03,
177
+ 2.334521152079105377e-03,
178
+ -9.427300537936389446e-04,
179
+ -1.571535394759848714e-04,
180
+ 2.095808042213320732e-03,
181
+ ],
182
+ None,
183
+ ),
184
+ (
185
+ "laser3",
186
+ [
187
+ 3.038274645805358887e-01,
188
+ 4.151830971240997314e-01,
189
+ -2.458990514278411865e-01,
190
+ 3.153458833694458008e-01,
191
+ -5.153598189353942871e-01,
192
+ -6.035178527235984802e-02,
193
+ 2.210616767406463623e-01,
194
+ -2.701394855976104736e-01,
195
+ -4.902199506759643555e-01,
196
+ -3.126966953277587891e-02,
197
+ ],
198
+ "zul_Latn",
199
+ ),
200
+ ],
201
+ )
202
+ def test_sentence_encoder(
203
+ tmp_path: Path,
204
+ tokenizer,
205
+ laser: str,
206
+ expected_array: List,
207
+ lang: str,
208
+ input_text: str,
209
+ ):
210
+ sentence_encoder = initialize_encoder(model_dir=tmp_path, laser=laser, lang=lang)
211
+ tokenized_text = tokenizer.tokenize(input_text)
212
+ sentence_embedding = sentence_encoder.encode_sentences([tokenized_text])
213
+
214
+ assert isinstance(sentence_embedding, np.ndarray)
215
+ assert sentence_embedding.shape == (1, 1024)
216
+ assert np.allclose(expected_array, sentence_embedding[:, :10], atol=1e-3)
217
+
218
+
219
+ def test_laser_encoder_pipeline(tmp_path: Path, test_readme_params: dict):
220
+ lang = test_readme_params["lang"]
221
+ input_sentences = test_readme_params["input_sentences"]
222
+ expected_embedding_shape = test_readme_params["expected_embedding_shape"]
223
+ expected_array = test_readme_params["expected_array"]
224
+
225
+ encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
226
+ embeddings = encoder.encode_sentences(input_sentences)
227
+
228
+ assert isinstance(embeddings, np.ndarray)
229
+ assert embeddings.shape == expected_embedding_shape
230
+ assert np.allclose(expected_array, embeddings[:, :10], atol=1e-3)
231
+
232
+
233
+ def test_separate_initialization_and_encoding(
234
+ tmp_path, tokenizer, test_readme_params: dict
235
+ ):
236
+ lang = test_readme_params["lang"]
237
+ input_sentences = test_readme_params["input_sentences"]
238
+ expected_embedding_shape = test_readme_params["expected_embedding_shape"]
239
+ expected_array = test_readme_params["expected_array"]
240
+
241
+ tokenized_sentence = tokenizer.tokenize(input_sentences[0])
242
+ sentence_encoder = initialize_encoder(model_dir=tmp_path, lang=lang)
243
+
244
+ # Encode tokenized sentences into embeddings
245
+ embeddings = sentence_encoder.encode_sentences([tokenized_sentence])
246
+
247
+ assert isinstance(embeddings, np.ndarray)
248
+ assert embeddings.shape == expected_embedding_shape
249
+ assert np.allclose(expected_array, embeddings[:, :10], atol=1e-3)
250
+
251
+
252
+ def test_encoder_normalization(tmp_path: Path, test_readme_params: dict):
253
+ lang = test_readme_params["lang"]
254
+ input_sentences = test_readme_params["input_sentences"]
255
+
256
+ encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
257
+ normalized_embeddings = encoder.encode_sentences(
258
+ input_sentences, normalize_embeddings=True
259
+ )
260
+ norm = np.linalg.norm(normalized_embeddings[0])
261
+
262
+ assert np.allclose(norm, 1.0, atol=1e-3)
263
+
264
+
265
+ def test_encoder_default_behaviour(tmp_path: Path, test_readme_params: dict):
266
+ lang = test_readme_params["lang"]
267
+ input_sentences = test_readme_params["input_sentences"]
268
+
269
+ encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
270
+ default_embeddings = encoder.encode_sentences(input_sentences)
271
+ non_normalized_embeddings = encoder.encode_sentences(
272
+ input_sentences, normalize_embeddings=False
273
+ )
274
+
275
+ assert np.allclose(default_embeddings, non_normalized_embeddings)
276
+
277
+
278
+ def test_encoder_non_normalization(tmp_path: Path, test_readme_params: dict):
279
+ lang = test_readme_params["lang"]
280
+ input_sentences = test_readme_params["input_sentences"]
281
+
282
+ encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
283
+ non_normalized_embeddings = encoder.encode_sentences(
284
+ input_sentences, normalize_embeddings=False
285
+ )
286
+ norm = np.linalg.norm(non_normalized_embeddings[0])
287
+
288
+ assert not np.isclose(norm, 1)
289
+
290
+
291
+ def test_optional_lang_with_laser2(tmp_path: Path):
292
+ with pytest.warns(
293
+ UserWarning,
294
+ match="The 'lang' parameter is optional when using 'laser2'. It will be ignored.",
295
+ ):
296
+ encoder = LaserEncoderPipeline(lang="en", laser="laser2", model_dir=tmp_path)
297
+
298
+
299
+ def test_required_lang_with_laser3(tmp_path: Path):
300
+ with pytest.raises(
301
+ ValueError, match="For 'laser3', the 'lang' parameter is required."
302
+ ):
303
+ encoder = LaserEncoderPipeline(laser="laser3", model_dir=tmp_path)
304
+
305
+
306
+ def test_missing_lang_and_laser(tmp_path: Path):
307
+ with pytest.raises(
308
+ ValueError, match="Either 'laser' or 'lang' should be provided."
309
+ ):
310
+ encoder = LaserEncoderPipeline(model_dir=tmp_path)
laser/laser_encoders/test_models_initialization.py ADDED
@@ -0,0 +1,57 @@
1
+ import os
2
+ import tempfile
3
+
4
+ import pytest
5
+
6
+ from laser_encoders.download_models import LaserModelDownloader
7
+ from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE
8
+ from laser_encoders.laser_tokenizer import initialize_tokenizer
9
+ from laser_encoders.models import initialize_encoder
10
+
11
+
12
+ def test_validate_acehnese_models_and_tokenize_laser3(lang="acehnese"):
13
+ with tempfile.TemporaryDirectory() as tmp_dir:
14
+ print(f"Created temporary directory for {lang}", tmp_dir)
15
+
16
+ downloader = LaserModelDownloader(model_dir=tmp_dir)
17
+ downloader.download_laser3(lang)
18
+ encoder = initialize_encoder(lang, model_dir=tmp_dir)
19
+ tokenizer = initialize_tokenizer(lang, model_dir=tmp_dir)
20
+
21
+ # Test tokenization with a sample sentence
22
+ tokenized = tokenizer.tokenize("This is a sample sentence.")
23
+
24
+ print(f"{lang} model validated successfully")
25
+
26
+
27
+ def test_validate_english_models_and_tokenize_laser2(lang="english"):
28
+ with tempfile.TemporaryDirectory() as tmp_dir:
29
+ print(f"Created temporary directory for {lang}", tmp_dir)
30
+
31
+ downloader = LaserModelDownloader(model_dir=tmp_dir)
32
+ downloader.download_laser2()
33
+
34
+ encoder = initialize_encoder(lang, model_dir=tmp_dir)
35
+ tokenizer = initialize_tokenizer(lang, model_dir=tmp_dir)
36
+
37
+ # Test tokenization with a sample sentence
38
+ tokenized = tokenizer.tokenize("This is a sample sentence.")
39
+
40
+ print(f"{lang} model validated successfully")
41
+
42
+
43
+ def test_validate_kashmiri_models_and_tokenize_laser3(lang="kas"):
44
+ with tempfile.TemporaryDirectory() as tmp_dir:
45
+ print(f"Created temporary directory for {lang}", tmp_dir)
46
+
47
+ downloader = LaserModelDownloader(model_dir=tmp_dir)
48
+ with pytest.raises(ValueError):
49
+ downloader.download_laser3(lang)
50
+
51
+ encoder = initialize_encoder(lang, model_dir=tmp_dir)
52
+ tokenizer = initialize_tokenizer(lang, model_dir=tmp_dir)
53
+
54
+ # Test tokenization with a sample sentence
55
+ tokenized = tokenizer.tokenize("This is a sample sentence.")
56
+
57
+ print(f"{lang} model validated successfully")
laser/laser_encoders/validate_models.py ADDED
@@ -0,0 +1,108 @@
1
+ import os
2
+ import tempfile
3
+
4
+ import pytest
5
+
6
+ from laser_encoders.download_models import LaserModelDownloader
7
+ from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE
8
+ from laser_encoders.laser_tokenizer import initialize_tokenizer
9
+ from laser_encoders.models import initialize_encoder
10
+
11
+
12
+ @pytest.mark.slow
13
+ @pytest.mark.parametrize("lang", LASER3_LANGUAGE)
14
+ def test_validate_language_models_and_tokenize_laser3(lang):
15
+ with tempfile.TemporaryDirectory() as tmp_dir:
16
+ print(f"Created temporary directory for {lang}", tmp_dir)
17
+
18
+ downloader = LaserModelDownloader(model_dir=tmp_dir)
19
+ if lang in ["kashmiri", "kas", "central kanuri", "knc"]:
20
+ with pytest.raises(ValueError) as excinfo:
21
+ downloader.download_laser3(lang)
22
+ assert "ValueError" in str(excinfo.value)
23
+ print(f"{lang} language model raised a ValueError as expected.")
24
+ else:
25
+ downloader.download_laser3(lang)
26
+ encoder = initialize_encoder(lang, model_dir=tmp_dir)
27
+ tokenizer = initialize_tokenizer(lang, model_dir=tmp_dir)
28
+
29
+ # Test tokenization with a sample sentence
30
+ tokenized = tokenizer.tokenize("This is a sample sentence.")
31
+
32
+ print(f"{lang} model validated successfully")
33
+
34
+
35
+ @pytest.mark.slow
36
+ @pytest.mark.parametrize("lang", LASER2_LANGUAGE)
37
+ def test_validate_language_models_and_tokenize_laser2(lang):
38
+ with tempfile.TemporaryDirectory() as tmp_dir:
39
+ print(f"Created temporary directory for {lang}", tmp_dir)
40
+
41
+ downloader = LaserModelDownloader(model_dir=tmp_dir)
42
+ downloader.download_laser2()
43
+
44
+ encoder = initialize_encoder(lang, model_dir=tmp_dir)
45
+ tokenizer = initialize_tokenizer(lang, model_dir=tmp_dir)
46
+
47
+ # Test tokenization with a sample sentence
48
+ tokenized = tokenizer.tokenize("This is a sample sentence.")
49
+
50
+ print(f"{lang} model validated successfully")
51
+
52
+
53
+ class MockLaserModelDownloader(LaserModelDownloader):
54
+ def __init__(self, model_dir):
55
+ self.model_dir = model_dir
56
+
57
+ def download_laser3(self, lang):
58
+ lang = self.get_language_code(LASER3_LANGUAGE, lang)
59
+ file_path = os.path.join(self.model_dir, f"laser3-{lang}.v1.pt")
60
+ if not os.path.exists(file_path):
61
+ raise FileNotFoundError(f"Could not find {file_path}.")
62
+
63
+ def download_laser2(self):
64
+ files = ["laser2.pt", "laser2.spm", "laser2.cvocab"]
65
+ for file_name in files:
66
+ file_path = os.path.join(self.model_dir, file_name)
67
+ if not os.path.exists(file_path):
68
+ raise FileNotFoundError(f"Could not find {file_path}.")
69
+
70
+
71
+ CACHE_DIR = "/home/user/.cache/models" # Change this to the desired cache directory
72
+
73
+ # This uses the mock downloader
74
+ @pytest.mark.slow
75
+ @pytest.mark.parametrize("lang", LASER3_LANGUAGE)
76
+ def test_validate_language_models_and_tokenize_mock_laser3(lang):
77
+ downloader = MockLaserModelDownloader(model_dir=CACHE_DIR)
78
+
79
+ try:
80
+ downloader.download_laser3(lang)
81
+ except FileNotFoundError as e:
82
+ pytest.fail(str(e))
83
+
84
+ encoder = initialize_encoder(lang, model_dir=CACHE_DIR)
85
+ tokenizer = initialize_tokenizer(lang, model_dir=CACHE_DIR)
86
+
87
+ tokenized = tokenizer.tokenize("This is a sample sentence.")
88
+
89
+ print(f"{lang} model validated successfully")
90
+
91
+
92
+ # This uses the mock downloader
93
+ @pytest.mark.slow
94
+ @pytest.mark.parametrize("lang", LASER2_LANGUAGE)
95
+ def test_validate_language_models_and_tokenize_mock_laser2(lang):
96
+ downloader = MockLaserModelDownloader(model_dir=CACHE_DIR)
97
+
98
+ try:
99
+ downloader.download_laser2()
100
+ except FileNotFoundError as e:
101
+ pytest.fail(str(e))
102
+
103
+ encoder = initialize_encoder(lang, model_dir=CACHE_DIR)
104
+ tokenizer = initialize_tokenizer(lang, model_dir=CACHE_DIR)
105
+
106
+ tokenized = tokenizer.tokenize("This is a sample sentence.")
107
+
108
+ print(f"{lang} model validated successfully")
laser/pyproject.toml ADDED
@@ -0,0 +1,69 @@
1
+ [build-system]
2
+ requires = ["flit_core >=3.2,<4", "setuptools"]
3
+ build-backend = "flit_core.buildapi"
4
+
5
+ [project]
6
+ name = "laser_encoders"
7
+ version = "0.0.2"
8
+ authors = [{name = "Facebook AI Research"}]
9
+ description = "LASER Language-Agnostic SEntence Representations is a toolkit to calculate multilingual sentence embeddings and to use them for document classification, bitext filtering and mining"
10
+ readme = "laser_encoders/README.md"
11
+ requires-python = ">=3.8"
12
+
13
+ dependencies = [
14
+ 'sacremoses==0.1.0',
15
+ 'unicategories>=0.1.2',
16
+ 'sentencepiece>=0.1.99',
17
+ 'numpy>=1.21.3',
18
+ 'torch>=1.10.0',
19
+ 'fairseq>=0.12.2',
20
+ ]
21
+
22
+ classifiers=[
23
+ "License :: OSI Approved :: BSD License",
24
+ "Topic :: Scientific/Engineering",
25
+ "Development Status :: 4 - Beta",
26
+ ]
27
+
28
+ [project.urls]
29
+ "Homepage" = "https://github.com/facebookresearch/LASER"
30
+ "Bug Tracker" = "https://github.com/facebookresearch/LASER/issues"
31
+
32
+ [project.optional-dependencies]
33
+ dev = [
34
+ # Test
35
+ "pytest>=4.3.0",
36
+ # Format
37
+ "black==22.3.0",
38
+ "isort>=5.10.1",
39
+ # Linters
40
+ "mypy>=0.782",
41
+ "pylint>=2.8.0",
42
+ # Release
43
+ "flit>=3.5.1"
44
+ ]
45
+
46
+ [tool.black]
47
+ # Black defaults are great !
48
+
49
+ [tool.isort]
50
+ profile = "black"
51
+ skip_gitignore = true
52
+ skip_glob = ["website/*", "*.pyx"]
53
+
54
+ [tool.mypy]
55
+ python_version = "3.8"
56
+ show_error_codes = true
57
+ check_untyped_defs = true
58
+
59
+ ignore_missing_imports = true
60
+
61
+ files = [
62
+ "laser_encoders/"
63
+ ]
64
+
65
+ [tool.pytest.ini_options]
66
+ testpaths = ["laser_encoders"]
67
+ python_files = [
68
+ "test_*.py",
69
+ ]
laser/remove_external_tools.sh ADDED
@@ -0,0 +1,26 @@
1
+ #!/bin/bash
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # LASER Language-Agnostic SEntence Representations
9
+ # is a toolkit to calculate multilingual sentence embeddings
10
+ # and to use them for document classification, bitext filtering
11
+ # and mining
12
+ #
13
+ #-------------------------------------------------------
14
+ #
15
+ # This bash script removes all installed third party software
16
+ #
17
+
18
+ if [ -z ${LASER+x} ] ; then
19
+ echo "Please set the environment variable 'LASER'"
20
+ exit
21
+ fi
22
+
23
+ bdir="${LASER}"
24
+ tools_ext="${bdir}/tools-external"
25
+
26
+ /bin/rm -rf ${tools_ext}
laser/source/embed.py ADDED
@@ -0,0 +1,362 @@
1
+ #!/usr/bin/python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # LASER Language-Agnostic SEntence Representations
9
+ # is a toolkit to calculate multilingual sentence embeddings
10
+ # and to use them for document classification, bitext filtering
11
+ # and mining
12
+ #
13
+ # --------------------------------------------------------
14
+ #
15
+ # Tool to embed a text file
16
+ # The functions can also be imported into other Python code
17
+
18
+
19
+ import argparse
20
+ import logging
21
+ import os
22
+ import re
23
+ import sys
24
+ import tempfile
25
+ import time
26
+ from collections import namedtuple
27
+ from pathlib import Path
28
+ from subprocess import run
29
+ from typing import Optional, Union
30
+
31
+ assert os.environ.get("LASER"), "Please set the environment variable LASER"
32
+ LASER = os.environ["LASER"]
33
+ sys.path.append(LASER)
34
+
35
+ import numpy as np
36
+ from lib.text_processing import BPEfastApply, SPMApply, Token
37
+ from laser_encoders.models import SentenceEncoder
38
+
39
+ SPACE_NORMALIZER = re.compile(r"\s+")
40
+ Batch = namedtuple("Batch", "srcs tokens lengths")
41
+
42
+ logging.basicConfig(
43
+ stream=sys.stdout,
44
+ level=logging.INFO,
45
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
46
+ )
47
+ logger = logging.getLogger("embed")
48
+
49
+
50
+ def buffered_read(fp, buffer_size):
51
+ buffer = []
52
+ for src_str in fp:
53
+ buffer.append(src_str.strip())
54
+ if len(buffer) >= buffer_size:
55
+ yield buffer
56
+ buffer = []
57
+
58
+ if len(buffer) > 0:
59
+ yield buffer
60
+
61
+
62
+ class HuggingFaceEncoder:
63
+ def __init__(self, encoder_name: str, verbose=False):
64
+ from sentence_transformers import SentenceTransformer
65
+
66
+ encoder = f"sentence-transformers/{encoder_name}"
67
+ if verbose:
68
+ logger.info(f"loading HuggingFace encoder: {encoder}")
69
+ self.encoder = SentenceTransformer(encoder)
70
+
71
+ def encode_sentences(self, sentences):
72
+ return self.encoder.encode(sentences)
73
+
74
+
75
+ def load_model(
76
+ encoder: str,
77
+ spm_model: str,
78
+ bpe_codes: str,
79
+ hugging_face=False,
80
+ verbose=False,
81
+ **encoder_kwargs,
82
+ ) -> Union[SentenceEncoder, HuggingFaceEncoder]:
83
+ if hugging_face:
84
+ return HuggingFaceEncoder(encoder, verbose=verbose)
85
+ if spm_model:
86
+ spm_vocab = str(Path(spm_model).with_suffix(".cvocab"))
87
+ if verbose:
88
+ logger.info(f"spm_model: {spm_model}")
89
+ logger.info(f"spm_cvocab: {spm_vocab}")
90
+ else:
91
+ spm_vocab = None
92
+ return SentenceEncoder(
93
+ encoder, spm_vocab=spm_vocab, verbose=verbose, **encoder_kwargs
94
+ )
95
+
96
+
97
+ def EncodeLoad(args):
98
+ args.buffer_size = max(args.buffer_size, 1)
99
+ assert (
100
+ not args.max_sentences or args.max_sentences <= args.buffer_size
101
+ ), "--max-sentences/--batch-size cannot be larger than --buffer-size"
102
+
103
+ print(" - loading encoder", args.encoder)
104
+ return SentenceEncoder(
105
+ args.encoder,
106
+ max_sentences=args.max_sentences,
107
+ max_tokens=args.max_tokens,
108
+ cpu=args.cpu,
109
+ verbose=args.verbose,
110
+ )
111
+
112
+
113
+ def EncodeTime(t):
114
+ t = int(time.time() - t)
115
+ if t < 1000:
116
+ return "{:d}s".format(t)
117
+ else:
118
+ return "{:d}m{:d}s".format(t // 60, t % 60)
119
+
120
+
121
+ # Encode sentences (existing file pointers)
122
+ def EncodeFilep(
123
+ encoder, inp_file, out_file, buffer_size=10000, fp16=False, verbose=False
124
+ ):
125
+ n = 0
126
+ t = time.time()
127
+ for sentences in buffered_read(inp_file, buffer_size):
128
+ encoded = encoder.encode_sentences(sentences)
129
+ if fp16:
130
+ encoded = encoded.astype(np.float16)
131
+ encoded.tofile(out_file)
132
+ n += len(sentences)
133
+ if verbose and n % 10000 == 0:
134
+ logger.info("encoded {:d} sentences".format(n))
135
+ if verbose:
136
+ logger.info(f"encoded {n} sentences in {EncodeTime(t)}")
137
+
138
+
139
+ # Encode sentences (file names)
140
+ def EncodeFile(
141
+ encoder,
142
+ inp_fname,
143
+ out_fname,
144
+ buffer_size=10000,
145
+ fp16=False,
146
+ verbose=False,
147
+ over_write=False,
148
+ inp_encoding="utf-8",
149
+ ):
150
+ # TODO: handle over_write
151
+ if not os.path.isfile(out_fname):
152
+ if verbose:
153
+ logger.info(
154
+ "encoding {} to {}".format(
155
+ inp_fname if len(inp_fname) > 0 else "stdin",
156
+ out_fname,
157
+ )
158
+ )
159
+ fin = (
160
+ open(inp_fname, "r", encoding=inp_encoding, errors="surrogateescape")
161
+ if len(inp_fname) > 0
162
+ else sys.stdin
163
+ )
164
+ fout = open(out_fname, mode="wb")
165
+ EncodeFilep(
166
+ encoder, fin, fout, buffer_size=buffer_size, fp16=fp16, verbose=verbose
167
+ )
168
+ fin.close()
169
+ fout.close()
170
+ elif not over_write and verbose:
171
+ logger.info("encoder: {} exists already".format(os.path.basename(out_fname)))
172
+
173
+
174
+ # Load existing embeddings
175
+ def EmbedLoad(fname, dim=1024, verbose=False, fp16=False):
176
+ x = np.fromfile(fname, dtype=(np.float16 if fp16 else np.float32), count=-1)
177
+ x.resize(x.shape[0] // dim, dim)
178
+ if verbose:
179
+ print(" - Embeddings: {:s}, {:d}x{:d}".format(fname, x.shape[0], dim))
180
+ return x
181
+
182
+
183
+ # Get memory mapped embeddings
184
+ def EmbedMmap(fname, dim=1024, dtype=np.float32, verbose=False):
185
+ nbex = int(os.path.getsize(fname) / dim / np.dtype(dtype).itemsize)
186
+ E = np.memmap(fname, mode="r", dtype=dtype, shape=(nbex, dim))
187
+ if verbose:
188
+ print(" - embeddings on disk: {:s} {:d} x {:d}".format(fname, nbex, dim))
189
+ return E
190
+
191
+
192
+ def embed_sentences(
193
+ ifname: str,
194
+ output: str,
195
+ encoder: Union[SentenceEncoder, HuggingFaceEncoder] = None,
196
+ encoder_path: str = None,
197
+ hugging_face=False,
198
+ token_lang: Optional[str] = "--",
199
+ bpe_codes: Optional[str] = None,
200
+ spm_lang: Optional[str] = "en",
201
+ spm_model: Optional[str] = None,
202
+ verbose: bool = False,
203
+ buffer_size: int = 10000,
204
+ max_tokens: int = 12000,
205
+ max_sentences: Optional[int] = None,
206
+ cpu: bool = False,
207
+ fp16: bool = False,
208
+ sort_kind: str = "quicksort",
209
+ ):
210
+ assert encoder or encoder_path, "Provide initialised encoder or encoder_path"
211
+ buffer_size = max(buffer_size, 1)
212
+ assert (
213
+ not max_sentences or max_sentences <= buffer_size
214
+ ), "--max-sentences/--batch-size cannot be larger than --buffer-size"
215
+
216
+ assert not (bpe_codes and spm_model), "Cannot specify both spm and bpe"
217
+
218
+ if encoder_path:
219
+ encoder = load_model(
220
+ encoder_path,
221
+ spm_model,
222
+ bpe_codes,
223
+ verbose=verbose,
224
+ hugging_face=hugging_face,
225
+ max_sentences=max_sentences,
226
+ max_tokens=max_tokens,
227
+ sort_kind=sort_kind,
228
+ cpu=cpu,
229
+ )
230
+ if not ifname:
231
+ ifname = "" # default to stdin
232
+ with tempfile.TemporaryDirectory() as tmpdir:
233
+ if token_lang != "--":
234
+ tok_fname = os.path.join(tmpdir, "tok")
235
+ Token(
236
+ ifname,
237
+ tok_fname,
238
+ lang=token_lang,
239
+ romanize=True if token_lang == "el" else False,
240
+ lower_case=True,
241
+ gzip=False,
242
+ verbose=verbose,
243
+ over_write=False,
244
+ )
245
+ ifname = tok_fname
246
+
247
+ if bpe_codes:
248
+ if ifname == "": # stdin
249
+ ifname = os.path.join(tmpdir, "no_tok")
250
+ run(f"cat > {ifname}", shell=True)
251
+ bpe_fname = os.path.join(tmpdir, "bpe")
252
+ BPEfastApply(
253
+ ifname, bpe_fname, bpe_codes, verbose=verbose, over_write=False
254
+ )
255
+ ifname = bpe_fname
256
+
257
+ if spm_model:
258
+ spm_fname = os.path.join(tmpdir, "spm")
259
+ SPMApply(
260
+ ifname,
261
+ spm_fname,
262
+ spm_model,
263
+ lang=spm_lang,
264
+ lower_case=True,
265
+ verbose=verbose,
266
+ over_write=False,
267
+ )
268
+ ifname = spm_fname
269
+
270
+ EncodeFile(
271
+ encoder,
272
+ ifname,
273
+ output,
274
+ verbose=verbose,
275
+ over_write=False,
276
+ buffer_size=buffer_size,
277
+ fp16=fp16,
278
+ )
279
+
280
+
281
+ if __name__ == "__main__":
282
+ parser = argparse.ArgumentParser(description="LASER: Embed sentences")
283
+ parser.add_argument(
284
+ "-i",
285
+ "--input",
286
+ type=str,
287
+ default=None,
288
+ help="Input text file",
289
+ )
290
+ parser.add_argument("--encoder", type=str, required=True, help="encoder to be used")
291
+ parser.add_argument(
292
+ "--token-lang",
293
+ type=str,
294
+ default="--",
295
+ help="Perform tokenization with given language ('--' for no tokenization)",
296
+ )
297
+ parser.add_argument(
298
+ "--bpe-codes", type=str, default=None, help="Apply BPE using specified codes"
299
+ )
300
+ parser.add_argument(
301
+ "--spm-lang", type=str, default="en", help="Apply SPM using specified language"
302
+ )
303
+ parser.add_argument(
304
+ "--spm-model", type=str, default=None, help="Apply SPM using specified model"
305
+ )
306
+ parser.add_argument("-v", "--verbose", action="store_true", help="Detailed output")
307
+
308
+ parser.add_argument(
309
+ "-o", "--output", required=True, help="Output sentence embeddings"
310
+ )
311
+ parser.add_argument(
312
+ "--buffer-size", type=int, default=10000, help="Buffer size (sentences)"
313
+ )
314
+ parser.add_argument(
315
+ "--max-tokens",
316
+ type=int,
317
+ default=12000,
318
+ help="Maximum number of tokens to process in a batch",
319
+ )
320
+ parser.add_argument(
321
+ "--max-sentences",
322
+ type=int,
323
+ default=None,
324
+ help="Maximum number of sentences to process in a batch",
325
+ )
326
+ parser.add_argument(
327
+ "--fp16",
328
+ action="store_true",
329
+ help="Store embedding matrices in fp16 instead of fp32",
330
+ )
331
+ parser.add_argument("--cpu", action="store_true", help="Use CPU instead of GPU")
332
+ parser.add_argument(
333
+ "--sort-kind",
334
+ type=str,
335
+ default="quicksort",
336
+ choices=["quicksort", "mergesort"],
337
+ help="Algorithm used to sort batch by length",
338
+ )
339
+ parser.add_argument(
340
+ "--use-hugging-face",
341
+ action="store_true",
342
+ help="Use a HuggingFace sentence transformer",
343
+ )
344
+
345
+ args = parser.parse_args()
346
+ embed_sentences(
347
+ ifname=args.input,
348
+ encoder_path=args.encoder,
349
+ token_lang=args.token_lang,
350
+ bpe_codes=args.bpe_codes,
351
+ spm_lang=args.spm_lang,
352
+ hugging_face=args.use_hugging_face,
353
+ spm_model=args.spm_model,
354
+ verbose=args.verbose,
355
+ output=args.output,
356
+ buffer_size=args.buffer_size,
357
+ max_tokens=args.max_tokens,
358
+ max_sentences=args.max_sentences,
359
+ cpu=args.cpu,
360
+ fp16=args.fp16,
361
+ sort_kind=args.sort_kind,
362
+ )
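A minimal usage sketch for the functions above, assuming the LASER environment variable points at a checkout of this repository and that the laser2 checkpoint and SPM model have been downloaded (all paths and file names below are placeholders):

import os, sys
sys.path.append(os.path.join(os.environ["LASER"], "source"))
from embed import embed_sentences, EmbedLoad

model_dir = os.path.join(os.environ["LASER"], "models")   # hypothetical model location
embed_sentences(
    ifname="sentences.txt",                    # one input sentence per line
    output="sentences.laser2.bin",             # raw float32 matrix written here
    encoder_path=os.path.join(model_dir, "laser2.pt"),
    spm_model=os.path.join(model_dir, "laser2.spm"),
    cpu=True,
    verbose=True,
)

X = EmbedLoad("sentences.laser2.bin", dim=1024)  # numpy array, shape (n_sentences, 1024)
print(X.shape)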
laser/source/eval.py ADDED
@@ -0,0 +1,381 @@
1
+ #!/usr/bin/python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # LASER Language-Agnostic SEntence Representations
9
+ # is a toolkit to calculate multilingual sentence embeddings
10
+ # and to use them for document classification, bitext filtering
11
+ # and mining
12
+ #
13
+ # --------------------------------------------------------
14
+ #
15
+ # Tool to calculate multilingual similarity error rate
16
+ # on various predefined test sets
17
+
18
+
19
+ import os
20
+ import argparse
21
+ import pandas
22
+ import tempfile
23
+ import numpy as np
24
+ from pathlib import Path
25
+ import itertools
26
+ import logging
27
+ import sys
28
+ from typing import List, Tuple, Dict
29
+ from tabulate import tabulate
30
+ from collections import defaultdict
31
+ from xsim import xSIM
32
+ from embed import embed_sentences, load_model
33
+
34
+ logging.basicConfig(
35
+ stream=sys.stdout,
36
+ level=logging.INFO,
37
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
38
+ )
39
+ logger = logging.getLogger("eval")
40
+
41
+
42
+ class Eval:
43
+ def __init__(self, args):
44
+ self.base_dir = args.base_dir
45
+ self.corpus = args.corpus
46
+ self.split = args.corpus_part
47
+ self.min_sents = args.min_sents
48
+ self.index_comparison = args.index_comparison
49
+ self.emb_dimension = args.embedding_dimension
50
+ self.encoder_args = {
51
+ k: v
52
+ for k, v in args._get_kwargs()
53
+ if k in ["max_sentences", "max_tokens", "cpu", "sort_kind", "verbose"]
54
+ }
55
+ self.src_bpe_codes = args.src_bpe_codes
56
+ self.tgt_bpe_codes = args.tgt_bpe_codes
57
+ self.src_spm_model = args.src_spm_model
58
+ self.tgt_spm_model = args.tgt_spm_model
59
+ logger.info("loading src encoder")
60
+ self.src_encoder = load_model(
61
+ args.src_encoder,
62
+ self.src_spm_model,
63
+ self.src_bpe_codes,
64
+ hugging_face=args.use_hugging_face,
65
+ **self.encoder_args,
66
+ )
67
+ if args.tgt_encoder:
68
+ logger.info("loading tgt encoder")
69
+ self.tgt_encoder = load_model(
70
+ args.tgt_encoder,
71
+ self.tgt_spm_model,
72
+ self.tgt_bpe_codes,
73
+ hugging_face=args.use_hugging_face,
74
+ **self.encoder_args,
75
+ )
76
+ else:
77
+ logger.info("encoding tgt using src encoder")
78
+ self.tgt_encoder = self.src_encoder
79
+ self.tgt_bpe_codes = self.src_bpe_codes
80
+ self.tgt_spm_model = self.src_spm_model
81
+ self.nway = args.nway
82
+ self.buffer_size = args.buffer_size
83
+ self.fp16 = args.fp16
84
+ self.margin = args.margin
85
+
86
+ def _embed(
87
+ self, tmpdir, langs, encoder, spm_model, bpe_codes, tgt_aug_langs=[]
88
+ ) -> List[List[str]]:
89
+ emb_data = []
90
+ for lang in langs:
91
+ augjson = None
92
+ fname = f"{lang}.{self.split}"
93
+ infile = self.base_dir / self.corpus / self.split / fname
94
+ assert infile.exists(), f"{infile} does not exist"
95
+ outfile = tmpdir / fname
96
+ if lang in tgt_aug_langs:
97
+ fname = f"{lang}_augmented.{self.split}"
98
+ fjname = f"{lang}_errtype.{self.split}.json"
99
+ augment_dir = self.base_dir / self.corpus / (self.split + "_augmented")
100
+ augjson = augment_dir / fjname
101
+ auginfile = augment_dir / fname
102
+ assert augjson.exists(), f"{augjson} does not exist"
103
+ assert auginfile.exists(), f"{auginfile} does not exist"
104
+ combined_infile = tmpdir / f"combined_{lang}"
105
+ with open(combined_infile, "w") as newfile:
106
+ for f in [infile, auginfile]:
107
+ with open(f) as fin:
108
+ newfile.write(fin.read())
109
+ infile = combined_infile
110
+ embed_sentences(
111
+ str(infile),
112
+ str(outfile),
113
+ encoder=encoder,
114
+ spm_model=spm_model,
115
+ bpe_codes=bpe_codes,
116
+ token_lang=lang if bpe_codes else "--",
117
+ buffer_size=self.buffer_size,
118
+ fp16=self.fp16,
119
+ **self.encoder_args,
120
+ )
121
+ assert (
122
+ os.path.isfile(outfile) and os.path.getsize(outfile) > 0
123
+ ), f"Error encoding {infile}"
124
+ emb_data.append([lang, infile, outfile, augjson])
125
+ return emb_data
126
+
127
+ def _xsim(
128
+ self, src_emb, src_lang, tgt_emb, tgt_lang, tgt_txt, augjson=None
129
+ ) -> Tuple[int, int, Dict[str, int]]:
130
+ return xSIM(
131
+ src_emb,
132
+ tgt_emb,
133
+ margin=self.margin,
134
+ dim=self.emb_dimension,
135
+ fp16=self.fp16,
136
+ eval_text=tgt_txt if not self.index_comparison else None,
137
+ augmented_json=augjson,
138
+ )
139
+
140
+ def calc_xsim(
141
+ self, embdir, src_langs, tgt_langs, tgt_aug_langs, err_sum=0, totl_nbex=0
142
+ ) -> None:
143
+ outputs = []
144
+ src_emb_data = self._embed(
145
+ embdir,
146
+ src_langs,
147
+ self.src_encoder,
148
+ self.src_spm_model,
149
+ self.src_bpe_codes,
150
+ )
151
+ tgt_emb_data = self._embed(
152
+ embdir,
153
+ tgt_langs,
154
+ self.tgt_encoder,
155
+ self.tgt_spm_model,
156
+ self.tgt_bpe_codes,
157
+ tgt_aug_langs,
158
+ )
159
+ aug_df = defaultdict(lambda: defaultdict())
160
+ combs = list(itertools.product(src_emb_data, tgt_emb_data))
161
+ for (src_lang, _, src_emb, _), (tgt_lang, tgt_txt, tgt_emb, augjson) in combs:
162
+ if src_lang == tgt_lang:
163
+ continue
164
+ err, nbex, aug_report = self._xsim(
165
+ src_emb, src_lang, tgt_emb, tgt_lang, tgt_txt, augjson
166
+ )
167
+ result = round(100 * err / nbex, 2)
168
+ if tgt_lang in tgt_aug_langs:
169
+ aug_df[tgt_lang][src_lang] = aug_report
170
+ if nbex < self.min_sents:
171
+ result = "skipped"
172
+ else:
173
+ err_sum += err
174
+ totl_nbex += nbex
175
+ outputs.append(
176
+ [self.corpus, f"{src_lang}-{tgt_lang}", f"{result}", f"{nbex}"]
177
+ )
178
+ outputs.append(
179
+ [
180
+ self.corpus,
181
+ "average",
182
+ f"{round(100 * err_sum / totl_nbex, 2)}",
183
+ f"{len(combs)}",
184
+ ]
185
+ )
186
+ print(
187
+ tabulate(
188
+ outputs,
189
+ tablefmt="psql",
190
+ headers=[
191
+ "dataset",
192
+ "src-tgt",
193
+ "xsim" + ("(++)" if tgt_aug_langs else ""),
194
+ "nbex",
195
+ ],
196
+ )
197
+ )
198
+ for tgt_aug_lang in tgt_aug_langs:
199
+ df = pandas.DataFrame.from_dict(aug_df[tgt_aug_lang]).fillna(0).T
200
+ print(
201
+ f"\nAbsolute error under augmented transformations for: {tgt_aug_lang}"
202
+ )
203
+ print(f"{tabulate(df, df.columns, floatfmt='.2f', tablefmt='grid')}")
204
+
205
+ def calc_xsim_nway(self, embdir, langs) -> None:
206
+ err_matrix = np.zeros((len(langs), len(langs)))
207
+ emb_data = self._embed(
208
+ embdir,
209
+ langs,
210
+ self.src_encoder,
211
+ self.src_spm_model,
212
+ self.src_bpe_codes,
213
+ )
214
+ for i1, (src_lang, _, src_emb, _) in enumerate(emb_data):
215
+ for i2, (tgt_lang, tgt_txt, tgt_emb, _) in enumerate(emb_data):
216
+ if src_lang == tgt_lang:
217
+ err_matrix[i1, i2] = 0
218
+ else:
219
+ err, nbex, _ = self._xsim(
220
+ src_emb, src_lang, tgt_emb, tgt_lang, tgt_txt
221
+ )
222
+ err_matrix[i1, i2] = 100 * err / nbex
223
+ df = pandas.DataFrame(err_matrix, columns=langs, index=langs)
224
+ df.loc["avg"] = df.sum() / float(df.shape[0] - 1) # exclude diagonal in average
225
+ print(f"\n{tabulate(df, langs, floatfmt='.2f', tablefmt='grid')}\n\n")
226
+ print(f"Global average: {df.loc['avg'].mean():.2f}")
227
+
228
+
229
+ def run_eval(args) -> None:
230
+ evaluation = Eval(args)
231
+ tmp_dir = None
232
+ if args.embed_dir:
233
+ os.makedirs(args.embed_dir, exist_ok=True)
234
+ embed_dir = args.embed_dir
235
+ else:
236
+ tmp_dir = tempfile.TemporaryDirectory()
237
+ embed_dir = Path(tmp_dir.name)
238
+ src_langs = sorted(args.src_langs.split(","))
239
+ tgt_aug_langs = sorted(args.tgt_aug_langs.split(",")) if args.tgt_aug_langs else []
240
+ if evaluation.nway:
241
+ evaluation.calc_xsim_nway(embed_dir, src_langs)
242
+ else:
243
+ assert (
244
+ args.tgt_langs
245
+ ), "Please provide tgt langs when not performing n-way comparison"
246
+ tgt_langs = sorted(args.tgt_langs.split(","))
247
+ evaluation.calc_xsim(embed_dir, src_langs, tgt_langs, tgt_aug_langs)
248
+ if tmp_dir:
249
+ tmp_dir.cleanup() # remove temporary directory
250
+
251
+
252
+ if __name__ == "__main__":
253
+ parser = argparse.ArgumentParser(
254
+ description="LASER: multilingual similarity error evaluation"
255
+ )
256
+ parser.add_argument(
257
+ "--base-dir",
258
+ type=Path,
259
+ default=None,
260
+ help="Base directory for evaluation files",
261
+ required=True,
262
+ )
263
+ parser.add_argument(
264
+ "--corpus",
265
+ type=str,
266
+ default=None,
267
+ help="Name of evaluation corpus",
268
+ required=True,
269
+ )
270
+ parser.add_argument(
271
+ "--corpus-part",
272
+ type=str,
273
+ default=None,
274
+ help="Specify split of the corpus to use e.g., dev",
275
+ required=True,
276
+ )
277
+ parser.add_argument(
278
+ "--margin",
279
+ type=str,
280
+ default=None,
281
+ help="Margin for xSIM calculation. See: https://aclanthology.org/P19-1309",
282
+ )
283
+ parser.add_argument(
284
+ "--min-sents",
285
+ type=int,
286
+ default=100,
287
+ help="Only use test sets which have at least N sentences",
288
+ )
289
+ parser.add_argument(
290
+ "--nway", action="store_true", help="Test N-way for corpora which support it"
291
+ )
292
+ parser.add_argument(
293
+ "--embed-dir",
294
+ type=Path,
295
+ default=None,
296
+ help="Store/load embeddings from specified directory (default temporary)",
297
+ )
298
+ parser.add_argument(
299
+ "--index-comparison",
300
+ action="store_true",
301
+ help="Use index comparison instead of texts (not recommended when test data contains duplicates)",
302
+ )
303
+ parser.add_argument("--src-spm-model", type=str, default=None)
304
+ parser.add_argument("--tgt-spm-model", type=str, default=None)
305
+ parser.add_argument(
306
+ "--src-bpe-codes",
307
+ type=str,
308
+ default=None,
309
+ help="Path to bpe codes for src model",
310
+ )
311
+ parser.add_argument(
312
+ "--tgt-bpe-codes",
313
+ type=str,
314
+ default=None,
315
+ help="Path to bpe codes for tgt model",
316
+ )
317
+ parser.add_argument("--src-encoder", type=str, default=None, required=True)
318
+ parser.add_argument("--tgt-encoder", type=str, default=None)
319
+ parser.add_argument(
320
+ "--buffer-size", type=int, default=100, help="Buffer size (sentences)"
321
+ )
322
+ parser.add_argument(
323
+ "--max-tokens",
324
+ type=int,
325
+ default=12000,
326
+ help="Maximum number of tokens to process in a batch",
327
+ )
328
+ parser.add_argument(
329
+ "--max-sentences",
330
+ type=int,
331
+ default=None,
332
+ help="Maximum number of sentences to process in a batch",
333
+ )
334
+ parser.add_argument("--cpu", action="store_true", help="Use CPU instead of GPU")
335
+
336
+ parser.add_argument(
337
+ "--src-langs",
338
+ type=str,
339
+ default=None,
340
+ help="Source-side languages for evaluation",
341
+ required=True,
342
+ )
343
+ parser.add_argument(
344
+ "--tgt-langs",
345
+ type=str,
346
+ default=None,
347
+ help="Target-side languages for evaluation",
348
+ )
349
+ parser.add_argument(
350
+ "--tgt-aug-langs",
351
+ type=str,
352
+ default=None,
353
+ help="languages with augmented data",
354
+ required=False,
355
+ )
356
+ parser.add_argument(
357
+ "--fp16",
358
+ action="store_true",
359
+ help="Store embedding matrices in fp16 instead of fp32",
360
+ )
361
+ parser.add_argument(
362
+ "--sort-kind",
363
+ type=str,
364
+ default="quicksort",
365
+ choices=["quicksort", "mergesort"],
366
+ help="Algorithm used to sort batch by length",
367
+ )
368
+ parser.add_argument(
369
+ "--use-hugging-face",
370
+ action="store_true",
371
+ help="Use a HuggingFace sentence transformer",
372
+ )
373
+ parser.add_argument(
374
+ "--embedding-dimension",
375
+ type=int,
376
+ default=1024,
377
+ help="Embedding dimension for encoders",
378
+ )
379
+ parser.add_argument("-v", "--verbose", action="store_true", help="Detailed output")
380
+ args = parser.parse_args()
381
+ run_eval(args)
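A hypothetical invocation of the evaluation script, driven from Python via subprocess; the corpus layout, language codes and model paths are illustrative placeholders, and only the flags are taken from the argument parser above:

import os, subprocess, sys

cmd = [
    sys.executable, os.path.join(os.environ["LASER"], "source", "eval.py"),
    "--base-dir", "/data/flores200",          # hypothetical corpus location
    "--corpus", "flores200",
    "--corpus-part", "dev",
    "--src-langs", "fra_Latn,deu_Latn",
    "--tgt-langs", "eng_Latn",
    "--src-encoder", "/models/laser2.pt",     # hypothetical model paths
    "--src-spm-model", "/models/laser2.spm",
    "--margin", "ratio",
    "--verbose",
]
subprocess.run(cmd, check=True)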
laser/source/lib/indexing.py ADDED
@@ -0,0 +1,258 @@
1
+ #!/usr/bin/python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # LASER Language-Agnostic SEntence Representations
9
+ # is a toolkit to calculate multilingual sentence embeddings
10
+ # and to use them for document classification, bitext filtering
11
+ # and mining
12
+ #
13
+ # --------------------------------------------------------
14
+ #
15
+ # tools for indexing and search with FAISS
16
+
17
+ import faiss
18
+ import os.path
19
+ import sys
20
+ import numpy as np
21
+
22
+ #-------------------------------------------------------------
23
+ # Get list of fnames:
24
+ # - we loop over the list of given languages
25
+ # - for each language, we also check if there are split files .%03d
26
+
27
+ def SplitFnames(par_fname, langs):
28
+ fnames = []
29
+ for l in langs:
30
+ fname = par_fname + '.' + l
31
+ if os.path.isfile(fname):
32
+ fnames.append(fname)
33
+ for i in range(1000):
34
+ fname = par_fname + '.' + l + '.{:03d}'.format(i)
35
+ if os.path.isfile(fname):
36
+ fnames.append(fname)
37
+ if len(fnames) == 0:
38
+ print("ERROR: no embeddings found in {:s}*".format(par_fname))
39
+ sys.exit(1)
40
+ return fnames
41
+
42
+ def SplitOpen(par_fname, langs, dim, dtype, verbose=False):
43
+ M = []
44
+ nf = 0
45
+ nc = 0
46
+ print('Reading sentence embeddings')
47
+ print(' - memory mapped files {:s}'.format(par_fname))
48
+ for fname in SplitFnames(par_fname, langs):
49
+ n = int(os.path.getsize(fname) / dim / np.dtype(dtype).itemsize)
50
+ if verbose:
51
+ print(' - {:s}: {:d} x {:d}'.format(fname, n, dim))
52
+ Mi = np.memmap(fname, mode='r', dtype=dtype, shape=(n, dim))
53
+ nc += n
54
+ nf += 1
55
+ M.append(Mi)
56
+ print(' - total of {:d} files: {:d} x {:d}'.format(nf, nc, dim))
57
+ return M
58
+
59
+ def SplitAccess(M, idx):
60
+ i = idx
61
+ for Mi in M:
62
+ n = Mi.shape[0]
63
+ if i < n:
64
+ return Mi[i,:]
65
+ i -= n
66
+ print('ERROR: index {:d} is too large for the memory mapped files'.format(idx))
67
+ sys.exit(1)
68
+
69
+
70
+ ###############################################################################
71
+ # create an FAISS index on the given data
72
+
73
+ def IndexCreate(dname, idx_type,
74
+ verbose=False, normalize=True, save_index=False, dim=1024):
75
+
76
+ assert idx_type == 'FlatL2', 'only FlatL2 index is currently supported'
77
+ x = np.fromfile(dname, dtype=np.float32, count=-1)
78
+ nbex = x.shape[0] // dim
79
+ print(' - embedding: {:s} {:d} examples of dim {:d}'
80
+ .format(dname, nbex, dim))
81
+ x.resize(nbex, dim)
82
+ print(' - creating FAISS index')
83
+ idx = faiss.IndexFlatL2(dim)
84
+ if normalize:
85
+ faiss.normalize_L2(x)
86
+ idx.add(x)
87
+ if save_index:
88
+ iname = 'TODO'
89
+ print(' - saving index into ' + iname)
90
+ faiss.write_index(idx, iname)
91
+ return x, idx
92
+
93
+
94
+ ###############################################################################
95
+ # search closest vector for all language pairs and calculate error rate
96
+
97
+ def IndexSearchMultiple(data, idx, langs, verbose=False, texts=None, print_errors=False):
98
+ nl = len(data)
99
+ nbex = data[0].shape[0]
100
+ err = np.zeros((nl, nl)).astype(float)
101
+ ref = np.linspace(0, nbex-1, nbex).astype(int) # [0, nbex)
102
+ if verbose:
103
+ if texts is None:
104
+ print('Calculating similarity error (indices):')
105
+ else:
106
+ print('Calculating similarity error (textual):')
107
+ for i1 in range(nl):
108
+ for i2 in range(nl):
109
+ if i1 != i2:
110
+ D, I = idx[i2].search(data[i1], 1)
111
+ if texts: # do textual comparison
112
+ e1 = 0
113
+ for p in range(I.shape[0]):
114
+ if texts[i2][p] != texts[i2][I[p,0]]:
115
+ e1 += 1
116
+ if print_errors:
117
+ print('Error {:s}\n {:s}'
118
+ .format(texts[i2][p].strip(), texts[i2][I[p,0]].strip()))
119
+ err[i1, i2] = e1 / nbex
120
+ else: # do index-based comparison
121
+ err[i1, i2] \
122
+ = (nbex - np.equal(I.reshape(nbex), ref)
123
+ .astype(int).sum()) / nbex
124
+ if verbose:
125
+ print(' - similarity error {:s}/{:s}: {:5.2f}%'
126
+ .format(langs[i1], langs[i2],
127
+ 100.0 * err[i1, i2]))
128
+ return err
129
+
130
+
131
+ ###############################################################################
132
+ # print confusion matrix
133
+
134
+ def IndexPrintConfusionMatrix(err, langs):
135
+ nl = len(langs)
136
+ assert nl == err.shape[0], 'size of error matrix does not match'
137
+ print('Confusion matrix:')
138
+ print('{:8s}'.format('langs'), end='')
139
+ for i2 in range(nl):
140
+ print('{:8s} '.format(langs[i2]), end='')
141
+ print('{:8s}'.format('avg'))
142
+ for i1 in range(nl):
143
+ print('{:3s}'.format(langs[i1]), end='')
144
+ for i2 in range(nl):
145
+ print('{:8.2f}%'.format(100 * err[i1, i2]), end='')
146
+ print('{:8.2f}%'.format(100 * err[i1, :].sum() / (nl-1)))
147
+
148
+ print('avg', end='')
149
+ for i2 in range(nl):
150
+ print('{:8.2f}%'.format(100 * err[:, i2].sum() / (nl-1)), end='')
151
+
152
+ # global average
153
+ print('{:8.2f}%'.format(100 * err.sum() / (nl-1) / nl))
154
+
155
+
156
+ ###############################################################################
157
+ # Load an FAISS index
158
+
159
+ def IndexLoad(idx_name, nprobe, gpu=False):
160
+ print('Reading FAISS index')
161
+ print(' - index: {:s}'.format(idx_name))
162
+ index = faiss.read_index(idx_name)
163
+ print(' - found {:d} sentences of dim {:d}'.format(index.ntotal, index.d))
164
+ print(' - setting nbprobe to {:d}'.format(nprobe))
165
+ if gpu:
166
+ print(' - transfer index to %d GPUs ' % faiss.get_num_gpus())
167
+ #co = faiss.GpuMultipleClonerOptions()
168
+ #co.shard = True
169
+ index = faiss.index_cpu_to_all_gpus(index) # co=co
170
+ faiss.GpuParameterSpace().set_index_parameter(index, 'nprobe', nprobe)
171
+ return index
172
+
173
+
174
+ ###############################################################################
175
+ # Opens a text file with the sentences corresponding to the indices used
176
+ # by an FAISS index
177
+ # We also need the reference files with the byte offsets to the beginning
178
+ # of each sentence
179
+ # optionally: array with number of words per sentence
180
+ # All arrays are memory mapped
181
+
182
+ def IndexTextOpen(txt_fname):
183
+ print('Reading text corpus')
184
+ print(' - texts: {:s}'.format(txt_fname))
185
+ txt_mmap = np.memmap(txt_fname, mode='r', dtype=np.uint8)
186
+ fname = txt_fname.replace('.txt', '.ref.bin32')
187
+ if os.path.isfile(fname):
188
+ print(' - sentence start offsets (32 bit): {}'.format(fname))
189
+ ref_mmap = np.memmap(fname, mode='r', dtype=np.uint32)
190
+ else:
191
+ fname = txt_fname.replace('.txt', '.ref.bin64')
192
+ if os.path.isfile(fname):
193
+ print(' - sentence start offsets (64 bit): {}'.format(fname))
194
+ ref_mmap = np.memmap(fname, mode='r', dtype=np.uint64)
195
+ else:
196
+ print('ERROR: no file with sentence start offsets found')
197
+ sys.exit(1)
198
+ print(' - found {:d} sentences'.format(ref_mmap.shape[0]))
199
+
200
+ nbw_mmap = None
201
+ fname = txt_fname.replace('.txt', '.nw.bin8')
202
+ if os.path.isfile(fname):
203
+ print(' - word counts: {:s}'.format(fname))
204
+ nbw_mmap = np.memmap(fname, mode='r', dtype=np.uint8)
205
+
206
+ M = None
207
+ fname = txt_fname.replace('.txt', '.meta')
208
+ if os.path.isfile(fname):
209
+ M = []
210
+ n = 0
211
+ print(' - metafile: {:s}'.format(fname))
212
+ with open(fname, 'r') as fp:
213
+ for line in fp:
214
+ fields = line.strip().split()
215
+ if len(fields) != 2:
216
+ print('ERROR: format error in meta file')
217
+ sys.exit(1)
218
+ n += int(fields[1])
219
+ M.append({'lang': fields[0], 'n': n})
220
+ print(' - found {:d} languages:'.format(len(M)), end='')
221
+ for L in M:
222
+ print(' {:s}'.format(L['lang']), end='')
223
+ print('')
224
+
225
+ return txt_mmap, ref_mmap, nbw_mmap, M
226
+
227
+
228
+ ###############################################################################
229
+ # Return the text for the given index
230
+
231
+ def IndexTextQuery(txt_mmap, ref_mmap, idx):
232
+ p = int(ref_mmap[idx]) # get starting byte position
233
+ i = 0
234
+ dim = 10000 # max sentence length in bytes
235
+ b = bytearray(dim)
236
+ # find EOL
237
+ while txt_mmap[p+i] != 10 and i < dim:
238
+ b[i] = txt_mmap[p+i]
239
+ i += 1
240
+
241
+ return b[0:i].decode('utf-8')
242
+
243
+
244
+ ###############################################################################
245
+ # Search the [k] nearest vectors of [x] in the given index
246
+ # and return the text lines
247
+
248
+ def IndexSearchKNN(index, x, T, R, kmax=1, Dmax=1.0, dedup=True):
249
+ D, I = index.search(x, kmax)
250
+ prev = {} # for deduplication
251
+ res = []
252
+ for n in range(x.shape[0]):
253
+ for i in range(kmax):
254
+ txt = IndexTextQuery(T, R, I[n, i])
255
+ if (dedup and txt not in prev) and D[n, i] <= Dmax:
256
+ prev[txt] = 1
257
+ res.append([txt, D[n, i]])
258
+ return res
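A short sketch of how the indexing helpers above can be combined for an n-way similarity-error check. It assumes the embedding files come from embed.py (float32, dim=1024) and contain the same sentences in the same order for both languages; file names and language codes are placeholders:

import os, sys
sys.path.append(os.path.join(os.environ["LASER"], "source"))
from lib.indexing import IndexCreate, IndexSearchMultiple, IndexPrintConfusionMatrix

langs = ["en", "fr"]                     # placeholder language codes
data, idx = [], []
for lang in langs:
    d, i = IndexCreate(f"dev.{lang}.laser2.bin", "FlatL2", verbose=True)
    data.append(d)
    idx.append(i)

err = IndexSearchMultiple(data, idx, langs, verbose=True)
IndexPrintConfusionMatrix(err, langs)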
laser/source/lib/romanize_lc.py ADDED
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/python
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # LASER Language-Agnostic SEntence Representations
9
+ # is a toolkit to calculate multilingual sentence embeddings
10
+ # and to use them for document classification, bitext filtering
11
+ # and mining
12
+ #
13
+ # --------------------------------------------------------
14
+ #
15
+ # Romanize and lower case text
16
+
17
+ import os
18
+ import sys
19
+ import argparse
20
+ from transliterate import translit, get_available_language_codes
21
+
22
+ parser = argparse.ArgumentParser(
23
+ formatter_class=argparse.RawDescriptionHelpFormatter,
24
+ description="Calculate multilingual sentence encodings")
25
+ parser.add_argument(
26
+ '--input', '-i', type=argparse.FileType('r', encoding='UTF-8'),
27
+ default=sys.stdin,
28
+ metavar='PATH',
29
+ help="Input text file (default: standard input).")
30
+ parser.add_argument(
31
+ '--output', '-o', type=argparse.FileType('w', encoding='UTF-8'),
32
+ default=sys.stdout,
33
+ metavar='PATH',
34
+ help="Output text file (default: standard output).")
35
+ parser.add_argument(
36
+ '--language', '-l', type=str,
37
+ metavar='STR', default="none",
38
+ help="perform transliteration into Roman characters"
39
+ " from the specified language (default none)")
40
+ parser.add_argument(
41
+ '--preserve-case', '-C', action='store_true',
42
+ help="Preserve case of input texts (default is all lower case)")
43
+
44
+ args = parser.parse_args()
45
+
46
+ for line in args.input:
47
+ if args.language != "none":
48
+ line = translit(line, args.language, reversed=True)
49
+ if not args.preserve_case:
50
+ line = line.lower()
51
+ args.output.write(line)
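For illustration, the transliteration step performed by this filter can be reproduced directly with the transliterate package it imports; the Greek sample string below is made up:

from transliterate import translit

line = "Καλημέρα κόσμε"                              # made-up Greek input
print(translit(line, "el", reversed=True).lower())   # romanized and lower-cased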
laser/source/lib/text_processing.py ADDED
@@ -0,0 +1,272 @@
1
+ #!/usr/bin/python
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # LASER Language-Agnostic SEntence Representations
9
+ # is a toolkit to calculate multilingual sentence embeddings
10
+ # and to use them for document classification, bitext filtering
11
+ # and mining
12
+ #
13
+ # --------------------------------------------------------
14
+ #
15
+ # Helper functions for tokenization and BPE
16
+
17
+ import os
18
+ import sys
19
+ import logging
20
+ from pathlib import Path
21
+ import numpy as np
22
+ from subprocess import run, check_output, CalledProcessError, DEVNULL
23
+
24
+ logging.basicConfig(
25
+ stream=sys.stdout,
26
+ level=logging.INFO,
27
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s")
28
+ logger = logging.getLogger("preprocess")
29
+
30
+ # get environment
31
+ assert os.environ.get('LASER'), 'Please set the environment variable LASER'
32
+ LASER = os.environ['LASER']
33
+
34
+ FASTBPE = LASER + '/tools-external/fastBPE/fast'
35
+ MOSES_BDIR = LASER + '/tools-external/moses-tokenizer/tokenizer/'
36
+ MOSES_TOKENIZER = MOSES_BDIR + 'tokenizer.perl -q -no-escape -threads 20 -l '
37
+ MOSES_LC = MOSES_BDIR + 'lowercase.perl'
38
+ NORM_PUNC = MOSES_BDIR + 'normalize-punctuation.perl -l '
39
+ DESCAPE = MOSES_BDIR + 'deescape-special-chars.perl'
40
+ REM_NON_PRINT_CHAR = MOSES_BDIR + 'remove-non-printing-char.perl'
41
+ SPM_DIR = LASER + '/tools-external/sentencepiece-master/build/src/'
42
+ SPM = 'LD_LIBRARY_PATH=' + SPM_DIR + ' ' + SPM_DIR + '/spm_encode --output_format=piece'
43
+
44
+ # Romanization (and lower casing)
45
+ ROMAN_LC = 'python3 ' + LASER + '/source/lib/romanize_lc.py -l '
46
+
47
+ # Mecab tokenizer for Japanese
48
+ MECAB = LASER + '/tools-external/mecab'
49
+
50
+
51
+
52
+
53
+ ###############################################################################
54
+ #
55
+ # Tokenize a line of text
56
+ #
57
+ ###############################################################################
58
+
59
+ def TokenLine(line, lang='en', lower_case=True, romanize=False):
60
+ assert lower_case, 'lower case is needed by all the models'
61
+ roman = lang if romanize else 'none'
62
+ tok = check_output(
63
+ REM_NON_PRINT_CHAR
64
+ + '|' + NORM_PUNC + lang
65
+ + '|' + DESCAPE
66
+ + '|' + MOSES_TOKENIZER + lang
67
+ + ('| python3 -m jieba -d ' if lang == 'zh' else '')
68
+ + ('|' + MECAB + '/bin/mecab -O wakati -b 50000 ' if lang == 'ja' else '')
69
+ + '|' + ROMAN_LC + roman,
70
+ input=line,
71
+ encoding='UTF-8',
72
+ shell=True)
73
+ return tok.strip()
74
+
75
+
76
+ ###############################################################################
77
+ #
78
+ # Tokenize a file
79
+ #
80
+ ###############################################################################
81
+
82
+ def Token(inp_fname, out_fname, lang='en',
83
+ lower_case=True, romanize=False, descape=False,
84
+ verbose=False, over_write=False, gzip=False):
85
+ assert lower_case, 'lower case is needed by all the models'
86
+ assert not over_write, 'over-write is not yet implemented'
87
+ if not os.path.isfile(out_fname):
88
+ cat = 'zcat ' if gzip else 'cat '
89
+ roman = lang if romanize else 'none'
90
+ # handle some iso3 language codes
91
+ if lang in ('cmn', 'wuu', 'yue'):
92
+ lang = 'zh'
93
+ if lang in ('jpn',):
94
+ lang = 'ja'
95
+ if verbose:
96
+ logger.info('tokenizing {} in language {} {} {} {}'
97
+ .format(os.path.basename(inp_fname), lang,
98
+ '(gzip)' if gzip else '',
99
+ '(de-escaped)' if descape else '',
100
+ '(romanized)' if romanize else ''))
101
+ run(cat + inp_fname
102
+ + '|' + REM_NON_PRINT_CHAR
103
+ + '|' + NORM_PUNC + lang
104
+ + ('|' + DESCAPE if descape else '')
105
+ + '|' + MOSES_TOKENIZER + lang
106
+ + ('| python3 -m jieba -d ' if lang == 'zh' else '')
107
+ + ('|' + MECAB + '/bin/mecab -O wakati -b 50000 ' if lang == 'ja' else '')
108
+ + '|' + ROMAN_LC + roman
109
+ + '>' + out_fname,
110
+ env=dict(os.environ, LD_LIBRARY_PATH=MECAB + '/lib'),
111
+ shell=True)
112
+ elif not over_write and verbose:
113
+ logger.info('tokenized file {} exists already'
114
+ .format(os.path.basename(out_fname)))
115
+
116
+
117
+ ###############################################################################
118
+ #
119
+ # Apply SPM on a whole file
120
+ #
121
+ ###############################################################################
122
+
123
+ def SPMApply(inp_fname, out_fname, spm_model, lang='en',
124
+ lower_case=True, descape=False,
125
+ verbose=False, over_write=False, gzip=False):
126
+ assert lower_case, 'lower case is needed by all the models'
127
+ if not os.path.isfile(out_fname):
128
+ cat = 'zcat ' if gzip else 'cat '
129
+ if verbose:
130
+ logger.info('SPM processing {} {} {}'
131
+ .format(os.path.basename(inp_fname),
132
+ '(gzip)' if gzip else '',
133
+ '(de-escaped)' if descape else ''))
134
+
135
+ assert os.path.isfile(spm_model), f'SPM model {spm_model} not found'
136
+ command = (cat + inp_fname
137
+ + '|' + REM_NON_PRINT_CHAR
138
+ + '|' + NORM_PUNC + lang
139
+ + ('|' + DESCAPE if descape else '')
140
+ + '|' + ROMAN_LC + 'none'
141
+ + '|' + SPM + " --model=" + spm_model
142
+ + ' > ' + out_fname)
143
+ try:
144
+ run(["/bin/bash", "-o", "pipefail", "-c", command], check=True, capture_output=True)
145
+ except CalledProcessError as e:
146
+ logger.error(e.stderr.decode().strip())
147
+ sys.exit(1)
148
+
149
+ elif not over_write and verbose:
150
+ logger.info('SPM encoded file {} exists already'
151
+ .format(os.path.basename(out_fname)))
152
+
153
+
154
+ ###############################################################################
155
+ #
156
+ # Apply FastBPE on a whole file
157
+ #
158
+ ###############################################################################
159
+
160
+ def BPEfastApply(inp_fname, out_fname, bpe_codes,
161
+ verbose=False, over_write=False):
162
+ if not os.path.isfile(out_fname):
163
+ if verbose:
164
+ logger.info('fastBPE: processing {}'
165
+ .format(os.path.basename(inp_fname)))
166
+ bpe_vocab = bpe_codes.replace('fcodes', 'fvocab')
167
+ assert os.path.isfile(bpe_vocab), f'fastBPE: vocab file {bpe_vocab} not found'
168
+ run(FASTBPE + ' applybpe '
169
+ + out_fname + ' ' + inp_fname
170
+ + ' ' + bpe_codes
171
+ + ' ' + bpe_vocab, shell=True, stderr=DEVNULL)
172
+ elif not over_write and verbose:
173
+ logger.info('fastBPE: {} exists already'
174
+ .format(os.path.basename(out_fname)))
175
+
176
+
177
+ ###############################################################################
178
+ #
179
+ # Split long lines into multiple sentences at "."
180
+ #
181
+ ###############################################################################
182
+
183
+ def SplitLines(ifname, of_txt, of_sid):
184
+ if os.path.isfile(of_txt):
185
+ print(' - SplitLines: {} already exists'.format(of_txt))
186
+ return
187
+ nl = 0
188
+ nl_sp = 0
189
+ maxw = 0
190
+ maxw_sp = 0
191
+ fp_sid = open(of_sid, 'w')
192
+ fp_txt = open(of_txt, 'w')
193
+ with open(ifname, 'r') as ifp:
194
+ for line in ifp:
195
+ print('{:d}'.format(nl), file=fp_sid) # store current sentence ID
196
+ nw = 0
197
+ words = line.strip().split()
198
+ maxw = max(maxw, len(words))
199
+ for i, word in enumerate(words):
200
+ if word == '.' and i != len(words)-1:
201
+ if nw > 0:
202
+ print(' {}'.format(word), file=fp_txt)
203
+ else:
204
+ print('{}'.format(word), file=fp_txt)
205
+ # store current sentence ID
206
+ print('{:d}'.format(nl), file=fp_sid)
207
+ nl_sp += 1
208
+ maxw_sp = max(maxw_sp, nw+1)
209
+ nw = 0
210
+ else:
211
+ if nw > 0:
212
+ print(' {}'.format(word), end='', file=fp_txt)
213
+ else:
214
+ print('{}'.format(word), end='', file=fp_txt)
215
+ nw += 1
216
+ if nw > 0:
217
+ # handle remainder of sentence
218
+ print('', file=fp_txt)
219
+ nl_sp += 1
220
+ maxw_sp = max(maxw_sp, nw+1)
221
+ nl += 1
222
+ print(' - Split sentences: {}'.format(ifname))
223
+ print(' - lines/max words: {:d}/{:d} -> {:d}/{:d}'
224
+ .format(nl, maxw, nl_sp, maxw_sp))
225
+ fp_sid.close()
226
+ fp_txt.close()
227
+
228
+
229
+ ###############################################################################
230
+ #
231
+ # Join embeddings of previously split lines (average)
232
+ #
233
+ ###############################################################################
234
+
235
+ def JoinEmbed(if_embed, sid_fname, of_embed, dim=1024):
236
+ if os.path.isfile(of_embed):
237
+ print(' - JoinEmbed: {} already exists'.format(of_embed))
238
+ return
239
+ # read the input embeddings
240
+ em_in = np.fromfile(if_embed, dtype=np.float32, count=-1).reshape(-1, dim)
241
+ ninp = em_in.shape[0]
242
+ print(' - Combine embeddings:')
243
+ print(' input: {:s} {:d} sentences'.format(if_embed, ninp))
244
+
245
+ # get all sentence IDs
246
+ sid = np.empty(ninp, dtype=np.int32)
247
+ i = 0
248
+ with open(sid_fname, 'r') as fp_sid:
249
+ for line in fp_sid:
250
+ sid[i] = int(line)
251
+ i += 1
252
+ nout = sid.max() + 1
253
+ print(' IDs: {:s}, {:d} sentences'.format(sid_fname, nout))
254
+
255
+ # combining
256
+ em_out = np.zeros((nout, dim), dtype=np.float32)
257
+ cnt = np.zeros(nout, dtype=np.int32)
258
+ for i in range(ninp):
259
+ idx = sid[i]
260
+ em_out[idx] += em_in[i] # cumulate sentence vectors
261
+ cnt[idx] += 1
262
+
263
+ if (cnt == 0).astype(int).sum() > 0:
264
+ print('ERROR: missing lines')
265
+ sys.exit(1)
266
+
267
+ # normalize
268
+ for i in range(nout):
269
+ em_out[i] /= cnt[i]
270
+
271
+ print(' output: {:s}'.format(of_embed))
272
+ em_out.tofile(of_embed)
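A minimal sketch of the SplitLines/JoinEmbed round trip defined above, assuming LASER is set, hypothetical file names, and that embed.py is run in between to produce the split-sentence embeddings:

import os, sys
sys.path.append(os.path.join(os.environ["LASER"], "source"))
from lib.text_processing import SplitLines, JoinEmbed

SplitLines("long.txt", "long.split.txt", "long.sid")   # split at "." and record original line IDs
# ... embed long.split.txt with embed.py to obtain long.split.emb ...
JoinEmbed("long.split.emb", "long.sid", "long.emb", dim=1024)  # average vectors back per original line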
laser/source/mine_bitexts.py ADDED
@@ -0,0 +1,302 @@
1
+ #!/usr/bin/python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # LASER Language-Agnostic SEntence Representations
9
+ # is a toolkit to calculate multilingual sentence embeddings
10
+ # and to use them for document classification, bitext filtering
11
+ # and mining
12
+ #
13
+ # --------------------------------------------------------
14
+ #
15
+ # Tool to search, score or mine bitexts using multilingual sentence embeddings
16
+ # The functions can also be imported into other Python code
17
+
18
+ import os
19
+ import sys
20
+ import faiss
21
+ import argparse
22
+ import torch
23
+ import numpy as np
24
+
25
+ # get environment
26
+ assert os.environ.get('LASER'), 'Please set the environment variable LASER'
27
+ LASER = os.environ['LASER']
28
+
29
+ sys.path.append(LASER + '/source')
30
+ sys.path.append(LASER + '/source/tools')
31
+ from embed import SentenceEncoder, EncodeLoad, EncodeFile, EmbedLoad
32
+ from lib.text_processing import Token, BPEfastApply
33
+
34
+
35
+ ###############################################################################
36
+ #
37
+ # Load texts and remove duplicates
38
+ #
39
+ ###############################################################################
40
+
41
+ def TextLoadUnify(fname, args):
42
+ if args.verbose:
43
+ print(' - loading texts {:s}: '.format(fname), end='')
44
+ fin = open(fname, encoding=args.encoding, errors='surrogateescape')
45
+ inds = []
46
+ sents = []
47
+ sent2ind = {}
48
+ n = 0
49
+ nu = 0
50
+ for line in fin:
51
+ new_ind = len(sent2ind)
52
+ inds.append(sent2ind.setdefault(line, new_ind))
53
+ if args.unify:
54
+ if inds[-1] == new_ind:
55
+ sents.append(line[:-1])
56
+ nu += 1
57
+ else:
58
+ sents.append(line[:-1])
59
+ nu += 1
60
+ n += 1
61
+ if args.verbose:
62
+ print('{:d} lines, {:d} unique'.format(n, nu))
63
+ del sent2ind
64
+ return inds, sents
65
+
66
+
67
+ ###############################################################################
68
+ #
69
+ # Wrapper for knn on CPU/GPU
70
+ #
71
+ ###############################################################################
72
+
73
+ def knn(x, y, k, use_gpu):
74
+ return knnGPU(x, y, k) if use_gpu else knnCPU(x, y, k)
75
+
76
+
77
+ ###############################################################################
78
+ #
79
+ # Perform knn on GPU
80
+ #
81
+ ###############################################################################
82
+
83
+ def knnGPU(x, y, k, mem=5*1024*1024*1024):
84
+ dim = x.shape[1]
85
+ batch_size = mem // (dim*4)
86
+ sim = np.zeros((x.shape[0], k), dtype=np.float32)
87
+ ind = np.zeros((x.shape[0], k), dtype=np.int64)
88
+ for xfrom in range(0, x.shape[0], batch_size):
89
+ xto = min(xfrom + batch_size, x.shape[0])
90
+ bsims, binds = [], []
91
+ for yfrom in range(0, y.shape[0], batch_size):
92
+ yto = min(yfrom + batch_size, y.shape[0])
93
+ # print('{}-{} -> {}-{}'.format(xfrom, xto, yfrom, yto))
94
+ idx = faiss.IndexFlatIP(dim)
95
+ idx = faiss.index_cpu_to_all_gpus(idx)
96
+ idx.add(y[yfrom:yto])
97
+ bsim, bind = idx.search(x[xfrom:xto], min(k, yto-yfrom))
98
+ bsims.append(bsim)
99
+ binds.append(bind + yfrom)
100
+ del idx
101
+ bsims = np.concatenate(bsims, axis=1)
102
+ binds = np.concatenate(binds, axis=1)
103
+ aux = np.argsort(-bsims, axis=1)
104
+ for i in range(xfrom, xto):
105
+ for j in range(k):
106
+ sim[i, j] = bsims[i-xfrom, aux[i-xfrom, j]]
107
+ ind[i, j] = binds[i-xfrom, aux[i-xfrom, j]]
108
+ return sim, ind
109
+
110
+
111
+ ###############################################################################
112
+ #
113
+ # Perform knn on CPU
114
+ #
115
+ ###############################################################################
116
+
117
+ def knnCPU(x, y, k):
118
+ dim = x.shape[1]
119
+ idx = faiss.IndexFlatIP(dim)
120
+ idx.add(y)
121
+ sim, ind = idx.search(x, k)
122
+ return sim, ind
123
+
124
+
125
+ ###############################################################################
126
+ #
127
+ # Scoring
128
+ #
129
+ ###############################################################################
130
+
131
+ def score(x, y, fwd_mean, bwd_mean, margin):
132
+ return margin(x.dot(y), (fwd_mean + bwd_mean) / 2)
133
+
134
+
135
+ def score_candidates(x, y, candidate_inds, fwd_mean, bwd_mean, margin, verbose=False):
136
+ if verbose:
137
+ print(' - scoring {:d} candidates'.format(x.shape[0]))
138
+ scores = np.zeros(candidate_inds.shape)
139
+ for i in range(scores.shape[0]):
140
+ for j in range(scores.shape[1]):
141
+ k = candidate_inds[i, j]
142
+ scores[i, j] = score(x[i], y[k], fwd_mean[i], bwd_mean[k], margin)
143
+ return scores
144
+
145
+
146
+ ###############################################################################
147
+ #
148
+ # Main
149
+ #
150
+ ###############################################################################
151
+
152
+ if __name__ == '__main__':
153
+ parser = argparse.ArgumentParser(description='LASER: Mine bitext')
154
+ parser.add_argument('src',
155
+ help='Source language corpus')
156
+ parser.add_argument('trg',
157
+ help='Target language corpus')
158
+ parser.add_argument('--encoding', default='utf-8',
159
+ help='Character encoding for input/output')
160
+ parser.add_argument('--src-lang', required=True,
161
+ help='Source language id')
162
+ parser.add_argument('--trg-lang', required=True,
163
+ help='Target language id')
164
+ parser.add_argument('--output', required=True,
165
+ help='Output file')
166
+ parser.add_argument('--threshold', type=float, default=0,
167
+ help='Threshold on extracted bitexts')
168
+
169
+ # mining params
170
+ parser.add_argument('--mode',
171
+ choices=['search', 'score', 'mine'], required=True,
172
+ help='Execution mode')
173
+ parser.add_argument('-k', '--neighborhood',
174
+ type=int, default=4,
175
+ help='Neighborhood size')
176
+ parser.add_argument('--margin',
177
+ choices=['absolute', 'distance', 'ratio'], default='ratio',
178
+ help='Margin function')
179
+ parser.add_argument('--retrieval',
180
+ choices=['fwd', 'bwd', 'max', 'intersect'], default='max',
181
+ help='Retrieval strategy')
182
+ parser.add_argument('--unify', action='store_true',
183
+ help='Unify texts')
184
+ parser.add_argument('--gpu', action='store_true',
185
+ help='Run knn on all available GPUs')
186
+ parser.add_argument('--verbose', action='store_true',
187
+ help='Detailed output')
188
+
189
+ # embeddings
190
+ parser.add_argument('--src-embeddings', required=True,
191
+ help='Precomputed source sentence embeddings')
192
+ parser.add_argument('--trg-embeddings', required=True,
193
+ help='Precomputed target sentence embeddings')
194
+ parser.add_argument('--dim', type=int, default=1024,
195
+ help='Embedding dimensionality')
196
+ parser.add_argument('--fp16', action='store_true',
197
+ help='Load precomputed embeddings in float16 format')
198
+ args = parser.parse_args()
199
+
200
+ print('LASER: tool to search, score or mine bitexts')
201
+ use_gpu = torch.cuda.is_available() and args.gpu
202
+ if use_gpu:
203
+ print(' - knn will run on all available GPUs (recommended)')
204
+ else:
205
+ print(' - knn will run on CPU (slow)')
206
+
207
+ src_inds, src_sents = TextLoadUnify(args.src, args)
208
+ trg_inds, trg_sents = TextLoadUnify(args.trg, args)
209
+
210
+ def unique_embeddings(emb, ind, verbose=False):
211
+ aux = {j: i for i, j in enumerate(ind)}
212
+ if verbose:
213
+ print(' - unify embeddings: {:d} -> {:d}'.format(len(emb), len(aux)))
214
+ return emb[[aux[i] for i in range(len(aux))]]
215
+
216
+ # load the embeddings and store as np.float32 (required for FAISS)
217
+ x = EmbedLoad(args.src_embeddings, args.dim, verbose=args.verbose, fp16=args.fp16).astype(np.float32)
218
+ if args.unify:
219
+ x = unique_embeddings(x, src_inds, args.verbose)
220
+ faiss.normalize_L2(x)
221
+ y = EmbedLoad(args.trg_embeddings, args.dim, verbose=args.verbose, fp16=args.fp16).astype(np.float32)
222
+ if args.unify:
223
+ y = unique_embeddings(y, trg_inds, args.verbose)
224
+ faiss.normalize_L2(y)
225
+
226
+ # calculate knn in both directions
227
+ if args.retrieval != 'bwd':
228
+ if args.verbose:
229
+ print(' - perform {:d}-nn source against target'.format(args.neighborhood))
230
+ x2y_sim, x2y_ind = knn(x, y, min(y.shape[0], args.neighborhood), use_gpu)
231
+ x2y_mean = x2y_sim.mean(axis=1)
232
+
233
+ if args.retrieval != 'fwd':
234
+ if args.verbose:
235
+ print(' - perform {:d}-nn target against source'.format(args.neighborhood))
236
+ y2x_sim, y2x_ind = knn(y, x, min(x.shape[0], args.neighborhood), use_gpu)
237
+ y2x_mean = y2x_sim.mean(axis=1)
238
+
239
+ # margin function
240
+ if args.margin == 'absolute':
241
+ margin = lambda a, b: a
242
+ elif args.margin == 'distance':
243
+ margin = lambda a, b: a - b
244
+ else: # args.margin == 'ratio':
245
+ margin = lambda a, b: a / b
246
+
247
+ fout = open(args.output, mode='w', encoding=args.encoding, errors='surrogateescape')
248
+
249
+ if args.mode == 'search':
250
+ if args.verbose:
251
+ print(' - Searching for closest sentences in target')
252
+ print(' - writing alignments to {:s}'.format(args.output))
253
+ scores = score_candidates(x, y, x2y_ind, x2y_mean, y2x_mean, margin, args.verbose)
254
+ best = x2y_ind[np.arange(x.shape[0]), scores.argmax(axis=1)]
255
+
256
+ nbex = x.shape[0]
257
+ ref = np.linspace(0, nbex-1, nbex).astype(int) # [0, nbex)
258
+ err = nbex - np.equal(best.reshape(nbex), ref).astype(int).sum()
259
+ print(' - errors: {:d}={:.2f}%'.format(err, 100*err/nbex))
260
+ for i in src_inds:
261
+ print(trg_sents[best[i]], file=fout)
262
+
263
+ elif args.mode == 'score':
264
+ for i, j in zip(src_inds, trg_inds):
265
+ s = score(x[i], y[j], x2y_mean[i], y2x_mean[j], margin)
266
+ print(s, src_sents[i], trg_sents[j], sep='\t', file=fout)
267
+
268
+ elif args.mode == 'mine':
269
+ if args.verbose:
270
+ print(' - mining for parallel data')
271
+ fwd_scores = score_candidates(x, y, x2y_ind, x2y_mean, y2x_mean, margin, args.verbose)
272
+ bwd_scores = score_candidates(y, x, y2x_ind, y2x_mean, x2y_mean, margin, args.verbose)
273
+ fwd_best = x2y_ind[np.arange(x.shape[0]), fwd_scores.argmax(axis=1)]
274
+ bwd_best = y2x_ind[np.arange(y.shape[0]), bwd_scores.argmax(axis=1)]
275
+ if args.verbose:
276
+ print(' - writing alignments to {:s}'.format(args.output))
277
+ if args.threshold > 0:
278
+ print(' - with threshold of {:f}'.format(args.threshold))
279
+ if args.retrieval == 'fwd':
280
+ for i, j in enumerate(fwd_best):
281
+ print(fwd_scores[i].max(), src_sents[i], trg_sents[j], sep='\t', file=fout)
282
+ if args.retrieval == 'bwd':
283
+ for j, i in enumerate(bwd_best):
284
+ print(bwd_scores[j].max(), src_sents[i], trg_sents[j], sep='\t', file=fout)
285
+ if args.retrieval == 'intersect':
286
+ for i, j in enumerate(fwd_best):
287
+ if bwd_best[j] == i:
288
+ print(fwd_scores[i].max(), src_sents[i], trg_sents[j], sep='\t', file=fout)
289
+ if args.retrieval == 'max':
290
+ indices = np.stack((np.concatenate((np.arange(x.shape[0]), bwd_best)),
291
+ np.concatenate((fwd_best, np.arange(y.shape[0])))), axis=1)
292
+ scores = np.concatenate((fwd_scores.max(axis=1), bwd_scores.max(axis=1)))
293
+ seen_src, seen_trg = set(), set()
294
+ for i in np.argsort(-scores):
295
+ src_ind, trg_ind = indices[i]
296
+ if src_ind not in seen_src and trg_ind not in seen_trg:
297
+ seen_src.add(src_ind)
298
+ seen_trg.add(trg_ind)
299
+ if scores[i] > args.threshold:
300
+ print(scores[i], src_sents[src_ind], trg_sents[trg_ind], sep='\t', file=fout)
301
+
302
+ fout.close()
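A small self-contained sketch of the ratio-margin criterion used by score() and score_candidates() above, with a brute-force numpy k-nn standing in for FAISS on toy vectors:

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=(5, 4)).astype(np.float32)   # toy "source" embeddings
y = rng.normal(size=(5, 4)).astype(np.float32)   # toy "target" embeddings
x /= np.linalg.norm(x, axis=1, keepdims=True)    # mimic faiss.normalize_L2
y /= np.linalg.norm(y, axis=1, keepdims=True)

k = 3
sim = x @ y.T                                    # cosine similarities
fwd_mean = np.sort(sim, axis=1)[:, -k:].mean(axis=1)    # mean over k-nn of each x_i
bwd_mean = np.sort(sim.T, axis=1)[:, -k:].mean(axis=1)  # mean over k-nn of each y_j

i, j = 0, 2
ratio_margin = sim[i, j] / ((fwd_mean[i] + bwd_mean[j]) / 2)
print(f"ratio margin for pair ({i},{j}): {ratio_margin:.3f}")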
laser/source/nli.py ADDED
@@ -0,0 +1,371 @@
1
+ #!/usr/bin/python
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # LASER Language-Agnostic SEntence Representations
9
+ # is a toolkit to calculate multilingual sentence embeddings
10
+ # and to use them for document classification, bitext filtering
11
+ # and mining
12
+ #
13
+ # --------------------------------------------------------
14
+ #
15
+ #
16
+
17
+
18
+ import os
19
+ import copy
20
+ import argparse
21
+ import torch
22
+ import torch.nn as nn
23
+ import torch.nn.functional as F
24
+ import torch.optim as optim
25
+ import torch.utils.data as data_utils
26
+ import numpy as np
27
+ import faiss
28
+
29
+
30
+ ################################################
31
+
32
+ def LoadDataNLI(fn1, fn2, fn_lbl,
33
+ dim=1024, bsize=32,
34
+ fraction=1.0,
35
+ shuffle=False, quiet=False):
36
+ x = np.fromfile(fn1, dtype=np.float32, count=-1)
37
+ x.resize(x.shape[0] // dim, dim)
38
+ faiss.normalize_L2(x)
39
+
40
+ y = np.fromfile(fn2, dtype=np.float32, count=-1)
41
+ y.resize(y.shape[0] // dim, dim)
42
+ faiss.normalize_L2(y)
43
+
44
+ lbl = np.loadtxt(fn_lbl, dtype=np.int32)
45
+ lbl.reshape(lbl.shape[0], 1)
46
+
47
+ if not quiet:
48
+ print(' - read {:d}x{:d} elements in {:s}'.format(x.shape[0], x.shape[1], fn1))
49
+ print(' - read {:d}x{:d} elements in {:s}'.format(y.shape[0], y.shape[1], fn2))
50
+ print(' - read {:d} labels [{:d},{:d}] in {:s}'
51
+ .format(lbl.shape[0], lbl.min(), lbl.max(), fn_lbl))
52
+
53
+ if fraction < 1.0:
54
+ N = int(x.shape[0] * fraction)
55
+ if not quiet:
56
+ print(' - using only the first {:d} examples'.format(N))
57
+ x = x[:N][:]
58
+ y = y[:N][:]
59
+ lbl = lbl[:N][:]
60
+
61
+ if not quiet:
62
+ print(' - combine premises and hyps')
63
+ nli = np.concatenate((x, y, np.absolute(x - y), np.multiply(x, y)), axis=1)
64
+
65
+ D = data_utils.TensorDataset(torch.from_numpy(nli), torch.from_numpy(lbl))
66
+ loader = data_utils.DataLoader(D, batch_size=bsize, shuffle=shuffle)
67
+ return loader
68
+
69
+
70
+ ################################################
71
+
72
+ class Net(nn.Module):
73
+ def __init__(self, fname='',
74
+ idim=4*1024, odim=2, nhid=None,
75
+ dropout=0.0, gpu=0, activation='TANH'):
76
+ super(Net, self).__init__()
77
+ self.gpu = gpu
78
+ if os.path.isfile(fname):
79
+ print(' - loading mlp from {:s}'.format(fname))
80
+ loaded = torch.load(fname)
81
+ self.mlp = loaded.mlp
82
+ else:
83
+ modules = []
84
+ print(' - mlp {:d}'.format(idim), end='')
85
+ if len(nhid) > 0:
86
+ if dropout > 0:
87
+ modules.append(nn.Dropout(p=dropout))
88
+ nprev = idim
89
+ for nh in nhid:
90
+ if nh > 0:
91
+ modules.append(nn.Linear(nprev, nh))
92
+ nprev = nh
93
+ if activation == 'TANH':
94
+ modules.append(nn.Tanh())
95
+ print('-{:d}t'.format(nh), end='')
96
+ elif activation == 'RELU':
97
+ modules.append(nn.ReLU())
98
+ print('-{:d}r'.format(nh), end='')
99
+ else:
100
+ raise Exception(f'Unrecognised activation {activation}')
101
+ if dropout > 0:
102
+ modules.append(nn.Dropout(p=dropout))
103
+ modules.append(nn.Linear(nprev, odim))
104
+ print('-{:d}, dropout={:.1f}'.format(odim, dropout))
105
+ else:
106
+ modules.append(nn.Linear(idim, odim))
107
+ print(' - mlp {:d}-{:d}'.format(idim, odim))
108
+ self.mlp = nn.Sequential(*modules)
109
+
110
+ if self.gpu >= 0:
111
+ self.mlp = self.mlp.cuda()
112
+
113
+ def forward(self, x):
114
+ return self.mlp(x)
115
+
116
+ def TestCorpus(self, dset, name=' Dev', nlbl=3, out_fname=None):
117
+ correct = 0
118
+ total = 0
119
+ self.mlp.train(mode=False)
120
+ corr = np.zeros(nlbl, dtype=np.int32)
121
+ if out_fname:
122
+ fp = open(out_fname, 'w')
123
+ fp.write('# outputs target_class predicted_class\n')
124
+ for data in dset:
125
+ X, Y = data
126
+ Y = Y.long()
127
+ if self.gpu >= 0:
128
+ X = X.cuda()
129
+ Y = Y.cuda()
130
+ outputs = self.mlp(X)
131
+ _, predicted = torch.max(outputs.data, 1)
132
+ total += Y.size(0)
133
+ correct += (predicted == Y).int().sum()
134
+ for i in range(nlbl):
135
+ corr[i] += (predicted == i).int().sum()
136
+ if out_fname:
137
+ for b in range(outputs.shape[0]):
138
+ for i in range(nlbl):
139
+ fp.write('{:f} '.format(outputs[b][i]))
140
+ fp.write('{:d} {:d}\n'
141
+ .format(predicted[b], Y[b]))
142
+
143
+ print(' | {:4s}: {:5.2f}%'
144
+ .format(name, 100.0 * correct.float() / total), end='')
145
+ # print(' | loss {:6.4f}'.format(loss/total), end='')
146
+ print(' | classes:', end='')
147
+ for i in range(nlbl):
148
+ print(' {:5.2f}'.format(100.0 * corr[i] / total), end='')
149
+
150
+ if out_fname:
151
+ fp.close()
152
+
153
+ return correct, total
154
+
155
+
156
+ ################################################
157
+
158
+ parser = argparse.ArgumentParser(
159
+ formatter_class=argparse.RawDescriptionHelpFormatter,
160
+ description='Classifier for NLI')
161
+
162
+ # Data
163
+ parser.add_argument(
164
+ '--base-dir', '-b', type=str, required=True, metavar='PATH',
165
+ help='Directory with all the data files')
166
+ parser.add_argument(
167
+ '--load', '-l', type=str, required=False, metavar='PATH', default='',
168
+ help='Load network from file before training or for testing')
169
+ parser.add_argument(
170
+ '--save', '-s', type=str, required=False, metavar='PATH', default='',
171
+ help='File in which to save best network')
172
+ parser.add_argument(
173
+ '--train', '-t', type=str, required=True, metavar='STR',
174
+ help='Name of training corpus')
175
+ parser.add_argument(
176
+ '--train-labels', '-T', type=str, required=True, metavar='STR',
177
+ help='Name of training corpus (labels)')
178
+ parser.add_argument(
179
+ '--dev', '-d', type=str, required=True, metavar='STR',
180
+ help='Name of development corpus')
181
+ parser.add_argument(
182
+ '--dev-labels', '-D', type=str, required=True, metavar='STR',
183
+ help='Name of development corpus (labels)')
184
+ parser.add_argument(
185
+ '--test', '-e', type=str, default=None,
186
+ help='Name of test corpus without language extension')
187
+ parser.add_argument(
188
+ '--test-labels', '-E', type=str, default=None,
189
+ help='Name of test corpus without language extension (labels)')
190
+ parser.add_argument(
191
+ '--lang', '-L', nargs='+', default=None,
192
+ help='List of languages to test on')
193
+ parser.add_argument(
194
+ '--cross-lingual', '-x', action='store_true',
195
+ help='Also test on premise and hypothesis in different languages')
196
+ parser.add_argument(
197
+ '--parts', '-p', type=str, nargs='+', default=['prem', 'hyp'],
198
+ help='Name of the two input parts to compare')
199
+ parser.add_argument(
200
+ '--fraction', '-f', type=float, default=1.0,
201
+ help='Fraction of training examples to use (from the beginning)')
202
+ parser.add_argument(
203
+ '--save-outputs', type=str, default=None,
204
+ help='File name to save classifier outputs ("l1-l2.txt" will be added)')
205
+
206
+ # network definition
207
+ parser.add_argument(
208
+ '--dim', '-m', type=int, default=1024,
209
+ help='dimension of sentence embeddings')
210
+ parser.add_argument(
211
+ '--nhid', '-n', type=int, default=[0], nargs='+',
212
+ help='List of hidden layer(s) dimensions')
213
+ parser.add_argument(
214
+ '--dropout', '-o', type=float, default=0.0, metavar='FLOAT',
215
+ help='Value of dropout')
216
+ parser.add_argument(
217
+ '--nepoch', '-N', type=int, default=100, metavar='INT',
218
+ help='Number of epochs')
219
+ parser.add_argument(
220
+ '--bsize', '-B', type=int, default=128, metavar='INT',
221
+ help='Batch size')
222
+ parser.add_argument(
223
+ '--seed', '-S', type=int, default=123456789, metavar='INT',
224
+ help='Initial random seed')
225
+ parser.add_argument(
226
+ '--lr', type=float, default=0.001, metavar='FLOAT',
227
+ help='Learning rate')
228
+ parser.add_argument(
229
+ '--activation', '-a', type=str, default='TANH', metavar='STR',
230
+ help='NonLinearity to use in hidden layers')
231
+ parser.add_argument(
232
+ '--gpu', '-g', type=int, default=-1, metavar='INT',
233
+ help='GPU id (-1 for CPU)')
234
+ args = parser.parse_args()
235
+
236
+ train_loader = LoadDataNLI(os.path.join(args.base_dir, args.train % args.parts[0]),
237
+ os.path.join(args.base_dir, args.train % args.parts[1]),
238
+ os.path.join(args.base_dir, args.train_labels),
239
+ dim=args.dim, bsize=args.bsize, shuffle=True, fraction=args.fraction)
240
+
241
+ dev_loader = LoadDataNLI(os.path.join(args.base_dir, args.dev % args.parts[0]),
242
+ os.path.join(args.base_dir, args.dev % args.parts[1]),
243
+ os.path.join(args.base_dir, args.dev_labels),
244
+ dim=args.dim, bsize=args.bsize, shuffle=False)
245
+
246
+ # set GPU and random seed
247
+ np.random.seed(args.seed)
248
+ torch.manual_seed(args.seed)
249
+ if args.gpu < 0:
250
+ print(' - running on cpu')
251
+ else:
252
+ print(' - running on gpu {:d}'.format(args.gpu))
253
+ torch.cuda.set_device(args.gpu)
254
+ torch.cuda.manual_seed(args.seed)
255
+ print(' - setting seed to {:d}'.format(args.seed))
256
+ print(' - lrate is {:f} and bsize {:d}'.format(args.lr, args.bsize))
257
+
258
+ # create network
259
+ net = Net(fname=args.load,
260
+ idim=4*args.dim, odim=3, nhid=args.nhid,
261
+ dropout=args.dropout, gpu=args.gpu,
262
+ activation=args.activation)
263
+ if args.gpu >= 0:
264
+ criterion = nn.CrossEntropyLoss().cuda()
265
+ else:
266
+ criterion = nn.CrossEntropyLoss()
267
+
268
+ optimizer = optim.Adam(net.parameters(), lr=args.lr)
269
+
270
+ corr_best = 0
271
+ # loop multiple times over the dataset
272
+ for epoch in range(args.nepoch):
273
+
274
+ loss_epoch = 0.0
275
+ print('Ep {:4d}'.format(epoch), end='')
276
+ # for inputs, labels in train_loader:
277
+ for i, data in enumerate(train_loader, 0):
278
+ # get the inputs
279
+ inputs, labels = data
280
+ labels = labels.long()
281
+ if args.gpu >= 0:
282
+ inputs = inputs.cuda()
283
+ labels = labels.cuda()
284
+
285
+ # zero the parameter gradients
286
+ optimizer.zero_grad()
287
+
288
+ # forward + backward + optimize
289
+ net.train(mode=True)
290
+ outputs = net(inputs)
291
+ loss = criterion(outputs, labels)
292
+ loss.backward()
293
+ optimizer.step()
294
+ loss_epoch += loss.item()
295
+
296
+ print(' | loss {:e}'.format(loss_epoch), end='')
297
+
298
+ corr, nbex = net.TestCorpus(dev_loader, 'Dev')
299
+ if corr >= corr_best:
300
+ print(' | saved')
301
+ corr_best = corr
302
+ net_best = copy.deepcopy(net)
303
+ else:
304
+ print('')
305
+
306
+
307
+ if 'net_best' in globals():
308
+ if args.save != '':
309
+ torch.save(net_best.cpu(), args.save)
310
+ print('Best Dev: {:d} = {:5.2f}%'
311
+ .format(corr_best, 100.0 * corr_best.float() / nbex))
312
+
313
+ if args.gpu >= 0:
314
+ net_best = net_best.cuda()
315
+
316
+ # test on (several) languages
317
+ if args.test is None:
318
+ sys.exit()
319
+
320
+ print('Testing on {}'.format(args.test))
321
+ if not args.cross_lingual:
322
+ for l in args.lang:
323
+ test_loader = LoadDataNLI(os.path.join(args.base_dir, args.test % args.parts[0] + '.' + l),
324
+ os.path.join(args.base_dir, args.test % args.parts[1] + '.' + l),
325
+ os.path.join(args.base_dir, args.test_labels + '.' + l),
326
+ dim=args.dim, bsize=args.bsize, shuffle=False, quiet=True)
327
+ print('Ep best | Eval Test lang {:s}'.format(l), end='')
328
+ ofname = args.save_outputs + '.{:s}-{:s}'.format(l, l) + '.txt' if args.save_outputs else None
329
+ net_best.TestCorpus(test_loader, 'Test', out_fname=ofname)
330
+ print('')
331
+ else: # cross-lingual
332
+ err = np.empty((len(args.lang), len(args.lang)), dtype=np.float32)
333
+ i1 = 0
334
+ for l1 in args.lang:
335
+ i2 = 0
336
+ for l2 in args.lang:
337
+ test_loader = LoadDataNLI(os.path.join(args.base_dir, args.test % args.parts[0] + '.' + l1),
338
+ os.path.join(args.base_dir, args.test % args.parts[1] + '.' + l2),
339
+ os.path.join(args.base_dir, args.test_labels + '.' + l2),
340
+ dim=args.dim, bsize=args.bsize, shuffle=False, quiet=True)
341
+ print('Ep best | Eval Test {:s}-{:s}'.format(l1, l2), end='')
342
+ ofname = args.save_outputs + '.{:s}-{:s}'.format(l1, l2) + '.txt' if args.save_outputs else None
343
+ p, n = net_best.TestCorpus(test_loader, 'Test',
344
+ out_fname=ofname)
345
+ err[i1, i2] = 100.0 * float(p) / n
346
+ i2 += 1
347
+ print('')
348
+ i1 += 1
349
+
350
+ print('\nAccuracy matrix:')
351
+ print(' ', end='')
352
+ for i2 in range(err.shape[1]):
353
+ print(' {:4s} '.format(args.lang[i2]), end='')
354
+
355
+ print(' avg')
356
+ for i1 in range(err.shape[0]):
357
+ print('{:4s}'.format(args.lang[i1]), end='')
358
+ for i2 in range(err.shape[1]):
359
+ print(' {:5.2f}'.format(err[i1, i2]), end='')
360
+ print(' {:5.2f}'.format(np.average(err[i1, :])))
361
+ print('avg ', end='')
362
+ for i2 in range(err.shape[1]):
363
+ print(' {:5.2f}'.format(np.average(err[:, i2])), end='')
364
+ print(' {:5.2f}'.format(np.average(err)))
365
+
366
+ if err.shape[0] == err.shape[1]:
367
+ s = 0
368
+ # TODO: we assume the first lang is English
369
+ for i1 in range(1, err.shape[0]):
370
+ s += err[i1, i1]
371
+ print('xnli-xx: {:5.2f}'.format(s/(err.shape[0]-1)))
laser/source/paraphrase.py ADDED
@@ -0,0 +1,285 @@
1
+ #!/usr/bin/python
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # LASER Language-Agnostic SEntence Representations
9
+ # is a toolkit to calculate multilingual sentence embeddings
10
+ # and to use them for document classification, bitext filtering
11
+ # and mining
12
+ #
13
+ # --------------------------------------------------------
14
+ #
15
+ # Python tool to search for paraphrases in FAISS index
16
+
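+ # Hypothetical invocation sketch (all paths and model names below are placeholders,
+ # not part of this script):
+ #   python3 paraphrase.py --encoder $LASER/models/my_encoder.pt \
+ #       --bpe-codes $LASER/models/my_codes.fcodes \
+ #       --index sentences.faiss.idx --text sentences.txt \
+ #       --input queries.txt --output paraphrases.txt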
17
+ import re
18
+ import sys
19
+ import os.path
20
+ import tempfile
21
+ import argparse
22
+ import faiss
23
+ import time
24
+ import pdb
25
+ import numpy as np
26
+ from collections import namedtuple
27
+
28
+ # get environment
29
+ assert os.environ.get('LASER'), 'Please set the environment variable LASER'
30
+ LASER = os.environ['LASER']
31
+
32
+ sys.path.append(LASER + '/source/lib')
33
+ from indexing import IndexLoad, IndexTextOpen, IndexTextQuery, SplitOpen, SplitAccess
34
+ from embed import SentenceEncoder, EncodeLoad, EncodeFile, EncodeTime
35
+ from text_processing import Token, BPEfastApply
36
+
37
+ SPACE_NORMALIZER = re.compile(r"\s+")
38
+ Batch = namedtuple('Batch', 'srcs tokens lengths')
39
+
40
+ # calculate L2 distance between [x]
41
+ # and the vectors referenced in idxs
42
+ # x should be already normalized
43
+ def IndexDistL2(X, E, D, I, thresh=1.0, dtype=np.float32, sort=True):
44
+ nb, nK = I.shape
45
+ dim = X.shape[1]
46
+ dist_l2 = np.empty((nb, nK), dtype=np.float32)
47
+ y = np.empty((1, dim), dtype=dtype)
48
+ for i in range(nb):
49
+ for k in range(nK):
50
+ if D[i, k] <= thresh:
51
+ # get embedding from disk
52
+ np.copyto(y, SplitAccess(E, I[i, k]))
53
+ faiss.normalize_L2(y)
54
+ dist_l2[i, k] = 1.0 - np.dot(X[i], y[0])
55
+ else:
56
+ # exclude sentences which already have a huge FAISS distance
57
+ # (getting embeddings from disk is very time consuming)
58
+ dist_l2[i, k] = 1.0
59
+
60
+ if sort:
61
+ # re-sort according to L2
62
+ idxs = np.argsort(dist_l2[i], axis=0)
63
+ dist_l2[i] = dist_l2[i][idxs]
64
+ I[i] = I[i][idxs]
65
+
66
+ return dist_l2, I
67
+
68
+ ###############################################################################
69
+ #
70
+ # Apply an absolute threshold on the distance
71
+ #
72
+ ###############################################################################
73
+
74
+ def MarginAbs(em, ofp, params, args, stats):
75
+ D, I = params.idx.search(em, args.kmax)
76
+ thresh = args.threshold_faiss
77
+ if args.embed:
78
+ D, I = IndexDistL2(em, params.E, D, I, args.threshold_faiss)
79
+ thresh = args.threshold_L2
80
+
81
+ for n in range(D.shape[0]):
82
+
83
+ prev = {} # for deduplication
84
+ for i in range(args.kmax):
85
+ txt = IndexTextQuery(params.T, params.R, I[n, i])
86
+ if (not args.dedup or txt not in prev) and D[n, i] <= thresh:
87
+ prev[txt] = 1
88
+ ofp.write('{:d}\t{:7.5f}\t{}\n'
89
+ .format(stats.nbs, D[n, i], txt))
90
+ stats.nbp += 1
91
+
92
+ # display the source sentence if requested
93
+ if (args.include_source == 'matches' and len(prev) > 0):
94
+ ofp.write('{:d}\t{:6.1f}\t{}\n'
95
+ .format(stats.nbs, 0.0, sentences[n].replace('@@ ', '')))
96
+
97
+ if args.include_source == 'always':
98
+ ofp.write('{:d}\t{:6.1f}\t{}\n'
99
+ .format(stats.nbs, 0.0, sentences[n].replace('@@ ', '')))
100
+ stats.nbs += 1
101
+
102
+
103
+ ###############################################################################
104
+ #
105
+ # Apply a threshold on the ratio between distance and average
106
+ #
107
+ ###############################################################################
108
+
109
+ def MarginRatio(em, ofp, params, args, stats):
110
+ D, I = params.idx.search(em, args.margin_k)
111
+ thresh = args.threshold_margin
112
+ if args.embed:
113
+ D, I = IndexDistL2(em, params.E, D, I, args.threshold_faiss)
114
+ thresh = args.threshold_L2
115
+
116
+ Mean = D.mean(axis=1)
117
+ for n in range(D.shape[0]):
118
+ if D[n, 0] / Mean[n] <= thresh:
119
+ if args.include_source == 'matches':
120
+ ofp.write('{:d}\t{:6.1f}\t{}\n'
121
+ .format(stats.nbs, 0.0, sentences[n].replace('@@ ', '')))
122
+ txt = IndexTextQuery(params.T, params.R, I[n, 0])
123
+ ofp.write('{:d}\t{:7.5f}\t{}\n'.format(stats.nbs, D[n, 0], txt))
124
+ stats.nbp += 1
125
+
126
+ stats.nbs += 1
127
+
128
+ if args.include_source == 'always':
129
+ ofp.write('{:d}\t{:6.1f}\t{}\n'
130
+ .format(stats.nbs, 0.0, sentences[n].replace('@@ ', '')))
131
+
132
+
133
+ ###############################################################################
134
+
135
+ def MarginDist(em, ofp, params, args, stats):
136
+ print('ERROR: MarginDist not implemented')
137
+ sys.exit(1)
138
+
139
+
140
+ ###############################################################################
141
+
142
+ def buffered_read(fp, buffer_size):
143
+ buffer = []
144
+ for src_str in fp:
145
+ buffer.append(src_str.strip())
146
+ if len(buffer) >= buffer_size:
147
+ yield buffer
148
+ buffer = []
149
+
150
+ if len(buffer) > 0:
151
+ yield buffer
152
+
153
+
154
+ ###############################################################################
155
+
156
+ parser = argparse.ArgumentParser('LASER: paraphrase tool')
157
+
158
+ parser.add_argument('--encoder', type=str, required=True,
159
+ help='encoder to be used')
160
+ parser.add_argument('--encoding', default='utf-8',
161
+ help='Character encoding for input/output')
162
+ parser.add_argument('--token-lang', type=str, default='--',
163
+ help="Language of tokenizer ('--' for no tokenization)")
164
+ parser.add_argument('--bpe-codes', type=str, default=None, required=True,
165
+ help='BPE codes')
166
+ parser.add_argument('--buffer-size', type=int, default=100,
167
+ help='Buffer size (sentences)')
168
+ parser.add_argument('--max-tokens', type=int, default=12000,
169
+ help='Maximum number of tokens to process in a batch')
170
+ parser.add_argument('--max-sentences', type=int, default=None,
171
+ help='Maximum number of sentences to process in a batch')
172
+ parser.add_argument('--cpu', action='store_true',
173
+ help='Use CPU instead of GPU')
174
+
175
+ parser.add_argument('--index', type=str, required=True,
176
+ help='FAISS index')
177
+ parser.add_argument('--nprobe', type=int, default=128,
178
+ help='FAISS: value of nprobe')
179
+ parser.add_argument('--text', type=str, required=True,
180
+ help='File with indexed texts')
181
+ parser.add_argument(
182
+ '--dim', type=int, default=1024,
183
+ help='Dimension of specified sentence embeddings')
184
+ parser.add_argument(
185
+ '--embed', type=str, default=None,
186
+ help='Sentence embeddings, true L2 distance will be calculated when specified')
187
+
188
+ parser.add_argument('-i', '--input', type=str, required=True,
189
+ help='Input text file')
190
+ parser.add_argument('-p', '--output', type=str, default='--',
191
+ help='Output paraphrases')
192
+ parser.add_argument('--kmax', type=int, default=10,
193
+ help='Maximum number of nearest neighbors (candidate paraphrases) per query')
194
+ parser.add_argument('--dedup', type=int, default=1,
195
+ help='Deduplicate list of paraphrases')
196
+ parser.add_argument('--include-source', default='never',
197
+ choices=['never', 'matches', 'always'],
198
+ help='Include source sentence in the list of paraphrases')
199
+ parser.add_argument('--margin',
200
+ choices=['absolute', 'distance', 'ratio'],
201
+ default='ratio', help='Margin function')
202
+ parser.add_argument('-T', '--threshold-margin', type=float, default=0.9,
203
+ help='Threshold on margin')
204
+ parser.add_argument('--threshold-faiss', type=float, default=0.4,
205
+ help='Threshold on FAISS distance')
206
+ parser.add_argument('--threshold-L2', type=float, default=0.2,
207
+ help='Threshold on L2 distance')
208
+ parser.add_argument('--margin-k', type=int, default=4,
209
+ help='Number of nearest neighbors for margin calculation')
210
+
211
+ parser.add_argument('--verbose', action='store_true',
212
+ help='Detailed output')
213
+
214
+
215
+ print('\nLASER: paraphrase tool')
216
+ args = parser.parse_args()
217
+
218
+ # index,
219
+ # memory mapped texts, references and word counts
220
+ # encoder
221
+ params = namedtuple('params', 'idx T R W M E enc')
222
+
223
+ # open text and reference file
224
+ params.T, params.R, params.W, params.M = IndexTextOpen(args.text)
225
+
226
+ # Open on-disk embeddings for L2 distances
227
+ if args.embed:
228
+ params.E = SplitOpen(args.embed, ['en'],
229
+ args.dim, np.float32, verbose=False)
230
+
231
+ # load FAISS index
232
+ params.idx = IndexLoad(args.index, args.nprobe)
233
+
234
+ # load sentence encoder
235
+ params.enc = EncodeLoad(args)
236
+
237
+
238
+ margin_methods = {'absolute': MarginAbs,
239
+ 'distance': MarginDist,
240
+ 'ratio': MarginRatio}
241
+
242
+ with tempfile.TemporaryDirectory() as tmpdir:
243
+ ifile = args.input
244
+ if args.token_lang != '--':
245
+ ifile = os.path.join(tmpdir, 'tok')
246
+ Token(args.input,
247
+ ifile,
248
+ lang=args.token_lang,
249
+ romanize=True if args.token_lang == 'el' else False,
250
+ lower_case=True, gzip=False,
251
+ verbose=args.verbose, over_write=False)
252
+
253
+ if args.bpe_codes:
254
+ bpe_file = os.path.join(tmpdir, 'bpe')
255
+ BPEfastApply(ifile,
256
+ bpe_file,
257
+ args.bpe_codes,
258
+ verbose=args.verbose, over_write=False)
259
+ ifile = bpe_file
260
+
261
+ print(' - processing (batch size is {:d})'.format(args.buffer_size))
262
+ ifp = open(ifile, 'r', encoding=args.encoding, errors='surrogateescape')
263
+ if args.output == '--':
264
+ ofp = sys.stdout
265
+ else:
266
+ ofp = open(args.output, 'w', encoding=args.encoding, errors='surrogateescape')
267
+ stats = namedtuple('stats', 'ns np')
268
+ stats.nbs = 0
269
+ stats.nbp = 0
270
+ t = time.time()
271
+ for sentences in buffered_read(ifp, args.buffer_size):
272
+ embed = params.enc.encode_sentences(sentences)
273
+ faiss.normalize_L2(embed)
274
+ # call function for selected margin method
275
+ margin_methods.get(args.margin)(embed, ofp, params, args, stats)
276
+ if stats.nbs % 1000 == 0:
277
+ print('\r - {:d} sentences {:d} paraphrases'
278
+ .format(stats.nbs, stats.nbp), end='')
279
+
280
+ ifp.close()
281
+ if args.output != '--':
282
+ ofp.close()
283
+ print('\r - {:d} sentences {:d} paraphrases'
284
+ .format(stats.nbs, stats.nbp), end='')
285
+ EncodeTime(t)
laser/source/pxsim.py ADDED
@@ -0,0 +1,251 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ #
7
+ # LASER Language-Agnostic SEntence Representations
8
+ # is a toolkit to calculate multilingual sentence embeddings
9
+ # and to use them for various tasks such as document classification,
10
+ # and bitext filtering
11
+ #
12
+ # --------------------------------------------------------
13
+ #
14
+ # Tool to calculate the dual approach multilingual similarity error rate (P-xSIM)
15
+
16
+ import typing as tp
17
+ from pathlib import Path
18
+
19
+ import faiss
20
+ import numpy as np
21
+ import torch
22
+ from scipy.special import softmax
23
+ from sklearn.metrics.pairwise import cosine_similarity
24
+ from stopes.eval.auto_pcp.audio_comparator import Comparator, get_model_pred
25
+ from xsim import Margin, score_margin
26
+
27
+
28
+ def get_neighbors(
29
+ x: np.ndarray, y: np.ndarray, k: int, margin: str
30
+ ) -> tp.Tuple[np.ndarray, np.ndarray, int]:
31
+ x_copy = x.astype(np.float32).copy()
32
+ y_copy = y.astype(np.float32).copy()
33
+ nbex, dim = x.shape
34
+ # create index
35
+ idx_x = faiss.IndexFlatIP(dim)
36
+ idx_y = faiss.IndexFlatIP(dim)
37
+ # L2 normalization needed for cosine distance
38
+ faiss.normalize_L2(x_copy)
39
+ faiss.normalize_L2(y_copy)
40
+ idx_x.add(x_copy)
41
+ idx_y.add(y_copy)
42
+ if margin == Margin.ABSOLUTE.value:
43
+ scores, indices = idx_y.search(x_copy, k)
44
+ else:
45
+ # return cosine similarity and indices of k closest neighbors
46
+ Cos_xy, Idx_xy = idx_y.search(x_copy, k)
47
+ Cos_yx, Idx_yx = idx_x.search(y_copy, k)
48
+
49
+ # average cosines
50
+ Avg_xy = Cos_xy.mean(axis=1)
51
+ Avg_yx = Cos_yx.mean(axis=1)
52
+
53
+ scores = score_margin(Cos_xy, Idx_xy, Avg_xy, Avg_yx, margin, k)
54
+ indices = Idx_xy
55
+ return scores, indices, nbex
56
+
57
+
58
+ def get_cosine_scores(src_emb: np.ndarray, neighbor_embs: np.ndarray) -> np.ndarray:
59
+ assert src_emb.shape[0] == neighbor_embs.shape[1]
60
+ src_embs = np.repeat(
61
+ np.expand_dims(src_emb, axis=0), neighbor_embs.shape[0], axis=0
62
+ )
63
+ cosine_scores = cosine_similarity(src_embs, neighbor_embs).diagonal()
64
+ return cosine_scores
65
+
66
+
67
+ def get_comparator_scores(
68
+ src_emb: np.ndarray,
69
+ neighbor_embs: np.ndarray,
70
+ comparator_model: tp.Any,
71
+ symmetrize_comparator: bool,
72
+ ) -> np.ndarray:
73
+ src_embs = np.repeat(
74
+ np.expand_dims(src_emb, axis=0), neighbor_embs.shape[0], axis=0
75
+ )
76
+ a = torch.from_numpy(src_embs).unsqueeze(1) # restore depth dim
77
+ b = torch.from_numpy(neighbor_embs).unsqueeze(1)
78
+ res = get_comparator_preds(a, b, comparator_model, symmetrize_comparator)
79
+ scores_softmax = softmax(res)
80
+ return np.array(scores_softmax)
81
+
82
+
83
+ def get_comparator_preds(
84
+ src_emb: np.ndarray, tgt_emb: np.ndarray, model: tp.Any, symmetrize: bool
85
+ ):
86
+ preds = (
87
+ get_model_pred(
88
+ model,
89
+ src=src_emb[:, 0],
90
+ mt=tgt_emb[:, 0],
91
+ use_gpu=model.use_gpu,
92
+ batch_size=1,
93
+ )[:, 0]
94
+ .cpu()
95
+ .numpy()
96
+ )
97
+ if symmetrize:
98
+ preds2 = (
99
+ get_model_pred(
100
+ model,
101
+ src=tgt_emb[:, 0],
102
+ mt=src_emb[:, 0],
103
+ use_gpu=model.use_gpu,
104
+ batch_size=1,
105
+ )[:, 0]
106
+ .cpu()
107
+ .numpy()
108
+ )
109
+ preds = (preds2 + preds) / 2
110
+ return preds
111
+
112
+
113
+ def get_blended_predictions(
114
+ alpha: float,
115
+ nbex: int,
116
+ margin_scores: np.ndarray,
117
+ x_aux: np.ndarray,
118
+ y_aux: np.ndarray,
119
+ neighbor_indices: np.ndarray,
120
+ comparator_model: tp.Optional[tp.Any] = None,
121
+ symmetrize_comparator: bool = False,
122
+ ) -> list[int]:
123
+ predictions = []
124
+ for src_index in range(nbex):
125
+ neighbors = neighbor_indices[src_index]
126
+ neighbor_embs = y_aux[neighbors].astype(np.float32)
127
+ src_emb = x_aux[src_index].astype(np.float32)
128
+ aux_scores = (
129
+ get_comparator_scores(
130
+ src_emb, neighbor_embs, comparator_model, symmetrize_comparator
131
+ )
132
+ if comparator_model
133
+ else get_cosine_scores(src_emb, neighbor_embs)
134
+ )
135
+ assert margin_scores[src_index].shape == aux_scores.shape
136
+ blended_scores = alpha * margin_scores[src_index] + (1 - alpha) * aux_scores
137
+ blended_neighbor_idx = blended_scores.argmax()
138
+ predictions.append(neighbors[blended_neighbor_idx])
139
+ return predictions
140
+
141
+
142
+ def PxSIM(
143
+ x: np.ndarray,
144
+ y: np.ndarray,
145
+ x_aux: np.ndarray,
146
+ y_aux: np.ndarray,
147
+ alpha: float,
148
+ margin: str = Margin.RATIO.value,
149
+ k: int = 16,
150
+ comparator_path: tp.Optional[Path] = None,
151
+ symmetrize_comparator: bool = False,
152
+ ) -> tp.Tuple[int, int, list[int]]:
153
+ """
154
+ Parameters
155
+ ----------
156
+ x : np.ndarray
157
+ source-side embedding array
158
+ y : np.ndarray
159
+ target-side embedding array
160
+ x_aux : np.ndarray
161
+ source-side embedding array using auxiliary model
162
+ y_aux : np.ndarray
163
+ target-side embedding array using auxiliary model
164
+ alpha : float
165
+ parameter to weight blended score
166
+ margin : str
167
+ margin scoring function (e.g. ratio, absolute, distance)
168
+ k : int
169
+ number of neighbors in k-nn search
170
+ comparator_path : Path
171
+ path to AutoPCP model config
172
+ symmetrize_comparator : bool
173
+ whether to symmetrize the comparator predictions
174
+
175
+ Returns
176
+ -------
177
+ err : int
178
+ Number of errors
179
+ nbex : int
180
+ Number of examples
181
+ preds : list[int]
182
+ List of (index-based) predictions
183
+ """
184
+ assert Margin.has_value(margin), f"Margin type: {margin}, is not supported."
185
+ comparator_model = Comparator.load(comparator_path) if comparator_path else None
186
+ # get margin-based nearest neighbors
187
+ margin_scores, neighbor_indices, nbex = get_neighbors(x, y, k=k, margin=margin)
188
+ preds = get_blended_predictions(
189
+ alpha,
190
+ nbex,
191
+ margin_scores,
192
+ x_aux,
193
+ y_aux,
194
+ neighbor_indices,
195
+ comparator_model,
196
+ symmetrize_comparator,
197
+ )
198
+ err = sum([idx != pred for idx, pred in enumerate(preds)])
199
+ print(f"P-xSIM error: {100 * (err / nbex):.2f}")
200
+ return err, nbex, preds
201
+
202
+
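+ # Hypothetical usage sketch (file names and dimensions are placeholders): blend
+ # LASER margin scores with an auxiliary encoder's cosine similarity at alpha=0.7.
+ #   x = load_embeddings(Path("src.laser.bin"), dim=1024)
+ #   y = load_embeddings(Path("tgt.laser.bin"), dim=1024)
+ #   x_aux = load_embeddings(Path("src.aux.bin"), dim=768)
+ #   y_aux = load_embeddings(Path("tgt.aux.bin"), dim=768)
+ #   err, nbex, preds = PxSIM(x, y, x_aux, y_aux, alpha=0.7)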
203
+ def load_embeddings(
204
+ infile: Path, dim: int, fp16: bool = False, numpy_header: bool = False
205
+ ) -> np.ndarray:
206
+ assert infile.exists(), f"file: {infile} does not exist."
207
+ if numpy_header:
208
+ return np.load(infile)
209
+ emb = np.fromfile(infile, dtype=np.float16 if fp16 else np.float32)
210
+ num_examples = emb.shape[0] // dim
211
+ emb.resize(num_examples, dim)
212
+ if fp16:
213
+ emb = emb.astype(np.float32) # faiss currently only supports fp32
214
+ return emb
215
+
216
+
217
+ def run(
218
+ src_emb: Path,
219
+ tgt_emb: Path,
220
+ src_aux_emb: Path,
221
+ tgt_aux_emb: Path,
222
+ alpha: float,
223
+ margin: str = Margin.RATIO.value,
224
+ k: int = 16,
225
+ emb_fp16: bool = False,
226
+ aux_emb_fp16: bool = False,
227
+ emb_dim: int = 1024,
228
+ aux_emb_dim: int = 1024,
229
+ numpy_header: bool = False,
230
+ comparator_path: tp.Optional[Path] = None,
231
+ symmetrize_comparator: bool = False,
232
+ prediction_savepath: tp.Optional[Path] = None,
233
+ ) -> None:
234
+ x = load_embeddings(src_emb, emb_dim, emb_fp16, numpy_header)
235
+ y = load_embeddings(tgt_emb, emb_dim, emb_fp16, numpy_header)
236
+ x_aux = load_embeddings(src_aux_emb, aux_emb_dim, aux_emb_fp16, numpy_header)
237
+ y_aux = load_embeddings(tgt_aux_emb, aux_emb_dim, aux_emb_fp16, numpy_header)
238
+ assert (x.shape == y.shape) and (x_aux.shape == y_aux.shape)
239
+ _, _, preds = PxSIM(
240
+ x, y, x_aux, y_aux, alpha, margin, k, comparator_path, symmetrize_comparator
241
+ )
242
+ if prediction_savepath:
243
+ with open(prediction_savepath, "w") as outf:
244
+ for pred in preds:
245
+ print(pred, file=outf)
246
+
247
+
248
+ if __name__ == "__main__":
249
+ import func_argparse
250
+
251
+ func_argparse.main()
laser/source/sent_classif.py ADDED
@@ -0,0 +1,273 @@
1
+ #!/usr/bin/python
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # LASER Language-Agnostic SEntence Representations
9
+ # is a toolkit to calculate multilingual sentence embeddings
10
+ # and to use them for document classification, bitext filtering
11
+ # and mining
12
+ #
13
+ # --------------------------------------------------------
14
+ #
15
+ # Simple MLP classifier for sentence embeddings
16
+
17
+
18
+ import argparse
19
+ import copy
20
+ import numpy as np
21
+ import torch
22
+ import torch.nn as nn
23
+ import torch.nn.functional as F
24
+ import torch.optim as optim
25
+ import torch.utils.data as data_utils
26
+
27
+
28
+ ################################################
29
+
30
+ def LoadData(bdir, dfn, lfn, dim=1024, bsize=32, shuffle=False, quiet=False):
31
+ x = np.fromfile(bdir + dfn, dtype=np.float32, count=-1)
32
+ x.resize(x.shape[0] // dim, dim)
33
+
34
+ lbl = np.loadtxt(bdir + lfn, dtype=np.int32)
35
+ lbl.reshape(lbl.shape[0], 1)
36
+ if not quiet:
37
+ print(' - read {:d}x{:d} elements in {:s}'.format(x.shape[0], x.shape[1], dfn))
38
+ print(' - read {:d} labels [{:d},{:d}] in {:s}'
39
+ .format(lbl.shape[0], lbl.min(), lbl.max(), lfn))
40
+
41
+ D = data_utils.TensorDataset(torch.from_numpy(x), torch.from_numpy(lbl))
42
+ loader = data_utils.DataLoader(D, batch_size=bsize, shuffle=shuffle)
43
+ return loader
44
+
45
+
46
+ ################################################
47
+
48
+ class Net(nn.Module):
49
+ def __init__(self, idim=1024, odim=2, nhid=None,
50
+ dropout=0.0, gpu=0, activation='TANH'):
51
+ super(Net, self).__init__()
52
+ self.gpu = gpu
53
+ modules = []
54
+
55
+ modules = []
56
+ print(' - mlp {:d}'.format(idim), end='')
57
+ if len(nhid) > 0:
58
+ if dropout > 0:
59
+ modules.append(nn.Dropout(p=dropout))
60
+ nprev = idim
61
+ for nh in nhid:
62
+ if nh > 0:
63
+ modules.append(nn.Linear(nprev, nh))
64
+ nprev = nh
65
+ if activation == 'TANH':
66
+ modules.append(nn.Tanh())
67
+ print('-{:d}t'.format(nh), end='')
68
+ elif activation == 'RELU':
69
+ modules.append(nn.ReLU())
70
+ print('-{:d}r'.format(nh), end='')
71
+ else:
72
+ raise Exception(f'Unrecognized activation {activation}')
73
+ if dropout > 0:
74
+ modules.append(nn.Dropout(p=dropout))
75
+ modules.append(nn.Linear(nprev, odim))
76
+ print('-{:d}, dropout={:.1f}'.format(odim, dropout))
77
+ else:
78
+ modules.append(nn.Linear(idim, odim))
79
+ print(' - mlp {:d}-{:d}'.format(idim, odim))
80
+ self.mlp = nn.Sequential(*modules)
81
+ # Softmax is included in CrossEntropyLoss!
82
+
83
+ if self.gpu >= 0:
84
+ self.mlp = self.mlp.cuda()
85
+
86
+ def forward(self, x):
87
+ return self.mlp(x)
88
+
89
+ def TestCorpus(self, dset, name=' Dev', nlbl=4):
90
+ correct = 0
91
+ total = 0
92
+ self.mlp.train(mode=False)
93
+ corr = np.zeros(nlbl, dtype=np.int32)
94
+ for data in dset:
95
+ X, Y = data
96
+ Y = Y.long()
97
+ if self.gpu >= 0:
98
+ X = X.cuda()
99
+ Y = Y.cuda()
100
+ outputs = self.mlp(X)
101
+ _, predicted = torch.max(outputs.data, 1)
102
+ total += Y.size(0)
103
+ correct += (predicted == Y).int().sum()
104
+ for i in range(nlbl):
105
+ corr[i] += (predicted == i).int().sum()
106
+
107
+ print(' | {:4s}: {:5.2f}%'
108
+ .format(name, 100.0 * correct.float() / total), end='')
109
+ print(' | classes:', end='')
110
+ for i in range(nlbl):
111
+ print(' {:5.2f}'.format(100.0 * corr[i] / total), end='')
112
+
113
+ return correct, total
114
+
115
+
116
+ ################################################
117
+
118
+ parser = argparse.ArgumentParser(
119
+ formatter_class=argparse.RawDescriptionHelpFormatter,
120
+ description="Simple sentence classifier")
121
+
122
+ # Data
123
+ parser.add_argument(
124
+ '--base-dir', '-b', type=str, required=True, metavar='PATH',
125
+ help="Directory with all the data files")
126
+ parser.add_argument(
127
+ '--save', '-s', type=str, required=False, metavar='PATH', default="",
128
+ help="File in which to save best network")
129
+ parser.add_argument(
130
+ '--train', '-t', type=str, required=True, metavar='STR',
131
+ help="Name of training corpus")
132
+ parser.add_argument(
133
+ '--train-labels', '-T', type=str, required=True, metavar='STR',
134
+ help="Name of training corpus (labels)")
135
+ parser.add_argument(
136
+ '--dev', '-d', type=str, required=True, metavar='STR',
137
+ help="Name of development corpus")
138
+ parser.add_argument(
139
+ '--dev-labels', '-D', type=str, required=True, metavar='STR',
140
+ help="Name of development corpus (labels)")
141
+ parser.add_argument(
142
+ '--test', '-e', type=str, required=True, metavar='STR',
143
+ help="Name of test corpus without language extension")
144
+ parser.add_argument(
145
+ '--test-labels', '-E', type=str, required=True, metavar='STR',
146
+ help="Name of test corpus without language extension (labels)")
147
+ parser.add_argument(
148
+ '--lang', '-L', nargs='+', default=None,
149
+ help="List of languages to test on")
150
+
151
+ # network definition
152
+ parser.add_argument(
153
+ "--dim", "-m", type=int, default=1024,
154
+ help="Dimension of sentence embeddings")
155
+ parser.add_argument(
156
+ '--nhid', '-n', type=int, default=[0], nargs='+',
157
+ help="List of hidden layer(s) dimensions")
158
+ parser.add_argument(
159
+ "--nb-classes", "-c", type=int, default=2,
160
+ help="Number of output classes")
161
+ parser.add_argument(
162
+ '--dropout', '-o', type=float, default=0.0, metavar='FLOAT',
163
+ help="Value of dropout")
164
+ parser.add_argument(
165
+ '--nepoch', '-N', type=int, default=100, metavar='INT',
166
+ help="Number of epochs")
167
+ parser.add_argument(
168
+ '--bsize', '-B', type=int, default=128, metavar='INT',
169
+ help="Batch size")
170
+ parser.add_argument(
171
+ '--seed', '-S', type=int, default=123456789, metavar='INT',
172
+ help="Initial random seed")
173
+ parser.add_argument(
174
+ '--lr', type=float, default=0.001, metavar='FLOAT',
175
+ help='Learning rate')
176
+ parser.add_argument(
177
+ '--wdecay', type=float, default=0.0, metavar='FLOAT',
178
+ help='Weight decay')
179
+ parser.add_argument(
180
+ '--gpu', '-g', type=int, default=-1, metavar='INT',
181
+ help="GPU id (-1 for CPU)")
182
+ args = parser.parse_args()
183
+
184
+ print(' - base directory: {}'.format(args.base_dir))
185
+ args.base_dir = args.base_dir + "/"
186
+
187
+ train_loader = LoadData(args.base_dir, args.train, args.train_labels,
188
+ dim=args.dim, bsize=args.bsize, shuffle=True)
189
+
190
+ dev_loader = LoadData(args.base_dir, args.dev, args.dev_labels,
191
+ dim=args.dim, bsize=args.bsize, shuffle=False)
192
+
193
+ # set GPU and random seed
194
+ if args.gpu >= 0: torch.cuda.set_device(args.gpu)
195
+ np.random.seed(args.seed)
196
+ torch.manual_seed(args.seed)
197
+ torch.cuda.manual_seed(args.seed)
198
+ print(" - setting seed to %d" % args.seed)
199
+
200
+ # create network
201
+ net = Net(idim=args.dim, odim=args.nb_classes,
202
+ nhid=args.nhid, dropout=args.dropout, gpu=args.gpu)
203
+ if args.gpu >= 0:
204
+ criterion = nn.CrossEntropyLoss().cuda()
205
+ else:
206
+ criterion = nn.CrossEntropyLoss()
207
+
208
+ #optimizer = optim.Adam(net.parameters(), weight_decay=0.0)
209
+ # default: pytorch/optim/adam.py
210
+ # Py0.4: lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False):
211
+ # Py1.0: lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False):
212
+ optimizer = optim.Adam(net.parameters(),
213
+ lr=args.lr,
214
+ weight_decay=args.wdecay,
215
+ betas=(0.9, 0.999),
216
+ eps=1e-8,
217
+ amsgrad=False)
218
+
219
+ corr_best = 0
220
+ # loop multiple times over the dataset
221
+ for epoch in range(args.nepoch):
222
+
223
+ loss_epoch = 0.0
224
+ print('Ep {:4d}'.format(epoch), end='')
225
+ # for inputs, labels in train_loader:
226
+ for i, data in enumerate(train_loader, 0):
227
+ # get the inputs
228
+ inputs, labels = data
229
+ labels = labels.long()
230
+ if args.gpu >= 0:
231
+ inputs = inputs.cuda()
232
+ labels = labels.cuda()
233
+
234
+ # zero the parameter gradients
235
+ net.zero_grad()
236
+
237
+ # forward + backward + optimize
238
+ net.train(mode=True)
239
+ outputs = net(inputs)
240
+ loss = criterion(outputs, labels)
241
+ loss.backward()
242
+ optimizer.step()
243
+ loss_epoch += loss.item()
244
+
245
+ print(' | loss {:e}'.format(loss_epoch), end='')
246
+
247
+ corr, nbex = net.TestCorpus(dev_loader, 'Dev')
248
+ if corr >= corr_best:
249
+ print(' | saved')
250
+ corr_best = corr
251
+ net_best = copy.deepcopy(net)
252
+ else:
253
+ print('')
254
+
255
+
256
+ if 'net_best' in globals():
257
+ if args.save != '':
258
+ torch.save(net_best.cpu(), args.save)
259
+ print('Best Dev: {:d} = {:5.2f}%'
260
+ .format(corr_best, 100.0 * corr_best.float() / nbex))
261
+
262
+ if args.gpu >= 0:
263
+ net_best = net_best.cuda()
264
+
265
+ # test on (several) languages
266
+ for l in args.lang:
267
+ test_loader = LoadData(args.base_dir, args.test + '.' + l,
268
+ args.test_labels + '.' + l,
269
+ dim=args.dim, bsize=args.bsize,
270
+ shuffle=False, quiet=True)
271
+ print('Ep best | Eval Test lang {:s}'.format(l), end='')
272
+ net_best.TestCorpus(test_loader, 'Test')
273
+ print('')
laser/source/similarity_search.py ADDED
@@ -0,0 +1,113 @@
1
+ #!/usr/bin/python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # LASER Language-Agnostic SEntence Representations
9
+ # is a toolkit to calculate multilingual sentence embeddings
10
+ # and to use them for document classification, bitext filtering
11
+ # and mining
12
+ #
13
+ # --------------------------------------------------------
14
+ #
15
+ # Quora Q&A paraphrase detection
16
+
17
+ import os
18
+ import sys
19
+ import argparse
20
+ import faiss
21
+ import numpy as np
22
+
23
+ # get environment
24
+ assert os.environ.get('LASER'), 'Please set the environment variable LASER'
25
+ LASER = os.environ['LASER']
26
+
27
+ sys.path.append(LASER + '/source')
28
+ sys.path.append(LASER + '/source/lib')
29
+ from embed import SentenceEncoder, EncodeLoad, EncodeFile
30
+ from text_processing import Token, BPEfastApply
31
+ from indexing import IndexCreate, IndexSearchMultiple, IndexPrintConfusionMatrix
32
+
33
+ ###############################################################################
34
+
35
+ parser = argparse.ArgumentParser('LASER: similarity search')
36
+ parser.add_argument('--base-dir', type=str, default='.',
37
+ help='Base directory for all data files')
38
+ parser.add_argument('--data', type=str, required=True,
39
+ help='Directory and basename of input data (language name will be added)')
40
+ parser.add_argument('--output', type=str, required=True,
41
+ help='Directory and basename of created data (language name will be added)')
42
+ parser.add_argument('--textual', action='store_true',
43
+ help='Use textual comparison instead of indices')
44
+ parser.add_argument(
45
+ '--lang', '-l', nargs='+', required=True,
46
+ help="List of languages to test on")
47
+
48
+ # preprocessing
49
+ parser.add_argument('--bpe-codes', type=str, required=True,
50
+ help='fastBPE codes and vocabulary')
51
+ parser.add_argument('--verbose', action='store_true',
52
+ help='Detailed output')
53
+
54
+ # options for encoder
55
+ parser.add_argument('--encoder', type=str, required=True,
56
+ help='encoder to be used')
57
+ parser.add_argument('--buffer-size', type=int, default=100,
58
+ help='Buffer size (sentences)')
59
+ parser.add_argument('--max-tokens', type=int, default=12000,
60
+ help='Maximum number of tokens to process in a batch')
61
+ parser.add_argument('--max-sentences', type=int, default=None,
62
+ help='Maximum number of sentences to process in a batch')
63
+ parser.add_argument('--cpu', action='store_true',
64
+ help='Use CPU instead of GPU')
65
+
66
+ args = parser.parse_args()
67
+
68
+ print('LASER: similarity search')
69
+
70
+ print('\nProcessing:')
71
+ all_texts = []
72
+ if args.textual:
73
+ print(' - using textual comparison')
74
+ for l in args.lang:
75
+ with open(os.path.join(args.base_dir, args.data + '.' + l),
76
+ encoding='utf-8', errors='surrogateescape') as f:
77
+ texts = f.readlines()
78
+ print(' - {:s}: {:d} lines'.format(args.data + '.' + l, len(texts)))
79
+ all_texts.append(texts)
80
+
81
+ enc = EncodeLoad(args)
82
+
83
+ out_dir = os.path.dirname(args.output)
84
+ if not os.path.exists(out_dir):
85
+ print(' - creating directory {}'.format(out_dir))
86
+ os.mkdir(out_dir)
87
+
88
+ all_data = []
89
+ all_index = []
90
+ for l in args.lang:
91
+ Token(os.path.join(args.base_dir, args.data + '.' + l),
92
+ os.path.join(args.base_dir, args.output + '.tok.' + l),
93
+ lang=l,
94
+ romanize=True if l == 'el' else False,
95
+ lower_case=True,
96
+ verbose=args.verbose, over_write=False)
97
+ BPEfastApply(os.path.join(args.base_dir, args.output + '.tok.' + l),
98
+ os.path.join(args.base_dir, args.output + '.bpe.' + l),
99
+ args.bpe_codes,
100
+ verbose=args.verbose, over_write=False)
101
+ EncodeFile(enc,
102
+ os.path.join(args.base_dir, args.output + '.bpe.' + l),
103
+ os.path.join(args.base_dir, args.output + '.enc.' + l),
104
+ verbose=args.verbose, over_write=False)
105
+ d, idx = IndexCreate(os.path.join(args.base_dir, args.output + '.enc.' + l),
106
+ 'FlatL2',
107
+ verbose=args.verbose, save_index=False)
108
+ all_data.append(d)
109
+ all_index.append(idx)
110
+
111
+ err = IndexSearchMultiple(all_data, all_index, args.lang, texts=all_texts,
112
+ verbose=False, print_errors=False)
113
+ IndexPrintConfusionMatrix(err, args.lang)
laser/source/xsim.py ADDED
@@ -0,0 +1,165 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ #
7
+ # LASER Language-Agnostic SEntence Representations
8
+ # is a toolkit to calculate multilingual sentence embeddings
9
+ # and to use them for document classification, bitext filtering
10
+ # and mining
11
+ #
12
+ # --------------------------------------------------------
13
+ #
14
+ # Tool to calculate multilingual similarity error rate (xSIM)
15
+
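+ # Hypothetical usage sketch (embedding file names are placeholders):
+ #   err, nbex, _ = xSIM("src.laser.bin", "tgt.laser.bin", margin="ratio", k=4, dim=1024)
+ #   print(f"xSIM error: {100 * err / nbex:.2f}%")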
16
+ import faiss
17
+ import numpy as np
18
+ import typing as tp
19
+ import os
20
+ import json
21
+ from enum import Enum
22
+
23
+
24
+ class Margin(Enum):
25
+ RATIO = "ratio"
26
+ DISTANCE = "distance"
27
+ ABSOLUTE = "absolute"
28
+
29
+ @classmethod
30
+ def has_value(cls, value):
31
+ return value in cls._value2member_map_
32
+
33
+
34
+ def xSIM(
35
+ x: tp.Union[str, np.ndarray],
36
+ y: tp.Union[str, np.ndarray],
37
+ margin: str = Margin.RATIO.value,
38
+ k: int = 4,
39
+ dim: int = 1024,
40
+ fp16: bool = False,
41
+ eval_text: str = None,
42
+ augmented_json: str = None,
43
+ ) -> tp.Tuple[int, int, tp.Dict[str, int]]:
44
+ assert Margin.has_value(margin), f"Margin type: {margin}, is not supported."
45
+ if not isinstance(x, np.ndarray):
46
+ x = _load_embeddings(x, dim, fp16)
47
+ if not isinstance(y, np.ndarray):
48
+ y = _load_embeddings(y, dim, fp16)
49
+ # calculate xSIM error
50
+ return calculate_error(x, y, margin, k, eval_text, augmented_json)
51
+
52
+
53
+ def _load_embeddings(infile: str, dim: int, fp16: bool = False) -> np.ndarray:
54
+ assert os.path.isfile(infile), f"file: {infile} does not exist."
55
+ emb = np.fromfile(infile, dtype=np.float16 if fp16 else np.float32)
56
+ num_examples = emb.shape[0] // dim
57
+ emb.resize(num_examples, dim)
58
+ if fp16:
59
+ emb = emb.astype(np.float32) # faiss currently only supports fp32
60
+ return emb
61
+
62
+
63
+ def score_margin(
64
+ Dxy: np.ndarray,
65
+ Ixy: np.ndarray,
66
+ Ax: np.ndarray,
67
+ Ay: np.ndarray,
68
+ margin: str,
69
+ k: int,
70
+ ) -> np.ndarray:
71
+ nbex = Dxy.shape[0]
72
+ scores = np.zeros((nbex, k))
73
+ for i in range(nbex):
74
+ for j in range(k):
75
+ jj = Ixy[i, j]
76
+ a = Dxy[i, j]
77
+ b = (Ax[i] + Ay[jj]) / 2
78
+ if margin == Margin.RATIO.value:
79
+ scores[i, j] = a / b
80
+ else: # distance margin
81
+ scores[i, j] = a - b
82
+ return scores
83
+
84
+
85
+ def _score_knn(x: np.ndarray, y: np.ndarray, k: int, margin: str) -> np.ndarray:
86
+ nbex, dim = x.shape
87
+ # create index
88
+ idx_x = faiss.IndexFlatIP(dim)
89
+ idx_y = faiss.IndexFlatIP(dim)
90
+ # L2 normalization needed for cosine distance
91
+ faiss.normalize_L2(x)
92
+ faiss.normalize_L2(y)
93
+ idx_x.add(x)
94
+ idx_y.add(y)
95
+ if margin == Margin.ABSOLUTE.value:
96
+ scores, indices = idx_y.search(x, 1)
97
+ else:
98
+ # return cosine similarity and indices of k closest neighbors
99
+ Cos_xy, Idx_xy = idx_y.search(x, k)
100
+ Cos_yx, Idx_yx = idx_x.search(y, k)
101
+
102
+ # average cosines
103
+ Avg_xy = Cos_xy.mean(axis=1)
104
+ Avg_yx = Cos_yx.mean(axis=1)
105
+
106
+ scores = score_margin(Cos_xy, Idx_xy, Avg_xy, Avg_yx, margin, k)
107
+
108
+ # find best
109
+ best = scores.argmax(axis=1)
110
+ indices = np.zeros((nbex, 1), dtype=np.int32)
111
+ for i in range(nbex):
112
+ indices[i] = Idx_xy[i, best[i]]
113
+ return indices
114
+
115
+
116
+ def get_transform(augmented_json, closest_neighbor, src):
117
+ if (
118
+ closest_neighbor in augmented_json
119
+ and augmented_json[closest_neighbor]["src"] == src
120
+ ):
121
+ return augmented_json[closest_neighbor]["errtype"]
122
+ return "Misaligned"
123
+
124
+
125
+ def calculate_error(
126
+ x: np.ndarray,
127
+ y: np.ndarray,
128
+ margin: str = None,
129
+ k: int = 4,
130
+ eval_text: str = None,
131
+ augmented_json: str = None,
132
+ ) -> tp.Tuple[int, int, tp.Dict[str, int]]:
133
+ if augmented_json:
134
+ with open(augmented_json) as f:
135
+ augmented_json = json.load(f)
136
+ assert (
137
+ x.shape[0] < y.shape[0]
138
+ ), f"Shape mismatch: {x.shape[0]} >= target {y.shape[0]}"
139
+ else:
140
+ assert (
141
+ x.shape == y.shape
142
+ ), f"number of source {x.shape} / target {y.shape} shapes mismatch, "
143
+ nbex = x.shape[0]
144
+ augmented_report = {}
145
+
146
+ # for each x calculate the highest scoring neighbor from y
147
+ closest_neighbor = _score_knn(x, y, k, margin)
148
+
149
+ if eval_text: # calc textual error
150
+ lines = open(eval_text, encoding="utf-8", errors="surrogateescape").readlines()
151
+ err = 0
152
+ for ex in range(nbex):
153
+ if lines[ex] != lines[closest_neighbor[ex, 0]]:
154
+ err += 1
155
+ if augmented_json:
156
+ transform = get_transform(
157
+ augmented_json,
158
+ lines[closest_neighbor[ex, 0]].strip(),
159
+ lines[ex].strip(),
160
+ )
161
+ augmented_report[transform] = augmented_report.get(transform, 0) + 1
162
+ else: # calc index error
163
+ ref = np.linspace(0, nbex - 1, nbex).astype(int) # [0, nbex)
164
+ err = nbex - np.equal(closest_neighbor.reshape(nbex), ref).astype(int).sum()
165
+ return err, nbex, augmented_report
laser/tasks/CCMatrix/MatrixMine.pdf ADDED
Binary file (39.8 kB). View file
 
laser/tasks/CCMatrix/README.md ADDED
@@ -0,0 +1,39 @@
1
+ # CCMatrix: Mining Billions of High-Quality Parallel Sentences on the WEB
2
+
3
+ ## Parallel data
4
+
5
+ We show that margin-based bitext mining in LASER's multilingual sentence space can be applied to monolingual corpora of billions of sentences to produce high quality aligned translation data. We use thirty-two snapshots of a curated common crawl corpus [1] totaling 69 billion unique sentences. Using one unified approach for 80 languages, we were able to mine 10.8 billion parallel sentences, out of which only 2.9 billion are aligned with English.
6
+
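+ The core of this approach is margin-based scoring over k-nearest-neighbour searches in the joint embedding space. A minimal sketch of the ratio-margin criterion (an illustration only, not the production mining pipeline; it assumes `x` and `y` are float32 LASER sentence embeddings, one row per sentence):
+ 
+ ```
+ import faiss
+ import numpy as np
+ 
+ def ratio_margin_mine(x, y, k=4):
+     # cosine similarity via inner product on L2-normalized embeddings
+     faiss.normalize_L2(x)
+     faiss.normalize_L2(y)
+     idx_x, idx_y = faiss.IndexFlatIP(x.shape[1]), faiss.IndexFlatIP(y.shape[1])
+     idx_x.add(x)
+     idx_y.add(y)
+     cos_xy, nn_xy = idx_y.search(x, k)  # k nearest targets for each source
+     cos_yx, _ = idx_x.search(y, k)      # k nearest sources for each target
+     avg_x, avg_y = cos_xy.mean(axis=1), cos_yx.mean(axis=1)
+     # ratio margin: similarity divided by the average similarity of both neighbourhoods
+     margin = cos_xy / ((avg_x[:, None] + avg_y[nn_xy]) / 2)
+     best = margin.argmax(axis=1)
+     return nn_xy[np.arange(x.shape[0]), best], margin.max(axis=1)
+ ```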
7
+ ## Download
8
+
9
+ We open-source our scripts in this directory so that others may reproduce the data, evaluation and results reported in the CCMatrix paper.
10
+ ```
11
+ pip3 install cc_net
12
+ python3 dl_cc_matrix.py
13
+ ```
14
+
15
+ Please cite references [2] and [3] if you use this data.
16
+
17
+
18
+ ## Evaluation
19
+
20
21
+ We have assessed the quality of our mined data with bilingual models and multilingual models.
22
+
23
+ * Bilingual models [2]: To evaluate the quality of the mined bitexts, we train NMT systems for most of the language pairs and evaluate them on TED, WMT and WAT test sets. Using only our mined bitexts and no human-translated parallel data, we achieve a new state-of-the-art for a single system on the WMT'19 test set for translation between English and German, Russian and Chinese, as well as German/French. In particular, our English/German system outperforms the best single one by close to 4 BLEU points and is almost on par with the best WMT'19 evaluation system, which uses system combination and back-translation. We also achieve excellent results for distant language pairs like Russian/Japanese, outperforming the best submission at the 2019 Workshop on Asian Translation (WAT).
24
+
25
+ * Multilingual models [3]: CCMatrix data is used to train M2M-100, a large-scale Many-to-Many multilingual translation model. The thousands of directions we mine produce training data for direct translations without relying solely on English data. We mine using a novel strategy that exploits language groupings and bridge languages to avoid mining every possible direction while maintaining good accuracy. By training on this data and scaling model capacity through model parallelism and language-specific parameters, M2M-100 outperforms English-Centric multilingual models trained on data where either the source or target language is English. The system improves by over 10 BLEU on average compared to an English-Centric baseline when translating directly between non-English directions. M2M-100 is competitive with bilingual models from WMT and improves over existing publicly available multilingual translation systems. To download the data, follow our instructions above. To download the models and reproduce the training, click [*here*](https://github.com/pytorch/fairseq/tree/master/examples/m2m_100).
26
+
27
+ Please note that additional data filtering was applied before training the M2M-100 model, see [3] for details.
28
+ Also, we have improved mining against English, which leads to more bitexts, in particular for mid- and low-resource languages.
29
+ This new data was not used for M2M-100.
30
+
31
+ ## References
32
+
33
+ [1] Guillaume Wenzek, Marie-Anne Lachaux, Alexis Conneau, Vishrav Chaudhary, Francisco Guzmán, Armand Joulin and Edouard Grave,
34
+ [*CCNet: Extracting High Quality Monolingual Datasets from Web Crawl Data*](https://arxiv.org/abs/1911.00359)
35
+
36
+ [2] Holger Schwenk, Guillaume Wenzek, Sergey Edunov, Edouard Grave and Armand Joulin,
37
+ [*CCMatrix: Mining Billions of High-Quality Parallel Sentences on the WEB*](https://arxiv.org/abs/1911.04944)
38
+
39
+ [3] Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, and Armand Joulin. Beyond English-Centric Multilingual Machine Translation
laser/tasks/CCMatrix/dl_cc_matrix.py ADDED
@@ -0,0 +1,338 @@
1
+ import contextlib
2
+ import gzip
3
+ import logging
4
+ import re
5
+ import subprocess
6
+ import tempfile
7
+ from collections import defaultdict
8
+ from pathlib import Path
9
+ from typing import Callable, Dict, Iterable, List, NamedTuple, Type
10
+
11
+ from cc_net.jsonql import open_remote_file, open_write
12
+ from cc_net.process_wet_file import CCSegmentsReader
13
+ from typing import Sequence
14
+ import functools
15
+ import multiprocessing
16
+
17
+ BUFFER_SIZE = "32G"
18
+ SORT_PARALLEL = 8
19
+
20
+ KNOWN_VERSIONS = ["v1.0.0", "v1.0.beta", "v1.0.alpha"]
21
+
22
+
23
+ class NormalizedBitextPtr(NamedTuple):
24
+ lang_pair: str
25
+ line_no: int
26
+ segment: str
27
+ digest: str
28
+ ptr_start: int
29
+ ptr_end: int
30
+ score: float
31
+
32
+
33
+ class Bitext(NamedTuple):
34
+ lang_pair: str
35
+ line_no: int
36
+ score: float
37
+ text: str
38
+
39
+
40
+ class SimpleBitext(NamedTuple):
41
+ line_no: int
42
+ score: float
43
+ text: str
44
+
45
+
46
+ WEB_PAT = re.compile(r"https?:[^ \n]* ")
47
+ WEB_REPL = "WEB "
48
+
49
+ WEB2_PAT = re.compile(r"https?:[^ \n]*\n")
50
+ WEB2_REPL = "WEB\n"
51
+
52
+
53
+ def clean_content(raw_content: str) -> str:
54
+ # We need to clean all the content, because otherwise there is no way for
55
+ # the user to know if we need to clean it or not.
56
+ par = raw_content
57
+ par = par.replace("</s>", ". ")
58
+ par = par.replace("\t", " ")
59
+ par = re.sub(WEB_PAT, WEB_REPL, par, count=0)
60
+ par = re.sub(WEB2_PAT, WEB2_REPL, par, count=0)
61
+ return par
62
+
63
+
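+ # Build a parser for one tab-separated line of `cls`: each column is cast
+ # with the constructor taken from the NamedTuple's type annotations, in order.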
64
+ def get_typed_parser(cls: Type) -> Callable:
65
+ types = cls.__annotations__.values()
66
+
67
+ def parser(line: str) -> NamedTuple:
68
+ parts = line.rstrip("\n").split("\t")
69
+ assert len(parts) == len(
70
+ types
71
+ ), f"Print size mismatch expected the following columns {cls.__annotations__} got: {parts}"
72
+ return cls(*(t(p) for t, p in zip(types, parts)))
73
+
74
+ return parser
75
+
76
+
77
+ def open_read(file: Path) -> Iterable[str]:
78
+ if file.suffix == ".gz":
79
+ reader = gzip.open(file, "rt")
80
+ else:
81
+ reader = open(file, "rt")
82
+ with reader as f:
83
+ for line in f:
84
+ yield line
85
+
86
+
87
+ def dl(outdir: Path = Path("data"), version: str = KNOWN_VERSIONS[0], parallelism: int = 8):
88
+ """
89
+ Download bitext pointers from FAIR dataset and extract corresponding CC snippets.
90
+ - version: Specific version to download
91
+ - outdir: Directory where the data should go. Files will be in {outdir}/{version}/raw/
92
+ """
93
+ assert version in KNOWN_VERSIONS, f"Unknown version {version}, choose from {KNOWN_VERSIONS}"
94
+ metadata_dir = f"https://dl.fbaipublicfiles.com/laser/CCMatrix/{version}"
95
+ file_list = [l.strip() for l in open_remote_file(metadata_dir + "/list.txt")]
96
+ outdir.mkdir(exist_ok=True)
97
+ outdir = outdir / version / "raw"
98
+ outdir.mkdir(exist_ok=True, parents=True)
99
+
100
+ dlf = functools.partial(dl_file, metadata_dir, outdir)
101
+ # list(map(dlf, file_list))
102
+ with multiprocessing.Pool(parallelism) as pool:
103
+ pool.map(dlf, file_list)
104
+
105
+
106
+ def get_documents(segment: str) -> Dict[str, str]:
107
+ return {d["digest"]: d["raw_content"] for d in CCSegmentsReader([segment])}
108
+
109
+
110
+ def dl_file(metadata_dir: str, outdir: Path, file: str):
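+ # Each metadata file holds pointers (CC segment, document digest, character
+ # offsets, score) into CommonCrawl. Segments are processed one at a time so
+ # only a single segment's documents are kept in RAM while its spans are cut out.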
111
+ metadata = "/".join((metadata_dir, file))
112
+ parser = get_typed_parser(NormalizedBitextPtr)
113
+ found_bitext, missed_bitext, skipped_line = 0, 0, 0
114
+ segment = ""
115
+ segment_downloads: Dict[str, int] = defaultdict(int)
116
+ raw_documents: Dict[str, str] = {}
117
+ cleaned_documents: Dict[str, str] = {}
118
+
119
+ outfile = outdir / file
120
+ if outfile.exists():
121
+ return
122
+ o = FileWriterWithTmp(outfile)
123
+ for i, line in enumerate(open_remote_file(metadata)):
124
+ try:
125
+ bitext: NormalizedBitextPtr = parser(line)
126
+ # A few extra asserts in case the line is invalid but still parses
127
+ assert bitext.segment.startswith("crawl-data/")
128
+ assert bitext.digest.startswith("sha1:")
129
+ except AssertionError:
130
+ logging.error(f"Skipping line {i}: {line}")
131
+ skipped_line += 1
132
+ continue
133
+
134
+ if not segment or bitext.segment != segment:
135
+ segment = bitext.segment
136
+ segment_downloads[segment] += 1
137
+ # Load segment in RAM, purge document cache
138
+ raw_documents = get_documents(segment)
139
+ cleaned_documents = {}
140
+
141
+ raw_doc = raw_documents.get(bitext.digest)
142
+ if raw_doc is None:
143
+ logging.error(f"Document not found: {bitext.digest} in {segment}")
144
+ missed_bitext += 1
145
+ continue
146
+
147
+ clean_doc = cleaned_documents.get(bitext.digest)
148
+ if clean_doc is None:
149
+ clean_doc = clean_content(raw_doc)
150
+ cleaned_documents[bitext.digest] = clean_doc
151
+
152
+ text = clean_doc[bitext.ptr_start : bitext.ptr_end]
153
+ score = getattr(bitext, "score", 0.0)
154
+ bt = Bitext(bitext.lang_pair, bitext.line_no, score, text)
155
+ found_bitext += 1
+ print(*bt, sep="\t", file=o)
156
+
157
+ o.close(True)
158
+ logging.info(f"Found {found_bitext} sentences, missed {missed_bitext} sentences.")
159
+ if skipped_line > 0:
160
+ logging.error(f"Skipped {skipped_line} unparsable lines")
161
+ expected_dl = len(segment_downloads)
162
+ actual_dl = sum(segment_downloads.values())
163
+
164
+ if actual_dl != expected_dl:
165
+ logging.error(
166
+ f"Some segments where downloaded twice. Total dl: {actual_dl}, distinct dl: {expected_dl}"
167
+ )
168
+
169
+
170
+ def _tmp(file: Path) -> Path:
171
+ tmp_dir = file.parent
172
+ prefix = file.name.split(".", 1)[0] + "."
173
+ suffix = ".tmp." + file.name[len(prefix) :]
174
+ _, tmp_path = tempfile.mkstemp(dir=tmp_dir, prefix=prefix, suffix=suffix)
175
+ return Path(tmp_path)
176
+
177
+
178
+ class FileWriterWithTmp:
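+ # Write to a temporary file in the same directory and rename it to the final
+ # path only on success, so partially written outputs never look complete.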
179
+ def __init__(self, file: Path):
180
+ self.file = file
181
+ self.tmp_file = _tmp(file)
182
+ # We don't want to make FileWriterWithTmp a ContextManager
183
+ self.handle = open_write(self.tmp_file).__enter__()
184
+
185
+ def write(self, data) -> int:
186
+ return self.handle.write(data)
187
+
188
+ def close(self, success: bool = False):
189
+ self.handle.close()
190
+ if success:
191
+ self.tmp_file.rename(self.file)
192
+
193
+
194
+ def transpose_file(outdir: Path, file: Path) -> None:
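+ # Split one downloaded raw file (all language pairs mixed) into per-pair,
+ # per-side files under `outdir`, and leave a '.transposed' sentinel next to
+ # the input so finished files are skipped on re-runs.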
195
+ sentinel_file = file.with_suffix(".transposed")
196
+ if sentinel_file.exists():
197
+ return
198
+ outputs: Dict[str, FileWriterWithTmp] = {}
199
+ parser = get_typed_parser(Bitext)
200
+ success = False
201
+ try:
202
+ for line in open_read(file):
203
+ bt: Bitext = parser(line)
204
+ lang_pair = bt.lang_pair
205
+ if bt.lang_pair not in outputs:
206
+ assert (
207
+ "/" in lang_pair
208
+ ), f"Invalid lang pair '{lang_pair}' should be 'src-trg/src' or 'src-trg/trg'"
209
+ (outdir / f"{lang_pair}").mkdir(exist_ok=True, parents=True)
210
+ o = FileWriterWithTmp(outdir / f"{lang_pair}_{file.name}")
211
+ outputs[lang_pair] = o
212
+ simple_bt = SimpleBitext(bt.line_no, bt.score, bt.text)
213
+ print(*simple_bt, sep="\t", file=outputs[lang_pair])
214
+ success = True
215
+ finally:
216
+ for o in outputs.values():
217
+ o.close(success)
218
+ if success:
219
+ sentinel_file.write_text("\n".join(str(o.file) for o in outputs.values()))
220
+ # file.unlink()
221
+
222
+
223
+ def sort_files(outdir: Path, lang_pair_dir: Path, lang: str) -> Path:
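+ # Gather every shard for one side of a language pair, gunzip them, and sort
+ # them together numerically on the first column (line_no) with GNU sort, so
+ # that the resulting src and trg files are line-aligned.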
224
+ out = outdir / lang_pair_dir.name / f"{lang}.txt"
225
+ if out.exists():
226
+ return out
227
+
228
+ files: List[Path] = []
229
+ for f in lang_pair_dir.iterdir():
230
+ if not f.suffix == ".gz":
231
+ continue
232
+ if f.name.split("_")[0] != lang:
233
+ continue
234
+ files.append(f)
235
+
236
+ print(f"Found {len(files)} files for lang '{lang}' in {lang_pair_dir}: {files}")
237
+ assert len(files) > 0
238
+
239
+ (outdir / lang_pair_dir.name).mkdir(exist_ok=True, parents=True)
240
+ tmp_out = _tmp(out)
241
+
242
+ unzipped_files = []
243
+ for f in files:
244
+ subprocess.check_call(["gunzip", "-k", str(f)])
245
+ unzipped_files.append(str(f)[:-3])
246
+
247
+ sort_cmd = [
248
+ "sort",
249
+ "-nk1",
250
+ f"--parallel={SORT_PARALLEL}",
251
+ f"--buffer-size={BUFFER_SIZE}",
252
+ "--output",
253
+ str(tmp_out),
254
+ ] + unzipped_files
255
+ subprocess.check_call(sort_cmd)
256
+ tmp_out.rename(out)
257
+ return out
258
+
259
+
260
+ def finalize(
261
+ outdir: Path = Path("data"), version: str = KNOWN_VERSIONS[0], pairs: Sequence[str] = []
262
+ ) -> None:
263
+ """From the downloaded raw text files, extract the bitexts, sorted by language pair.
264
+ Assumes 'dl' has been run with the same outdir and version before.
265
+
266
+ - version: Specific version to download
267
+ - outdir: Directory where the data should go. Files will be in {outdir}/{version}/bitext/
268
+ - pairs: List of language pairs you are interested in. Defaults to all.
269
+ """
270
+ raw_dir = outdir / version / "raw"
271
+ if not raw_dir.is_dir():
272
+ cmd = f"python {__file__} dl --outdir {outdir} --version {version}"
273
+ assert raw_dir.is_dir(), f"Dir not found {raw_dir}. Did you run the following command?\n{cmd}"
274
+
275
+ raw_files = list(raw_dir.glob("*.gz"))
276
+ split_dir = outdir / version / "split_by_lang"
277
+ split_dir.mkdir(exist_ok=True, parents=True)
278
+ tr = functools.partial(transpose_file, split_dir)
279
+ with multiprocessing.Pool() as pool:
280
+ pool.map(tr, raw_files)
281
+
282
+ bitext_dir = outdir / version / "bitext"
283
+ bitext_dir.mkdir(exist_ok=True, parents=True)
284
+ if pairs:
285
+ pair_dirs = []
286
+ for pair in pairs:
287
+ assert (
288
+ len(pair.split("-")) == 2
289
+ ), f"Invalid pair '{pair}', should be 'src-trg'"
290
+ pair_dir = split_dir / pair
291
+ assert (
292
+ pair_dir.is_dir()
293
+ ), f"Dir {pair_dir} not found for lang pair '{pair}'. Is the pair valid ?"
294
+ pair_dirs.append(pair_dir)
295
+ else:
296
+ pair_dirs = [d for d in split_dir.iterdir() if d.is_dir()]
297
+
298
+ for pair_dir in pair_dirs:
299
+ src, trg = pair_dir.name.split("-")
300
+ src_file = sort_files(bitext_dir, pair_dir, src)
301
+ trg_file = sort_files(bitext_dir, pair_dir, trg)
302
+ validate(src_file, trg_file)
303
+
304
+
305
+ def validate(src_file: Path, trg_file: Path) -> None:
306
+ """Checks that the segments in the given batch are valid."""
307
+ lines_src, lines_trg, found_pairs = 0, 0, 0
308
+ parser = get_typed_parser(SimpleBitext)
309
+ with open(src_file) as src_f, open(trg_file) as trg_f:
310
+ src_l = src_f.readline()
311
+ trg_l = trg_f.readline()
312
+ while src_l and trg_l:
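+ # Merge-join on line_no: advance whichever side is behind; a pair is only
+ # counted when both files contain the same line number.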
313
+ src: SimpleBitext = parser(src_l)
314
+ trg: SimpleBitext = parser(trg_l)
315
+ if src.line_no <= trg.line_no:
316
+ lines_src += 1
317
+ src_l = src_f.readline()
318
+ if trg.line_no <= src.line_no:
319
+ lines_trg += 1
320
+ trg_l = trg_f.readline()
321
+ if trg.line_no == src.line_no:
322
+ found_pairs += 1
323
+
324
+ if found_pairs == lines_src and found_pairs == lines_trg:
325
+ logging.info(
326
+ f"Validated {src_file} and {trg_file}. Found {found_pairs} bitexts."
327
+ )
328
+ else:
329
+ logging.error(
330
+ f"Validated {src_file} and {trg_file}. "
331
+ f"Found {found_pairs} bitexts, from {lines_src} in {src_file} and {lines_trg} in {trg_file}"
332
+ )
333
+
334
+
335
+ if __name__ == "__main__":
336
+ import func_argparse
337
+
338
+ func_argparse.main(dl, finalize)
laser/tasks/SentimentAnalysis/README.md ADDED
@@ -0,0 +1,34 @@
1
+ # Laser Encoder: Sentiment Analysis
2
+
3
+ ## Overview
4
+
5
+ This project demonstrates the application of the Laser Encoder tool for creating sentence embeddings in the context of sentiment analysis. The Laser Encoder is used to encode text data, and a sentiment analysis model is trained to predict the sentiment of the text.
6
+
7
+ ## Getting Started
8
+
9
+ To run the notebook in Google Colab, click the "Open in Colab" button below:
10
+
11
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NIXBLACK11/LASER-fork/blob/Sentiment-analysis-laser/tasks/SentimentAnalysis/SentimentAnalysis.ipynb)
12
+
13
+ Also, check out the Hugging Face Space with the button below:
14
+
15
+ [![Open In Hugging Face Space](https://img.shields.io/badge/Open%20In-Hugging%20Face%20Space-blue?logo=huggingface)](https://huggingface.co/spaces/NIXBLACK/SentimentAnalysis_LASER_)
16
+
17
+
18
+ ## Example Usage
19
+
20
+ Run the Example Notebook:
21
+ Execute the provided Jupyter Notebook SentimentAnalysis.ipynb
22
+
23
+ jupyter notebook SentimentAnalysis.ipynb
24
+
25
+
26
+ ## Customization
27
+
28
+ - Modify the model architecture, hyperparameters, and training settings in the neural network model section based on your requirements.
29
+ - Customize the sentiment mapping and handling of unknown sentiments in the data preparation section.
30
+
31
+ ## Additional Notes
32
+ - Feel free to experiment with different models, embeddings, and hyperparameters to optimize performance.
33
+ - Ensure that the dimensions of embeddings and model inputs are compatible.
34
+ - Adapt the code based on your specific dataset and use case.
laser/tasks/SentimentAnalysis/SentimentAnalysis.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
laser/tasks/WikiMatrix/README.md ADDED
@@ -0,0 +1,93 @@
1
+ # WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia
2
+
3
+ The goal of this project is to mine for parallel sentences in the textual content of Wikipedia for all possible language pairs.
4
+
5
+
6
+ ## Mined data
7
+ * 85 different languages, 1620 language pairs
8
+ * 134M parallel sentences, out of which 34M are aligned with English
9
+ * this [*table shows the amount of mined parallel sentences for most of the language pairs*](WikiMatrix-sizes.pdf)
10
+ * the mined bitexts are stored on AWS and can be downloaded with the following command:
11
+ ```bash
12
+ wget https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.en-fr.tsv.gz
13
+ ```
14
+ Replace "en-fr" with the ISO codes of the desired language pair.
15
+ The language pair must be in alphabetical order, e.g. "de-en" and not "en-de".
16
+ The list of available bitexts and their sizes are given in the file [*list_of_bitexts.txt*](list_of_bitexts.txt).
17
+ Please do **not loop over all files** since AWS implements some [*limitations*](https://dl.fbaipublicfiles.com/README) to avoid abuse.
18
+
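+ If you script the download of a few pairs, a small helper can take care of the alphabetical ordering in the file name (the URL pattern is the one shown above; the helper itself is only an illustrative sketch):
+ ```python
+ import subprocess
+
+ BASE = "https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1"
+
+ def wikimatrix_url(lang1: str, lang2: str) -> str:
+     # Archive names always use the alphabetically ordered pair, e.g. "de-en".
+     src, trg = sorted([lang1, lang2])
+     return f"{BASE}/WikiMatrix.{src}-{trg}.tsv.gz"
+
+ # Fetch only the handful of pairs you actually need.
+ for pair in [("en", "de"), ("fr", "en")]:
+     subprocess.run(["wget", wikimatrix_url(*pair)], check=True)
+ ```
+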
19
+ Use this command if you want to download all 1620 language pairs in one tar file (but this is 65GB!):
20
+ ```bash
21
+ wget https://dl.fbaipublicfiles.com/laser/WikiMatrix/WikiMatrix.v1.1620_language_pairs.tar
22
+ ```
23
+
24
+ ## Approach
25
+
26
+ We use LASER's bitext mining approach and encoder for 93 languages [2,3].
27
+ We do not use the inter-language links provided by Wikipedia,
28
+ but search over all Wikipedia articles of each language. We approach the
29
+ computational challenge of mining almost 600 million sentences by using fast
30
+ indexing and similarity search with [*FAISS*](https://github.com/facebookresearch/faiss).
31
+ Prior to mining parallel sentences, we perform
32
+ sentence segmentation, deduplication and language identification.
33
+ Please see reference [1] for details.
34
+
35
+
36
+ ## Data extraction and threshold optimization
37
+ We provide a tool to extract parallel texts from the TSV files:
38
+ ```bash
39
+ python3 extract.py \
40
+ --tsv WikiMatrix.en-fr.tsv.gz \
41
+ --bitext WikiMatrix.en-fr.txt \
42
+ --src-lang en --trg-lang fr \
43
+ --threshold 1.04
44
+ ```
45
+ One can specify the threshold on the margin score.
46
+ The higher the value, the more likely the sentences are mutual translations, but the less data one will get.
47
+ **A value of 1.04 seems to be a good choice for most language pairs.** Please see the analysis in the paper for
48
+ more information [1].
49
+
50
+ ## Evaluation
51
+ To assess the quality of the mined bitexts, we trained neural MT systems on all language pairs
52
+ for which we were able to mine at least 25k parallel sentences (with a margin threshold of 1.04).
53
+ We trained systems in both directions, source to target and target to source, and report BLEU scores
54
+ on the [*TED test*](https://github.com/neulab/word-embeddings-for-nmt) set proposed in [4].
55
+ This totals 1886 different NMT systems.
56
+ This [*table shows the BLEU scores for the most frequent language pairs*](WikiMatrix-bleu.pdf).
57
+ We achieve BLEU scores over 30 for several language pairs.
58
+
59
+ The goal is not to build state of the art systems for each language pair, but
60
+ to get an indication of the quality of the automatically mined data. These
61
+ BLEU scores should of course be interpreted in the context of the sizes of the
62
+ mined corpora.
63
+
64
+ Obviously, we cannot exclude that the
65
+ provided data contains some wrong alignments even though the margin is large.
66
+ Finally, we would like to point out that we run our approach on all available
67
+ languages in Wikipedia, independently of the quality of LASER's sentence
68
+ embeddings for each one.
69
+
70
+
71
+ ## License
72
+
73
+ The mined data is distributed under the Creative Commons Attribution-ShareAlike license.
74
+
75
+ Please cite reference [1] if you use this data.
76
+
77
+ ## References
78
+
79
+ [1] Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman,
80
+ [*WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia*](https://arxiv.org/abs/1907.05791)
81
+ arXiv, July 11 2019.
82
+
83
+ [2] Mikel Artetxe and Holger Schwenk,
84
+ [*Margin-based Parallel Corpus Mining with Multilingual Sentence Embeddings*](https://arxiv.org/abs/1811.01136)
85
+ arXiv, Nov 3 2018.
86
+
87
+ [3] Mikel Artetxe and Holger Schwenk,
88
+ [*Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond*](https://arxiv.org/abs/1812.10464)
89
+ arXiv, Dec 26 2018.
90
+
91
+ [4] Ye Qi, Devendra Sachan, Matthieu Felix, Sarguna Padmanabhan and Graham Neubig,
92
+ [*When and Why Are Pre-Trained Word Embeddings Useful for Neural Machine Translation?*](https://www.aclweb.org/anthology/papers/N/N18/N18-2084/)
93
+ NAACL, pages 529-535, 2018.
laser/tasks/WikiMatrix/WikiMatrix-bleu.pdf ADDED
Binary file (54.3 kB). View file
 
laser/tasks/WikiMatrix/WikiMatrix-sizes.pdf ADDED
Binary file (60.3 kB). View file
 
laser/tasks/WikiMatrix/extract.py ADDED
@@ -0,0 +1,81 @@
1
+ #!/bin/python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # LASER Language-Agnostic SEntence Representations
9
+ # is a toolkit to calculate multilingual sentence embeddings
10
+ # and to use them for document classification, bitext filtering
11
+ # and mining
12
+ #
13
+ # --------------------------------------------------------
14
+ #
15
+ # Tool to extract subset of mined bitexts in a tsv.gz file
16
+
17
+ import os
18
+ import sys
19
+ import gzip
20
+ import argparse
21
+
22
+ ###############################################################################
23
+ #
24
+ # Main
25
+ #
26
+ ###############################################################################
27
+
28
+ parser = argparse.ArgumentParser(description='Tool to extract bitext from the WikiMatrix')
29
+ parser.add_argument('--encoding', default='utf-8',
30
+ help='character encoding for input/output')
31
+ parser.add_argument('--tsv', type=str, required=True,
32
+ help='File with mined bitexts')
33
+ parser.add_argument('--bitext', type=str, required=True,
34
+ help='Text file after sentence splitting')
35
+ parser.add_argument('--src-lang', type=str, required=True,
36
+ help='Source language')
37
+ parser.add_argument('--trg-lang', type=str, required=True,
38
+ help='Target language')
39
+ parser.add_argument('--threshold', type=float, default=1.05,
40
+ help='Threshold on margin score')
41
+ parser.add_argument('--nb-sents', type=int, default=999999999,
42
+ help='Maximal number of sentences')
43
+ parser.add_argument('--nb-words-src', type=int, default=999999999,
44
+ help='Maximal number of total words in the source language')
45
+ parser.add_argument('--nb-words-trg', type=int, default=999999999,
46
+ help='Maximal number of total words in the target language')
47
+ args = parser.parse_args()
48
+
49
+ print('Tool to extract bitext from the WikiMatrix')
50
+
51
+ nl = 0
52
+ nw_src = 0
53
+ nw_trg = 0
54
+ print('Processing {}'.format(args.tsv))
55
+ with gzip.open(args.tsv, 'rt', encoding=args.encoding) as tsv:
56
+ with open(args.bitext + '.' + args.src_lang, 'wt', encoding=args.encoding) as fsrc:
57
+ with open(args.bitext + '.' + args.trg_lang, 'wt', encoding=args.encoding) as ftrg:
58
+ while nl < args.nb_sents:
59
+ line = tsv.readline()
60
+ if not line:
61
+ break
62
+ fields = line.split('\t')
63
+ cur_src = len(fields[1].split())
64
+ cur_trg = len(fields[2].split())
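+ # The TSV is assumed to be sorted by decreasing margin score, so the first
+ # line below the threshold means no later line can pass it either.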
65
+ if float(fields[0]) < args.threshold:
66
+ break
67
+ if nw_src + cur_src > args.nb_words_src:
68
+ break
69
+ if nw_trg + cur_trg > args.nb_words_trg:
70
+ break
71
+ fsrc.write(fields[1].strip() + '\n')
72
+ ftrg.write(fields[2].strip() + '\n')
73
+ nw_src += cur_src
74
+ nw_trg += cur_trg
75
+ nl += 1
76
+ if nl % 100000 == 0:
77
+ print('\r - {:d} lines read'.format(nl), end='')
78
+
79
+ print('\r - wrote {:d} lines'.format(nl))
80
+ print(' - with {:d} source and {:d} target words'.format(nw_src, nw_trg))
81
+ print(' - last threshold is {:.4f}'.format(float(fields[0])))
laser/tasks/WikiMatrix/list_of_bitexts.txt ADDED
@@ -0,0 +1,1620 @@
1
+ WikiMatrix.an-ca.tsv 24616
2
+ WikiMatrix.an-de.tsv 12887
3
+ WikiMatrix.an-en.tsv 23313
4
+ WikiMatrix.an-es.tsv 33723
5
+ WikiMatrix.an-fr.tsv 16726
6
+ WikiMatrix.an-gl.tsv 15209
7
+ WikiMatrix.an-it.tsv 13203
8
+ WikiMatrix.an-pl.tsv 10456
9
+ WikiMatrix.an-pt.tsv 14850
10
+ WikiMatrix.an-ru.tsv 11579
11
+ WikiMatrix.ar-arz.tsv 29316
12
+ WikiMatrix.ar-az.tsv 17543
13
+ WikiMatrix.ar-ba.tsv 15093
14
+ WikiMatrix.ar-be.tsv 11720
15
+ WikiMatrix.ar-bg.tsv 54919
16
+ WikiMatrix.ar-bn.tsv 40997
17
+ WikiMatrix.ar-br.tsv 10707
18
+ WikiMatrix.ar-bs.tsv 34137
19
+ WikiMatrix.ar-ca.tsv 94324
20
+ WikiMatrix.ar-ceb.tsv 11056
21
+ WikiMatrix.ar-cs.tsv 67131
22
+ WikiMatrix.ar-da.tsv 53021
23
+ WikiMatrix.ar-de.tsv 99258
24
+ WikiMatrix.ar-el.tsv 66961
25
+ WikiMatrix.ar-en.tsv 999762
26
+ WikiMatrix.ar-eo.tsv 37130
27
+ WikiMatrix.ar-es.tsv 174557
28
+ WikiMatrix.ar-et.tsv 40659
29
+ WikiMatrix.ar-eu.tsv 24853
30
+ WikiMatrix.ar-fa.tsv 58545
31
+ WikiMatrix.ar-fi.tsv 53052
32
+ WikiMatrix.ar-fr.tsv 163549
33
+ WikiMatrix.ar-gl.tsv 50528
34
+ WikiMatrix.ar-he.tsv 68302
35
+ WikiMatrix.ar-hi.tsv 38318
36
+ WikiMatrix.ar-hr.tsv 38853
37
+ WikiMatrix.ar-hu.tsv 60661
38
+ WikiMatrix.ar-id.tsv 90815
39
+ WikiMatrix.ar-is.tsv 18271
40
+ WikiMatrix.ar-it.tsv 123838
41
+ WikiMatrix.ar-ja.tsv 83059
42
+ WikiMatrix.ar-kk.tsv 11688
43
+ WikiMatrix.ar-ko.tsv 48869
44
+ WikiMatrix.ar-lt.tsv 33495
45
+ WikiMatrix.ar-mk.tsv 52154
46
+ WikiMatrix.ar-ml.tsv 32012
47
+ WikiMatrix.ar-mr.tsv 32462
48
+ WikiMatrix.ar-nds.tsv 11783
49
+ WikiMatrix.ar-ne.tsv 12129
50
+ WikiMatrix.ar-nl.tsv 73006
51
+ WikiMatrix.ar-no.tsv 58790
52
+ WikiMatrix.ar-pl.tsv 74295
53
+ WikiMatrix.ar-pt.tsv 157441
54
+ WikiMatrix.ar-ro.tsv 71258
55
+ WikiMatrix.ar-ru.tsv 125312
56
+ WikiMatrix.ar-sh.tsv 35310
57
+ WikiMatrix.ar-si.tsv 32607
58
+ WikiMatrix.ar-sk.tsv 32135
59
+ WikiMatrix.ar-sl.tsv 39982
60
+ WikiMatrix.ar-sq.tsv 30042
61
+ WikiMatrix.ar-sr.tsv 49502
62
+ WikiMatrix.ar-sv.tsv 58224
63
+ WikiMatrix.ar-sw.tsv 13755
64
+ WikiMatrix.ar-ta.tsv 27250
65
+ WikiMatrix.ar-te.tsv 27072
66
+ WikiMatrix.ar-tl.tsv 18350
67
+ WikiMatrix.ar-tr.tsv 69844
68
+ WikiMatrix.ar-tt.tsv 14074
69
+ WikiMatrix.ar-uk.tsv 70551
70
+ WikiMatrix.ar-vi.tsv 93890
71
+ WikiMatrix.arz-de.tsv 11796
72
+ WikiMatrix.arz-en.tsv 18231
73
+ WikiMatrix.arz-es.tsv 12571
74
+ WikiMatrix.arz-fr.tsv 12047
75
+ WikiMatrix.ar-zh.tsv 86236
76
+ WikiMatrix.arz-it.tsv 10384
77
+ WikiMatrix.arz-pt.tsv 10887
78
+ WikiMatrix.arz-ru.tsv 12163
79
+ WikiMatrix.as-de.tsv 11031
80
+ WikiMatrix.as-es.tsv 11054
81
+ WikiMatrix.as-fr.tsv 12092
82
+ WikiMatrix.as-it.tsv 10844
83
+ WikiMatrix.azb-fr.tsv 10355
84
+ WikiMatrix.az-bg.tsv 14096
85
+ WikiMatrix.az-ca.tsv 15416
86
+ WikiMatrix.az-cs.tsv 17994
87
+ WikiMatrix.az-da.tsv 11999
88
+ WikiMatrix.az-de.tsv 34736
89
+ WikiMatrix.az-el.tsv 10594
90
+ WikiMatrix.az-en.tsv 71276
91
+ WikiMatrix.az-es.tsv 31334
92
+ WikiMatrix.az-et.tsv 10537
93
+ WikiMatrix.az-fa.tsv 16093
94
+ WikiMatrix.az-fi.tsv 14393
95
+ WikiMatrix.az-fr.tsv 29949
96
+ WikiMatrix.az-gl.tsv 10514
97
+ WikiMatrix.az-he.tsv 12252
98
+ WikiMatrix.az-hr.tsv 10638
99
+ WikiMatrix.az-hu.tsv 15439
100
+ WikiMatrix.az-id.tsv 17049
101
+ WikiMatrix.az-it.tsv 25780
102
+ WikiMatrix.az-ja.tsv 23140
103
+ WikiMatrix.az-ko.tsv 11806
104
+ WikiMatrix.az-nl.tsv 22641
105
+ WikiMatrix.az-no.tsv 13992
106
+ WikiMatrix.az-pl.tsv 22609
107
+ WikiMatrix.az-pt.tsv 25337
108
+ WikiMatrix.az-ro.tsv 14801
109
+ WikiMatrix.az-ru.tsv 47130
110
+ WikiMatrix.az-sr.tsv 10553
111
+ WikiMatrix.az-sv.tsv 19240
112
+ WikiMatrix.az-ta.tsv 11980
113
+ WikiMatrix.az-tr.tsv 42846
114
+ WikiMatrix.az-uk.tsv 19756
115
+ WikiMatrix.az-vi.tsv 10485
116
+ WikiMatrix.az-zh.tsv 19175
117
+ WikiMatrix.ba-bg.tsv 14238
118
+ WikiMatrix.ba-ca.tsv 17290
119
+ WikiMatrix.ba-cs.tsv 16981
120
+ WikiMatrix.ba-da.tsv 13015
121
+ WikiMatrix.ba-de.tsv 27046
122
+ WikiMatrix.ba-el.tsv 10653
123
+ WikiMatrix.ba-en.tsv 28176
124
+ WikiMatrix.ba-es.tsv 28201
125
+ WikiMatrix.ba-fi.tsv 12213
126
+ WikiMatrix.ba-fr.tsv 29638
127
+ WikiMatrix.ba-gl.tsv 12390
128
+ WikiMatrix.ba-hr.tsv 10663
129
+ WikiMatrix.ba-hu.tsv 12223
130
+ WikiMatrix.ba-id.tsv 12203
131
+ WikiMatrix.ba-it.tsv 26652
132
+ WikiMatrix.ba-ja.tsv 13782
133
+ WikiMatrix.ba-nl.tsv 21836
134
+ WikiMatrix.ba-no.tsv 15233
135
+ WikiMatrix.ba-pl.tsv 19390
136
+ WikiMatrix.ba-pt.tsv 24870
137
+ WikiMatrix.bar-de.tsv 41990
138
+ WikiMatrix.bar-en.tsv 16990
139
+ WikiMatrix.bar-es.tsv 12506
140
+ WikiMatrix.bar-fr.tsv 12036
141
+ WikiMatrix.bar-it.tsv 10450
142
+ WikiMatrix.ba-ro.tsv 15499
143
+ WikiMatrix.bar-pt.tsv 10377
144
+ WikiMatrix.bar-ru.tsv 10220
145
+ WikiMatrix.ba-ru.tsv 42893
146
+ WikiMatrix.ba-sh.tsv 10485
147
+ WikiMatrix.ba-sk.tsv 10764
148
+ WikiMatrix.ba-sl.tsv 10349
149
+ WikiMatrix.ba-sr.tsv 10182
150
+ WikiMatrix.ba-sv.tsv 20346
151
+ WikiMatrix.ba-tr.tsv 11183
152
+ WikiMatrix.ba-uk.tsv 15915
153
+ WikiMatrix.ba-zh.tsv 10468
154
+ WikiMatrix.be-bg.tsv 16061
155
+ WikiMatrix.be-ca.tsv 16344
156
+ WikiMatrix.be-cs.tsv 14343
157
+ WikiMatrix.be-de.tsv 20671
158
+ WikiMatrix.be-en.tsv 33927
159
+ WikiMatrix.be-es.tsv 28153
160
+ WikiMatrix.be-fi.tsv 10959
161
+ WikiMatrix.be-fr.tsv 24250
162
+ WikiMatrix.be-he.tsv 10710
163
+ WikiMatrix.be-hu.tsv 11940
164
+ WikiMatrix.be-it.tsv 24351
165
+ WikiMatrix.be-ja.tsv 12032
166
+ WikiMatrix.be-nl.tsv 14188
167
+ WikiMatrix.be-no.tsv 10997
168
+ WikiMatrix.be-pl.tsv 19438
169
+ WikiMatrix.be-pt.tsv 23580
170
+ WikiMatrix.be-ro.tsv 13182
171
+ WikiMatrix.be-ru.tsv 161215
172
+ WikiMatrix.be-sr.tsv 10206
173
+ WikiMatrix.be-sv.tsv 16161
174
+ WikiMatrix.be-uk.tsv 80801
175
+ WikiMatrix.bg-bn.tsv 38072
176
+ WikiMatrix.bg-bs.tsv 34760
177
+ WikiMatrix.bg-ca.tsv 76189
178
+ WikiMatrix.bg-ceb.tsv 11166
179
+ WikiMatrix.bg-cs.tsv 79005
180
+ WikiMatrix.bg-da.tsv 53738
181
+ WikiMatrix.bg-de.tsv 132146
182
+ WikiMatrix.bg-el.tsv 62768
183
+ WikiMatrix.bg-en.tsv 357969
184
+ WikiMatrix.bg-eo.tsv 40884
185
+ WikiMatrix.bg-es.tsv 122534
186
+ WikiMatrix.bg-et.tsv 43393
187
+ WikiMatrix.bg-eu.tsv 25564
188
+ WikiMatrix.bg-fa.tsv 37158
189
+ WikiMatrix.bg-fi.tsv 61847
190
+ WikiMatrix.bg-fr.tsv 117264
191
+ WikiMatrix.bg-gl.tsv 43273
192
+ WikiMatrix.bg-he.tsv 58167
193
+ WikiMatrix.bg-hi.tsv 30349
194
+ WikiMatrix.bg-hr.tsv 47877
195
+ WikiMatrix.bg-hu.tsv 68595
196
+ WikiMatrix.bg-id.tsv 60639
197
+ WikiMatrix.bg-is.tsv 17659
198
+ WikiMatrix.bg-it.tsv 102305
199
+ WikiMatrix.bg-ja.tsv 71117
200
+ WikiMatrix.bg-kk.tsv 11542
201
+ WikiMatrix.bg-ko.tsv 38280
202
+ WikiMatrix.bg-lt.tsv 42406
203
+ WikiMatrix.bg-mk.tsv 86038
204
+ WikiMatrix.bg-ml.tsv 29348
205
+ WikiMatrix.bg-mr.tsv 35898
206
+ WikiMatrix.bg-nds.tsv 11308
207
+ WikiMatrix.bg-ne.tsv 13616
208
+ WikiMatrix.bg-nl.tsv 84025
209
+ WikiMatrix.bg-no.tsv 58964
210
+ WikiMatrix.bg-pl.tsv 96090
211
+ WikiMatrix.bg-pt.tsv 114067
212
+ WikiMatrix.bg-ro.tsv 69902
213
+ WikiMatrix.bg-ru.tsv 270073
214
+ WikiMatrix.bg-sh.tsv 41845
215
+ WikiMatrix.bg-si.tsv 31112
216
+ WikiMatrix.bg-sk.tsv 43375
217
+ WikiMatrix.bg-sl.tsv 46673
218
+ WikiMatrix.bg-sq.tsv 26037
219
+ WikiMatrix.bg-sr.tsv 65281
220
+ WikiMatrix.bg-sv.tsv 63135
221
+ WikiMatrix.bg-sw.tsv 12945
222
+ WikiMatrix.bg-ta.tsv 21462
223
+ WikiMatrix.bg-te.tsv 23487
224
+ WikiMatrix.bg-tl.tsv 21198
225
+ WikiMatrix.bg-tr.tsv 56592
226
+ WikiMatrix.bg-tt.tsv 12088
227
+ WikiMatrix.bg-uk.tsv 126154
228
+ WikiMatrix.bg-vi.tsv 60738
229
+ WikiMatrix.bg-zh.tsv 60373
230
+ WikiMatrix.bn-bs.tsv 21448
231
+ WikiMatrix.bn-ca.tsv 41891
232
+ WikiMatrix.bn-cs.tsv 47405
233
+ WikiMatrix.bn-da.tsv 33723
234
+ WikiMatrix.bn-de.tsv 70350
235
+ WikiMatrix.bn-el.tsv 36202
236
+ WikiMatrix.bn-en.tsv 280567
237
+ WikiMatrix.bn-eo.tsv 27166
238
+ WikiMatrix.bn-es.tsv 81824
239
+ WikiMatrix.bn-et.tsv 26968
240
+ WikiMatrix.bn-eu.tsv 14912
241
+ WikiMatrix.bn-fa.tsv 20952
242
+ WikiMatrix.bn-fi.tsv 37517
243
+ WikiMatrix.bn-fr.tsv 68784
244
+ WikiMatrix.bn-gl.tsv 27666
245
+ WikiMatrix.bn-he.tsv 34274
246
+ WikiMatrix.bn-hi.tsv 21240
247
+ WikiMatrix.bn-hr.tsv 23924
248
+ WikiMatrix.bn-hu.tsv 41219
249
+ WikiMatrix.bn-id.tsv 36553
250
+ WikiMatrix.bn-it.tsv 64222
251
+ WikiMatrix.bn-ja.tsv 38462
252
+ WikiMatrix.bn-ko.tsv 20915
253
+ WikiMatrix.bn-lt.tsv 21523
254
+ WikiMatrix.bn-mk.tsv 23173
255
+ WikiMatrix.bn-nl.tsv 50217
256
+ WikiMatrix.bn-no.tsv 35729
257
+ WikiMatrix.bn-pl.tsv 52856
258
+ WikiMatrix.bn-pt.tsv 76354
259
+ WikiMatrix.bn-ro.tsv 46700
260
+ WikiMatrix.bn-ru.tsv 62512
261
+ WikiMatrix.bn-sh.tsv 20767
262
+ WikiMatrix.bn-sk.tsv 25064
263
+ WikiMatrix.bn-sl.tsv 26700
264
+ WikiMatrix.bn-sq.tsv 17724
265
+ WikiMatrix.bn-sr.tsv 25613
266
+ WikiMatrix.bn-sv.tsv 54274
267
+ WikiMatrix.bn-ta.tsv 12734
268
+ WikiMatrix.bn-tr.tsv 33161
269
+ WikiMatrix.bn-uk.tsv 37701
270
+ WikiMatrix.bn-vi.tsv 31080
271
+ WikiMatrix.bn-zh.tsv 31604
272
+ WikiMatrix.br-de.tsv 20925
273
+ WikiMatrix.br-en.tsv 16902
274
+ WikiMatrix.br-es.tsv 22492
275
+ WikiMatrix.br-fr.tsv 23892
276
+ WikiMatrix.br-it.tsv 22410
277
+ WikiMatrix.br-pt.tsv 19806
278
+ WikiMatrix.br-ru.tsv 16104
279
+ WikiMatrix.br-uk.tsv 11428
280
+ WikiMatrix.bs-ca.tsv 44601
281
+ WikiMatrix.bs-cs.tsv 43380
282
+ WikiMatrix.bs-da.tsv 32718
283
+ WikiMatrix.bs-de.tsv 71019
284
+ WikiMatrix.bs-el.tsv 33881
285
+ WikiMatrix.bs-en.tsv 210690
286
+ WikiMatrix.bs-eo.tsv 24088
287
+ WikiMatrix.bs-es.tsv 70064
288
+ WikiMatrix.bs-et.tsv 25631
289
+ WikiMatrix.bs-eu.tsv 16473
290
+ WikiMatrix.bs-fa.tsv 20287
291
+ WikiMatrix.bs-fi.tsv 36106
292
+ WikiMatrix.bs-fr.tsv 60013
293
+ WikiMatrix.bs-gl.tsv 32509
294
+ WikiMatrix.bs-he.tsv 30165
295
+ WikiMatrix.bs-hi.tsv 16693
296
+ WikiMatrix.bs-hr.tsv 164225
297
+ WikiMatrix.bs-hu.tsv 39139
298
+ WikiMatrix.bs-id.tsv 38865
299
+ WikiMatrix.bs-is.tsv 11489
300
+ WikiMatrix.bs-it.tsv 52824
301
+ WikiMatrix.bs-ja.tsv 36882
302
+ WikiMatrix.bs-ko.tsv 22710
303
+ WikiMatrix.bs-lt.tsv 23114
304
+ WikiMatrix.bs-mk.tsv 39333
305
+ WikiMatrix.bs-ml.tsv 19148
306
+ WikiMatrix.bs-mr.tsv 20082
307
+ WikiMatrix.bs-nl.tsv 45271
308
+ WikiMatrix.bs-no.tsv 36061
309
+ WikiMatrix.bs-pl.tsv 48283
310
+ WikiMatrix.bs-pt.tsv 62118
311
+ WikiMatrix.bs-ro.tsv 37605
312
+ WikiMatrix.bs-ru.tsv 59540
313
+ WikiMatrix.bs-sh.tsv 178354
314
+ WikiMatrix.bs-si.tsv 16269
315
+ WikiMatrix.bs-sk.tsv 25108
316
+ WikiMatrix.bs-sl.tsv 34165
317
+ WikiMatrix.bs-sq.tsv 19923
318
+ WikiMatrix.bs-sr.tsv 130890
319
+ WikiMatrix.bs-sv.tsv 38600
320
+ WikiMatrix.bs-ta.tsv 15962
321
+ WikiMatrix.bs-te.tsv 12974
322
+ WikiMatrix.bs-tl.tsv 13894
323
+ WikiMatrix.bs-tr.tsv 33212
324
+ WikiMatrix.bs-uk.tsv 39682
325
+ WikiMatrix.bs-vi.tsv 38866
326
+ WikiMatrix.bs-zh.tsv 31707
327
+ WikiMatrix.ca-ceb.tsv 14847
328
+ WikiMatrix.ca-cs.tsv 100782
329
+ WikiMatrix.ca-da.tsv 86539
330
+ WikiMatrix.ca-de.tsv 180321
331
+ WikiMatrix.ca-el.tsv 90118
332
+ WikiMatrix.ca-en.tsv 1205908
333
+ WikiMatrix.ca-eo.tsv 81716
334
+ WikiMatrix.ca-es.tsv 1580036
335
+ WikiMatrix.ca-et.tsv 54756
336
+ WikiMatrix.ca-eu.tsv 77232
337
+ WikiMatrix.ca-fa.tsv 44064
338
+ WikiMatrix.ca-fi.tsv 83094
339
+ WikiMatrix.ca-fo.tsv 13082
340
+ WikiMatrix.ca-fr.tsv 490870
341
+ WikiMatrix.ca-fy.tsv 13000
342
+ WikiMatrix.ca-gl.tsv 268445
343
+ WikiMatrix.ca-he.tsv 84339
344
+ WikiMatrix.ca-hi.tsv 37348
345
+ WikiMatrix.ca-hr.tsv 57726
346
+ WikiMatrix.ca-hu.tsv 92229
347
+ WikiMatrix.ca-id.tsv 107262
348
+ WikiMatrix.ca-is.tsv 23961
349
+ WikiMatrix.ca-it.tsv 316207
350
+ WikiMatrix.ca-ja.tsv 103898
351
+ WikiMatrix.ca-ka.tsv 11585
352
+ WikiMatrix.ca-kk.tsv 12931
353
+ WikiMatrix.ca-ko.tsv 52062
354
+ WikiMatrix.ca-la.tsv 12936
355
+ WikiMatrix.ca-lb.tsv 12167
356
+ WikiMatrix.ca-lt.tsv 45454
357
+ WikiMatrix.ca-mk.tsv 61863
358
+ WikiMatrix.ca-ml.tsv 45785
359
+ WikiMatrix.ca-mr.tsv 56224
360
+ WikiMatrix.ca-nds.tsv 16849
361
+ WikiMatrix.ca-ne.tsv 17559
362
+ WikiMatrix.ca-nl.tsv 144699
363
+ WikiMatrix.ca-no.tsv 102814
364
+ WikiMatrix.ca-oc.tsv 57688
365
+ WikiMatrix.ca-pl.tsv 121144
366
+ WikiMatrix.ca-pt.tsv 358872
367
+ WikiMatrix.ca-ro.tsv 110611
368
+ WikiMatrix.ca-ru.tsv 169694
369
+ WikiMatrix.ca-sh.tsv 52130
370
+ WikiMatrix.ca-si.tsv 52526
371
+ WikiMatrix.ca-sk.tsv 50258
372
+ WikiMatrix.ca-sl.tsv 57635
373
+ WikiMatrix.ca-sq.tsv 34778
374
+ WikiMatrix.ca-sr.tsv 67675
375
+ WikiMatrix.ca-sv.tsv 102757
376
+ WikiMatrix.ca-sw.tsv 14172
377
+ WikiMatrix.ca-ta.tsv 30492
378
+ WikiMatrix.ca-te.tsv 35458
379
+ WikiMatrix.ca-tl.tsv 31806
380
+ WikiMatrix.ca-tr.tsv 77056
381
+ WikiMatrix.ca-tt.tsv 16252
382
+ WikiMatrix.ca-uk.tsv 98316
383
+ WikiMatrix.ca-vi.tsv 106890
384
+ WikiMatrix.ca-zh.tsv 90642
385
+ WikiMatrix.ceb-cs.tsv 13961
386
+ WikiMatrix.ceb-de.tsv 22557
387
+ WikiMatrix.ceb-en.tsv 29061
388
+ WikiMatrix.ceb-es.tsv 27593
389
+ WikiMatrix.ceb-fi.tsv 10552
390
+ WikiMatrix.ceb-fr.tsv 24359
391
+ WikiMatrix.ceb-hu.tsv 12546
392
+ WikiMatrix.ceb-it.tsv 24544
393
+ WikiMatrix.ceb-ja.tsv 14628
394
+ WikiMatrix.ceb-nl.tsv 15981
395
+ WikiMatrix.ceb-no.tsv 10617
396
+ WikiMatrix.ceb-pl.tsv 17744
397
+ WikiMatrix.ceb-pt.tsv 20982
398
+ WikiMatrix.ceb-ro.tsv 11740
399
+ WikiMatrix.ceb-ru.tsv 21786
400
+ WikiMatrix.ceb-sv.tsv 55991
401
+ WikiMatrix.ceb-uk.tsv 12630
402
+ WikiMatrix.cs-da.tsv 75869
403
+ WikiMatrix.cs-de.tsv 233859
404
+ WikiMatrix.cs-el.tsv 70243
405
+ WikiMatrix.cs-en.tsv 519194
406
+ WikiMatrix.cs-eo.tsv 75647
407
+ WikiMatrix.cs-es.tsv 181522
408
+ WikiMatrix.cs-et.tsv 62499
409
+ WikiMatrix.cs-eu.tsv 36854
410
+ WikiMatrix.cs-fa.tsv 45233
411
+ WikiMatrix.cs-fi.tsv 95910
412
+ WikiMatrix.cs-fr.tsv 185766
413
+ WikiMatrix.cs-fy.tsv 10155
414
+ WikiMatrix.cs-gl.tsv 54156
415
+ WikiMatrix.cs-he.tsv 72677
416
+ WikiMatrix.cs-hi.tsv 38939
417
+ WikiMatrix.cs-hr.tsv 63902
418
+ WikiMatrix.cs-hu.tsv 105871
419
+ WikiMatrix.cs-id.tsv 78669
420
+ WikiMatrix.cs-is.tsv 23143
421
+ WikiMatrix.cs-it.tsv 161101
422
+ WikiMatrix.cs-ja.tsv 105593
423
+ WikiMatrix.cs-ka.tsv 10280
424
+ WikiMatrix.cs-kk.tsv 15269
425
+ WikiMatrix.cs-ko.tsv 53009
426
+ WikiMatrix.cs-la.tsv 11106
427
+ WikiMatrix.cs-lt.tsv 55863
428
+ WikiMatrix.cs-mk.tsv 51965
429
+ WikiMatrix.cs-ml.tsv 36217
430
+ WikiMatrix.cs-mr.tsv 41772
431
+ WikiMatrix.cs-nds.tsv 14694
432
+ WikiMatrix.cs-ne.tsv 15583
433
+ WikiMatrix.cs-nl.tsv 139344
434
+ WikiMatrix.cs-no.tsv 86494
435
+ WikiMatrix.cs-oc.tsv 11347
436
+ WikiMatrix.cs-pl.tsv 176644
437
+ WikiMatrix.cs-pt.tsv 153498
438
+ WikiMatrix.cs-ro.tsv 82650
439
+ WikiMatrix.cs-ru.tsv 186997
440
+ WikiMatrix.cs-sh.tsv 50524
441
+ WikiMatrix.cs-si.tsv 37450
442
+ WikiMatrix.cs-sk.tsv 474501
443
+ WikiMatrix.cs-sl.tsv 64723
444
+ WikiMatrix.cs-sq.tsv 30247
445
+ WikiMatrix.cs-sr.tsv 63977
446
+ WikiMatrix.cs-sv.tsv 97411
447
+ WikiMatrix.cs-sw.tsv 15456
448
+ WikiMatrix.cs-ta.tsv 31623
449
+ WikiMatrix.cs-te.tsv 34268
450
+ WikiMatrix.cs-tl.tsv 25877
451
+ WikiMatrix.cs-tr.tsv 75298
452
+ WikiMatrix.cs-tt.tsv 14187
453
+ WikiMatrix.cs-uk.tsv 104982
454
+ WikiMatrix.cs-vi.tsv 74800
455
+ WikiMatrix.cs-zh.tsv 80380
456
+ WikiMatrix.da-de.tsv 180346
457
+ WikiMatrix.da-el.tsv 54103
458
+ WikiMatrix.da-en.tsv 436051
459
+ WikiMatrix.da-eo.tsv 39229
460
+ WikiMatrix.da-es.tsv 140600
461
+ WikiMatrix.da-et.tsv 45476
462
+ WikiMatrix.da-eu.tsv 26471
463
+ WikiMatrix.da-fa.tsv 29956
464
+ WikiMatrix.da-fi.tsv 75305
465
+ WikiMatrix.da-fo.tsv 12572
466
+ WikiMatrix.da-fr.tsv 142489
467
+ WikiMatrix.da-gl.tsv 44177
468
+ WikiMatrix.da-he.tsv 55865
469
+ WikiMatrix.da-hi.tsv 25361
470
+ WikiMatrix.da-hr.tsv 43287
471
+ WikiMatrix.da-hu.tsv 69597
472
+ WikiMatrix.da-id.tsv 63913
473
+ WikiMatrix.da-is.tsv 20952
474
+ WikiMatrix.da-it.tsv 115905
475
+ WikiMatrix.da-ja.tsv 76251
476
+ WikiMatrix.da-ko.tsv 37016
477
+ WikiMatrix.da-lt.tsv 35446
478
+ WikiMatrix.da-mk.tsv 39837
479
+ WikiMatrix.da-ml.tsv 30210
480
+ WikiMatrix.da-mr.tsv 35952
481
+ WikiMatrix.da-nds.tsv 11399
482
+ WikiMatrix.da-ne.tsv 12258
483
+ WikiMatrix.da-nl.tsv 110077
484
+ WikiMatrix.da-no.tsv 303266
485
+ WikiMatrix.da-pl.tsv 89734
486
+ WikiMatrix.da-pt.tsv 123217
487
+ WikiMatrix.da-ro.tsv 70268
488
+ WikiMatrix.da-ru.tsv 109086
489
+ WikiMatrix.da-sh.tsv 37811
490
+ WikiMatrix.da-si.tsv 32338
491
+ WikiMatrix.da-sk.tsv 39731
492
+ WikiMatrix.da-sl.tsv 40166
493
+ WikiMatrix.da-sq.tsv 23038
494
+ WikiMatrix.da-sr.tsv 43677
495
+ WikiMatrix.da-sv.tsv 168311
496
+ WikiMatrix.da-sw.tsv 11561
497
+ WikiMatrix.da-ta.tsv 20656
498
+ WikiMatrix.da-te.tsv 21459
499
+ WikiMatrix.da-tl.tsv 23770
500
+ WikiMatrix.da-tr.tsv 55021
501
+ WikiMatrix.da-tt.tsv 11511
502
+ WikiMatrix.da-uk.tsv 62966
503
+ WikiMatrix.da-vi.tsv 68811
504
+ WikiMatrix.da-zh.tsv 57975
505
+ WikiMatrix.de-el.tsv 95377
506
+ WikiMatrix.de-en.tsv 1573437
507
+ WikiMatrix.de-eo.tsv 186502
508
+ WikiMatrix.de-es.tsv 418724
509
+ WikiMatrix.de-et.tsv 106627
510
+ WikiMatrix.de-eu.tsv 53517
511
+ WikiMatrix.de-fa.tsv 66193
512
+ WikiMatrix.de-fi.tsv 163341
513
+ WikiMatrix.de-fo.tsv 14842
514
+ WikiMatrix.de-fr.tsv 626166
515
+ WikiMatrix.de-fy.tsv 16523
516
+ WikiMatrix.de-gl.tsv 80842
517
+ WikiMatrix.de-gom.tsv 10721
518
+ WikiMatrix.de-he.tsv 109703
519
+ WikiMatrix.de-hi.tsv 57760
520
+ WikiMatrix.de-hr.tsv 87640
521
+ WikiMatrix.de-hu.tsv 192730
522
+ WikiMatrix.de-hy.tsv 11529
523
+ WikiMatrix.de-id.tsv 107890
524
+ WikiMatrix.de-is.tsv 34569
525
+ WikiMatrix.de-it.tsv 388342
526
+ WikiMatrix.de-ja.tsv 217547
527
+ WikiMatrix.de-ka.tsv 15369
528
+ WikiMatrix.de-kk.tsv 23972
529
+ WikiMatrix.de-ko.tsv 82280
530
+ WikiMatrix.de-la.tsv 17846
531
+ WikiMatrix.de-lb.tsv 26924
532
+ WikiMatrix.de-lt.tsv 78962
533
+ WikiMatrix.de-mk.tsv 64773
534
+ WikiMatrix.de-ml.tsv 51618
535
+ WikiMatrix.de-mr.tsv 58672
536
+ WikiMatrix.de-nds.tsv 75590
537
+ WikiMatrix.de-ne.tsv 21897
538
+ WikiMatrix.de-nl.tsv 472831
539
+ WikiMatrix.de-no.tsv 207477
540
+ WikiMatrix.de-oc.tsv 17152
541
+ WikiMatrix.de-pl.tsv 285039
542
+ WikiMatrix.de-pt.tsv 294059
543
+ WikiMatrix.de-rm.tsv 10576
544
+ WikiMatrix.de-ro.tsv 129013
545
+ WikiMatrix.de-ru.tsv 368206
546
+ WikiMatrix.de-sh.tsv 68373
547
+ WikiMatrix.de-si.tsv 50991
548
+ WikiMatrix.de-sk.tsv 94959
549
+ WikiMatrix.de-sl.tsv 106666
550
+ WikiMatrix.de-sq.tsv 51177
551
+ WikiMatrix.de-sr.tsv 81479
552
+ WikiMatrix.de-sv.tsv 216938
553
+ WikiMatrix.de-sw.tsv 20702
554
+ WikiMatrix.de-ta.tsv 58600
555
+ WikiMatrix.de-te.tsv 57957
556
+ WikiMatrix.de-tg.tsv 11121
557
+ WikiMatrix.de-tl.tsv 32893
558
+ WikiMatrix.de-tr.tsv 127051
559
+ WikiMatrix.de-tt.tsv 23087
560
+ WikiMatrix.de-uk.tsv 165076
561
+ WikiMatrix.de-vi.tsv 107022
562
+ WikiMatrix.de-wuu.tsv 11173
563
+ WikiMatrix.de-zh.tsv 134077
564
+ WikiMatrix.el-en.tsv 620801
565
+ WikiMatrix.el-eo.tsv 39852
566
+ WikiMatrix.el-es.tsv 145191
567
+ WikiMatrix.el-et.tsv 41026
568
+ WikiMatrix.el-eu.tsv 23862
569
+ WikiMatrix.el-fa.tsv 35116
570
+ WikiMatrix.el-fi.tsv 55435
571
+ WikiMatrix.el-fr.tsv 137073
572
+ WikiMatrix.el-gl.tsv 48685
573
+ WikiMatrix.el-he.tsv 56833
574
+ WikiMatrix.el-hi.tsv 26307
575
+ WikiMatrix.el-hr.tsv 43565
576
+ WikiMatrix.el-hu.tsv 64636
577
+ WikiMatrix.el-id.tsv 73368
578
+ WikiMatrix.el-is.tsv 15794
579
+ WikiMatrix.el-it.tsv 119290
580
+ WikiMatrix.el-ja.tsv 69478
581
+ WikiMatrix.el-ko.tsv 35634
582
+ WikiMatrix.el-lt.tsv 34372
583
+ WikiMatrix.el-mk.tsv 52936
584
+ WikiMatrix.el-ml.tsv 27124
585
+ WikiMatrix.el-mr.tsv 32288
586
+ WikiMatrix.el-nl.tsv 76721
587
+ WikiMatrix.el-no.tsv 60863
588
+ WikiMatrix.el-pl.tsv 77338
589
+ WikiMatrix.el-pt.tsv 144004
590
+ WikiMatrix.el-ro.tsv 78731
591
+ WikiMatrix.el-ru.tsv 114815
592
+ WikiMatrix.el-sh.tsv 38130
593
+ WikiMatrix.el-si.tsv 31562
594
+ WikiMatrix.el-sk.tsv 35679
595
+ WikiMatrix.el-sl.tsv 46819
596
+ WikiMatrix.el-sq.tsv 28074
597
+ WikiMatrix.el-sr.tsv 52918
598
+ WikiMatrix.el-sv.tsv 62158
599
+ WikiMatrix.el-sw.tsv 11271
600
+ WikiMatrix.el-ta.tsv 16938
601
+ WikiMatrix.el-te.tsv 18789
602
+ WikiMatrix.el-tl.tsv 20861
603
+ WikiMatrix.el-tr.tsv 56445
604
+ WikiMatrix.el-uk.tsv 68884
605
+ WikiMatrix.el-vi.tsv 75576
606
+ WikiMatrix.el-zh.tsv 62957
607
+ WikiMatrix.en-eo.tsv 298200
608
+ WikiMatrix.en-es.tsv 3377911
609
+ WikiMatrix.en-et.tsv 243869
610
+ WikiMatrix.en-eu.tsv 119479
611
+ WikiMatrix.en-fa.tsv 303805
612
+ WikiMatrix.en-fi.tsv 375723
613
+ WikiMatrix.en-fo.tsv 32317
614
+ WikiMatrix.en-fr.tsv 2757883
615
+ WikiMatrix.en-fy.tsv 32249
616
+ WikiMatrix.en-gl.tsv 446151
617
+ WikiMatrix.en-he.tsv 545744
618
+ WikiMatrix.en-hi.tsv 231459
619
+ WikiMatrix.en-hr.tsv 259498
620
+ WikiMatrix.en-hu.tsv 488318
621
+ WikiMatrix.en-id.tsv 1019170
622
+ WikiMatrix.en-io.tsv 11209
623
+ WikiMatrix.en-is.tsv 85991
624
+ WikiMatrix.en-it.tsv 2126083
625
+ WikiMatrix.en-ja.tsv 851706
626
+ WikiMatrix.en-jv.tsv 13048
627
+ WikiMatrix.en-ka.tsv 12807
628
+ WikiMatrix.en-kk.tsv 20053
629
+ WikiMatrix.en-ko.tsv 306900
630
+ WikiMatrix.en-la.tsv 32280
631
+ WikiMatrix.en-lb.tsv 22281
632
+ WikiMatrix.en-lmo.tsv 10434
633
+ WikiMatrix.en-lt.tsv 157525
634
+ WikiMatrix.en-mg.tsv 13959
635
+ WikiMatrix.en-mk.tsv 395394
636
+ WikiMatrix.en-ml.tsv 71508
637
+ WikiMatrix.en-mr.tsv 124308
638
+ WikiMatrix.en-mwl.tsv 10443
639
+ WikiMatrix.en-nds_nl.tsv 10550
640
+ WikiMatrix.en-nds.tsv 43401
641
+ WikiMatrix.en-ne.tsv 15015
642
+ WikiMatrix.en-nl.tsv 796507
643
+ WikiMatrix.en-no.tsv 636472
644
+ WikiMatrix.en-oc.tsv 37331
645
+ WikiMatrix.en-pl.tsv 668646
646
+ WikiMatrix.en-pt.tsv 2461557
647
+ WikiMatrix.en-ro.tsv 631485
648
+ WikiMatrix.en-ru.tsv 1661908
649
+ WikiMatrix.en-sh.tsv 224146
650
+ WikiMatrix.en-simple.tsv 599340
651
+ WikiMatrix.en-si.tsv 115045
652
+ WikiMatrix.en-sk.tsv 178984
653
+ WikiMatrix.en-sl.tsv 318027
654
+ WikiMatrix.en-sq.tsv 180111
655
+ WikiMatrix.en-sr.tsv 395568
656
+ WikiMatrix.en-sv.tsv 546288
657
+ WikiMatrix.en-sw.tsv 51386
658
+ WikiMatrix.en-ta.tsv 95161
659
+ WikiMatrix.en-te.tsv 91910
660
+ WikiMatrix.en-tg.tsv 15002
661
+ WikiMatrix.en-tl.tsv 75446
662
+ WikiMatrix.en-tr.tsv 477735
663
+ WikiMatrix.en-tt.tsv 32153
664
+ WikiMatrix.en-ug.tsv 10698
665
+ WikiMatrix.en-uk.tsv 681114
666
+ WikiMatrix.en-vi.tsv 1073751
667
+ WikiMatrix.en-wuu.tsv 17675
668
+ WikiMatrix.en-zh.tsv 786511
669
+ WikiMatrix.eo-es.tsv 149827
670
+ WikiMatrix.eo-et.tsv 31921
671
+ WikiMatrix.eo-eu.tsv 25283
672
+ WikiMatrix.eo-fa.tsv 23234
673
+ WikiMatrix.eo-fi.tsv 46112
674
+ WikiMatrix.eo-fr.tsv 134088
675
+ WikiMatrix.eo-gl.tsv 46309
676
+ WikiMatrix.eo-he.tsv 39004
677
+ WikiMatrix.eo-hi.tsv 22778
678
+ WikiMatrix.eo-hr.tsv 29259
679
+ WikiMatrix.eo-hu.tsv 57398
680
+ WikiMatrix.eo-id.tsv 46010
681
+ WikiMatrix.eo-is.tsv 15379
682
+ WikiMatrix.eo-it.tsv 101947
683
+ WikiMatrix.eo-ja.tsv 48733
684
+ WikiMatrix.eo-ko.tsv 26463
685
+ WikiMatrix.eo-lt.tsv 28059
686
+ WikiMatrix.eo-mk.tsv 30254
687
+ WikiMatrix.eo-ml.tsv 28437
688
+ WikiMatrix.eo-mr.tsv 28622
689
+ WikiMatrix.eo-nds.tsv 11812
690
+ WikiMatrix.eo-nl.tsv 81182
691
+ WikiMatrix.eo-no.tsv 47185
692
+ WikiMatrix.eo-pl.tsv 77317
693
+ WikiMatrix.eo-pt.tsv 91599
694
+ WikiMatrix.eo-ro.tsv 43594
695
+ WikiMatrix.eo-ru.tsv 81964
696
+ WikiMatrix.eo-sh.tsv 26394
697
+ WikiMatrix.eo-si.tsv 28638
698
+ WikiMatrix.eo-sk.tsv 41405
699
+ WikiMatrix.eo-sl.tsv 32362
700
+ WikiMatrix.eo-sq.tsv 19844
701
+ WikiMatrix.eo-sr.tsv 36234
702
+ WikiMatrix.eo-sv.tsv 53442
703
+ WikiMatrix.eo-ta.tsv 16284
704
+ WikiMatrix.eo-te.tsv 19804
705
+ WikiMatrix.eo-tl.tsv 17779
706
+ WikiMatrix.eo-tr.tsv 37653
707
+ WikiMatrix.eo-uk.tsv 50410
708
+ WikiMatrix.eo-vi.tsv 42253
709
+ WikiMatrix.eo-zh.tsv 39852
710
+ WikiMatrix.es-et.tsv 89252
711
+ WikiMatrix.es-eu.tsv 154280
712
+ WikiMatrix.es-fa.tsv 83056
713
+ WikiMatrix.es-fi.tsv 155486
714
+ WikiMatrix.es-fo.tsv 21382
715
+ WikiMatrix.es-fr.tsv 905760
716
+ WikiMatrix.es-fy.tsv 21959
717
+ WikiMatrix.es-gl.tsv 610824
718
+ WikiMatrix.es-gom.tsv 13914
719
+ WikiMatrix.es-he.tsv 153353
720
+ WikiMatrix.es-hi.tsv 71866
721
+ WikiMatrix.es-hr.tsv 94295
722
+ WikiMatrix.es-hu.tsv 167286
723
+ WikiMatrix.es-hy.tsv 13124
724
+ WikiMatrix.es-id.tsv 198191
725
+ WikiMatrix.es-is.tsv 42377
726
+ WikiMatrix.es-it.tsv 671298
727
+ WikiMatrix.es-ja.tsv 219260
728
+ WikiMatrix.es-jv.tsv 12254
729
+ WikiMatrix.es-ka.tsv 16433
730
+ WikiMatrix.es-kk.tsv 26257
731
+ WikiMatrix.es-ko.tsv 108385
732
+ WikiMatrix.es-la.tsv 20803
733
+ WikiMatrix.es-lb.tsv 19884
734
+ WikiMatrix.es-lt.tsv 76193
735
+ WikiMatrix.es-mk.tsv 92702
736
+ WikiMatrix.es-ml.tsv 65508
737
+ WikiMatrix.es-mr.tsv 98088
738
+ WikiMatrix.es-nds.tsv 28568
739
+ WikiMatrix.es-ne.tsv 25483
740
+ WikiMatrix.es-nl.tsv 272587
741
+ WikiMatrix.es-no.tsv 181719
742
+ WikiMatrix.es-oc.tsv 35804
743
+ WikiMatrix.es-pl.tsv 235464
744
+ WikiMatrix.es-pt.tsv 923724
745
+ WikiMatrix.es-ro.tsv 183489
746
+ WikiMatrix.es-ru.tsv 393314
747
+ WikiMatrix.es-sh.tsv 81086
748
+ WikiMatrix.es-si.tsv 84161
749
+ WikiMatrix.es-sk.tsv 81589
750
+ WikiMatrix.es-sl.tsv 93744
751
+ WikiMatrix.es-sq.tsv 53815
752
+ WikiMatrix.es-sr.tsv 107044
753
+ WikiMatrix.es-sv.tsv 181152
754
+ WikiMatrix.es-sw.tsv 21991
755
+ WikiMatrix.es-ta.tsv 57223
756
+ WikiMatrix.es-te.tsv 71668
757
+ WikiMatrix.es-tl.tsv 48392
758
+ WikiMatrix.es-tr.tsv 147352
759
+ WikiMatrix.es-tt.tsv 26290
760
+ WikiMatrix.es-uk.tsv 187294
761
+ WikiMatrix.es-vi.tsv 206705
762
+ WikiMatrix.es-wuu.tsv 12873
763
+ WikiMatrix.es-zh.tsv 174315
764
+ WikiMatrix.et-eu.tsv 22986
765
+ WikiMatrix.et-fa.tsv 24256
766
+ WikiMatrix.et-fi.tsv 70662
767
+ WikiMatrix.et-fr.tsv 85947
768
+ WikiMatrix.et-gl.tsv 32333
769
+ WikiMatrix.et-he.tsv 39824
770
+ WikiMatrix.et-hi.tsv 20988
771
+ WikiMatrix.et-hr.tsv 33532
772
+ WikiMatrix.et-hu.tsv 56432
773
+ WikiMatrix.et-id.tsv 41272
774
+ WikiMatrix.et-is.tsv 14970
775
+ WikiMatrix.et-it.tsv 75461
776
+ WikiMatrix.et-ja.tsv 57643
777
+ WikiMatrix.et-ko.tsv 29213
778
+ WikiMatrix.et-lt.tsv 35847
779
+ WikiMatrix.et-mk.tsv 32911
780
+ WikiMatrix.et-ml.tsv 20233
781
+ WikiMatrix.et-mr.tsv 21549
782
+ WikiMatrix.et-nl.tsv 72505
783
+ WikiMatrix.et-no.tsv 49810
784
+ WikiMatrix.et-pl.tsv 73151
785
+ WikiMatrix.et-pt.tsv 76955
786
+ WikiMatrix.et-ro.tsv 48427
787
+ WikiMatrix.et-ru.tsv 96345
788
+ WikiMatrix.et-sh.tsv 27195
789
+ WikiMatrix.et-si.tsv 19538
790
+ WikiMatrix.et-sk.tsv 34194
791
+ WikiMatrix.et-sl.tsv 35300
792
+ WikiMatrix.et-sq.tsv 18948
793
+ WikiMatrix.et-sr.tsv 34016
794
+ WikiMatrix.et-sv.tsv 58124
795
+ WikiMatrix.et-ta.tsv 16587
796
+ WikiMatrix.et-te.tsv 16967
797
+ WikiMatrix.et-tl.tsv 15617
798
+ WikiMatrix.et-tr.tsv 43264
799
+ WikiMatrix.et-uk.tsv 56089
800
+ WikiMatrix.et-vi.tsv 40281
801
+ WikiMatrix.et-zh.tsv 44047
802
+ WikiMatrix.eu-fa.tsv 14476
803
+ WikiMatrix.eu-fi.tsv 33576
804
+ WikiMatrix.eu-fr.tsv 65731
805
+ WikiMatrix.eu-gl.tsv 43100
806
+ WikiMatrix.eu-he.tsv 25498
807
+ WikiMatrix.eu-hi.tsv 13049
808
+ WikiMatrix.eu-hr.tsv 21394
809
+ WikiMatrix.eu-hu.tsv 35098
810
+ WikiMatrix.eu-id.tsv 27036
811
+ WikiMatrix.eu-is.tsv 10055
812
+ WikiMatrix.eu-it.tsv 54958
813
+ WikiMatrix.eu-ja.tsv 33986
814
+ WikiMatrix.eu-ko.tsv 18156
815
+ WikiMatrix.eu-lt.tsv 19463
816
+ WikiMatrix.eu-mk.tsv 19208
817
+ WikiMatrix.eu-ml.tsv 11113
818
+ WikiMatrix.eu-mr.tsv 10301
819
+ WikiMatrix.eu-nl.tsv 44131
820
+ WikiMatrix.eu-no.tsv 29644
821
+ WikiMatrix.eu-pl.tsv 43382
822
+ WikiMatrix.eu-pt.tsv 58821
823
+ WikiMatrix.eu-ro.tsv 30397
824
+ WikiMatrix.eu-ru.tsv 47206
825
+ WikiMatrix.eu-sh.tsv 19346
826
+ WikiMatrix.eu-sk.tsv 20316
827
+ WikiMatrix.eu-sl.tsv 20626
828
+ WikiMatrix.eu-sq.tsv 12941
829
+ WikiMatrix.eu-sr.tsv 21433
830
+ WikiMatrix.eu-sv.tsv 38206
831
+ WikiMatrix.eu-ta.tsv 13885
832
+ WikiMatrix.eu-te.tsv 11444
833
+ WikiMatrix.eu-tr.tsv 29185
834
+ WikiMatrix.eu-uk.tsv 30006
835
+ WikiMatrix.eu-vi.tsv 25722
836
+ WikiMatrix.eu-zh.tsv 23990
837
+ WikiMatrix.fa-fi.tsv 34069
838
+ WikiMatrix.fa-fr.tsv 71278
839
+ WikiMatrix.fa-gl.tsv 25353
840
+ WikiMatrix.fa-he.tsv 36955
841
+ WikiMatrix.fa-hi.tsv 20557
842
+ WikiMatrix.fa-hr.tsv 24987
843
+ WikiMatrix.fa-hu.tsv 39139
844
+ WikiMatrix.fa-id.tsv 46991
845
+ WikiMatrix.fa-it.tsv 64468
846
+ WikiMatrix.fa-ja.tsv 46942
847
+ WikiMatrix.fa-ko.tsv 26572
848
+ WikiMatrix.fa-lt.tsv 20032
849
+ WikiMatrix.fa-mk.tsv 27555
850
+ WikiMatrix.fa-ml.tsv 11083
851
+ WikiMatrix.fa-mr.tsv 10684
852
+ WikiMatrix.fa-nl.tsv 49211
853
+ WikiMatrix.fa-no.tsv 32827
854
+ WikiMatrix.fa-pl.tsv 50792
855
+ WikiMatrix.fa-pt.tsv 77606
856
+ WikiMatrix.fa-ro.tsv 40515
857
+ WikiMatrix.fa-ru.tsv 72954
858
+ WikiMatrix.fa-sh.tsv 21729
859
+ WikiMatrix.fa-sk.tsv 21717
860
+ WikiMatrix.fa-sl.tsv 24549
861
+ WikiMatrix.fa-sq.tsv 17644
862
+ WikiMatrix.fa-sr.tsv 30075
863
+ WikiMatrix.fa-sv.tsv 42447
864
+ WikiMatrix.fa-ta.tsv 21879
865
+ WikiMatrix.fa-te.tsv 12711
866
+ WikiMatrix.fa-tr.tsv 42681
867
+ WikiMatrix.fa-uk.tsv 41735
868
+ WikiMatrix.fa-vi.tsv 38848
869
+ WikiMatrix.fa-zh.tsv 42042
870
+ WikiMatrix.fi-fr.tsv 156225
871
+ WikiMatrix.fi-gl.tsv 47377
872
+ WikiMatrix.fi-he.tsv 64406
873
+ WikiMatrix.fi-hi.tsv 28707
874
+ WikiMatrix.fi-hr.tsv 48618
875
+ WikiMatrix.fi-hu.tsv 90196
876
+ WikiMatrix.fi-id.tsv 63983
877
+ WikiMatrix.fi-is.tsv 22671
878
+ WikiMatrix.fi-it.tsv 131193
879
+ WikiMatrix.fi-ja.tsv 87559
880
+ WikiMatrix.fi-ko.tsv 43152
881
+ WikiMatrix.fi-lt.tsv 47157
882
+ WikiMatrix.fi-mk.tsv 40253
883
+ WikiMatrix.fi-ml.tsv 29127
884
+ WikiMatrix.fi-mr.tsv 30489
885
+ WikiMatrix.fi-nds.tsv 12120
886
+ WikiMatrix.fi-ne.tsv 10944
887
+ WikiMatrix.fi-nl.tsv 126003
888
+ WikiMatrix.fi-no.tsv 86413
889
+ WikiMatrix.fi-oc.tsv 10219
890
+ WikiMatrix.fi-pl.tsv 119130
891
+ WikiMatrix.fi-pt.tsv 131186
892
+ WikiMatrix.fi-ro.tsv 69926
893
+ WikiMatrix.fi-ru.tsv 139383
894
+ WikiMatrix.fi-sh.tsv 39988
895
+ WikiMatrix.fi-si.tsv 27125
896
+ WikiMatrix.fi-sk.tsv 50645
897
+ WikiMatrix.fi-sl.tsv 46789
898
+ WikiMatrix.fi-sq.tsv 25032
899
+ WikiMatrix.fi-sr.tsv 46945
900
+ WikiMatrix.fi-sv.tsv 126098
901
+ WikiMatrix.fi-sw.tsv 12603
902
+ WikiMatrix.fi-ta.tsv 23818
903
+ WikiMatrix.fi-te.tsv 24903
904
+ WikiMatrix.fi-tl.tsv 21521
905
+ WikiMatrix.fi-tr.tsv 72100
906
+ WikiMatrix.fi-tt.tsv 10236
907
+ WikiMatrix.fi-uk.tsv 76304
908
+ WikiMatrix.fi-vi.tsv 60265
909
+ WikiMatrix.fi-zh.tsv 64244
910
+ WikiMatrix.fo-fr.tsv 18125
911
+ WikiMatrix.fo-it.tsv 15116
912
+ WikiMatrix.fo-nl.tsv 11341
913
+ WikiMatrix.fo-pl.tsv 11846
914
+ WikiMatrix.fo-pt.tsv 17485
915
+ WikiMatrix.fo-ru.tsv 13640
916
+ WikiMatrix.fo-sv.tsv 12903
917
+ WikiMatrix.fr-fy.tsv 18384
918
+ WikiMatrix.fr-gl.tsv 154872
919
+ WikiMatrix.fr-gom.tsv 13233
920
+ WikiMatrix.fr-he.tsv 136974
921
+ WikiMatrix.fr-hi.tsv 60717
922
+ WikiMatrix.fr-hr.tsv 85047
923
+ WikiMatrix.fr-hu.tsv 164733
924
+ WikiMatrix.fr-hy.tsv 12458
925
+ WikiMatrix.fr-id.tsv 161857
926
+ WikiMatrix.fr-is.tsv 38273
927
+ WikiMatrix.fr-it.tsv 744432
928
+ WikiMatrix.fr-ja.tsv 214852
929
+ WikiMatrix.fr-jv.tsv 10933
930
+ WikiMatrix.fr-ka.tsv 17291
931
+ WikiMatrix.fr-kk.tsv 24401
932
+ WikiMatrix.fr-ko.tsv 89109
933
+ WikiMatrix.fr-la.tsv 18936
934
+ WikiMatrix.fr-lb.tsv 18459
935
+ WikiMatrix.fr-lt.tsv 71060
936
+ WikiMatrix.fr-mg.tsv 12043
937
+ WikiMatrix.fr-mk.tsv 83969
938
+ WikiMatrix.fr-ml.tsv 62719
939
+ WikiMatrix.fr-mr.tsv 83646
940
+ WikiMatrix.fr-nds.tsv 25658
941
+ WikiMatrix.fr-ne.tsv 25868
942
+ WikiMatrix.fr-nl.tsv 331777
943
+ WikiMatrix.fr-no.tsv 166978
944
+ WikiMatrix.fr-oc.tsv 124226
945
+ WikiMatrix.fr-pl.tsv 255763
946
+ WikiMatrix.fr-pt.tsv 558861
947
+ WikiMatrix.fr-ro.tsv 206443
948
+ WikiMatrix.fr-ru.tsv 410005
949
+ WikiMatrix.fr-sh.tsv 72887
950
+ WikiMatrix.fr-si.tsv 74448
951
+ WikiMatrix.fr-sk.tsv 83657
952
+ WikiMatrix.fr-sl.tsv 86073
953
+ WikiMatrix.fr-sq.tsv 48654
954
+ WikiMatrix.fr-sr.tsv 92133
955
+ WikiMatrix.fr-sv.tsv 186370
956
+ WikiMatrix.fr-sw.tsv 19908
957
+ WikiMatrix.fr-ta.tsv 56336
958
+ WikiMatrix.fr-te.tsv 65809
959
+ WikiMatrix.fr-tl.tsv 42182
960
+ WikiMatrix.fr-tr.tsv 130472
961
+ WikiMatrix.fr-tt.tsv 26231
962
+ WikiMatrix.fr-uk.tsv 170063
963
+ WikiMatrix.fr-vi.tsv 165937
964
+ WikiMatrix.fr-wuu.tsv 11999
965
+ WikiMatrix.fr-zh.tsv 157013
966
+ WikiMatrix.fy-it.tsv 17275
967
+ WikiMatrix.fy-nl.tsv 38648
968
+ WikiMatrix.fy-pl.tsv 12437
969
+ WikiMatrix.fy-pt.tsv 18487
970
+ WikiMatrix.fy-ru.tsv 14073
971
+ WikiMatrix.fy-sv.tsv 13136
972
+ WikiMatrix.gl-he.tsv 41858
973
+ WikiMatrix.gl-hi.tsv 21454
974
+ WikiMatrix.gl-hr.tsv 33940
975
+ WikiMatrix.gl-hu.tsv 50347
976
+ WikiMatrix.gl-id.tsv 56200
977
+ WikiMatrix.gl-is.tsv 14870
978
+ WikiMatrix.gl-it.tsv 120462
979
+ WikiMatrix.gl-ja.tsv 50922
980
+ WikiMatrix.gl-ko.tsv 28478
981
+ WikiMatrix.gl-lt.tsv 27669
982
+ WikiMatrix.gl-mk.tsv 35727
983
+ WikiMatrix.gl-ml.tsv 29945
984
+ WikiMatrix.gl-mr.tsv 39026
985
+ WikiMatrix.gl-nds.tsv 10043
986
+ WikiMatrix.gl-ne.tsv 11932
987
+ WikiMatrix.gl-nl.tsv 66259
988
+ WikiMatrix.gl-no.tsv 52272
989
+ WikiMatrix.gl-oc.tsv 17008
990
+ WikiMatrix.gl-pl.tsv 65374
991
+ WikiMatrix.gl-pt.tsv 227507
992
+ WikiMatrix.gl-ro.tsv 56079
993
+ WikiMatrix.gl-ru.tsv 84460
994
+ WikiMatrix.gl-sh.tsv 30941
995
+ WikiMatrix.gl-si.tsv 36721
996
+ WikiMatrix.gl-sk.tsv 29118
997
+ WikiMatrix.gl-sl.tsv 33881
998
+ WikiMatrix.gl-sq.tsv 20614
999
+ WikiMatrix.gl-sr.tsv 39519
1000
+ WikiMatrix.gl-sv.tsv 54302
1001
+ WikiMatrix.gl-ta.tsv 15445
1002
+ WikiMatrix.gl-te.tsv 17166
1003
+ WikiMatrix.gl-tl.tsv 22377
1004
+ WikiMatrix.gl-tr.tsv 43313
1005
+ WikiMatrix.gl-tt.tsv 12039
1006
+ WikiMatrix.gl-uk.tsv 51273
1007
+ WikiMatrix.gl-vi.tsv 58599
1008
+ WikiMatrix.gl-zh.tsv 46609
1009
+ WikiMatrix.gom-it.tsv 13099
1010
+ WikiMatrix.gom-pt.tsv 11983
1011
+ WikiMatrix.gom-ru.tsv 10566
1012
+ WikiMatrix.he-hi.tsv 28427
1013
+ WikiMatrix.he-hr.tsv 41487
1014
+ WikiMatrix.he-hu.tsv 65954
1015
+ WikiMatrix.he-id.tsv 63296
1016
+ WikiMatrix.he-is.tsv 17590
1017
+ WikiMatrix.he-it.tsv 121221
1018
+ WikiMatrix.he-ja.tsv 82041
1019
+ WikiMatrix.he-ko.tsv 43724
1020
+ WikiMatrix.he-lt.tsv 35179
1021
+ WikiMatrix.he-mk.tsv 42893
1022
+ WikiMatrix.he-ml.tsv 26296
1023
+ WikiMatrix.he-mr.tsv 25941
1024
+ WikiMatrix.he-nl.tsv 86933
1025
+ WikiMatrix.he-no.tsv 64090
1026
+ WikiMatrix.he-pl.tsv 84210
1027
+ WikiMatrix.he-pt.tsv 133567
1028
+ WikiMatrix.he-ro.tsv 67831
1029
+ WikiMatrix.he-ru.tsv 131378
1030
+ WikiMatrix.he-sh.tsv 35352
1031
+ WikiMatrix.he-si.tsv 21382
1032
+ WikiMatrix.he-sk.tsv 36947
1033
+ WikiMatrix.he-sl.tsv 38755
1034
+ WikiMatrix.he-sq.tsv 23046
1035
+ WikiMatrix.he-sr.tsv 45889
1036
+ WikiMatrix.he-sv.tsv 67852
1037
+ WikiMatrix.he-sw.tsv 10339
1038
+ WikiMatrix.he-ta.tsv 21839
1039
+ WikiMatrix.he-te.tsv 25488
1040
+ WikiMatrix.he-tl.tsv 13968
1041
+ WikiMatrix.he-tr.tsv 54841
1042
+ WikiMatrix.he-uk.tsv 73310
1043
+ WikiMatrix.he-vi.tsv 66128
1044
+ WikiMatrix.he-zh.tsv 62796
1045
+ WikiMatrix.hi-hr.tsv 21019
1046
+ WikiMatrix.hi-hu.tsv 33900
1047
+ WikiMatrix.hi-id.tsv 31354
1048
+ WikiMatrix.hi-it.tsv 56025
1049
+ WikiMatrix.hi-ja.tsv 35864
1050
+ WikiMatrix.hi-ko.tsv 18367
1051
+ WikiMatrix.hi-lt.tsv 16614
1052
+ WikiMatrix.hi-mk.tsv 24869
1053
+ WikiMatrix.hi-mr.tsv 11686
1054
+ WikiMatrix.hi-ne.tsv 12315
1055
+ WikiMatrix.hi-nl.tsv 40620
1056
+ WikiMatrix.hi-no.tsv 27952
1057
+ WikiMatrix.hi-pl.tsv 44014
1058
+ WikiMatrix.hi-pt.tsv 63743
1059
+ WikiMatrix.hi-ro.tsv 35158
1060
+ WikiMatrix.hi-ru.tsv 56751
1061
+ WikiMatrix.hi-sh.tsv 17960
1062
+ WikiMatrix.hi-sk.tsv 18987
1063
+ WikiMatrix.hi-sl.tsv 21600
1064
+ WikiMatrix.hi-sq.tsv 14770
1065
+ WikiMatrix.hi-sr.tsv 22522
1066
+ WikiMatrix.hi-sv.tsv 40738
1067
+ WikiMatrix.hi-ta.tsv 13224
1068
+ WikiMatrix.hi-te.tsv 18147
1069
+ WikiMatrix.hi-tr.tsv 29786
1070
+ WikiMatrix.hi-uk.tsv 33725
1071
+ WikiMatrix.hi-vi.tsv 26293
1072
+ WikiMatrix.hi-zh.tsv 30167
1073
+ WikiMatrix.hr-hu.tsv 58438
1074
+ WikiMatrix.hr-id.tsv 47104
1075
+ WikiMatrix.hr-is.tsv 14241
1076
+ WikiMatrix.hr-it.tsv 80194
1077
+ WikiMatrix.hr-ja.tsv 48151
1078
+ WikiMatrix.hr-ko.tsv 27662
1079
+ WikiMatrix.hr-lt.tsv 31432
1080
+ WikiMatrix.hr-mk.tsv 52353
1081
+ WikiMatrix.hr-ml.tsv 24061
1082
+ WikiMatrix.hr-mr.tsv 24490
1083
+ WikiMatrix.hr-ne.tsv 10741
1084
+ WikiMatrix.hr-nl.tsv 65007
1085
+ WikiMatrix.hr-no.tsv 48269
1086
+ WikiMatrix.hr-pl.tsv 71529
1087
+ WikiMatrix.hr-pt.tsv 85373
1088
+ WikiMatrix.hr-ro.tsv 51221
1089
+ WikiMatrix.hr-ru.tsv 85888
1090
+ WikiMatrix.hr-sh.tsv 666685
1091
+ WikiMatrix.hr-si.tsv 19842
1092
+ WikiMatrix.hr-sk.tsv 35635
1093
+ WikiMatrix.hr-sl.tsv 53346
1094
+ WikiMatrix.hr-sq.tsv 21471
1095
+ WikiMatrix.hr-sr.tsv 205175
1096
+ WikiMatrix.hr-sv.tsv 56793
1097
+ WikiMatrix.hr-ta.tsv 16692
1098
+ WikiMatrix.hr-te.tsv 16411
1099
+ WikiMatrix.hr-tl.tsv 17463
1100
+ WikiMatrix.hr-tr.tsv 42175
1101
+ WikiMatrix.hr-uk.tsv 55749
1102
+ WikiMatrix.hr-vi.tsv 46750
1103
+ WikiMatrix.hr-zh.tsv 42053
1104
+ WikiMatrix.hu-id.tsv 70813
1105
+ WikiMatrix.hu-is.tsv 20377
1106
+ WikiMatrix.hu-it.tsv 146012
1107
+ WikiMatrix.hu-ja.tsv 99686
1108
+ WikiMatrix.hu-kk.tsv 11558
1109
+ WikiMatrix.hu-ko.tsv 49720
1110
+ WikiMatrix.hu-lt.tsv 48514
1111
+ WikiMatrix.hu-mk.tsv 47880
1112
+ WikiMatrix.hu-ml.tsv 27146
1113
+ WikiMatrix.hu-mr.tsv 28805
1114
+ WikiMatrix.hu-nds.tsv 12598
1115
+ WikiMatrix.hu-ne.tsv 10988
1116
+ WikiMatrix.hu-nl.tsv 121366
1117
+ WikiMatrix.hu-no.tsv 75452
1118
+ WikiMatrix.hu-oc.tsv 10104
1119
+ WikiMatrix.hu-pl.tsv 126850
1120
+ WikiMatrix.hu-pt.tsv 148377
1121
+ WikiMatrix.hu-ro.tsv 87958
1122
+ WikiMatrix.hu-ru.tsv 149514
1123
+ WikiMatrix.hu-sh.tsv 46865
1124
+ WikiMatrix.hu-si.tsv 26089
1125
+ WikiMatrix.hu-sk.tsv 56197
1126
+ WikiMatrix.hu-sl.tsv 55097
1127
+ WikiMatrix.hu-sq.tsv 27366
1128
+ WikiMatrix.hu-sr.tsv 53429
1129
+ WikiMatrix.hu-sv.tsv 88872
1130
+ WikiMatrix.hu-sw.tsv 13743
1131
+ WikiMatrix.hu-ta.tsv 29256
1132
+ WikiMatrix.hu-te.tsv 30768
1133
+ WikiMatrix.hu-tl.tsv 20518
1134
+ WikiMatrix.hu-tr.tsv 75715
1135
+ WikiMatrix.hu-uk.tsv 83066
1136
+ WikiMatrix.hu-vi.tsv 74351
1137
+ WikiMatrix.hu-zh.tsv 75242
1138
+ WikiMatrix.hy-it.tsv 12210
1139
+ WikiMatrix.hy-pt.tsv 11393
1140
+ WikiMatrix.hy-ru.tsv 12074
1141
+ WikiMatrix.id-is.tsv 16944
1142
+ WikiMatrix.id-it.tsv 146885
1143
+ WikiMatrix.id-ja.tsv 77397
1144
+ WikiMatrix.id-jv.tsv 19595
1145
+ WikiMatrix.id-ko.tsv 45970
1146
+ WikiMatrix.id-lt.tsv 33551
1147
+ WikiMatrix.id-mk.tsv 55991
1148
+ WikiMatrix.id-ml.tsv 25693
1149
+ WikiMatrix.id-mr.tsv 23390
1150
+ WikiMatrix.id-ne.tsv 10057
1151
+ WikiMatrix.id-nl.tsv 101197
1152
+ WikiMatrix.id-no.tsv 83641
1153
+ WikiMatrix.id-pl.tsv 93486
1154
+ WikiMatrix.id-pt.tsv 204470
1155
+ WikiMatrix.id-ro.tsv 94439
1156
+ WikiMatrix.id-ru.tsv 127410
1157
+ WikiMatrix.id-sh.tsv 43738
1158
+ WikiMatrix.id-si.tsv 23134
1159
+ WikiMatrix.id-sk.tsv 37954
1160
+ WikiMatrix.id-sl.tsv 46656
1161
+ WikiMatrix.id-sq.tsv 32624
1162
+ WikiMatrix.id-sr.tsv 56109
1163
+ WikiMatrix.id-sv.tsv 79193
1164
+ WikiMatrix.id-sw.tsv 13829
1165
+ WikiMatrix.id-ta.tsv 24647
1166
+ WikiMatrix.id-te.tsv 19049
1167
+ WikiMatrix.id-tl.tsv 21284
1168
+ WikiMatrix.id-tr.tsv 79176
1169
+ WikiMatrix.id-tt.tsv 11627
1170
+ WikiMatrix.id-uk.tsv 73379
1171
+ WikiMatrix.id-vi.tsv 146746
1172
+ WikiMatrix.id-zh.tsv 83566
1173
+ WikiMatrix.is-it.tsv 31787
1174
+ WikiMatrix.is-ja.tsv 18848
1175
+ WikiMatrix.is-lt.tsv 12041
1176
+ WikiMatrix.is-mk.tsv 12532
1177
+ WikiMatrix.is-nl.tsv 27334
1178
+ WikiMatrix.is-no.tsv 22321
1179
+ WikiMatrix.is-pl.tsv 27453
1180
+ WikiMatrix.is-pt.tsv 35263
1181
+ WikiMatrix.is-ro.tsv 20255
1182
+ WikiMatrix.is-ru.tsv 30010
1183
+ WikiMatrix.is-sh.tsv 13271
1184
+ WikiMatrix.is-sk.tsv 13204
1185
+ WikiMatrix.is-sl.tsv 13405
1186
+ WikiMatrix.is-sr.tsv 13764
1187
+ WikiMatrix.is-sv.tsv 28017
1188
+ WikiMatrix.is-tr.tsv 16153
1189
+ WikiMatrix.is-uk.tsv 18889
1190
+ WikiMatrix.is-vi.tsv 16523
1191
+ WikiMatrix.is-zh.tsv 14873
1192
+ WikiMatrix.it-ja.tsv 179031
1193
+ WikiMatrix.it-jv.tsv 11246
1194
+ WikiMatrix.it-ka.tsv 16256
1195
+ WikiMatrix.it-kk.tsv 24825
1196
+ WikiMatrix.it-ko.tsv 83911
1197
+ WikiMatrix.it-la.tsv 17036
1198
+ WikiMatrix.it-lb.tsv 15844
1199
+ WikiMatrix.it-lmo.tsv 11595
1200
+ WikiMatrix.it-lt.tsv 62439
1201
+ WikiMatrix.it-mk.tsv 73015
1202
+ WikiMatrix.it-ml.tsv 58237
1203
+ WikiMatrix.it-mr.tsv 78773
1204
+ WikiMatrix.it-nds.tsv 22202
1205
+ WikiMatrix.it-ne.tsv 24633
1206
+ WikiMatrix.it-nl.tsv 240569
1207
+ WikiMatrix.it-no.tsv 150403
1208
+ WikiMatrix.it-oc.tsv 20093
1209
+ WikiMatrix.it-pl.tsv 219293
1210
+ WikiMatrix.it-pt.tsv 480108
1211
+ WikiMatrix.it-ro.tsv 161759
1212
+ WikiMatrix.it-ru.tsv 303974
1213
+ WikiMatrix.it-scn.tsv 11231
1214
+ WikiMatrix.it-sh.tsv 67153
1215
+ WikiMatrix.it-si.tsv 68652
1216
+ WikiMatrix.it-sk.tsv 72794
1217
+ WikiMatrix.it-sl.tsv 81545
1218
+ WikiMatrix.it-sq.tsv 48707
1219
+ WikiMatrix.it-sr.tsv 83320
1220
+ WikiMatrix.it-sv.tsv 153800
1221
+ WikiMatrix.it-sw.tsv 19586
1222
+ WikiMatrix.it-ta.tsv 44891
1223
+ WikiMatrix.it-te.tsv 58221
1224
+ WikiMatrix.it-tl.tsv 41245
1225
+ WikiMatrix.it-tr.tsv 112630
1226
+ WikiMatrix.it-tt.tsv 23566
1227
+ WikiMatrix.it-uk.tsv 144863
1228
+ WikiMatrix.it-vi.tsv 143644
1229
+ WikiMatrix.it-wuu.tsv 10484
1230
+ WikiMatrix.it-zh.tsv 137288
1231
+ WikiMatrix.ja-kk.tsv 14270
1232
+ WikiMatrix.ja-ko.tsv 222118
1233
+ WikiMatrix.ja-lt.tsv 47361
1234
+ WikiMatrix.ja-mk.tsv 48010
1235
+ WikiMatrix.ja-ml.tsv 21616
1236
+ WikiMatrix.ja-mr.tsv 23173
1237
+ WikiMatrix.ja-nds.tsv 11228
1238
+ WikiMatrix.ja-nl.tsv 123955
1239
+ WikiMatrix.ja-no.tsv 81283
1240
+ WikiMatrix.ja-pl.tsv 128372
1241
+ WikiMatrix.ja-pt.tsv 175188
1242
+ WikiMatrix.ja-ro.tsv 79395
1243
+ WikiMatrix.ja-ru.tsv 196556
1244
+ WikiMatrix.ja-sh.tsv 40636
1245
+ WikiMatrix.ja-si.tsv 19798
1246
+ WikiMatrix.ja-sk.tsv 48948
1247
+ WikiMatrix.ja-sl.tsv 50219
1248
+ WikiMatrix.ja-sq.tsv 28281
1249
+ WikiMatrix.ja-sr.tsv 51763
1250
+ WikiMatrix.ja-sv.tsv 96872
1251
+ WikiMatrix.ja-sw.tsv 12391
1252
+ WikiMatrix.ja-ta.tsv 37201
1253
+ WikiMatrix.ja-te.tsv 31809
1254
+ WikiMatrix.ja-tl.tsv 12366
1255
+ WikiMatrix.ja-tr.tsv 84255
1256
+ WikiMatrix.ja-tt.tsv 12937
1257
+ WikiMatrix.ja-uk.tsv 92317
1258
+ WikiMatrix.ja-vi.tsv 75798
1259
+ WikiMatrix.ja-zh.tsv 267409
1260
+ WikiMatrix.jv-pt.tsv 11226
1261
+ WikiMatrix.ka-nl.tsv 12120
1262
+ WikiMatrix.ka-pl.tsv 11605
1263
+ WikiMatrix.ka-pt.tsv 14003
1264
+ WikiMatrix.ka-ru.tsv 13330
1265
+ WikiMatrix.ka-sv.tsv 12345
1266
+ WikiMatrix.kk-nl.tsv 18071
1267
+ WikiMatrix.kk-no.tsv 11301
1268
+ WikiMatrix.kk-pl.tsv 17893
1269
+ WikiMatrix.kk-pt.tsv 22150
1270
+ WikiMatrix.kk-ro.tsv 12467
1271
+ WikiMatrix.kk-ru.tsv 32807
1272
+ WikiMatrix.kk-sv.tsv 16574
1273
+ WikiMatrix.kk-tr.tsv 10081
1274
+ WikiMatrix.kk-uk.tsv 14581
1275
+ WikiMatrix.ko-lt.tsv 23324
1276
+ WikiMatrix.ko-mk.tsv 26857
1277
+ WikiMatrix.ko-ml.tsv 10118
1278
+ WikiMatrix.ko-mr.tsv 10568
1279
+ WikiMatrix.ko-nl.tsv 56609
1280
+ WikiMatrix.ko-no.tsv 41716
1281
+ WikiMatrix.ko-pl.tsv 63894
1282
+ WikiMatrix.ko-pt.tsv 93224
1283
+ WikiMatrix.ko-ro.tsv 47054
1284
+ WikiMatrix.ko-ru.tsv 89951
1285
+ WikiMatrix.ko-sh.tsv 23213
1286
+ WikiMatrix.ko-sk.tsv 25644
1287
+ WikiMatrix.ko-sl.tsv 26403
1288
+ WikiMatrix.ko-sq.tsv 17929
1289
+ WikiMatrix.ko-sr.tsv 29639
1290
+ WikiMatrix.ko-sv.tsv 51718
1291
+ WikiMatrix.ko-ta.tsv 17059
1292
+ WikiMatrix.ko-te.tsv 13610
1293
+ WikiMatrix.ko-tr.tsv 47497
1294
+ WikiMatrix.ko-uk.tsv 48954
1295
+ WikiMatrix.ko-vi.tsv 49283
1296
+ WikiMatrix.ko-zh.tsv 57932
1297
+ WikiMatrix.la-nl.tsv 12202
1298
+ WikiMatrix.la-pl.tsv 13391
1299
+ WikiMatrix.la-pt.tsv 18561
1300
+ WikiMatrix.la-ro.tsv 10267
1301
+ WikiMatrix.la-ru.tsv 14815
1302
+ WikiMatrix.la-sv.tsv 13396
1303
+ WikiMatrix.lb-nl.tsv 11163
1304
+ WikiMatrix.lb-pl.tsv 11378
1305
+ WikiMatrix.lb-pt.tsv 16576
1306
+ WikiMatrix.lb-ru.tsv 11807
1307
+ WikiMatrix.lb-sv.tsv 12339
1308
+ WikiMatrix.lt-mk.tsv 28117
1309
+ WikiMatrix.lt-ml.tsv 16474
1310
+ WikiMatrix.lt-mr.tsv 16648
1311
+ WikiMatrix.lt-nl.tsv 57966
1312
+ WikiMatrix.lt-no.tsv 39216
1313
+ WikiMatrix.lt-pl.tsv 70315
1314
+ WikiMatrix.lt-pt.tsv 64976
1315
+ WikiMatrix.lt-ro.tsv 39152
1316
+ WikiMatrix.lt-ru.tsv 107783
1317
+ WikiMatrix.lt-sh.tsv 25495
1318
+ WikiMatrix.lt-si.tsv 15384
1319
+ WikiMatrix.lt-sk.tsv 30843
1320
+ WikiMatrix.lt-sl.tsv 30859
1321
+ WikiMatrix.lt-sq.tsv 16358
1322
+ WikiMatrix.lt-sr.tsv 29967
1323
+ WikiMatrix.lt-sv.tsv 46008
1324
+ WikiMatrix.lt-ta.tsv 13005
1325
+ WikiMatrix.lt-te.tsv 11731
1326
+ WikiMatrix.lt-tl.tsv 12904
1327
+ WikiMatrix.lt-tr.tsv 36776
1328
+ WikiMatrix.lt-uk.tsv 57413
1329
+ WikiMatrix.lt-vi.tsv 33170
1330
+ WikiMatrix.lt-zh.tsv 35895
1331
+ WikiMatrix.mk-ml.tsv 21457
1332
+ WikiMatrix.mk-mr.tsv 22675
1333
+ WikiMatrix.mk-nl.tsv 53320
1334
+ WikiMatrix.mk-no.tsv 46342
1335
+ WikiMatrix.mk-pl.tsv 56928
1336
+ WikiMatrix.mk-pt.tsv 93291
1337
+ WikiMatrix.mk-ro.tsv 56342
1338
+ WikiMatrix.mk-ru.tsv 88000
1339
+ WikiMatrix.mk-sh.tsv 52825
1340
+ WikiMatrix.mk-si.tsv 19587
1341
+ WikiMatrix.mk-sk.tsv 29821
1342
+ WikiMatrix.mk-sl.tsv 39973
1343
+ WikiMatrix.mk-sq.tsv 25078
1344
+ WikiMatrix.mk-sr.tsv 106377
1345
+ WikiMatrix.mk-sv.tsv 48072
1346
+ WikiMatrix.mk-ta.tsv 14353
1347
+ WikiMatrix.mk-te.tsv 15254
1348
+ WikiMatrix.mk-tl.tsv 16689
1349
+ WikiMatrix.mk-tr.tsv 43430
1350
+ WikiMatrix.mk-uk.tsv 57515
1351
+ WikiMatrix.mk-vi.tsv 57549
1352
+ WikiMatrix.mk-zh.tsv 45671
1353
+ WikiMatrix.ml-nl.tsv 41804
1354
+ WikiMatrix.ml-no.tsv 32249
1355
+ WikiMatrix.ml-pl.tsv 41517
1356
+ WikiMatrix.ml-pt.tsv 58378
1357
+ WikiMatrix.ml-ro.tsv 35368
1358
+ WikiMatrix.ml-ru.tsv 46205
1359
+ WikiMatrix.ml-sh.tsv 21975
1360
+ WikiMatrix.ml-sk.tsv 22420
1361
+ WikiMatrix.ml-sl.tsv 21686
1362
+ WikiMatrix.ml-sq.tsv 13707
1363
+ WikiMatrix.ml-sr.tsv 20165
1364
+ WikiMatrix.ml-sv.tsv 44814
1365
+ WikiMatrix.ml-tr.tsv 16597
1366
+ WikiMatrix.ml-uk.tsv 26706
1367
+ WikiMatrix.ml-vi.tsv 15688
1368
+ WikiMatrix.ml-zh.tsv 17523
1369
+ WikiMatrix.mr-nl.tsv 46456
1370
+ WikiMatrix.mr-no.tsv 35123
1371
+ WikiMatrix.mr-pl.tsv 47091
1372
+ WikiMatrix.mr-pt.tsv 86686
1373
+ WikiMatrix.mr-ro.tsv 47259
1374
+ WikiMatrix.mr-ru.tsv 50400
1375
+ WikiMatrix.mr-sh.tsv 22428
1376
+ WikiMatrix.mr-sk.tsv 25169
1377
+ WikiMatrix.mr-sl.tsv 25021
1378
+ WikiMatrix.mr-sq.tsv 13098
1379
+ WikiMatrix.mr-sr.tsv 19078
1380
+ WikiMatrix.mr-sv.tsv 56338
1381
+ WikiMatrix.mr-tr.tsv 17343
1382
+ WikiMatrix.mr-uk.tsv 26221
1383
+ WikiMatrix.mr-vi.tsv 14772
1384
+ WikiMatrix.mr-zh.tsv 17442
1385
+ WikiMatrix.mwl-pt.tsv 34539
1386
+ WikiMatrix.nds_nl-nl.tsv 15316
1387
+ WikiMatrix.nds-nl.tsv 19081
1388
+ WikiMatrix.nds-no.tsv 12797
1389
+ WikiMatrix.nds-pl.tsv 18216
1390
+ WikiMatrix.nds-pt.tsv 22939
1391
+ WikiMatrix.nds-ro.tsv 13008
1392
+ WikiMatrix.nds-ru.tsv 20062
1393
+ WikiMatrix.nds-sv.tsv 18542
1394
+ WikiMatrix.nds-uk.tsv 11947
1395
+ WikiMatrix.ne-nl.tsv 17856
1396
+ WikiMatrix.ne-no.tsv 13954
1397
+ WikiMatrix.ne-pl.tsv 17302
1398
+ WikiMatrix.ne-pt.tsv 21399
1399
+ WikiMatrix.ne-ro.tsv 14108
1400
+ WikiMatrix.ne-ru.tsv 19225
1401
+ WikiMatrix.ne-sh.tsv 10471
1402
+ WikiMatrix.ne-sk.tsv 10400
1403
+ WikiMatrix.ne-sl.tsv 10418
1404
+ WikiMatrix.ne-sv.tsv 17951
1405
+ WikiMatrix.ne-uk.tsv 11500
1406
+ WikiMatrix.nl-no.tsv 133308
1407
+ WikiMatrix.nl-oc.tsv 13488
1408
+ WikiMatrix.nl-pl.tsv 177117
1409
+ WikiMatrix.nl-pt.tsv 218472
1410
+ WikiMatrix.nl-ro.tsv 96776
1411
+ WikiMatrix.nl-ru.tsv 199345
1412
+ WikiMatrix.nl-sh.tsv 53430
1413
+ WikiMatrix.nl-si.tsv 42365
1414
+ WikiMatrix.nl-sk.tsv 66565
1415
+ WikiMatrix.nl-sl.tsv 64687
1416
+ WikiMatrix.nl-sq.tsv 34902
1417
+ WikiMatrix.nl-sr.tsv 61780
1418
+ WikiMatrix.nl-sv.tsv 151735
1419
+ WikiMatrix.nl-sw.tsv 16582
1420
+ WikiMatrix.nl-ta.tsv 37639
1421
+ WikiMatrix.nl-te.tsv 35569
1422
+ WikiMatrix.nl-tl.tsv 29776
1423
+ WikiMatrix.nl-tr.tsv 90968
1424
+ WikiMatrix.nl-tt.tsv 18420
1425
+ WikiMatrix.nl-uk.tsv 104378
1426
+ WikiMatrix.nl-vi.tsv 84022
1427
+ WikiMatrix.nl-zh.tsv 88818
1428
+ WikiMatrix.no-pl.tsv 103674
1429
+ WikiMatrix.no-pt.tsv 161215
1430
+ WikiMatrix.no-ro.tsv 74943
1431
+ WikiMatrix.no-ru.tsv 121486
1432
+ WikiMatrix.no-sh.tsv 42996
1433
+ WikiMatrix.no-si.tsv 28739
1434
+ WikiMatrix.no-sk.tsv 43781
1435
+ WikiMatrix.no-sl.tsv 51732
1436
+ WikiMatrix.no-sq.tsv 26679
1437
+ WikiMatrix.no-sr.tsv 47744
1438
+ WikiMatrix.no-sv.tsv 270882
1439
+ WikiMatrix.no-sw.tsv 12710
1440
+ WikiMatrix.no-ta.tsv 24589
1441
+ WikiMatrix.no-te.tsv 23501
1442
+ WikiMatrix.no-tl.tsv 24491
1443
+ WikiMatrix.no-tr.tsv 61772
1444
+ WikiMatrix.no-tt.tsv 13155
1445
+ WikiMatrix.no-uk.tsv 69895
1446
+ WikiMatrix.no-vi.tsv 79750
1447
+ WikiMatrix.no-zh.tsv 63206
1448
+ WikiMatrix.oc-pl.tsv 13703
1449
+ WikiMatrix.oc-pt.tsv 24424
1450
+ WikiMatrix.oc-ro.tsv 11840
1451
+ WikiMatrix.oc-ru.tsv 14902
1452
+ WikiMatrix.oc-sv.tsv 12596
1453
+ WikiMatrix.pl-pt.tsv 200506
1454
+ WikiMatrix.pl-ro.tsv 97037
1455
+ WikiMatrix.pl-ru.tsv 285946
1456
+ WikiMatrix.pl-sh.tsv 56752
1457
+ WikiMatrix.pl-si.tsv 40941
1458
+ WikiMatrix.pl-sk.tsv 81071
1459
+ WikiMatrix.pl-sl.tsv 68333
1460
+ WikiMatrix.pl-sq.tsv 35947
1461
+ WikiMatrix.pl-sr.tsv 69550
1462
+ WikiMatrix.pl-sv.tsv 121793
1463
+ WikiMatrix.pl-sw.tsv 16928
1464
+ WikiMatrix.pl-ta.tsv 39892
1465
+ WikiMatrix.pl-te.tsv 42060
1466
+ WikiMatrix.pl-tl.tsv 28804
1467
+ WikiMatrix.pl-tr.tsv 92945
1468
+ WikiMatrix.pl-tt.tsv 16386
1469
+ WikiMatrix.pl-uk.tsv 172368
1470
+ WikiMatrix.pl-vi.tsv 84550
1471
+ WikiMatrix.pl-zh.tsv 92708
1472
+ WikiMatrix.pt-ro.tsv 177269
1473
+ WikiMatrix.pt-ru.tsv 312869
1474
+ WikiMatrix.pt-sh.tsv 74080
1475
+ WikiMatrix.pt-si.tsv 76114
1476
+ WikiMatrix.pt-sk.tsv 71181
1477
+ WikiMatrix.pt-sl.tsv 85307
1478
+ WikiMatrix.pt-sq.tsv 47867
1479
+ WikiMatrix.pt-sr.tsv 101375
1480
+ WikiMatrix.pt-sv.tsv 155481
1481
+ WikiMatrix.pt-sw.tsv 20692
1482
+ WikiMatrix.pt-ta.tsv 42380
1483
+ WikiMatrix.pt-te.tsv 54636
1484
+ WikiMatrix.pt-tl.tsv 45927
1485
+ WikiMatrix.pt-tr.tsv 140579
1486
+ WikiMatrix.pt-tt.tsv 23174
1487
+ WikiMatrix.pt-uk.tsv 156140
1488
+ WikiMatrix.pt-vi.tsv 213119
1489
+ WikiMatrix.pt-wuu.tsv 11129
1490
+ WikiMatrix.pt-zh.tsv 165205
1491
+ WikiMatrix.ro-ru.tsv 136407
1492
+ WikiMatrix.ro-sh.tsv 44686
1493
+ WikiMatrix.ro-si.tsv 43266
1494
+ WikiMatrix.ro-sk.tsv 42561
1495
+ WikiMatrix.ro-sl.tsv 49716
1496
+ WikiMatrix.ro-sq.tsv 30941
1497
+ WikiMatrix.ro-sr.tsv 58682
1498
+ WikiMatrix.ro-sv.tsv 75782
1499
+ WikiMatrix.ro-sw.tsv 15025
1500
+ WikiMatrix.ro-ta.tsv 23098
1501
+ WikiMatrix.ro-te.tsv 27477
1502
+ WikiMatrix.ro-tl.tsv 29061
1503
+ WikiMatrix.ro-tr.tsv 72180
1504
+ WikiMatrix.ro-tt.tsv 13876
1505
+ WikiMatrix.ro-uk.tsv 82153
1506
+ WikiMatrix.ro-vi.tsv 96125
1507
+ WikiMatrix.ro-zh.tsv 74790
1508
+ WikiMatrix.ru-sh.tsv 70262
1509
+ WikiMatrix.ru-si.tsv 42594
1510
+ WikiMatrix.ru-sk.tsv 85656
1511
+ WikiMatrix.ru-sl.tsv 78858
1512
+ WikiMatrix.ru-sq.tsv 44661
1513
+ WikiMatrix.ru-sr.tsv 114775
1514
+ WikiMatrix.ru-sv.tsv 140222
1515
+ WikiMatrix.ru-sw.tsv 17943
1516
+ WikiMatrix.ru-ta.tsv 54465
1517
+ WikiMatrix.ru-te.tsv 55768
1518
+ WikiMatrix.ru-tg.tsv 10759
1519
+ WikiMatrix.ru-tl.tsv 29214
1520
+ WikiMatrix.ru-tr.tsv 119345
1521
+ WikiMatrix.ru-tt.tsv 25244
1522
+ WikiMatrix.ru-uk.tsv 2486905
1523
+ WikiMatrix.ru-vi.tsv 122026
1524
+ WikiMatrix.ru-wuu.tsv 10421
1525
+ WikiMatrix.ru-zh.tsv 148733
1526
+ WikiMatrix.sh-si.tsv 17999
1527
+ WikiMatrix.sh-sk.tsv 27941
1528
+ WikiMatrix.sh-sl.tsv 46667
1529
+ WikiMatrix.sh-sq.tsv 19045
1530
+ WikiMatrix.sh-sr.tsv 373728
1531
+ WikiMatrix.sh-sv.tsv 46389
1532
+ WikiMatrix.sh-ta.tsv 14229
1533
+ WikiMatrix.sh-te.tsv 13914
1534
+ WikiMatrix.sh-tl.tsv 17012
1535
+ WikiMatrix.sh-tr.tsv 35108
1536
+ WikiMatrix.sh-uk.tsv 45971
1537
+ WikiMatrix.sh-vi.tsv 42484
1538
+ WikiMatrix.sh-zh.tsv 36099
1539
+ WikiMatrix.si-sk.tsv 22131
1540
+ WikiMatrix.si-sl.tsv 22809
1541
+ WikiMatrix.si-sq.tsv 10145
1542
+ WikiMatrix.si-sr.tsv 15895
1543
+ WikiMatrix.si-sv.tsv 48372
1544
+ WikiMatrix.si-tr.tsv 15421
1545
+ WikiMatrix.si-uk.tsv 21209
1546
+ WikiMatrix.si-vi.tsv 14999
1547
+ WikiMatrix.si-zh.tsv 16002
1548
+ WikiMatrix.sk-sl.tsv 36507
1549
+ WikiMatrix.sk-sq.tsv 17211
1550
+ WikiMatrix.sk-sr.tsv 34375
1551
+ WikiMatrix.sk-sv.tsv 51536
1552
+ WikiMatrix.sk-ta.tsv 14594
1553
+ WikiMatrix.sk-te.tsv 15627
1554
+ WikiMatrix.sk-tl.tsv 16713
1555
+ WikiMatrix.sk-tr.tsv 37685
1556
+ WikiMatrix.sk-uk.tsv 51350
1557
+ WikiMatrix.sk-vi.tsv 38667
1558
+ WikiMatrix.sk-zh.tsv 38556
1559
+ WikiMatrix.sl-sq.tsv 19695
1560
+ WikiMatrix.sl-sr.tsv 47119
1561
+ WikiMatrix.sl-sv.tsv 50838
1562
+ WikiMatrix.sl-ta.tsv 15526
1563
+ WikiMatrix.sl-te.tsv 16081
1564
+ WikiMatrix.sl-tl.tsv 17840
1565
+ WikiMatrix.sl-tr.tsv 39624
1566
+ WikiMatrix.sl-uk.tsv 50320
1567
+ WikiMatrix.sl-vi.tsv 48297
1568
+ WikiMatrix.sl-zh.tsv 42036
1569
+ WikiMatrix.sq-sr.tsv 25103
1570
+ WikiMatrix.sq-sv.tsv 31183
1571
+ WikiMatrix.sq-ta.tsv 13707
1572
+ WikiMatrix.sq-te.tsv 10575
1573
+ WikiMatrix.sq-tl.tsv 10943
1574
+ WikiMatrix.sq-tr.tsv 27534
1575
+ WikiMatrix.sq-uk.tsv 29077
1576
+ WikiMatrix.sq-vi.tsv 30454
1577
+ WikiMatrix.sq-zh.tsv 24128
1578
+ WikiMatrix.sr-sv.tsv 51675
1579
+ WikiMatrix.sr-ta.tsv 19095
1580
+ WikiMatrix.sr-te.tsv 18178
1581
+ WikiMatrix.sr-tl.tsv 14064
1582
+ WikiMatrix.sr-tr.tsv 43382
1583
+ WikiMatrix.sr-uk.tsv 71932
1584
+ WikiMatrix.sr-vi.tsv 56213
1585
+ WikiMatrix.sr-zh.tsv 45291
1586
+ WikiMatrix.sv-sw.tsv 16864
1587
+ WikiMatrix.sv-ta.tsv 33629
1588
+ WikiMatrix.sv-te.tsv 39137
1589
+ WikiMatrix.sv-tl.tsv 35027
1590
+ WikiMatrix.sv-tr.tsv 72959
1591
+ WikiMatrix.sv-tt.tsv 17518
1592
+ WikiMatrix.sv-uk.tsv 82027
1593
+ WikiMatrix.sv-vi.tsv 74202
1594
+ WikiMatrix.sv-zh.tsv 73747
1595
+ WikiMatrix.sw-tr.tsv 12260
1596
+ WikiMatrix.sw-uk.tsv 12284
1597
+ WikiMatrix.sw-vi.tsv 10822
1598
+ WikiMatrix.sw-zh.tsv 11233
1599
+ WikiMatrix.ta-tr.tsv 29056
1600
+ WikiMatrix.ta-uk.tsv 30604
1601
+ WikiMatrix.ta-vi.tsv 19365
1602
+ WikiMatrix.ta-zh.tsv 27184
1603
+ WikiMatrix.te-tr.tsv 21596
1604
+ WikiMatrix.te-uk.tsv 30800
1605
+ WikiMatrix.te-vi.tsv 16788
1606
+ WikiMatrix.te-zh.tsv 20912
1607
+ WikiMatrix.tl-tr.tsv 12260
1608
+ WikiMatrix.tl-uk.tsv 16560
1609
+ WikiMatrix.tl-vi.tsv 17399
1610
+ WikiMatrix.tl-zh.tsv 10492
1611
+ WikiMatrix.tr-tt.tsv 10644
1612
+ WikiMatrix.tr-uk.tsv 67753
1613
+ WikiMatrix.tr-vi.tsv 77062
1614
+ WikiMatrix.tr-zh.tsv 69162
1615
+ WikiMatrix.tt-uk.tsv 11500
1616
+ WikiMatrix.tt-zh.tsv 10587
1617
+ WikiMatrix.uk-vi.tsv 73104
1618
+ WikiMatrix.uk-zh.tsv 72752
1619
+ WikiMatrix.vi-zh.tsv 89445
1620
+ WikiMatrix.wuu-zh.tsv 43747
laser/tasks/bucc/README.md ADDED
@@ -0,0 +1,94 @@
1
+ # LASER: application to bitext mining
2
+
3
+ This code shows how to use the multilingual sentence embeddings to mine
4
+ for parallel data in (huge) collections of monolingual data.
5
+
6
+ The underlying idea is pretty simple:
7
+ * embed the sentences in the two languages into the joint sentence space
8
+ * calculate all pairwise distances between the sentences.
9
+ This is of complexity O(N\*M) and can be done very efficiently with
10
+ the FAISS library [2]
11
+ * all sentence pairs which have a distance below a threshold
12
+ are considered parallel
13
+ * this approach can be further improved using a margin criterion [3] (see the sketch after this list)
14
+
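+ A minimal sketch of the recipe above (it is not the pipeline driven by
+ `bucc.sh`; the embeddings, the neighbourhood size `k` and the threshold value
+ are all illustrative stand-ins):
+
+ ```python
+ import faiss
+ import numpy as np
+
+ def margin_scores(src_emb, tgt_emb, k=4):
+     """Ratio-margin scores [3] of the k nearest target candidates of each source sentence."""
+     faiss.normalize_L2(src_emb)      # cosine similarity == inner product
+     faiss.normalize_L2(tgt_emb)      # once the embeddings are L2-normalized
+     dim = src_emb.shape[1]
+
+     idx_tgt = faiss.IndexFlatIP(dim)     # exact (flat) inner-product search
+     idx_tgt.add(tgt_emb)
+     idx_src = faiss.IndexFlatIP(dim)
+     idx_src.add(src_emb)
+     sim_fwd, nn_fwd = idx_tgt.search(src_emb, k)   # src -> tgt neighbours
+     sim_bwd, _ = idx_src.search(tgt_emb, k)        # tgt -> src neighbours
+
+     # ratio margin: cos(x, y) divided by the mean of the average similarities
+     # to the k nearest neighbours of x and of y
+     denom = (sim_fwd.mean(axis=1)[:, None] + sim_bwd.mean(axis=1)[nn_fwd]) / 2.0
+     return sim_fwd / denom, nn_fwd
+
+ # random vectors stand in for LASER sentence embeddings of the two languages
+ src_emb = np.random.rand(1000, 1024).astype("float32")
+ tgt_emb = np.random.rand(1200, 1024).astype("float32")
+ scores, neighbours = margin_scores(src_emb, tgt_emb)
+ # keep the best candidate of each source sentence if its margin clears a threshold
+ mined = [(i, int(neighbours[i, 0])) for i in range(len(scores)) if scores[i, 0] > 1.06]
+ ```
+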
15
+ Here, we apply this idea to the data provided by the shared task of the BUCC
16
+ [Workshop on Building and Using Comparable Corpora](https://comparable.limsi.fr/bucc2018/bucc2018-task.html).
17
+
18
+ The same approach can be scaled up to huge collections of monolingual texts
19
+ (several billions) using more advanced features of the FAISS toolkit.
20
+
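+ As an illustration only (none of this task's scripts use these exact settings),
+ scaling up usually means swapping the exact flat search for an approximate
+ FAISS index; the factory string, list count and `nprobe` are placeholders:
+
+ ```python
+ import faiss
+ import numpy as np
+
+ emb = np.random.rand(50_000, 1024).astype("float32")   # stand-in for sentence embeddings
+ faiss.normalize_L2(emb)
+
+ # inverted-file index with product quantization: approximate, but much smaller
+ # and faster to query than a flat index when the corpus has billions of sentences
+ index = faiss.index_factory(1024, "IVF1024,PQ64", faiss.METRIC_INNER_PRODUCT)
+ index.train(emb)          # learn coarse centroids and PQ codebooks
+ index.add(emb)
+ index.nprobe = 32         # number of inverted lists visited per query
+ sims, ids = index.search(emb[:5], 8)   # 8 approximate nearest neighbours
+ ```
+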
21
+ ## Installation
22
+
23
+ * Please first download the BUCC shared task data
24
+ [here](https://comparable.limsi.fr/bucc2017/cgi-bin/download-data-2018.cgi)
25
+ and install it in the directory "downloaded"
26
+ * then run the script:
27
+ ```bash
28
+ ./bucc.sh
29
+ ```
30
+
31
+ ## Results
32
+
33
+ Thresholds are optimized for the F-score on the training corpus.
34
+ These results differ slightly from those published in [4] due to the switch from PyTorch 0.4 to 1.0.
35
+
36
+ | Languages | Threshold | Precision | Recall | F-score |
37
+ |-----------|-----------|-----------|--------|---------|
38
+ | fr-en | 1.088131 | 91.52 | 93.32 | 92.41 |
39
+ | de-en | 1.092056 | 95.65 | 95.19 | 95.42 |
40
+ | ru-en | 1.093404 | 90.60 | 94.04 | 92.29 |
41
+ | zh-en | 1.085999 | 91.99 | 91.31 | 91.65 |
42
+
43
+ Results on the official test set are scored by the organizers of the BUCC workshop.
44
+
45
+
46
+ Below, we compare our approach to the [official results of the 2018 edition
47
+ of the BUCC workshop](http://lrec-conf.org/workshops/lrec2018/W8/pdf/12_W8.pdf) [1].
48
+ More details on our approach are provided in [2,3,4].
49
+
50
+ | System | fr-en | de-en | ru-en | zh-en |
51
+ |----------------------|-------|-------|-------|-------|
52
+ | Azpeitia et al '17 | 79.5 | 83.7 | - | - |
53
+ | Azpeitia et al '18 | 81.5 | 85.5 | 81.3 | 77.5 |
54
+ | Bouamor and Sajjad '18 | 76.0 | - | - | - |
55
+ | Chongman et al '18 | - | - | - | 56 |
56
+ | LASER [3] | 75.8 | 76.9 | - | - |
57
+ | LASER [4] | 93.1 | 96.2 | 92.3 | 92.7 |
58
+
59
+ All numbers are F1-scores on the test set.
60
+
61
+ ## Bonus
62
+
63
+ To showcase the highly multilingual aspect of LASER's sentence embeddings,
64
+ we also mine for bitexts for language pairs which do not include English, e.g.
65
+ French-German, Russian-French or Chinese-Russian.
66
+ This is also performed by the script bucc.sh.
67
+
68
+ Below are the numbers of extracted parallel sentences for each language pair.
69
+
70
+ | src/trg | French | German | Russian | Chinese |
71
+ |---------|--------|--------|---------|---------|
72
+ | French | n/a | 2795 | 3327 | 387 |
73
+ | German | 2795 | n/a | 3661 | 466 |
74
+ | Russian | 3327 | 3661 | n/a | 664 |
75
+ | Chinese | 387 | 466 | 664 | n/a |
76
+
77
+
78
+ ## References
79
+
80
+ [1] Pierre Zweigenbaum, Serge Sharoff and Reinhard Rapp,
81
+ [*Overview of the Third BUCC Shared Task: Spotting Parallel Sentences in Comparable Corpora*](http://lrec-conf.org/workshops/lrec2018/W8/pdf/12_W8.pdf),
82
+ LREC, 2018.
83
+
84
+ [2] Holger Schwenk,
85
+ [*Filtering and Mining Parallel Data in a Joint Multilingual Space*](https://arxiv.org/abs/1805.09822),
86
+ ACL, July 2018.
87
+
88
+ [3] Mikel Artetxe and Holger Schwenk,
89
+ [*Margin-based Parallel Corpus Mining with Multilingual Sentence Embeddings*](https://arxiv.org/abs/1811.01136),
90
+ arXiv, 3 Nov 2018.
91
+
92
+ [4] Mikel Artetxe and Holger Schwenk,
93
+ [*Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond*](https://arxiv.org/abs/1812.10464),
94
+ arXiv, 26 Dec 2018.