Spaces:
Sleeping
Sleeping
KuangDW
committed on
Commit
·
8dfab00
1
Parent(s):
2e5836c
add embed.sh and cython file
Browse files- laser/.gitignore +0 -1
- laser/tasks/embed/README.md +44 -0
- laser/tasks/embed/embed.sh +79 -0
- vecalign/.gitignore +1 -2
laser/.gitignore
CHANGED
@@ -3,7 +3,6 @@ source/lib/__pycache__
|
|
3 |
models
|
4 |
tools-external
|
5 |
tasks/mldoc/MLDoc
|
6 |
-
embed
|
7 |
tasks/bucc/downloaded
|
8 |
tasks/similarity/dev/
|
9 |
tasks/xnli/XNLI-1.0*
|
|
|
3 |
models
|
4 |
tools-external
|
5 |
tasks/mldoc/MLDoc
|
|
|
6 |
tasks/bucc/downloaded
|
7 |
tasks/similarity/dev/
|
8 |
tasks/xnli/XNLI-1.0*
|
laser/tasks/embed/README.md
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# LASER: calculation of sentence embeddings
|
2 |
+
|
3 |
+
Tool to calculate sentence embeddings for an arbitrary text file:
|
4 |
+
```
|
5 |
+
bash ./embed.sh INPUT-FILE OUTPUT-FILE [LANGUAGE]
|
6 |
+
```
|
7 |
+
|
8 |
+
The input will first be tokenized, and then sentence embeddings will be generated. If a `language` is specified,
|
9 |
+
then `embed.sh` will look for a language-specific LASER3 encoder using the format: `{model_dir}/laser3-{language}.v{version}.pt`.
|
10 |
+
Otherwise, it defaults to LASER2, which covers the same 93 languages as [the original LASER encoder](https://arxiv.org/pdf/1812.10464.pdf).
|
11 |
+
|
12 |
+
**NOTE:** please set the model location (`model_dir` in `embed.sh`) before running. We recommend downloading the models from the NLLB
|
13 |
+
release (see [here](/nllb/README.md)). Optionally you can also select the model version number for downloaded LASER3 models. This currently defaults to: `1` (initial release).
|
14 |
+
|
15 |
+
## Output format
|
16 |
+
|
17 |
+
The embeddings are stored in float32 matrices in raw binary format.
|
18 |
+
They can be read in Python by:
|
19 |
+
```
|
20 |
+
import numpy as np
|
21 |
+
dim = 1024
|
22 |
+
X = np.fromfile("my_embeddings.bin", dtype=np.float32, count=-1)
|
23 |
+
X.resize(X.shape[0] // dim, dim)
|
24 |
+
```
|
25 |
+
`X` is an N x 1024 matrix, where N is the number of lines in the text file.
|
26 |
+
|
27 |
+
## Examples
|
28 |
+
|
29 |
+
In order to encode an input text in any of the 93 languages supported by LASER2 (e.g. Afrikaans, English, French):
|
30 |
+
```
|
31 |
+
./embed.sh input_file output_file
|
32 |
+
```
|
33 |
+
|
34 |
+
To use a language-specific encoder (if available), for example for Wolof, Hausa, or Irish:
|
35 |
+
```
|
36 |
+
./embed.sh input_file output_file wol_Latn
|
37 |
+
```
|
38 |
+
```
|
39 |
+
./embed.sh input_file output_file hau_Latn
|
40 |
+
```
|
41 |
+
```
|
42 |
+
./embed.sh input_file output_file gle_Latn
|
43 |
+
```
|
44 |
+
|
laser/tasks/embed/embed.sh
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
#
# bash script to calculate sentence embeddings for arbitrary
# text file
#
# usage: embed.sh input-file output-file [language]
#
#   input-file   text file passed to embed.py as --input
#   output-file  file passed to embed.py as --output
#   language     optional; when given, the encoder
#                ${model_dir}/laser3-<language>.v<version>.pt is used
#                instead of ${model_dir}/laser2.pt
#
# Environment:
#   LASER        must be set; the script runs ${LASER}/source/embed.py
#
# Exit status: 1 if the configuration, arguments, encoder or SPM model
# are missing; otherwise the exit status of embed.py.

#############################
# BEGIN PARAMETERS TO SET
#############################
# location of models (e.g. /path/to/models); no trailing slash
model_dir="laser"

# version number for LASER3 models
version=1
#############################
# END PARAMETERS TO SET
#############################

# All expansions below are quoted so that empty values and paths
# containing whitespace or glob characters are handled correctly.
if [[ -z "${model_dir}" ]]; then
  echo "Please set model directory within script" >&2
  exit 1
elif [[ ! -d "${model_dir}" ]]; then
  echo "Can't find model directory: $model_dir" >&2
  exit 1
fi

if [[ -z "${LASER:-}" ]]; then
  echo "Please set the environment variable 'LASER'" >&2
  exit 1
fi

if (( $# < 2 )); then
  echo "usage: embed.sh input-file output-file [language]" >&2
  exit 1
fi

infile=$1
outfile=$2
language=${3:-}

# default to laser2
model_file="${model_dir}/laser2.pt"
spm="${model_dir}/laser2.spm"

if [[ -n "${language}" ]]; then
  model_file="${model_dir}/laser3-${language}.v${version}.pt"
  lang_specific_spm="${model_dir}/laser3-${language}.v${version}.spm"
  # Only switch to the language-specific SPM model when that file exists
  # and is non-empty; otherwise keep the laser2 SPM model chosen above.
  if [[ -s "${lang_specific_spm}" ]]; then
    spm="${lang_specific_spm}"
  fi
fi

if [[ ! -s "${model_file}" ]]; then
  echo "couldn't find model file: $model_file" >&2
  exit 1
fi

if [[ ! -s "${spm}" ]]; then
  echo "couldn't find spm: $spm" >&2
  exit 1
fi

python3 "${LASER}/source/embed.py" \
  --input "${infile}" \
  --encoder "${model_file}" \
  --spm-model "${spm}" \
  --output "${outfile}" \
  --verbose
|
vecalign/.gitignore
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
build/
|
2 |
-
dp_core.c*
|
3 |
dp_core.html
|
4 |
__pycache__/
|
5 |
.idea
|
@@ -7,4 +6,4 @@ __pycache__/
|
|
7 |
.pytest_cache/
|
8 |
venv/
|
9 |
fairseq/
|
10 |
-
scores/
|
|
|
1 |
build/
|
|
|
2 |
dp_core.html
|
3 |
__pycache__/
|
4 |
.idea
|
|
|
6 |
.pytest_cache/
|
7 |
venv/
|
8 |
fairseq/
|
9 |
+
scores/
|