# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
#-------------------------------------------------------
#
# This bash script installs third party software
#
if [ -z "${LASER}" ] ; then
  echo "Please set the environment variable 'LASER'"
  exit 1
fi
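
# Example (the path below is a placeholder; point it at your local LASER checkout):
#   export LASER=/path/to/LASER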
###################################################################
#
# Generic helper functions
#
###################################################################
MKDIR () {
  dname=$1
  if [ ! -d ${dname} ] ; then
    echo " - creating directory ${dname}"
    mkdir -p ${dname}
  fi
}
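
# Usage sketch (hypothetical directory name): the directory is only created
# if it does not exist yet, so repeated calls are harmless:
#   MKDIR "${LASER}/tools-external/some-dir"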
bdir="${LASER}"
tools_ext="${bdir}/tools-external"
MKDIR $tools_ext

###################################################################
#
# Tokenization tools from Moses
# It is important to use the official release V4 and not the current one
# to obtain the same results as the published ones.
# (the behavior of the tokenizer for end-of-sentence abbreviations has changed)
#
###################################################################
InstallMosesTools () {
  moses_git="https://raw.githubusercontent.com/moses-smt/mosesdecoder/RELEASE-4.0/scripts"
  moses_files=("tokenizer/tokenizer.perl" "tokenizer/detokenizer.perl" \
               "tokenizer/normalize-punctuation.perl" \
               "tokenizer/remove-non-printing-char.perl" \
               "tokenizer/deescape-special-chars.perl" \
               "tokenizer/lowercase.perl" \
               "tokenizer/basic-protected-patterns" \
              )

  wdir="${tools_ext}/moses-tokenizer/tokenizer"
  MKDIR ${wdir}
  cd ${wdir}

  for f in ${moses_files[@]} ; do
    if [ ! -f `basename ${f}` ] ; then
      echo " - download ${f}"
      wget -q ${moses_git}/${f}
    fi
  done
  chmod 755 *perl

  # download non-breaking prefixes per language
  moses_non_breakings="share/nonbreaking_prefixes/nonbreaking_prefix"
  moses_non_breaking_langs=( \
      "ca" "cs" "de" "el" "en" "es" "fi" "fr" "ga" "hu" "is" \
      "it" "lt" "lv" "nl" "pl" "pt" "ro" "ru" "sk" "sl" "sv" \
      "ta" "yue" "zh" )
  wdir="${tools_ext}/moses-tokenizer/share/nonbreaking_prefixes"
  MKDIR ${wdir}
  cd ${wdir}

  for l in ${moses_non_breaking_langs[@]} ; do
    f="${moses_non_breakings}.${l}"
    if [ ! -f `basename ${f}` ] ; then
      echo " - download ${f}"
      wget -q ${moses_git}/${f}
    fi
  done
}
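
# Usage sketch for the downloaded tokenizer (file names are placeholders;
# tokenizer.perl is the official Moses RELEASE-4.0 script fetched above):
#   perl ${tools_ext}/moses-tokenizer/tokenizer/tokenizer.perl -l en \
#     < input.txt > input.tok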
###################################################################
#
# FAST BPE
#
###################################################################
InstallFastBPE () {
  cd ${tools_ext}
  if [ ! -x fastBPE/fast ] ; then
    echo " - download fastBPE software from github"
    wget https://github.com/glample/fastBPE/archive/master.zip
    unzip master.zip
    /bin/rm master.zip
    mv fastBPE-master fastBPE
    cd fastBPE
    echo " - compiling"
    g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast
    if [ $? -eq 1 ] ; then
      echo "ERROR: compilation failed, please install manually"; exit
    fi
    python setup.py install
  fi
}
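
# Usage sketch (file names are placeholders; see the fastBPE README for the
# full command-line interface):
#   ${tools_ext}/fastBPE/fast learnbpe 40000 train.txt > bpe.codes
#   ${tools_ext}/fastBPE/fast applybpe train.bpe train.txt bpe.codes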
###################################################################
#
# SENTENCEPIECE
#
###################################################################
InstallSentencePiece () {
  cd ${tools_ext}
  if [ ! -d sentencepiece-master ] ; then
    echo " - download sentencepiece from github"
    wget https://github.com/google/sentencepiece/archive/master.zip
    unzip master.zip
    /bin/rm master.zip
    if [ ! -s /usr/local/bin/spm_encode ] ; then
      echo " - building code"
      cd sentencepiece-master
      mkdir build
      cd build
      cmake ..
      make -j 10
    fi
  fi
}
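
# Usage sketch (model and file names are placeholders; spm_encode comes from
# the build above or from an existing system-wide install):
#   spm_encode --model=sentencepiece.model --output_format=piece \
#     < input.txt > input.spm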
###################################################################
#
# Install the Japanese tokenizer MeCab
# We do not install it automatically with "pip" but build it directly from the source directory
#
###################################################################
InstallMecab () {
  cd ${tools_ext}
  if [ ! -x mecab/mecab/bin/mecab ] ; then
    echo " - download mecab from github"
    wget https://github.com/taku910/mecab/archive/master.zip
    unzip master.zip
    #/bin/rm master.zip
    if [ ! -s mecab/bin/mecab ] ; then
      mkdir mecab
      cd mecab-master/mecab
      echo " - installing code"
      ./configure --prefix ${tools_ext}/mecab && make && make install
      if [ $? -eq 1 ] ; then
        echo "ERROR: installation failed, please install manually"; exit
      fi
    fi
    if [ ! -d mecab/lib/mecab/dic/ipadic ] ; then
      cd ${tools_ext}/mecab-master/mecab-ipadic
      echo " - installing dictionaries"
      ./configure --prefix ${tools_ext}/mecab --with-mecab-config=${tools_ext}/mecab/bin/mecab-config \
        && make && make install
      if [ $? -eq 1 ] ; then
        echo "ERROR: compilation failed, please install manually"; exit
      fi
    fi
  fi
}
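
# Usage sketch, assuming the install above succeeded ("-O wakati" prints
# space-separated tokens):
#   echo "こんにちは世界" | ${tools_ext}/mecab/bin/mecab -O wakati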
###################################################################
#
# main
#
###################################################################
echo "Installing the laser_encoders package in editable mode" | |
pip install -e . | |
echo "Installing external tools" | |
InstallMosesTools | |
InstallFastBPE | |
InstallSentencePiece | |
#InstallMecab | |
echo "" | |
echo "automatic installation of the Japanese tokenizer mecab may be tricky" | |
echo "Please install it manually from https://github.com/taku910/mecab" | |
echo "" | |
echo "The installation directory should be ${LASER}/tools-external/mecab" | |
echo "" | |