#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER  Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
#-------------------------------------------------------
#
# This bash script installs third party software
#
# Required environment:
#   LASER  - root directory of the LASER checkout; all tools are installed
#            below ${LASER}/tools-external
#
# Exit status: 0 on success, 1 as soon as a mandatory step fails.
#

if [[ -z "${LASER:-}" ]] ; then
  echo "Please set the environment variable 'LASER'" >&2
  exit 1
fi

###################################################################
#
# Generic helper functions
#
###################################################################

# Print all arguments as one message on stderr and abort the script
# with exit status 1.
Die () {
  echo "$*" >&2
  exit 1
}

# Create directory $1 (including missing parents) unless it already exists.
MKDIR () {
  local dname=$1

  if [[ ! -d "${dname}" ]] ; then
    echo " - creating directory ${dname}"
    mkdir -p "${dname}"
  fi
}

bdir="${LASER}"
tools_ext="${bdir}/tools-external"
MKDIR "${tools_ext}"

###################################################################
#
# Tokenization tools from Moses
# It is important to use the official release V4 and not the current one
# to obtain the same results as the published ones.
# (the behavior of the tokenizer for end-of-sentence abbreviations has changed)
#
###################################################################

# Download the Moses tokenizer scripts and the per-language non-breaking
# prefix lists into ${tools_ext}/moses-tokenizer.
# Files that are already present are not downloaded again. A failed download
# of a single file only produces a warning; the remaining files are still
# fetched.
InstallMosesTools () {
  local moses_git="https://raw.githubusercontent.com/moses-smt/mosesdecoder/RELEASE-4.0/scripts"
  local -a moses_files=(
    "tokenizer/tokenizer.perl"
    "tokenizer/detokenizer.perl"
    "tokenizer/normalize-punctuation.perl"
    "tokenizer/remove-non-printing-char.perl"
    "tokenizer/deescape-special-chars.perl"
    "tokenizer/lowercase.perl"
    "tokenizer/basic-protected-patterns"
  )
  local wdir f l

  wdir="${tools_ext}/moses-tokenizer/tokenizer"
  MKDIR "${wdir}"
  cd "${wdir}" || Die "ERROR: cannot change to directory ${wdir}"

  for f in "${moses_files[@]}" ; do
    # ${f##*/} is the file name without its directory part
    if [[ ! -f "${f##*/}" ]] ; then
      echo " - download ${f}"
      wget -q "${moses_git}/${f}" \
        || echo "WARNING: download of ${f} failed" >&2
    fi
  done
  chmod 755 ./*perl

  # download non-breaking prefixes per language
  local moses_non_breakings="share/nonbreaking_prefixes/nonbreaking_prefix"
  local -a moses_non_breaking_langs=(
    "ca" "cs" "de" "el" "en" "es" "fi" "fr" "ga" "hu" "is"
    "it" "lt" "lv" "nl" "pl" "pt" "ro" "ru" "sk" "sl" "sv"
    "ta" "yue" "zh"
  )

  wdir="${tools_ext}/moses-tokenizer/share/nonbreaking_prefixes"
  MKDIR "${wdir}"
  cd "${wdir}" || Die "ERROR: cannot change to directory ${wdir}"

  for l in "${moses_non_breaking_langs[@]}" ; do
    f="${moses_non_breakings}.${l}"
    if [[ ! -f "${f##*/}" ]] ; then
      echo " - download ${f}"
      wget -q "${moses_git}/${f}" \
        || echo "WARNING: download of ${f} failed" >&2
    fi
  done
}

###################################################################
#
# FAST BPE
#
###################################################################

# Download fastBPE into ${tools_ext}/fastBPE and compile the "fast" binary.
# Nothing is done when an executable fastBPE/fast already exists. An existing
# fastBPE source tree is reused instead of being downloaded again.
# Aborts the script when the download or the compilation fails.
InstallFastBPE () {
  cd "${tools_ext}" || Die "ERROR: cannot change to directory ${tools_ext}"

  if [[ ! -x fastBPE/fast ]] ; then
    if [[ ! -d fastBPE ]] ; then
      echo " - download fastBPE software from github"
      # -O overwrites a stale master.zip instead of creating master.zip.1
      wget -O master.zip https://github.com/glample/fastBPE/archive/master.zip \
        || Die "ERROR: download of fastBPE failed, please install manually"
      unzip master.zip \
        || Die "ERROR: cannot unpack fastBPE, please install manually"
      /bin/rm master.zip
      mv fastBPE-master fastBPE
    fi

    cd fastBPE || Die "ERROR: cannot change to directory ${tools_ext}/fastBPE"
    echo " - compiling"
    # any non-zero exit status of the compiler is a failure, not only 1
    if ! g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast ; then
      Die "ERROR: compilation failed, please install manually"
    fi
    # the original script did not treat a failure of this step as fatal;
    # keep that, but make the failure visible
    python setup.py install \
      || echo "WARNING: 'python setup.py install' failed for fastBPE" >&2
  fi
}

###################################################################
#
# SENTENCEPIECE
#
###################################################################

# Download sentencepiece into ${tools_ext}/sentencepiece-master and build it
# with cmake/make, unless /usr/local/bin/spm_encode already exists.
# Nothing is done when the directory sentencepiece-master is already present.
# Aborts the script when the download or the build fails.
InstallSentencePiece () {
  cd "${tools_ext}" || Die "ERROR: cannot change to directory ${tools_ext}"

  if [[ ! -d sentencepiece-master ]] ; then
    echo " - download sentencepiece from github"
    # -O overwrites a stale master.zip instead of creating master.zip.1
    wget -O master.zip https://github.com/google/sentencepiece/archive/master.zip \
      || Die "ERROR: download of sentencepiece failed, please install manually"
    unzip master.zip \
      || Die "ERROR: cannot unpack sentencepiece, please install manually"
    /bin/rm master.zip

    if [[ ! -s /usr/local/bin/spm_encode ]] ; then
      echo " - building code "
      cd sentencepiece-master \
        || Die "ERROR: cannot change to directory ${tools_ext}/sentencepiece-master"
      mkdir -p build
      cd build \
        || Die "ERROR: cannot change to directory ${tools_ext}/sentencepiece-master/build"
      if ! { cmake .. && make -j 10 ; } ; then
        Die "ERROR: compilation failed, please install manually"
      fi
    fi
  fi
}

###################################################################
#
# Install Japanese tokenizer Mecab
# We do not use automatic installation with "pip" but directly add the source directory
#
###################################################################

# Download mecab, then build and install the tokenizer and the ipadic
# dictionary with the installation prefix ${tools_ext}/mecab.
# Nothing is done when both bin/mecab and lib/mecab/dic/ipadic already exist
# below that prefix. An existing mecab-master source tree is reused.
# NOTE(review): lib/mecab/dic/ipadic is the dictionary location the original
# script checked for - confirm it matches what mecab-ipadic really installs.
# Aborts the script when the download or one of the builds fails.
InstallMecab () {
  local prefix="${tools_ext}/mecab"
  local srcdir="${tools_ext}/mecab-master"

  cd "${tools_ext}" || Die "ERROR: cannot change to directory ${tools_ext}"

  if [[ -x "${prefix}/bin/mecab" && -d "${prefix}/lib/mecab/dic/ipadic" ]] ; then
    return 0
  fi

  if [[ ! -d "${srcdir}" ]] ; then
    echo " - download mecab from github"
    # -O overwrites a stale master.zip instead of creating master.zip.1
    wget -O master.zip https://github.com/taku910/mecab/archive/master.zip \
      || Die "ERROR: download of mecab failed, please install manually"
    unzip master.zip \
      || Die "ERROR: cannot unpack mecab, please install manually"
    #/bin/rm master.zip
  fi

  # absolute paths are used below because the working directory changes
  if [[ ! -s "${prefix}/bin/mecab" ]] ; then
    mkdir -p "${prefix}"
    cd "${srcdir}/mecab" || Die "ERROR: cannot change to directory ${srcdir}/mecab"
    echo " - installing code"
    if ! { ./configure --prefix "${prefix}" && make && make install ; } ; then
      Die "ERROR: installation failed, please install manually"
    fi
  fi

  if [[ ! -d "${prefix}/lib/mecab/dic/ipadic" ]] ; then
    cd "${srcdir}/mecab-ipadic" \
      || Die "ERROR: cannot change to directory ${srcdir}/mecab-ipadic"
    echo " - installing dictionaries"
    if ! { ./configure --prefix "${prefix}" \
             --with-mecab-config="${prefix}/bin/mecab-config" \
           && make && make install ; } ; then
      Die "ERROR: compilation failed, please install manually"
    fi
  fi
}

###################################################################
#
# main
#
###################################################################

echo "Installing the laser_encoders package in editable mode"

# "." is the directory the script is started from, not ${LASER}
pip install -e .

echo "Installing external tools"

InstallMosesTools
InstallFastBPE
InstallSentencePiece

#InstallMecab
echo ""
echo "automatic installation of the Japanese tokenizer mecab may be tricky"
echo "Please install it manually from https://github.com/taku910/mecab"
echo ""
echo "The installation directory should be ${LASER}/tools-external/mecab"
echo ""