# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
#-------------------------------------------------------
#
# This bash script installs third party software
#
if [ -z "${LASER}" ] ; then
  echo "Please set the environment variable 'LASER'"
  exit 1
fi
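
# Example (the path below is a placeholder; point it at your local LASER checkout):
#   export LASER=/path/to/LASER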
###################################################################
#
# Generic helper functions
#
###################################################################
MKDIR () {
  dname=$1
  if [ ! -d ${dname} ] ; then
    echo " - creating directory ${dname}"
    mkdir -p ${dname}
  fi
}
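
# Usage sketch (hypothetical directory name): the directory is only created
# if it does not exist yet, so repeated calls are harmless:
#   MKDIR "${LASER}/tools-external/some-dir"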
bdir="${LASER}"
tools_ext="${bdir}/tools-external"
MKDIR $tools_ext

###################################################################
#
# Tokenization tools from Moses
# It is important to use the official release V4 and not the current one
# to obtain the same results as the published ones.
# (the behavior of the tokenizer for end-of-sentence abbreviations has changed)
#
###################################################################
InstallMosesTools () {
  moses_git="https://raw.githubusercontent.com/moses-smt/mosesdecoder/RELEASE-4.0/scripts"
  moses_files=("tokenizer/tokenizer.perl" "tokenizer/detokenizer.perl" \
               "tokenizer/normalize-punctuation.perl" \
               "tokenizer/remove-non-printing-char.perl" \
               "tokenizer/deescape-special-chars.perl" \
               "tokenizer/lowercase.perl" \
               "tokenizer/basic-protected-patterns" \
              )

  wdir="${tools_ext}/moses-tokenizer/tokenizer"
  MKDIR ${wdir}
  cd ${wdir}

  for f in ${moses_files[@]} ; do
    if [ ! -f `basename ${f}` ] ; then
      echo " - download ${f}"
      wget -q ${moses_git}/${f}
    fi
  done
  chmod 755 *perl

  # download non-breaking prefixes per language
  moses_non_breakings="share/nonbreaking_prefixes/nonbreaking_prefix"
  moses_non_breaking_langs=( \
      "ca" "cs" "de" "el" "en" "es" "fi" "fr" "ga" "hu" "is" \
      "it" "lt" "lv" "nl" "pl" "pt" "ro" "ru" "sk" "sl" "sv" \
      "ta" "yue" "zh" )
  wdir="${tools_ext}/moses-tokenizer/share/nonbreaking_prefixes"
  MKDIR ${wdir}
  cd ${wdir}

  for l in ${moses_non_breaking_langs[@]} ; do
    f="${moses_non_breakings}.${l}"
    if [ ! -f `basename ${f}` ] ; then
      echo " - download ${f}"
      wget -q ${moses_git}/${f}
    fi
  done
}
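
# Usage sketch for the downloaded tokenizer (file names are placeholders;
# tokenizer.perl is the official Moses RELEASE-4.0 script fetched above):
#   perl ${tools_ext}/moses-tokenizer/tokenizer/tokenizer.perl -l en \
#     < input.txt > input.tok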
###################################################################
#
# FAST BPE
#
###################################################################
InstallFastBPE () {
  cd ${tools_ext}
  if [ ! -x fastBPE/fast ] ; then
    echo " - download fastBPE software from github"
    wget https://github.com/glample/fastBPE/archive/master.zip
    unzip master.zip
    /bin/rm master.zip
    mv fastBPE-master fastBPE
    cd fastBPE
    echo " - compiling"
    g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast
    if [ $? -eq 1 ] ; then
      echo "ERROR: compilation failed, please install manually"; exit
    fi
    python setup.py install
  fi
}
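
# Usage sketch (file names are placeholders; see the fastBPE README for the
# full command-line interface):
#   ${tools_ext}/fastBPE/fast learnbpe 40000 train.txt > bpe.codes
#   ${tools_ext}/fastBPE/fast applybpe train.bpe train.txt bpe.codes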
###################################################################
#
# SENTENCEPIECE
#
###################################################################
InstallSentencePiece () {
  cd ${tools_ext}
  if [ ! -d sentencepiece-master ] ; then
    echo " - download sentencepiece from github"
    wget https://github.com/google/sentencepiece/archive/master.zip
    unzip master.zip
    /bin/rm master.zip
    if [ ! -s /usr/local/bin/spm_encode ] ; then
      echo " - building code"
      cd sentencepiece-master
      mkdir build
      cd build
      cmake ..
      make -j 10
    fi
  fi
}
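
# Usage sketch (model and file names are placeholders; spm_encode comes from
# the build above or from an existing system-wide install):
#   spm_encode --model=sentencepiece.model --output_format=piece \
#     < input.txt > input.spm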
###################################################################
#
# Install the Japanese tokenizer MeCab
# We do not install it automatically with "pip" but build it directly from the source directory
#
###################################################################
InstallMecab () {
  cd ${tools_ext}
  if [ ! -x mecab/mecab/bin/mecab ] ; then
    echo " - download mecab from github"
    wget https://github.com/taku910/mecab/archive/master.zip
    unzip master.zip
    #/bin/rm master.zip
    if [ ! -s mecab/bin/mecab ] ; then
      mkdir mecab
      cd mecab-master/mecab
      echo " - installing code"
      ./configure --prefix ${tools_ext}/mecab && make && make install
      if [ $? -eq 1 ] ; then
        echo "ERROR: installation failed, please install manually"; exit
      fi
    fi
    if [ ! -d mecab/lib/mecab/dic/ipadic ] ; then
      cd ${tools_ext}/mecab-master/mecab-ipadic
      echo " - installing dictionaries"
      ./configure --prefix ${tools_ext}/mecab --with-mecab-config=${tools_ext}/mecab/bin/mecab-config \
        && make && make install
      if [ $? -eq 1 ] ; then
        echo "ERROR: compilation failed, please install manually"; exit
      fi
    fi
  fi
}
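
# Usage sketch, assuming the install above succeeded ("-O wakati" prints
# space-separated tokens):
#   echo "こんにちは世界" | ${tools_ext}/mecab/bin/mecab -O wakati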
###################################################################
#
# main
#
###################################################################
echo "Installing the laser_encoders package in editable mode" | |
pip install -e . | |
echo "Installing external tools" | |
InstallMosesTools | |
InstallFastBPE | |
InstallSentencePiece | |
#InstallMecab | |
echo "" | |
echo "automatic installation of the Japanese tokenizer mecab may be tricky" | |
echo "Please install it manually from https://github.com/taku910/mecab" | |
echo "" | |
echo "The installation directory should be ${LASER}/tools-external/mecab" | |
echo "" | |