# (stray GitHub commit metadata, commented out so the script parses;
#  the shebang below should really be the first line of the file)
# KuangDW
# Add laser2.spm using Git LFS
# 05d3571
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
#
# bash script to mine for bitexts in the BUCC corpus
# Abort early when the mandatory LASER environment variable is not set.
# Diagnostics go to stderr and we exit non-zero so callers can detect the
# failure (a bare `exit` would propagate echo's status 0).
if [ -z "${LASER+x}" ] ; then
  echo "Please set the environment variable 'LASER'" >&2
  exit 1
fi
# general configuration
bucc="bucc2018"
data="."
xdir="${data}/downloaded"   # tar files as distributed by the BUCC evaluation
ddir="${data}/${bucc}"      # raw texts of BUCC
edir="${data}/embed"        # normalized texts and embeddings
langs=("fr" "de" "ru" "zh")
ltrg="en"                   # English is always the 2nd language

# encoder and BPE codes of the pretrained LASER model
model_dir="${LASER}/models"
encoder="${model_dir}/bilstm.93langs.2018-12-26.pt"
bpe_codes="${model_dir}/93langs.fcodes"
###################################################################
#
# Extract files with labels and texts from the BUCC corpus
#
###################################################################
#######################################
# Split a tab-separated BUCC corpus file ("<id>\t<text>") into separate
# id and text files, for English and the given source language.
# Globals:   edir, bucc, ltrg, ddir (read)
# Arguments: $1 - input basename relative to ${ddir} (without .<lang> suffix)
#            $2 - output tag ("dev", "train" or "test")
#            $3 - source language code
# Outputs:   ${edir}/${bucc}.<lang>-${ltrg}.<tag>.{id,txt}.<ll> per language
#######################################
GetData () {
  local fn1=$1 fn2=$2 lang=$3
  local outf="${edir}/${bucc}.${lang}-${ltrg}.${fn2}"
  local ll inf
  for ll in "${ltrg}" "${lang}" ; do
    inf="${ddir}/${fn1}.${ll}"
    # only the .txt file is checked; the .id file is regenerated with it
    if [ ! -f "${outf}.txt.${ll}" ] ; then
      echo " - extract files ${outf} in ${ll}"
      cut -f1 -- "${inf}" > "${outf}.id.${ll}"
      cut -f2 -- "${inf}" > "${outf}.txt.${ll}"
    fi
  done
}
#######################################
# Unpack the distributed BUCC tarballs for one language pair (once) and
# extract ids/texts for the dev, train and test splits.
# Globals:   data, xdir, ddir, bucc, ltrg (read)
# Arguments: $1 - source language code
#######################################
ExtractBUCC () {
  local slang=$1
  local tlang=${ltrg}
  local tf
  pushd "${data}" > /dev/null
  if [ ! -d "${ddir}/${slang}-${tlang}" ] ; then
    for tf in "${xdir}/${bucc}-${slang}-${tlang}."*.tar.bz2 ; do
      # guard against the unexpanded literal glob when no tarball matches
      [ -e "${tf}" ] || continue
      echo " - extract from tar $(basename "${tf}")"
      tar jxf "${tf}"
    done
  fi
  GetData "${slang}-${tlang}/${slang}-${tlang}.sample" "dev" "${slang}"
  GetData "${slang}-${tlang}/${slang}-${tlang}.training" "train" "${slang}"
  GetData "${slang}-${tlang}/${slang}-${tlang}.test" "test" "${slang}"
  popd > /dev/null
}
###################################################################
#
# Tokenize and Embed
#
###################################################################
#######################################
# Compute LASER sentence embeddings for one text file, skipping the work
# if a non-empty embedding file already exists.
# Globals:   LASER, encoder, bpe_codes (read)
# Arguments: $1 - path basename (reads $1.txt.<ll>, writes $1.enc.<ll>)
#            $2 - language code used for tokenization
# Note: callers also pass ${encoder} and ${bpe_codes} as $3/$4, but this
#       function reads them from the globals and ignores the extra args.
#######################################
Embed () {
  local ll=$2
  local txt="$1.txt.${ll}"
  local enc="$1.enc.${ll}"
  if [ ! -s "${enc}" ] ; then
    python3 "${LASER}/source/embed.py" \
      --encoder "${encoder}" \
      --token-lang "${ll}" \
      --bpe-codes "${bpe_codes}" \
      --output "${enc}" \
      --verbose < "${txt}"
  fi
}
###################################################################
#
# Mine for bitexts
#
###################################################################
#######################################
# Mine candidate bitexts between two embedded corpora, skipping the work
# if a non-empty candidates file already exists.
# Globals:   LASER (read)
# Arguments: $1 - path basename (reads $1.txt.* and $1.enc.*,
#                 writes $1.candidates.tsv)
#            $2 - source language code
#            $3 - target language code
#######################################
Mine () {
  local bn=$1
  local l1=$2
  local l2=$3
  local cand="${bn}.candidates.tsv"
  if [ ! -s "${cand}" ] ; then
    python3 "${LASER}/source/mine_bitexts.py" \
      "${bn}.txt.${l1}" "${bn}.txt.${l2}" \
      --src-lang "${l1}" --trg-lang "${l2}" \
      --src-embeddings "${bn}.enc.${l1}" --trg-embeddings "${bn}.enc.${l2}" \
      --unify --mode mine --retrieval max --margin ratio -k 4 \
      --output "${cand}" \
      --verbose --gpu
  fi
}
###################################################################
#
# Main loop
#
###################################################################
# echo -e is non-portable; printf produces the same output everywhere
printf '\nProcessing BUCC data in %s\n' "${data}"
# create output directories
# (list intentionally unquoted so unset/empty vars are skipped entirely)
for d in ${ddir} ${edir} ; do
  mkdir -p -- "${d}"
done
for lsrc in "${langs[@]}" ; do
  ExtractBUCC "${lsrc}"

  # tokenize and embed the training texts in both languages
  bname="${bucc}.${lsrc}-${ltrg}"
  part="${bname}.train"
  Embed "${edir}/${part}" "${lsrc}" "${encoder}" "${bpe_codes}"
  Embed "${edir}/${part}" "${ltrg}" "${encoder}" "${bpe_codes}"

  # mine for bitexts in train
  Mine "${edir}/${part}" "${lsrc}" "${ltrg}"

  # optimize the threshold on BUCC training data and provided gold alignments
  # NOTE(review): the log is written to the current directory while all other
  # artifacts go to ${edir} — kept as-is for backward compatibility
  if [ ! -s "${part}.log" ] ; then
    python3 bucc.py \
      --src-lang "${lsrc}" --trg-lang "${ltrg}" \
      --bucc-texts "${edir}/${part}.txt" \
      --bucc-ids "${edir}/${part}.id" \
      --candidates "${edir}/${part}.candidates.tsv" \
      --gold "${ddir}/${lsrc}-${ltrg}/${lsrc}-${ltrg}.training.gold" \
      --verbose \
      | tee "${part}.log"
  fi

  # tokenize and embed the test texts
  part="${bname}.test"
  Embed "${edir}/${part}" "${lsrc}" "${encoder}" "${bpe_codes}"
  Embed "${edir}/${part}" "${ltrg}" "${encoder}" "${bpe_codes}"

  # mine for bitexts in test
  Mine "${edir}/${part}" "${lsrc}" "${ltrg}"

  # extract test bitexts using the threshold optimized on train
  th=$(grep 'best threshold' "${bname}.train.log" | sed -e 's/[=:]/ /g' | awk '{print $4}')
  extracted="${edir}/${part}.extracted.tsv"
  if [ ! -s "${extracted}" ] ; then
    python3 bucc.py \
      --src-lang "${lsrc}" --trg-lang "${ltrg}" \
      --bucc-texts "${edir}/${part}.txt" \
      --bucc-ids "${edir}/${part}.id" \
      --candidates "${edir}/${part}.candidates.tsv" \
      --threshold "${th}" --output "${extracted}" \
      --verbose
  fi
done
# Bonus: mine bitexts between all non-English language pairs
# using a (conservative) fixed threshold of 1.1.
# All the data is supposed to be already tokenized.
# NOTE(review): this loop reuses (and clobbers) the global ${ltrg};
# nothing reads it afterwards, so this is kept as-is.
th=1.1
for lsrc in "${langs[@]}" ; do
  for ltrg in "${langs[@]}" ; do
    # skip English (defensive: langs contains no "en") and same-language pairs;
    # separate [ ] tests replace the deprecated/ambiguous `-a` operator
    if [ "${lsrc}" != "en" ] && [ "${ltrg}" != "en" ] && [ "${lsrc}" != "${ltrg}" ] ; then
      bitext="${bucc}.${lsrc}-${ltrg}.train.extracted.th${th}.csv"
      if [ ! -s "${bitext}" ] ; then
        echo "Extracting bitexts for ${lsrc}-${ltrg}"
        python3 "${LASER}/source/mine_bitexts.py" \
          "${edir}/${bucc}.${lsrc}-en.train.txt.${lsrc}" \
          "${edir}/${bucc}.${ltrg}-en.train.txt.${ltrg}" \
          --src-lang "${lsrc}" --trg-lang "${ltrg}" \
          --src-embeddings "${edir}/${bucc}.${lsrc}-en.train.enc.${lsrc}" \
          --trg-embeddings "${edir}/${bucc}.${ltrg}-en.train.enc.${ltrg}" \
          --unify --mode mine --retrieval max --margin ratio -k 4 \
          --output "${bitext}" --threshold "${th}" \
          --verbose --gpu
      fi
    fi
  done
done