#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
#
# bash script to mine for bitexts in the BUCC corpus
# Abort early when the mandatory LASER environment variable is not set:
# every tool below lives under ${LASER}.
if [ -z "${LASER+x}" ] ; then
  echo "Please set the environment variable 'LASER'" >&2
  exit 1
fi
# ---- general configuration --------------------------------------
bucc="bucc2018"                # BUCC shared-task edition
data="."                       # working directory
xdir="${data}/downloaded"      # tar files as distributed by the BUCC evaluation
ddir="${data}/${bucc}"         # raw texts of BUCC
edir="${data}/embed"           # normalized texts and embeddings
langs=("fr" "de" "ru" "zh")    # source languages to process
ltrg="en"                      # English is always the 2nd language

# ---- encoder ----------------------------------------------------
model_dir="${LASER}/models"
encoder="${model_dir}/bilstm.93langs.2018-12-26.pt"
bpe_codes="${model_dir}/93langs.fcodes"
###################################################################
#
# Extract files with labels and texts from the BUCC corpus
#
###################################################################
#######################################
# Split raw BUCC files (TSV: id<TAB>text) into separate id and text files.
# Globals:   edir, bucc, ltrg (read), ddir (read)
# Arguments: $1 - input basename relative to ${ddir} (without .<lang> suffix)
#            $2 - output part name (dev / train / test)
#            $3 - source language code
# Outputs:   ${edir}/${bucc}.<lang>-<ltrg>.<part>.{id,txt}.<ll> for both languages
#######################################
GetData () {
  local fn1=$1 fn2=$2 lang=$3
  local ll inf
  local outf="${edir}/${bucc}.${lang}-${ltrg}.${fn2}"
  for ll in "${ltrg}" "${lang}" ; do
    inf="${ddir}/${fn1}.${ll}"
    # the .txt file is used as the "already done" marker for both outputs
    if [ ! -f "${outf}.txt.${ll}" ] ; then
      echo " - extract files ${outf} in ${ll}"
      cut -f1 "${inf}" > "${outf}.id.${ll}"
      cut -f2 "${inf}" > "${outf}.txt.${ll}"
    fi
  done
}
#######################################
# Untar the downloaded BUCC archives (if needed) and extract the
# dev/train/test id+text files for one language pair.
# Globals:   ltrg, data, ddir, xdir, bucc (read)
# Arguments: $1 - source language code
#######################################
ExtractBUCC () {
  local slang=$1
  local tlang=${ltrg}
  local tf
  pushd "${data}" > /dev/null
  if [ ! -d "${ddir}/${slang}-${tlang}" ] ; then
    for tf in "${xdir}/${bucc}-${slang}-${tlang}".*.tar.bz2 ; do
      [ -e "${tf}" ] || continue   # glob did not match: no archives to unpack
      echo " - extract from tar $(basename "${tf}")"
      tar jxf "${tf}"
    done
  fi
  GetData "${slang}-${tlang}/${slang}-${tlang}.sample" "dev" "${slang}"
  GetData "${slang}-${tlang}/${slang}-${tlang}.training" "train" "${slang}"
  GetData "${slang}-${tlang}/${slang}-${tlang}.test" "test" "${slang}"
  popd > /dev/null
}
###################################################################
#
# Tokenize and Embed
#
###################################################################
#######################################
# Compute sentence embeddings for one text file (skipped when the
# .enc output already exists and is non-empty).
# Globals:   LASER, encoder, bpe_codes (read as defaults)
# Arguments: $1 - file basename (e.g. ${edir}/${bucc}.fr-en.train)
#            $2 - language code, selects <base>.txt.<lang> / <base>.enc.<lang>
#            $3 - (optional) encoder model; defaults to ${encoder}.
#                 Callers already pass this; the old code ignored it.
#            $4 - (optional) BPE codes file; defaults to ${bpe_codes}
#######################################
Embed () {
  local base=$1
  local ll=$2
  local enc_model=${3:-${encoder}}
  local codes=${4:-${bpe_codes}}
  local txt="${base}.txt.${ll}"
  local enc="${base}.enc.${ll}"
  if [ ! -s "${enc}" ] ; then
    python3 "${LASER}/source/embed.py" \
      --encoder "${enc_model}" \
      --token-lang "${ll}" \
      --bpe-codes "${codes}" \
      --output "${enc}" \
      --verbose < "${txt}"
  fi
}
###################################################################
#
# Mine for bitexts
#
###################################################################
#######################################
# Mine candidate bitexts from two embedded corpora (skipped when the
# candidates file already exists and is non-empty).
# Globals:   LASER (read)
# Arguments: $1 - file basename; expects <bn>.txt.<l> and <bn>.enc.<l>
#            $2 - source language code
#            $3 - target language code
# Outputs:   <bn>.candidates.tsv
#######################################
Mine () {
  local bn=$1
  local l1=$2
  local l2=$3
  local cand="${bn}.candidates.tsv"
  if [ ! -s "${cand}" ] ; then
    python3 "${LASER}/source/mine_bitexts.py" \
      "${bn}.txt.${l1}" "${bn}.txt.${l2}" \
      --src-lang "${l1}" --trg-lang "${l2}" \
      --src-embeddings "${bn}.enc.${l1}" --trg-embeddings "${bn}.enc.${l2}" \
      --unify --mode mine --retrieval max --margin ratio -k 4 \
      --output "${cand}" \
      --verbose --gpu
  fi
}
###################################################################
#
# Main loop
#
###################################################################
echo -e "\nProcessing BUCC data in ${data}"

# create output directories
for d in "${ddir}" "${edir}" ; do
  [ -n "${d}" ] && mkdir -p "${d}"
done

# For each source language: extract the corpus, embed and mine train and
# test parts, tune the extraction threshold on train (against the provided
# gold alignments), then apply it to the test part.
for lsrc in "${langs[@]}" ; do
  ExtractBUCC "${lsrc}"

  # Tokenize and embed train
  bname="${bucc}.${lsrc}-${ltrg}"
  part="${bname}.train"
  Embed "${edir}/${part}" "${lsrc}" "${encoder}" "${bpe_codes}"
  Embed "${edir}/${part}" "${ltrg}" "${encoder}" "${bpe_codes}"

  # mine for texts in train
  Mine "${edir}/${part}" "${lsrc}" "${ltrg}"

  # optimize threshold on BUCC training data and provided gold alignments
  if [ ! -s "${part}.log" ] ; then
    python3 bucc.py \
      --src-lang "${lsrc}" --trg-lang "${ltrg}" \
      --bucc-texts "${edir}/${part}.txt" \
      --bucc-ids "${edir}/${part}.id" \
      --candidates "${edir}/${part}.candidates.tsv" \
      --gold "${ddir}/${lsrc}-${ltrg}/${lsrc}-${ltrg}.training.gold" \
      --verbose \
      | tee "${part}.log"
  fi

  # Tokenize and embed test
  part="${bname}.test"
  Embed "${edir}/${part}" "${lsrc}" "${encoder}" "${bpe_codes}"
  Embed "${edir}/${part}" "${ltrg}" "${encoder}" "${bpe_codes}"

  # mine for texts in test
  Mine "${edir}/${part}" "${lsrc}" "${ltrg}"

  # extract test bitexts with the threshold optimized on train
  # (parsed from the "best threshold" line of the train log)
  th=$(grep 'best threshold' "${bname}.train.log" | sed -e 's/[=:]/ /g' | awk '{print $4}')
  extracted="${edir}/${part}.extracted.tsv"
  if [ ! -s "${extracted}" ] ; then
    python3 bucc.py \
      --src-lang "${lsrc}" --trg-lang "${ltrg}" \
      --bucc-texts "${edir}/${part}.txt" \
      --bucc-ids "${edir}/${part}.id" \
      --candidates "${edir}/${part}.candidates.tsv" \
      --threshold "${th}" --output "${extracted}" \
      --verbose
  fi
done
# Bonus: extract bitexts between two non-English languages,
# relying on the (English-aligned) training data embedded above and
# using a (conservative) threshold of 1.1
# All the data is supposed to be already tokenized
th=1.1
for lsrc in "${langs[@]}" ; do
  for ltgt in "${langs[@]}" ; do   # NB: named ltgt so the global ltrg ("en") is not clobbered
    if [ "${lsrc}" != "en" ] && [ "${ltgt}" != "en" ] && [ "${lsrc}" != "${ltgt}" ] ; then
      bitext="${bucc}.${lsrc}-${ltgt}.train.extracted.th${th}.csv"
      if [ ! -s "${bitext}" ] ; then
        echo "Extracting bitexts for ${lsrc}-${ltgt}"
        python3 "${LASER}/source/mine_bitexts.py" \
          "${edir}/${bucc}.${lsrc}-en.train.txt.${lsrc}" \
          "${edir}/${bucc}.${ltgt}-en.train.txt.${ltgt}" \
          --src-lang "${lsrc}" --trg-lang "${ltgt}" \
          --src-embeddings "${edir}/${bucc}.${lsrc}-en.train.enc.${lsrc}" \
          --trg-embeddings "${edir}/${bucc}.${ltgt}-en.train.enc.${ltgt}" \
          --unify --mode mine --retrieval max --margin ratio -k 4 \
          --output "${bitext}" --threshold "${th}" \
          --verbose --gpu
      fi
    fi
  done
done