#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
#
# bash script to mine for bitexts in the BUCC corpus
if [ -z "${LASER+x}" ] ; then
  echo "Please set the environment variable 'LASER'"
  exit 1
fi
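# Expected input: the BUCC 2018 shared-task archives (*.tar.bz2), as
# distributed by the organizers, placed in ./downloaded (xdir below);
# all other files are derived from them.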
# general config
bucc="bucc2018"
data="."
xdir=${data}/downloaded  # tar files as distributed by the BUCC evaluation
ddir=${data}/${bucc} # raw texts of BUCC
edir=${data}/embed # normalized texts and embeddings
langs=("fr" "de" "ru" "zh")
ltrg="en" # English is always the 2nd language
# encoder
model_dir="${LASER}/models"
encoder="${model_dir}/bilstm.93langs.2018-12-26.pt"
bpe_codes="${model_dir}/93langs.fcodes"
###################################################################
#
# Extract files with labels and texts from the BUCC corpus
#
###################################################################
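# The raw BUCC files are tab-separated, one sentence per line, with the
# sentence ID in the first column and the text in the second (hence the
# cut -f1 / cut -f2 calls below).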
GetData () {
  fn1=$1; fn2=$2; lang=$3
  outf="${edir}/${bucc}.${lang}-${ltrg}.${fn2}"
  for ll in ${ltrg} ${lang} ; do
    inf="${ddir}/${fn1}.${ll}"
    if [ ! -f ${outf}.txt.${ll} ] ; then
      echo " - extract files ${outf} in ${ll}"
      cut -f1 ${inf} > ${outf}.id.${ll}
      cut -f2 ${inf} > ${outf}.txt.${ll}
    fi
  done
}
ExtractBUCC () {
  slang=$1
  tlang=${ltrg}
  pushd ${data} > /dev/null
  if [ ! -d ${ddir}/${slang}-${tlang} ] ; then
    for tf in ${xdir}/${bucc}-${slang}-${tlang}.*.tar.bz2 ; do
      echo " - extract from tar $(basename ${tf})"
      tar jxf ${tf}
    done
  fi
  GetData "${slang}-${tlang}/${slang}-${tlang}.sample" "dev" ${slang}
  GetData "${slang}-${tlang}/${slang}-${tlang}.training" "train" ${slang}
  GetData "${slang}-${tlang}/${slang}-${tlang}.test" "test" ${slang}
  popd > /dev/null
}
###################################################################
#
# Tokenize and Embed
#
###################################################################
Embed () {
  ll=$2
  txt="$1.txt.${ll}"
  enc="$1.enc.${ll}"
  if [ ! -s ${enc} ] ; then
    cat ${txt} | python3 ${LASER}/source/embed.py \
      --encoder ${encoder} \
      --token-lang ${ll} \
      --bpe-codes ${bpe_codes} \
      --output ${enc} \
      --verbose
  fi
}
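# The *.enc.* files written by embed.py are raw binary matrices of float32
# sentence embeddings (1024 dimensions for the bilstm.93langs encoder).
# Assuming that layout, a file can be sanity-checked with e.g.:
#   python3 -c "import numpy as np; print(np.fromfile('embed/bucc2018.fr-en.train.enc.fr', np.float32).reshape(-1, 1024).shape)"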
###################################################################
#
# Mine for bitexts
#
###################################################################
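# mine_bitexts.py scores candidate pairs with the ratio margin criterion:
# the cosine similarity of a pair divided by the average similarity to its
# k=4 nearest neighbors in both directions (--margin ratio -k 4);
# --unify deduplicates identical sentences before mining. Candidates are
# written with their margin score so a threshold can be applied later.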
Mine () {
  bn=$1
  l1=$2
  l2=$3
  cand="${bn}.candidates.tsv"
  if [ ! -s ${cand} ] ; then
    python3 ${LASER}/source/mine_bitexts.py \
      ${bn}.txt.${l1} ${bn}.txt.${l2} \
      --src-lang ${l1} --trg-lang ${l2} \
      --src-embeddings ${bn}.enc.${l1} --trg-embeddings ${bn}.enc.${l2} \
      --unify --mode mine --retrieval max --margin ratio -k 4 \
      --output ${cand} \
      --verbose --gpu
  fi
}
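# Each line of the resulting candidates.tsv is expected to carry the margin
# score followed by the source and target sentences, tab-separated.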
###################################################################
#
# Main loop
#
###################################################################
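# For each source language: extract the corpus, embed and mine the training
# portion, tune the extraction threshold against the training gold
# alignments, then embed, mine and extract the test portion with that
# threshold.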
echo -e "\nProcessing BUCC data in ${data}"
# create output directories
for d in ${ddir} ${edir} ; do
  mkdir -p ${d}
done
for lsrc in "${langs[@]}" ; do
  ExtractBUCC ${lsrc}
  # Tokenize and embed train
  bname="${bucc}.${lsrc}-${ltrg}"
  part="${bname}.train"
  Embed ${edir}/${part} ${lsrc}
  Embed ${edir}/${part} ${ltrg}
  # mine for texts in train
  Mine ${edir}/${part} ${lsrc} ${ltrg}
  # optimize threshold on BUCC training data and provided gold alignments
  if [ ! -s ${part}.log ] ; then
    python3 bucc.py \
      --src-lang ${lsrc} --trg-lang ${ltrg} \
      --bucc-texts ${edir}/${part}.txt \
      --bucc-ids ${edir}/${part}.id \
      --candidates ${edir}/${part}.candidates.tsv \
      --gold ${ddir}/${lsrc}-${ltrg}/${lsrc}-${ltrg}.training.gold \
      --verbose \
      | tee ${part}.log
  fi
  # Tokenize and embed test
  part="${bname}.test"
  Embed ${edir}/${part} ${lsrc}
  Embed ${edir}/${part} ${ltrg}
  # mine for texts in test
  Mine ${edir}/${part} ${lsrc} ${ltrg}
  # extract test bitexts with the threshold optimized on train
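  # bucc.py is assumed to print a log line containing "best threshold"
  # followed by the numeric value; the pipeline below extracts that number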
  th=$(grep 'best threshold' ${bname}.train.log | sed -e 's/[=:]/ /g' | awk '{print $4}')
  extracted="${edir}/${part}.extracted.tsv"
  if [ ! -s ${extracted} ] ; then
    python3 bucc.py \
      --src-lang ${lsrc} --trg-lang ${ltrg} \
      --bucc-texts ${edir}/${part}.txt \
      --bucc-ids ${edir}/${part}.id \
      --candidates ${edir}/${part}.candidates.tsv \
      --threshold ${th} --output ${extracted} \
      --verbose
  fi
done
# Bonus: mine bitexts for all non-English language pairs,
# using a (conservative) fixed threshold of 1.1.
# All texts and embeddings were already produced in the main loop above;
# since the embedding space is language agnostic, the embeddings computed
# against English can be reused directly for any language pair.
th=1.1
for lsrc in "${langs[@]}" ; do
  for ltrg in "${langs[@]}" ; do
    if [ ${lsrc} != "en" ] && [ ${ltrg} != "en" ] && [ ${lsrc} != ${ltrg} ] ; then
      bitext="${bucc}.${lsrc}-${ltrg}.train.extracted.th${th}.csv"
      if [ ! -s ${bitext} ] ; then
        echo "Extracting bitexts for ${lsrc}-${ltrg}"
        python3 ${LASER}/source/mine_bitexts.py \
          ${edir}/${bucc}.${lsrc}-en.train.txt.${lsrc} \
          ${edir}/${bucc}.${ltrg}-en.train.txt.${ltrg} \
          --src-lang ${lsrc} --trg-lang ${ltrg} \
          --src-embeddings ${edir}/${bucc}.${lsrc}-en.train.enc.${lsrc} \
          --trg-embeddings ${edir}/${bucc}.${ltrg}-en.train.enc.${ltrg} \
          --unify --mode mine --retrieval max --margin ratio -k 4 \
          --output ${bitext} --threshold ${th} \
          --verbose --gpu
      fi
    fi
  done
done