#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
#
# bash script to download and extract the XNLI and MultiNLI corpora
# Abort early if the LASER toolkit location is not configured:
# every model/encoder path below is resolved relative to it.
if [ -z "${LASER+x}" ] ; then
  echo "Please set the environment variable 'LASER'" >&2
  exit 1   # fix: exit with non-zero status so callers can detect the failure
fi

# corpus names and download locations
xnli="XNLI-1.0"       # human-translated XNLI dev/test sets
xnli_mt="XNLI-MT-1.0" # machine-translated XNLI training data
xnli_http="https://dl.fbaipublicfiles.com/XNLI"
mnli_http="https://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip"
# the 15 XNLI languages
languages=("en" "fr" "es" "de" "el" "bg" "ru" "tr" "ar" "vi" "th" "zh" "hi" "sw" "ur")
edir="embed"          # output directory for extracted text and embeddings

# LASER sentence encoder
model_dir="${LASER}/models"
encoder="${model_dir}/bilstm.93langs.2018-12-26.pt"
bpe_codes="${model_dir}/93langs.fcodes"

# NLI classifier params
N=200            # number of training epochs
nhid="512 384"   # hidden layer sizes (space-separated string; word-split on use)
drop=0.3         # dropout rate
seed=159753      # RNG seed for reproducibility
bsize=128        # batch size
lr=0.001         # learning rate
##############################################################################################
# get the XNLI dev and test corpus in 15 languages
# Download the XNLI dev/test corpus (15 languages) and split each tsv into
# per-language premise / hypothesis / class-label files under ${edir}.
ExtractXNLI () {
  echo "Installing XNLI"
  if [ ! -s "${xnli}/xnli.test.tsv" ] ; then
    echo " - Downloading "
    wget -q "${xnli_http}/${xnli}.zip"
    echo " - unzip "
    unzip -q "${xnli}.zip"
    # fix: the junk directory that unzip extracts from macOS-built zips is
    # named "__MACOSX"; the original "__MACOS" never matched anything
    /bin/rm -rf __MACOSX "${xnli}.zip"
  fi
  for lang in "${languages[@]}" ; do
    for part in "dev" "test" ; do
      if [ ! -f "${edir}/xnli.${part}.prem.${lang}" ] ; then
        echo " - extracting xnli.${part}.${lang}"
        # tsv layout: col 1 = language, col 2 = gold label,
        #             col 7 = premise, col 8 = hypothesis
        tail -n +2 "${xnli}/xnli.${part}.tsv" \
          | grep "^${lang}" | cut -f7 \
          > "${edir}/xnli.${part}.prem.${lang}"
        tail -n +2 "${xnli}/xnli.${part}.tsv" \
          | grep "^${lang}" | cut -f8 \
          > "${edir}/xnli.${part}.hyp.${lang}"
        # map textual labels to the numeric classes the classifier expects
        tail -n +2 "${xnli}/xnli.${part}.tsv" \
          | grep "^${lang}" | cut -f2 \
          | sed -e 's/entailment/0/' -e 's/neutral/1/' -e 's/contradiction/2/' \
          > "${edir}/xnli.${part}.cl.${lang}"
      fi
    done
  done
}
##############################################################################################
# https://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip
# MT translated data is already tokenized !
# Download the machine-translated XNLI training data (already tokenized) and
# split the English multinli tsv into premise / hypothesis / label files.
ExtractXNLI_MT () {
  echo "Installing XNLI MT"
  if [ ! -d "${xnli_mt}/multinli" ] ; then
    echo " - Downloading "
    wget -q "${xnli_http}/${xnli_mt}.zip"
    echo " - unzip "
    unzip -q "${xnli_mt}.zip"
    # fix: remove "__MACOSX" (the actual name of the macOS junk directory);
    # the original "__MACOS" never matched anything
    /bin/rm -rf __MACOSX "${xnli_mt}.zip"
  fi
  part="train"
  for lang in "en" ; do
    if [ ! -f "${edir}/multinli.${part}.prem.${lang}.gz" ] ; then
      echo " - extracting ${part}.${lang}"
      # tsv layout: col 1 = premise, col 2 = hypothesis, col 3 = label
      tail -n +2 "${xnli_mt}/multinli/multinli.${part}.${lang}.tsv" \
        | cut -f1 > "${edir}/multinli.${part}.prem.${lang}"
      tail -n +2 "${xnli_mt}/multinli/multinli.${part}.${lang}.tsv" \
        | cut -f2 > "${edir}/multinli.${part}.hyp.${lang}"
      # NB: this corpus spells the label "contradictory", not "contradiction"
      tail -n +2 "${xnli_mt}/multinli/multinli.${part}.${lang}.tsv" \
        | cut -f3 \
        | sed -e 's/entailment/0/' -e 's/neutral/1/' -e 's/contradictory/2/' \
        > "${edir}/multinli.${part}.cl.${lang}"
    fi
  done
}
##############################################################################################
# https://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip
# MT translated data is already tokenized !
# Download MultiNLI and extract the English training premises/hypotheses
# (gzip-compressed) plus the numeric class labels into ${edir}.
ExtractMNLI () {
  echo "Installing MultiNLI"
  train_txt="multinli_1.0/multinli_1.0_train.txt"
  if [ ! -d "${edir}" ] ; then mkdir -p "${edir}"; fi
  # the label file is written last, so its presence implies the others exist
  if [ ! -f "${edir}/xnli.train.cl.en" ] ; then
    echo " - Downloading"
    wget -q "${mnli_http}"
    echo " - unzip"
    unzip -q multinli_1.0.zip "${train_txt}"
    echo " - extracting"
    # txt layout: col 1 = gold label, col 6 = premise, col 7 = hypothesis
    tail -n +2 "${train_txt}" | cut -f6 | gzip > "${edir}/xnli.train.prem.en.gz"
    tail -n +2 "${train_txt}" | cut -f7 | gzip > "${edir}/xnli.train.hyp.en.gz"
    tail -n +2 "${train_txt}" | cut -f1 \
      | sed -e 's/entailment/0/' -e 's/neutral/1/' -e 's/contradiction/2/' \
      > "${edir}/xnli.train.cl.en"
    # consistency with the other Extract* helpers: drop the downloaded archive
    /bin/rm -f multinli_1.0.zip
  fi
}
##############################################################################################
# Prepare the output directory, fetch the corpora, compute the sentence
# embeddings, then train the NLI classifier on several training fractions.
if [ ! -d "${edir}" ] ; then mkdir -p "${edir}"; fi

ExtractXNLI
ExtractMNLI

# calculate embeddings (jieba is needed for Chinese tokenization)
export PYTHONPATH="$PYTHONPATH:$LASER/tools-external/jieba"
python3 xnli.py --data_dir "${edir}" --lang "${languages[@]}" \
  --bpe_codes "${bpe_codes}" --encoder "${encoder}" --verbose

#for fr in 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 ; do
for fr in 0.6 0.7 0.8 0.9 ; do
  echo -e "\nTraining the classifier (see ${edir}/xnli.fract${fr}.log)"
  # shellcheck disable=SC2086 -- ${nhid[@]} is a space-separated scalar whose
  # word-splitting into "512 384" is intentional; do not quote it
  python3 "${LASER}/source/nli.py" -b "${edir}" \
    --train xnli.train.%s.enc.en --train-labels xnli.train.cl.en \
    --dev xnli.dev.%s.enc.en --dev-labels xnli.dev.cl.en \
    --test xnli.test.%s.enc --test-labels xnli.test.cl --lang "${languages[@]}" \
    --nhid ${nhid[@]} --dropout "${drop}" --bsize "${bsize}" \
    --seed "${seed}" --lr "${lr}" --nepoch "${N}" \
    --cross-lingual \
    --fraction "$fr" \
    --save-outputs "${edir}/xnli.fract${fr}.outputs" \
    --gpu 1 > "${edir}/xnli.fract${fr}.log"
done