#!/bin/python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER  Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
#
# Tool to extract a subset of mined bitexts from a tsv.gz file
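#
# Example invocation (script and file names are illustrative):
#   python3 extract.py --tsv WikiMatrix.en-fr.tsv.gz --bitext bitexts \
#     --src-lang en --trg-lang fr --threshold 1.04
# which writes the retained pairs to bitexts.en and bitexts.fr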

import gzip
import argparse

###############################################################################
#
# Main
#
###############################################################################

parser = argparse.ArgumentParser(description='Tool to extract bitexts from WikiMatrix')
parser.add_argument('--encoding', default='utf-8',
    help='character encoding for input/output')
parser.add_argument('--tsv', type=str, required=True,
    help='File with mined bitexts')
parser.add_argument('--bitext', type=str, required=True,
    help='Text file after sentence splitting')
parser.add_argument('--src-lang', type=str, required=True,
    help='Source language')
parser.add_argument('--trg-lang', type=str, required=True,
    help='Target language')
parser.add_argument('--threshold', type=float, default=1.05,
    help='Threshold on margin score')
parser.add_argument('--nb-sents', type=int, default=999999999,
    help='Maximal number of sentences')
parser.add_argument('--nb-words-src', type=int, default=999999999,
    help='Maximal number of total words in the source language')
parser.add_argument('--nb-words-trg', type=int, default=999999999,
    help='Maximal number of total words in the target language')
args = parser.parse_args()

print('Tool to extract bitexts from WikiMatrix')

fields = None   # fields of the last line read, for the final report
nl = 0          # number of sentence pairs written
nw_src = 0      # running total of source-side words
nw_trg = 0      # running total of target-side words
print('Processing {}'.format(args.tsv))
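# Each line of the mined TSV is expected to hold three tab-separated fields:
# margin score, source sentence, target sentence.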
with gzip.open(args.tsv, 'rt', encoding=args.encoding) as tsv:
    with open(args.bitext + '.' + args.src_lang, 'wt', encoding=args.encoding) as fsrc:
        with open(args.bitext + '.' + args.trg_lang, 'wt', encoding=args.encoding) as ftrg:
            while nl < args.nb_sents:
                line = tsv.readline()
                if not line:
                    break
                fields = line.split('\t')
                cur_src = len(fields[1].split())
                cur_trg = len(fields[2].split())
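                # WikiMatrix TSVs are assumed to be sorted by decreasing
                # margin score, which is why the loop can stop at the first
                # score below the threshold instead of filtering every line.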
                if float(fields[0]) < args.threshold:
                    break
                # stop once either word budget would be exceeded, so the
                # output is always a contiguous, highest-scoring prefix
                if nw_src + cur_src > args.nb_words_src:
                    break
                if nw_trg + cur_trg > args.nb_words_trg:
                    break
                fsrc.write(fields[1].strip() + '\n')
                ftrg.write(fields[2].strip() + '\n')
                nw_src += cur_src
                nw_trg += cur_trg
                nl += 1
                if nl % 100000 == 0:
                    print('\r - {:d} lines read'.format(nl), end='', flush=True)

print('\r - wrote {:d} lines'.format(nl))
print(' - with {:d} source and {:d} target words'.format(nw_src, nw_trg))
if fields is not None:
    # guard against a NameError when the input file was empty
    print(' - last threshold is {:.4f}'.format(float(fields[0])))