#!/bin/python3 # Copyright (c) Facebook, Inc. and its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. # # LASER Language-Agnostic SEntence Representations # is a toolkit to calculate multilingual sentence embeddings # and to use them for document classification, bitext filtering # and mining # # -------------------------------------------------------- # # Tool to extract subset of mined bitexts in a tsv.gz file import os import sys import gzip import argparse ############################################################################### # # Main # ############################################################################### parser = argparse.ArgumentParser(description='Tool to extract bitext from the WikiMatrix') parser.add_argument('--encoding', default='utf-8', help='character encoding for input/output') parser.add_argument('--tsv', type=str, required=True, help='File with mined bitexts') parser.add_argument('--bitext', type=str, required=True, help='Text file after sentence splitting') parser.add_argument('--src-lang', type=str, required=True, help='Source language') parser.add_argument('--trg-lang', type=str, required=True, help='Traget language') parser.add_argument('--threshold', type=float, default=1.05, help='Threshold on margin score') parser.add_argument('--nb-sents', type=int, default=999999999, help='Maximal number of sentences') parser.add_argument('--nb-words-src', type=int, default=999999999, help='Maxmimal numer of total words in the source language') parser.add_argument('--nb-words-trg', type=int, default=999999999, help='Maxmimal numer of total words in the target language') args = parser.parse_args() print('Tool to extract bitext from the WikiMatrix') nl = 0 nw_src = 0 nw_trg = 0 print('Processing {}'.format(args.tsv)) with gzip.open(args.tsv, 'rt', encoding=args.encoding) as tsv: with open(args.bitext + '.' + args.src_lang, 'wt', encoding=args.encoding) as fsrc: with open(args.bitext + '.' + args.trg_lang, 'wt', encoding=args.encoding) as ftrg: while nl < args.nb_sents: line = tsv.readline() if not line: break fields = line.split('\t') cur_src = len(fields[1].split()) cur_trg = len(fields[2].split()) if float(fields[0]) < args.threshold: break if nw_src + cur_src > args.nb_words_src: break if nw_trg + cur_trg > args.nb_words_trg: break fsrc.write(fields[1].strip() + '\n') ftrg.write(fields[2].strip() + '\n') nw_src += cur_src nw_trg += cur_trg nl += 1 if nl % 100000 == 0: print('\r - {:d} lines read'.format(nl), end='') print('\r - wrote {:d} lines'.format(nl)) print(' - with {:d} source and {:d} target words'.format(nw_src, nw_trg)) print(' - last threshold is {:.4f}'.format(float(fields[0])))