Spaces:
Sleeping
Sleeping
import argparse | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
# Required parameters | |
parser.add_argument( | |
"--translation_path", | |
default=None, | |
type=str, | |
required=True, | |
help="", | |
) | |
drop_languages = ["en", "zh-CN", "zh", "ja", "ko", "th", "my", "ml", "ta"] | |
translate_languages = None | |
args = parser.parse_args() | |
src2tgt = {} | |
print("Reading translation from {}".format(args.translation_path)) | |
with open(args.translation_path, encoding="utf-8") as f: | |
cnt = 0 | |
for line in f: | |
cnt += 1 | |
if cnt % 10000 == 0: | |
print("Reading lines {}".format(cnt)) | |
items = line.split("\t") | |
if items == 3: | |
src_sent, tgt_lang, tgt_sent = line.split("\t") | |
alignment = None | |
else: | |
src_sent, tgt_lang, tgt_sent, alignment_str = line.split("\t") | |
alignment = [] | |
for x in alignment_str.split(" "): | |
alignment.append((int(x.split("/")[0]), int(x.split("/")[1]))) | |
if tgt_lang in drop_languages: | |
continue | |
if translate_languages is not None and tgt_lang not in translate_languages: | |
continue | |
cnt_src = {} | |
cnt_tgt = {} | |
for x in alignment: | |
if x[0] not in cnt_src: | |
cnt_src[x[0]] = 0 | |
cnt_src[x[0]] += 1 | |
if x[1] not in cnt_tgt: | |
cnt_tgt[x[1]] = 0 | |
cnt_tgt[x[1]] += 1 | |
if not (cnt_src[x[0]] <= 1 or cnt_tgt[x[1]] <= 1): | |
print(cnt_src, cnt_tgt) | |
print(alignment) | |
print(src_sent, tgt_sent) | |
assert cnt_src[x[0]] <= 1 or cnt_tgt[x[1]] <= 1 | |