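# Compute tokenization statistics for XNLI with the XLM-R SentencePiece tokenizer:
# average sentence/example token lengths and the rate of subword types that differ
# between deterministic tokenization and subword sampling (nbest_size / alpha).
# Note: `xglue_processors` is not part of stock transformers; this script assumes a
# fork (e.g. the XGLUE codebase) that provides it.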
import argparse
import json
import random

from transformers import XLMRobertaTokenizer
from transformers import xglue_processors as processors

if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--nbest_size", type=int, default=-1, help="n-best size used when sampling subword sequences"
    )
    parser.add_argument(
        "--alpha", type=float, default=0.2, help="smoothing parameter alpha for subword sampling"
    )
    parser.add_argument(
        "--train_language", default=None, type=str, help="Train language if it is different from the evaluation language."
    )
    parser.add_argument(
        "--language",
        default=None,
        type=str,
        required=True,
        help="Evaluation language. Also train language if `train_language` is set to None.",
    )
    parser.add_argument(
        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model",
    )
    parser.add_argument(
        "--sample_rounds",
        default=1,
        type=int,
        required=True,
        help="Number of times the training set is duplicated when sampling subword sequences",
    )
    args = parser.parse_args()

    tokenizer = XLMRobertaTokenizer.from_pretrained(
        args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=None,
    )

    task = "xnli"
    processor = processors[task](language=args.train_language, train_language=args.train_language)
    examples = processor.get_train_examples(args.data_dir)
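    # Count subword types produced by deterministic (best-segmentation) tokenization
    # of the training set; this serves as the reference vocabulary below.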
    train_word_cnt_origin = {}
    for example in examples:
        tokens_a = tokenizer.tokenize(example.text_a, add_special_tokens=True)
        tokens_b = tokenizer.tokenize(example.text_b, add_special_tokens=True)
        for token in tokens_a + tokens_b:
            if token not in train_word_cnt_origin:
                train_word_cnt_origin[token] = 0
            train_word_cnt_origin[token] += 1
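    # Duplicate the training examples `sample_rounds` times so each example is
    # tokenized that many times with subword sampling.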
    all_examples = []
    for i in range(args.sample_rounds):
        all_examples += examples
    examples = all_examples
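    # Re-tokenize with SentencePiece subword sampling (nbest_size / alpha; this assumes
    # a tokenizer build that forwards these arguments to SentencePiece), count the sampled
    # subword types, and print the average sentence and example token lengths.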
    sent_len = []
    example_len = []
    train_word_cnt = {}
    for example in examples:
        tokens_a = tokenizer.tokenize(example.text_a, add_special_tokens=True, nbest_size=args.nbest_size,
                                      alpha=args.alpha)
        tokens_b = tokenizer.tokenize(example.text_b, add_special_tokens=True, nbest_size=args.nbest_size,
                                      alpha=args.alpha)
        for token in tokens_a + tokens_b:
            if token not in train_word_cnt:
                train_word_cnt[token] = 0
            train_word_cnt[token] += 1
        sent_len += [len(tokens_a), len(tokens_b)]
        example_len += [len(tokens_a) + len(tokens_b)]
    print(sum(sent_len) / len(sent_len))
    print(sum(example_len) / len(example_len))
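    # "extra" OOV rate: fraction of sampled tokens whose subword type never appears
    # in the deterministic tokenization of the training set.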
    total = 0
    n_oov = 0
    for token in train_word_cnt:
        if token not in train_word_cnt_origin:
            n_oov += train_word_cnt[token]
            # n_oov += 1
        total += train_word_cnt[token]
        # total += 1
    print("{} oov rate: {}".format("extra", n_oov / total))
    total = 0
    n_oov = 0
    for token in train_word_cnt_origin:
        if token not in train_word_cnt:
            n_oov += train_word_cnt_origin[token]
            # n_oov += 1
        total += train_word_cnt_origin[token]
        # total += 1
    print("{} oov rate: {}".format("origin", n_oov / total))
    # exit(0)
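    # For each evaluation language, report average token lengths on the validation
    # split and its OOV rate against the sampled training vocabulary.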
    eval_datasets = []
    eval_langs = args.language.split(',')
    for split in ["valid"]:
        for lang in eval_langs:
            eval_datasets.append((split, lang))
    for split, lang in eval_datasets:
        processor = processors[task](language=lang, train_language=lang)
        examples = processor.get_valid_examples(args.data_dir)
        sent_len = []
        example_len = []
        valid_word_cnt = {}
        for example in examples:
            tokens_a = tokenizer.tokenize(example.text_a, add_special_tokens=True)
            tokens_b = tokenizer.tokenize(example.text_b, add_special_tokens=True)
            for token in tokens_a + tokens_b:
                if token not in valid_word_cnt:
                    valid_word_cnt[token] = 0
                valid_word_cnt[token] += 1
            sent_len += [len(tokens_a), len(tokens_b)]
            example_len += [len(tokens_a) + len(tokens_b)]
        print(sum(sent_len) / len(sent_len))
        print(sum(example_len) / len(example_len))
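        # Validation OOV rate: fraction of validation tokens whose subword type never
        # appears in the sampled training tokenization.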
        total = 0
        n_oov = 0
        for token in valid_word_cnt:
            if token not in train_word_cnt:
                n_oov += valid_word_cnt[token]
                # n_oov += 1
            total += valid_word_cnt[token]
            # total += 1
        print("{} oov rate: {}".format(lang, n_oov / total))