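"""Export FrameNet annotations to JSON.

Loads the full-text and exemplar annotations through the NLTK FrameNet
corpus (v1.5 or v1.7) and writes them out as JSON files.
"""
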
import gzip
import json
import logging
import os
from argparse import ArgumentParser
from itertools import accumulate

import nltk
import numpy as np
from tqdm import tqdm

from tools.framenet.fn_util import framenet_split, Sentence
from tools.framenet.nltk_framenet import framenet, framenet15

logger = logging.getLogger('fn')


def _load_raw(version):
    """Return the FrameNet corpus object for `version`, downloading the NLTK data if needed."""
    if version == '1.5':
        nltk.download('framenet_v15')
        return framenet15
    else:
        nltk.download('framenet_v17')
        return framenet


def one_frame(sentence, ann):
    """Convert a single annotation set into a frame dict: target span, lexical unit, and frame elements."""
    frame_info = {'label': ann.frame.name}
    target_list = list()
    for start, end in ann.Target:
        # Convert the annotated (start, end) offsets into token-level indices.
        start, end = sentence.span(start, end)
        target_list.extend(range(start, end + 1))
    assert len(target_list) > 0
    frame_info['span'] = [min(target_list), max(target_list)]
    frame_info['lu'] = ann.LU.name
    frame_info['children'] = fes = list()
    for start, end, fe_name in ann.FE[0]:
        start, end = sentence.span(start, end)
        fes.append({'span': [start, end], 'label': fe_name})
    return frame_info
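
# A returned frame dict looks roughly like this (values are illustrative only):
# {'label': 'Arriving', 'span': [1, 1], 'lu': 'arrive.v',
#  'children': [{'span': [0, 0], 'label': 'Theme'}]}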


def load_nltk_exemplars(version, exclude_ann_ids=None):
    """Load exemplar sentences, skipping annotation IDs listed in `exclude_ann_ids`."""
    exclude_ann_ids = set(exclude_ann_ids or [])
    fn = _load_raw(version)
    egs = list()
    bar = tqdm()
    skipped = 0
    try:
        for eg in fn.annotations(full_text=False):
            if 'Target' not in eg.keys():
                # Without a target there is nothing to convert into a frame annotation.
                continue
            if eg.ID in exclude_ann_ids:
                skipped += 1
                continue
            try:
                sentence = Sentence(eg.text)
                egs.append({
                    'tokens': list(map(str, sentence.tokens)),
                    'annotations': [one_frame(sentence, eg)],
                    'meta': {
                        'fully_annotated': False,
                        'source': f'framenet_v{version}',
                        'with_fe': True,
                        'type': 'exemplar',
                        'ann_ids': [eg.ID],
                    }
                })
                bar.update()
            except Exception:
                # Skip exemplars that fail to convert.
                pass
    except Exception:
        # If iteration itself raises, fall through and keep whatever was collected so far.
        pass
    bar.close()
    logger.info(f'Loaded {len(egs)} sentences for framenet v{version} from exemplars (skipped {skipped} sentences).')
    return egs


def load_nltk_fully_annotated(version):
    """Load full-text annotations and group sentences into the train/dev/test splits."""
    fn = _load_raw(version)

    splits = list(framenet_split.keys())
    all_containers = {split: [] for split in splits}
    for doc in tqdm(fn.docs()):
        # Documents not listed in any split default to the training set.
        container = all_containers['train']
        for sp in splits:
            if doc.filename in framenet_split[sp]:
                container = all_containers[sp]

        for sent in doc.sentence:
            sentence = Sentence(sent.text)
            all_frames = list()
            ann_ids = []
            for ann in sent.annotationSet:
                if ann._type == 'posannotationset':
                    continue
                assert ann._type == 'fulltext_annotationset'
                if 'Target' not in ann.keys():
                    logger.warning('Target not found.')
                    continue
                if 'ID' in ann:
                    ann_ids.append(ann['ID'])
                frame_info = one_frame(sentence, ann)
                all_frames.append(frame_info)
            eg_dict = {
                'tokens': list(map(str, sentence.tokens)),
                'annotations': all_frames,
                'meta': {
                    'source': f'framenet_v{version}',
                    'fully_annotated': True,
                    'with_fe': True,
                    'type': 'full text',
                    'sentence ID': sent.ID,
                    'doc': doc.filename,
                    'ann_ids': ann_ids,
                }
            }
            container.append(eg_dict)

    for sp in splits:
        logger.info(f'Loaded {len(all_containers[sp])} sentences for {sp}.')
    return all_containers
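
# Each record in a split container has the shape (values illustrative):
# {'tokens': [...], 'annotations': [<one_frame() dicts>],
#  'meta': {'source': 'framenet_v1.7', 'fully_annotated': True, 'type': 'full text',
#           'sentence ID': ..., 'doc': ..., 'ann_ids': [...]}}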


def load_expanded_fn(path):
    # Not supported yet; the code below is kept as a draft.
    raise NotImplementedError
    with gzip.open(path, 'rb') as compressed:
        lines = compressed.read().decode()
    instances = list()
    for line in tqdm(lines.split('\n')):
        if len(line) != 0:
            instances.append(json.loads(line))
    logger.info(f'{len(instances)} lines loaded.')

    dataset = list()
    for instance in tqdm(instances, desc='Processing expanded framenet...'):
        for output in instance['outputs']:
            ins_dict = dict()
            ins_dict['meta'] = {
                'source': 'expanded framenet',
                'type': 'paraphrase',
                'exemplar_id': instance['exemplar_id'],
                'annoset_id': instance['annoset_id'],
            }
            words = output['output_string']
            text = ' '.join(words)
            # length_offsets[i] + i is the character offset of words[i] in the space-joined text.
            length_offsets = [0] + list(accumulate(map(len, words)))
            start_idx, end_idx = output['output_trigger_offset']
            start_idx = length_offsets[start_idx] + start_idx
            end_idx = length_offsets[end_idx] + end_idx - 2
            sentence = Sentence(text)
            ins_dict['text'] = sentence.tokens
            ins_dict['pos'] = sentence.pos
            ins_dict['tag'] = sentence.tag
            target_start, target_end = sentence.span(start_idx, end_idx)
            ins_dict['frame'] = [{
                'name': instance['frame_name'],
                'target': list(range(target_start, target_end + 1)),
                'lu': output['output_trigger'],
                'fe': []
            }]
            ins_dict['score'] = {
                'pbr': np.exp(-output['pbr_score']),
                'aligner': output['aligner_score'],
            }
            ins_dict['with_fe'] = False
            ins_dict['fully_annotated'] = False
            dataset.append(ins_dict)
    logger.info(f'{len(dataset)} sentences loaded.')
    return dataset
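
# Example invocation (the script filename is illustrative):
#   python prepare_fn_data.py <output_dir> -v 1.7
# writes full.17.json and exe.17.json into <output_dir>.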


if __name__ == '__main__':
    logging.basicConfig(level='INFO')
    arg_parser = ArgumentParser()
    arg_parser.add_argument('output', type=str)
    arg_parser.add_argument('-v', type=str, default='1.7')
    cmd_args = arg_parser.parse_args()

    full = load_nltk_fully_annotated(cmd_args.v)
    # Collect the annotation-set IDs already covered by the full-text data
    # so the exemplar dump does not duplicate them.
    full_ann_ids = list()
    for split in ['train', 'dev', 'test']:
        for sent in full[split]:
            full_ann_ids.extend(sent['meta']['ann_ids'])
    exe = load_nltk_exemplars(cmd_args.v, full_ann_ids)

    os.makedirs(cmd_args.output, exist_ok=True)
    version_tag = cmd_args.v.replace('.', '')
    with open(os.path.join(cmd_args.output, f'full.{version_tag}.json'), 'w') as fp:
        json.dump(full, fp)
    with open(os.path.join(cmd_args.output, f'exe.{version_tag}.json'), 'w') as fp:
        json.dump(exe, fp)