# NOTE(review): removed non-Python extraction residue that preceded the module
# (file-size line, commit hash, and a flattened line-number gutter).
import argparse
import os
from collections import defaultdict
from typing import Dict, Any
from concrete.util import CommunicationWriterTGZ
from nltk.corpus import framenet, framenet15
from tqdm import tqdm
from sftp.data_reader.concrete_srl import concrete_doc
from tools.framenet.fn_util import framenet_split, Sentence as TokSentence
def process_sentence(sent) -> Dict[str, Any]:
    """Convert one FrameNet sentence into a plain dict of text, token spans,
    and frame annotations.

    :param sent: nltk FrameNet sentence object; may carry an ``annotationSet``
        list (full-text docs) and/or a top-level ``Target`` (exemplars).
    :return: dict with keys:
        - 'sentence': raw sentence text,
        - 'tokenization': list of (char_start, char_end) pairs, end-inclusive,
        - 'annotations': list of {'label': frame name,
                                  'span': (first_tok, last_tok),
                                  'children': [{'span': ..., 'label': FE name}]}.
    """
    ret = {'sentence': sent.text, 'tokenization': list(), 'annotations': list()}
    tok_sent = TokSentence(sent.text)
    # Character spans are stored end-inclusive, hence the -1.
    for token in tok_sent.tokens:
        ret['tokenization'].append((token.idx, token.idx_end - 1))

    def process_one_ann_set(ann_set):
        # One annotation set == one frame instance (target + frame elements).
        ret['annotations'].append(event := {'label': ann_set.frame.name, 'children': (arg_list := list())})
        target_list = list()
        for tar_start, tar_end in ann_set.Target:
            # Hoisted: the original called tok_sent.span(...) twice per target.
            span_start, span_end = tok_sent.span(tar_start, tar_end)
            target_list.extend(range(span_start, span_end + 1))
        target_list.sort()
        # Targets may be discontinuous; the span covers first..last target token.
        event['span'] = (target_list[0], target_list[-1])
        for fe_start, fe_end, fe_name in ann_set.FE[0]:
            fe_start, fe_end = tok_sent.span(fe_start, fe_end)
            arg_list.append({
                'span': (fe_start, fe_end),
                'label': fe_name
            })

    if 'annotationSet' in sent:
        # Full-text sentence: possibly several frame annotation sets.
        for ann_item in sent.annotationSet:
            if 'Target' not in ann_item:
                continue  # skip layers without a frame target (e.g. POS-only sets)
            process_one_ann_set(ann_item)
    if 'Target' in sent:
        # Exemplar sentence: the sentence object itself is one annotation set.
        process_one_ann_set(sent)
    return ret
def process_doc(docs, dst_path: str):
    """Serialize FrameNet full-text documents into one Concrete .tgz archive.

    :param docs: iterable of nltk FrameNet doc objects (each exposing
        ``.sentence`` and ``.filename``).
    :param dst_path: path of the output .tar.gz archive.
    """
    writer = CommunicationWriterTGZ(dst_path)
    try:
        for doc in tqdm(docs):
            sentences = [process_sentence(sent) for sent in doc.sentence]
            comm = concrete_doc(sentences, doc.filename)
            writer.write(comm, comm.id + '.concrete')
    finally:
        # Finalize the archive even if a document fails mid-loop; the original
        # leaked the writer (and an unfinished tgz) on any exception.
        writer.close()
def process_exemplar(dst_path, fn):
    """Dump all annotated FrameNet exemplar sentences as a single Concrete doc.

    :param dst_path: path of the output .tar.gz archive.
    :param fn: nltk FrameNet corpus reader (1.5 or 1.7).
    """
    print('Loading exemplars...')
    bar = tqdm()
    raw_annotations = list()
    try:
        for ann_sent in fn.annotations(full_text=False):
            if 'Target' not in ann_sent:
                continue  # keep only sentences that carry a frame target
            bar.update()
            raw_annotations.append(ann_sent)
    except RuntimeError:
        # NOTE(review): the exemplar iterator can raise mid-stream; the original
        # deliberately treats whatever was collected so far as the full set.
        pass
    finally:
        bar.close()
    # (Removed dead `char_idx_offset` accumulator — it was never read.)
    sentences = [process_sentence(sent) for sent in raw_annotations]
    comm = concrete_doc(sentences, 'exemplar')
    writer = CommunicationWriterTGZ(dst_path)
    try:
        writer.write(comm, 'exemplar.concrete')
    finally:
        # The original never closed the writer, leaving the tgz unfinalized;
        # close it, matching process_doc's handling.
        writer.close()
def run():
    """CLI entry point: export FrameNet full-text splits and exemplars to Concrete archives."""
    parser = argparse.ArgumentParser()
    parser.add_argument('dst', metavar='DESTINATION', type=str,
                        help='Destination folder path.')
    parser.add_argument('-v', metavar='VERSION', default='1.7', type=str,
                        choices=['1.5', '1.7'],
                        help='Version of FrameNet. Either 1.5 or 1.7.')
    args = parser.parse_args()

    # choices above restricts -v to exactly these two versions.
    fn = framenet15 if args.v == '1.5' else framenet
    os.makedirs(args.dst, exist_ok=True)

    # Bucket full-text docs by split; anything not in dev/test goes to train.
    doc_group = defaultdict(list)
    for doc in fn.docs():
        if doc.filename in framenet_split['dev']:
            split = 'dev'
        elif doc.filename in framenet_split['test']:
            split = 'test'
        else:
            split = 'train'
        doc_group[split].append(doc)

    for sp in framenet_split:
        print(f'Loaded {len(doc_group[sp])} docs for {sp}.')
    for sp in framenet_split:
        process_doc(doc_group[sp], dst_path=os.path.join(args.dst, f'{sp}.tar.gz'))
    process_exemplar(os.path.join(args.dst, 'exemplar.tar.gz'), fn)
# Script entry point: only run the export when invoked directly, not on import.
if __name__ == '__main__':
    run()