File size: 3,685 Bytes
05922fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import argparse
import os
from collections import defaultdict
from typing import Dict, Any

from concrete.util import CommunicationWriterTGZ
from nltk.corpus import framenet, framenet15
from tqdm import tqdm

from sftp.data_reader.concrete_srl import concrete_doc
from tools.framenet.fn_util import framenet_split, Sentence as TokSentence


def process_sentence(sent) -> Dict[str, Any]:
    """Convert one FrameNet sentence object into a plain dictionary.

    The sentence text is tokenized with ``TokSentence`` and every annotation
    set that carries a ``Target`` is turned into an event entry.

    Args:
        sent: A sentence-like object exposing ``text`` and supporting ``in``
            checks for ``'annotationSet'`` and ``'Target'``. Annotation sets
            are read through their ``frame.name``, ``Target`` and ``FE``
            attributes.

    Returns:
        A dict with three keys:
            ``'sentence'``: the raw sentence text.
            ``'tokenization'``: one ``(token.idx, token.idx_end - 1)`` pair
            per token (presumably inclusive character offsets -- confirm
            against ``TokSentence``).
            ``'annotations'``: one dict per processed annotation set with
            ``'label'`` (frame name), ``'children'`` (list of
            ``{'span', 'label'}`` dicts, one per frame element) and
            ``'span'`` (smallest and largest token index of the target).
    """
    tok_sent = TokSentence(sent.text)
    ret = {
        'sentence': sent.text,
        'tokenization': [(token.idx, token.idx_end - 1) for token in tok_sent.tokens],
        'annotations': list(),
    }

    def process_one_ann_set(ann_set):
        """Append the event described by ``ann_set`` to ``ret['annotations']``."""
        arg_list = list()
        event = {'label': ann_set.frame.name, 'children': arg_list}
        ret['annotations'].append(event)

        # A target may consist of several pieces; gather every token index
        # covered by any piece and keep the outermost two as the event span.
        target_list = list()
        for tar_start, tar_end in ann_set.Target:
            # Map the span once per piece instead of once per endpoint.
            tok_start, tok_end = tok_sent.span(tar_start, tar_end)
            target_list.extend(range(tok_start, tok_end + 1))
        target_list.sort()
        event['span'] = (target_list[0], target_list[-1])

        # Only the first element of ``FE`` is read here; it is iterated as
        # (start, end, name) triples.
        for fe_start, fe_end, fe_name in ann_set.FE[0]:
            fe_start, fe_end = tok_sent.span(fe_start, fe_end)
            arg_list.append({
                'span': (fe_start, fe_end),
                'label': fe_name
            })

    # Full-text sentences carry their annotations in ``annotationSet``.
    if 'annotationSet' in sent:
        for ann_item in sent.annotationSet:
            if 'Target' not in ann_item:
                continue
            process_one_ann_set(ann_item)
    # A sentence that itself has a ``Target`` is treated as an annotation set.
    if 'Target' in sent:
        process_one_ann_set(sent)

    return ret


def process_doc(docs, dst_path: str):
    """Write every document in ``docs`` to a single archive at ``dst_path``.

    Each document's sentences are converted with ``process_sentence``, wrapped
    into a communication by ``concrete_doc`` and stored in the archive under
    ``<comm.id>.concrete``.

    Args:
        docs: Iterable of document objects exposing ``sentence`` (iterable of
            sentences) and ``filename``.
        dst_path: Path of the archive to create.
    """
    writer = CommunicationWriterTGZ(dst_path)
    try:
        for doc in tqdm(docs):
            sentences = [process_sentence(sent) for sent in doc.sentence]
            comm = concrete_doc(sentences, doc.filename)
            writer.write(comm, comm.id + '.concrete')
    finally:
        # Close the archive even when a document fails to convert, so the
        # file handle is never leaked.
        writer.close()


def process_exemplar(dst_path, fn):
    """Collect all exemplar sentences and write them as one communication.

    Args:
        dst_path: Path of the archive to create.
        fn: Corpus reader object; ``fn.annotations(full_text=False)`` is
            iterated to obtain the exemplar sentences.
    """
    bar = tqdm()
    raw_annotations = list()
    print('Loading exemplars...')
    try:
        for ann_sent in fn.annotations(full_text=False):
            if 'Target' not in ann_sent:
                continue
            bar.update()
            raw_annotations.append(ann_sent)
    except RuntimeError:
        # Deliberate best-effort: keep whatever was loaded before the error.
        # NOTE(review): presumably works around the corpus iterator raising
        # RuntimeError when it is exhausted -- confirm against the reader.
        pass
    finally:
        bar.close()

    sentences = [process_sentence(sent) for sent in raw_annotations]

    comm = concrete_doc(sentences, 'exemplar')
    writer = CommunicationWriterTGZ(dst_path)
    try:
        writer.write(comm, 'exemplar.concrete')
    finally:
        # The writer must be closed explicitly so the archive is finalized.
        writer.close()


def run():
    """Command-line entry point.

    Parses the destination folder and the FrameNet version, groups the
    corpus documents by the split their filename belongs to, writes one
    archive per split and finally one archive holding the exemplars.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        'dst', metavar='DESTINATION', type=str,
        help='Destination folder path.'
    )
    arg_parser.add_argument(
        '-v', metavar='VERSION', default='1.7', type=str, choices=['1.5', '1.7'],
        help='Version of FrameNet. Either 1.5 or 1.7.'
    )
    cli = arg_parser.parse_args()

    # Choose the corpus reader that matches the requested version.
    if cli.v == '1.7':
        corpus = framenet
    else:
        corpus = framenet15
    out_dir = cli.dst
    os.makedirs(out_dir, exist_ok=True)

    # Anything not listed under dev or test is assigned to train.
    docs_by_split = defaultdict(list)
    for document in corpus.docs():
        if document.filename in framenet_split['dev']:
            split_name = 'dev'
        elif document.filename in framenet_split['test']:
            split_name = 'test'
        else:
            split_name = 'train'
        docs_by_split[split_name].append(document)

    for split_name in framenet_split:
        print(f'Loaded {len(docs_by_split[split_name])} docs for {split_name}.')

    for split_name in framenet_split:
        archive_path = os.path.join(out_dir, f'{split_name}.tar.gz')
        process_doc(docs_by_split[split_name], dst_path=archive_path)

    exemplar_path = os.path.join(out_dir, 'exemplar.tar.gz')
    process_exemplar(exemplar_path, corpus)


# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    run()