import json
import os
import copy
from argparse import ArgumentParser
from tqdm import tqdm


def extract_sentences(raw_doc):
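    """Split the document into sentences and index its tokens by character offset.

    Returns (sentence_tokens, begin2sentence, end2sentence): sentence_tokens is a
    list of [(sent_begin, sent_end), token_spans, event_spans] entries, and the
    two dicts map a token's begin/end character offset to its
    (sentence index, token index) position.
    """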
    sentence_tokens = list()  # [(start, end), list_tokens, event_list]
    for sent_boundary in raw_doc['_views']['_InitialView']['Sentence']:
        start, end = sent_boundary.get('begin', 0), sent_boundary.get('end')
        sentence_tokens.append([(start, end), list(), list()])
    begin2sentence, end2sentence = dict(), dict()
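    # Assign every token to the sentence whose character span contains it,
    # recording the (sentence index, token index) of each begin/end offset.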
    for token in raw_doc['_views']['_InitialView']['Token']:
        start, end = token.get('begin', 0), token.get('end')
        added = False
        for sent_idx, (bound, tl, _) in enumerate(sentence_tokens):
            if start in range(*bound) and (end - 1) in range(*bound):
                assert not added
                begin2sentence[start] = (sent_idx, len(tl))
                end2sentence[end] = (sent_idx, len(tl))
                tl.append((start, end))
                added = True
        assert added
    return sentence_tokens, begin2sentence, end2sentence


def read_aida2kairos(mapping_path):
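    """Read the tab-separated mapping from KAIROS event types to AIDA event types.

    Returns a dict keyed by AIDA type with the corresponding KAIROS type as value.
    """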
    mapping = dict()
    with open(mapping_path) as f:
        for line in f:
            kairos, aida_list = line.rstrip('\n').replace(',', '').split('\t')
            for aida in aida_list.split():
                # 'x' and '?' are placeholders for unmapped AIDA types.
                if aida in ('x', '?'):
                    continue
                if aida in mapping:
                    print('warning:', aida, 'already in the mapping; keeping the latest entry.')
                mapping[aida] = kairos
    return mapping


def read_aida(corpus_path, mapping_path):
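    """Convert AIDA event annotations under corpus_path into flat example records.

    Each per-event directory name encodes an AIDA event type, which is mapped
    to its KAIROS type via mapping_path. For every event mention that falls
    inside a single sentence, one record is emitted with the sentence tokens,
    the mention's token span, and the mapped label.
    """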
    print('reading aida data')
    n_negative, n_span_mismatch, n_diff = 0, 0, 0
    outputs = list()
    mapping = read_aida2kairos(mapping_path)
    for event_fn in tqdm(os.listdir(corpus_path)):
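        # The AIDA event type is the prefix of the directory name (before the first '-').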
        event_name = event_fn.split('-')[0]
        if event_name not in mapping:
            print('warning:', event_name, 'not in the mapping.')
            continue
        event_name = mapping[event_name]

        for doc_name in os.listdir(os.path.join(corpus_path, event_fn)):
            if not doc_name.endswith('.json'):
                continue
            raw_doc = json.load(open(os.path.join(corpus_path, event_fn, doc_name)))
            sentences, begin2sentence, end2sentence = extract_sentences(raw_doc)
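            # Referenced feature structures other than '1' mark candidate event mention spans.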
            for fss_no, fss in raw_doc['_referenced_fss'].items():
                if fss_no == '1':
                    continue
                begin, end, is_negative = fss['begin'], fss['end'], fss['negative_example']
                if is_negative:
                    n_negative += 1
                    continue
                if begin not in begin2sentence or end not in end2sentence:
                    n_span_mismatch += 1
                    continue
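                # Map character offsets to (sentence index, token index) positions.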
                (b_idx_sent, b_idx_token), (e_idx_sent, e_idx_token) = begin2sentence[begin], end2sentence[end]
                if b_idx_sent != e_idx_sent:
                    n_diff += 1
                    continue
                sentences[b_idx_sent][2].append([b_idx_token, e_idx_token])

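            # Feature structure '1' holds the raw document text (sofaString).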
            text = raw_doc['_referenced_fss']['1']['sofaString']

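            # Emit one record per event mention, carrying the sentence tokens
            # and the mention's token-level span.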
            for _, tokens, events in sentences:
                tokens = [text[start:end] for start, end in tokens]
                for (start, end) in events:
                    outputs.append({
                        'tokens': copy.deepcopy(tokens),
                        'annotation': {
                            'start_idx': start,
                            'end_idx': end,
                            'label': event_name,
                        }
                    })

    print(f'Loaded {len(outputs)} annotations.')
    print(f'{n_negative} negative examples were ignored.')
    print(f'{n_span_mismatch} annotations with span mismatches were ignored.')
    print(f'{n_diff} annotations spanning multiple sentences were ignored.')

    return outputs


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('aida', type=str, help='path to the AIDA corpus directory')
    parser.add_argument('aida2kairos', type=str, help='path to the tab-separated AIDA-to-KAIROS event type mapping')
    parser.add_argument('dst', type=str, help='path of the output JSON file')
    args = parser.parse_args()

    aida = read_aida(args.aida, args.aida2kairos)

    with open(args.dst, 'w') as f:
        json.dump(aida, f)