"""Lexicon-based FrameNet frame identification.

Maps a token sequence to candidate FrameNet frames by looking up unigrams and
bigrams against the corpus's lexical units, after lemmatization and a few
manual spelling/contraction overrides.
"""
from collections import defaultdict
from itertools import product
from typing import List

import nltk
from nltk.corpus import framenet, framenet15
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


# Manual overrides for surface forms that the WordNet lemmatizer does not map
# to the lexical-unit spellings used in FrameNet (contractions, irregular
# forms, spelling variants, and multi-word units).
manual = {
    '\'s': 'be',
    '\'re': 'be',
    '\'ve': 'have',
    've': 'have',
    'men': 'man',
    'saw': 'see',
    'could': 'can',
    'neighbour': 'neighbor',
    'felt': 'feel',
    'fell': 'fall',
    'little': 'a little',
    'have': 'have to',
    'raping': 'rape',
    'flavor': 'flavour',
    'ca': 'can',
    'bit': 'a bit',
}


def load_framenet_corpus(version):
    """Download (if needed) and return the NLTK FrameNet corpus reader for the given version."""
    if '1.5' in version:
        nltk.download('framenet_v15')
        return framenet15
    elif '1.7' in version:
        nltk.download('framenet_v17')
        return framenet
    else:
        raise NotImplementedError(f'Unsupported FrameNet version: {version}')


def is_word(s: str):
    # A token counts as a word if it contains only letters, spaces, hyphens, or apostrophes.
    return all(c.isalpha() or c in " -'" for c in s)


def lu_to_frame(version: str):
    """Build a map from POS-stripped lexical-unit names to the set of frames that list them."""
    fn = load_framenet_corpus(version)
    # Clear the NLTK reader's record of problematic lexical-unit statuses so
    # iterating over all frames does not trip over them.
    fn._bad_statuses = []
    map_no_pos = defaultdict(set)
    for frame in fn.frames():
        for lu in frame.lexUnit:
            # Lexical units are named like "give up.v"; strip the POS suffix.
            assert lu.count('.') == 1
            lexicon, _pos = lu.split('.')
            lexicon = lexicon.lower()
            # Keep only the word-like parts of the name.
            lexicon = ' '.join(filter(is_word, lexicon.split()))
            if lexicon == '':
                continue
            map_no_pos[lexicon].add(frame.name)
        fn._bad_statuses = []
    return map_no_pos


class FrameIdentifier:
    def __init__(self):
        # Build the lexical-unit -> frames map from FrameNet 1.7, then patch in
        # a few existential multi-word expressions.
        lf_map = lu_to_frame('1.7')
        lf_map['there have'].add('Existence')
        lf_map['there must'].add('Existence')
        lf_map['be there'].add('Existence')
        self.lf_map = dict(lf_map)

    def __call__(self, tokens: List[str]):
        # Special-case bare numerals and ordinals ("42", "4th", "21st", ...).
        if len(tokens) == 1 and tokens[0].isnumeric():
            return ['Cardinal_numbers']
        if len(tokens) == 1 and tokens[0][-2:] in ('st', 'nd', 'rd', 'th') and tokens[0][:-2].isnumeric():
            return ['Ordinal_numbers']
        tokens = [t.lower() for t in tokens]
        frames = []

        # Bail out if any token is not purely word-like.
        if not all(is_word(t) for t in tokens):
            return []

        # Expand each token into candidate forms: the token itself, its WordNet
        # lemmas under every POS tag, and any manual overrides. Overrides can
        # chain, because appended forms are themselves checked against `manual`
        # (e.g. "'ve" -> "have" -> "have to").
        for i, token in enumerate(tokens):
            t2s = [token]
            for _pos in 'asrnv':
                t2s.append(lemmatizer.lemmatize(token, _pos))
            for t_ in t2s:
                if t_ in manual:
                    t2s.append(manual[t_])
            tokens[i] = list(set(t2s))

        # Unigram lookups: any candidate form that names a lexical unit
        # contributes that unit's frames.
        for t2s in tokens:
            for t in t2s:
                if t in self.lf_map:
                    frames.extend(self.lf_map[t])
        # Bigram lookups: pair candidate forms of adjacent tokens to catch
        # multi-word lexical units such as "give up".
        for t1, t2 in zip(tokens, tokens[1:]):
            for ts in product(t1, t2):
                t = ' '.join(ts)
                if t in self.lf_map:
                    frames.extend(self.lf_map[t])

        return list(set(frames))
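

# Illustrative usage: a minimal sketch, not part of the identifier itself.
# The first run downloads the FrameNet 1.7 data via nltk, and the exact frame
# lists returned depend on the lexical units present in that corpus.
if __name__ == '__main__':
    fi = FrameIdentifier()

    # Unigram lookup: "walked" lemmatizes to "walk", so the result should
    # include frames evoked by "walk" lexical units (e.g. Self_motion).
    print(fi(['She', 'walked', 'home']))

    # The bigram pass can match multi-word lexical units such as "give up",
    # provided FrameNet lists them.
    print(fi(['they', 'give', 'up']))

    # Bare numerals and ordinals are special-cased.
    print(fi(['42']))   # ['Cardinal_numbers']
    print(fi(['4th']))  # ['Ordinal_numbers']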