from collections import defaultdict
from itertools import product
from typing import List

import nltk
from nltk.corpus import framenet, framenet15
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# Manual overrides for forms the WordNet lemmatizer misses: contractions,
# irregular inflections, spelling variants, and multi-word FrameNet LUs.
manual = {
    '\'s': 'be',
    '\'re': 'be',
    '\'ve': 'have',
    've': 'have',
    'men': 'man',
    'saw': 'see',
    'could': 'can',
    'neighbour': 'neighbor',
    'felt': 'feel',
    'fell': 'fall',
    'little': 'a little',
    'have': 'have to',
    'raping': 'rape',
    'flavor': 'flavour',
    'ca': 'can',
    'bit': 'a bit',
}


def load_framenet_corpus(version):
    """Download (if necessary) and return the NLTK FrameNet corpus reader."""
    if '1.5' in version:
        nltk.download('framenet_v15')
        return framenet15
    elif '1.7' in version:
        nltk.download('framenet_v17')
        return framenet
    else:
        raise NotImplementedError(f'unsupported FrameNet version: {version}')


def is_word(s: str) -> bool:
    """True iff s consists only of letters, spaces, hyphens, and apostrophes."""
    return all(c.isalpha() or c in " -'" for c in s)


def lu_to_frame(version: str):
    """Map each lexical-unit lemma (POS tag stripped) to the set of frame
    names that the lemma can evoke."""
    fn = load_framenet_corpus(version)
    # Workaround: reset the reader's private '_bad_statuses' bookkeeping
    # before and after the full corpus scan.
    fn._bad_statuses = []
    map_no_pos = defaultdict(set)
    for frame in fn.frames():
        for lu in frame.lexUnit:
            # Lexical-unit names have the form 'lemma.pos', e.g. 'give up.v'.
            assert lu.count('.') == 1
            lexicon, pos = lu.split('.')
            lexicon = lexicon.lower()
            # Keep only the purely alphabetic words of a (multi-word) LU.
            lexicon = ' '.join(filter(is_word, lexicon.split()))
            if lexicon == '':
                continue
            map_no_pos[lexicon].add(frame.name)
    fn._bad_statuses = []
    return map_no_pos
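
# For example, with FrameNet 1.7, lu_to_frame('1.7')['see'] includes
# 'Perception_experience', since the lexical unit 'see.v' evokes that frame
# (the exact frame sets depend on the corpus release).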


class FrameIdentifier:
    def __init__(self):
        lf_map = lu_to_frame('1.7')
        # A few multi-word expressions that FrameNet lacks but that clearly
        # evoke the Existence frame.
        lf_map['there have'].add('Existence')
        lf_map['there must'].add('Existence')
        lf_map['be there'].add('Existence')
        self.lf_map = dict(lf_map)
    def __call__(self, tokens: List[str]) -> List[str]:
        """Return the names of all frames that any unigram or bigram of
        `tokens` may evoke (an empty list if none)."""
        # Special-case bare numbers: '42' -> cardinal, '4th' -> ordinal
        # (ordinals ending in 'st'/'nd'/'rd' are not matched here).
        if len(tokens) == 1 and tokens[0].isnumeric():
            return ['Cardinal_numbers']
        if len(tokens) == 1 and tokens[0].endswith('th') and tokens[0][:-2].isnumeric():
            return ['Ordinal_numbers']
        tokens = [t.lower() for t in tokens]
        frames = list()
        if not all(is_word(t) for t in tokens):
            return []
        # Expand every token into a set of candidate lemmas: the token itself,
        # its WordNet lemmas under each POS tag (adjective, satellite
        # adjective, adverb, noun, verb), and any manual overrides. Appending
        # inside the loop lets overrides chain off freshly added candidates.
        for i, token in enumerate(tokens):
            t2s = [token]
            for _pos in 'asrnv':
                t2s.append(lemmatizer.lemmatize(token, _pos))
            for t_ in t2s:
                if t_ in manual:
                    t2s.append(manual[t_])
            t2s = list(set(t2s))
            tokens[i] = t2s
        # Unigram lookup: any candidate lemma that is a known lexical unit.
        for t2s in tokens:
            for t in t2s:
                if t in self.lf_map:
                    frames.extend(self.lf_map[t])
        # Bigram lookup: every candidate pair from adjacent tokens.
        for t1, t2 in zip(tokens, tokens[1:]):
            for ts in product(t1, t2):
                t = ' '.join(ts)
                if t in self.lf_map:
                    frames.extend(self.lf_map[t])
        return list(set(frames))
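

# Minimal usage sketch (assumes the NLTK 'framenet_v17' and 'wordnet' data
# packages can be downloaded; frame names in the comments are indicative).
if __name__ == '__main__':
    nltk.download('wordnet')  # required by WordNetLemmatizer
    fi = FrameIdentifier()
    print(fi(['the', 'men', 'saw', 'her']))  # includes e.g. 'Perception_experience'
    print(fi(['42']))   # ['Cardinal_numbers']
    print(fi(['4th']))  # ['Ordinal_numbers']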