from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer


framenet_split = {
    "train": [
        "LUCorpus-v0.3__CNN_AARONBROWN_ENG_20051101_215800.partial-NEW.xml",
        "NTI__Iran_Chemical.xml",
        "NTI__Taiwan_Introduction.xml",
        "LUCorpus-v0.3__20000416_xin_eng-NEW.xml",
        "NTI__NorthKorea_ChemicalOverview.xml",
        "NTI__workAdvances.xml",
        "C-4__C-4Text.xml",
        "ANC__IntroOfDublin.xml",
        "LUCorpus-v0.3__20000420_xin_eng-NEW.xml",
        "NTI__BWTutorial_chapter1.xml",
        "ANC__110CYL068.xml",
        "LUCorpus-v0.3__artb_004_A1_E1_NEW.xml",
        "NTI__Iran_Missile.xml",
        "LUCorpus-v0.3__20000424_nyt-NEW.xml",
        "LUCorpus-v0.3__wsj_1640.mrg-NEW.xml",
        "ANC__110CYL070.xml",
        "NTI__Iran_Introduction.xml",
        "KBEval__lcch.xml",
        "ANC__HistoryOfLasVegas.xml",
        "LUCorpus-v0.3__wsj_2465.xml",
        "KBEval__LCC-M.xml",
        "LUCorpus-v0.3__artb_004_A1_E2_NEW.xml",
        "LUCorpus-v0.3__AFGP-2002-600002-Trans.xml",
        "LUCorpus-v0.3__602CZL285-1.xml",
        "PropBank__LomaPrieta.xml",
        "NTI__Iran_Biological.xml",
        "NTI__Kazakhstan.xml",
        "LUCorpus-v0.3__AFGP-2002-600045-Trans.xml",
        "NTI__Iran_Nuclear.xml",
        "ANC__EntrepreneurAsMadonna.xml",
        "SemAnno__Text1.xml",
        "ANC__HistoryOfJerusalem.xml",
        "NTI__ChinaOverview.xml",
        "PropBank__ElectionVictory.xml",
        "NTI__Russia_Introduction.xml",
        "NTI__SouthAfrica_Introduction.xml",
        "LUCorpus-v0.3__20000419_apw_eng-NEW.xml",
        "NTI__LibyaCountry1.xml",
        "ANC__IntroJamaica.xml",
        "QA__IranRelatedQuestions.xml",
        "ANC__HistoryOfGreece.xml",
        "NTI__NorthKorea_NuclearCapabilities.xml",
        "PropBank__BellRinging.xml",
        "PropBank__PolemicProgressiveEducation.xml",
        "NTI__WMDNews_042106.xml",
        "ANC__110CYL200.xml",
        "LUCorpus-v0.3__CNN_ENG_20030614_173123.4-NEW-1.xml",
    ],
    "dev": [
        "NTI__WMDNews_062606.xml",
        "LUCorpus-v0.3__ENRON-pearson-email-25jul02.xml",
        "KBEval__MIT.xml",
        "ANC__110CYL072.xml",
        "LUCorpus-v0.3__20000415_apw_eng-NEW.xml",
        "Miscellaneous__Hijack.xml",
        "PropBank__TicketSplitting.xml",
        "NTI__NorthKorea_NuclearOverview.xml",
    ],
    "test": [
        "NTI__NorthKorea_Introduction.xml",
        "LUCorpus-v0.3__enron-thread-159550.xml",
        "ANC__WhereToHongKong.xml",
        "KBEval__atm.xml",
        "ANC__112C-L013.xml",
        "LUCorpus-v0.3__IZ-060316-01-Trans-1.xml",
        "LUCorpus-v0.3__AFGP-2002-602187-Trans.xml",
        "ANC__StephanopoulosCrimes.xml",
        "ANC__110CYL069.xml",
        "ANC__110CYL067.xml",
        "ANC__IntroHongKong.xml",
        "LUCorpus-v0.3__20000410_nyt-NEW.xml",
        "KBEval__Brandeis.xml",
        "KBEval__Stanford.xml",
        "LUCorpus-v0.3__SNO-525.xml",
        "PropBank__AetnaLifeAndCasualty.xml",
        "Miscellaneous__Hound-Ch14.xml",
        "NTI__Syria_NuclearOverview.xml",
        "KBEval__cycorp.xml",
        "KBEval__utd-icsi.xml",
        "LUCorpus-v0.3__sw2025-ms98-a-trans.ascii-1-NEW.xml",
        "Miscellaneous__SadatAssassination.xml",
        "KBEval__parc.xml",
    ],
}
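

# A minimal usage sketch, not part of the original module: resolve each
# split's file names to full paths, assuming all FrameNet full-text XML
# files live in a single flat directory. Both `split_file_paths` and its
# `fulltext_dir` argument are hypothetical names introduced here.
def split_file_paths(fulltext_dir):
    import os  # local import keeps the sketch self-contained

    return {
        split: [os.path.join(fulltext_dir, name) for name in names]
        for split, names in framenet_split.items()
    }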


_spacy_tokenizer = SpacyTokenizer(language='en_core_web_sm', pos_tags=True)
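# SpacyTokenizer returns allennlp.data.tokenizers.Token objects rather than
# raw spaCy tokens. The fields Sentence relies on below are `idx` (start
# character offset), `idx_end` (end character offset, one past the last
# character), and, because pos_tags=True, the coarse- and fine-grained POS
# tags `pos_` and `tag_`.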


class Sentence:
    """
    Re-tokenize a sentence and map character indices to token indices.

    Character spans are left-inclusive and right-exclusive; the token
    spans returned by `span` are inclusive on both ends.
    """

    def __init__(self, text):
        self.tokens = _spacy_tokenizer.tokenize(text)

    @property
    def pos(self):
        return [t.pos_ for t in self.tokens]

    @property
    def tag(self):
        return [t.tag_ for t in self.tokens]

    @property
    def starts(self):
        return [t.idx for t in self.tokens]

    @property
    def ends(self):
        return [t.idx_end for t in self.tokens]

    def char2token(self, char_idx):
        """
        If char_idx falls inside a token, return the index of that token.
        If char_idx falls in the gap between two tokens, return the index
        of the preceding token. If char_idx precedes the first token,
        return 0; if it is at or past the start of the last token, return
        the index of the last token.
        """
        if char_idx < self.starts[0]:
            return 0
        if char_idx >= self.starts[-1]:
            return len(self.tokens) - 1
        for i_tok, start_idx in enumerate(self.starts):
            if start_idx == char_idx:
                return i_tok
            if start_idx > char_idx:
                return i_tok - 1
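
    # An equivalent logarithmic-time variant of char2token, added as an
    # illustrative sketch (`_char2token_bisect` is a hypothetical name, not
    # part of the original class): bisect_right finds the first token start
    # strictly greater than char_idx, and the token just before it is the
    # answer under the same gap-handling rules as the loop above.
    def _char2token_bisect(self, char_idx):
        import bisect  # standard library; local import keeps the sketch self-contained

        return max(0, bisect.bisect_right(self.starts, char_idx) - 1)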

    def span(self, start, end):
        """
        Convert a right-exclusive character span [start, end) to an
        inclusive token span (first token index, last token index).
        """
        assert end > start
        start, end = self.char2token(start), self.char2token(end - 1)
        assert end >= start
        return start, end

    def __repr__(self):
        return self.tokens.__repr__()
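

# A quick demonstration sketch, assuming the spaCy model en_core_web_sm is
# installed; the sentence and offsets below are illustrative only.
if __name__ == "__main__":
    sent = Sentence("The quick brown fox jumped.")
    # "quick" starts at character 4, so char2token maps 4 to token index 1.
    print(sent.char2token(4))  # -> 1
    # The character span [4, 15) covers "quick brown", i.e. the inclusive
    # token span (1, 2).
    print(sent.span(4, 15))  # -> (1, 2)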
|