|
import spacy |
|
from spacy import displacy |
|
from spacy.tokenizer import Tokenizer |
|
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex |
|
from nltk import Tree |
|
from src.nlp.data.test_texts import TEXTS |
|
from src.nlp.playground.textclassification import ZeroShotClassifier, CustomMode |
|
from src.utils.helpers import normalize_data |
|
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer |
|
|
|
|
|
|
|
|
|
# Blank German pipeline: no trained components, only rule-based sentence
# splitting plus the custom tokenizer configured below.
nlp = spacy.blank("de")
nlp.add_pipe("sentencizer")

# Extend the language defaults so numeric dates and times break into
# separate tokens:
#   * suffix r"\."           -> a trailing period is split off
#   * infix digit "." digit  -> "24.12.2023" becomes 24 . 12 . 2023
#   * infix digit ":" digit  -> "18:30" becomes 18 : 30
suffixes = list(nlp.Defaults.suffixes) + [r"\."]
infixes = list(nlp.Defaults.infixes) + [r"(?<=\d)\.(?=\d)", r"(?<=\d)\:(?=\d)"]

prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = compile_suffix_regex(suffixes)
infix_re = compile_infix_regex(infixes)

# prefix_search must be passed explicitly: a Tokenizer built without it never
# strips leading punctuation, so "(24.12.2023" would start with the token
# "(24" and the digit-shape entity patterns could not match it.
# NOTE(review): tokenizer exceptions (rules), token_match and url_match are
# still not carried over from the language defaults — confirm this is intended.
nlp.tokenizer = Tokenizer(
    nlp.vocab,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
)
|
|
|
# Rule-based entity recognition for numeric dates and clock times.
ruler = nlp.add_pipe("entity_ruler")

# Token sequence for a date written as dd.dd.dddd (two-digit day and month,
# four-digit year); each "." has to be a token of its own.
date_tokens = [
    {"SHAPE": "dd"}, {"ORTH": "."},
    {"SHAPE": "dd"}, {"ORTH": "."},
    {"SHAPE": "dddd"},
]

# Token sequence for a time written as dd:dd; the ":" has to be its own token.
time_tokens = [{"SHAPE": "dd"}, {"ORTH": ":"}, {"SHAPE": "dd"}]

patterns = [
    {"label": "DATE", "pattern": date_tokens},
    {"label": "TIME", "pattern": time_tokens},
]

ruler.add_patterns(patterns)
|
|
|
|
|
# Run every sample text through the pipeline, element by element, and print
# each element that contains entities followed by the entities found in it.
for text in TEXTS:
    text = normalize_data(text)
    analyzer = MarkdownAnalyzer(text)
    # .get() yields None when the key is absent; fall back to an empty list
    # so such a text is skipped instead of raising TypeError in the loop.
    md_elements = analyzer.identify_all().get("block_elements") or []
    for md_element in md_elements:
        doc = nlp(md_element.text)
        if not doc.ents:
            continue
        print(md_element.text)
        for ent in doc.ents:
            print(ent.text, ent.label_)
|
|
|
|
|
|
|
|
|
|
|
|