# (removed extraction artifact: web-view file-size/blob metadata that is not valid Python)
import spacy
from spacy import displacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
from nltk import Tree
from src.nlp.data.test_texts import TEXTS
from src.nlp.playground.textclassification import ZeroShotClassifier, CustomMode
from src.utils.helpers import normalize_data
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer
# Build a blank German pipeline for date/time extraction on sample texts.
nlp = spacy.blank("de")
nlp.add_pipe('sentencizer')

# 1. Treat the period as an extra suffix, and split periods/colons that sit
#    between digits, so "01.01.2024" and "18:30" tokenize into parts.
rule_suffixes = list(nlp.Defaults.suffixes) + [r"\."]
rule_infixes = list(nlp.Defaults.infixes) + [r"(?<=\d)\.(?=\d)"] + [r"(?<=\d)\:(?=\d)"]

# Compile the extended rule sets into regex objects.
compiled_suffix = compile_suffix_regex(rule_suffixes)
compiled_infix = compile_infix_regex(rule_infixes)

# Install a tokenizer that applies the extended suffix/infix rules.
nlp.tokenizer = Tokenizer(
    nlp.vocab,
    suffix_search=compiled_suffix.search,
    infix_finditer=compiled_infix.finditer,
)
# 2. Register an EntityRuler with token patterns for dates (dd.dd.dddd)
#    and times (dd:dd), matching the token splits produced above.
ruler = nlp.add_pipe("entity_ruler")

date_pattern = {
    "label": "DATE",
    "pattern": [
        {"SHAPE": "dd"},
        {"ORTH": "."},
        {"SHAPE": "dd"},
        {"ORTH": "."},
        {"SHAPE": "dddd"},
    ],
}
time_pattern = {
    "label": "TIME",
    "pattern": [{"SHAPE": "dd"}, {"ORTH": ":"}, {"SHAPE": "dd"}],
}
ruler.add_patterns([date_pattern, time_pattern])
# Run every sample text through the pipeline and report recognized entities.
for raw_text in TEXTS:
    normalized = normalize_data(raw_text)
    analyzer = MarkdownAnalyzer(normalized)
    # NOTE(review): assumes identify_all() always yields an iterable under
    # "block_elements" — a missing key would make the inner loop fail.
    md_elements = analyzer.identify_all().get("block_elements")
    for md_element in md_elements:
        doc = nlp(md_element.text)
        # Inspect tokenization if needed:
        # print("Tokens:", [token.text for token in doc])
        if doc.ents:
            print(md_element.text)
            # Print each recognized entity together with its label.
            for ent in doc.ents:
                print(ent.text, ent.label_)

# doc = nlp("\n\n".join(TEXTS))
# displacy.serve(doc, style="ent")  # Opens a web page with the visualization