import spacy
from spacy import displacy
from spacy.util import compile_suffix_regex, compile_infix_regex
from src.nlp.data.test_texts import TEXTS
from src.utils.helpers import normalize_data
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer

# Example texts with dates


nlp = spacy.blank("de")
nlp.add_pipe('sentencizer')

# 1) Define the period as a suffix and as an infix (so it splits between digits)
suffixes = list(nlp.Defaults.suffixes) + [r"\."]  # add the period as a suffix
infixes = list(nlp.Defaults.infixes) + [r"(?<=\d)\.(?=\d)", r"(?<=\d)\:(?=\d)"]  # split "." and ":" between digits


# Compile the regex objects
suffix_re = compile_suffix_regex(suffixes)
infix_re = compile_infix_regex(infixes)

# Install the custom rules on the existing tokenizer; mutating it in place
# keeps the default prefix handling and tokenizer exceptions intact
nlp.tokenizer.suffix_search = suffix_re.search
nlp.tokenizer.infix_finditer = infix_re.finditer
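
# Minimal sanity check of the custom tokenization. The sample sentence is an
# illustrative assumption, not taken from TEXTS; given the rules above,
# "01.05.2024" should tokenize as ['01', '.', '05', '.', '2024'] and
# "18:30" as ['18', ':', '30'].
sample_doc = nlp("Der Kurs beginnt am 01.05.2024 um 18:30 Uhr.")
print("Tokenizer check:", [token.text for token in sample_doc])
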
# 2) Add an entity ruler for date and time expressions
ruler = nlp.add_pipe("entity_ruler")

# Token patterns: SHAPE "dd" matches a two-digit token and "dddd" a four-digit
# one, so these cover dd.dd.dddd dates and dd:dd times (single-digit days or
# months such as "1.5.2024" would need additional patterns)
patterns = [
    {
        "label": "DATE",
        "pattern": [
            {"SHAPE": "dd"}, {"ORTH": "."}, {"SHAPE": "dd"}, {"ORTH": "."}, {"SHAPE": "dddd"}
        ]
    },
    {
        "label": "TIME",
        "pattern": [
            {"SHAPE": "dd"}, {"ORTH": ":"}, {"SHAPE": "dd"}
        ]
    }
]

ruler.add_patterns(patterns)
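
# Quick check that the ruler fires. The sentence is a hypothetical example;
# with the patterns above it should yield [('12.03.2025', 'DATE'), ('19:00', 'TIME')].
ruler_doc = nlp("Die Veranstaltung findet am 12.03.2025 um 19:00 statt.")
print("Ruler check:", [(ent.text, ent.label_) for ent in ruler_doc.ents])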


for text in TEXTS:
    text = normalize_data(text)
    analyzer = MarkdownAnalyzer(text)
    md_elements = analyzer.identify_all().get("block_elements")
    for md_element in md_elements:
        doc = nlp(md_element.text)

        # Inspect the tokenization
        # print("Tokens:", [token.text for token in doc])

        if doc.ents:
            print(md_element.text)
            # Extract the recognized entities
            for ent in doc.ents:
                print(ent.text, ent.label_)



# doc = nlp("\n\n".join(TEXTS))
# displacy.serve(doc, style="ent")  # serves a web page with the entity visualization