# import spacy
# from spacy.training import Example
#
#
# nlp = spacy.blank("de")  # blank German pipeline
# config = {"moves": None}
# parser = nlp.add_pipe("parser", config=config)
#
# # Register the custom dependency labels
# for label in ["ROOT", "nsubj", "obl", "det", "case", "punct"]:
#     parser.add_label(label)
#
# # Training data: exactly one head and one dep per token
# # tokens: Die | Veranstaltung | beginnt | am | 24.12.2025 | um | 16:00 | .
# TRAIN_DATA = [("Die Veranstaltung beginnt am 24.12.2025 um 16:00.",
#                {"heads": [1, 2, 2, 4, 2, 6, 2, 2],
#                 "deps": ["det", "nsubj", "ROOT", "case", "obl", "case", "obl", "punct"]})]
#
# # Run the training loop
# optimizer = nlp.initialize()  # sets up the model weights and returns an optimizer
# for epoch in range(50):  # more epochs for better results
#     for text, annotations in TRAIN_DATA:
#         doc = nlp.make_doc(text)
#         example = Example.from_dict(doc, annotations)
#         nlp.update([example], sgd=optimizer, drop=0.1, losses={})
#
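# Rule-based alternative: instead of training a parser from scratch, the code
# below post-processes the output of the pretrained German pipeline.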
import spacy
from spacy.language import Language

nlp = spacy.load("de_core_news_lg")
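# Requires the pretrained model: python -m spacy download de_core_news_lg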

@Language.component("custom_dependency_parser")
def custom_dependency_parser(doc):
    for token in doc:
        # If a token is a time expression, attach it to its nearest governing verb.
        # Note: the de_core_news_* models only predict LOC/MISC/ORG/PER entities,
        # so TIME/DATE only match if an earlier component assigns those labels.
        if token.ent_type_ in ["TIME", "DATE"]:
            for ancestor in token.ancestors:
                if ancestor.pos_ == "VERB":
                    token.head = ancestor
                    token.dep_ = "time_modifier"
                    break
    return doc

# Add the custom component after the standard parser
nlp.add_pipe("custom_dependency_parser", after="parser")
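# Sanity check: the custom component should sit right after "parser"
print(nlp.pipe_names)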

doc = nlp("Die Veranstaltung beginnt am 24.12.2025 um 16:00, der Einlass startet ab 15:00. Beginn: 20:00, Ende: 21:00, Einlass: 19:00")
sub_sentences = []
current_sentence = []
for token in doc:
    # A subject ("sb" in the TIGER scheme used by the German models) starts a new clause
    if token.dep_ == "sb":
        if current_sentence:  # flush the tokens collected so far
            sub_sentences.append(" ".join(current_sentence))
        current_sentence = [token.text]  # start a new clause
    else:
        current_sentence.append(token.text)
# Append the final clause
if current_sentence:
    sub_sentences.append(" ".join(current_sentence))
# Print the separated clauses
for i, sentence in enumerate(sub_sentences):
    print(f"Sentence {i+1}: {sentence}")
# Allowed dependency types (TIGER labels: subject, accusative/dative object, noun kernel)
allowed_deps = {"sb", "oa", "da", "nk"}
allowed_pos = {"NOUN", "PROPN"}  # nouns & proper nouns
# Filter to the content-bearing tokens
filtered_tokens = [token for token in doc if token.dep_ in allowed_deps or token.pos_ in allowed_pos]
print(filtered_tokens)
# Collect and print the dependency relations
relations = []  # (dependent, relation, head) triples
for token in doc:
    relations.append((token.text, token.dep_, token.head.text))
    print(f"{token.text} --({token.dep_})--> {token.head.text}")