# import spacy
# from spacy.training import Example
#
#
# nlp = spacy.blank("de")  # empty German pipeline
# config = {"moves": None}
# parser = nlp.add_pipe("parser", config=config)
#
# # Define custom labels
# for label in ["ROOT", "nsubj", "obl", "det", "case", "punct"]:
#     parser.add_label(label)
#
# # Create training data. Assuming the default German tokenizer keeps
# # "24.12.2025" and "16:00" as single tokens, the sentence has 8 tokens,
# # so "heads" and "deps" must each hold 8 aligned entries.
# TRAIN_DATA = [("Die Veranstaltung beginnt am 24.12.2025 um 16:00.",
#                {"heads": [1, 2, 2, 4, 2, 6, 2, 2],
#                 "deps": ["det", "nsubj", "ROOT", "case", "obl", "case", "obl", "punct"]})]
#
# # Run the training loop (in spaCy v3, nlp.initialize() replaces the
# # deprecated nlp.begin_training())
# optimizer = nlp.initialize()
# for epoch in range(50):  # more epochs for better results
#     for text, annotations in TRAIN_DATA:
#         doc = nlp.make_doc(text)
#         example = Example.from_dict(doc, annotations)
#         nlp.update([example], drop=0.1, losses={}, sgd=optimizer)
#
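#
# # Sanity check (a sketch, assuming training converged): parse the
# # training sentence and print each token's predicted head and relation.
# doc = nlp("Die Veranstaltung beginnt am 24.12.2025 um 16:00.")
# for token in doc:
#     print(token.text, token.dep_, token.head.text)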






import spacy
from spacy.language import Language

nlp = spacy.load("de_core_news_lg")

@Language.component("custom_dependency_parser")
def custom_dependency_parser(doc):
    for token in doc:
        # If a token is a date or time entity, attach it to the nearest
        # verb above it in the dependency tree. Note: the German
        # de_core_news_* models only tag LOC, MISC, ORG and PER entities,
        # so TIME/DATE matches require a custom NER component.
        if token.ent_type_ in ("TIME", "DATE"):
            for ancestor in token.ancestors:
                if ancestor.pos_ == "VERB":
                    token.head = ancestor
                    token.dep_ = "time_modifier"
                    break
    return doc

# Add the custom component after the standard parser
nlp.add_pipe("custom_dependency_parser", after="parser")
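
# Quick check: the custom component should now sit right after the
# built-in parser in the pipeline.
print(nlp.pipe_names)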

doc = nlp("Die Veranstaltung beginnt am 24.12.2025 um 16:00, der Einlass startet ab 15:00. Beginn: 20:00, Ende: 21:00, Einlass: 19:00")
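
# Check which entities the model actually found (a sketch; with
# de_core_news_lg, dates and times are usually not labelled as entities,
# so the custom component above may never fire without a custom NER).
print([(ent.text, ent.label_) for ent in doc.ents])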



sub_sentences = []
current_sentence = []

for token in doc:
    # A subject ("sb" in the TIGER dependency scheme) marks the start of a new clause
    if token.dep_ == "sb":
        if current_sentence:  # tokens already collected: store the clause so far
            sub_sentences.append(" ".join(current_sentence))
        current_sentence = [token.text]  # start a new clause
    else:
        current_sentence.append(token.text)

# Append the last clause
if current_sentence:
    sub_sentences.append(" ".join(current_sentence))

# Print the separated sentences
for i, sentence in enumerate(sub_sentences):
    print(f"Sentence {i+1}: {sentence}")

# Allowed dependency types (TIGER labels: subject, accusative object,
# dative object, noun kernel)
allowed_deps = {"sb", "oa", "da", "nk"}
allowed_pos = {"NOUN", "PROPN"}  # nouns & proper nouns

# Filtered tokens
filtered_tokens = [token for token in doc if token.dep_ in allowed_deps or token.pos_ in allowed_pos]
print(filtered_tokens)

# Print the dependency relations
relations = []  # collect the (token, relation, head) triples

for token in doc:
    relations.append((token.text, token.dep_, token.head.text))
    print(f"{token.text} --({token.dep_})--> {token.head.text}")