"""Train and smoke-test a few-shot classifier for time expressions.

The script
1. builds a blank German spaCy pipeline whose tokenizer splits ``dd.dd.dddd``
   dates and ``dd:dd`` times into separate tokens,
2. tags those token sequences as ``DATE`` / ``TIME`` with an entity ruler,
3. replaces the tagged spans in the training sentences with ``[DATE]`` /
   ``[TIME]`` placeholders,
4. fits a ``ClassyClassifier`` on the masked sentences and pickles it, and
5. prints the predicted category for a handful of sample texts.
"""
import pickle

from classy_classification import ClassyClassifier
import spacy
from spacy import displacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
from nltk import Tree

from src.nlp.data.test_texts import TEXTS
from src.nlp.playground.textclassification import ZeroShotClassifier, CustomMode
from src.utils.helpers import normalize_data
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer

# Few-shot training examples, keyed by the category the classifier should predict.
train_data = {
    "EVENT_TIME": [
        "Die Veranstaltung beginnt am 24.12.2025 16:00",
        "Wann: 03.03.2020 16:00",
        "25.04.2025 - 23.05.2020 , 15:00 - 16:00",
        "22.06.2022 18:00",
        "23.12.2030 17:00 - 18:00",
        "12.05.2024 , 19:00 - 21:00",
        "So. 12.08.2024 12:15 - 13:15",
        "**24.12.2025 16:00 **",
        "15.01.2025 18:00",
        "18.01.2025 11:00 - 18:00",
        "**Der Nikolaus kommt am Freitag 17:00 !**",
        "Sa 12.00 + 16:00",
        "So 12:00",
        "Kindertheater Sa + So 15:00",
        "| Beginn | Freitag 16:00 - 20:00",
        "/ Samstag 11:00 - 20:00",
        "/ Sonntag 11:00 - 18:00",
        "19:00",
        "Beginn: 15:00"
    ],
    "ADMITTANCE_TIME": [
        "Einlass ist 15:00",
        "Einlass ist 15:30",
        "Einlass: 15:00",
        "Abendkasse 14:30",
        "Einlass: 14:30",
        "Tageskasse 15:00 im Museum",
        "Einlass beginnt um 16:00",
        "Der Einlass erfolgt um 17:30",
        "Zutritt ab 18:00",
        "Gäste dürfen ab 14:45 eintreten",
        "Türöffnung: 19:00",
        "Einlass erfolgt ab 20:15",
        "Einlass für VIPs: 13:30",
        "Normaler Einlass: 16:45",
        "Die Türen öffnen sich um 12:00",
        "Zugang ab 17:00 möglich",
        "Einlasskontrolle beginnt um 18:30",
        "Einlass erst ab 19:15 gestattet"
    ]
}

# Entity label -> placeholder written into the text in its place.
_PLACEHOLDERS = {"DATE": "[DATE]", "TIME": "[TIME]"}


def _mask_entities(text, doc):
    """Return *text* with the DATE/TIME entities of *doc* replaced by placeholders.

    Every occurrence of an entity's surface string is replaced (plain
    ``str.replace``), so a value that appears twice is masked in one pass.
    Entities with any other label are left untouched.
    """
    for ent in doc.ents:
        placeholder = _PLACEHOLDERS.get(ent.label_)
        if placeholder is not None:
            text = text.replace(ent.text, placeholder)
    return text


nlp = spacy.blank("de")
nlp.add_pipe('sentencizer')

# 1) Treat "." as a suffix, and "." / ":" between two digits as infixes, so
#    that "24.12.2025" and "16:00" are split into number and separator tokens.
suffixes = list(nlp.Defaults.suffixes) + [r"\."]
infixes = list(nlp.Defaults.infixes) + [r"(?<=\d)\.(?=\d)"] + [r"(?<=\d)\:(?=\d)"]

# Compile the regex objects the tokenizer expects.
suffix_re = compile_suffix_regex(suffixes)
infix_re = compile_infix_regex(infixes)

# Install the customised tokenizer.
# NOTE(review): no prefix_search is configured, so leading punctuation stays
# attached to the first token (e.g. "**24") — confirm that normalize_data
# removes such markers before the text reaches the pipeline.
nlp.tokenizer = Tokenizer(nlp.vocab, suffix_search=suffix_re.search, infix_finditer=infix_re.finditer)

# 2) Entity ruler that tags the split token sequences as DATE / TIME.
ruler = nlp.add_pipe("entity_ruler")
patterns = [
    {
        "label": "DATE",
        "pattern": [
            {"SHAPE": "dd"}, {"ORTH": "."}, {"SHAPE": "dd"}, {"ORTH": "."}, {"SHAPE": "dddd"}
        ]
    },
    {
        "label": "TIME",
        "pattern": [
            {"SHAPE": "dd"}, {"ORTH": ":"}, {"SHAPE": "dd"}
        ]
    }
]
ruler.add_patterns(patterns)

# Prepare training data: use placeholders for times and dates so the
# classifier learns from the surrounding wording, not the concrete values.
classifier_train_data_cleaned = {}
for label, examples in train_data.items():
    masked_examples = []
    for example in examples:
        normalized = normalize_data(example)
        masked_examples.append(_mask_entities(normalized, nlp(normalized)))
    # Remove duplicates while keeping first-seen order, so the training set
    # is identical from run to run (a plain set() would shuffle it).
    classifier_train_data_cleaned[label] = list(dict.fromkeys(masked_examples))

classifier = ClassyClassifier(data=classifier_train_data_cleaned)
classifier.set_embedding_model(model="stsb-xlm-r-multilingual")

# NOTE: the path is resolved against the current working directory.
with open("../../playground/models/time_classifier.pkl", "wb") as f:
    pickle.dump(classifier, f)

# Ad-hoc samples for the smoke test; this deliberately replaces the TEXTS
# list imported at the top of the file.
TEXTS = ["Einlass: ab 15:00", "Beginn: 18:00", "Konzert: 20:00 bis 21:00", "Start: 20:00, Ende: 21:00"]

for text in TEXTS:
    print(text)
    text = normalize_data(text)
    analyzer = MarkdownAnalyzer(text)
    # Fall back to an empty list when the analyzer reports no block elements.
    md_elements = analyzer.identify_all().get("block_elements") or []

    for md_element in md_elements:
        doc = nlp(md_element.text)
        # Uncomment to inspect the tokenization:
        # print("Tokens:", [token.text for token in doc])
        if not doc.ents:
            # Nothing to classify without at least one DATE/TIME entity.
            continue

        print("Found Entities: ", doc.ents)
        # Replace TIME and DATE entities with placeholders, as in training.
        modified_text = _mask_entities(md_element.text, doc)
        cats = classifier(modified_text)
        # The classifier returns a score per category; report the best one.
        time_category = max(cats, key=cats.get)
        print(time_category)
    print("*"*100)