"""Train a date-context classifier and try it out on sample event texts.

The script
1. defines few-shot training sentences in which dates are written as the
   ``[DATE]`` placeholder,
2. builds a blank German spaCy pipeline whose tokenizer splits ``dd.dd.dddd``
   and ``dd:dd`` into separate tokens so an entity ruler can tag them as
   ``DATE`` / ``TIME``,
3. trains a ``ClassyClassifier`` on the placeholder-normalized sentences and
   pickles it, and
4. runs the pipeline over sample texts, keeping dates classified as
   ``EVENT_DATE`` and times classified as ``BEGINN``.
"""
import pickle

import spacy
from classy_classification import ClassyClassifier
from nltk import Tree
from spacy import displacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex

from src.nlp.data.test_texts import TEXTS
from src.nlp.playground.textclassification import ZeroShotClassifier, CustomMode
from src.utils.helpers import normalize_data
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer

# Few-shot examples for deciding whether a date mention is the event date
# itself ("EVENT_DATE") or some other date such as a deadline ("OTHER").
# Repeated entries are kept as authored; they are de-duplicated before training.
date_classifier_train_data = {
    "EVENT_DATE": [
        "Termin: [DATE], 19:00",
        "[DATE]",
        "Unser Meetup ist am [DATE] um 18:30 Uhr.",
        "Konzert: [DATE]",
        "Das Festival startet am [DATE]",
        "Die Show findet am [DATE] um 20:00 Uhr statt.",
        "[DATE] – Save the Date!",
        "Das Webinar beginnt am [DATE] um 16:00 Uhr.",
        "[DATE] – Große Premiere im Theater!",
        "Event am [DATE], komm vorbei!",
        "[DATE] – Silvesterparty!",
        "Fußballspiel: [DATE], 15:30 Uhr",
        "Live-Musik am [DATE]",
        "[DATE] – Infos folgen!",
        "[DATE] um 20:00 Uhr",
        "Termin: [DATE], 18:00 Uhr",
        # A comma was missing after the next entry, which silently fused it
        # with the following sentence via implicit string concatenation.
        "Wann? [DATE], 19:00 bis 20:00 Uhr",
        "Das Konzert findet am [DATE] statt.",
        "Save the Date: [DATE]!",
        "Nächste Veranstaltung: [DATE]",
        "[DATE] – große Feier!",
        "Konzert am [DATE], 20:00 Uhr",
        "[DATE]",
        "Festival: [DATE] – [DATE]",
        "[DATE] – nicht verpassen!",
        "Sportevent: [DATE]",
        "Networking-Event am [DATE], 17:00 Uhr",
        "Workshop: [DATE], 14:00 Uhr",
        "Firmenfeier: [DATE] ab 18:30 Uhr",
        "Seminar: [DATE], Beginn um 10:00 Uhr",
        "Schulung am [DATE] um 15:00 Uhr",
        "Jubiläumsfeier am [DATE], 19:30 Uhr",
        "[DATE] 23:00",
        "[DATE] 23:00",
        "[DATE] 23:00",
        "Datum: [DATE], Startzeit: 10:00, Endzeit: 12:00",
        "Samstag, [DATE], Einlass: 15:59, Beginn: 17:30, Preis: 65,73 EUR",
        "Samstag, [DATE], Einlass: 17:00, Beginn: 19:00, Preis: 74,99 EUR",
        "Samstag, [DATE], Einlass: 18:00, Beginn: 20:00",
        "Freitag, [DATE], Einlass: 17:00, Beginn: 19:00, Preis: 77,93 EUR",
        "Samstag, [DATE], Einlass: 16:30, Beginn: 18:30, Preis: ab 69,99 Euro",
        "Gestört aber GeiL – Das Festival • [DATE], 16:00 • Berlin",
        "Samstag, [DATE], Einlass: 17:00, Beginn: 19:00",
        "Kaufberatung: [DATE] um 19:00",
        "Bedienung: [DATE] um 19:00",
        "[DATE] Ganztägig",
        "ab dem [DATE]",
        "Mittwoch, [DATE], von 18:00-22:00",
        "Mittwoch, [DATE], von 18:00-22:00",
        "Augsburger Friedensgespräche am [DATE]",
        "Augsburger Friedensgespräche am [DATE]",
        "Augsburger Friedensgespräche am [DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Winzerglühwein Do [DATE] 17:00 - 19:00",
        "Winzerglühwein Fr [DATE] 16:30 - 18:30",
        "Winzerglühwein Sa [DATE] 15:30 - 15:30",
        "Lessons and Carols Sa [DATE] 19:30 - 21:00",
        "[DATE] - [DATE]",
        "Vom [DATE] - [DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Do [DATE] 17:00 - 19:00",
        "Fr [DATE] 16:30 - 18:30",
        "Sa [DATE] 15:30 - 15:30",
        "Sa [DATE] 19:30 - 21:00",
        "[DATE]",
        "[DATE] - [DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE] und [DATE]",
        "[DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Vom [DATE] - [DATE]",
        "[DATE] und [DATE]",
        "Am [DATE] ab 19:00",
        "Am [DATE] ab 19:00",
        "Am [DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE] 16:00 – [DATE] 17:00",
        "[DATE] 10:15 – [DATE] 12:30",
        "[DATE] 10:00 – [DATE] 18:00",
        "[DATE]",
        "[DATE] 11:00 – [DATE] 18:00",
        "[DATE] - [DATE]",
        "[DATE] | 19:30",
        "[DATE]",
        "[DATE] bis einschließlich [DATE]",
        "[DATE], [DATE], [DATE] und [DATE]",
        "[DATE] 18:00",
        "[DATE] 13:00-21:00",
    ],
    "OTHER": [
        "Der Vorverkauf startet am [DATE].",
        "Anmeldefrist: [DATE]",
        "Tickets sind bis zum [DATE] erhältlich.",
        "Call for Papers läuft bis zum [DATE].",
        "Die Registrierung endet am [DATE].",
        "Bewerbungsschluss: [DATE].",
        "Frühbucherrabatt bis zum [DATE]!",
        "Einreichungsfrist: [DATE]",
        "Die Akkreditierung läuft bis zum [DATE]",
        "Reservierungen sind bis zum [DATE] möglich.",
        "Der Ticketverkauf startet am [DATE]",
        "Letzte Chance zur Anmeldung: [DATE]",
        "Call for Speakers läuft bis [DATE]",
        "Bitte reicht eure Abstracts bis [DATE] ein.",
        "Akkreditierung endet am [DATE]",
        "Bewerbungsschluss: [DATE]",
        "Die Early-Bird-Phase läuft bis zum [DATE]",
        "Anmeldefrist für Workshops: [DATE]",
        "Die Frist für Sponsorings endet am [DATE]",
        "Vergünstigte Tickets bis zum [DATE] verfügbar!",
    ],
}

# Few-shot examples for time mentions. NOTE(review): nothing below consumes
# this dictionary; the time category is decided by the zero-shot classifier.
time_classifier_train_data = {
    "EVENT_TIME": [
        "**Wann?** 12.05.2024, 19:00-21:00",
        "So. 12.08.2024 12:15 - 13:15 CET",
        "13:00 - 14:00",
        "Termin: [DATE], 19:00",
        "[DATE]",
        "Unser Meetup ist am [DATE] um 18:30 Uhr.",
        "Die Show findet am [DATE] um 20:00 Uhr statt.",
        "Das Webinar beginnt am [DATE] um 16:00 Uhr.",
        "Fußballspiel: [DATE], 15:30 Uhr",
        "[DATE] um 20:00 Uhr",
        "Termin: [DATE], 18:00 Uhr",
        # Same missing-comma defect as in the date examples; now two entries.
        "Wann? [DATE], 19:00 bis 20:00 Uhr",
        "Konzert am [DATE], 20:00 Uhr",
        "Networking-Event am [DATE], 17:00 Uhr",
        "Workshop: [DATE], 14:00 Uhr",
        "Firmenfeier: [DATE] ab 18:30 Uhr",
        "Seminar: [DATE], Beginn um 10:00 Uhr",
        "Schulung am [DATE] um 15:00 Uhr",
        "Jubiläumsfeier am [DATE], 19:30 Uhr",
        "[DATE] 23:00",
        "[DATE] 23:00",
        "[DATE] 23:00",
        "Datum: [DATE], Startzeit: 10:00, Endzeit: 12:00",
        "Samstag, [DATE], Einlass: 15:59, Beginn: 17:30, Preis: 65,73 EUR",
        "Samstag, [DATE], Einlass: 17:00, Beginn: 19:00, Preis: 74,99 EUR",
        "Samstag, [DATE], Einlass: 18:00, Beginn: 20:00",
        "Freitag, [DATE], Einlass: 17:00, Beginn: 19:00, Preis: 77,93 EUR",
        "Samstag, [DATE], Einlass: 16:30, Beginn: 18:30, Preis: ab 69,99 Euro",
        "Gestört aber GeiL – Das Festival • [DATE], 16:00 • Berlin",
        "Samstag, [DATE], Einlass: 17:00, Beginn: 19:00",
        "Kaufberatung: [DATE] um 19:00",
        "Bedienung: [DATE] um 19:00",
        "[DATE] Ganztägig",
        "ab dem [DATE]",
        "Mittwoch, [DATE], von 18:00-22:00",
        "Mittwoch, [DATE], von 18:00-22:00",
        "Augsburger Friedensgespräche am [DATE]",
        "Augsburger Friedensgespräche am [DATE]",
        "Augsburger Friedensgespräche am [DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Winzerglühwein Do [DATE] 17:00 - 19:00",
        "Winzerglühwein Fr [DATE] 16:30 - 18:30",
        "Winzerglühwein Sa [DATE] 15:30 - 15:30",
        "Lessons and Carols Sa [DATE] 19:30 - 21:00",
        "[DATE] - [DATE]",
        "Vom [DATE] - [DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Do [DATE] 17:00 - 19:00",
        "Fr [DATE] 16:30 - 18:30",
        "Sa [DATE] 15:30 - 15:30",
        "Sa [DATE] 19:30 - 21:00",
        "[DATE]",
        "[DATE] - [DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE] und [DATE]",
        "[DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Vom [DATE] - [DATE]",
        "[DATE] und [DATE]",
        "Am [DATE] ab 19:00",
        "Am [DATE] ab 19:00",
        "Am [DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE] 16:00 – [DATE] 17:00",
        "[DATE] 10:15 – [DATE] 12:30",
        "[DATE] 10:00 – [DATE] 18:00",
        "[DATE]",
        "[DATE] 11:00 – [DATE] 18:00",
        "[DATE] - [DATE]",
        "[DATE] | 19:30",
        "[DATE]",
        "[DATE] bis einschließlich [DATE]",
        "[DATE], [DATE], [DATE] und [DATE]",
        "[DATE] 18:00",
        "[DATE] 13:00-21:00",
    ],
    "OTHER": [
        "Einlass: 19:00",
        "Abendkasse ab 20:00 Uhr",
        "Tageskarten können ab 18:00 Uhr gekauft werden.",
        "Öffnungszeiten: Mo-Fr 09:00 - 17:00",
        "Kartenverkauf ab 17:30 Uhr",
        "Einlass beginnt um 18:45",
        "Reservierung erforderlich bis 12:00 Uhr",
    ],
}

# Entity label -> placeholder written into the text handed to the classifiers.
_ENTITY_PLACEHOLDERS = {"DATE": "[DATE]", "TIME": "[TIME]"}


def _mask_entities(doc, text):
    """Return *text* with every DATE/TIME entity of *doc* replaced by its placeholder.

    Args:
        doc: spaCy ``Doc`` produced by ``nlp`` for the same text.
        text: The string to rewrite.

    Returns:
        The rewritten string; entities with other labels are left untouched.
    """
    for ent in doc.ents:
        placeholder = _ENTITY_PLACEHOLDERS.get(ent.label_)
        if placeholder is not None:
            # str.replace rewrites every occurrence of the entity's surface text.
            text = text.replace(ent.text, placeholder)
    return text


nlp = spacy.blank("de")
nlp.add_pipe('sentencizer')

# 1) Treat "." as a suffix, and "." / ":" between digits as infixes, so that
#    "12.05.2024" and "19:00" are split into separate tokens.
suffixes = list(nlp.Defaults.suffixes) + [r"\."]
infixes = list(nlp.Defaults.infixes) + [r"(?<=\d)\.(?=\d)"] + [r"(?<=\d)\:(?=\d)"]

# Compile the rule lists into regex objects.
suffix_re = compile_suffix_regex(suffixes)
infix_re = compile_infix_regex(infixes)

# Install the customized tokenizer (only suffix and infix rules are supplied).
nlp.tokenizer = Tokenizer(nlp.vocab, suffix_search=suffix_re.search, infix_finditer=infix_re.finditer)

# 2) Entity ruler that tags dd.dd.dddd as DATE and dd:dd as TIME.
ruler = nlp.add_pipe("entity_ruler")
patterns = [
    {
        "label": "DATE",
        "pattern": [
            {"SHAPE": "dd"},
            {"ORTH": "."},
            {"SHAPE": "dd"},
            {"ORTH": "."},
            {"SHAPE": "dddd"},
        ],
    },
    {
        "label": "TIME",
        "pattern": [
            {"SHAPE": "dd"},
            {"ORTH": ":"},
            {"SHAPE": "dd"},
        ],
    },
]
ruler.add_patterns(patterns)

# Prepare training data: normalize each example and replace concrete dates and
# times with placeholders.
classifier_train_data_cleaned = {"EVENT_DATE": [], "OTHER": []}
for label, cleaned_examples in classifier_train_data_cleaned.items():
    for text in date_classifier_train_data[label]:
        text = normalize_data(text)
        cleaned_examples.append(_mask_entities(nlp(text), text))

# Remove duplicates. dict.fromkeys keeps first-seen order, so the training set
# is identical from run to run (a set would reorder it under hash randomization).
classifier_train_data_cleaned["EVENT_DATE"] = list(dict.fromkeys(classifier_train_data_cleaned["EVENT_DATE"]))
classifier_train_data_cleaned["OTHER"] = list(dict.fromkeys(classifier_train_data_cleaned["OTHER"]))

print(classifier_train_data_cleaned["EVENT_DATE"])
print(classifier_train_data_cleaned["OTHER"])

classifier = ClassyClassifier(data=classifier_train_data_cleaned)
classifier.set_embedding_model(model="stsb-xlm-r-multilingual")

# NOTE: the path is relative to the current working directory.
with open("../../playground/models/date_classifier.pkl", "wb") as f:
    pickle.dump(classifier, f)

zero_shot_classifier = ZeroShotClassifier()

# Overrides the imported TEXTS with a single ad-hoc sample.
TEXTS = ["Tickets können ab dem 03.12.2020 erworben werden"]

for text in TEXTS:
    text = normalize_data(text)
    analyzer = MarkdownAnalyzer(text)
    print("*"*100)
    print(text)
    print("\n\n\n")

    # Previously this result was immediately overwritten with an empty list,
    # so the loop below never ran. `or []` covers a missing "block_elements" key.
    md_elements = analyzer.identify_all().get("block_elements") or []
    dates = {"dates": [], "times": []}

    for md_element in md_elements:
        doc = nlp(md_element.text)
        # Debug aid for checking tokenization:
        # print("Tokens:", [token.text for token in doc])
        print(doc.ents)
        if not doc.ents:
            continue

        print(md_element.text)
        # Replace TIME and DATE entities with placeholders.
        modified_text = _mask_entities(doc, md_element.text)

        date_entities = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
        if date_entities:
            # Classify the date category; keep dates only for the top label EVENT_DATE.
            print("DATES: ", date_entities)
            cats = classifier(modified_text)
            date_category = max(cats, key=cats.get)
            print("Date Category: ", date_category)
            if date_category == "EVENT_DATE":
                dates["dates"].extend(date_entities)

        time_entities = [ent.text for ent in doc.ents if ent.label_ == "TIME"]
        if time_entities:
            # Classify the time category; keep times only when the first result is BEGINN.
            print("ZEITEN: ", time_entities)
            time_category = zero_shot_classifier.classify(
                modified_text,
                CustomMode(
                    labels=["BEGINN", "EINLASS", "ABLAUF"],
                    hypothesis_template="Der Text geht um {} einer Veranstaltung",
                ),
            )[0].label
            print("Time Category: ", time_category)
            if time_category == "BEGINN":
                dates["times"].extend(time_entities)
        print("\n")

    print(dates)
    print("*" * 100)