# event-data-extraction-playground/src/nlp/experimental/textclassification/classy_classifier_date.py
from classy_classification import ClassyClassifier | |
import pickle | |
import spacy | |
from spacy import displacy | |
from spacy.tokenizer import Tokenizer | |
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex | |
from nltk import Tree | |
from src.nlp.data.test_texts import TEXTS | |
from src.nlp.playground.textclassification import ZeroShotClassifier, CustomMode | |
from src.utils.helpers import normalize_data | |
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer | |
# Few-shot training samples for the date classifier.
# "EVENT_DATE" holds sentences in which the [DATE] placeholder marks when an
# event takes place; "OTHER" holds sentences in which the date is a deadline
# or sales date. Repeated samples are kept on purpose here; the preparation
# step further down removes duplicates before training.
date_classifier_train_data = {
    "EVENT_DATE": [
        "Termin: [DATE], 19:00",
        "[DATE]",
        "Unser Meetup ist am [DATE] um 18:30 Uhr.",
        "Konzert: [DATE]",
        "Das Festival startet am [DATE]",
        "Die Show findet am [DATE] um 20:00 Uhr statt.",
        "[DATE] – Save the Date!",
        "Das Webinar beginnt am [DATE] um 16:00 Uhr.",
        "[DATE] – Große Premiere im Theater!",
        "Event am [DATE], komm vorbei!",
        "[DATE] – Silvesterparty!",
        "Fußballspiel: [DATE], 15:30 Uhr",
        "Live-Musik am [DATE]",
        "[DATE] – Infos folgen!",
        "[DATE] um 20:00 Uhr",
        "Termin: [DATE], 18:00 Uhr",
        # The trailing comma below was missing, which glued this sample and
        # the next one into a single string.
        "Wann? [DATE], 19:00 bis 20:00 Uhr",
        "Das Konzert findet am [DATE] statt.",
        "Save the Date: [DATE]!",
        "Nächste Veranstaltung: [DATE]",
        "[DATE] – große Feier!",
        "Konzert am [DATE], 20:00 Uhr",
        "[DATE]",
        "Festival: [DATE] – [DATE]",
        "[DATE] – nicht verpassen!",
        "Sportevent: [DATE]",
        "Networking-Event am [DATE], 17:00 Uhr",
        "Workshop: [DATE], 14:00 Uhr",
        "Firmenfeier: [DATE] ab 18:30 Uhr",
        "Seminar: [DATE], Beginn um 10:00 Uhr",
        "Schulung am [DATE] um 15:00 Uhr",
        "Jubiläumsfeier am [DATE], 19:30 Uhr",
        "[DATE] 23:00",
        "[DATE] 23:00",
        "[DATE] 23:00",
        "Datum: [DATE], Startzeit: 10:00, Endzeit: 12:00",
        "Samstag, [DATE], Einlass: 15:59, Beginn: 17:30, Preis: 65,73 EUR",
        "Samstag, [DATE], Einlass: 17:00, Beginn: 19:00, Preis: 74,99 EUR",
        "Samstag, [DATE], Einlass: 18:00, Beginn: 20:00",
        "Freitag, [DATE], Einlass: 17:00, Beginn: 19:00, Preis: 77,93 EUR",
        "Samstag, [DATE], Einlass: 16:30, Beginn: 18:30, Preis: ab 69,99 Euro",
        "Gestört aber GeiL – Das Festival • [DATE], 16:00 • Berlin",
        "Samstag, [DATE], Einlass: 17:00, Beginn: 19:00",
        "Kaufberatung: [DATE] um 19:00",
        "Bedienung: [DATE] um 19:00",
        "[DATE] Ganztägig",
        "ab dem [DATE]",
        "Mittwoch, [DATE], von 18:00-22:00",
        "Mittwoch, [DATE], von 18:00-22:00",
        "Augsburger Friedensgespräche am [DATE]",
        "Augsburger Friedensgespräche am [DATE]",
        "Augsburger Friedensgespräche am [DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Winzerglühwein Do [DATE] 17:00 - 19:00",
        "Winzerglühwein Fr [DATE] 16:30 - 18:30",
        "Winzerglühwein Sa [DATE] 15:30 - 15:30",
        "Lessons and Carols Sa [DATE] 19:30 - 21:00",
        "[DATE] - [DATE]",
        "Vom [DATE] - [DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Do [DATE] 17:00 - 19:00",
        "Fr [DATE] 16:30 - 18:30",
        "Sa [DATE] 15:30 - 15:30",
        "Sa [DATE] 19:30 - 21:00",
        "[DATE]",
        "[DATE] - [DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE] und [DATE]",
        "[DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Vom [DATE] - [DATE]",
        "[DATE] und [DATE]",
        "Am [DATE] ab 19:00",
        "Am [DATE] ab 19:00",
        "Am [DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE] 16:00 – [DATE] 17:00",
        "[DATE] 10:15 – [DATE] 12:30",
        "[DATE] 10:00 – [DATE] 18:00",
        "[DATE]",
        "[DATE] 11:00 – [DATE] 18:00",
        "[DATE] - [DATE]",
        "[DATE] | 19:30",
        "[DATE]",
        "[DATE] bis einschließlich [DATE]",
        "[DATE], [DATE], [DATE] und [DATE]",
        "[DATE] 18:00",
        "[DATE] 13:00-21:00",
    ],
    "OTHER": [
        "Der Vorverkauf startet am [DATE].",
        "Anmeldefrist: [DATE]",
        "Tickets sind bis zum [DATE] erhältlich.",
        "Call for Papers läuft bis zum [DATE].",
        "Die Registrierung endet am [DATE].",
        "Bewerbungsschluss: [DATE].",
        "Frühbucherrabatt bis zum [DATE]!",
        "Einreichungsfrist: [DATE]",
        "Die Akkreditierung läuft bis zum [DATE]",
        "Reservierungen sind bis zum [DATE] möglich.",
        "Der Ticketverkauf startet am [DATE]",
        "Letzte Chance zur Anmeldung: [DATE]",
        "Call for Speakers läuft bis [DATE]",
        "Bitte reicht eure Abstracts bis [DATE] ein.",
        "Akkreditierung endet am [DATE]",
        "Bewerbungsschluss: [DATE]",
        "Die Early-Bird-Phase läuft bis zum [DATE]",
        "Anmeldefrist für Workshops: [DATE]",
        "Die Frist für Sponsorings endet am [DATE]",
        "Vergünstigte Tickets bis zum [DATE] verfügbar!",
    ],
}
# Few-shot training samples for a time classifier.
# "EVENT_TIME" holds sentences whose date/time describes the event itself;
# "OTHER" holds sentences about admission, box office or opening hours.
# Repeated samples are kept exactly as they were collected.
time_classifier_train_data = {
    "EVENT_TIME": [
        "**Wann?** 12.05.2024, 19:00-21:00",
        "So. 12.08.2024 12:15 - 13:15 CET",
        "13:00 - 14:00",
        "Termin: [DATE], 19:00",
        "[DATE]",
        "Unser Meetup ist am [DATE] um 18:30 Uhr.",
        "Die Show findet am [DATE] um 20:00 Uhr statt.",
        "Das Webinar beginnt am [DATE] um 16:00 Uhr.",
        "Fußballspiel: [DATE], 15:30 Uhr",
        "[DATE] um 20:00 Uhr",
        "Termin: [DATE], 18:00 Uhr",
        # The trailing comma below was missing, which glued this sample and
        # the next one into a single string.
        "Wann? [DATE], 19:00 bis 20:00 Uhr",
        "Konzert am [DATE], 20:00 Uhr",
        "Networking-Event am [DATE], 17:00 Uhr",
        "Workshop: [DATE], 14:00 Uhr",
        "Firmenfeier: [DATE] ab 18:30 Uhr",
        "Seminar: [DATE], Beginn um 10:00 Uhr",
        "Schulung am [DATE] um 15:00 Uhr",
        "Jubiläumsfeier am [DATE], 19:30 Uhr",
        "[DATE] 23:00",
        "[DATE] 23:00",
        "[DATE] 23:00",
        "Datum: [DATE], Startzeit: 10:00, Endzeit: 12:00",
        "Samstag, [DATE], Einlass: 15:59, Beginn: 17:30, Preis: 65,73 EUR",
        "Samstag, [DATE], Einlass: 17:00, Beginn: 19:00, Preis: 74,99 EUR",
        "Samstag, [DATE], Einlass: 18:00, Beginn: 20:00",
        "Freitag, [DATE], Einlass: 17:00, Beginn: 19:00, Preis: 77,93 EUR",
        "Samstag, [DATE], Einlass: 16:30, Beginn: 18:30, Preis: ab 69,99 Euro",
        "Gestört aber GeiL – Das Festival • [DATE], 16:00 • Berlin",
        "Samstag, [DATE], Einlass: 17:00, Beginn: 19:00",
        "Kaufberatung: [DATE] um 19:00",
        "Bedienung: [DATE] um 19:00",
        "[DATE] Ganztägig",
        "ab dem [DATE]",
        "Mittwoch, [DATE], von 18:00-22:00",
        "Mittwoch, [DATE], von 18:00-22:00",
        "Augsburger Friedensgespräche am [DATE]",
        "Augsburger Friedensgespräche am [DATE]",
        "Augsburger Friedensgespräche am [DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Winzerglühwein Do [DATE] 17:00 - 19:00",
        "Winzerglühwein Fr [DATE] 16:30 - 18:30",
        "Winzerglühwein Sa [DATE] 15:30 - 15:30",
        "Lessons and Carols Sa [DATE] 19:30 - 21:00",
        "[DATE] - [DATE]",
        "Vom [DATE] - [DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Do [DATE] 17:00 - 19:00",
        "Fr [DATE] 16:30 - 18:30",
        "Sa [DATE] 15:30 - 15:30",
        "Sa [DATE] 19:30 - 21:00",
        "[DATE]",
        "[DATE] - [DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE] und [DATE]",
        "[DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Vom [DATE] - [DATE]",
        "[DATE] und [DATE]",
        "Am [DATE] ab 19:00",
        "Am [DATE] ab 19:00",
        "Am [DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE] 16:00 – [DATE] 17:00",
        "[DATE] 10:15 – [DATE] 12:30",
        "[DATE] 10:00 – [DATE] 18:00",
        "[DATE]",
        "[DATE] 11:00 – [DATE] 18:00",
        "[DATE] - [DATE]",
        "[DATE] | 19:30",
        "[DATE]",
        "[DATE] bis einschließlich [DATE]",
        "[DATE], [DATE], [DATE] und [DATE]",
        "[DATE] 18:00",
        "[DATE] 13:00-21:00",
    ],
    "OTHER": [
        "Einlass: 19:00",
        "Abendkasse ab 20:00 Uhr",
        "Tageskarten können ab 18:00 Uhr gekauft werden.",
        "Öffnungszeiten: Mo-Fr 09:00 - 17:00",
        "Kartenverkauf ab 17:30 Uhr",
        "Einlass beginnt um 18:45",
        "Reservierung erforderlich bis 12:00 Uhr",
    ],
}
# Blank German pipeline with rule-based sentence splitting.
nlp = spacy.blank("de")
nlp.add_pipe("sentencizer")

# 1) Tokenizer rules: treat "." as a suffix, and split on "." / ":" when they
#    sit between two digits, so "12.05.2024" and "19:00" become several tokens.
suffixes = [*nlp.Defaults.suffixes, r"\."]
infixes = [
    *nlp.Defaults.infixes,
    r"(?<=\d)\.(?=\d)",
    r"(?<=\d)\:(?=\d)",
]

# Compile the rule lists into regex objects.
suffix_re = compile_suffix_regex(suffixes)
infix_re = compile_infix_regex(infixes)

# Install the customised tokenizer.
# NOTE(review): only suffix and infix handling are passed here (no
# prefix_search), so leading punctuation presumably stays attached to a
# token -- confirm this is intended.
nlp.tokenizer = Tokenizer(
    nlp.vocab,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
)

# 2) Entity ruler that tags dates (dd.dd.dddd) and times (dd:dd) by token shape.
ruler = nlp.add_pipe("entity_ruler")
patterns = [
    {
        "label": "DATE",
        "pattern": [
            {"SHAPE": "dd"},
            {"ORTH": "."},
            {"SHAPE": "dd"},
            {"ORTH": "."},
            {"SHAPE": "dddd"},
        ],
    },
    {
        "label": "TIME",
        "pattern": [
            {"SHAPE": "dd"},
            {"ORTH": ":"},
            {"SHAPE": "dd"},
        ],
    },
]
ruler.add_patterns(patterns)
# Prepare training data: replace concrete dates and times with placeholders.
def _mask_date_time_entities(text):
    """Return *text* with DATE/TIME entities replaced by placeholders.

    The text is run through the module-level ``nlp`` pipeline; every entity
    labelled "DATE" is replaced by "[DATE]" and every entity labelled "TIME"
    by "[TIME]".
    """
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "DATE":
            text = text.replace(ent.text, "[DATE]")
        elif ent.label_ == "TIME":
            text = text.replace(ent.text, "[TIME]")
    return text


# Normalise and mask every sample, then drop duplicates. dict.fromkeys keeps
# first-seen order, so the sample order is identical on every run (a set()
# round trip would shuffle it between interpreter runs).
classifier_train_data_cleaned = {
    label: list(
        dict.fromkeys(
            _mask_date_time_entities(normalize_data(sample))
            for sample in date_classifier_train_data[label]
        )
    )
    for label in ("EVENT_DATE", "OTHER")
}

print(classifier_train_data_cleaned["EVENT_DATE"])
print(classifier_train_data_cleaned["OTHER"])

# Fit the few-shot classifier on the cleaned samples.
classifier = ClassyClassifier(data=classifier_train_data_cleaned)
classifier.set_embedding_model(model="stsb-xlm-r-multilingual")

# Persist the trained classifier.
# NOTE(review): the path is relative to the current working directory, so the
# script presumably has to be started from its own folder -- confirm.
with open("../../playground/models/date_classifier.pkl", "wb") as f:
    pickle.dump(classifier, f)
# Try the trained date classifier and a zero-shot time classifier on sample texts.
zero_shot_classifier = ZeroShotClassifier()

# NOTE: this replaces the TEXTS imported at the top of the file with a single
# hand-picked sample.
TEXTS = ["Tickets können ab dem 03.12.2020 erworben werden"]

for text in TEXTS:
    text = normalize_data(text)
    analyzer = MarkdownAnalyzer(text)
    print("*"*100)
    print(text)
    print("\n\n\n")

    # Previously this result was immediately overwritten with an empty list,
    # so the loop below never ran. Fall back to an empty list only when the
    # analyzer reports no block elements.
    md_elements = analyzer.identify_all().get("block_elements") or []

    # Collected event dates and start times for the current text.
    dates = {"dates": [], "times": []}

    for md_element in md_elements:
        doc = nlp(md_element.text)
        # To check tokenization, print [token.text for token in doc] here.
        print(doc.ents)
        if doc.ents:
            print(md_element.text)
            modified_text = md_element.text

            # Replace TIME and DATE entities with placeholders.
            for ent in doc.ents:
                if ent.label_ == "DATE":
                    modified_text = modified_text.replace(ent.text, "[DATE]")
                elif ent.label_ == "TIME":
                    modified_text = modified_text.replace(ent.text, "[TIME]")

            # Classify the date category; keep the dates only when the
            # best-scoring label is EVENT_DATE.
            date_entities = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
            if date_entities:
                print("DATES: ", date_entities)
                cats = classifier(modified_text)
                date_category = max(cats, key=cats.get)
                print("Date Category: ", date_category)
                if date_category == "EVENT_DATE":
                    dates["dates"].extend(date_entities)

            # Classify the time category; keep the times only when the top
            # zero-shot label is BEGINN.
            time_entities = [ent.text for ent in doc.ents if ent.label_ == "TIME"]
            if time_entities:
                print("ZEITEN: ", time_entities)
                time_category = zero_shot_classifier.classify(modified_text, CustomMode(
                    labels=["BEGINN", "EINLASS", "ABLAUF"],
                    hypothesis_template="Der Text geht um {} einer Veranstaltung"))[0].label
                print("Time Category: ", time_category)
                if time_category == "BEGINN":
                    dates["times"].extend(time_entities)
            print("\n")

    print(dates)
    print("*" * 100)