# manaviel85370 · extract dates and times · 83f1514
from classy_classification import ClassyClassifier
import pickle
import spacy
from spacy import displacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
from nltk import Tree
from src.nlp.data.test_texts import TEXTS
from src.nlp.playground.textclassification import ZeroShotClassifier, CustomMode
from src.utils.helpers import normalize_data
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer
# Labelled few-shot samples for the date classifier. Concrete dates are written
# as the "[DATE]" placeholder. "EVENT_DATE" samples state when an event takes
# place; "OTHER" samples mention dates that are not the event date (deadlines,
# ticket sales, registration periods, ...). Repeated samples are harmless: the
# lists are de-duplicated before the classifier is trained.
date_classifier_train_data = {
    "EVENT_DATE": [
        "Termin: [DATE], 19:00",
        "[DATE]",
        "Unser Meetup ist am [DATE] um 18:30 Uhr.",
        "Konzert: [DATE]",
        "Das Festival startet am [DATE]",
        "Die Show findet am [DATE] um 20:00 Uhr statt.",
        "[DATE] – Save the Date!",
        "Das Webinar beginnt am [DATE] um 16:00 Uhr.",
        "[DATE] – Große Premiere im Theater!",
        "Event am [DATE], komm vorbei!",
        "[DATE] – Silvesterparty!",
        "Fußballspiel: [DATE], 15:30 Uhr",
        "Live-Musik am [DATE]",
        "[DATE] – Infos folgen!",
        "[DATE] um 20:00 Uhr",
        "Termin: [DATE], 18:00 Uhr",
        # The comma after the next sample was missing, which silently glued it
        # to the following string via implicit literal concatenation.
        "Wann? [DATE], 19:00 bis 20:00 Uhr",
        "Das Konzert findet am [DATE] statt.",
        "Save the Date: [DATE]!",
        "Nächste Veranstaltung: [DATE]",
        "[DATE] – große Feier!",
        "Konzert am [DATE], 20:00 Uhr",
        "[DATE]",
        "Festival: [DATE] – [DATE]",
        "[DATE] – nicht verpassen!",
        "Sportevent: [DATE]",
        "Networking-Event am [DATE], 17:00 Uhr",
        "Workshop: [DATE], 14:00 Uhr",
        "Firmenfeier: [DATE] ab 18:30 Uhr",
        "Seminar: [DATE], Beginn um 10:00 Uhr",
        "Schulung am [DATE] um 15:00 Uhr",
        "Jubiläumsfeier am [DATE], 19:30 Uhr",
        "[DATE] 23:00",
        "[DATE] 23:00",
        "[DATE] 23:00",
        "Datum: [DATE], Startzeit: 10:00, Endzeit: 12:00",
        "Samstag, [DATE], Einlass: 15:59, Beginn: 17:30, Preis: 65,73 EUR",
        "Samstag, [DATE], Einlass: 17:00, Beginn: 19:00, Preis: 74,99 EUR",
        "Samstag, [DATE], Einlass: 18:00, Beginn: 20:00",
        "Freitag, [DATE], Einlass: 17:00, Beginn: 19:00, Preis: 77,93 EUR",
        "Samstag, [DATE], Einlass: 16:30, Beginn: 18:30, Preis: ab 69,99 Euro",
        "Gestört aber GeiL – Das Festival • [DATE], 16:00 • Berlin",
        "Samstag, [DATE], Einlass: 17:00, Beginn: 19:00",
        "Kaufberatung: [DATE] um 19:00",
        "Bedienung: [DATE] um 19:00",
        "[DATE] Ganztägig",
        "ab dem [DATE]",
        "Mittwoch, [DATE], von 18:00-22:00",
        "Mittwoch, [DATE], von 18:00-22:00",
        "Augsburger Friedensgespräche am [DATE]",
        "Augsburger Friedensgespräche am [DATE]",
        "Augsburger Friedensgespräche am [DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Winzerglühwein Do [DATE] 17:00 - 19:00",
        "Winzerglühwein Fr [DATE] 16:30 - 18:30",
        "Winzerglühwein Sa [DATE] 15:30 - 15:30",
        "Lessons and Carols Sa [DATE] 19:30 - 21:00",
        "[DATE] - [DATE]",
        "Vom [DATE] - [DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Do [DATE] 17:00 - 19:00",
        "Fr [DATE] 16:30 - 18:30",
        "Sa [DATE] 15:30 - 15:30",
        "Sa [DATE] 19:30 - 21:00",
        "[DATE]",
        "[DATE] - [DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE] und [DATE]",
        "[DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Vom [DATE] - [DATE]",
        "[DATE] und [DATE]",
        "Am [DATE] ab 19:00",
        "Am [DATE] ab 19:00",
        "Am [DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE] 16:00 – [DATE] 17:00",
        "[DATE] 10:15 – [DATE] 12:30",
        "[DATE] 10:00 – [DATE] 18:00",
        "[DATE]",
        "[DATE] 11:00 – [DATE] 18:00",
        "[DATE] - [DATE]",
        "[DATE] | 19:30",
        "[DATE]",
        "[DATE] bis einschließlich [DATE]",
        "[DATE], [DATE], [DATE] und [DATE]",
        "[DATE] 18:00",
        "[DATE] 13:00-21:00",
    ],
    "OTHER": [
        "Der Vorverkauf startet am [DATE].",
        "Anmeldefrist: [DATE]",
        "Tickets sind bis zum [DATE] erhältlich.",
        "Call for Papers läuft bis zum [DATE].",
        "Die Registrierung endet am [DATE].",
        "Bewerbungsschluss: [DATE].",
        "Frühbucherrabatt bis zum [DATE]!",
        "Einreichungsfrist: [DATE]",
        "Die Akkreditierung läuft bis zum [DATE]",
        "Reservierungen sind bis zum [DATE] möglich.",
        "Der Ticketverkauf startet am [DATE]",
        "Letzte Chance zur Anmeldung: [DATE]",
        "Call for Speakers läuft bis [DATE]",
        "Bitte reicht eure Abstracts bis [DATE] ein.",
        "Akkreditierung endet am [DATE]",
        "Bewerbungsschluss: [DATE]",
        "Die Early-Bird-Phase läuft bis zum [DATE]",
        "Anmeldefrist für Workshops: [DATE]",
        "Die Frist für Sponsorings endet am [DATE]",
        "Vergünstigte Tickets bis zum [DATE] verfügbar!",
    ],
}
# Labelled few-shot samples for a time classifier. "EVENT_TIME" samples are
# meant to carry the time at which the event itself happens; "OTHER" samples
# carry times that are not the event time (admission, box office, opening
# hours, ...).
# NOTE(review): many "EVENT_TIME" samples contain only "[DATE]" and no time at
# all - they look copied from the date training data; confirm this is intended.
time_classifier_train_data = {
    "EVENT_TIME": [
        "**Wann?** 12.05.2024, 19:00-21:00",
        "So. 12.08.2024 12:15 - 13:15 CET",
        "13:00 - 14:00",
        "Termin: [DATE], 19:00",
        "[DATE]",
        "Unser Meetup ist am [DATE] um 18:30 Uhr.",
        "Die Show findet am [DATE] um 20:00 Uhr statt.",
        "Das Webinar beginnt am [DATE] um 16:00 Uhr.",
        "Fußballspiel: [DATE], 15:30 Uhr",
        "[DATE] um 20:00 Uhr",
        "Termin: [DATE], 18:00 Uhr",
        # The comma after the next sample was missing, which silently glued it
        # to the following string via implicit literal concatenation.
        "Wann? [DATE], 19:00 bis 20:00 Uhr",
        "Konzert am [DATE], 20:00 Uhr",
        "Networking-Event am [DATE], 17:00 Uhr",
        "Workshop: [DATE], 14:00 Uhr",
        "Firmenfeier: [DATE] ab 18:30 Uhr",
        "Seminar: [DATE], Beginn um 10:00 Uhr",
        "Schulung am [DATE] um 15:00 Uhr",
        "Jubiläumsfeier am [DATE], 19:30 Uhr",
        "[DATE] 23:00",
        "[DATE] 23:00",
        "[DATE] 23:00",
        "Datum: [DATE], Startzeit: 10:00, Endzeit: 12:00",
        "Samstag, [DATE], Einlass: 15:59, Beginn: 17:30, Preis: 65,73 EUR",
        "Samstag, [DATE], Einlass: 17:00, Beginn: 19:00, Preis: 74,99 EUR",
        "Samstag, [DATE], Einlass: 18:00, Beginn: 20:00",
        "Freitag, [DATE], Einlass: 17:00, Beginn: 19:00, Preis: 77,93 EUR",
        "Samstag, [DATE], Einlass: 16:30, Beginn: 18:30, Preis: ab 69,99 Euro",
        "Gestört aber GeiL – Das Festival • [DATE], 16:00 • Berlin",
        "Samstag, [DATE], Einlass: 17:00, Beginn: 19:00",
        "Kaufberatung: [DATE] um 19:00",
        "Bedienung: [DATE] um 19:00",
        "[DATE] Ganztägig",
        "ab dem [DATE]",
        "Mittwoch, [DATE], von 18:00-22:00",
        "Mittwoch, [DATE], von 18:00-22:00",
        "Augsburger Friedensgespräche am [DATE]",
        "Augsburger Friedensgespräche am [DATE]",
        "Augsburger Friedensgespräche am [DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Winzerglühwein Do [DATE] 17:00 - 19:00",
        "Winzerglühwein Fr [DATE] 16:30 - 18:30",
        "Winzerglühwein Sa [DATE] 15:30 - 15:30",
        "Lessons and Carols Sa [DATE] 19:30 - 21:00",
        "[DATE] - [DATE]",
        "Vom [DATE] - [DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Do [DATE] 17:00 - 19:00",
        "Fr [DATE] 16:30 - 18:30",
        "Sa [DATE] 15:30 - 15:30",
        "Sa [DATE] 19:30 - 21:00",
        "[DATE]",
        "[DATE] - [DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE] und [DATE]",
        "[DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Vom [DATE] - [DATE]",
        "[DATE] und [DATE]",
        "Am [DATE] ab 19:00",
        "Am [DATE] ab 19:00",
        "Am [DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE] 16:00 – [DATE] 17:00",
        "[DATE] 10:15 – [DATE] 12:30",
        "[DATE] 10:00 – [DATE] 18:00",
        "[DATE]",
        "[DATE] 11:00 – [DATE] 18:00",
        "[DATE] - [DATE]",
        "[DATE] | 19:30",
        "[DATE]",
        "[DATE] bis einschließlich [DATE]",
        "[DATE], [DATE], [DATE] und [DATE]",
        "[DATE] 18:00",
        "[DATE] 13:00-21:00",
    ],
    "OTHER": [
        "Einlass: 19:00",
        "Abendkasse ab 20:00 Uhr",
        "Tageskarten können ab 18:00 Uhr gekauft werden.",
        "Öffnungszeiten: Mo-Fr 09:00 - 17:00",
        "Kartenverkauf ab 17:30 Uhr",
        "Einlass beginnt um 18:45",
        "Reservierung erforderlich bis 12:00 Uhr",
    ],
}
# Blank German pipeline: only rule-based components are added below.
nlp = spacy.blank("de")
nlp.add_pipe('sentencizer')

# Step 1: tokenizer rules.
# On top of the language defaults, a trailing "." is split off as a suffix and
# "." / ":" are split as infixes when they sit between two digits, so that
# "12.05.2024" and "19:00" become separate digit and punctuation tokens.
suffixes = [*nlp.Defaults.suffixes, r"\."]
infixes = [
    *nlp.Defaults.infixes,
    r"(?<=\d)\.(?=\d)",  # dot between digits (dates)
    r"(?<=\d)\:(?=\d)",  # colon between digits (times)
]

# Compile the rule lists into regex objects.
suffix_re = compile_suffix_regex(suffixes)
infix_re = compile_infix_regex(infixes)

# Install the customised tokenizer; only suffix and infix handlers are passed.
nlp.tokenizer = Tokenizer(
    nlp.vocab,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
)

# Step 2: entity ruler for date and time expressions.
# DATE matches "dd.dd.dddd" (e.g. 12.05.2024), TIME matches "dd:dd" (e.g. 19:00),
# both expressed over the token sequence produced by the tokenizer above.
patterns = [
    {
        "label": "DATE",
        "pattern": [
            {"SHAPE": "dd"},
            {"ORTH": "."},
            {"SHAPE": "dd"},
            {"ORTH": "."},
            {"SHAPE": "dddd"},
        ],
    },
    {
        "label": "TIME",
        "pattern": [
            {"SHAPE": "dd"},
            {"ORTH": ":"},
            {"SHAPE": "dd"},
        ],
    },
]
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
# Prepare training data: replace concrete dates and times with placeholders so
# the classifier learns from the surrounding wording, not from the values.
_PLACEHOLDER_BY_LABEL = {"DATE": "[DATE]", "TIME": "[TIME]"}


def _mask_entities(text):
    """Normalize *text* and replace each DATE/TIME entity with its placeholder.

    The text is passed through ``normalize_data`` and the ``nlp`` pipeline;
    every occurrence of a recognised entity's text is then substituted by
    "[DATE]" or "[TIME]". Entities with any other label are left untouched.
    """
    text = normalize_data(text)
    for ent in nlp(text).ents:
        placeholder = _PLACEHOLDER_BY_LABEL.get(ent.label_)
        if placeholder is not None:
            text = text.replace(ent.text, placeholder)
    return text


# Mask every sample and drop duplicates. dict.fromkeys keeps the first
# occurrence in its original position, so the training order is identical on
# every run (a plain set() would reorder the samples under hash randomisation).
classifier_train_data_cleaned = {
    label: list(
        dict.fromkeys(
            _mask_entities(sample) for sample in date_classifier_train_data[label]
        )
    )
    for label in ("EVENT_DATE", "OTHER")
}

print(classifier_train_data_cleaned["EVENT_DATE"])
print(classifier_train_data_cleaned["OTHER"])

# Train the few-shot date classifier on the masked samples.
classifier = ClassyClassifier(data=classifier_train_data_cleaned)
classifier.set_embedding_model(model="stsb-xlm-r-multilingual")

# Persist the trained classifier.
# NOTE(review): the path is relative to the current working directory, not to
# this file - the script has to be started from the expected folder.
with open("../../playground/models/date_classifier.pkl", "wb") as f:
    pickle.dump(classifier, f)
# Demo run: extract event dates and start times from sample texts.
zero_shot_classifier = ZeroShotClassifier()

# NOTE: this rebinds the TEXTS imported from src.nlp.data.test_texts, so only
# this single sample is processed.
TEXTS = ["Tickets können ab dem 03.12.2020 erworben werden"]

for text in TEXTS:
    text = normalize_data(text)
    analyzer = MarkdownAnalyzer(text)
    print("*"*100)
    print(text)
    print("\n\n\n")

    # The original immediately overwrote this result with an empty list, which
    # turned the whole loop below into dead code. Fall back to an empty list
    # only when the analyzer reports no block elements.
    md_elements = analyzer.identify_all().get("block_elements") or []
    dates = {"dates": [], "times": []}

    for md_element in md_elements:
        doc = nlp(md_element.text)
        print(doc.ents)
        if not doc.ents:
            # Nothing recognised in this element - nothing to classify.
            continue

        print(md_element.text)

        # Replace TIME and DATE entities with placeholders, mirroring the
        # masking applied to the classifier's training data.
        modified_text = md_element.text
        for ent in doc.ents:
            if ent.label_ == "DATE":
                modified_text = modified_text.replace(ent.text, "[DATE]")
            if ent.label_ == "TIME":
                modified_text = modified_text.replace(ent.text, "[TIME]")

        # Classify the date category; keep the dates only for EVENT_DATE.
        date_entities = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
        if date_entities:
            print("DATES: ", date_entities)
            cats = classifier(modified_text)
            date_category = max(cats, key=cats.get)  # label with the highest score
            print("Date Category: ", date_category)
            if date_category == "EVENT_DATE":
                dates["dates"].extend(date_entities)

        # Classify the time category with the zero-shot model; keep the times
        # only when the top label is "BEGINN".
        time_entities = [ent.text for ent in doc.ents if ent.label_ == "TIME"]
        if time_entities:
            print("ZEITEN: ", time_entities)
            time_category = zero_shot_classifier.classify(
                modified_text,
                CustomMode(
                    labels=["BEGINN", "EINLASS", "ABLAUF"],
                    hypothesis_template="Der Text geht um {} einer Veranstaltung",
                ),
            )[0].label
            print("Time Category: ", time_category)
            if time_category == "BEGINN":
                dates["times"].extend(time_entities)

        print("\n")

    print(dates)
    print("*" * 100)