# manaviel85370
# extract dates and times
# 83f1514
from classy_classification import ClassyClassifier
import pickle
import spacy
from spacy import displacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
from nltk import Tree
from src.nlp.data.test_texts import TEXTS
from src.nlp.playground.textclassification import ZeroShotClassifier, CustomMode
from src.utils.helpers import normalize_data
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer
# Example phrases for the two labels the time classifier distinguishes.
# Dates and times in these strings are replaced by placeholders further below
# before the examples are handed to the classifier.

# Phrases that state when the event itself takes place.
_EVENT_TIME_EXAMPLES = [
    "Die Veranstaltung beginnt am 24.12.2025 16:00",
    "Wann: 03.03.2020 16:00",
    "25.04.2025 - 23.05.2020 , 15:00 - 16:00",
    "22.06.2022 18:00",
    "23.12.2030 17:00 - 18:00",
    "12.05.2024 , 19:00 - 21:00",
    "So. 12.08.2024 12:15 - 13:15",
    "**24.12.2025 16:00 **",
    "15.01.2025 18:00",
    "18.01.2025 11:00 - 18:00",
    "**Der Nikolaus kommt am Freitag 17:00 !**",
    "Sa 12.00 + 16:00",
    "So 12:00",
    "Kindertheater Sa + So 15:00",
    "| Beginn | Freitag 16:00 - 20:00",
    "/ Samstag 11:00 - 20:00",
    "/ Sonntag 11:00 - 18:00",
    "19:00",
    "Beginn: 15:00",
]

# Phrases that state when admission ("Einlass") starts.
_ADMITTANCE_TIME_EXAMPLES = [
    "Einlass ist 15:00",
    "Einlass ist 15:30",
    "Einlass: 15:00",
    "Abendkasse 14:30",
    "Einlass: 14:30",
    "Tageskasse 15:00 im Museum",
    "Einlass beginnt um 16:00",
    "Der Einlass erfolgt um 17:30",
    "Zutritt ab 18:00",
    "Gäste dürfen ab 14:45 eintreten",
    "Türöffnung: 19:00",
    "Einlass erfolgt ab 20:15",
    "Einlass für VIPs: 13:30",
    "Normaler Einlass: 16:45",
    "Die Türen öffnen sich um 12:00",
    "Zugang ab 17:00 möglich",
    "Einlasskontrolle beginnt um 18:30",
    "Einlass erst ab 19:15 gestattet",
]

# Label -> list of raw example strings.
train_data = {
    "EVENT_TIME": _EVENT_TIME_EXAMPLES,
    "ADMITTANCE_TIME": _ADMITTANCE_TIME_EXAMPLES,
}
# Blank German pipeline: only the components added below are run.
nlp = spacy.blank("de")
nlp.add_pipe("sentencizer")

# 1) Tokenizer rules: split "." and ":" off as their own tokens so the digit
#    groups of a date (24.12.2025) or a time (16:00) become separate tokens.
prefixes = list(nlp.Defaults.prefixes)
suffixes = list(nlp.Defaults.suffixes) + [r"\."]  # also split a trailing period
infixes = list(nlp.Defaults.infixes) + [
    r"(?<=\d)\.(?=\d)",  # period between two digits
    r"(?<=\d)\:(?=\d)",  # colon between two digits
]

# Compile the rule lists into regex objects.
prefix_re = compile_prefix_regex(prefixes)
suffix_re = compile_suffix_regex(suffixes)
infix_re = compile_infix_regex(infixes)

# Install the customised tokenizer. A hand-built Tokenizer applies only the
# rules it is given, so the default prefixes must be passed explicitly.
# Without them leading punctuation is never split off: "**24.12.2025" would
# start with the token "**24", which has no "dd" shape, and the DATE pattern
# below could not match.
nlp.tokenizer = Tokenizer(
    nlp.vocab,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
)

# 2) Entity ruler that labels dates and times on the token sequence above.
ruler = nlp.add_pipe("entity_ruler")
patterns = [
    {
        # dd.dd.dddd, e.g. 24.12.2025
        "label": "DATE",
        "pattern": [
            {"SHAPE": "dd"}, {"ORTH": "."}, {"SHAPE": "dd"}, {"ORTH": "."}, {"SHAPE": "dddd"}
        ],
    },
    {
        # dd:dd, e.g. 16:00
        "label": "TIME",
        "pattern": [
            {"SHAPE": "dd"}, {"ORTH": ":"}, {"SHAPE": "dd"}
        ],
    },
]
ruler.add_patterns(patterns)
# Placeholder that replaces each recognised entity, keyed by entity label.
_PLACEHOLDERS = {"DATE": "[DATE]", "TIME": "[TIME]"}


def _mask_dates_and_times(text):
    """Return ``text`` with DATE/TIME entities replaced by their placeholders.

    The text is run through ``nlp``; for every entity labelled DATE or TIME,
    all occurrences of the entity's surface string in ``text`` are replaced.
    Entities with any other label are left untouched.
    """
    doc = nlp(text)
    for ent in doc.ents:
        placeholder = _PLACEHOLDERS.get(ent.label_)
        if placeholder is not None:
            text = text.replace(ent.text, placeholder)
    return text


# Prepare training data: normalise each example, mask its dates and times,
# and drop duplicates that the masking produces.
# dict.fromkeys keeps the first occurrence of each string in input order, so
# the example order is the same on every run (a set would not guarantee that).
classifier_train_data_cleaned = {
    label: list(
        dict.fromkeys(
            _mask_dates_and_times(normalize_data(example)) for example in examples
        )
    )
    for label, examples in train_data.items()
}
# Build the classifier from the masked examples, then select the embedding
# model it should use.
classifier = ClassyClassifier(data=classifier_train_data_cleaned)
classifier.set_embedding_model(model="stsb-xlm-r-multilingual")

# Persist the classifier. The path is relative, so it is resolved against
# the current working directory of the running process.
with open("../../playground/models/time_classifier.pkl", "wb") as model_file:
    model_file.write(pickle.dumps(classifier))
# Quick manual check of the classifier on a few hand-written inputs.
# NOTE: this rebinds TEXTS and therefore hides the TEXTS imported at the top of the file.
TEXTS = ["Einlass: ab 15:00", "Beginn: 18:00", "Konzert: 20:00 bis 21:00", "Start: 20:00, Ende: 21:00"]

# Placeholder that replaces each recognised entity, keyed by entity label.
placeholder_by_label = {"DATE": "[DATE]", "TIME": "[TIME]"}

for text in TEXTS:
    print(text)
    text = normalize_data(text)
    analyzer = MarkdownAnalyzer(text)
    # .get() returns None when the key is missing; fall back to an empty list
    # so the loop below is skipped instead of raising TypeError.
    md_elements = analyzer.identify_all().get("block_elements") or []
    for md_element in md_elements:
        doc = nlp(md_element.text)
        if not doc.ents:
            # No date or time found in this element: nothing to classify.
            continue
        print("Found Entities: ", doc.ents)
        modified_text = md_element.text
        # Replace TIME and DATE entities with placeholders, as was done for
        # the training examples.
        for ent in doc.ents:
            placeholder = placeholder_by_label.get(ent.label_)
            if placeholder is not None:
                modified_text = modified_text.replace(ent.text, placeholder)
        # The classifier returns a mapping; print the key with the highest value.
        cats = classifier(modified_text)
        time_category = max(cats, key=cats.get)
        print(time_category)
    print("*"*100)