# event-data-extraction-playground/src/nlp/experimental/textclassification/classy_classifier_time.py
from classy_classification import ClassyClassifier
import pickle
import spacy
from spacy import displacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
from nltk import Tree
from src.nlp.data.test_texts import TEXTS
from src.nlp.playground.textclassification import ZeroShotClassifier, CustomMode
from src.utils.helpers import normalize_data
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer
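
# Few-shot training data: short German snippets labelled either as an event's
# start time (EVENT_TIME) or as its admission / doors-open time (ADMITTANCE_TIME).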
train_data = {
    "EVENT_TIME": [
        "Die Veranstaltung beginnt am 24.12.2025 16:00",
        "Wann: 03.03.2020 16:00",
        "25.04.2025 - 23.05.2020 , 15:00 - 16:00",
        "22.06.2022 18:00",
        "23.12.2030 17:00 - 18:00",
        "12.05.2024 , 19:00 - 21:00",
        "So. 12.08.2024 12:15 - 13:15",
        "**24.12.2025 16:00 **",
        "15.01.2025 18:00",
        "18.01.2025 11:00 - 18:00",
        "**Der Nikolaus kommt am Freitag 17:00 !**",
        "Sa 12.00 + 16:00",
        "So 12:00",
        "Kindertheater Sa + So 15:00",
        "| Beginn | Freitag 16:00 - 20:00",
        "/ Samstag 11:00 - 20:00",
        "/ Sonntag 11:00 - 18:00",
        "19:00",
        "Beginn: 15:00"
    ],
    "ADMITTANCE_TIME": [
        "Einlass ist 15:00",
        "Einlass ist 15:30",
        "Einlass: 15:00",
        "Abendkasse 14:30",
        "Einlass: 14:30",
        "Tageskasse 15:00 im Museum",
        "Einlass beginnt um 16:00",
        "Der Einlass erfolgt um 17:30",
        "Zutritt ab 18:00",
        "Gäste dürfen ab 14:45 eintreten",
        "Türöffnung: 19:00",
        "Einlass erfolgt ab 20:15",
        "Einlass für VIPs: 13:30",
        "Normaler Einlass: 16:45",
        "Die Türen öffnen sich um 12:00",
        "Zugang ab 17:00 möglich",
        "Einlasskontrolle beginnt um 18:30",
        "Einlass erst ab 19:15 gestattet"
    ]
}
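
# Blank German pipeline: no statistical model is needed, only the rule-based
# sentencizer, custom tokenizer and entity ruler that are added below.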
nlp = spacy.blank("de")
nlp.add_pipe("sentencizer")

# 1️⃣ Define the period as suffix & infix so that it is split between digits
suffixes = list(nlp.Defaults.suffixes) + [r"\."]  # add the period as a suffix
infixes = list(nlp.Defaults.infixes) + [r"(?<=\d)\.(?=\d)"] + [r"(?<=\d)\:(?=\d)"]  # split periods and colons between digits

# Compile the regex objects
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = compile_suffix_regex(suffixes)
infix_re = compile_infix_regex(infixes)

# Install the customised tokenizer; keeping the default prefix rules ensures that
# leading punctuation such as "**" or "|" is still split off the date tokens
nlp.tokenizer = Tokenizer(nlp.vocab, prefix_search=prefix_re.search, suffix_search=suffix_re.search, infix_finditer=infix_re.finditer)
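# With these rules a string like "24.12.2025 16:00" tokenizes as
# ['24', '.', '12', '.', '2025', '16', ':', '00'], so the shape-based entity
# patterns below can match the individual date and time parts.
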
# 2️⃣ Add an entity ruler for date and time expressions
ruler = nlp.add_pipe("entity_ruler")
patterns = [
    {
        "label": "DATE",
        "pattern": [
            {"SHAPE": "dd"}, {"ORTH": "."}, {"SHAPE": "dd"}, {"ORTH": "."}, {"SHAPE": "dddd"}
        ]
    },
    {
        "label": "TIME",
        "pattern": [
            {"SHAPE": "dd"}, {"ORTH": ":"}, {"SHAPE": "dd"}
        ]
    }
]
ruler.add_patterns(patterns)
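# Note: the SHAPE-based patterns only match zero-padded values, e.g. "03.03.2020"
# and "16:00" are tagged, while "3.3.2020" or "9:00" would be missed.
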
# Prepare Training Data: Use Placeholders for Times and Dates
classifier_train_data_cleaned = {"EVENT_TIME": [], "ADMITTANCE_TIME": []}

for text in train_data["EVENT_TIME"]:
    text = normalize_data(text)
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "DATE":
            text = text.replace(ent.text, "[DATE]")
        if ent.label_ == "TIME":
            text = text.replace(ent.text, "[TIME]")
    classifier_train_data_cleaned["EVENT_TIME"].append(text)

for text in train_data["ADMITTANCE_TIME"]:
    text = normalize_data(text)
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "DATE":
            text = text.replace(ent.text, "[DATE]")
        if ent.label_ == "TIME":
            text = text.replace(ent.text, "[TIME]")
    classifier_train_data_cleaned["ADMITTANCE_TIME"].append(text)

# remove duplicates
classifier_train_data_cleaned["EVENT_TIME"] = list(set(classifier_train_data_cleaned["EVENT_TIME"]))
classifier_train_data_cleaned["ADMITTANCE_TIME"] = list(set(classifier_train_data_cleaned["ADMITTANCE_TIME"]))
# Few-shot classifier with a multilingual sentence-transformer embedding model
classifier = ClassyClassifier(data=classifier_train_data_cleaned)
classifier.set_embedding_model(model="stsb-xlm-r-multilingual")

with open("../../playground/models/time_classifier.pkl", "wb") as f:
    pickle.dump(classifier, f)
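# The pickled classifier can later be restored for reuse, e.g.
#   with open("../../playground/models/time_classifier.pkl", "rb") as f:
#       classifier = pickle.load(f)
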
# Inline samples for a quick check (this overrides the TEXTS imported above)
TEXTS = ["Einlass: ab 15:00", "Beginn: 18:00", "Konzert: 20:00 bis 21:00", "Start: 20:00, Ende: 21:00"]

for text in TEXTS:
    print(text)
    text = normalize_data(text)
    analyzer = MarkdownAnalyzer(text)
    md_elements = analyzer.identify_all().get("block_elements")
    dates = {"dates": [], "times": []}
    for md_element in md_elements:
        doc = nlp(md_element.text)
        # Inspect the tokenisation
        # print("Tokens:", [token.text for token in doc])
        if doc.ents:
            print("Found Entities: ", doc.ents)
        modified_text = md_element.text
        # Replace TIME and DATE entities with placeholders, mirroring the training data
        for ent in doc.ents:
            if ent.label_ == "DATE":
                modified_text = modified_text.replace(ent.text, "[DATE]")
            if ent.label_ == "TIME":
                modified_text = modified_text.replace(ent.text, "[TIME]")
        # The classifier returns a dict of label -> score; keep the best-scoring label
        cats = classifier(modified_text)
        time_category = max(cats, key=cats.get)
        print(time_category)
    print("*" * 100)