# event-data-extraction-playground/src/nlp/experimental/textclassification/classy_classifier_time.py
from classy_classification import ClassyClassifier
import pickle
import spacy
from spacy import displacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
from nltk import Tree
from src.nlp.data.test_texts import TEXTS
from src.nlp.playground.textclassification import ZeroShotClassifier, CustomMode
from src.utils.helpers import normalize_data
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer
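
# Few-shot training data: short German snippets labelled either as an event's
# start time (EVENT_TIME) or as its admission / doors-open time (ADMITTANCE_TIME).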
train_data = {
    "EVENT_TIME": [
        "Die Veranstaltung beginnt am 24.12.2025 16:00",
        "Wann: 03.03.2020 16:00",
        "25.04.2025 - 23.05.2020 , 15:00 - 16:00",
        "22.06.2022 18:00",
        "23.12.2030 17:00 - 18:00",
        "12.05.2024 , 19:00 - 21:00",
        "So. 12.08.2024 12:15 - 13:15",
        "**24.12.2025 16:00 **",
        "15.01.2025 18:00",
        "18.01.2025 11:00 - 18:00",
        "**Der Nikolaus kommt am Freitag 17:00 !**",
        "Sa 12.00 + 16:00",
        "So 12:00",
        "Kindertheater Sa + So 15:00",
        "| Beginn | Freitag 16:00 - 20:00",
        "/ Samstag 11:00 - 20:00",
        "/ Sonntag 11:00 - 18:00",
        "19:00",
        "Beginn: 15:00"
    ],
    "ADMITTANCE_TIME": [
        "Einlass ist 15:00",
        "Einlass ist 15:30",
        "Einlass: 15:00",
        "Abendkasse 14:30",
        "Einlass: 14:30",
        "Tageskasse 15:00 im Museum",
        "Einlass beginnt um 16:00",
        "Der Einlass erfolgt um 17:30",
        "Zutritt ab 18:00",
        "Gäste dürfen ab 14:45 eintreten",
        "Türöffnung: 19:00",
        "Einlass erfolgt ab 20:15",
        "Einlass für VIPs: 13:30",
        "Normaler Einlass: 16:45",
        "Die Türen öffnen sich um 12:00",
        "Zugang ab 17:00 möglich",
        "Einlasskontrolle beginnt um 18:30",
        "Einlass erst ab 19:15 gestattet"
    ]
}
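
# Blank German pipeline: no statistical model is needed, only the rule-based
# sentencizer, custom tokenizer and entity ruler that are added below.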
nlp = spacy.blank("de")
nlp.add_pipe("sentencizer")

# 1️⃣ Define the period as suffix & infix so that it is split between digits
suffixes = list(nlp.Defaults.suffixes) + [r"\."]  # add the period as a suffix
infixes = list(nlp.Defaults.infixes) + [r"(?<=\d)\.(?=\d)"] + [r"(?<=\d)\:(?=\d)"]  # split periods and colons between digits

# Compile the regex objects
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = compile_suffix_regex(suffixes)
infix_re = compile_infix_regex(infixes)

# Install the customised tokenizer; keeping the default prefix rules ensures that
# leading punctuation such as "**" or "|" is still split off the date tokens
nlp.tokenizer = Tokenizer(nlp.vocab, prefix_search=prefix_re.search, suffix_search=suffix_re.search, infix_finditer=infix_re.finditer)
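# With these rules a string like "24.12.2025 16:00" tokenizes as
# ['24', '.', '12', '.', '2025', '16', ':', '00'], so the shape-based entity
# patterns below can match the individual date and time parts.
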
# 2️⃣ Add an entity ruler for date and time expressions
ruler = nlp.add_pipe("entity_ruler")
patterns = [
    {
        "label": "DATE",
        "pattern": [
            {"SHAPE": "dd"}, {"ORTH": "."}, {"SHAPE": "dd"}, {"ORTH": "."}, {"SHAPE": "dddd"}
        ]
    },
    {
        "label": "TIME",
        "pattern": [
            {"SHAPE": "dd"}, {"ORTH": ":"}, {"SHAPE": "dd"}
        ]
    }
]
ruler.add_patterns(patterns)
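# Note: the SHAPE-based patterns only match zero-padded values, e.g. "03.03.2020"
# and "16:00" are tagged, while "3.3.2020" or "9:00" would be missed.
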
# Prepare Training Data: Use Placeholders for Times and Dates
classifier_train_data_cleaned = {"EVENT_TIME": [], "ADMITTANCE_TIME": []}

for text in train_data["EVENT_TIME"]:
    text = normalize_data(text)
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "DATE":
            text = text.replace(ent.text, "[DATE]")
        if ent.label_ == "TIME":
            text = text.replace(ent.text, "[TIME]")
    classifier_train_data_cleaned["EVENT_TIME"].append(text)

for text in train_data["ADMITTANCE_TIME"]:
    text = normalize_data(text)
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "DATE":
            text = text.replace(ent.text, "[DATE]")
        if ent.label_ == "TIME":
            text = text.replace(ent.text, "[TIME]")
    classifier_train_data_cleaned["ADMITTANCE_TIME"].append(text)

# remove duplicates
classifier_train_data_cleaned["EVENT_TIME"] = list(set(classifier_train_data_cleaned["EVENT_TIME"]))
classifier_train_data_cleaned["ADMITTANCE_TIME"] = list(set(classifier_train_data_cleaned["ADMITTANCE_TIME"]))
# Few-shot classifier with a multilingual sentence-transformer embedding model
classifier = ClassyClassifier(data=classifier_train_data_cleaned)
classifier.set_embedding_model(model="stsb-xlm-r-multilingual")

with open("../../playground/models/time_classifier.pkl", "wb") as f:
    pickle.dump(classifier, f)
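# The pickled classifier can later be restored for reuse, e.g.
#   with open("../../playground/models/time_classifier.pkl", "rb") as f:
#       classifier = pickle.load(f)
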
# Inline samples for a quick check (this overrides the TEXTS imported above)
TEXTS = ["Einlass: ab 15:00", "Beginn: 18:00", "Konzert: 20:00 bis 21:00", "Start: 20:00, Ende: 21:00"]

for text in TEXTS:
    print(text)
    text = normalize_data(text)
    analyzer = MarkdownAnalyzer(text)
    md_elements = analyzer.identify_all().get("block_elements")
    dates = {"dates": [], "times": []}
    for md_element in md_elements:
        doc = nlp(md_element.text)
        # Inspect the tokenisation
        # print("Tokens:", [token.text for token in doc])
        if doc.ents:
            print("Found Entities: ", doc.ents)
        modified_text = md_element.text
        # Replace TIME and DATE entities with placeholders, mirroring the training data
        for ent in doc.ents:
            if ent.label_ == "DATE":
                modified_text = modified_text.replace(ent.text, "[DATE]")
            if ent.label_ == "TIME":
                modified_text = modified_text.replace(ent.text, "[TIME]")
        # The classifier returns a dict of label -> score; keep the best-scoring label
        cats = classifier(modified_text)
        time_category = max(cats, key=cats.get)
        print(time_category)
    print("*" * 100)