Spaces:

adojode
/

event-data-extraction-playground

Running

event-data-extraction-playground / src /nlp /preprocessing.py

manaviel85370

add pages and all

da88570 2 months ago

5.2 kB

	# import dateutil
	# from dateparser import DateDataParser
	# import re
	#
	# def normalize_data(input):
	# def normalize_dates(input_text):
	# # RegEx für deutsche Datumsformate
	# date_pattern = r"""
	# (?:
	# (?:\d{1,2}(?:\.|\\\.)?\s*) # Tag (z. B. 15, 15. oder 15\.)
	# (?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember|
	# Jan|Feb|Mär|Apr|Mai|Jun|Jul|Aug|Sep|Okt|Nov|Dez)\.? # Monatsname (mit optionalem Punkt)
	# \s* # optionaler Leerraum
	# (?:\d{2,4})? # Jahr (optional, z. B. 2025 oder 20)
	# )|
	# (?: # Alternative für numerische Formate
	# \d{1,2}(?:\.|\\\.|/|-)\d{1,2}(?:\.|\\\.|/|-)(?:\d{4}|\d{2}) # DD.MM.YYYY, DD-MM-YYYY, DD/MM/YY
	# )|
	# (?: # ISO-Format (YYYY-MM-DD oder YY-MM-DD)
	# \d{2,4}(?:\.|\\\.|/|-)\d{1,2}(?:\.|\\\.|/|-)\d{1,2}
	# )
	# """
	#
	# # Kompiliere das Muster mit re.VERBOSE
	# compiled_pattern = re.compile(date_pattern, re.VERBOSE)
	#
	# # Finde alle Datumsangaben
	# matches = compiled_pattern.findall(input_text)
	#
	# # Instanziiere DateDataParser
	# ddp = DateDataParser()
	#
	# # Ersetze die gefundenen Datumsangaben mit dem normalisierten Format
	# for match in matches:
	# try:
	# # Parse das Datum
	# date_data = ddp.get_date_data(match)
	# if date_data.date_obj.year is None:
	# normalized_date = date_data.date_obj.strftime("%d.%m") # Ohne Jahr
	# else:
	# normalized_date = date_data.date_obj.strftime("%d.%m.%Y") # Mit Jahr
	# # Ersetze das Originaldatum durch das normalisierte Datum
	# input_text = input_text.replace(match, normalized_date)
	# except Exception as e:
	# print(f"Fehler bei der Verarbeitung von '{match}': {e}")
	#
	# # Regex für '20. und 21.03.2025' oder ähnliche Formate
	# date_pattern = r"(\d{1,2})\.?\sund\s(\d{1,2})\.(\d{1,2})\.(\d{4})"
	#
	# # Ersetze alle Vorkommen von '20. und 21.03.2025' durch '20.03.2025 und 21.03.2025'
	# input_text = re.sub(date_pattern, r"\1.\3.\4 und \2.\3.\4", input_text)
	#
	# # Regex für '20. bis 21.03.2025' bzw. '20. - 21.03.2025' oder ähnliche Formate
	# date_pattern = r"(\d{1,2})\.\s(bis|-)\s(\d{1,2})\.(\d{1,2})\.(\d{4})"
	#
	# # Ersetze alle Vorkommen von '20. bis 21.03.2025' durch '20.03.2025 - 21.03.2025'
	# input_text = re.sub(date_pattern, r"\1.\4.\5 - \3.\4.\5", input_text)
	#
	# # Ersetze 20.02.2024 bis (zum) 22.02.2024 mit 20.02.2024 - 22.02.2024
	# date_pattern = r"(\d{2})\.(\d{2})\.(\d{4})\s(bis|-)\s(zum)?\s*(\d{2})\.(\d{2})\.(\d{4})"
	# input_text = re.sub(date_pattern, r"\1.\2.\3 - \6.\7.\8", input_text)
	#
	# return input_text
	#
	# def normalize_times(input_text):
	# # Regex für Zeitangaben
	# time_patterns = r"""
	# \b
	# (\d{1,2}) # Stunde (z.B. 19 oder 5)
	# (?:
	# [:.] # Trennzeichen (Doppelpunkt oder Punkt)
	# (\d{2}) # Minuten (optional)
	# )?
	# (?:\s*(?:Uhr|h))? # Optional "Uhr" oder "h", nur bei Minuten oder Kontext
	# (?: # Optionaler Zeitbereich (z.B. 19-20 oder 19 bis 20)
	# \s?(?:-|bis)\s? # Trennzeichen "-" oder "bis"
	# (\d{1,2}) # Endzeit-Stunde
	# (?:
	# [:.] # Trennzeichen (Doppelpunkt oder Punkt)
	# (\d{2}) # Endzeit-Minuten (optional)
	# )?
	# # Optional "Uhr" oder "h" bei Endzeit
	# )?
	# \s*(?:Uhr|h)
	# \b
	# """
	#
	# # Funktion, um Matches zu parsen und zu ersetzen
	# def format_time(match):
	# # Startzeit
	# start_hour = int(match.group(1))
	# start_minute = int(match.group(2) or 0)
	# formatted_start = f"{start_hour:02}:{start_minute:02}"
	#
	# # Endzeit
	# if match.group(3):
	# end_hour = int(match.group(3))
	# end_minute = int(match.group(4) or 0)
	# formatted_end = f"{end_hour:02}:{end_minute:02}"
	# return f"{formatted_start}-{formatted_end}" # Zeitbereich
	# return formatted_start # Nur Startzeit
	#
	# # Kompiliere Regex mit re.VERBOSE
	# compiled_pattern = re.compile(time_patterns, re.VERBOSE)
	#
	# # Ersetze alle gefundenen Zeitangaben im Text
	# replaced_text = compiled_pattern.sub(format_time, input_text)
	# return replaced_text
	#
	# def normalize_text(input_text):
	# return re.sub(r"(\d)([a-zA-Z])", r"\1 \2", input_text)
	#
	# normalized_data = normalize_times(input)
	# normalized_data = normalize_dates(normalized_data)
	# normalized_data = normalize_text(normalized_data)
	# return normalized_data
	#
	#
	#