Spaces:

adojode
/

event-data-extraction-playground

Running

File size: 5,201 Bytes

da88570

# import dateutil
# from dateparser import DateDataParser
# import re
#
# def normalize_data(input):
#     def normalize_dates(input_text):
#         # RegEx für deutsche Datumsformate
#         date_pattern = r"""
#             (?:
#               (?:\d{1,2}(?:\.|\\\.)?\s*)  # Tag (z. B. 15, 15. oder 15\.)
#               (?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember|
#                  Jan|Feb|Mär|Apr|Mai|Jun|Jul|Aug|Sep|Okt|Nov|Dez)\.?  # Monatsname (mit optionalem Punkt)
#               \s*  # optionaler Leerraum
#               (?:\d{2,4})?  # Jahr (optional, z. B. 2025 oder 20)
#             )|
#             (?:  # Alternative für numerische Formate
#               \d{1,2}(?:\.|\\\.|/|-)\d{1,2}(?:\.|\\\.|/|-)(?:\d{4}|\d{2})  # DD.MM.YYYY, DD-MM-YYYY, DD/MM/YY
#             )|
#             (?:  # ISO-Format (YYYY-MM-DD oder YY-MM-DD)
#               \d{2,4}(?:\.|\\\.|/|-)\d{1,2}(?:\.|\\\.|/|-)\d{1,2}
#             )
#         """
#
#         # Kompiliere das Muster mit re.VERBOSE
#         compiled_pattern = re.compile(date_pattern, re.VERBOSE)
#
#         # Finde alle Datumsangaben
#         matches = compiled_pattern.findall(input_text)
#
#         # Instanziiere DateDataParser
#         ddp = DateDataParser()
#
#         # Ersetze die gefundenen Datumsangaben mit dem normalisierten Format
#         for match in matches:
#             try:
#                 # Parse das Datum
#                 date_data = ddp.get_date_data(match)
#                 if date_data.date_obj.year is None:
#                     normalized_date = date_data.date_obj.strftime("%d.%m")  # Ohne Jahr
#                 else:
#                     normalized_date = date_data.date_obj.strftime("%d.%m.%Y")  # Mit Jahr
#                 # Ersetze das Originaldatum durch das normalisierte Datum
#                 input_text = input_text.replace(match, normalized_date)
#             except Exception as e:
#                 print(f"Fehler bei der Verarbeitung von '{match}': {e}")
#
#         # Regex für '20. und 21.03.2025' oder ähnliche Formate
#         date_pattern = r"(\d{1,2})\.?\s*und\s*(\d{1,2})\.(\d{1,2})\.(\d{4})"
#
#         # Ersetze alle Vorkommen von '20. und 21.03.2025' durch '20.03.2025 und 21.03.2025'
#         input_text = re.sub(date_pattern, r"\1.\3.\4 und \2.\3.\4", input_text)
#
#         # Regex für Datumsbereiche wie '20. bis 21.03.2025' oder '20. - 21.03.2025'
#         date_pattern = r"(\d{1,2})\.\s*(bis|-)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})"
#
#         # Ersetze alle Vorkommen von '20. bis 21.03.2025' durch '20.03.2025 - 21.03.2025'
#         input_text = re.sub(date_pattern, r"\1.\4.\5 - \3.\4.\5", input_text)
#
#         # Ersetze '20.02.2024 bis (zum) 22.02.2024' durch '20.02.2024 - 22.02.2024'
#         date_pattern = r"(\d{2})\.(\d{2})\.(\d{4})\s*(bis|-)\s*(zum)?\s*(\d{2})\.(\d{2})\.(\d{4})"
#         input_text = re.sub(date_pattern, r"\1.\2.\3 - \6.\7.\8", input_text)
#
#         return input_text
#
#     def normalize_times(input_text):
#         # Regex für Zeitangaben
#         time_patterns = r"""
#             \b
#             (\d{1,2})                # Stunde (z.B. 19 oder 5)
#             (?:
#                 [:.]                 # Trennzeichen (Doppelpunkt oder Punkt)
#                 (\d{2})             # Minuten (optional)
#             )?
#             (?:\s*(?:Uhr|h))?        # Optional "Uhr" oder "h", nur bei Minuten oder Kontext
#             (?:                      # Optionaler Zeitbereich (z.B. 19-20 oder 19 bis 20)
#                 \s?(?:-|bis)\s?      # Trennzeichen "-" oder "bis"
#                 (\d{1,2})            # Endzeit-Stunde
#                 (?:
#                     [:.]             # Trennzeichen (Doppelpunkt oder Punkt)
#                     (\d{2})         # Endzeit-Minuten (optional)
#                 )?
#                        # Optional "Uhr" oder "h" bei Endzeit
#             )?
#             \s*(?:Uhr|h)
#             \b
#         """
#
#         # Funktion, um Matches zu parsen und zu ersetzen
#         def format_time(match):
#             # Startzeit
#             start_hour = int(match.group(1))
#             start_minute = int(match.group(2) or 0)
#             formatted_start = f"{start_hour:02}:{start_minute:02}"
#
#             # Endzeit
#             if match.group(3):
#                 end_hour = int(match.group(3))
#                 end_minute = int(match.group(4) or 0)
#                 formatted_end = f"{end_hour:02}:{end_minute:02}"
#                 return f"{formatted_start}-{formatted_end}"  # Zeitbereich
#             return formatted_start  # Nur Startzeit
#
#         # Kompiliere Regex mit re.VERBOSE
#         compiled_pattern = re.compile(time_patterns, re.VERBOSE)
#
#         # Ersetze alle gefundenen Zeitangaben im Text
#         replaced_text = compiled_pattern.sub(format_time, input_text)
#         return replaced_text
#
#     def normalize_text(input_text):
#         return re.sub(r"(\d)([a-zA-Z])", r"\1 \2", input_text)
#
#     normalized_data = normalize_times(input)
#     normalized_data = normalize_dates(normalized_data)
#     normalized_data = normalize_text(normalized_data)
#     return normalized_data
#
#
#