Spaces:

adojode
/

event-data-extraction-playground

Running

File size: 7,038 Bytes

from bs4 import BeautifulSoup, Comment
import re
from dateparser import DateDataParser


def normalize_data(input):
    r"""Normalize German time and date expressions in a piece of text.

    The text is processed in this order:

    1. A backslash between a number and a following dot is dropped
       (``5\.`` becomes ``5.``).
    2. Clock times that end in "Uhr" or "h" are rewritten as ``HH:MM`` or,
       for ranges, ``HH:MM-HH:MM``.
    3. Dates are rewritten as ``DD.MM.YYYY``; shortened pairs such as
       ``20. und 21.03.2025`` are expanded, and the connective between two
       full dates is unified to ``-`` (ranges) or ``+`` (enumerations).
    4. A space is inserted between a digit and a directly following ASCII
       letter.

    Args:
        input: Text to normalize. The name shadows the builtin; it is kept
            so that existing keyword callers continue to work.

    Returns:
        The normalized text. The date rewrites pad their output with spaces,
        so the result may contain runs of whitespace.
    """

    def normalize_dates(input_text):
        """Rewrite dates as DD.MM.YYYY and expand/unify date pairs."""
        months = r"(?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember|Jan|Feb|Mär|Apr|Mai|Jun|Jul|Aug|Sep|Okt|Nov|Dez)"
        # A month name must not run on into further lowercase letters,
        # otherwise "55116 Mainz" would be read as "16 Mai" followed by "nz".
        month_end = r"(?![a-zäöüß])"

        day_month_year_pattern = r"(?:(?:\d{1,2}\.?\s*)" + months + month_end + r"\.?\s*(?:\d{2,4})?)"
        dd_mm_yyyy_pattern = r"(?:\d{1,2}[./-]\d{1,2}[./-](?:\d{4}|\d{2}))"
        iso_pattern = r"(?:\d{2,4}[./-]\d{1,2}[./-]\d{1,2})"
        date_pattern = re.compile(
            day_month_year_pattern + "|" + dd_mm_yyyy_pattern + "|" + iso_pattern
        )

        # Only build the (comparatively expensive) parser when the text
        # actually contains something that looks like a date.
        if date_pattern.search(input_text):
            ddp = DateDataParser(languages=["de"])

            def replace_date(match):
                raw = match.group(0)
                try:
                    date_obj = ddp.get_date_data(raw).date_obj
                except Exception as e:
                    # Best effort: report the problem and keep the original text.
                    print(f"Fehler bei der Verarbeitung von '{raw}': {e}")
                    return raw
                if date_obj is None:
                    # The parser could not interpret the candidate; leave it alone.
                    print(f"Fehler bei der Verarbeitung von '{raw}': kein Datum erkannt")
                    return raw
                # date_obj is a datetime, so a year is always present here
                # (NOTE(review): dateparser appears to fill in a missing year
                # itself - confirm which year it chooses).
                return date_obj.strftime("%d.%m.%Y")

            # Replace every match at its own position. Replacing by string
            # value would also hit look-alike substrings, e.g. "1.3.2025"
            # inside "11.3.2025".
            input_text = date_pattern.sub(replace_date, input_text)

        # '20.03. und 21.03.2025' -> '20.03.2025 und 21.03.2025'
        # Optional clock times (groups 3 and 8) are kept in place.
        missing_year_pattern = r"(?<!\.)(\d{2})\.(\d{2})\.(\s*\d{2}:\d{2})?\s*(und|\+|&|bis|bis zum|-|—|–)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?"
        input_text = re.sub(missing_year_pattern, r" \1.\2.\7 \3 \4 \5.\6.\7 \8 ", input_text)

        # '20. und 21.03.2025' -> '20.03.2025 und 21.03.2025'
        missing_month_pattern = r"(?<!\d)(\d{2})\.(\s*\d{2}:\d{2})?\s*(und|\+|&|bis|bis zum|-|—|–)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?"
        input_text = re.sub(missing_month_pattern, r" \1.\5.\6 \2 \3 \4.\5.\6 \7 ", input_text)

        # '20.03.2025 bis/bis zum 21.03.2025' -> '20.03.2025 - 21.03.2025'
        range_pattern = r"(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?\s*(bis|bis zum|-|—|–)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?"
        input_text = re.sub(range_pattern, r" \1.\2.\3 \4 - \6.\7.\8 \9 ", input_text)

        # '20.03.2025 und/& 21.03.2025' -> '20.03.2025 + 21.03.2025'
        enumeration_pattern = r"(\d{1,2})\.(\d{1,2})\.(\d{4})\.?(\s*\d{2}:\d{2})?\s*(und|\+|&)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?"
        input_text = re.sub(enumeration_pattern, r" \1.\2.\3 \4 + \6.\7.\8 \9 ", input_text)
        return input_text

    def normalize_times(input_text):
        """Rewrite times ending in "Uhr"/"h" as HH:MM or HH:MM-HH:MM."""
        # Verbose-mode pattern: whitespace and '#' comments inside it are
        # ignored by the regex engine.
        time_patterns = r"""
            \b
            (\d{1,2})                # Stunde (z.B. 19 oder 5)
            (?:
                [:.]                 # Trennzeichen (Doppelpunkt oder Punkt)
                (\d{2})             # Minuten (optional)
            )?
            (?:\s*(?:Uhr|h))?        # Optional "Uhr" oder "h", nur bei Minuten oder Kontext
            (?:                      # Optionaler Zeitbereich (z.B. 19-20 oder 19 bis 20)
                \s?(?:-|bis)\s?      # Trennzeichen "-" oder "bis"
                (\d{1,2})            # Endzeit-Stunde
                (?:
                    [:.]             # Trennzeichen (Doppelpunkt oder Punkt)
                    (\d{2})         # Endzeit-Minuten (optional)
                )?
                       # Optional "Uhr" oder "h" bei Endzeit
            )?
            \s*(?:Uhr|h)
            \b
        """

        def format_time(match):
            # Groups: 1 = start hour, 2 = start minutes,
            #         3 = end hour,   4 = end minutes (3 and 4 only for ranges).
            start_hour = int(match.group(1))
            start_minute = int(match.group(2) or 0)
            formatted_start = f"{start_hour:02}:{start_minute:02}"

            if match.group(3):
                end_hour = int(match.group(3))
                end_minute = int(match.group(4) or 0)
                formatted_end = f"{end_hour:02}:{end_minute:02}"
                return f"{formatted_start}-{formatted_end}"
            return formatted_start

        compiled_pattern = re.compile(time_patterns, re.VERBOSE)
        return compiled_pattern.sub(format_time, input_text)

    def normalize_text(input_text):
        """Insert a space between a digit and a directly following ASCII letter."""
        return re.sub(r"(\d)([a-zA-Z])", r"\1 \2", input_text)

    # Drop the backslash in sequences like "5\." (digits, backslash, dot).
    result = re.sub(r"(\d+)\\\.", r"\1.", input)
    # Times first: the date rewrites above look for already normalized
    # "HH:MM" times next to the dates.
    normalized_data = normalize_times(result)
    normalized_data = normalize_dates(normalized_data)
    normalized_data = normalize_text(normalized_data)
    return normalized_data


def clean_html(html: str):
    """Return the prettified ``<body>`` of ``html`` with boilerplate removed.

    Removed are all ``footer``, ``script``, ``nav``, ``menu`` and ``img``
    tags inside the body, the first ``header`` tag of the document and all
    HTML comments inside the body. Blank lines are dropped from the
    prettified output.

    Args:
        html: HTML markup to clean.

    Returns:
        The cleaned markup as a string, or ``None`` (after printing a
        message) when the parsed document has no ``<body>``.
    """
    soup = BeautifulSoup(html, "lxml")
    body = soup.body

    # Guard clause: nothing to clean without a body.
    if not body:
        print("Kein <body>-Tag im HTML gefunden!")
        return None

    for unwanted in body.find_all(["footer", "script", "nav", "menu", "img"]):
        unwanted.decompose()

    # Only the first <header> of the whole document is removed.
    first_header = soup.find("header")
    if first_header:
        first_header.decompose()

    def is_comment(node):
        return isinstance(node, Comment)

    for html_comment in body.find_all(string=is_comment):
        html_comment.extract()

    # prettify() emits one node per line; keep only the non-blank lines.
    return "\n".join(row for row in body.prettify().splitlines() if row.strip())


def strip_html_to_text(html: str):
    """Return the text content of ``html``.

    The markup is parsed with the lxml parser; the text pieces are stripped
    and joined with a single space.
    """
    return BeautifulSoup(html, "lxml").get_text(separator=" ", strip=True)


# texts = [
#     "Die 18. Koblenzer Literaturtage „ganzOhr“ finden vom 22.03. bis 05.04.2025 statt. Das Programm wird im Januar 2025 veröffentlicht, der Vorverkauf startet im Februar.",
#     "15. November 2024 & 13. Dezember 2024: Kunstausstellung 'Der erweiterte Raum'",
#     "Der siebte Workshop Retrodigitalisierung findet am 20. und 21.03.2025 bei ZB MED.",
#     "2. März bis 21. März 2025 \n",
#     "**Wann?** 05.12.2024, 19:00-21:00 **Wo?** Lesesaal im Marstallgebäude, TIB",
#     "22.04.25 15 Uhr bis 23.04.25 16 Uhr."
# ]
#
# expected_texts = [
#     "Die 18. Koblenzer Literaturtage „ganzOhr“ finden vom 22.03.2025 - 05.04.2025 statt. Das Programm wird im Januar 2025 veröffentlicht, der Vorverkauf startet im Februar.",
#     "15.11.2024 + 13.12.2024: Kunstausstellung 'Der erweiterte Raum'",
#     "Der siebte Workshop Retrodigitalisierung findet am 20.03.2025 + 21.03.2025 bei ZB MED.",
#     "02.03.2025 - 21.03.2025 \n",
#     "**Wann?** 05.12.2024, 19:00-21:00 **Wo?** Lesesaal im Marstallgebäude, TIB",
#     "22.04.2025 15:00 - 23.04.2025 16:00."
#     ]
#
# for i, text in enumerate(texts):
#     normalized = normalize_data(text)
#     normalized = re.sub("\s*", " ",normalized)
#     expected = re.sub("\s*", " ",expected_texts[i])
#     if normalized == expected:
#         print("Normalization successful!")
#     else:
#         print("Normalization failed!")