|
from bs4 import BeautifulSoup, Comment |
|
import re |
|
from dateparser import DateDataParser |
|
|
|
|
|
def normalize_data(input):
    """Normalize German date and time expressions in free-form text.

    Runs three passes over *input*:
      1. ``normalize_times`` rewrites time expressions ending in "Uhr"/"h"
         (e.g. "19 Uhr", "19.30 - 21 Uhr") into HH:MM / HH:MM-HH:MM form.
      2. ``normalize_dates`` rewrites German date expressions
         ("3. Oktober 2024", "03.10.24", date ranges, date enumerations)
         into DD.MM.YYYY form with "-" / "+" as range/enumeration separators.
      3. ``normalize_text`` inserts a space between a digit and a letter
         that were fused together (e.g. "19Uhr" -> "19 Uhr").

    Args:
        input: Raw text (e.g. scraped page content) to normalize.

    Returns:
        The text with times, dates, and digit/letter runs normalized.
    """

    def normalize_dates(input_text):
        """Find German date expressions and rewrite them as DD.MM.YYYY."""
        # German month names, long and abbreviated forms.
        months = r"(?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember|Jan|Feb|Mär|Apr|Mai|Jun|Jul|Aug|Sep|Okt|Nov|Dez)"

        # "3. Oktober 2024" / "3. Okt" style dates (year optional).
        day_month_year_pattern = r"(?:(?:\d{1,2}\.?\s*)" + months + r"\.?\s*(?:\d{2,4})?)"
        # Numeric dates such as 03.10.2024, 3/10/24, 3-10-2024.
        dd_mm_yyyy_pattern = r"(?:\d{1,2}[./-]\d{1,2}[./-](?:\d{4}|\d{2}))"
        # Year-first order, e.g. 2024-10-03.
        iso_pattern = r"(?:\d{2,4}[./-]\d{1,2}[./-]\d{1,2})"
        german_date_pattern = day_month_year_pattern + "|" + dd_mm_yyyy_pattern + "|" + iso_pattern

        compiled_pattern = re.compile(german_date_pattern, re.VERBOSE)

        # All non-capturing groups, so findall() yields full match strings.
        matches = compiled_pattern.findall(input_text)

        # dateparser resolves each matched snippet to a datetime object.
        ddp = DateDataParser(languages=["de"])

        for match in matches:
            try:
                date_data = ddp.get_date_data(match)
                if date_data.date_obj is None:
                    # dateparser could not interpret this candidate; skip it.
                    # (Previously this raised AttributeError on ``.year`` and
                    # was swallowed by the broad except below.)
                    continue
                # datetime.year is always an int (dateparser fills a missing
                # year with the current year), so one output format suffices.
                normalized_date = date_data.date_obj.strftime("%d.%m.%Y")
                input_text = input_text.replace(match, normalized_date)
            except Exception as e:
                print(f"Fehler bei der Verarbeitung von '{match}': {e}")

        # Range "DD.MM. <sep> DD.MM.YYYY": copy the year of the second date
        # onto the year-less first date.
        # NOTE(review): the optional time of the first date (group 3) is
        # dropped by the replacement — presumably intentional; confirm.
        german_date_pattern = r"(?<!\.)(\d{2})\.(\d{2})\.(\s*\d{2}:\d{2})?\s*(und|\+|&|bis|bis zum|-|—|–)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?"
        input_text = re.sub(german_date_pattern, r" \1.\2.\7 \4 \5.\6.\7 ", input_text)

        # Range "DD. <sep> DD.MM.YYYY": copy month and year of the second
        # date onto the day-only first date.
        german_date_pattern = r"(?<!\d)(\d{2})\.(\s*\d{2}:\d{2})?\s*(und|\+|&|bis|bis zum|-|—|–)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?"
        input_text = re.sub(german_date_pattern, r" \1.\5.\6 \2 \3 \4.\5.\6 \7 ", input_text)

        # Full range "DD.MM.YYYY <bis/–/...> DD.MM.YYYY": normalize the
        # separator to "-".
        german_date_pattern = r"(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?\s*(bis|bis zum|-|—|–)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?"
        input_text = re.sub(german_date_pattern, r" \1.\2.\3 \4 - \6.\7.\8 \9 ", input_text)

        # Enumeration "DD.MM.YYYY <und/+/&> DD.MM.YYYY": normalize the
        # separator to "+".
        german_date_pattern = r"(\d{1,2})\.(\d{1,2})\.(\d{4})\.?(\s*\d{2}:\d{2})?\s*(und|\+|&)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?"
        input_text = re.sub(german_date_pattern, r" \1.\2.\3 \4 + \6.\7.\8 \9 ", input_text)
        return input_text

    def normalize_times(input_text):
        """Rewrite time expressions ending in "Uhr"/"h" as HH:MM(-HH:MM)."""
        # Verbose regex: groups 1/2 = start hour/minute, 3/4 = optional end
        # hour/minute of a range. A trailing "Uhr" or "h" is required.
        time_patterns = r"""
        \b
        (\d{1,2}) # Stunde (z.B. 19 oder 5)
        (?:
        [:.] # Trennzeichen (Doppelpunkt oder Punkt)
        (\d{2}) # Minuten (optional)
        )?
        (?:\s*(?:Uhr|h))? # Optional "Uhr" oder "h", nur bei Minuten oder Kontext
        (?: # Optionaler Zeitbereich (z.B. 19-20 oder 19 bis 20)
        \s?(?:-|bis)\s? # Trennzeichen "-" oder "bis"
        (\d{1,2}) # Endzeit-Stunde
        (?:
        [:.] # Trennzeichen (Doppelpunkt oder Punkt)
        (\d{2}) # Endzeit-Minuten (optional)
        )?
        # Optional "Uhr" oder "h" bei Endzeit
        )?
        \s*(?:Uhr|h)
        \b
        """

        def format_time(match):
            """Format one regex match as 'HH:MM' or 'HH:MM-HH:MM'."""
            start_hour = int(match.group(1))
            # Missing minutes default to :00.
            start_minute = int(match.group(2) or 0)
            formatted_start = f"{start_hour:02}:{start_minute:02}"

            if match.group(3):
                # A range end was matched; emit "start-end".
                end_hour = int(match.group(3))
                end_minute = int(match.group(4) or 0)
                formatted_end = f"{end_hour:02}:{end_minute:02}"
                return f"{formatted_start}-{formatted_end}"
            return formatted_start

        compiled_pattern = re.compile(time_patterns, re.VERBOSE)

        replaced_text = compiled_pattern.sub(format_time, input_text)
        return replaced_text

    def normalize_text(input_text):
        """Insert a space between a digit immediately followed by a letter."""
        return re.sub(r"(\d)([a-zA-Z])", r"\1 \2", input_text)

    # Unescape markdown-style escaped ordinals ("1\." -> "1.") first so the
    # date/time patterns can match them.
    result = re.sub(r"(\d+)\\\.", r"\1.", input)
    normalized_data = normalize_times(result)
    normalized_data = normalize_dates(normalized_data)
    normalized_data = normalize_text(normalized_data)
    return normalized_data
|
|
|
|
|
def clean_html(html: str):
    """Reduce an HTML document to its cleaned ``<body>`` markup.

    Removes boilerplate elements (footer, script, nav, menu, img, header)
    and HTML comments from the body, then returns the prettified markup
    with blank lines stripped.

    Args:
        html: Raw HTML markup.

    Returns:
        The cleaned, prettified body markup as a string, or ``None`` if the
        document has no ``<body>`` tag.
    """
    soup = BeautifulSoup(html, "lxml")
    body_content = soup.body

    # Guard clause: nothing to clean without a body.
    if not body_content:
        print("Kein <body>-Tag im HTML gefunden!")
        return None

    # Drop non-content elements wholesale. "header" is handled here too, so
    # that every <header> inside the body is removed (the previous
    # soup.find("header") removed only the first one in the whole document).
    for tag in body_content.find_all(["footer", "script", "nav", "menu", "img", "header"]):
        tag.decompose()

    # Strip HTML comments (<!-- ... -->).
    for comment in body_content.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    cleaned_html = body_content.prettify()
    # prettify() emits blank lines; drop them for compact output.
    clean_html_lines = [line for line in cleaned_html.splitlines() if line.strip()]
    cleaned_html = "\n".join(clean_html_lines)
    return cleaned_html
|
|
|
|
|
def strip_html_to_text(html: str):
    """Return the plain-text content of *html*.

    Tag text nodes are joined with single spaces and surrounding
    whitespace is trimmed from each node.
    """
    parsed = BeautifulSoup(html, "lxml")
    text = parsed.get_text(separator=' ', strip=True)
    return text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|