|
import re |
|
from pymongo import MongoClient |
|
import html2text |
|
from bs4 import BeautifulSoup, Comment |
|
|
|
from src.nlp.playground.textclassification import classify_title |
|
from src.nlp.playground.pipelines.title_extractor import extract_title |
|
|
|
|
|
def get_clean_html(page_content: str):
    """Strip boilerplate from an HTML page and return the pretty-printed <body>.

    Removes footer/script/nav/menu/img tags, the page <header>, HTML comments
    and all tag attributes, and flattens every h1-h6 headline onto one line.

    Args:
        page_content: Raw HTML of the page.

    Returns:
        The cleaned body HTML as a string with blank lines removed, or
        ``None`` when the page has no <body> tag.
    """
    soup = BeautifulSoup(page_content, "lxml")
    body_content = soup.body

    if not body_content:
        print("Kein <body>-Tag im HTML gefunden!")
        return None

    # Drop structural noise that never carries event content.
    for tag in body_content.find_all(["footer", "script", "nav", "menu", "img"]):
        tag.decompose()
    header = soup.find("header")
    if header:
        header.decompose()

    # Strip HTML comments.
    for comment in body_content.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Flatten each headline's text onto a single line. The original code wrote
    # the result to headline["text"], i.e. an HTML *attribute*, which the
    # attribute-clearing loop below immediately wiped — a no-op. Assigning to
    # .string replaces the headline's children with the normalized text.
    for headline in soup.find_all(re.compile(r"h[1-6]")):
        headline.string = headline.get_text(strip=True).replace("\n", " ")

    # Clear all attributes. Assigning an empty dict is the safe form;
    # `del tag.attrs` removes the attrs mapping entirely, which can break
    # later serialization (prettify() reads tag.attrs).
    for tag in soup.find_all(True):
        tag.attrs = {}

    cleaned_html = body_content.prettify()
    # prettify() emits blank lines; drop them.
    clean_html_lines = [line for line in cleaned_html.splitlines() if line.strip()]
    return "\n".join(clean_html_lines)
|
|
|
def extract_title_from_html(html):
    """Extract all headings (h1-h6) from the HTML and classify them.

    Selection rule for the title:
      - a single heading: its text is used directly,
      - multiple headings: the text of the structurally most prominent one
        (lowest level number, h1 beats h2; first of ties wins),
      - an empty result falls back to the first heading's text.

    Returns:
        The chosen title string, or ``None`` when the page has no headings
        or classification raised an exception.
    """
    soup = BeautifulSoup(html, "lxml")
    headers = soup.find_all(re.compile(r"h[1-6]"))
    title = None
    try:
        if headers:
            header_labels = []
            for header in headers:
                header_text = header.get_text(strip=True)
                # classify_title is assumed to return a list whose first entry
                # carries a "label" key — TODO confirm against its definition.
                header_class = classify_title(header_text)
                header_level = int(header.name[1])  # "h3" -> 3
                header_labels.append(
                    {
                        "text": header_text,
                        "label": header_class[0]["label"],
                        "level": header_level,
                    }
                )

            if len(header_labels) == 1:
                # The original branch re-assigned the identical text when the
                # label was "Titel"; that duplicate assignment was redundant
                # and has been removed (behavior unchanged).
                title = header_labels[0]["text"]
            else:
                # Prefer the heading with the lowest level number; min() keeps
                # the first of ties, matching the original strict-< scan.
                title = min(header_labels, key=lambda h: h["level"])["text"]

            if title == "":
                title = header_labels[0]["text"]
    except Exception as e:
        # NOTE(review): broad catch keeps a batch run alive on odd pages;
        # consider narrowing once classify_title's failure modes are known.
        print(f"Fehler: {e}")
    return title
|
|
|
|
|
# SECURITY: the connection string embeds credentials directly in source code
# ("db_password" looks like an unfilled placeholder). Load the password from
# an environment variable or a config file before using this in production.
# (The stray f-prefix was removed — the string contains no placeholders.)
uri = "mongodb+srv://event_data_extraction_application:[email protected]/?retryWrites=true&w=majority&appName=Cluster0"

client = MongoClient(uri)
db = client.event_data

unsorted_urls = db.unsorted_urls
event_urls = db.event_urls

# NOTE: nesting double quotes inside a double-quoted f-string is a SyntaxError
# before Python 3.12, so the inner dict literal uses single quotes.
print(f"{event_urls.count_documents({'class': 'EventDetail'})} EventDetail Dokumente")

# Walk every EventDetail page, clean its HTML, convert to markdown and
# predict a title for it.
for num, event in enumerate(
    event_urls.find({"class": "EventDetail"}, {"_id": 1, "url": 1, "data": 1, "html": 1}),
    start=1,
):
    print(f"Nr.{num} - {event['url']}")
    cleaned_html = get_clean_html(event["html"])

    # get_clean_html returns None for pages without a <body>; the run stops
    # entirely (break, not continue — preserved from the original).
    if not cleaned_html:
        break

    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    cleaned_md = converter.handle(cleaned_html)

    title = extract_title(cleaned_md)
    print(f"PREDICTED TITLE: {title}")
    print(cleaned_md)
    print("*********************************************************************************************************")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|