import re

import html2text
from bs4 import BeautifulSoup, Comment
from pymongo import MongoClient

from src.nlp.playground.pipelines.title_extractor import extract_title
from src.nlp.playground.textclassification import classify_title


def get_clean_html(page_content: str):
    """Strip boilerplate from an HTML page and return the cleaned <body> markup.

    Removes footer/script/nav/menu/img elements, the page header, HTML
    comments and all tag attributes, then returns the prettified body with
    blank lines dropped.

    Args:
        page_content: Raw HTML of the page.

    Returns:
        The cleaned body markup as a string, or None when the document
        has no <body> tag.
    """
    soup = BeautifulSoup(page_content, "lxml")
    body_content = soup.body
    if not body_content:
        print("Kein -Tag im HTML gefunden!")
        return None

    # Drop obvious boilerplate elements entirely.
    for tag in body_content.find_all(["footer", "script", "nav", "menu", "img"]):
        tag.decompose()
    header = soup.find("header")
    if header:
        header.decompose()

    # Remove HTML comments.
    for comment in body_content.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Normalize each headline's text onto a single line.
    # BUG FIX: the original did `headline["text"] = ...`, which creates an
    # HTML *attribute* named "text" instead of replacing the heading text
    # (and that attribute was then deleted by the loop below anyway).
    for headline in soup.find_all(re.compile(r"h[1-6]")):
        headline.string = headline.get_text(strip=True).replace("\n", " ")

    # Clear attributes instead of `del tag.attrs`: deleting the attrs dict
    # leaves the Tag without an attrs mapping, which breaks later access
    # (e.g. during prettify()).
    for tag in soup.find_all(True):
        tag.attrs = {}

    # Prettify and drop blank lines.
    cleaned_html = body_content.prettify()
    clean_html_lines = [line for line in cleaned_html.splitlines() if line.strip()]
    return "\n".join(clean_html_lines)


def extract_title_from_html(html):
    """Extract all headings (h1-h6) from the HTML, classify them and pick a title.

    Selection rule: a single heading, or a first heading classified as
    "Titel", wins directly; otherwise the most prominent heading (lowest
    h-level, first on ties) is used. An empty result falls back to the
    first heading's text.

    Args:
        html: HTML markup to search for headings.

    Returns:
        The chosen title text, or None when there are no headings or
        classification fails.
    """
    soup = BeautifulSoup(html, "lxml")
    headers = soup.find_all(re.compile(r"h[1-6]"))
    title = None
    try:
        if headers:
            header_labels = []
            for header in headers:
                header_text = header.get_text(strip=True)
                header_class = classify_title(header_text)
                header_level = int(header.name[1])  # "h3" -> 3
                header_labels.append(
                    {
                        "text": header_text,
                        "label": header_class[0]["label"],
                        "level": header_level,
                    }
                )

            first = header_labels[0]
            if len(header_labels) == 1 or first["label"] == "Titel":
                title = first["text"]
            else:
                # Most prominent heading; min() keeps the first one on
                # ties, matching the original strict-< scan.
                title = min(header_labels, key=lambda h: h["level"])["text"]

            # Empty heading text falls back to the first heading
            # (original behavior preserved).
            if title == "":
                title = header_labels[0]["text"]
    except Exception as e:  # best-effort: classification may fail at runtime
        print(f"Fehler: {e}")
    return title


def main():
    """Fetch EventDetail pages from MongoDB and print their extracted titles."""
    # SECURITY: credentials are hard-coded in the connection string; move
    # them to an environment variable or a secrets store.
    uri = (
        "mongodb+srv://event_data_extraction_application:J1TRVDBbl4kSaxTD"
        "@cluster0.rtcz4.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
    )
    client = MongoClient(uri)
    db = client.event_data
    event_urls = db.event_urls

    query = {"class": "EventDetail"}
    # Nested double quotes inside an f-string are a syntax error before
    # Python 3.12; compute the count first.
    n_docs = event_urls.count_documents(query)
    print(f"{n_docs} EventDetail Dokumente")

    projection = {"_id": 1, "url": 1, "data": 1, "html": 1}
    num = 1
    for event in event_urls.find(query, projection):
        print(f"Nr.{num} - {event['url']}")
        cleaned_html = get_clean_html(event["html"])
        if not cleaned_html:
            # NOTE(review): stops at the first page without a <body>;
            # `continue` (skip the document) may be intended — confirm.
            break
        h = html2text.HTML2Text()
        h.ignore_links = True
        h.ignore_images = True
        cleaned_md = h.handle(cleaned_html)
        title = extract_title(cleaned_md)
        print(f"PREDICTED TITLE: {title}")
        print(cleaned_md)
        print("*********************************************************************************************************")
        num += 1


if __name__ == "__main__":
    main()