# Imported from GitHub user manaviel85370, commit da88570 ("add pages and all").
import re
from pymongo import MongoClient
import html2text
from bs4 import BeautifulSoup, Comment
from src.nlp.playground.textclassification import classify_title
from src.nlp.playground.pipelines.title_extractor import extract_title
def get_clean_html(page_content: str):
    """Return a stripped-down version of the page's ``<body>`` HTML.

    Removes boilerplate tags (footer, script, nav, menu, img) and the header,
    strips HTML comments and all tag attributes, collapses newlines inside
    headline text, and drops blank lines from the prettified output.

    :param page_content: raw HTML of a web page
    :return: cleaned, prettified body HTML, or ``None`` if no ``<body>`` exists
    """
    soup = BeautifulSoup(page_content, "lxml")
    body_content = soup.body
    if not body_content:
        print("Kein <body>-Tag im HTML gefunden!")
        return None
    # Drop boilerplate elements entirely (including their children).
    for tag in body_content.find_all(["footer", "script", "nav", "menu", "img"]):
        tag.decompose()
    header = soup.find("header")
    if header:
        header.decompose()
    # Remove HTML comments.
    for comment in body_content.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()
    # Collapse newlines inside headline text. BUGFIX: the original assigned the
    # normalized text to a "text" *attribute* (headline["text"] = ...), which
    # the attribute-stripping loop below immediately deleted — a no-op.
    # Writing headline.string makes the normalization take effect.
    for headline in soup.find_all(re.compile(r"h[1-6]")):
        headline.string = headline.get_text(strip=True).replace("\n", " ")
    # Strip every attribute from every remaining tag.
    for tag in soup.find_all(True):
        del tag.attrs
    # Prettify, then drop blank lines from the result.
    cleaned_html = body_content.prettify()
    clean_html_lines = [line for line in cleaned_html.splitlines() if line.strip()]
    cleaned_html = "\n".join(clean_html_lines)
    return cleaned_html
def extract_title_from_html(html):
    """Extract the most likely title from the ``<h1>``-``<h6>`` headers of an HTML page.

    Every header is classified via ``classify_title``; the chosen title is:

    - the single header's text when exactly one header exists, or
    - the text of the most prominent header (lowest h-level, earliest wins on
      ties) when several exist.

    :param html: HTML document (or fragment) to search
    :return: the extracted title string, or ``None`` if the page has no
        headers or classification raises
    """
    soup = BeautifulSoup(html, "lxml")
    headers = soup.find_all(re.compile(r"h[1-6]"))
    title = None
    try:
        if headers:
            header_labels = []
            for header in headers:
                header_text = header.get_text(strip=True)
                header_class = classify_title(header_text)
                header_level = int(header.name[1])  # "h3" -> 3
                header_labels.append({
                    "text": header_text,
                    "label": header_class[0]["label"],
                    "level": header_level,
                })
            if len(header_labels) == 1:
                # Single candidate: use it regardless of its label.
                # (The original re-assigned the same text when the label was
                # "Titel"; that branch was redundant and has been removed.)
                title = header_labels[0]["text"]
            else:
                # Pick the most prominent header (lowest h-level wins).
                lowest_level = header_labels[0]
                for h in header_labels:
                    if h["level"] < lowest_level["level"]:
                        lowest_level = h
                title = lowest_level["text"]
            # Fall back to the first header if the winner had empty text.
            if title == "":
                title = header_labels[0]["text"]
    except Exception as e:
        # classify_title may fail on unexpected input; report and return None.
        print(f"Fehler: {e}")
    return title
# SECURITY NOTE(review): credentials are hard-coded in the connection string.
# Move the username/password into an environment variable or secrets store.
uri = "mongodb+srv://event_data_extraction_application:[email protected]/?retryWrites=true&w=majority&appName=Cluster0"
client = MongoClient(uri)
db = client.event_data
unsorted_urls = db.unsorted_urls
event_urls = db.event_urls

# BUGFIX: the original nested double quotes inside f-strings
# (f"...{d({"class": ...})}..."), which is a SyntaxError on Python < 3.12
# (PEP 701). The query dict is hoisted out of the f-strings instead.
detail_query = {"class": "EventDetail"}
print(f"{event_urls.count_documents(detail_query)} EventDetail Dokumente")

num = 1
for event in event_urls.find(detail_query, {"_id": 1, "url": 1, "data": 1, "html": 1}):
    print(f"Nr.{num} - {event['url']}")
    html = event["html"]
    cleaned_html = get_clean_html(html)
    if not cleaned_html:
        # No <body> tag found — stop processing entirely (original behavior
        # was break, not continue).
        break
    # Convert cleaned HTML to Markdown, dropping links and images.
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    md = h.handle(cleaned_html)
    cleaned_md = md
    title = extract_title(cleaned_md)
    print(f"PREDICTED TITLE: {title}")
    print(cleaned_md)
    print("*********************************************************************************************************")
    num += 1