import re
from pymongo import MongoClient
import html2text
from bs4 import BeautifulSoup, Comment

from src.nlp.playground.textclassification import classify_title
from src.nlp.playground.pipelines.title_extractor import extract_title


def get_clean_html(page_content: str):
    """Strip boilerplate from an HTML page and return the cleaned <body> markup.

    Removes footer/script/nav/menu/img tags, the first <header>, HTML
    comments, and all tag attributes, normalises headline text, and drops
    blank lines from the prettified output.

    Args:
        page_content: Raw HTML of the page.

    Returns:
        The cleaned HTML string, or None when the page has no <body> tag.
    """
    soup = BeautifulSoup(page_content, "lxml")
    body_content = soup.body

    # Guard clause: without a <body> there is nothing to clean.
    if not body_content:
        print("Kein <body>-Tag im HTML gefunden!")
        return None

    # Drop structural boilerplate that never carries event content.
    for tag in body_content.find_all(["footer", "script", "nav", "menu", "img"]):
        tag.decompose()
    header = soup.find("header")
    if header:
        header.decompose()

    # Remove HTML comments.
    for comment in body_content.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Collapse newlines inside h1-h6 text. The previous version assigned the
    # normalised text to a "text" HTML *attribute*, which the attribute-
    # stripping loop below immediately deleted, so the normalisation never
    # took effect. Writing it back as the tag's string makes it real.
    for headline in soup.find_all(re.compile(r"h[1-6]")):
        headline.string = headline.get_text(strip=True).replace("\n", " ")

    # Attributes (classes, ids, inline styles) only add noise downstream.
    for tag in soup.find_all(True):
        del tag.attrs

    # Pretty-print and drop blank lines.
    cleaned_html = body_content.prettify()
    clean_html_lines = [line for line in cleaned_html.splitlines() if line.strip()]
    return "\n".join(clean_html_lines)

def extract_title_from_html(html):
    """Extract all headings (h1-h6) from *html*, classify them, and pick a title.

    Selection strategy:
      * a single heading, or a first heading classified as "Titel", is used
        directly,
      * otherwise the heading with the lowest level (h1 < h2 < ...) wins,
      * an empty winner falls back to the first heading's text.

    Args:
        html: HTML markup to search for headings.

    Returns:
        The chosen title string, or None when there are no headings or
        classification fails.
    """
    soup = BeautifulSoup(html, "lxml")
    headers = soup.find_all(re.compile(r"h[1-6]"))
    title = None
    try:
        if headers:
            header_labels = []
            for header in headers:
                header_text = header.get_text(strip=True)
                header_class = classify_title(header_text)
                header_labels.append({
                    "text": header_text,
                    "label": header_class[0]["label"],
                    "level": int(header.name[1]),  # "h3" -> 3
                })

            # Previously a separate `if len(...) == 1` assignment was always
            # clobbered by the following independent if/else; folded into one
            # chain (the single-heading case yields the same text either way).
            first = header_labels[0]
            if len(header_labels) == 1 or first["label"] == "Titel":
                title = first["text"]
            else:
                # Fall back to the structurally most prominent heading;
                # min() keeps the first of tied levels, like the old loop.
                lowest_level = min(header_labels, key=lambda h: h["level"])
                title = lowest_level["text"]
            if title == "":
                title = first["text"]
    except Exception as e:
        # classify_title is an external model call; fail soft and return None.
        print(f"Fehler: {e}")
    return title


# NOTE(security): credentials are hard-coded in the connection string.
# Move the username/password into environment variables before sharing
# or deploying this script. (Dropped the pointless f-prefix: the string
# contains no replacement fields.)
uri = "mongodb+srv://event_data_extraction_application:[email protected]/?retryWrites=true&w=majority&appName=Cluster0"
client = MongoClient(uri)
db = client.event_data

unsorted_urls = db.unsorted_urls
event_urls = db.event_urls

count = 5  # only read by the commented-out early-exit code below
# Use single quotes inside f-string expressions: reusing the outer quote
# style is only valid syntax on Python 3.12+.
print(f"{event_urls.count_documents({'class': 'EventDetail'})} EventDetail Dokumente")

# Walk every detail page: clean its HTML, convert to Markdown, and print
# the predicted event title for manual inspection.
num = 1
for event in event_urls.find({"class": "EventDetail"}, {"_id": 1, "url": 1, "data": 1, "html": 1}):
    print(f"Nr.{num} - {event['url']}")
    html = event["html"]
    cleaned_html = get_clean_html(html)
    # print(cleaned_html)
    if not cleaned_html:
        break
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    md = h.handle(cleaned_html)
    # cleaned_md = remove_boilerplate(md)
    cleaned_md = md
    title = extract_title(cleaned_md)
    print(f"PREDICTED TITLE: {title}")
    print(cleaned_md)
    print("*********************************************************************************************************")
    num += 1
    # count-=1
    # if count == 0:
    #     break



# results = event_urls.distinct("base_url")
# print(len(results))
# for url in results:
#     print(url)

# for event in event_urls.find({"class": "EventDetail"},{"_id":1, "url":1, "data":1, "html":1}):
#     html = event["html"]
#     cleaned_html = get_clean_html(html)
#     print(cleaned_html)