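"""Streamlit playground page for the event data extraction pipeline.

Runs a single event page through the full chain: scrape and clean the
HTML, convert it to Markdown, strip boilerplate via the GPT API,
normalize the result, inspect the Markdown structure, and finally
extract structured event data with the EventDataExtractor.

Launch it like any Streamlit page (file name is illustrative):

    streamlit run event_pipeline_playground.py
"""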
import streamlit as st
import streamlit.components.v1 as components
from src.nlp.playground.pipelines.event_data_extractor import EventDataExtractor
from src.persistence.db import init_db
from src.utils.apis.gpt_api import remove_boilerplate
from src.utils.helpers import normalize_data, clean_html
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer
from src.utils.markdown_processing.md_preprocessing import convert_html_to_md
import requests
from bs4 import BeautifulSoup


def scrape_url(url):
    """Fetch a page and return its prettified HTML, or an error message."""
    try:
        # Time out so a hanging server does not block the Streamlit run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.prettify()
    except requests.exceptions.RequestException as e:
        return f"Error while fetching the page: {e}"


@st.cache_resource
def init_connection():
    """Open the database connection once and cache it across reruns."""
    return init_db()


@st.cache_resource
def init_data():
    """Return a cached cursor over finalized event detail pages.

    Relies on the module-level `db` handle, so it must only be called
    after init_connection(). Because the cursor itself is cached, each
    submit without a URL advances to the next stored page.
    """
    return db.event_urls.find(filter={"class": "EventDetail", "final": True},
                              projection={"url": 1, "base_url_id": 1, "cleaned_html": 1, "data": 1})


if "event_data_extractor" not in st.session_state:
    st.session_state["event_data_extractor"] = EventDataExtractor()

db = init_connection()
data = init_data()

with st.form("url_form"):
    url = st.text_input(
        "Enter the URL of an event page (or leave it empty to take a page from the database).")

    submit_button = st.form_submit_button("Start Pipeline")

if submit_button:
    if url:
        html = scrape_url(url)
        html = clean_html(html)
    else:
        # Fall back to the next stored event page from the database cursor.
        element = next(data, None)
        if element is None:
            st.warning("No more event pages available in the database.")
            st.stop()
        url = element["url"]
        html = element["cleaned_html"]

    st.subheader("Cleaned HTML")
    st.write(url)
    components.html(html, height=500, scrolling=True)
    md = convert_html_to_md(html)
    with st.expander("Markdown"):
        with st.container(border=True, height=400):
            st.markdown(md)
    with st.expander("Markdown Code"):
        with st.container(height=400):
            st.code(md)

    st.info("Removing boilerplate with the GPT API")
    cleaned_md = remove_boilerplate(md)
    with st.expander("Shortened Markdown"):
        with st.container(border=True, height=400):
            st.markdown(cleaned_md)

    normalized_md = normalize_data(cleaned_md)
    with st.expander("Normalisiertes Markdown"):
        with st.container(border=True, height=400):
            st.markdown(normalized_md)

    text = normalized_md
    analyzer = MarkdownAnalyzer(text)
    results = analyzer.identify_all()["block_elements"]
    table_data = [{"Class": r.__class__.__name__, "Markdown": r.markdown} for r in results]
    with st.expander("Markdown Elemente"):
        st.table(table_data)

    with st.expander("Markdown Segmente"):
        segments = analyzer.segmentation()
        for s in segments:
            with st.container(border=True):
                for e in s:
                    st.markdown(e.markdown)

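    # Note: extraction runs on the GPT-shortened Markdown (cleaned_md),
    # not the normalized variant shown above.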
    extracted_event = st.session_state.event_data_extractor.extract(cleaned_md)

    st.subheader("Extrahierte Daten")
    st.text(extracted_event)