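"""Streamlit playground page for the event data extraction pipeline.

Runs a single event page through the full chain: scrape and clean the
HTML, convert it to Markdown, strip boilerplate via the GPT API,
normalize the result, inspect the Markdown structure, and finally
extract structured event data with the EventDataExtractor.

Launch it like any Streamlit page (file name is illustrative):

    streamlit run event_pipeline_playground.py
"""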
import streamlit as st
import streamlit.components.v1 as components
from src.nlp.playground.pipelines.event_data_extractor import EventDataExtractor
from src.persistence.db import init_db
from src.utils.apis.gpt_api import remove_boilerplate
from src.utils.helpers import normalize_data, clean_html
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer
from src.utils.markdown_processing.md_preprocessing import convert_html_to_md
import requests
from bs4 import BeautifulSoup


def scrape_url(url):
    """Fetch a page and return its prettified HTML, or an error message."""
    try:
        # Time out so a hanging server does not block the Streamlit run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.prettify()
    except requests.exceptions.RequestException as e:
        return f"Error while fetching the page: {e}"


@st.cache_resource
def init_connection():
    """Open the database connection once and cache it across reruns."""
    return init_db()


@st.cache_resource
def init_data():
    """Return a cached cursor over finalized event detail pages.

    Relies on the module-level `db` handle, so it must only be called
    after init_connection(). Because the cursor itself is cached, each
    submit without a URL advances to the next stored page.
    """
    return db.event_urls.find(filter={"class": "EventDetail", "final": True},
                              projection={"url": 1, "base_url_id": 1, "cleaned_html": 1, "data": 1})


if "event_data_extractor" not in st.session_state:
    st.session_state["event_data_extractor"] = EventDataExtractor()

db = init_connection()
data = init_data()

with st.form("url_form"):
    url = st.text_input(
        "Enter the URL of an event page (or leave it empty to take a page from the database).")

    submit_button = st.form_submit_button("Start Pipeline")

if submit_button:
    if url:
        html = scrape_url(url)
        html = clean_html(html)
    else:
        # Fall back to the next stored event page from the database cursor.
        element = next(data, None)
        if element is None:
            st.warning("No more event pages available in the database.")
            st.stop()
        url = element["url"]
        html = element["cleaned_html"]

    st.subheader("Cleaned HTML")
    st.write(url)
    components.html(html, height=500, scrolling=True)
    md = convert_html_to_md(html)
    with st.expander("Markdown"):
        with st.container(border=True, height=400):
            st.markdown(md)
    with st.expander("Markdown Code"):
        with st.container(height=400):
            st.code(md)

    st.info("Removing boilerplate with the GPT API")
    cleaned_md = remove_boilerplate(md)
    with st.expander("Shortened Markdown"):
        with st.container(border=True, height=400):
            st.markdown(cleaned_md)

    normalized_md = normalize_data(cleaned_md)
    with st.expander("Normalisiertes Markdown"):
        with st.container(border=True, height=400):
            st.markdown(normalized_md)

    text = normalized_md
    analyzer = MarkdownAnalyzer(text)
    results = analyzer.identify_all()["block_elements"]
    table_data = [{"Class": r.__class__.__name__, "Markdown": r.markdown} for r in results]
    with st.expander("Markdown Elemente"):
        st.table(table_data)

    with st.expander("Markdown Segmente"):
        segments = analyzer.segmentation()
        for s in segments:
            with st.container(border=True):
                for e in s:
                    st.markdown(e.markdown)

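    # Note: extraction runs on the GPT-shortened Markdown (cleaned_md),
    # not the normalized variant shown above.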
    extracted_event = st.session_state.event_data_extractor.extract(cleaned_md)

    st.subheader("Extrahierte Daten")
    st.text(extracted_event)