"""Streamlit app: run the event-data extraction pipeline on a single page.

The page HTML comes either from a user-supplied URL or from the next
finalized ``EventDetail`` document in the database; every intermediate
pipeline stage (cleaned HTML, Markdown, boilerplate-stripped Markdown,
normalized Markdown, element/segment analysis) is rendered for inspection.
"""

import requests
import streamlit as st
import streamlit.components.v1 as components
from bs4 import BeautifulSoup

from src.nlp.playground.pipelines.event_data_extractor import EventDataExtractor
from src.persistence.db import init_db
from src.utils.apis.gpt_api import remove_boilerplate
from src.utils.helpers import normalize_data, clean_html
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer
from src.utils.markdown_processing.md_preprocessing import convert_html_to_md


def scrape_url(url):
    """Fetch *url* and return its prettified HTML.

    On any request failure a German error-message string is returned
    instead of raising, so the caller can display it directly.
    """
    try:
        # Fix: a timeout so an unresponsive server cannot hang the app.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return soup.prettify()
    except requests.exceptions.RequestException as e:
        return f"Fehler beim Abrufen der Seite: {e}"


@st.cache_resource
def init_connection():
    """Create and cache the database connection for the app's lifetime."""
    return init_db()


@st.cache_resource
def init_data():
    """Return a cached cursor over finalized ``EventDetail`` pages.

    NOTE(review): reads the module-level ``db`` bound below (resolved at
    call time, after ``init_connection()`` has run). The cursor itself is
    cached, so successive reruns consume it sequentially — presumably
    intentional ("take the next page from the database"); confirm.
    """
    return db.event_urls.find(
        filter={"class": "EventDetail", "final": True},
        projection={"url": 1, "base_url_id": 1, "cleaned_html": 1, "data": 1},
    )


if "event_data_extractor" not in st.session_state:
    st.session_state["event_data_extractor"] = EventDataExtractor()

db = init_connection()
data = init_data()

with st.form("url_form"):
    url = st.text_input(
        "Gebe eine URL einer Veranstaltungsseite ein (Oder leer lassen um Seite aus der Datenbank zu nehmen).")
    submit_button = st.form_submit_button("Starte Pipeline")

if submit_button:
    if url:
        html = scrape_url(url)
        html = clean_html(html)
    else:
        element = next(data, None)
        # Fix: the cached cursor may be exhausted; the original code then
        # crashed with a TypeError on ``element["url"]``.
        if element is None:
            st.error("Keine weiteren Seiten in der Datenbank verfügbar.")
            st.stop()
        url = element["url"]
        html = element["cleaned_html"]

    st.subheader("Bereinigtes HTML")
    st.write(url)
    # Consistency: use the ``components`` alias imported above.
    components.html(html, height=500, scrolling=True)

    md = convert_html_to_md(html)
    with st.expander("Markdown"):
        with st.container(border=True, height=400):
            st.markdown(md)
    with st.expander("Markdown Code"):
        with st.container(height=400):
            st.code(md)

    # Strip navigation/boilerplate via the GPT API before analysis.
    cleaned_md = remove_boilerplate(md)
    st.info("Remove boilerplate with GPT API")
    with st.expander("Gekürztes Markdown"):
        with st.container(border=True, height=400):
            st.markdown(cleaned_md)

    normalized_md = normalize_data(cleaned_md)
    with st.expander("Normalisiertes Markdown"):
        with st.container(border=True, height=400):
            st.markdown(normalized_md)

    # Structural analysis of the normalized Markdown.
    analyzer = MarkdownAnalyzer(normalized_md)
    results = analyzer.identify_all()["block_elements"]
    table_data = [{"Class": r.__class__.__name__, "Markdown": r.markdown} for r in results]
    with st.expander("Markdown Elemente"):
        st.table(table_data)
    with st.expander("Markdown Segmente"):
        segments = analyzer.segmentation()
        for segment in segments:
            with st.container(border=True):
                for block in segment:
                    st.markdown(block.markdown)

    # Extraction runs on the boilerplate-stripped (not normalized) Markdown,
    # matching the original behavior.
    extracted_event = st.session_state.event_data_extractor.extract(cleaned_md)
    st.subheader("Extrahierte Daten")
    st.text(extracted_event)