|
import streamlit as st |
|
import streamlit.components.v1 as components |
|
from src.nlp.playground.pipelines.event_data_extractor import EventDataExtractor |
|
from src.persistence.db import init_db |
|
from src.utils.apis.gpt_api import remove_boilerplate |
|
from src.utils.helpers import normalize_data, clean_html |
|
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer |
|
from src.utils.markdown_processing.md_preprocessing import convert_html_to_md |
|
import requests |
|
from bs4 import BeautifulSoup |
|
|
|
|
|
def scrape_url(url):
    """Download *url* and return its prettified HTML.

    Args:
        url: Address of the event page to fetch.

    Returns:
        str: The prettified HTML document on success, otherwise a German
        error message ("Fehler beim Abrufen der Seite: ...") — callers
        receive a ``str`` in both cases.
    """
    try:
        # Bug fix: requests has no default timeout, so a stalled server
        # would hang the Streamlit app forever. 30 s is generous for a
        # single page fetch.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.prettify()
    except requests.exceptions.RequestException as e:
        # NOTE(review): the error is returned as a plain string, so the
        # caller will treat it like HTML; kept for interface compatibility.
        return f"Fehler beim Abrufen der Seite: {e}"
|
|
|
|
|
@st.cache_resource
def init_connection():
    """Open the database connection exactly once per Streamlit session.

    ``st.cache_resource`` ensures script reruns reuse the same handle
    instead of reconnecting on every interaction.
    """
    connection = init_db()
    return connection
|
|
|
|
|
@st.cache_resource
def init_data():
    """Return a cursor over finalized ``EventDetail`` pages.

    Cached with ``st.cache_resource`` so Streamlit reruns keep consuming
    the same cursor rather than restarting at the first document. Reads
    the module-level ``db`` handle created by ``init_connection``.
    """
    query = {"class": "EventDetail", "final": True}
    fields = {"url": 1, "base_url_id": 1, "cleaned_html": 1, "data": 1}
    return db.event_urls.find(filter=query, projection=fields)
|
|
|
|
|
# Build the extractor once per browser session: Streamlit reruns this
# script on every widget interaction, and session_state keeps the
# (presumably expensive to construct) pipeline object alive across reruns.
if "event_data_extractor" not in st.session_state:

    st.session_state["event_data_extractor"] = EventDataExtractor()


# Cached resources: one DB handle and one cursor over stored event pages.
db = init_connection()

data = init_data()
|
|
|
# Input form: the user may paste an event-page URL, or leave the field
# empty to take the next stored page from the database cursor instead.
with st.form("url_form"):

    url = st.text_input(

        "Gebe eine URL einer Veranstaltungsseite ein (Oder leer lassen um Seite aus der Datenbank zu nehmen).")



    submit_button = st.form_submit_button("Starte Pipeline")
|
|
|
if submit_button:
    # Resolve the HTML source: live scrape when a URL was entered,
    # otherwise the next pre-cleaned document from the database cursor.
    if url:
        html = scrape_url(url)
        html = clean_html(html)
    else:
        element = next(data, None)
        if element is None:
            # Bug fix: an exhausted cursor previously crashed with a
            # TypeError on element["url"]; report it and halt the rerun.
            st.error("Keine weiteren Seiten in der Datenbank verfügbar.")
            st.stop()
        url = element["url"]
        html = element["cleaned_html"]

    st.subheader("Bereinigtes HTML")
    st.write(url)
    # Consistency: use the `components` alias imported at the top of the
    # file instead of the long st.components.v1 path.
    components.html(html, height=500, scrolling=True)

    # Stage 1: convert the cleaned HTML to markdown.
    md = convert_html_to_md(html)
    with st.expander("Markdown"):
        with st.container(border=True, height=400):
            st.markdown(md)
    with st.expander("Markdown Code"):
        with st.container(height=400):
            st.code(md)

    # Stage 2: strip navigation/boilerplate via the GPT API.
    cleaned_md = remove_boilerplate(md)
    st.info("Remove boilerplate with GPT API")
    with st.expander("Gekürztes Markdown"):
        with st.container(border=True, height=400):
            st.markdown(cleaned_md)

    # Stage 3: normalize the remaining markdown.
    normalized_md = normalize_data(cleaned_md)
    with st.expander("Normalisiertes Markdown"):
        with st.container(border=True, height=400):
            st.markdown(normalized_md)

    # Stage 4: structural analysis of the normalized markdown.
    analyzer = MarkdownAnalyzer(normalized_md)
    results = analyzer.identify_all()["block_elements"]
    table_data = [{"Class": r.__class__.__name__, "Markdown": r.markdown} for r in results]
    with st.expander("Markdown Elemente"):
        st.table(table_data)

    with st.expander("Markdown Segmente"):
        segments = analyzer.segmentation()
        for segment in segments:
            with st.container(border=True):
                for block in segment:
                    st.markdown(block.markdown)

    # Stage 5: extract structured event data. Note: this deliberately (?)
    # uses cleaned_md, not normalized_md — TODO confirm with the pipeline
    # owner whether the normalized variant was intended here.
    extracted_event = st.session_state.event_data_extractor.extract(cleaned_md)

    st.subheader("Extrahierte Daten")
    st.text(extracted_event)
|
|
|
|