# Source: manaviel85370 — "improve output display" (commit 73322e3)
import streamlit as st
import streamlit.components.v1 as components
from src.nlp.playground.pipelines.event_data_extractor import EventDataExtractor
from src.persistence.db import init_db
from src.utils.apis.gpt_api import remove_boilerplate
from src.utils.helpers import normalize_data, clean_html
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer
from src.utils.markdown_processing.md_preprocessing import convert_html_to_md
import requests
from bs4 import BeautifulSoup
def scrape_url(url, timeout=10):
    """Download *url* and return its prettified HTML.

    Parameters:
        url: Address of the event page to fetch.
        timeout: Seconds to wait for the server before aborting. Added so a
            dead or slow host cannot hang the Streamlit app forever; a
            ``Timeout`` is a ``RequestException`` and is caught below.

    Returns:
        The prettified HTML document on success, or a German error-message
        string on failure — callers currently display whichever they get.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Raise for 4xx/5xx so HTTP errors take the except-branch too.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.prettify()
    except requests.exceptions.RequestException as e:
        # NOTE(review): errors are reported as a plain string, not raised —
        # downstream code treats the return value like HTML either way.
        return f"Fehler beim Abrufen der Seite: {e}"
@st.cache_resource
def init_connection():
    """Create the database connection once and cache it across Streamlit reruns."""
    return init_db()
@st.cache_resource
def init_data():
    """Return a cached query result over finalized ``EventDetail`` pages.

    Fetches only the fields the pipeline needs (url, base_url_id,
    cleaned_html, data). The result is consumed one element at a time with
    ``next()`` by the form handler below.

    NOTE(review): reads the module-level ``db`` assigned further down via
    ``init_connection()`` — this only works because the function is first
    called after that assignment. Presumably a PyMongo cursor; confirm that
    ``st.cache_resource`` caching a live cursor is intended, since an
    exhausted cursor stays exhausted across reruns.
    """
    return db.event_urls.find(filter={"class": "EventDetail", "final": True},
                              projection={"url": 1, "base_url_id": 1, "cleaned_html": 1, "data": 1})
# Construct the extractor only once per browser session: session_state
# survives Streamlit's script reruns, so the guard avoids rebuilding it
# on every interaction.
if "event_data_extractor" not in st.session_state:
    st.session_state["event_data_extractor"] = EventDataExtractor()
# Cached resources: one DB connection and one query result over stored pages.
db = init_connection()
data = init_data()
# Input form: the user either enters a URL to scrape live, or leaves the
# field empty to take the next pre-scraped page from the database.
with st.form("url_form"):
    url = st.text_input(
        "Gebe eine URL einer Veranstaltungsseite ein (Oder leer lassen um Seite aus der Datenbank zu nehmen).")
    submit_button = st.form_submit_button("Starte Pipeline")
if submit_button:
    # --- Acquire HTML: live scrape when a URL was entered, else next DB row.
    if url:
        html = scrape_url(url)
        # NOTE(review): scrape_url returns an error *string* on failure,
        # which is then cleaned/rendered like HTML — confirm this is intended.
        html = clean_html(html)
    else:
        element = next(data, None)
        if element is None:
            # Bug fix: an exhausted cursor previously crashed with a
            # TypeError on element["url"]; fail gracefully instead.
            st.error("Keine weiteren Einträge in der Datenbank.")
            st.stop()
        url = element["url"]
        html = element["cleaned_html"]

    st.subheader("Bereinigtes HTML")
    st.write(url)
    # Use the `components` alias imported at the top of the file instead of
    # the long-hand st.components.v1 path (same function, consistent style).
    components.html(html, height=500, scrolling=True)

    # --- Stage 1: HTML -> Markdown conversion.
    md = convert_html_to_md(html)
    with st.expander("Markdown"):
        with st.container(border=True, height=400):
            st.markdown(md)
    with st.expander("Markdown Code"):
        with st.container(height=400):
            st.code(md)

    # --- Stage 2: strip navigation/boilerplate via the GPT API.
    cleaned_md = remove_boilerplate(md)
    st.info("Remove boilerplate with GPT API")
    with st.expander("Gekürztes Markdown"):
        with st.container(border=True, height=400):
            st.markdown(cleaned_md)

    # --- Stage 3: normalize the shortened markdown.
    normalized_md = normalize_data(cleaned_md)
    with st.expander("Normalisiertes Markdown"):
        with st.container(border=True, height=400):
            st.markdown(normalized_md)

    # --- Stage 4: structural analysis (block elements and segments).
    analyzer = MarkdownAnalyzer(normalized_md)
    results = analyzer.identify_all()["block_elements"]
    table_data = [{"Class": r.__class__.__name__, "Markdown": r.markdown} for r in results]
    with st.expander("Markdown Elemente"):
        st.table(table_data)
    with st.expander("Markdown Segmente"):
        segments = analyzer.segmentation()
        for s in segments:
            with st.container(border=True):
                for e in s:
                    st.markdown(e.markdown)

    # --- Stage 5: extract structured event data from the boilerplate-free
    # markdown (deliberately cleaned_md, not normalized_md, as before).
    extracted_event = st.session_state.event_data_extractor.extract(cleaned_md)
    st.subheader("Extrahierte Daten")
    st.text(extracted_event)