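"""Streamlit playground for the step-by-step event extraction pipeline.

A page's HTML is scraped (or loaded from the database), cleaned, converted to
Markdown, stripped of boilerplate via the GPT API, normalized, analyzed for its
Markdown structure, and finally passed to the EventDataExtractor.
"""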
import streamlit as st
import streamlit.components.v1 as components
from src.nlp.playground.pipelines.event_data_extractor import EventDataExtractor
from src.persistence.db import init_db
from src.utils.apis.gpt_api import remove_boilerplate
from src.utils.helpers import normalize_data, clean_html
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer
from src.utils.markdown_processing.md_preprocessing import convert_html_to_md
import requests
from bs4 import BeautifulSoup


def scrape_url(url):
    """Fetch the page and return its prettified HTML, or an error message string on failure."""
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        cleaned_html = soup.prettify()
        return cleaned_html
    except requests.exceptions.RequestException as e:
        return f"Fehler beim Abrufen der Seite: {e}"


@st.cache_resource
def init_connection():
    return init_db()


@st.cache_resource
def init_data():
    # Cursor over finalized event detail pages stored in the database.
    return db.event_urls.find(filter={"class": "EventDetail", "final": True},
                              projection={"url": 1, "base_url_id": 1, "cleaned_html": 1, "data": 1})
if "event_data_extractor" not in st.session_state:
st.session_state["event_data_extractor"] = EventDataExtractor()
db = init_connection()
data = init_data()
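
# URL input form: scrape a live event page, or leave it empty to use the next stored page from the database.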
with st.form("url_form"):
    url = st.text_input(
        "Gebe eine URL einer Veranstaltungsseite ein (Oder leer lassen um Seite aus der Datenbank zu nehmen).")
    submit_button = st.form_submit_button("Starte Pipeline")

if submit_button:
    if url:
        # Scrape the given URL and clean its HTML.
        html = scrape_url(url)
        html = clean_html(html)
    else:
        # No URL given: take the next stored page from the database cursor.
        element = next(data, None)
        if element is None:
            st.warning("Keine weiteren Einträge in der Datenbank.")
            st.stop()
        url = element["url"]
        html = element["cleaned_html"]

    st.subheader("Bereinigtes HTML")
    st.write(url)
    components.html(html, height=500, scrolling=True)
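
    # Convert the cleaned HTML to Markdown and show both the rendered and the raw version.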
    md = convert_html_to_md(html)
    with st.expander("Markdown"):
        with st.container(border=True, height=400):
            st.markdown(md)
    with st.expander("Markdown Code"):
        with st.container(height=400):
            st.code(md)
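
    # Shorten the Markdown by removing boilerplate via the GPT API.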
    cleaned_md = remove_boilerplate(md)
    st.info("Remove boilerplate with GPT API")
    with st.expander("Gekürztes Markdown"):
        with st.container(border=True, height=400):
            st.markdown(cleaned_md)
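
    # Normalize the shortened Markdown before structural analysis.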
    normalized_md = normalize_data(cleaned_md)
    with st.expander("Normalisiertes Markdown"):
        with st.container(border=True, height=400):
            st.markdown(normalized_md)
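
    # Identify block elements and logical segments in the normalized Markdown.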
    text = normalized_md
    analyzer = MarkdownAnalyzer(text)
    results = analyzer.identify_all()["block_elements"]
    table_data = [{"Class": r.__class__.__name__, "Markdown": r.markdown} for r in results]
    with st.expander("Markdown Elemente"):
        st.table(table_data)
    with st.expander("Markdown Segmente"):
        segments = analyzer.segmentation()
        for s in segments:
            with st.container(border=True):
                for e in s:
                    st.markdown(e.markdown)
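
    # Extract structured event data from the boilerplate-free Markdown.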
    extracted_event = st.session_state.event_data_extractor.extract(cleaned_md)
    st.subheader("Extrahierte Daten")
    st.text(extracted_event)