|
import streamlit as st |
|
import streamlit.components.v1 as components |
|
from src.nlp.playground.pipelines.event_data_extractor import EventDataExtractor |
|
from src.persistence.db import init_db |
|
from src.utils.apis.gpt_api import remove_boilerplate |
|
from src.utils.helpers import normalize_data, clean_html |
|
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer |
|
from src.utils.markdown_processing.md_preprocessing import convert_html_to_md |
|
import requests |
|
from bs4 import BeautifulSoup |
|
|
|
|
|
def scrape_url(url):
    """Download *url* and return its prettified HTML.

    Args:
        url: Address of the event page to fetch.

    Returns:
        str: The prettified HTML document on success, otherwise a German
        error message ("Fehler beim Abrufen der Seite: ...") — callers
        receive a ``str`` in both cases.
    """
    try:
        # Bug fix: requests has no default timeout, so a stalled server
        # would hang the Streamlit app forever. 30 s is generous for a
        # single page fetch.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.prettify()
    except requests.exceptions.RequestException as e:
        # NOTE(review): the error is returned as a plain string, so the
        # caller will treat it like HTML; kept for interface compatibility.
        return f"Fehler beim Abrufen der Seite: {e}"
|
|
|
|
|
@st.cache_resource
def init_connection():
    """Open the database connection exactly once per Streamlit session.

    ``st.cache_resource`` ensures script reruns reuse the same handle
    instead of reconnecting on every interaction.
    """
    connection = init_db()
    return connection
|
|
|
|
|
@st.cache_resource
def init_data():
    """Return a cursor over finalized ``EventDetail`` pages.

    Cached with ``st.cache_resource`` so Streamlit reruns keep consuming
    the same cursor rather than restarting at the first document. Reads
    the module-level ``db`` handle created by ``init_connection``.
    """
    query = {"class": "EventDetail", "final": True}
    fields = {"url": 1, "base_url_id": 1, "cleaned_html": 1, "data": 1}
    return db.event_urls.find(filter=query, projection=fields)
|
|
|
|
|
# Build the extractor once per browser session: Streamlit reruns this
# script on every widget interaction, and session_state keeps the
# (presumably expensive to construct) pipeline object alive across reruns.
if "event_data_extractor" not in st.session_state:

    st.session_state["event_data_extractor"] = EventDataExtractor()


# Cached resources: one DB handle and one cursor over stored event pages.
db = init_connection()

data = init_data()
|
|
|
# Input form: the user may paste an event-page URL, or leave the field
# empty to take the next stored page from the database cursor instead.
with st.form("url_form"):

    url = st.text_input(

        "Gebe eine URL einer Veranstaltungsseite ein (Oder leer lassen um Seite aus der Datenbank zu nehmen).")



    submit_button = st.form_submit_button("Starte Pipeline")
|
|
|
if submit_button:
    # Resolve the HTML source: live scrape when a URL was entered,
    # otherwise the next pre-cleaned document from the database cursor.
    if url:
        html = scrape_url(url)
        html = clean_html(html)
    else:
        element = next(data, None)
        if element is None:
            # Bug fix: an exhausted cursor previously crashed with a
            # TypeError on element["url"]; report it and halt the rerun.
            st.error("Keine weiteren Seiten in der Datenbank verfügbar.")
            st.stop()
        url = element["url"]
        html = element["cleaned_html"]

    st.subheader("Bereinigtes HTML")
    st.write(url)
    # Consistency: use the `components` alias imported at the top of the
    # file instead of the long st.components.v1 path.
    components.html(html, height=500, scrolling=True)

    # Stage 1: convert the cleaned HTML to markdown.
    md = convert_html_to_md(html)
    with st.expander("Markdown"):
        with st.container(border=True, height=400):
            st.markdown(md)
    with st.expander("Markdown Code"):
        with st.container(height=400):
            st.code(md)

    # Stage 2: strip navigation/boilerplate via the GPT API.
    cleaned_md = remove_boilerplate(md)
    st.info("Remove boilerplate with GPT API")
    with st.expander("Gekürztes Markdown"):
        with st.container(border=True, height=400):
            st.markdown(cleaned_md)

    # Stage 3: normalize the remaining markdown.
    normalized_md = normalize_data(cleaned_md)
    with st.expander("Normalisiertes Markdown"):
        with st.container(border=True, height=400):
            st.markdown(normalized_md)

    # Stage 4: structural analysis of the normalized markdown.
    analyzer = MarkdownAnalyzer(normalized_md)
    results = analyzer.identify_all()["block_elements"]
    table_data = [{"Class": r.__class__.__name__, "Markdown": r.markdown} for r in results]
    with st.expander("Markdown Elemente"):
        st.table(table_data)

    with st.expander("Markdown Segmente"):
        segments = analyzer.segmentation()
        for segment in segments:
            with st.container(border=True):
                for block in segment:
                    st.markdown(block.markdown)

    # Stage 5: extract structured event data. Note: this deliberately (?)
    # uses cleaned_md, not normalized_md — TODO confirm with the pipeline
    # owner whether the normalized variant was intended here.
    extracted_event = st.session_state.event_data_extractor.extract(cleaned_md)

    st.subheader("Extrahierte Daten")
    st.text(extracted_event)
|
|
|
|