"""Streamlit page that evaluates the EventDataExtractor pipeline against
ground-truth event annotations stored in the database."""
import streamlit as st

from src.nlp.experimental.textclassification.classify_title import train_data
from src.nlp.playground.pipelines.event_data_extractor import EventDataExtractor
from src.persistence.db import init_db
from src.utils.Event import Event, Schedule
from src.utils.apis.googlemaps_api import GoogleMapsAPI
from src.utils.helpers import normalize_data

# Event titles that were used for training; any DB entry whose raw markdown
# contains one of them is excluded from the test set.
filter_data = train_data["Veranstaltungstitel"]

# Cap on how many filtered DB entries are used per test run.
MAX_TEST_ENTRIES = 10


@st.cache_resource
def init_connection():
    """Open (and cache across reruns) the database connection."""
    return init_db()


@st.cache_resource
def init_event_data_extractor():
    """Create (and cache across reruns) the event-data extraction pipeline."""
    return EventDataExtractor()


@st.cache_data
def init_db_entries():
    """Fetch finalized EventDetail pages and return the test entries.

    Entries whose markdown contains any known training title are dropped,
    and only the first MAX_TEST_ENTRIES remaining entries are returned.
    """
    st.info("Fetching data")
    elements = list(
        db.event_urls.find(
            {"final": True, "class": "EventDetail"},
            {"_id": 1, "url": 1, "data": 1, "html": 1, "information": 1},
        )
    )
    st.info("Fetched data")
    # Keep only entries whose raw markdown contains none of the training titles.
    filtered_elements = [
        el for el in elements
        if all(f not in el.get("data", "") for f in filter_data)
    ]
    st.write(f"{len(filtered_elements)} Testdatensätze in der Datenbank")
    return filtered_elements[:MAX_TEST_ENTRIES]


@st.cache_resource
def init_google_maps_api():
    """Create (and cache across reruns) the Google Maps API client."""
    return GoogleMapsAPI()


def event_similarity(actual, predicted):
    """Return the percentage (0-100) of compared attributes that match.

    Title, schedule, prices, address, and organizers are compared pairwise
    by equality; each match contributes equally to the score.
    """
    # Attribute pairs that are compared.
    attributes = [
        (actual.title, predicted.title),
        (actual.schedule, predicted.schedule),
        (actual.prices, predicted.prices),
        (actual.address, predicted.address),
        (actual.organizers, predicted.organizers),
    ]
    # Count matching attribute pairs and convert to a percentage.
    matches = sum(1 for a, p in attributes if a == p)
    return (matches / len(attributes)) * 100


def _build_actual_event(el):
    """Build the ground-truth Event from the annotations stored on a DB entry."""
    # All ground-truth fields live under information.actual; look it up once.
    actual = el.get("information", {}).get("actual", {})
    event = Event()
    event.url = el.get("url")
    event.title = actual.get("title", "")
    event.organizers = actual.get("organizers", [])
    event.categories = actual.get("categories", [])
    event.locations = actual.get("locations", [])
    event.prices = actual.get("prices", [])
    event.address = actual.get("address", {}).get("formatted", None)
    event.schedule = [
        Schedule(
            date.get("start_date", None),
            date.get("end_date", None),
            date.get("start_time", None),
            date.get("end_time", None),
            date.get("admittance_time", None),
        )
        for date in actual.get("dates", [])
    ]
    return event


db = init_connection()
google_maps_api = init_google_maps_api()
event_data_extractor = init_event_data_extractor()
elements = init_db_entries()

start_tests = st.button("Starte Tests")
if start_tests:
    for el in elements:
        actual_event = _build_actual_event(el)
        # Show the raw markdown of the entry.
        with st.container(border=True):
            st.markdown(el["data"])
        # Show the normalized markdown that is fed to the extractor.
        with st.container(border=True):
            preprocessed_md = normalize_data(el["data"])
            st.markdown(preprocessed_md)
        st.write(actual_event)
        try:
            predicted_event = event_data_extractor.extract(el["data"])
            st.write(predicted_event)
            st.info(f"Ähnlichkeit der Ergebnisse: {event_similarity(actual_event, predicted_event)}")
        except Exception as e:
            # Broad catch at the UI boundary: report the failure for this
            # entry instead of aborting the whole test run.
            st.error(f"Fehler bei der Verarbeitung: {e}")