manaviel85370
committed on
Commit
·
4eb9d09
1
Parent(s):
f95159c
add input fields for actual information
Browse files- pages/4_Control.py +154 -14
- requirements.txt +1 -0
pages/4_Control.py
CHANGED
@@ -1,11 +1,45 @@
|
|
|
|
|
|
1 |
import pandas as pd
|
|
|
|
|
|
|
2 |
from src.utils.helpers import clean_html
|
3 |
from src.utils.markdown_processing.md_preprocessing import convert_html_to_md
|
4 |
from src.nlp.playground.pipelines.title_extractor import TitleExtractor
|
5 |
from src.utils.helpers import normalize_data
|
6 |
from src.persistence.db import *
|
7 |
from src.utils.apis.gpt_api import remove_boilerplate
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
|
11 |
@st.cache_resource
|
@@ -19,9 +53,11 @@ def remove_url():
|
|
19 |
def next():
|
20 |
db.event_urls.update_one({"_id": current_element["_id"]}, { "$set": { "information":event_information } })
|
21 |
st.session_state.index+=1
|
|
|
22 |
|
23 |
def prev():
|
24 |
st.session_state.index-=1
|
|
|
25 |
|
26 |
# Variables
|
27 |
db = init_connection()
|
@@ -49,6 +85,7 @@ if "predictions_on" not in st.session_state:
|
|
49 |
st.session_state.predictions_on = False
|
50 |
|
51 |
|
|
|
52 |
current_element = st.session_state.elements[st.session_state.index]
|
53 |
|
54 |
predictions_on = st.toggle("Predictions an (Zeigt Extrahierte Daten an, die Seite lädt dadurch langsamer).")
|
@@ -56,6 +93,8 @@ if predictions_on != st.session_state.predictions_on:
|
|
56 |
st.session_state.predictions_on = predictions_on
|
57 |
|
58 |
if current_element:
|
|
|
|
|
59 |
current_url = current_element['url']
|
60 |
|
61 |
try:
|
@@ -69,41 +108,142 @@ if current_element:
|
|
69 |
except Exception as e:
|
70 |
st.error(f"Es ist ein Fehler aufgetreten: {e} \nDer Datenbankeintrag wird gelöscht.")
|
71 |
db.event_urls.delete_one({"_id": current_element["_id"]})
|
72 |
-
|
73 |
-
normalized_text = normalize_data(
|
74 |
predicted_title = None
|
75 |
predicted_date = None
|
|
|
76 |
if st.session_state.predictions_on:
|
77 |
predicted_title = TitleExtractor().extract_title(normalized_text)
|
78 |
# predicted_date = extract_entities(normalized_text, ["date", "date_range"])
|
79 |
# predicted_date = [ {d["text"],d["label"]} for d in predicted_date ] if predicted_date else None
|
80 |
-
st.subheader("Normalisierte Daten:")
|
81 |
with st.container(border=True, height=400):
|
82 |
st.markdown(normalized_text)
|
83 |
with st.expander("Code ansehen"):
|
84 |
with st.container( height=400):
|
85 |
st.code(normalized_text)
|
86 |
-
actual_title = st.text_input("Tatsächlicher Titel eingeben:", key="title"+ str(current_element["_id"]),
|
87 |
-
value=current_element.get("information", {}).get("actual", {}).get("title", None))
|
88 |
-
actual_date = None
|
89 |
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
"Information": [
|
93 |
"Titel",
|
94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
],
|
96 |
"Tatsächlicher Wert":
|
97 |
[
|
98 |
actual_title,
|
99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
],
|
101 |
"Predicted Wert": [
|
102 |
predicted_title,
|
103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
],
|
105 |
}
|
106 |
-
df = pd.DataFrame(
|
107 |
|
108 |
st.subheader("Vergleich der Titel:")
|
109 |
st.table(df)
|
@@ -115,7 +255,7 @@ if current_element:
|
|
115 |
|
116 |
|
117 |
with col1:
|
118 |
-
st.button("Zurück", on_click=prev)
|
119 |
with col3:
|
120 |
st.button("URL löschen", on_click=remove_url)
|
121 |
with col4:
|
|
|
1 |
+
import datetime
|
2 |
+
|
3 |
import pandas as pd
|
4 |
+
from lxml.html.defs import table_tags
|
5 |
+
from lxml.html.diff import end_tag
|
6 |
+
|
7 |
from src.utils.helpers import clean_html
|
8 |
from src.utils.markdown_processing.md_preprocessing import convert_html_to_md
|
9 |
from src.nlp.playground.pipelines.title_extractor import TitleExtractor
|
10 |
from src.utils.helpers import normalize_data
|
11 |
from src.persistence.db import *
|
12 |
from src.utils.apis.gpt_api import remove_boilerplate
|
13 |
+
|
14 |
+
# db event information schema:
|
15 |
+
# information:{
|
16 |
+
# actual: {
|
17 |
+
# title:
|
18 |
+
# dates: [
|
19 |
+
# {
|
20 |
+
# start_date:
|
21 |
+
# end_date:
|
22 |
+
# start_time:
|
23 |
+
# end_time:
|
24 |
+
# AdmittanceTime
|
25 |
+
# },...
|
26 |
+
# ],
|
27 |
+
# location
|
28 |
+
# address:{
|
29 |
+
# street,
|
30 |
+
# house_number,
|
31 |
+
# postal_code,
|
32 |
+
# city
|
33 |
+
# }
|
34 |
+
# organizers
|
35 |
+
# prices
|
36 |
+
# entryFree
|
37 |
+
# categories
|
38 |
+
# description
|
39 |
+
#
|
40 |
+
# }
|
41 |
+
# predicted: same structure as actual
|
42 |
+
# }
|
43 |
|
44 |
|
45 |
@st.cache_resource
|
|
|
53 |
def next():
|
54 |
db.event_urls.update_one({"_id": current_element["_id"]}, { "$set": { "information":event_information } })
|
55 |
st.session_state.index+=1
|
56 |
+
st.session_state.pop("time_ranges")
|
57 |
|
58 |
def prev():
|
59 |
st.session_state.index-=1
|
60 |
+
st.session_state.pop("time_ranges")
|
61 |
|
62 |
# Variables
|
63 |
db = init_connection()
|
|
|
85 |
st.session_state.predictions_on = False
|
86 |
|
87 |
|
88 |
+
|
89 |
current_element = st.session_state.elements[st.session_state.index]
|
90 |
|
91 |
predictions_on = st.toggle("Predictions an (Zeigt Extrahierte Daten an, die Seite lädt dadurch langsamer).")
|
|
|
93 |
st.session_state.predictions_on = predictions_on
|
94 |
|
95 |
if current_element:
|
96 |
+
if "time_ranges" not in st.session_state:
|
97 |
+
st.session_state.time_ranges = current_element.get("information", {}).get("actual", {}).get("dates", [])
|
98 |
current_url = current_element['url']
|
99 |
|
100 |
try:
|
|
|
108 |
except Exception as e:
|
109 |
st.error(f"Es ist ein Fehler aufgetreten: {e} \nDer Datenbankeintrag wird gelöscht.")
|
110 |
db.event_urls.delete_one({"_id": current_element["_id"]})
|
111 |
+
table_data = current_element["data"]
|
112 |
+
normalized_text = normalize_data(table_data)
|
113 |
predicted_title = None
|
114 |
predicted_date = None
|
115 |
+
predicted_organizers = None
|
116 |
if st.session_state.predictions_on:
|
117 |
predicted_title = TitleExtractor().extract_title(normalized_text)
|
118 |
# predicted_date = extract_entities(normalized_text, ["date", "date_range"])
|
119 |
# predicted_date = [ {d["text"],d["label"]} for d in predicted_date ] if predicted_date else None
|
|
|
120 |
with st.container(border=True, height=400):
|
121 |
st.markdown(normalized_text)
|
122 |
with st.expander("Code ansehen"):
|
123 |
with st.container( height=400):
|
124 |
st.code(normalized_text)
|
|
|
|
|
|
|
125 |
|
126 |
+
with st.expander("Veranstaltungsinformationen eingeben..."):
|
127 |
+
actual_title = st.text_input("Tatsächlicher Titel eingeben:", key="title"+ str(current_element["_id"]),
|
128 |
+
value=current_element.get("information", {}).get("actual", {}).get("title", None))
|
129 |
+
# Form for new entries
|
130 |
+
|
131 |
+
with st.form("time_form"):
|
132 |
+
col1, col2 = st.columns(2) # Erstes Paar: Start- und Enddatum
|
133 |
+
with col1:
|
134 |
+
start_date = st.date_input("Startdatum", value=None, key="start_date" + str(st.session_state.index))
|
135 |
+
with col2:
|
136 |
+
end_date = st.date_input("Enddatum", value=None, key="end_date" + str(st.session_state.index))
|
137 |
+
|
138 |
+
col3, col4 = st.columns(2) # Zweites Paar: Start- und Endzeit
|
139 |
+
with col3:
|
140 |
+
start_time = st.time_input("Startzeit", value=None, key="start_time" + str(st.session_state.index))
|
141 |
+
with col4:
|
142 |
+
end_time = st.time_input("Endzeit", value=None, key="end_time" + str(st.session_state.index))
|
143 |
+
|
144 |
+
time_submitted = st.form_submit_button("Hinzufügen")
|
145 |
+
|
146 |
+
# When the form is submitted
|
147 |
+
if time_submitted:
|
148 |
+
new_entry = {
|
149 |
+
"start_date": datetime.datetime.combine(start_date,datetime.time(0)) if start_date else None,
|
150 |
+
"end_date": datetime.datetime.combine(end_date,datetime.time(0)) if end_date else None,
|
151 |
+
"start_time": datetime.datetime.combine(datetime.date.today(),start_time) if start_time else None,
|
152 |
+
"end_time": datetime.datetime.combine(datetime.date.today(),end_time) if end_time else None,
|
153 |
+
}
|
154 |
+
st.session_state.time_ranges.append(new_entry)
|
155 |
+
st.success("Zeitraum hinzugefügt!")
|
156 |
+
input_dates = st.session_state.time_ranges
|
157 |
+
actual_dates = "\n\n".join([
|
158 |
+
" ".join(filter(None, [ # Entfernt leere Strings automatisch
|
159 |
+
entry.get('start_date').strftime("%Y-%m-%d") if entry.get('start_date') else '',
|
160 |
+
f"- {entry.get('end_date').strftime('%Y-%m-%d')}" if entry.get('end_date') else '',
|
161 |
+
entry.get('start_time').strftime("%H:%M") if entry.get('start_time') else '',
|
162 |
+
f"- {entry.get('end_time').strftime('%H:%M')}" if entry.get('end_time') else ''
|
163 |
+
]))
|
164 |
+
for entry in input_dates
|
165 |
+
])
|
166 |
+
|
167 |
+
input_organizers = st.text_input("Tatsächlicher Veranstalter eingeben:", key="organizer" + str(current_element["_id"]),
|
168 |
+
value=",".join(current_element.get("information", {}).get("actual", {}).get("organizers", [])))
|
169 |
+
actual_organizers = input_organizers.split(",")
|
170 |
+
|
171 |
+
actual_location = st.text_input("Location Name", key="location" + str(current_element["_id"]),
|
172 |
+
value=current_element.get("information", {}).get("actual", {}).get("location", None))
|
173 |
+
with st.form("address_form"):
|
174 |
+
st.write("Adresse eingeben")
|
175 |
+
|
176 |
+
col1, col2 = st.columns([3, 1]) # Spalten für Straße & Hausnummer
|
177 |
+
street = col1.text_input("Straße")
|
178 |
+
house_number = col2.text_input("Hausnummer")
|
179 |
+
|
180 |
+
col3, col4 = st.columns([1, 3]) # Spalten für PLZ & Stadt
|
181 |
+
postal_code = col3.text_input("Postleitzahl")
|
182 |
+
city = col4.text_input("Stadt")
|
183 |
+
|
184 |
+
address_submitted = st.form_submit_button("Speichern")
|
185 |
+
address = current_element.get("information", {}).get("actual", {}).get("address", None)
|
186 |
+
if address_submitted:
|
187 |
+
address= {
|
188 |
+
"street": street,
|
189 |
+
"house_number": house_number,
|
190 |
+
"postal_code": postal_code,
|
191 |
+
"city": city,
|
192 |
+
}
|
193 |
+
|
194 |
+
actual_prices = st.text_input(
|
195 |
+
"Preise",
|
196 |
+
key="price" + str(current_element["_id"]),
|
197 |
+
value= ";".join(current_element.get("information", {}).get("actual", {}).get("prices", [])))
|
198 |
+
|
199 |
+
event_information = {
|
200 |
+
"actual":
|
201 |
+
{
|
202 |
+
"title":actual_title,
|
203 |
+
"dates":st.session_state.time_ranges,
|
204 |
+
"organizers":actual_organizers,
|
205 |
+
"location": actual_location,
|
206 |
+
"address":address,
|
207 |
+
"prices":actual_prices.split(";") if actual_prices else [],
|
208 |
+
}
|
209 |
+
}
|
210 |
+
table_data = {
|
211 |
"Information": [
|
212 |
"Titel",
|
213 |
+
"Daten",
|
214 |
+
"Veranstalter",
|
215 |
+
"Location",
|
216 |
+
"Straße",
|
217 |
+
"Hausnummer",
|
218 |
+
"Postleitzahl",
|
219 |
+
"Stadt",
|
220 |
+
"Preise"
|
221 |
],
|
222 |
"Tatsächlicher Wert":
|
223 |
[
|
224 |
actual_title,
|
225 |
+
actual_dates,
|
226 |
+
"\n\n".join(actual_organizers),
|
227 |
+
actual_location if actual_location else "",
|
228 |
+
address.get("street") if address else "",
|
229 |
+
address.get("house_number") if address else "",
|
230 |
+
address.get("postal_code") if address else "",
|
231 |
+
address.get("city") if address else "",
|
232 |
+
actual_prices.split(";") if actual_prices else "",
|
233 |
],
|
234 |
"Predicted Wert": [
|
235 |
predicted_title,
|
236 |
+
predicted_date,
|
237 |
+
predicted_organizers,
|
238 |
+
"",
|
239 |
+
"",
|
240 |
+
"",
|
241 |
+
"",
|
242 |
+
"",
|
243 |
+
""
|
244 |
],
|
245 |
}
|
246 |
+
df = pd.DataFrame(table_data)
|
247 |
|
248 |
st.subheader("Vergleich der Titel:")
|
249 |
st.table(df)
|
|
|
255 |
|
256 |
|
257 |
with col1:
|
258 |
+
st.button("Zurück", on_click=prev, disabled=st.session_state.index<1)
|
259 |
with col3:
|
260 |
st.button("URL löschen", on_click=remove_url)
|
261 |
with col4:
|
requirements.txt
CHANGED
@@ -15,6 +15,7 @@ pymongo
|
|
15 |
absl-py
|
16 |
dotenv
|
17 |
transformers
|
|
|
18 |
|
19 |
|
20 |
|
|
|
15 |
absl-py
|
16 |
dotenv
|
17 |
transformers
|
18 |
+
wtpsplit
|
19 |
|
20 |
|
21 |
|