manaviel85370 committed on
Commit
4eb9d09
·
1 Parent(s): f95159c

add input fields for actual information

Browse files
Files changed (2) hide show
  1. pages/4_Control.py +154 -14
  2. requirements.txt +1 -0
pages/4_Control.py CHANGED
@@ -1,11 +1,45 @@
 
 
1
  import pandas as pd
 
 
 
2
  from src.utils.helpers import clean_html
3
  from src.utils.markdown_processing.md_preprocessing import convert_html_to_md
4
  from src.nlp.playground.pipelines.title_extractor import TitleExtractor
5
  from src.utils.helpers import normalize_data
6
  from src.persistence.db import *
7
  from src.utils.apis.gpt_api import remove_boilerplate
8
- import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
 
11
  @st.cache_resource
@@ -19,9 +53,11 @@ def remove_url():
19
  def next():
20
  db.event_urls.update_one({"_id": current_element["_id"]}, { "$set": { "information":event_information } })
21
  st.session_state.index+=1
 
22
 
23
  def prev():
24
  st.session_state.index-=1
 
25
 
26
  # Variables
27
  db = init_connection()
@@ -49,6 +85,7 @@ if "predictions_on" not in st.session_state:
49
  st.session_state.predictions_on = False
50
 
51
 
 
52
  current_element = st.session_state.elements[st.session_state.index]
53
 
54
  predictions_on = st.toggle("Predictions an (Zeigt Extrahierte Daten an, die Seite lädt dadurch langsamer).")
@@ -56,6 +93,8 @@ if predictions_on != st.session_state.predictions_on:
56
  st.session_state.predictions_on = predictions_on
57
 
58
  if current_element:
 
 
59
  current_url = current_element['url']
60
 
61
  try:
@@ -69,41 +108,142 @@ if current_element:
69
  except Exception as e:
70
  st.error(f"Es ist ein Fehler aufgetreten: {e} \nDer Datenbankeintrag wird gelöscht.")
71
  db.event_urls.delete_one({"_id": current_element["_id"]})
72
- data = current_element["data"]
73
- normalized_text = normalize_data(data)
74
  predicted_title = None
75
  predicted_date = None
 
76
  if st.session_state.predictions_on:
77
  predicted_title = TitleExtractor().extract_title(normalized_text)
78
  # predicted_date = extract_entities(normalized_text, ["date", "date_range"])
79
  # predicted_date = [ {d["text"],d["label"]} for d in predicted_date ] if predicted_date else None
80
- st.subheader("Normalisierte Daten:")
81
  with st.container(border=True, height=400):
82
  st.markdown(normalized_text)
83
  with st.expander("Code ansehen"):
84
  with st.container( height=400):
85
  st.code(normalized_text)
86
- actual_title = st.text_input("Tatsächlicher Titel eingeben:", key="title"+ str(current_element["_id"]),
87
- value=current_element.get("information", {}).get("actual", {}).get("title", None))
88
- actual_date = None
89
 
90
- event_information = {"actual": {"title":actual_title}}
91
- data = {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  "Information": [
93
  "Titel",
94
- # "Datum"
 
 
 
 
 
 
 
95
  ],
96
  "Tatsächlicher Wert":
97
  [
98
  actual_title,
99
- # actual_date
 
 
 
 
 
 
 
100
  ],
101
  "Predicted Wert": [
102
  predicted_title,
103
- # predicted_date
 
 
 
 
 
 
 
104
  ],
105
  }
106
- df = pd.DataFrame(data)
107
 
108
  st.subheader("Vergleich der Titel:")
109
  st.table(df)
@@ -115,7 +255,7 @@ if current_element:
115
 
116
 
117
  with col1:
118
- st.button("Zurück", on_click=prev)
119
  with col3:
120
  st.button("URL löschen", on_click=remove_url)
121
  with col4:
 
1
+ import datetime
2
+
3
  import pandas as pd
4
+ from lxml.html.defs import table_tags
5
+ from lxml.html.diff import end_tag
6
+
7
  from src.utils.helpers import clean_html
8
  from src.utils.markdown_processing.md_preprocessing import convert_html_to_md
9
  from src.nlp.playground.pipelines.title_extractor import TitleExtractor
10
  from src.utils.helpers import normalize_data
11
  from src.persistence.db import *
12
  from src.utils.apis.gpt_api import remove_boilerplate
13
+
14
+ # db event information schema:
15
+ # information:{
16
+ # actual: {
17
+ # title:
18
+ # dates: [
19
+ # {
20
+ # start_date:
21
+ # end_date:
22
+ # start_time:
23
+ # end_time:
24
+ # AdmittanceTime
25
+ # },...
26
+ # ],
27
+ # location
28
+ # address:{
29
+ # street,
30
+ # housenumber,
31
+ # postalcode,
32
+ # city
33
+ # }
34
+ # organizers
35
+ # prices
36
+ # entryFree
37
+ # categories
38
+ # description
39
+ #
40
+ # }
41
+ # predicted: same structure as actual
42
+ # }
43
 
44
 
45
  @st.cache_resource
 
53
  def next():
54
  db.event_urls.update_one({"_id": current_element["_id"]}, { "$set": { "information":event_information } })
55
  st.session_state.index+=1
56
+ st.session_state.pop("time_ranges")
57
 
58
  def prev():
59
  st.session_state.index-=1
60
+ st.session_state.pop("time_ranges")
61
 
62
  # Variables
63
  db = init_connection()
 
85
  st.session_state.predictions_on = False
86
 
87
 
88
+
89
  current_element = st.session_state.elements[st.session_state.index]
90
 
91
  predictions_on = st.toggle("Predictions an (Zeigt Extrahierte Daten an, die Seite lädt dadurch langsamer).")
 
93
  st.session_state.predictions_on = predictions_on
94
 
95
  if current_element:
96
+ if "time_ranges" not in st.session_state:
97
+ st.session_state.time_ranges = current_element.get("information", {}).get("actual", {}).get("dates", [])
98
  current_url = current_element['url']
99
 
100
  try:
 
108
  except Exception as e:
109
  st.error(f"Es ist ein Fehler aufgetreten: {e} \nDer Datenbankeintrag wird gelöscht.")
110
  db.event_urls.delete_one({"_id": current_element["_id"]})
111
+ table_data = current_element["data"]
112
+ normalized_text = normalize_data(table_data)
113
  predicted_title = None
114
  predicted_date = None
115
+ predicted_organizers = None
116
  if st.session_state.predictions_on:
117
  predicted_title = TitleExtractor().extract_title(normalized_text)
118
  # predicted_date = extract_entities(normalized_text, ["date", "date_range"])
119
  # predicted_date = [ {d["text"],d["label"]} for d in predicted_date ] if predicted_date else None
 
120
  with st.container(border=True, height=400):
121
  st.markdown(normalized_text)
122
  with st.expander("Code ansehen"):
123
  with st.container( height=400):
124
  st.code(normalized_text)
 
 
 
125
 
126
+ with st.expander("Veranstaltungsinformationen eingeben..."):
127
+ actual_title = st.text_input("Tatsächlicher Titel eingeben:", key="title"+ str(current_element["_id"]),
128
+ value=current_element.get("information", {}).get("actual", {}).get("title", None))
129
+ # Formular für neue Eingaben
130
+
131
+ with st.form("time_form"):
132
+ col1, col2 = st.columns(2) # Erstes Paar: Start- und Enddatum
133
+ with col1:
134
+ start_date = st.date_input("Startdatum", value=None, key="start_date" + str(st.session_state.index))
135
+ with col2:
136
+ end_date = st.date_input("Enddatum", value=None, key="end_date" + str(st.session_state.index))
137
+
138
+ col3, col4 = st.columns(2) # Zweites Paar: Start- und Endzeit
139
+ with col3:
140
+ start_time = st.time_input("Startzeit", value=None, key="start_time" + str(st.session_state.index))
141
+ with col4:
142
+ end_time = st.time_input("Endzeit", value=None, key="end_time" + str(st.session_state.index))
143
+
144
+ time_submitted = st.form_submit_button("Hinzufügen")
145
+
146
+ # Wenn das Formular abgesendet wird
147
+ if time_submitted:
148
+ new_entry = {
149
+ "start_date": datetime.datetime.combine(start_date,datetime.time(0)) if start_date else None,
150
+ "end_date": datetime.datetime.combine(end_date,datetime.time(0)) if end_date else None,
151
+ "start_time": datetime.datetime.combine(datetime.date.today(),start_time) if start_time else None,
152
+ "end_time": datetime.datetime.combine(datetime.date.today(),end_time) if end_time else None,
153
+ }
154
+ st.session_state.time_ranges.append(new_entry)
155
+ st.success("Zeitraum hinzugefügt!")
156
+ input_dates = st.session_state.time_ranges
157
+ actual_dates = "\n\n".join([
158
+ " ".join(filter(None, [ # Entfernt leere Strings automatisch
159
+ entry.get('start_date').strftime("%Y-%m-%d") if entry.get('start_date') else '',
160
+ f"- {entry.get('end_date').strftime('%Y-%m-%d')}" if entry.get('end_date') else '',
161
+ entry.get('start_time').strftime("%H:%M") if entry.get('start_time') else '',
162
+ f"- {entry.get('end_time').strftime('%H:%M')}" if entry.get('end_time') else ''
163
+ ]))
164
+ for entry in input_dates
165
+ ])
166
+
167
+ input_organizers = st.text_input("Tatsächlicher Veranstalter eingeben:", key="organizer" + str(current_element["_id"]),
168
+ value=",".join(current_element.get("information", {}).get("actual", {}).get("organizers", [])))
169
+ actual_organizers = input_organizers.split(",")
170
+
171
+ actual_location = st.text_input("Location Name", key="location" + str(current_element["_id"]),
172
+ value=current_element.get("information", {}).get("actual", {}).get("location", None))
173
+ with st.form("address_form"):
174
+ st.write("Adresse eingeben")
175
+
176
+ col1, col2 = st.columns([3, 1]) # Spalten für Straße & Hausnummer
177
+ street = col1.text_input("Straße")
178
+ house_number = col2.text_input("Hausnummer")
179
+
180
+ col3, col4 = st.columns([1, 3]) # Spalten für PLZ & Stadt
181
+ postal_code = col3.text_input("Postleitzahl")
182
+ city = col4.text_input("Stadt")
183
+
184
+ address_submitted = st.form_submit_button("Speichern")
185
+ address = current_element.get("information", {}).get("actual", {}).get("address", None)
186
+ if address_submitted:
187
+ address= {
188
+ "street": street,
189
+ "house_number": house_number,
190
+ "postal_code": postal_code,
191
+ "city": city,
192
+ }
193
+
194
+ actual_prices = st.text_input(
195
+ "Preise",
196
+ key="price" + str(current_element["_id"]),
197
+ value= ";".join(current_element.get("information", {}).get("actual", {}).get("prices", [])))
198
+
199
+ event_information = {
200
+ "actual":
201
+ {
202
+ "title":actual_title,
203
+ "dates":st.session_state.time_ranges,
204
+ "organizers":actual_organizers,
205
+ "location": actual_location,
206
+ "address":address,
207
+ "prices":actual_prices.split(";") if actual_prices else [],
208
+ }
209
+ }
210
+ table_data = {
211
  "Information": [
212
  "Titel",
213
+ "Daten",
214
+ "Veranstalter",
215
+ "Location",
216
+ "Straße",
217
+ "Hausnummer",
218
+ "Postleitzahl",
219
+ "Stadt",
220
+ "Preise"
221
  ],
222
  "Tatsächlicher Wert":
223
  [
224
  actual_title,
225
+ actual_dates,
226
+ "\n\n".join(actual_organizers),
227
+ actual_location if actual_location else "",
228
+ address.get("street") if address else "",
229
+ address.get("house_number") if address else "",
230
+ address.get("postal_code") if address else "",
231
+ address.get("city") if address else "",
232
+ actual_prices.split(";") if actual_prices else "",
233
  ],
234
  "Predicted Wert": [
235
  predicted_title,
236
+ predicted_date,
237
+ predicted_organizers,
238
+ "",
239
+ "",
240
+ "",
241
+ "",
242
+ "",
243
+ ""
244
  ],
245
  }
246
+ df = pd.DataFrame(table_data)
247
 
248
  st.subheader("Vergleich der Titel:")
249
  st.table(df)
 
255
 
256
 
257
  with col1:
258
+ st.button("Zurück", on_click=prev, disabled=st.session_state.index<1)
259
  with col3:
260
  st.button("URL löschen", on_click=remove_url)
261
  with col4:
requirements.txt CHANGED
@@ -15,6 +15,7 @@ pymongo
15
  absl-py
16
  dotenv
17
  transformers
 
18
 
19
 
20
 
 
15
  absl-py
16
  dotenv
17
  transformers
18
+ wtpsplit
19
 
20
 
21