manaviel85370
committed on
Commit
·
cdd3ab6
1
Parent(s):
14a5766
update testing
Browse files
src/nlp/playground/pipelines/testing/event_data_extractor_testing.py
CHANGED
@@ -27,7 +27,7 @@ def init_db_entries():
|
|
27 |
if dates:
|
28 |
filtered_elements.append(el)
|
29 |
print(f"{len(filtered_elements)} Testdatensätze in der Datenbank")
|
30 |
-
return filtered_elements
|
31 |
|
32 |
def event_similarity(actual, predicted):
|
33 |
# Liste der Attribute, die verglichen werden
|
@@ -49,7 +49,6 @@ def event_similarity(actual, predicted):
|
|
49 |
"organizers": len([organizer for organizer in predicted.organizers if organizer in actual.organizers]) / len(actual.organizers) if actual.organizers and predicted.organizers else int(actual.organizers == predicted.organizers),
|
50 |
"location": len([location for location in predicted.locations if location in actual.locations]) / len(actual.locations) if actual.locations and predicted.locations else int(actual.locations == predicted.locations),
|
51 |
}
|
52 |
-
# match_results = {attr: int(act == pred) for attr, (act, pred) in attributes.items()}
|
53 |
|
54 |
# Berechnung der Gesamtähnlichkeit
|
55 |
similarity_percentage = (sum(match_results.values()) / len(attributes)) * 100
|
@@ -78,9 +77,6 @@ for el in elements:
|
|
78 |
gc.collect()
|
79 |
if not all(f not in el.get("markdown", "") for f in filter_data):
|
80 |
continue
|
81 |
-
# base_url = el.get("base_url",None)
|
82 |
-
# if base_url and base_url not in base_urls:
|
83 |
-
# base_urls.append(base_url)
|
84 |
print(f"************************ Processing {count + 1} **********************************")
|
85 |
actual_event = Event()
|
86 |
actual_event.url = el.get("url")
|
@@ -169,7 +165,7 @@ for el in elements:
|
|
169 |
|
170 |
|
171 |
# 📂 CSV-Datei einlesen
|
172 |
-
df = pd.read_csv("results.csv", delimiter=" ")
|
173 |
|
174 |
# 🏆 Summiere die Anzahl der Übereinstimmungen für jede Kategorie
|
175 |
field_sums = {
|
@@ -185,19 +181,18 @@ total_events = len(df) # Gesamtanzahl der Events
|
|
185 |
percentages = {key: (value / total_events) * 100 for key, value in field_sums.items()} # Berechne Prozentwerte
|
186 |
|
187 |
plt.figure(figsize=(10, 6))
|
188 |
-
bars = plt.bar(field_sums.keys(), field_sums.values(), color=["blue", "orange", "green", "red", "purple"])
|
189 |
|
190 |
-
|
|
|
|
|
191 |
for bar, (key, percent) in zip(bars, percentages.items()):
|
192 |
-
plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() - 0.
|
193 |
-
ha="center", va="top", fontsize=10, color="black")
|
194 |
|
195 |
-
# 🏷️ Achsenbeschriftungen & Titel
|
196 |
plt.xlabel("Event Attribute")
|
197 |
plt.ylabel("Anzahl der Übereinstimmungen")
|
198 |
plt.title("Summierte Übereinstimmungen pro Event-Attribut")
|
199 |
|
200 |
-
# 📌 Info-Box OBEN LINKS im Graphen platzieren
|
201 |
info_text = f"Getestete Daten: {total_events}\nDurchschnittliche Verarbeitungszeit: {float(df['extraction_time'].sum()) / total_events:.2f}s"
|
202 |
plt.annotate(info_text, xy=(0.02, 0.85), xycoords="axes fraction", fontsize=12, ha="left",
|
203 |
bbox=dict(facecolor="white", alpha=0.8))
|
@@ -205,5 +200,4 @@ plt.annotate(info_text, xy=(0.02, 0.85), xycoords="axes fraction", fontsize=12,
|
|
205 |
plt.ylim(0, total_events * 1.2) # Maximale Höhe etwas erhöhen für bessere Lesbarkeit
|
206 |
plt.grid(axis="y", linestyle="--", alpha=0.7)
|
207 |
|
208 |
-
# 📈 Zeige den Graphen
|
209 |
plt.show()
|
|
|
27 |
if dates:
|
28 |
filtered_elements.append(el)
|
29 |
print(f"{len(filtered_elements)} Testdatensätze in der Datenbank")
|
30 |
+
return filtered_elements
|
31 |
|
32 |
def event_similarity(actual, predicted):
|
33 |
# Liste der Attribute, die verglichen werden
|
|
|
49 |
"organizers": len([organizer for organizer in predicted.organizers if organizer in actual.organizers]) / len(actual.organizers) if actual.organizers and predicted.organizers else int(actual.organizers == predicted.organizers),
|
50 |
"location": len([location for location in predicted.locations if location in actual.locations]) / len(actual.locations) if actual.locations and predicted.locations else int(actual.locations == predicted.locations),
|
51 |
}
|
|
|
52 |
|
53 |
# Berechnung der Gesamtähnlichkeit
|
54 |
similarity_percentage = (sum(match_results.values()) / len(attributes)) * 100
|
|
|
77 |
gc.collect()
|
78 |
if not all(f not in el.get("markdown", "") for f in filter_data):
|
79 |
continue
|
|
|
|
|
|
|
80 |
print(f"************************ Processing {count + 1} **********************************")
|
81 |
actual_event = Event()
|
82 |
actual_event.url = el.get("url")
|
|
|
165 |
|
166 |
|
167 |
# 📂 CSV-Datei einlesen
|
168 |
+
df = pd.read_csv("results.csv", delimiter=" ")
|
169 |
|
170 |
# 🏆 Summiere die Anzahl der Übereinstimmungen für jede Kategorie
|
171 |
field_sums = {
|
|
|
181 |
percentages = {key: (value / total_events) * 100 for key, value in field_sums.items()} # Berechne Prozentwerte
|
182 |
|
183 |
plt.figure(figsize=(10, 6))
|
|
|
184 |
|
185 |
+
colors = ["cornflowerblue", "lightsalmon", "lightgreen", "lightcoral", "mediumpurple", "skyblue"]
|
186 |
+
bars = plt.bar(field_sums.keys(), field_sums.values(), color=colors)
|
187 |
+
|
188 |
for bar, (key, percent) in zip(bars, percentages.items()):
|
189 |
+
plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() - (total_events * 0.05),
|
190 |
+
f"{percent:.1f}%", ha="center", va="top", fontsize=10, color="black")
|
191 |
|
|
|
192 |
plt.xlabel("Event Attribute")
|
193 |
plt.ylabel("Anzahl der Übereinstimmungen")
|
194 |
plt.title("Summierte Übereinstimmungen pro Event-Attribut")
|
195 |
|
|
|
196 |
info_text = f"Getestete Daten: {total_events}\nDurchschnittliche Verarbeitungszeit: {float(df['extraction_time'].sum()) / total_events:.2f}s"
|
197 |
plt.annotate(info_text, xy=(0.02, 0.85), xycoords="axes fraction", fontsize=12, ha="left",
|
198 |
bbox=dict(facecolor="white", alpha=0.8))
|
|
|
200 |
plt.ylim(0, total_events * 1.2) # Maximale Höhe etwas erhöhen für bessere Lesbarkeit
|
201 |
plt.grid(axis="y", linestyle="--", alpha=0.7)
|
202 |
|
|
|
203 |
plt.show()
|