Spaces:

adojode
/

event-data-extraction-playground

Running

App Files Community

manaviel85370 committed 15 days ago

Commit

cdd3ab6

1 Parent(s): 14a5766

update testing

Browse files

Files changed (1) hide show

src/nlp/playground/pipelines/testing/event_data_extractor_testing.py +7 -13

src/nlp/playground/pipelines/testing/event_data_extractor_testing.py CHANGED Viewed

@@ -27,7 +27,7 @@ def init_db_entries():
             if dates:
                 filtered_elements.append(el)
     print(f"{len(filtered_elements)} Testdatensätze in der Datenbank")
-    return filtered_elements[52:]
 def event_similarity(actual, predicted):
     # Liste der Attribute, die verglichen werden
@@ -49,7 +49,6 @@ def event_similarity(actual, predicted):
         "organizers": len([organizer for organizer in predicted.organizers if organizer in actual.organizers]) / len(actual.organizers) if actual.organizers and predicted.organizers else int(actual.organizers == predicted.organizers),
         "location": len([location for location in predicted.locations if location in actual.locations]) / len(actual.locations) if actual.locations and predicted.locations else int(actual.locations == predicted.locations),
     }
-    # match_results = {attr: int(act == pred) for attr, (act, pred) in attributes.items()}
     # Berechnung der Gesamtähnlichkeit
     similarity_percentage = (sum(match_results.values()) / len(attributes)) * 100
@@ -78,9 +77,6 @@ for el in elements:
     gc.collect()
     if not all(f not in el.get("markdown", "") for f in filter_data):
         continue
-    # base_url = el.get("base_url",None)
-    # if base_url and base_url not in base_urls:
-    #     base_urls.append(base_url)
     print(f"************************ Processing {count + 1}  **********************************")
     actual_event = Event()
     actual_event.url = el.get("url")
@@ -169,7 +165,7 @@ for el in elements:
 # 📂 CSV-Datei einlesen
-df = pd.read_csv("results.csv", delimiter=" ")  # Falls Probleme: delimiter anpassen
 # 🏆 Summiere die Anzahl der Übereinstimmungen für jede Kategorie
 field_sums = {
@@ -185,19 +181,18 @@ total_events = len(df)  # Gesamtanzahl der Events
 percentages = {key: (value / total_events) * 100 for key, value in field_sums.items()}  # Berechne Prozentwerte
 plt.figure(figsize=(10, 6))
-bars = plt.bar(field_sums.keys(), field_sums.values(), color=["blue", "orange", "green", "red", "purple"])
-# 📊 Prozentwerte UNTER den Balken hinzufügen
 for bar, (key, percent) in zip(bars, percentages.items()):
-    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() - 0.2, f"{percent:.1f}%",
-             ha="center", va="top", fontsize=10, color="black")
-# 🏷️ Achsenbeschriftungen & Titel
 plt.xlabel("Event Attribute")
 plt.ylabel("Anzahl der Übereinstimmungen")
 plt.title("Summierte Übereinstimmungen pro Event-Attribut")
-# 📌 Info-Box OBEN LINKS im Graphen platzieren
 info_text = f"Getestete Daten: {total_events}\nDurchschnittliche Verarbeitungszeit: {float(df['extraction_time'].sum()) / total_events:.2f}s"
 plt.annotate(info_text, xy=(0.02, 0.85), xycoords="axes fraction", fontsize=12, ha="left",
              bbox=dict(facecolor="white", alpha=0.8))
@@ -205,5 +200,4 @@ plt.annotate(info_text, xy=(0.02, 0.85), xycoords="axes fraction", fontsize=12,
 plt.ylim(0, total_events * 1.2)  # Maximale Höhe etwas erhöhen für bessere Lesbarkeit
 plt.grid(axis="y", linestyle="--", alpha=0.7)
-# 📈 Zeige den Graphen
 plt.show()

             if dates:
                 filtered_elements.append(el)
     print(f"{len(filtered_elements)} Testdatensätze in der Datenbank")
+    return filtered_elements
 def event_similarity(actual, predicted):
     # Liste der Attribute, die verglichen werden
         "organizers": len([organizer for organizer in predicted.organizers if organizer in actual.organizers]) / len(actual.organizers) if actual.organizers and predicted.organizers else int(actual.organizers == predicted.organizers),
         "location": len([location for location in predicted.locations if location in actual.locations]) / len(actual.locations) if actual.locations and predicted.locations else int(actual.locations == predicted.locations),
     }
     # Berechnung der Gesamtähnlichkeit
     similarity_percentage = (sum(match_results.values()) / len(attributes)) * 100
     gc.collect()
     if not all(f not in el.get("markdown", "") for f in filter_data):
         continue
     print(f"************************ Processing {count + 1}  **********************************")
     actual_event = Event()
     actual_event.url = el.get("url")
 # 📂 CSV-Datei einlesen
+df = pd.read_csv("results.csv", delimiter=" ")
 # 🏆 Summiere die Anzahl der Übereinstimmungen für jede Kategorie
 field_sums = {
 percentages = {key: (value / total_events) * 100 for key, value in field_sums.items()}  # Berechne Prozentwerte
 plt.figure(figsize=(10, 6))
+colors = ["cornflowerblue", "lightsalmon", "lightgreen", "lightcoral", "mediumpurple", "skyblue"]
+bars = plt.bar(field_sums.keys(), field_sums.values(), color=colors)
 for bar, (key, percent) in zip(bars, percentages.items()):
+    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() - (total_events * 0.05),
+             f"{percent:.1f}%", ha="center", va="top", fontsize=10, color="black")
 plt.xlabel("Event Attribute")
 plt.ylabel("Anzahl der Übereinstimmungen")
 plt.title("Summierte Übereinstimmungen pro Event-Attribut")
 info_text = f"Getestete Daten: {total_events}\nDurchschnittliche Verarbeitungszeit: {float(df['extraction_time'].sum()) / total_events:.2f}s"
 plt.annotate(info_text, xy=(0.02, 0.85), xycoords="axes fraction", fontsize=12, ha="left",
              bbox=dict(facecolor="white", alpha=0.8))
 plt.ylim(0, total_events * 1.2)  # Maximale Höhe etwas erhöhen für bessere Lesbarkeit
 plt.grid(axis="y", linestyle="--", alpha=0.7)
 plt.show()