manaviel85370 committed on
Commit
cdd3ab6
·
1 Parent(s): 14a5766

update testing

Browse files
src/nlp/playground/pipelines/testing/event_data_extractor_testing.py CHANGED
@@ -27,7 +27,7 @@ def init_db_entries():
27
  if dates:
28
  filtered_elements.append(el)
29
  print(f"{len(filtered_elements)} Testdatensätze in der Datenbank")
30
- return filtered_elements[52:]
31
 
32
  def event_similarity(actual, predicted):
33
  # Liste der Attribute, die verglichen werden
@@ -49,7 +49,6 @@ def event_similarity(actual, predicted):
49
  "organizers": len([organizer for organizer in predicted.organizers if organizer in actual.organizers]) / len(actual.organizers) if actual.organizers and predicted.organizers else int(actual.organizers == predicted.organizers),
50
  "location": len([location for location in predicted.locations if location in actual.locations]) / len(actual.locations) if actual.locations and predicted.locations else int(actual.locations == predicted.locations),
51
  }
52
- # match_results = {attr: int(act == pred) for attr, (act, pred) in attributes.items()}
53
 
54
  # Berechnung der Gesamtähnlichkeit
55
  similarity_percentage = (sum(match_results.values()) / len(attributes)) * 100
@@ -78,9 +77,6 @@ for el in elements:
78
  gc.collect()
79
  if not all(f not in el.get("markdown", "") for f in filter_data):
80
  continue
81
- # base_url = el.get("base_url",None)
82
- # if base_url and base_url not in base_urls:
83
- # base_urls.append(base_url)
84
  print(f"************************ Processing {count + 1} **********************************")
85
  actual_event = Event()
86
  actual_event.url = el.get("url")
@@ -169,7 +165,7 @@ for el in elements:
169
 
170
 
171
  # 📂 CSV-Datei einlesen
172
- df = pd.read_csv("results.csv", delimiter=" ") # Falls Probleme: delimiter anpassen
173
 
174
  # 🏆 Summiere die Anzahl der Übereinstimmungen für jede Kategorie
175
  field_sums = {
@@ -185,19 +181,18 @@ total_events = len(df) # Gesamtanzahl der Events
185
  percentages = {key: (value / total_events) * 100 for key, value in field_sums.items()} # Berechne Prozentwerte
186
 
187
  plt.figure(figsize=(10, 6))
188
- bars = plt.bar(field_sums.keys(), field_sums.values(), color=["blue", "orange", "green", "red", "purple"])
189
 
190
- # 📊 Prozentwerte UNTER den Balken hinzufügen
 
 
191
  for bar, (key, percent) in zip(bars, percentages.items()):
192
- plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() - 0.2, f"{percent:.1f}%",
193
- ha="center", va="top", fontsize=10, color="black")
194
 
195
- # 🏷️ Achsenbeschriftungen & Titel
196
  plt.xlabel("Event Attribute")
197
  plt.ylabel("Anzahl der Übereinstimmungen")
198
  plt.title("Summierte Übereinstimmungen pro Event-Attribut")
199
 
200
- # 📌 Info-Box OBEN LINKS im Graphen platzieren
201
  info_text = f"Getestete Daten: {total_events}\nDurchschnittliche Verarbeitungszeit: {float(df['extraction_time'].sum()) / total_events:.2f}s"
202
  plt.annotate(info_text, xy=(0.02, 0.85), xycoords="axes fraction", fontsize=12, ha="left",
203
  bbox=dict(facecolor="white", alpha=0.8))
@@ -205,5 +200,4 @@ plt.annotate(info_text, xy=(0.02, 0.85), xycoords="axes fraction", fontsize=12,
205
  plt.ylim(0, total_events * 1.2) # Maximale Höhe etwas erhöhen für bessere Lesbarkeit
206
  plt.grid(axis="y", linestyle="--", alpha=0.7)
207
 
208
- # 📈 Zeige den Graphen
209
  plt.show()
 
27
  if dates:
28
  filtered_elements.append(el)
29
  print(f"{len(filtered_elements)} Testdatensätze in der Datenbank")
30
+ return filtered_elements
31
 
32
  def event_similarity(actual, predicted):
33
  # Liste der Attribute, die verglichen werden
 
49
  "organizers": len([organizer for organizer in predicted.organizers if organizer in actual.organizers]) / len(actual.organizers) if actual.organizers and predicted.organizers else int(actual.organizers == predicted.organizers),
50
  "location": len([location for location in predicted.locations if location in actual.locations]) / len(actual.locations) if actual.locations and predicted.locations else int(actual.locations == predicted.locations),
51
  }
 
52
 
53
  # Berechnung der Gesamtähnlichkeit
54
  similarity_percentage = (sum(match_results.values()) / len(attributes)) * 100
 
77
  gc.collect()
78
  if not all(f not in el.get("markdown", "") for f in filter_data):
79
  continue
 
 
 
80
  print(f"************************ Processing {count + 1} **********************************")
81
  actual_event = Event()
82
  actual_event.url = el.get("url")
 
165
 
166
 
167
  # 📂 CSV-Datei einlesen
168
+ df = pd.read_csv("results.csv", delimiter=" ")
169
 
170
  # 🏆 Summiere die Anzahl der Übereinstimmungen für jede Kategorie
171
  field_sums = {
 
181
  percentages = {key: (value / total_events) * 100 for key, value in field_sums.items()} # Berechne Prozentwerte
182
 
183
  plt.figure(figsize=(10, 6))
 
184
 
185
+ colors = ["cornflowerblue", "lightsalmon", "lightgreen", "lightcoral", "mediumpurple", "skyblue"]
186
+ bars = plt.bar(field_sums.keys(), field_sums.values(), color=colors)
187
+
188
  for bar, (key, percent) in zip(bars, percentages.items()):
189
+ plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() - (total_events * 0.05),
190
+ f"{percent:.1f}%", ha="center", va="top", fontsize=10, color="black")
191
 
 
192
  plt.xlabel("Event Attribute")
193
  plt.ylabel("Anzahl der Übereinstimmungen")
194
  plt.title("Summierte Übereinstimmungen pro Event-Attribut")
195
 
 
196
  info_text = f"Getestete Daten: {total_events}\nDurchschnittliche Verarbeitungszeit: {float(df['extraction_time'].sum()) / total_events:.2f}s"
197
  plt.annotate(info_text, xy=(0.02, 0.85), xycoords="axes fraction", fontsize=12, ha="left",
198
  bbox=dict(facecolor="white", alpha=0.8))
 
200
  plt.ylim(0, total_events * 1.2) # Maximale Höhe etwas erhöhen für bessere Lesbarkeit
201
  plt.grid(axis="y", linestyle="--", alpha=0.7)
202
 
 
203
  plt.show()