"""Convert hand-labelled date snippets into spaCy-style offset annotations.

Each sample pairs a text with the literal substrings that should be tagged.
The script resolves every substring to character offsets and writes the
result as JSON.
"""

import json
import logging

try:
    # Nothing in this script calls spaCy, so a missing installation must not
    # prevent the annotation file from being generated.
    import spacy  # noqa: F401
except ImportError:
    spacy = None

logger = logging.getLogger(__name__)


def convert_to_spacy_annotations(data):
    """Resolve labelled substrings to ``(start, end, label)`` character spans.

    Args:
        data: Iterable of dicts, each with a ``"text"`` string and an
            ``"entities"`` list of dicts carrying ``"data"`` (the exact
            substring to locate) and ``"label"``.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        one per input item and in input order. ``end`` is exclusive.

    Entities are matched left to right. When the same substring is listed
    more than once for a text, each listing is bound to the next occurrence
    instead of all of them collapsing onto the first one. Entities that are
    empty, or that cannot be found (any more), are skipped with a warning.
    """
    spacy_annotations = []
    for item in data:
        text = item["text"]
        spans = []
        # For every entity string: the offset at which the search for its
        # next occurrence has to resume.
        resume_at = {}
        for entity in item["entities"]:
            needle = entity["data"]
            # str.find("") reports a match at offset 0, which would produce a
            # zero-width span; treat an empty entity as "not found" instead.
            start = text.find(needle, resume_at.get(needle, 0)) if needle else -1
            if start == -1:
                logger.warning(
                    "Skipping entity %r (%s): no unused occurrence in text %r",
                    needle,
                    entity.get("label"),
                    text,
                )
                continue
            end = start + len(needle)
            resume_at[needle] = end
            spans.append((start, end, entity["label"]))
        spacy_annotations.append((text, {"entities": spans}))
    return spacy_annotations


# Sample JSON data
data = [
    {
        "text": "Im Rahmen der Jubiläumstour zu Ehren von Lutz Kayser (\"50 Jahre OTRAG – Oribital Transport und Raketen AG\") machen Film und Regisseur am 05.12.2024 im Technik-Salon Station in der TIB in Hannover.",
        "entities": [{"data": "05.12.2024", "label": "EVENT_DATE"}],
    },
    {
        "text": "Wann? 05.12.2024, 19:00-21:00",
        "entities": [{"data": "05.12.2024", "label": "EVENT_DATE"}],
    },
    {
        "text": "Der siebte Workshop Retrodigitalisierung findet am 20.03.2025 und 21.03.2025 bei ZB MED – Informationszentrum Lebenswissenschaften in Köln statt.",
        "entities": [
            {"data": "20.03.2025", "label": "EVENT_DATE"},
            {"data": "21.03.2025", "label": "EVENT_DATE"},
        ],
    },
    {
        "text": "Wann? 20.03.2025 - 21.03.2025",
        "entities": [
            {"data": "20.03.2025 - 21.03.2025", "label": "EVENT_DATE_RANGE"},
        ],
    },
    {
        "text": "Die 18. ACM International Conference on Web Search and Data Mining (WSDM 2025) wird vom 10.03.2025 - 14.03.2025 in Hannover stattfinden.",
        "entities": [
            {"data": "10.03.2025 - 14.03.2025", "label": "EVENT_DATE_RANGE"},
        ],
    },
    {
        "text": "So. 08.12.2024 12:15 - 13:15 CET",
        "entities": [{"data": "08.12.2024", "label": "EVENT_DATE"}],
    },
    {
        "text": "24.12.2025 um 16:00",
        "entities": [{"data": "24.12.2025", "label": "EVENT_DATE"}],
    },
    {
        "text": "07.01.2025",
        "entities": [{"data": "07.01.2025", "label": "EVENT_DATE"}],
    },
    {
        "text": "Am 01.07.2025",
        "entities": [{"data": "01.07.2025", "label": "EVENT_DATE"}],
    },
    {
        # This sample contains a line break between the question and the range.
        "text": "Wann? \n07.11.2024 - 09.03.2025",
        "entities": [
            {"data": "07.11.2024 - 09.03.2025", "label": "EVENT_DATE_RANGE"},
        ],
    },
    {
        "text": "01.11.2024 - 09.03.2025",
        "entities": [
            {"data": "01.11.2024 - 09.03.2025", "label": "EVENT_DATE_RANGE"},
        ],
    },
    {
        # NOTE(review): the end date precedes the start date here; confirm
        # whether this sample is intentional.
        "text": "01.09.2025 - 26.12.2024",
        "entities": [
            {"data": "01.09.2025 - 26.12.2024", "label": "EVENT_DATE_RANGE"},
        ],
    },
    {
        "text": "Premiere am 01.12.2024",
        "entities": [{"data": "01.12.2024", "label": "EVENT_DATE"}],
    },
    {
        "text": "01.11.2025 ab 16:00",
        "entities": [{"data": "01.11.2025", "label": "EVENT_DATE"}],
    },
    {
        "text": "15.01.2025 ab 18:00",
        "entities": [{"data": "15.01.2025", "label": "EVENT_DATE"}],
    },
    {
        "text": "01.12.2025 ab 14:00-15:00",
        "entities": [{"data": "01.12.2025", "label": "EVENT_DATE"}],
    },
    {
        "text": "18.01.2025 ab 11:00-18:00",
        "entities": [{"data": "18.01.2025", "label": "EVENT_DATE"}],
    },
    {
        "text": "01.12.2024 – 08.12.2024",
        "entities": [
            {"data": "01.12.2024 – 08.12.2024", "label": "EVENT_DATE_RANGE"},
        ],
    },
    {
        "text": "Freitag 16:00-20:00 / Samstag 11:00-20:00 / Sonntag 11:00-18:00",
        "entities": [],
    },
    {
        "text": "So, 15.12.2024 Beginn: 15:00 Einlass: 14:30",
        "entities": [{"data": "15.12.2024", "label": "EVENT_DATE"}],
    },
]


def main():
    """Convert the sample data, write it to disk and echo it to stdout."""
    # Convert to spaCy format
    annotations = convert_to_spacy_annotations(data)

    # Save as JSON
    with open("../annotations.json", "w", encoding="utf-8") as f:
        json.dump(annotations, f, ensure_ascii=False, indent=4)

    # Print the result for inspection
    print(json.dumps(annotations, ensure_ascii=False, indent=4))


# Only touch the file system when executed as a script, not on import.
if __name__ == "__main__":
    main()