|
import json |
|
|
|
import spacy |
|
|
|
|
|
def convert_to_spacy_annotations(data):
    """Convert substring-labelled records into offset-based annotations.

    Each record in *data* is a mapping with a ``"text"`` string and an
    ``"entities"`` list whose elements carry the entity substring under
    ``"data"`` and its label under ``"label"``.

    Returns a list of ``(text, {"entities": [(start, end, label), ...]})``
    tuples, where ``start``/``end`` are character offsets into ``text`` and
    ``end`` is exclusive (``start + len(substring)``).

    Entities whose substring does not occur in the text are skipped. When the
    same substring is listed several times for one text, each listing is
    mapped to the next occurrence in the text instead of all of them
    collapsing onto the first match; listings beyond the number of
    occurrences are skipped.
    """
    spacy_annotations = []

    for item in data:
        text = item["text"]
        entities = []
        # Per entity string: offset at which the next search has to start so
        # that a repeated listing finds the following occurrence.
        search_from = {}

        for entity in item["entities"]:
            needle = entity["data"]
            start_idx = text.find(needle, search_from.get(needle, 0))
            if start_idx == -1:
                # Substring not (or no longer) present in the text.
                continue

            end_idx = start_idx + len(needle)
            entities.append((start_idx, end_idx, entity["label"]))
            search_from[needle] = end_idx

        spacy_annotations.append((text, {"entities": entities}))

    return spacy_annotations
|
|
|
|
|
|
|
# Sample records: each one holds the raw "text" plus the substrings ("data")
# to annotate inside it, together with the label for each substring.
data = [
    {
        # NOTE(review): "Oribital" looks like a typo for "Orbital"; it is part
        # of the runtime text and therefore left untouched — confirm.
        "text": "Im Rahmen der Jubiläumstour zu Ehren von Lutz Kayser (\"50 Jahre OTRAG – Oribital Transport und Raketen AG\") machen Film und Regisseur am 05.12.2024 im Technik-Salon Station in der TIB in Hannover.",
        "entities": [
            {"data": "05.12.2024", "label": "EVENT_DATE"},
        ],
    },
    {
        "text": "Wann? 05.12.2024, 19:00-21:00",
        "entities": [
            {"data": "05.12.2024", "label": "EVENT_DATE"},
        ],
    },
    {
        "text": "Der siebte Workshop Retrodigitalisierung findet am 20.03.2025 und 21.03.2025 bei ZB MED – Informationszentrum Lebenswissenschaften in Köln statt.",
        "entities": [
            {"data": "20.03.2025", "label": "EVENT_DATE"},
            {"data": "21.03.2025", "label": "EVENT_DATE"},
        ],
    },
    {
        "text": "Wann? 20.03.2025 - 21.03.2025",
        "entities": [
            {"data": "20.03.2025 - 21.03.2025", "label": "EVENT_DATE_RANGE"},
        ],
    },
    {
        "text": "Die 18. ACM International Conference on Web Search and Data Mining (WSDM 2025) wird vom 10.03.2025 - 14.03.2025 in Hannover stattfinden.",
        "entities": [
            {"data": "10.03.2025 - 14.03.2025", "label": "EVENT_DATE_RANGE"},
        ],
    },
    {
        "text": "So. 08.12.2024 12:15 - 13:15 CET",
        "entities": [
            {"data": "08.12.2024", "label": "EVENT_DATE"},
        ],
    },
    {
        "text": "24.12.2025 um 16:00",
        "entities": [
            {"data": "24.12.2025", "label": "EVENT_DATE"},
        ],
    },
    {
        "text": "07.01.2025",
        "entities": [
            {"data": "07.01.2025", "label": "EVENT_DATE"},
        ],
    },
    {
        "text": "Am 01.07.2025",
        "entities": [
            {"data": "01.07.2025", "label": "EVENT_DATE"},
        ],
    },
    {
        "text": "Wann? 07.11.2024 - 09.03.2025",
        "entities": [
            {"data": "07.11.2024 - 09.03.2025", "label": "EVENT_DATE_RANGE"},
        ],
    },
    {
        "text": "01.11.2024 - 09.03.2025",
        "entities": [
            {"data": "01.11.2024 - 09.03.2025", "label": "EVENT_DATE_RANGE"},
        ],
    },
    {
        # NOTE(review): the end date of this range precedes its start date —
        # confirm this sample is intentional.
        "text": "01.09.2025 - 26.12.2024",
        "entities": [
            {"data": "01.09.2025 - 26.12.2024", "label": "EVENT_DATE_RANGE"},
        ],
    },
    {
        "text": "Premiere am 01.12.2024",
        "entities": [
            {"data": "01.12.2024", "label": "EVENT_DATE"},
        ],
    },
    {
        "text": "01.11.2025 ab 16:00",
        "entities": [
            {"data": "01.11.2025", "label": "EVENT_DATE"},
        ],
    },
    {
        "text": "15.01.2025 ab 18:00",
        "entities": [
            {"data": "15.01.2025", "label": "EVENT_DATE"},
        ],
    },
    {
        "text": "01.12.2025 ab 14:00-15:00",
        "entities": [
            {"data": "01.12.2025", "label": "EVENT_DATE"},
        ],
    },
    {
        "text": "18.01.2025 ab 11:00-18:00",
        "entities": [
            {"data": "18.01.2025", "label": "EVENT_DATE"},
        ],
    },
    {
        "text": "01.12.2024 – 08.12.2024",
        "entities": [
            {"data": "01.12.2024 – 08.12.2024", "label": "EVENT_DATE_RANGE"},
        ],
    },
    {
        # Only weekday/time information here, so no entities are listed.
        "text": "Freitag 16:00-20:00 / Samstag 11:00-20:00 / Sonntag 11:00-18:00",
        "entities": [],
    },
    {
        "text": "So, 15.12.2024 Beginn: 15:00 Einlass: 14:30",
        "entities": [
            {"data": "15.12.2024", "label": "EVENT_DATE"},
        ],
    },
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Build the offset-based annotations from the sample records above.
annotations = convert_to_spacy_annotations(data)

# Serialize once and reuse the same JSON text for the file and for stdout;
# ensure_ascii=False keeps umlauts and dashes readable in the output.
with open("../annotations.json", "w", encoding="utf-8") as out_file:
    serialized = json.dumps(annotations, ensure_ascii=False, indent=4)
    out_file.write(serialized)

print(serialized)
|
|