# manaviel85370
# add pages and all
# da88570
import json
import spacy
def convert_to_spacy_annotations(data):
    """Convert annotated samples into ``(text, {"entities": [...]})`` tuples.

    Each sample is a dict with a ``"text"`` string and an ``"entities"`` list
    whose items carry the literal substring to mark (``"data"``) and its
    ``"label"``. Character offsets are derived by locating the substring in
    the text.

    Args:
        data: Iterable of sample dicts as described above.

    Returns:
        A list with one ``(text, {"entities": [(start, end, label), ...]})``
        tuple per sample, where ``text[start:end]`` equals the entity string.
        Entities are kept in the order they were listed.

    Notes:
        * An entity whose string does not occur in the text is dropped.
        * An entity with an empty string is dropped; it would otherwise
          produce a zero-length span at offset 0.
        * When the same string is listed several times for one text, each
          listing is mapped to the next occurrence rather than all of them
          resolving to the first one. A listing with no remaining occurrence
          is dropped instead of duplicating an earlier span.
    """
    spacy_annotations = []
    for item in data:
        text = item["text"]
        entities = []
        # Per entity string: where the next search has to start, so repeated
        # annotations of the same string advance through the text.
        next_search_pos = {}
        for entity in item["entities"]:
            needle = entity["data"]
            if not needle:
                continue
            start_idx = text.find(needle, next_search_pos.get(needle, 0))
            if start_idx == -1:
                continue
            end_idx = start_idx + len(needle)
            next_search_pos[needle] = end_idx
            entities.append((start_idx, end_idx, entity["label"]))
        spacy_annotations.append((text, {"entities": entities}))
    return spacy_annotations
# Example JSON data: German-language event snippets, each paired with the
# date strings to annotate. "data" holds the literal substring of "text" and
# "label" is either EVENT_DATE or EVENT_DATE_RANGE. A sample with an empty
# "entities" list carries no annotation.
data = [
    {
        "text": "Im Rahmen der Jubiläumstour zu Ehren von Lutz Kayser (\"50 Jahre OTRAG – Oribital Transport und Raketen AG\") machen Film und Regisseur am 05.12.2024 im Technik-Salon Station in der TIB in Hannover.",
        "entities": [{"data": "05.12.2024", "label": "EVENT_DATE"}],
    },
    {
        "text": "Wann? 05.12.2024, 19:00-21:00",
        "entities": [{"data": "05.12.2024", "label": "EVENT_DATE"}],
    },
    {
        "text": "Der siebte Workshop Retrodigitalisierung findet am 20.03.2025 und 21.03.2025 bei ZB MED – Informationszentrum Lebenswissenschaften in Köln statt.",
        "entities": [
            {"data": "20.03.2025", "label": "EVENT_DATE"},
            {"data": "21.03.2025", "label": "EVENT_DATE"},
        ],
    },
    {
        "text": "Wann? 20.03.2025 - 21.03.2025",
        "entities": [{"data": "20.03.2025 - 21.03.2025", "label": "EVENT_DATE_RANGE"}],
    },
    {
        "text": "Die 18. ACM International Conference on Web Search and Data Mining (WSDM 2025) wird vom 10.03.2025 - 14.03.2025 in Hannover stattfinden.",
        "entities": [{"data": "10.03.2025 - 14.03.2025", "label": "EVENT_DATE_RANGE"}],
    },
    {
        "text": "So. 08.12.2024 12:15 - 13:15 CET",
        "entities": [{"data": "08.12.2024", "label": "EVENT_DATE"}],
    },
    {
        "text": "24.12.2025 um 16:00",
        "entities": [{"data": "24.12.2025", "label": "EVENT_DATE"}],
    },
    {
        "text": "07.01.2025",
        "entities": [{"data": "07.01.2025", "label": "EVENT_DATE"}],
    },
    {
        "text": "Am 01.07.2025",
        "entities": [{"data": "01.07.2025", "label": "EVENT_DATE"}],
    },
    {
        "text": "Wann? 07.11.2024 - 09.03.2025",
        "entities": [{"data": "07.11.2024 - 09.03.2025", "label": "EVENT_DATE_RANGE"}],
    },
    {
        "text": "01.11.2024 - 09.03.2025",
        "entities": [{"data": "01.11.2024 - 09.03.2025", "label": "EVENT_DATE_RANGE"}],
    },
    {
        "text": "01.09.2025 - 26.12.2024",
        "entities": [{"data": "01.09.2025 - 26.12.2024", "label": "EVENT_DATE_RANGE"}],
    },
    {
        "text": "Premiere am 01.12.2024",
        "entities": [{"data": "01.12.2024", "label": "EVENT_DATE"}],
    },
    {
        "text": "01.11.2025 ab 16:00",
        "entities": [{"data": "01.11.2025", "label": "EVENT_DATE"}],
    },
    {
        "text": "15.01.2025 ab 18:00",
        "entities": [{"data": "15.01.2025", "label": "EVENT_DATE"}],
    },
    {
        "text": "01.12.2025 ab 14:00-15:00",
        "entities": [{"data": "01.12.2025", "label": "EVENT_DATE"}],
    },
    {
        "text": "18.01.2025 ab 11:00-18:00",
        "entities": [{"data": "18.01.2025", "label": "EVENT_DATE"}],
    },
    {
        "text": "01.12.2024 – 08.12.2024",
        "entities": [{"data": "01.12.2024 – 08.12.2024", "label": "EVENT_DATE_RANGE"}],
    },
    {
        "text": "Freitag 16:00-20:00 / Samstag 11:00-20:00 / Sonntag 11:00-18:00",
        "entities": [],
    },
    {
        "text": "So, 15.12.2024 Beginn: 15:00 Einlass: 14:30",
        "entities": [{"data": "15.12.2024", "label": "EVENT_DATE"}],
    },
]
# Convert the sample data into the spaCy annotation format.
annotations = convert_to_spacy_annotations(data)

# Serialise once; the same string is written to disk and echoed below.
# ensure_ascii=False keeps umlauts and dashes readable in the output.
serialized = json.dumps(annotations, ensure_ascii=False, indent=4)

# Save the JSON one directory above the current working directory.
with open("../annotations.json", "w", encoding="utf-8") as f:
    f.write(serialized)

# Print the result for a quick visual check.
print(serialized)