File size: 3,980 Bytes
da88570 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
import json
import spacy
def convert_to_spacy_annotations(data):
    """Convert annotated records into ``(text, {"entities": [...]})`` tuples.

    Each record in *data* must provide a ``"text"`` string and an
    ``"entities"`` list whose items carry a ``"data"`` substring and a
    ``"label"``.  Every entity is resolved to character offsets in the text.

    Entities are matched in the order given.  An entity is placed on the
    first occurrence of its substring that does not overlap a span already
    claimed by an earlier entity of the same record, so the same string
    listed twice maps to two successive occurrences instead of the same
    span twice.  Entities whose substring is empty, is not found, or has no
    non-overlapping occurrence left are skipped.

    Args:
        data: Iterable of record mappings as described above.

    Returns:
        A list with one ``(text, {"entities": [(start, end, label), ...]})``
        tuple per record, where ``end`` is exclusive.

    Raises:
        KeyError: If a record lacks ``"text"``/``"entities"`` or an entity
            lacks ``"data"``/``"label"``.
    """
    spacy_annotations = []
    for item in data:
        text = item["text"]
        entities = []
        for entity in item["entities"]:
            needle = entity["data"]
            if not needle:
                # An empty string would "match" at offset 0 and produce a
                # zero-length span, which is not a usable annotation.
                continue
            start = text.find(needle)
            while start != -1:
                end = start + len(needle)
                # Accept the occurrence only if it is disjoint from every
                # span collected so far for this text.
                if all(end <= s or start >= e for s, e, _ in entities):
                    entities.append((start, end, entity["label"]))
                    break
                # Otherwise try the next occurrence further right.
                start = text.find(needle, start + 1)
        spacy_annotations.append((text, {"entities": entities}))
    return spacy_annotations
# Example JSON-style data.
# Each record has a "text" string and an "entities" list; every entity gives
# the annotated substring under "data" and its label under "label".  The
# labels used below are "EVENT_DATE" and "EVENT_DATE_RANGE".
data = [
{
# NOTE(review): "Oribital" in this text looks like a misspelling of "Orbital";
# it is left as-is because the string is annotation input.
"text": "Im Rahmen der Jubiläumstour zu Ehren von Lutz Kayser (\"50 Jahre OTRAG – Oribital Transport und Raketen AG\") machen Film und Regisseur am 05.12.2024 im Technik-Salon Station in der TIB in Hannover.",
"entities": [{ "data": "05.12.2024", "label": "EVENT_DATE" }]
},
{
"text": "Wann? 05.12.2024, 19:00-21:00",
"entities": [{ "data": "05.12.2024", "label": "EVENT_DATE" }]
},
{
# Two separate single dates in one sentence, each labelled on its own.
"text": "Der siebte Workshop Retrodigitalisierung findet am 20.03.2025 und 21.03.2025 bei ZB MED – Informationszentrum Lebenswissenschaften in Köln statt.",
"entities": [
{ "data": "20.03.2025", "label": "EVENT_DATE" },
{ "data": "21.03.2025", "label": "EVENT_DATE" }
]
},
{
# A "start - end" pair is labelled as one range entity, not as two dates.
"text": "Wann? 20.03.2025 - 21.03.2025",
"entities": [{ "data": "20.03.2025 - 21.03.2025", "label": "EVENT_DATE_RANGE" }]
},
{
"text": "Die 18. ACM International Conference on Web Search and Data Mining (WSDM 2025) wird vom 10.03.2025 - 14.03.2025 in Hannover stattfinden.",
"entities": [{ "data": "10.03.2025 - 14.03.2025", "label": "EVENT_DATE_RANGE" }]
},
{
"text": "So. 08.12.2024 12:15 - 13:15 CET",
"entities": [{ "data": "08.12.2024", "label": "EVENT_DATE" }]
},
{
"text": "24.12.2025 um 16:00",
"entities": [{ "data": "24.12.2025", "label": "EVENT_DATE" }]
},
{
"text": "07.01.2025",
"entities": [{ "data": "07.01.2025", "label": "EVENT_DATE" }]
},
{
"text": "Am 01.07.2025",
"entities": [{ "data": "01.07.2025", "label": "EVENT_DATE" }]
},
{
"text": "Wann? 07.11.2024 - 09.03.2025",
"entities": [{ "data": "07.11.2024 - 09.03.2025", "label": "EVENT_DATE_RANGE" }]
},
{
"text": "01.11.2024 - 09.03.2025",
"entities": [{ "data": "01.11.2024 - 09.03.2025", "label": "EVENT_DATE_RANGE" }]
},
{
# NOTE(review): the second date (26.12.2024) is earlier than the first
# (01.09.2025) — confirm this reversed range is intentional.
"text": "01.09.2025 - 26.12.2024",
"entities": [{ "data": "01.09.2025 - 26.12.2024", "label": "EVENT_DATE_RANGE" }]
},
{
"text": "Premiere am 01.12.2024",
"entities": [
{
"data": "01.12.2024",
"label": "EVENT_DATE"
}
]
},
{
"text": "01.11.2025 ab 16:00",
"entities": [
{
"data": "01.11.2025",
"label": "EVENT_DATE"
}
]
},
{
"text": "15.01.2025 ab 18:00",
"entities": [
{
"data": "15.01.2025",
"label": "EVENT_DATE"
}
]
},
{
"text": "01.12.2025 ab 14:00-15:00",
"entities": [
{
"data": "01.12.2025",
"label": "EVENT_DATE"
}
]
},
{
"text": "18.01.2025 ab 11:00-18:00",
"entities": [
{
"data": "18.01.2025",
"label": "EVENT_DATE"
}
]
},
{
# This range is written with an en dash instead of a hyphen.
"text": "01.12.2024 – 08.12.2024",
"entities": [
{
"data": "01.12.2024 – 08.12.2024",
"label": "EVENT_DATE_RANGE"
}
]
},
{
# Record without entities: the text holds only weekdays and times.
"text": "Freitag 16:00-20:00 / Samstag 11:00-20:00 / Sonntag 11:00-18:00",
"entities": []
},
{
"text": "So, 15.12.2024 Beginn: 15:00 Einlass: 14:30",
"entities": [
{
"data": "15.12.2024",
"label": "EVENT_DATE"
}
]
}
]
# Convert the example records into (text, {"entities": [...]}) tuples.
annotations = convert_to_spacy_annotations(data)
# Save as UTF-8 JSON; the relative path resolves against the current
# working directory.  Tuples are written as JSON arrays.
with open("../annotations.json", "w", encoding="utf-8") as f:
    # Serialize once and reuse the text for both the file and the console.
    serialized = json.dumps(annotations, ensure_ascii=False, indent=4)
    f.write(serialized)
# Echo the saved JSON for a visual check.
print(serialized)
|