manaviel85370
committed on
Commit
·
eaebaa4
1
Parent(s):
82f20dd
try date extraction
Browse files
src/nlp/experimental/textclassification/classy_classifier_date.py
ADDED
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from classy_classification import ClassyClassifier
|
2 |
+
import pickle
|
3 |
+
import spacy
|
4 |
+
from spacy import displacy
|
5 |
+
from spacy.tokenizer import Tokenizer
|
6 |
+
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
|
7 |
+
from nltk import Tree
|
8 |
+
from src.nlp.data.test_texts import TEXTS
|
9 |
+
from src.nlp.playground.textclassification import ZeroShotClassifier, CustomMode
|
10 |
+
from src.utils.helpers import normalize_data
|
11 |
+
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer
|
12 |
+
|
13 |
+
# Hand-written training samples for the date classifier, keyed by label.
# "EVENT_DATE" holds sentences in which the date placeholder is the date of
# the event itself; "OTHER" holds sentences where it is some other date
# (deadlines, ticket sales, registration periods, ...).
# Dates are already written as the literal placeholder "[DATE]".
# Repeated entries are kept exactly as authored.
classifier_train_data = {
    "EVENT_DATE": [
        "Termin: [DATE], 19:00",
        "[DATE]",
        "Unser Meetup ist am [DATE] um 18:30 Uhr.",
        "Konzert: [DATE]",
        "Das Festival startet am [DATE]",
        "Die Show findet am [DATE] um 20:00 Uhr statt.",
        "[DATE] – Save the Date!",
        "Das Webinar beginnt am [DATE] um 16:00 Uhr.",
        "[DATE] – Große Premiere im Theater!",
        "Event am [DATE], komm vorbei!",
        "[DATE] – Silvesterparty!",
        "Fußballspiel: [DATE], 15:30 Uhr",
        "Live-Musik am [DATE]",
        "[DATE] – Infos folgen!",
        "[DATE] um 20:00 Uhr",
        "Termin: [DATE], 18:00 Uhr",
        # The trailing comma below was missing, which silently concatenated
        # this sample with the following one into a single string.
        "Wann? [DATE], 19:00 bis 20:00 Uhr",
        "Das Konzert findet am [DATE] statt.",
        "Save the Date: [DATE]!",
        "Nächste Veranstaltung: [DATE]",
        "[DATE] – große Feier!",
        "Konzert am [DATE], 20:00 Uhr",
        "[DATE]",
        "Festival: [DATE] – [DATE]",
        "[DATE] – nicht verpassen!",
        "Sportevent: [DATE]",
        "Networking-Event am [DATE], 17:00 Uhr",
        "Workshop: [DATE], 14:00 Uhr",
        "Firmenfeier: [DATE] ab 18:30 Uhr",
        "Seminar: [DATE], Beginn um 10:00 Uhr",
        "Schulung am [DATE] um 15:00 Uhr",
        "Jubiläumsfeier am [DATE], 19:30 Uhr",
        "[DATE] 23:00",
        "[DATE] 23:00",
        "[DATE] 23:00",
        "Datum: [DATE], Startzeit: 10:00, Endzeit: 12:00",
        "Samstag, [DATE], Einlass: 15:59, Beginn: 17:30, Preis: 65,73 EUR",
        "Samstag, [DATE], Einlass: 17:00, Beginn: 19:00, Preis: 74,99 EUR",
        "Samstag, [DATE], Einlass: 18:00, Beginn: 20:00",
        "Freitag, [DATE], Einlass: 17:00, Beginn: 19:00, Preis: 77,93 EUR",
        "Samstag, [DATE], Einlass: 16:30, Beginn: 18:30, Preis: ab 69,99 Euro",
        "Gestört aber GeiL – Das Festival • [DATE], 16:00 • Berlin",
        "Samstag, [DATE], Einlass: 17:00, Beginn: 19:00",
        "Kaufberatung: [DATE] um 19:00",
        "Bedienung: [DATE] um 19:00",
        "[DATE] Ganztägig",
        "ab dem [DATE]",
        "Mittwoch, [DATE], von 18:00-22:00",
        "Mittwoch, [DATE], von 18:00-22:00",
        "Augsburger Friedensgespräche am [DATE]",
        "Augsburger Friedensgespräche am [DATE]",
        "Augsburger Friedensgespräche am [DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Winzerglühwein Do [DATE] 17:00 - 19:00",
        "Winzerglühwein Fr [DATE] 16:30 - 18:30",
        "Winzerglühwein Sa [DATE] 15:30 - 15:30",
        "Lessons and Carols Sa [DATE] 19:30 - 21:00",
        "[DATE] - [DATE]",
        "Vom [DATE] - [DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Do [DATE] 17:00 - 19:00",
        "Fr [DATE] 16:30 - 18:30",
        "Sa [DATE] 15:30 - 15:30",
        "Sa [DATE] 19:30 - 21:00",
        "[DATE]",
        "[DATE] - [DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE] und [DATE]",
        "[DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Vom [DATE] - [DATE]",
        "[DATE] und [DATE]",
        "Am [DATE] ab 19:00",
        "Am [DATE] ab 19:00",
        "Am [DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE] 16:00 – [DATE] 17:00",
        "[DATE] 10:15 – [DATE] 12:30",
        "[DATE] 10:00 – [DATE] 18:00",
        "[DATE]",
        "[DATE] 11:00 – [DATE] 18:00",
        "[DATE] - [DATE]",
        "[DATE] | 19:30",
        "[DATE]",
        "[DATE] bis einschließlich [DATE]",
        "[DATE], [DATE], [DATE] und [DATE]",
        "[DATE] 18:00",
        "[DATE] 13:00-21:00",
    ],
    "OTHER": [
        "Der Vorverkauf startet am [DATE].",
        "Anmeldefrist: [DATE]",
        "Tickets sind bis zum [DATE] erhältlich.",
        "Call for Papers läuft bis zum [DATE].",
        "Die Registrierung endet am [DATE].",
        "Bewerbungsschluss: [DATE].",
        "Frühbucherrabatt bis zum [DATE]!",
        "Einreichungsfrist: [DATE]",
        "Die Akkreditierung läuft bis zum [DATE]",
        "Reservierungen sind bis zum [DATE] möglich.",
        "Der Ticketverkauf startet am [DATE]",
        "Letzte Chance zur Anmeldung: [DATE]",
        "Call for Speakers läuft bis [DATE]",
        "Bitte reicht eure Abstracts bis [DATE] ein.",
        "Akkreditierung endet am [DATE]",
        "Bewerbungsschluss: [DATE]",
        "Die Early-Bird-Phase läuft bis zum [DATE]",
        "Anmeldefrist für Workshops: [DATE]",
        "Die Frist für Sponsorings endet am [DATE]",
        "Vergünstigte Tickets bis zum [DATE] verfügbar!",
    ],
}
|
133 |
+
|
134 |
+
|
135 |
+
|
136 |
+
# Blank German pipeline; the sentencizer is the first component added.
nlp = spacy.blank("de")
nlp.add_pipe("sentencizer")

# 1) Treat the period as an additional suffix, and split on "." and ":" when
#    they sit between two digits, so dates and times break into separate
#    digit / punctuation tokens.
suffixes = [*nlp.Defaults.suffixes, r"\."]
infixes = [*nlp.Defaults.infixes, r"(?<=\d)\.(?=\d)", r"(?<=\d)\:(?=\d)"]

# Compile the extended rule lists into regex objects.
suffix_re = compile_suffix_regex(suffixes)
infix_re = compile_infix_regex(infixes)

# Install the customised tokenizer.
# NOTE(review): only suffix and infix handlers are passed here; no
# prefix_search or special-case rules are configured — confirm this is intended.
nlp.tokenizer = Tokenizer(
    nlp.vocab,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
)

# 2) Rule-based entity patterns, expressed on token shapes:
#    DATE = dd . dd . dddd    TIME = dd : dd
patterns = [
    {
        "label": "DATE",
        "pattern": [
            {"SHAPE": "dd"},
            {"ORTH": "."},
            {"SHAPE": "dd"},
            {"ORTH": "."},
            {"SHAPE": "dddd"},
        ],
    },
    {
        "label": "TIME",
        "pattern": [
            {"SHAPE": "dd"},
            {"ORTH": ":"},
            {"SHAPE": "dd"},
        ],
    },
]

# Add the entity ruler to the pipeline and register the patterns with it.
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
|
169 |
+
|
170 |
+
|
171 |
+
# Placeholder inserted for each entity label that should be masked.
_PLACEHOLDERS = {"DATE": "[DATE]", "TIME": "[TIME]"}


def _mask_entities(text, doc):
    """Return *text* with the DATE/TIME entities of *doc* replaced by placeholders.

    Args:
        text: The string that was (or corresponds to what was) passed to ``nlp``.
        doc: The processed document whose ``ents`` are inspected.

    Returns:
        ``text`` where every occurrence of a DATE entity's text is replaced by
        ``"[DATE]"`` and every occurrence of a TIME entity's text by
        ``"[TIME]"``. Entities with any other label are left untouched.
    """
    for ent in doc.ents:
        placeholder = _PLACEHOLDERS.get(ent.label_)
        if placeholder is not None:
            text = text.replace(ent.text, placeholder)
    return text


# Prepare training data: use placeholders for times and dates.
classifier_train_data_cleaned = {}
for label in ("EVENT_DATE", "OTHER"):
    masked_samples = []
    for text in classifier_train_data[label]:
        text = normalize_data(text)
        masked_samples.append(_mask_entities(text, nlp(text)))
    # Remove duplicates. dict.fromkeys keeps first-seen order, so the sample
    # order is the same on every run (a set's order varies with string hashing).
    classifier_train_data_cleaned[label] = list(dict.fromkeys(masked_samples))
print(classifier_train_data_cleaned["EVENT_DATE"])

classifier = ClassyClassifier(data=classifier_train_data_cleaned)
classifier.set_embedding_model(model="stsb-xlm-r-multilingual")


# Classify every markdown block of the test texts that contains a date or time.
for text in TEXTS:
    text = normalize_data(text)
    analyzer = MarkdownAnalyzer(text)
    # .get() yields None when the key is absent; fall back to an empty list
    # so the loop below is simply skipped instead of raising.
    md_elements = analyzer.identify_all().get("block_elements") or []
    for md_element in md_elements:
        doc = nlp(md_element.text)

        # Check tokenization:
        # print("Tokens:", [token.text for token in doc])

        if not doc.ents:
            continue

        print("*" * 100)

        # Report the recognised entities, then mask them for the classifier.
        for ent in doc.ents:
            print(ent.text, ent.label_)
        modified_text = _mask_entities(md_element.text, doc)

        cats = classifier(modified_text)
        print(modified_text)
        # Print the label with the highest score.
        print(max(cats, key=cats.get))
        print("*" * 100)
|
226 |
+
|
227 |
+
|
228 |
+
# for text in test_data["EVENT_DATE"]:
|
229 |
+
# print(text)
|
230 |
+
# print("*"*100)
|
231 |
+
# # print(nlp(text)._.cats)
|
232 |
+
# cats = classifier(text)
|
233 |
+
# print(f"{max(cats,key=cats.get)}")
|
234 |
+
# print("*"*100)
|
235 |
+
#
|
236 |
+
# print("\n\n\n\n\n")
|
237 |
+
# for text in test_data["OTHER"]:
|
238 |
+
# print(text)
|
239 |
+
# print("*"*100)
|
240 |
+
# # print(nlp(text)._.cats)
|
241 |
+
# cats = classifier(text)
|
242 |
+
# print(f"{max(cats,key=cats.get)}")
|
243 |
+
# print("*"*100)
|