manaviel85370 committed on
Commit
eaebaa4
·
1 Parent(s): 82f20dd

try date extraction

Browse files
src/nlp/experimental/textclassification/classy_classifier_date.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from classy_classification import ClassyClassifier
2
+ import pickle
3
+ import spacy
4
+ from spacy import displacy
5
+ from spacy.tokenizer import Tokenizer
6
+ from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
7
+ from nltk import Tree
8
+ from src.nlp.data.test_texts import TEXTS
9
+ from src.nlp.playground.textclassification import ZeroShotClassifier, CustomMode
10
+ from src.utils.helpers import normalize_data
11
+ from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer
12
+
13
# Few-shot training examples for the date-context classifier, keyed by label.
#
# Every concrete date is already written as the literal placeholder "[DATE]".
#   * "EVENT_DATE": snippets in which the date is when the event takes place.
#   * "OTHER":      snippets in which the date refers to something else
#                   (registration deadlines, ticket pre-sale, submissions, ...).
#
# NOTE: every entry must end with a comma. Python silently concatenates
# adjacent string literals, so a missing comma merges two examples into one.
classifier_train_data = {
    "EVENT_DATE": [
        "Termin: [DATE], 19:00",
        "[DATE]",
        "Unser Meetup ist am [DATE] um 18:30 Uhr.",
        "Konzert: [DATE]",
        "Das Festival startet am [DATE]",
        "Die Show findet am [DATE] um 20:00 Uhr statt.",
        "[DATE] – Save the Date!",
        "Das Webinar beginnt am [DATE] um 16:00 Uhr.",
        "[DATE] – Große Premiere im Theater!",
        "Event am [DATE], komm vorbei!",
        "[DATE] – Silvesterparty!",
        "Fußballspiel: [DATE], 15:30 Uhr",
        "Live-Musik am [DATE]",
        "[DATE] – Infos folgen!",
        "[DATE] um 20:00 Uhr",
        "Termin: [DATE], 18:00 Uhr",
        "Wann? [DATE], 19:00 bis 20:00 Uhr",
        "Das Konzert findet am [DATE] statt.",
        "Save the Date: [DATE]!",
        "Nächste Veranstaltung: [DATE]",
        "[DATE] – große Feier!",
        "Konzert am [DATE], 20:00 Uhr",
        "[DATE]",
        "Festival: [DATE] – [DATE]",
        "[DATE] – nicht verpassen!",
        "Sportevent: [DATE]",
        "Networking-Event am [DATE], 17:00 Uhr",
        "Workshop: [DATE], 14:00 Uhr",
        "Firmenfeier: [DATE] ab 18:30 Uhr",
        "Seminar: [DATE], Beginn um 10:00 Uhr",
        "Schulung am [DATE] um 15:00 Uhr",
        "Jubiläumsfeier am [DATE], 19:30 Uhr",
        "[DATE] 23:00",
        "[DATE] 23:00",
        "[DATE] 23:00",
        "Datum: [DATE], Startzeit: 10:00, Endzeit: 12:00",
        "Samstag, [DATE], Einlass: 15:59, Beginn: 17:30, Preis: 65,73 EUR",
        "Samstag, [DATE], Einlass: 17:00, Beginn: 19:00, Preis: 74,99 EUR",
        "Samstag, [DATE], Einlass: 18:00, Beginn: 20:00",
        "Freitag, [DATE], Einlass: 17:00, Beginn: 19:00, Preis: 77,93 EUR",
        "Samstag, [DATE], Einlass: 16:30, Beginn: 18:30, Preis: ab 69,99 Euro",
        "Gestört aber GeiL – Das Festival • [DATE], 16:00 • Berlin",
        "Samstag, [DATE], Einlass: 17:00, Beginn: 19:00",
        "Kaufberatung: [DATE] um 19:00",
        "Bedienung: [DATE] um 19:00",
        "[DATE] Ganztägig",
        "ab dem [DATE]",
        "Mittwoch, [DATE], von 18:00-22:00",
        "Mittwoch, [DATE], von 18:00-22:00",
        "Augsburger Friedensgespräche am [DATE]",
        "Augsburger Friedensgespräche am [DATE]",
        "Augsburger Friedensgespräche am [DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Winzerglühwein Do [DATE] 17:00 - 19:00",
        "Winzerglühwein Fr [DATE] 16:30 - 18:30",
        "Winzerglühwein Sa [DATE] 15:30 - 15:30",
        "Lessons and Carols Sa [DATE] 19:30 - 21:00",
        "[DATE] - [DATE]",
        "Vom [DATE] - [DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Do [DATE] 17:00 - 19:00",
        "Fr [DATE] 16:30 - 18:30",
        "Sa [DATE] 15:30 - 15:30",
        "Sa [DATE] 19:30 - 21:00",
        "[DATE]",
        "[DATE] - [DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE] und [DATE]",
        "[DATE]",
        "[DATE] - [DATE]",
        "Am [DATE] endet der Weihnachtsmarkt bereits um 19:00.",
        "Vom [DATE] - [DATE]",
        "[DATE] und [DATE]",
        "Am [DATE] ab 19:00",
        "Am [DATE] ab 19:00",
        "Am [DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE]",
        "[DATE] 16:00 – [DATE] 17:00",
        "[DATE] 10:15 – [DATE] 12:30",
        "[DATE] 10:00 – [DATE] 18:00",
        "[DATE]",
        "[DATE] 11:00 – [DATE] 18:00",
        "[DATE] - [DATE]",
        "[DATE] | 19:30",
        "[DATE]",
        "[DATE] bis einschließlich [DATE]",
        "[DATE], [DATE], [DATE] und [DATE]",
        "[DATE] 18:00",
        "[DATE] 13:00-21:00",
    ],
    "OTHER": [
        "Der Vorverkauf startet am [DATE].",
        "Anmeldefrist: [DATE]",
        "Tickets sind bis zum [DATE] erhältlich.",
        "Call for Papers läuft bis zum [DATE].",
        "Die Registrierung endet am [DATE].",
        "Bewerbungsschluss: [DATE].",
        "Frühbucherrabatt bis zum [DATE]!",
        "Einreichungsfrist: [DATE]",
        "Die Akkreditierung läuft bis zum [DATE]",
        "Reservierungen sind bis zum [DATE] möglich.",
        "Der Ticketverkauf startet am [DATE]",
        "Letzte Chance zur Anmeldung: [DATE]",
        "Call for Speakers läuft bis [DATE]",
        "Bitte reicht eure Abstracts bis [DATE] ein.",
        "Akkreditierung endet am [DATE]",
        "Bewerbungsschluss: [DATE]",
        "Die Early-Bird-Phase läuft bis zum [DATE]",
        "Anmeldefrist für Workshops: [DATE]",
        "Die Frist für Sponsorings endet am [DATE]",
        "Vergünstigte Tickets bis zum [DATE] verfügbar!",
    ],
}
133
+
134
+
135
+
136
# Blank German pipeline: only rule-based components are added below.
nlp = spacy.blank("de")
nlp.add_pipe("sentencizer")

# 1) Tokenizer rules.
#    - Prefixes: the language defaults, so leading punctuation such as "(",
#      quotes or markdown "*" is split off and does not stay glued to the
#      first number of a date.
#    - Suffixes: the defaults plus a trailing "." so a sentence-final period
#      is split off.
#    - Infixes: the defaults plus "." and ":" between two digits, so that
#      "12.03.2025" and "19:00" are broken into digit / separator tokens.
prefixes = list(nlp.Defaults.prefixes)
suffixes = list(nlp.Defaults.suffixes) + [r"\."]
infixes = list(nlp.Defaults.infixes) + [r"(?<=\d)\.(?=\d)"] + [r"(?<=\d)\:(?=\d)"]

# Compile the rule lists into regex objects.
prefix_re = compile_prefix_regex(prefixes)
suffix_re = compile_suffix_regex(suffixes)
infix_re = compile_infix_regex(infixes)

# Install the customized tokenizer. A Tokenizer constructed by hand only
# applies the rules passed in here, so prefix handling must be given
# explicitly or it is disabled.
nlp.tokenizer = Tokenizer(
    nlp.vocab,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
)

# 2) Entity ruler that tags numeric dates and clock times.
ruler = nlp.add_pipe("entity_ruler")

patterns = [
    {
        # Two digits, ".", two digits, ".", four digits (e.g. 12.03.2025).
        "label": "DATE",
        "pattern": [
            {"SHAPE": "dd"}, {"ORTH": "."}, {"SHAPE": "dd"}, {"ORTH": "."}, {"SHAPE": "dddd"}
        ],
    },
    {
        # Two digits, ":", two digits (e.g. 19:00).
        "label": "TIME",
        "pattern": [
            {"SHAPE": "dd"}, {"ORTH": ":"}, {"SHAPE": "dd"}
        ],
    },
]

ruler.add_patterns(patterns)
169
+
170
+
171
# Prepare training data: use placeholders for times and dates.
_ENTITY_PLACEHOLDERS = {"DATE": "[DATE]", "TIME": "[TIME]"}


def _mask_entities(text):
    """Return *text* normalized, with DATE/TIME entities replaced by placeholders.

    The text is first passed through the project helper ``normalize_data``
    and then through the ``nlp`` pipeline; each recognized DATE or TIME
    entity string is replaced everywhere in the text by "[DATE]" or "[TIME]".
    """
    text = normalize_data(text)
    # Entities come from the normalized text as it was before any masking.
    for ent in nlp(text).ents:
        placeholder = _ENTITY_PLACEHOLDERS.get(ent.label_)
        if placeholder is not None:
            text = text.replace(ent.text, placeholder)
    return text


# Mask every example and drop duplicates. dict.fromkeys keeps the first
# occurrence of each text in its original position, so the training order
# (and the debug print below) is identical on every run; a set would yield
# a different order between interpreter runs because of string hash
# randomization.
classifier_train_data_cleaned = {
    label: list(dict.fromkeys(_mask_entities(example) for example in examples))
    for label, examples in classifier_train_data.items()
}
print(classifier_train_data_cleaned["EVENT_DATE"])

# Few-shot classifier trained on the masked examples, using a multilingual
# sentence-embedding model.
classifier = ClassyClassifier(data=classifier_train_data_cleaned)
classifier.set_embedding_model(model="stsb-xlm-r-multilingual")
199
+
200
+
201
# Run the pipeline over the sample texts: for every markdown block that
# contains at least one DATE/TIME entity, mask the entities and print the
# label the classifier scores highest.
separator = "*" * 100
placeholder_by_label = {"DATE": "[DATE]", "TIME": "[TIME]"}

for raw_text in TEXTS:
    markdown = MarkdownAnalyzer(normalize_data(raw_text))
    blocks = markdown.identify_all().get("block_elements")
    for block in blocks:
        block_text = block.text
        doc = nlp(block_text)

        # Debugging aid for the tokenizer:
        # print("Tokens:", [token.text for token in doc])

        # Blocks without any recognized entity are not classified.
        if not doc.ents:
            continue

        print(separator)

        # Replace each recognized entity string with its placeholder.
        modified_text = block_text
        for entity in doc.ents:
            print(entity.text, entity.label_)
            replacement = placeholder_by_label.get(entity.label_)
            if replacement is not None:
                modified_text = modified_text.replace(entity.text, replacement)

        cats = classifier(modified_text)
        print(modified_text)
        best_label = max(cats, key=cats.get)
        print(f"{best_label}")
        print(separator)
226
+
227
+
228
+ # for text in test_data["EVENT_DATE"]:
229
+ # print(text)
230
+ # print("*"*100)
231
+ # # print(nlp(text)._.cats)
232
+ # cats = classifier(text)
233
+ # print(f"{max(cats,key=cats.get)}")
234
+ # print("*"*100)
235
+ #
236
+ # print("\n\n\n\n\n")
237
+ # for text in test_data["OTHER"]:
238
+ # print(text)
239
+ # print("*"*100)
240
+ # # print(nlp(text)._.cats)
241
+ # cats = classifier(text)
242
+ # print(f"{max(cats,key=cats.get)}")
243
+ # print("*"*100)