File size: 5,672 Bytes
6fc4565
 
67e898c
6fc4565
67e898c
58c260c
6fc4565
67e898c
 
 
 
 
 
 
 
 
 
58c260c
 
6fc4565
 
67e898c
 
 
fc86982
 
 
 
14a5766
 
58c260c
fc86982
58c260c
67e898c
 
fc86982
67e898c
6fc4565
67e898c
 
 
 
 
6fc4565
67e898c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6fc4565
67e898c
6fc4565
 
 
67e898c
 
 
 
6fc4565
67e898c
 
 
 
14a5766
 
67e898c
14a5766
 
 
 
 
67e898c
6fc4565
67e898c
14a5766
67e898c
 
 
 
 
6fc4565
67e898c
6fc4565
 
67e898c
6fc4565
67e898c
 
 
6fc4565
 
 
14a5766
58c260c
14a5766
6fc4565
 
 
14a5766
6fc4565
 
 
 
 
 
 
58c260c
6fc4565
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import re

from src.nlp.playground.ner import GlinerHandler
from src.nlp.playground.pipelines.address_extractor import AddressExtractor
from src.nlp.playground.pipelines.date_extractor_v2 import ScheduleExtractor
from src.nlp.playground.pipelines.date_extractor_v3 import ScheduleExtractorV3
from src.nlp.playground.pipelines.description_extractor import DescriptionExtractor
from src.nlp.playground.pipelines.title_extractor import TitleExtractor
from src.nlp.playground.textclassification import ZeroShotClassifier, CustomMode
from src.utils.Event import Event


class EventDataExtractor:
    """Populate an ``Event`` from raw event text using a set of extractors.

    Each ``extract_*`` method fills one field of the event. Progress and
    intermediate results are reported with ``print``.
    """

    # Minimum classifier score for an audience label to be kept as a category.
    _AUDIENCE_MIN_SCORE = 0.8
    # Minimum entity score for a price entity to be considered.
    _PRICE_MIN_SCORE = 0.4

    _AUDIENCE_LABELS = ("Kinder_und_Familie", "Adults_only")
    _TOPIC_LABELS = (
        "Kunst", "Kultur", "Musik", "Sport", "Bildung", "Tanz", "Wissenschaft",
        "Unterhaltung", "Gesundheit", "Wellness", "Business", "Politik",
        "Religion",
    )
    _TYPE_LABELS = (
        "Oper", "Theater", "Konzert", "Musical", "Gottesdienst", "Ausstellung",
        "Museum", "Planetarium", "Führung", "Dokumentation", "Film", "Kino",
        "Vortrag", "Show", "Wettkampf", "Markt", "Feier", "Party",
        "Infoveranstaltung",
    )
    _ENTRANCE_LABELS = ("Eintritt frei", "Ticket", "Preis")

    # A digit followed (optionally after whitespace) by a euro marker.
    _CURRENCY_PATTERN = re.compile(r'\d\s*(€|EUR|eur|Eur|Euro|euro|euros|EURO)')
    # An integer or a decimal number using "." or "," as separator.
    _NUMBER_PATTERN = re.compile(r'\d+(?:[.,]\d+)?')

    def __init__(self):
        """Instantiate all extractors and model handlers used by ``extract``."""
        self.title_extractor = TitleExtractor()
        self.zero_shot_classifier = ZeroShotClassifier()
        self.gliner_handler = GlinerHandler()
        # ScheduleExtractorV3 is used in place of the v2 ScheduleExtractor.
        self.schedule_extractor = ScheduleExtractorV3()
        self.address_extractor = AddressExtractor()
        self.description_extractor = DescriptionExtractor()

    @staticmethod
    def _dedupe(items):
        """Return *items* as a list without duplicates, keeping first-seen order."""
        return list(dict.fromkeys(items))

    def _top_label(self, text, labels, template):
        """Return the label of the first classifier result, or ``None``.

        The classifier is called with a ``CustomMode`` built from *labels*
        and *template*. ``None`` is returned when the classifier yields no
        results.
        """
        # NOTE(review): the first result is presumably the best-scoring one;
        # confirm against ZeroShotClassifier.classify.
        results = self.zero_shot_classifier.classify(
            text, CustomMode(list(labels), template))
        return results[0].label if results else None

    def extract(self, data):
        """Run every extractor on *data* and return the populated ``Event``."""
        print("Starting extraction process...")
        event = Event()
        event.title = self.extract_title(data)
        event.categories = self.extract_categories(data)
        event.address = self.extract_address(data)
        # Locations are filtered against the address, so it must come first.
        event.locations = self.extract_locations(data, event.address)
        event.organizers = self.extract_organizers(data)
        event.schedule = self.extract_schedule(data)
        # The description extractor receives the already extracted title.
        event.description = self.extract_description(data, event.title)
        event.prices = self.extract_prices(data)

        print("Extraction process completed.")
        return event

    def extract_title(self, md):
        """Return the title produced by the title extractor for *md*."""
        print("Extracting title...")
        title = self.title_extractor.extract_title(md)
        print(f"Extracted title: {title}")
        return title

    def extract_categories(self, text):
        """Return the category labels assigned to *text*.

        The result contains every audience label scoring at least
        ``_AUDIENCE_MIN_SCORE``, followed by one topic label and one event
        type label (each omitted if the classifier returns nothing).
        """
        print("Extracting categories...")

        audience_results = self.zero_shot_classifier.classify(
            text,
            CustomMode(list(self._AUDIENCE_LABELS), "Die Veranstaltung ist für {}"),
        )
        categories = [
            cat.label
            for cat in audience_results
            if cat.score >= self._AUDIENCE_MIN_SCORE
        ]

        single_label_queries = (
            (self._TOPIC_LABELS, "In der Veranstaltung geht es um {}"),
            (self._TYPE_LABELS, "Die Art der Veranstaltung ist {}"),
        )
        for labels, template in single_label_queries:
            label = self._top_label(text, labels, template)
            if label is not None:
                categories.append(label)

        print(f"Extracted categories: {categories}")
        return categories

    def extract_locations(self, data, address):
        """Return the distinct "Lokalität" entities found in *data*.

        Empty texts and texts that are a substring of *address* are dropped.
        A falsy *address* is treated as an empty string. Results keep the
        order in which the entities were reported.
        """
        address = address if address else ""
        print("Extracting locations...")
        entities = self.gliner_handler.extract_entities(data, ["Lokalität", "Adresse"])
        print(entities)
        if not entities:
            return []
        return self._dedupe(
            entity["text"]
            for entity in entities
            if entity["label"] == "Lokalität"
            and entity["text"] != ""
            and entity["text"] not in address
        )

    def extract_organizers(self, data):
        """Return the distinct "EVENT_ORGANIZER" entities found in *data*.

        Results keep the order in which the entities were reported; a falsy
        entity result yields an empty list.
        """
        print("Extracting organizers...")
        entities = self.gliner_handler.extract_entities(data, ["EVENT_ORGANIZER"]) or []
        organizers = self._dedupe(
            item["text"] for item in entities if item["label"] == "EVENT_ORGANIZER"
        )

        print(f"Extracted organizers: {organizers}")
        return organizers

    def extract_address(self, data):
        """Return the address produced by the address extractor for *data*."""
        print("Extracting address...")
        return self.address_extractor.extract_address(data)

    def extract_schedule(self, data):
        """Return the schedule produced by the schedule extractor for *data*."""
        print("Extracting schedule...")
        return self.schedule_extractor.extract(data)

    def extract_prices(self, data):
        """Return the prices found in *data* as strings such as ``"12.50 €"``.

        Only "Eintrittspreis" entities that mention a euro amount and score at
        least ``_PRICE_MIN_SCORE`` are used; every number in such an entity is
        reported, with "," normalised to ".". When no price is found and the
        classifier picks "Eintritt frei", ``["kostenlos"]`` is returned.
        """
        print("Extracting prices...")
        entities = self.gliner_handler.extract_entities(data, ["Eintrittspreis"]) or []
        print(entities)

        price_texts = [
            e["text"]
            for e in entities
            if e["text"]
            and self._CURRENCY_PATTERN.search(e["text"])
            and e["score"] >= self._PRICE_MIN_SCORE
        ]
        prices = [
            number.replace(",", ".") + " €"
            for price_text in price_texts
            for number in self._NUMBER_PATTERN.findall(price_text)
        ]
        if prices:
            # The free-entrance check only matters when no price was found,
            # so the classifier call is skipped here.
            return prices

        entrance_label = self._top_label(
            data, self._ENTRANCE_LABELS, "Der Eintritt zur Veranstaltung ist mit {}.")
        if entrance_label == "Eintritt frei":
            return ["kostenlos"]
        return prices

    def extract_description(self, data, title):
        """Return the description produced by the description extractor."""
        return self.description_extractor.extract_description(data, title)