import re

from src.nlp.playground.ner import GlinerHandler
from src.nlp.playground.pipelines.address_extractor import AddressExtractor
from src.nlp.playground.pipelines.date_extractor_v2 import ScheduleExtractor
from src.nlp.playground.pipelines.date_extractor_v3 import ScheduleExtractorV3
from src.nlp.playground.pipelines.description_extractor import DescriptionExtractor
from src.nlp.playground.pipelines.title_extractor import TitleExtractor
from src.nlp.playground.textclassification import ZeroShotClassifier, CustomMode
from src.utils.Event import Event


class EventDataExtractor:
    """Build an ``Event`` from raw event text by chaining several extractors.

    Each ``extract_*`` method fills one field of the event; ``extract`` runs
    them all and returns the populated ``Event``.
    """

    # A digit followed (optionally after whitespace) by a euro marker; used to
    # decide whether an entity text looks like a price at all.
    _EURO_AMOUNT = re.compile(r'\d\s*(€|EUR|eur|Eur|Euro|euro|euros|EURO)')
    # Integer or decimal number with either "." or "," as decimal separator.
    _NUMBER = re.compile(r'\d+(?:[.,]\d+)?')
    # Minimum entity score for a price entity to be accepted.
    _MIN_PRICE_SCORE = 0.4
    # Minimum classifier score for an audience label to be accepted.
    _MIN_FAMILY_SCORE = 0.8

    def __init__(self):
        """Instantiate every extractor/classifier used by the pipeline."""
        self.title_extractor = TitleExtractor()
        self.zero_shot_classifier = ZeroShotClassifier()
        self.gliner_handler = GlinerHandler()
        # self.schedule_extractor = ScheduleExtractor()
        self.schedule_extractor = ScheduleExtractorV3()
        self.address_extractor = AddressExtractor()
        self.description_extractor = DescriptionExtractor()

    def extract(self, data):
        """Run all extraction steps on ``data`` and return the filled ``Event``.

        The address is extracted before the locations, and the title before
        the description, because those later steps take the earlier result as
        an argument.
        """
        print("Starting extraction process...")
        event = Event()
        event.title = self.extract_title(data)
        event.categories = self.extract_categories(data)
        event.address = self.extract_address(data)
        event.locations = self.extract_locations(data, event.address)
        event.organizers = self.extract_organizers(data)
        event.schedule = self.extract_schedule(data)
        event.description = self.extract_description(data, event.title)
        event.prices = self.extract_prices(data)
        print("Extraction process completed.")
        return event

    def extract_title(self, md):
        """Return the title produced by the title extractor for ``md``."""
        print("Extracting title...")
        title = self.title_extractor.extract_title(md)
        print(f"Extracted title: {title}")
        return title

    def extract_categories(self, text):
        """Return a list of category labels for ``text``.

        The list is built from three zero-shot classifications, in this order:
        every audience label scoring at least 0.8, then the first topic label
        returned by the classifier, then the first event-type label returned.
        """
        print("Extracting categories...")
        categories = []

        family_category = [
            cat.label
            for cat in self.zero_shot_classifier.classify(
                text,
                CustomMode(
                    ["Kinder_und_Familie", "Adults_only"],
                    "Die Veranstaltung ist für {}",
                ),
            )
            if cat.score >= self._MIN_FAMILY_SCORE
        ]
        topic_category = [
            self.zero_shot_classifier.classify(
                text,
                CustomMode(
                    ["Kunst", "Kultur", "Musik", "Sport", "Bildung", "Tanz",
                     "Wissenschaft", "Unterhaltung", "Gesundheit", "Wellness",
                     "Business", "Politik", "Religion"],
                    "In der Veranstaltung geht es um {}",
                ),
            )[0].label
        ]
        type_category = [
            self.zero_shot_classifier.classify(
                text,
                CustomMode(
                    ["Oper", "Theater", "Konzert", "Musical", "Gottesdienst",
                     "Ausstellung", "Museum", "Planetarium", "Führung",
                     "Dokumentation", "Film", "Kino", "Vortrag", "Show",
                     "Wettkampf", "Markt", "Feier", "Party",
                     "Infoveranstaltung"],
                    "Die Art der Veranstaltung ist {}",
                ),
            )[0].label
        ]
        # time_category = [self.zero_shot_classifier.classify(text, CustomMode(
        #     ["Mehrere Tage", "Einen Tag"],
        #     "Die Veranstaltung findet über {} statt."))[0].label]

        categories.extend(family_category)
        categories.extend(topic_category)
        categories.extend(type_category)
        # categories.extend(time_category)
        print(f"Extracted categories: {categories}")
        return categories

    def extract_locations(self, data, address):
        """Return the distinct "Lokalität" entity texts found in ``data``.

        Empty texts and texts already contained in ``address`` are dropped.
        Duplicates are removed while keeping first-seen order, so the result
        is stable between runs.
        """
        address = address if address else ""
        print("Extracting locations...")
        entities = self.gliner_handler.extract_entities(
            data, ["Lokalität", "Adresse"]
        )
        print(entities)
        if not entities:
            return []
        # dict.fromkeys de-duplicates like a set but preserves insertion order.
        return list(dict.fromkeys(
            entity["text"]
            for entity in entities
            if entity["label"] == "Lokalität"
            and entity["text"] != ""
            and entity["text"] not in address
        ))

    def extract_organizers(self, data):
        """Return the distinct "EVENT_ORGANIZER" entity texts found in ``data``.

        An empty or missing entity result yields an empty list. Duplicates are
        removed while keeping first-seen order.
        """
        print("Extracting organizers...")
        entities = self.gliner_handler.extract_entities(
            data, ["EVENT_ORGANIZER"]
        )
        # `or []` mirrors the empty-result guard used in extract_locations.
        organizers = list(dict.fromkeys(
            item["text"]
            for item in entities or []
            if item["label"] == "EVENT_ORGANIZER"
        ))
        print(f"Extracted organizers: {organizers}")
        return organizers

    def extract_address(self, data):
        """Return the address produced by the address extractor for ``data``."""
        print("Extracting address...")
        return self.address_extractor.extract_address(data)

    def extract_schedule(self, data):
        """Return the result of the schedule extractor for ``data``."""
        print("Extracting schedule...")
        date_times = self.schedule_extractor.extract(data)
        return date_times

    def extract_prices(self, data):
        """Return the prices mentioned in ``data`` as strings like ``"12.50 €"``.

        Only "Eintrittspreis" entities with a score of at least 0.4 whose text
        contains a digit followed by a euro marker are considered. Every
        number in such a text is emitted, with "," replaced by ".".

        When no price is found, the zero-shot classifier decides whether
        admission is free; in that case ``["kostenlos"]`` is returned,
        otherwise an empty list.
        """
        print("Extracting prices...")
        entities = self.gliner_handler.extract_entities(
            data, ["Eintrittspreis"]
        )
        print(entities)

        price_texts = [
            e["text"]
            for e in entities or []
            if e["text"]
            and self._EURO_AMOUNT.search(e["text"])
            and e["score"] >= self._MIN_PRICE_SCORE
        ]
        prices = [
            number.replace(",", ".") + " €"
            for price_text in price_texts
            for number in self._NUMBER.findall(price_text)
        ]
        if prices:
            # The classifier verdict is only used when no price was found, so
            # skip the model call entirely here.
            return prices

        entrance_free_category = self.zero_shot_classifier.classify(
            data,
            CustomMode(
                ["Eintritt frei", "Ticket", "Preis"],
                "Der Eintritt zur Veranstaltung ist mit {}.",
            ),
        )[0].label
        if entrance_free_category == "Eintritt frei":
            return ["kostenlos"]
        return prices

    def extract_description(self, data, title):
        """Return the description extractor's result for ``data`` and ``title``."""
        return self.description_extractor.extract_description(data, title)