|
import re |
|
|
|
from src.nlp.playground.ner import GlinerHandler |
|
from src.nlp.playground.pipelines.address_extractor import AddressExtractor |
|
from src.nlp.playground.pipelines.date_extractor_v2 import ScheduleExtractor |
|
from src.nlp.playground.pipelines.date_extractor_v3 import ScheduleExtractorV3 |
|
from src.nlp.playground.pipelines.description_extractor import DescriptionExtractor |
|
from src.nlp.playground.pipelines.title_extractor import TitleExtractor |
|
from src.nlp.playground.textclassification import ZeroShotClassifier, CustomMode |
|
from src.utils.Event import Event |
|
|
|
|
|
class EventDataExtractor:
    """Fill an ``Event`` from raw event text using several sub-extractors.

    The class wires together a title extractor, a zero-shot classifier, a
    GLiNER entity handler and dedicated schedule / address / description
    extractors. ``extract`` is the entry point; the ``extract_*`` methods can
    also be called individually.
    """

    # A digit followed (optionally after whitespace) by a euro marker.
    _EURO_AMOUNT_RE = re.compile(r'\d\s*(€|EUR|eur|Eur|Euro|euro|euros|EURO)')
    # An integer or a decimal number with "." or "," as separator.
    _NUMBER_RE = re.compile(r'\d+(?:[.,]\d+)?')
    # Minimum classifier score for an audience label to be accepted.
    _MIN_AUDIENCE_SCORE = 0.8
    # Minimum entity score for a price entity to be accepted.
    _MIN_PRICE_SCORE = 0.4

    def __init__(self):
        """Instantiate every sub-extractor used by the ``extract_*`` methods."""
        self.title_extractor = TitleExtractor()
        self.zero_shot_classifier = ZeroShotClassifier()
        self.gliner_handler = GlinerHandler()

        self.schedule_extractor = ScheduleExtractorV3()
        self.address_extractor = AddressExtractor()
        self.description_extractor = DescriptionExtractor()

    def extract(self, data):
        """Run all extraction steps on ``data`` and return the filled ``Event``.

        Args:
            data: Raw event text handed unchanged to every sub-extractor.

        Returns:
            An ``Event`` with title, categories, address, locations,
            organizers, schedule, description and prices set.
        """
        print("Starting extraction process...")
        event = Event()
        event.title = self.extract_title(data)
        event.categories = self.extract_categories(data)
        event.address = self.extract_address(data)
        # Locations are filtered against the address, so it must come first.
        event.locations = self.extract_locations(data, event.address)
        event.organizers = self.extract_organizers(data)
        event.schedule = self.extract_schedule(data)
        # The description extractor receives the already extracted title.
        event.description = self.extract_description(data, event.title)
        event.prices = self.extract_prices(data)

        print("Extraction process completed.")
        return event

    def extract_title(self, md):
        """Return the title found by the title extractor for ``md``.

        Args:
            md: Input text (presumably markdown, judging by the name --
                confirm against the caller).
        """
        print("Extracting title...")
        title = self.title_extractor.extract_title(md)
        print(f"Extracted title: {title}")
        return title

    def _top_label(self, text, labels, template):
        """Return the label of the first classification result, or ``None``.

        Builds a ``CustomMode`` from ``labels`` and ``template`` and classifies
        ``text`` with it. The first result is presumably the best-scoring one
        (confirm against ``ZeroShotClassifier``). An empty result yields
        ``None`` instead of raising ``IndexError``.
        """
        results = self.zero_shot_classifier.classify(text, CustomMode(labels, template))
        return results[0].label if results else None

    def extract_categories(self, text):
        """Classify ``text`` into audience, topic and event-type categories.

        Audience labels are added only when their score reaches
        ``_MIN_AUDIENCE_SCORE``; for topic and type the first classification
        result is taken regardless of score.

        Returns:
            A list of category labels in the order audience, topic, type.
        """
        print("Extracting categories...")
        categories = []

        audience_results = self.zero_shot_classifier.classify(
            text,
            CustomMode(["Kinder_und_Familie", "Adults_only"],
                       "Die Veranstaltung ist für {}"),
        )
        categories.extend(
            cat.label
            for cat in (audience_results or [])
            if cat.score >= self._MIN_AUDIENCE_SCORE
        )

        topic = self._top_label(
            text,
            ["Kunst", "Kultur", "Musik", "Sport", "Bildung", "Tanz", "Wissenschaft", "Unterhaltung", "Gesundheit",
             "Wellness", "Business", "Politik", "Religion"],
            "In der Veranstaltung geht es um {}",
        )
        event_type = self._top_label(
            text,
            ["Oper", "Theater", "Konzert", "Musical", "Gottesdienst", "Ausstellung", "Museum", "Planetarium", "Führung",
             "Dokumentation", "Film", "Kino", "Vortrag", "Show", "Wettkampf", "Markt", "Feier", "Party", "Infoveranstaltung"],
            "Die Art der Veranstaltung ist {}",
        )
        # Skip a category whose classification came back empty.
        categories.extend(label for label in (topic, event_type) if label is not None)

        print(f"Extracted categories: {categories}")
        return categories

    def extract_locations(self, data, address):
        """Return unique venue names found in ``data`` that are not part of ``address``.

        Args:
            data: Raw event text.
            address: Already extracted address; a falsy value is treated as
                an empty string. The containment test uses ``in``, so this is
                presumably a string (confirm against ``AddressExtractor``).

        Returns:
            De-duplicated ``"Lokalität"`` entity texts in order of first
            occurrence; an empty list when no entities were found.
        """
        address = address if address else ""
        print("Extracting locations...")
        entities = self.gliner_handler.extract_entities(data, ["Lokalität", "Adresse"])
        print(entities)
        if not entities:
            return []

        names = (
            entity["text"]
            for entity in entities
            if entity["label"] == "Lokalität"
            and entity["text"] != ""
            and entity["text"] not in address
        )
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(names))

    def extract_organizers(self, data):
        """Return unique organizer names found in ``data``.

        Returns:
            De-duplicated ``"EVENT_ORGANIZER"`` entity texts in order of
            first occurrence; an empty list when no entities were found.
        """
        print("Extracting organizers...")
        entities = self.gliner_handler.extract_entities(data, ["EVENT_ORGANIZER"])
        # Tolerate a None/empty result, as extract_locations does.
        organizers = list(dict.fromkeys(
            item["text"] for item in (entities or []) if item["label"] == "EVENT_ORGANIZER"
        ))

        print(f"Extracted organizers: {organizers}")
        return organizers

    def extract_address(self, data):
        """Return whatever the address extractor yields for ``data``."""
        print("Extracting address...")
        return self.address_extractor.extract_address(data)

    def extract_schedule(self, data):
        """Return whatever the schedule extractor yields for ``data``."""
        print("Extracting schedule...")
        date_times = self.schedule_extractor.extract(data)
        return date_times

    def extract_prices(self, data):
        """Return the entrance prices mentioned in ``data``.

        Price entities are kept when their text contains a digit followed by
        a euro marker and their score reaches ``_MIN_PRICE_SCORE``. Every
        number inside a kept entity becomes one ``"<number> €"`` string with
        ``,`` replaced by ``.``.

        Returns:
            The list of price strings. When no price was found and the
            classifier's first label is ``"Eintritt frei"``, ``["kostenlos"]``;
            otherwise an empty list.
        """
        print("Extracting prices...")
        entities = self.gliner_handler.extract_entities(data, ["Eintrittspreis"])
        print(entities)

        price_texts = [
            e["text"]
            for e in (entities or [])
            if e["text"]
            and self._EURO_AMOUNT_RE.search(e["text"])
            and e["score"] >= self._MIN_PRICE_SCORE
        ]
        prices = [
            number.replace(",", ".") + " €"
            for price_text in price_texts
            for number in self._NUMBER_RE.findall(price_text)
        ]
        if prices:
            # The free-entrance classification only matters when no price was
            # found, so the model call is skipped here.
            return prices

        entrance_label = self._top_label(
            data,
            ["Eintritt frei", "Ticket", "Preis"],
            "Der Eintritt zur Veranstaltung ist mit {}.",
        )
        if entrance_label == "Eintritt frei":
            return ["kostenlos"]

        return prices

    def extract_description(self, data, title):
        """Return whatever the description extractor yields for ``data`` and ``title``."""
        return self.description_extractor.extract_description(data, title)
|
|