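# NOTE(review): the next three lines look like repository-page residue (avatar
# caption, commit message, commit hash); kept as comments so the module parses.
# elmanavi's picture
# refactor testing
# 14a5766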
import re
from src.nlp.playground.ner import GlinerHandler
from src.nlp.playground.pipelines.address_extractor import AddressExtractor
from src.nlp.playground.pipelines.date_extractor_v2 import ScheduleExtractor
from src.nlp.playground.pipelines.date_extractor_v3 import ScheduleExtractorV3
from src.nlp.playground.pipelines.description_extractor import DescriptionExtractor
from src.nlp.playground.pipelines.title_extractor import TitleExtractor
from src.nlp.playground.textclassification import ZeroShotClassifier, CustomMode
from src.utils.Event import Event
class EventDataExtractor:
    """Populate an ``Event`` from raw event text using a set of extractors.

    The constructor wires up one helper per concern (title, zero-shot
    classification, GLiNER entity extraction, schedule, address,
    description); ``extract`` runs them in sequence and stores each result
    on a fresh ``Event``.
    """

    # Minimum classifier score for a family/adults label to be kept.
    _FAMILY_MIN_SCORE = 0.8
    # Minimum entity score for a price entity to be kept.
    _PRICE_MIN_SCORE = 0.4
    # A digit followed (optionally after whitespace) by a euro marker.
    _PRICE_MARKER_RE = re.compile(r'\d\s*(€|EUR|eur|Eur|Euro|euro|euros|EURO)')
    # An integer or a decimal number with "." or "," as separator.
    _NUMBER_RE = re.compile(r'\d+(?:[.,]\d+)?')

    def __init__(self):
        """Instantiate all extractor helpers used by the pipeline."""
        self.title_extractor = TitleExtractor()
        self.zero_shot_classifier = ZeroShotClassifier()
        self.gliner_handler = GlinerHandler()
        # ScheduleExtractorV3 is the active schedule extractor; the older
        # ScheduleExtractor was previously wired in here and is disabled.
        self.schedule_extractor = ScheduleExtractorV3()
        self.address_extractor = AddressExtractor()
        self.description_extractor = DescriptionExtractor()

    def extract(self, data):
        """Run every extraction step on ``data`` and return the filled Event.

        The address is extracted before the locations because location
        extraction drops names that already appear in the address; the title
        is extracted before the description because the description
        extractor receives it.
        """
        print("Starting extraction process...")
        event = Event()
        event.title = self.extract_title(data)
        event.categories = self.extract_categories(data)
        event.address = self.extract_address(data)
        event.locations = self.extract_locations(data, event.address)
        event.organizers = self.extract_organizers(data)
        event.schedule = self.extract_schedule(data)
        event.description = self.extract_description(data, event.title)
        event.prices = self.extract_prices(data)
        print("Extraction process completed.")
        return event

    def extract_title(self, md):
        """Return the title produced by the title extractor for ``md``."""
        print("Extracting title...")
        title = self.title_extractor.extract_title(md)
        print(f"Extracted title: {title}")
        return title

    def extract_categories(self, text):
        """Return category labels for ``text`` from three zero-shot passes.

        The result is the concatenation of:
        * every audience label ("Kinder_und_Familie" / "Adults_only") whose
          score is at least 0.8 (possibly none),
        * the first topic label returned by the classifier,
        * the first event-type label returned by the classifier.

        NOTE(review): the first classifier result is presumably the
        highest-scoring one - confirm against ZeroShotClassifier.
        """
        print("Extracting categories...")
        audience_results = self.zero_shot_classifier.classify(
            text,
            CustomMode(["Kinder_und_Familie", "Adults_only"],
                       "Die Veranstaltung ist für {}"))
        family_category = [
            cat.label for cat in audience_results
            if cat.score >= self._FAMILY_MIN_SCORE
        ]

        topic_category = [self.zero_shot_classifier.classify(text, CustomMode(
            ["Kunst", "Kultur", "Musik", "Sport", "Bildung", "Tanz", "Wissenschaft", "Unterhaltung", "Gesundheit",
             "Wellness", "Business", "Politik", "Religion"],
            "In der Veranstaltung geht es um {}"))[0].label]

        type_category = [self.zero_shot_classifier.classify(text, CustomMode(
            ["Oper", "Theater", "Konzert", "Musical", "Gottesdienst", "Ausstellung", "Museum", "Planetarium", "Führung",
             "Dokumentation", "Film", "Kino", "Vortrag", "Show", "Wettkampf", "Markt", "Feier", "Party",
             "Infoveranstaltung"],
            "Die Art der Veranstaltung ist {}"))[0].label]

        # A duration category ("Mehrere Tage" / "Einen Tag") was tried
        # earlier and is currently disabled.
        categories = [*family_category, *topic_category, *type_category]
        print(f"Extracted categories: {categories}")
        return categories

    def extract_locations(self, data, address):
        """Return unique venue names found in ``data``, in first-seen order.

        Only entities labelled "Lokalität" with non-empty text are kept, and
        any text that occurs inside ``address`` is dropped so the address is
        not repeated as a location. A falsy ``address`` is treated as "".
        """
        address = address if address else ""
        print("Extracting locations...")
        entities = self.gliner_handler.extract_entities(data, ["Lokalität", "Adresse"])
        print(entities)
        if not entities:
            return []
        venue_names = (
            entity["text"]
            for entity in entities
            if entity["label"] == "Lokalität"
            and entity["text"] != ""
            and entity["text"] not in address
        )
        # dict.fromkeys de-duplicates while keeping insertion order, so the
        # result is stable across runs (a set's order depends on str hashing).
        return list(dict.fromkeys(venue_names))

    def extract_organizers(self, data):
        """Return unique organizer names found in ``data``, in first-seen order.

        Only entities labelled "EVENT_ORGANIZER" with non-empty text are
        kept. An empty or missing entity result yields an empty list.
        """
        print("Extracting organizers...")
        # "or []" mirrors the empty-result guard used for locations.
        entities = self.gliner_handler.extract_entities(data, ["EVENT_ORGANIZER"]) or []
        organizer_names = (
            item["text"]
            for item in entities
            if item["label"] == "EVENT_ORGANIZER" and item["text"]
        )
        # Order-preserving de-duplication keeps the output deterministic.
        organizers = list(dict.fromkeys(organizer_names))
        print(f"Extracted organizers: {organizers}")
        return organizers

    def extract_address(self, data):
        """Return whatever the address extractor yields for ``data``."""
        print("Extracting address...")
        return self.address_extractor.extract_address(data)

    def extract_schedule(self, data):
        """Return whatever the schedule extractor yields for ``data``."""
        print("Extracting schedule...")
        date_times = self.schedule_extractor.extract(data)
        return date_times

    def extract_prices(self, data):
        """Return the entrance prices found in ``data`` as "<number> €" strings.

        "Eintrittspreis" entities are kept when their text contains a digit
        followed by a euro marker and their score is at least 0.4. Every
        number in a kept text becomes one price, with "," normalised to ".".

        When no price is found, the zero-shot classifier decides whether the
        event is free: ``["kostenlos"]`` is returned if its first label is
        "Eintritt frei", otherwise an empty list.
        """
        print("Extracting prices...")
        entities = self.gliner_handler.extract_entities(data, ["Eintrittspreis"]) or []
        print(entities)
        priced_texts = [
            e["text"]
            for e in entities
            if e["text"]
            and self._PRICE_MARKER_RE.search(e["text"])
            and e["score"] >= self._PRICE_MIN_SCORE
        ]
        prices = [
            number.replace(",", ".") + " €"
            for text in priced_texts
            for number in self._NUMBER_RE.findall(text)
        ]
        if prices:
            # The free-entry check only matters when no price was found, so
            # the classifier call is skipped here.
            return prices

        entrance_free_category = self.zero_shot_classifier.classify(data, CustomMode(
            ["Eintritt frei", "Ticket", "Preis"],
            "Der Eintritt zur Veranstaltung ist mit {}."
        ))[0].label
        if entrance_free_category == "Eintritt frei":
            return ["kostenlos"]
        return prices

    def extract_description(self, data, title):
        """Return the description extractor's result for ``data`` and ``title``."""
        return self.description_extractor.extract_description(data, title)