File size: 5,672 Bytes
6fc4565 67e898c 6fc4565 67e898c 58c260c 6fc4565 67e898c 58c260c 6fc4565 67e898c fc86982 14a5766 58c260c fc86982 58c260c 67e898c fc86982 67e898c 6fc4565 67e898c 6fc4565 67e898c 6fc4565 67e898c 6fc4565 67e898c 6fc4565 67e898c 14a5766 67e898c 14a5766 67e898c 6fc4565 67e898c 14a5766 67e898c 6fc4565 67e898c 6fc4565 67e898c 6fc4565 67e898c 6fc4565 14a5766 58c260c 14a5766 6fc4565 14a5766 6fc4565 58c260c 6fc4565 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
import re
from src.nlp.playground.ner import GlinerHandler
from src.nlp.playground.pipelines.address_extractor import AddressExtractor
from src.nlp.playground.pipelines.date_extractor_v2 import ScheduleExtractor
from src.nlp.playground.pipelines.date_extractor_v3 import ScheduleExtractorV3
from src.nlp.playground.pipelines.description_extractor import DescriptionExtractor
from src.nlp.playground.pipelines.title_extractor import TitleExtractor
from src.nlp.playground.textclassification import ZeroShotClassifier, CustomMode
from src.utils.Event import Event
class EventDataExtractor:
    """Run all event sub-extractors over one input and assemble an ``Event``.

    The class wires together a title extractor, a zero-shot text classifier,
    a GLiNER entity handler, a schedule extractor, an address extractor and a
    description extractor, and exposes one ``extract_*`` method per event
    field plus :meth:`extract`, which fills a complete ``Event``.
    """

    # Minimum classifier score for an audience label to be kept as a category.
    FAMILY_SCORE_THRESHOLD = 0.8
    # Minimum entity score for a price entity to be considered.
    PRICE_SCORE_THRESHOLD = 0.4

    # A digit followed (optionally after whitespace) by a euro marker.
    _EURO_AMOUNT_RE = re.compile(r'\d\s*(€|EUR|eur|Eur|Euro|euro|euros|EURO)')
    # An integer or decimal number with either "." or "," as separator.
    _NUMBER_RE = re.compile(r'\d+(?:[.,]\d+)?')

    def __init__(self):
        """Instantiate every sub-extractor used by the ``extract_*`` methods."""
        self.title_extractor = TitleExtractor()
        self.zero_shot_classifier = ZeroShotClassifier()
        self.gliner_handler = GlinerHandler()
        # self.schedule_extractor = ScheduleExtractor()
        self.schedule_extractor = ScheduleExtractorV3()
        self.address_extractor = AddressExtractor()
        self.description_extractor = DescriptionExtractor()

    def extract(self, data):
        """Extract all supported fields from ``data`` and return an ``Event``.

        Args:
            data: The raw event input handed unchanged to every sub-extractor
                (presumably markdown/plain text -- confirm against callers).

        Returns:
            A populated ``Event`` instance.
        """
        print("Starting extraction process...")
        event = Event()
        event.title = self.extract_title(data)
        event.categories = self.extract_categories(data)
        # The address must be known first: locations that are merely part of
        # the address are filtered out in extract_locations.
        event.address = self.extract_address(data)
        event.locations = self.extract_locations(data, event.address)
        event.organizers = self.extract_organizers(data)
        event.schedule = self.extract_schedule(data)
        # The description extractor receives the already extracted title.
        event.description = self.extract_description(data, event.title)
        event.prices = self.extract_prices(data)
        print("Extraction process completed.")
        return event

    def extract_title(self, md):
        """Return the title found by the title extractor for ``md``."""
        print("Extracting title...")
        title = self.title_extractor.extract_title(md)
        print(f"Extracted title: {title}")
        return title

    def _top_label(self, text, labels, hypothesis_template):
        """Return ``[label]`` of the first classification result, or ``[]``.

        The first result is treated as the best match, as the original code
        did (presumably results are sorted by descending score -- confirm
        against ``ZeroShotClassifier.classify``). An empty result yields an
        empty list instead of raising ``IndexError``.
        """
        results = self.zero_shot_classifier.classify(
            text, CustomMode(labels, hypothesis_template)
        )
        if not results:
            return []
        return [results[0].label]

    def extract_categories(self, text):
        """Classify ``text`` into audience, topic and event-type categories.

        Audience labels are kept only when their score reaches
        ``FAMILY_SCORE_THRESHOLD``; for topic and event type the first
        classification result is taken regardless of score.

        Returns:
            A list of category labels (audience first, then topic, then type).
        """
        print("Extracting categories...")
        audience_results = self.zero_shot_classifier.classify(
            text,
            CustomMode(
                ["Kinder_und_Familie", "Adults_only"],
                "Die Veranstaltung ist für {}",
            ),
        )
        categories = [
            cat.label
            for cat in (audience_results or [])
            if cat.score >= self.FAMILY_SCORE_THRESHOLD
        ]
        categories.extend(
            self._top_label(
                text,
                ["Kunst", "Kultur", "Musik", "Sport", "Bildung", "Tanz",
                 "Wissenschaft", "Unterhaltung", "Gesundheit", "Wellness",
                 "Business", "Politik", "Religion"],
                "In der Veranstaltung geht es um {}",
            )
        )
        categories.extend(
            self._top_label(
                text,
                ["Oper", "Theater", "Konzert", "Musical", "Gottesdienst",
                 "Ausstellung", "Museum", "Planetarium", "Führung",
                 "Dokumentation", "Film", "Kino", "Vortrag", "Show",
                 "Wettkampf", "Markt", "Feier", "Party", "Infoveranstaltung"],
                "Die Art der Veranstaltung ist {}",
            )
        )
        # Disabled: duration classification.
        # time_category = [self.zero_shot_classifier.classify(text, CustomMode(
        #     ["Mehrere Tage", "Einen Tag"],
        #     "Die Veranstaltung findet über {} statt."))[0].label]
        # categories.extend(time_category)
        print(f"Extracted categories: {categories}")
        return categories

    def extract_locations(self, data, address):
        """Return unique venue names found in ``data`` that are not in ``address``.

        Args:
            data: Input passed to the GLiNER handler.
            address: The already extracted address; a falsy value is treated
                as the empty string. Any entity text that is a substring of
                it is dropped.

        Returns:
            Unique "Lokalität" entity texts in order of first occurrence
            (empty list when no entities were found).
        """
        address = address if address else ""
        print("Extracting locations...")
        # "Adresse" is requested as a label too but only "Lokalität" hits are
        # kept (presumably to let the model separate the two -- confirm).
        entities = self.gliner_handler.extract_entities(data, ["Lokalität", "Adresse"])
        print(entities)
        if not entities:
            return []
        # dict.fromkeys de-duplicates while keeping a deterministic order,
        # unlike list(set(...)).
        return list(dict.fromkeys(
            entity["text"]
            for entity in entities
            if entity["label"] == "Lokalität"
            and entity["text"] != ""
            and entity["text"] not in address
        ))

    def extract_organizers(self, data):
        """Return unique organizer names found in ``data``.

        Returns:
            Unique "EVENT_ORGANIZER" entity texts in order of first
            occurrence (empty list when no entities were found).
        """
        print("Extracting organizers...")
        entities = self.gliner_handler.extract_entities(data, ["EVENT_ORGANIZER"])
        # Guard against an empty/None result, consistent with extract_locations.
        organizers = list(dict.fromkeys(
            item["text"]
            for item in (entities or [])
            if item["label"] == "EVENT_ORGANIZER"
        ))
        print(f"Extracted organizers: {organizers}")
        return organizers

    def extract_address(self, data):
        """Return the address found by the address extractor for ``data``."""
        print("Extracting address...")
        return self.address_extractor.extract_address(data)

    def extract_schedule(self, data):
        """Return the schedule found by the schedule extractor for ``data``."""
        print("Extracting schedule...")
        return self.schedule_extractor.extract(data)

    def extract_prices(self, data):
        """Return the entrance prices mentioned in ``data``.

        Price entities are kept when they contain a digit followed by a euro
        marker and reach ``PRICE_SCORE_THRESHOLD``. Every number inside a kept
        entity is emitted as ``"<number> €"`` with "," normalised to ".".

        Returns:
            The list of formatted prices; ``["kostenlos"]`` when no price was
            found and the classifier's first label is "Eintritt frei";
            otherwise an empty list.
        """
        print("Extracting prices...")
        entities = self.gliner_handler.extract_entities(data, ["Eintrittspreis"])
        print(entities)
        price_texts = [
            e["text"]
            for e in (entities or [])
            if e["text"]
            and self._EURO_AMOUNT_RE.search(e["text"])
            and e["score"] >= self.PRICE_SCORE_THRESHOLD
        ]
        prices = [
            number.replace(",", ".") + " €"
            for price_text in price_texts
            for number in self._NUMBER_RE.findall(price_text)
        ]
        if prices:
            # The free-entrance classification only matters when no price was
            # found, so the model call is skipped here.
            return prices
        results = self.zero_shot_classifier.classify(data, CustomMode(
            ["Eintritt frei", "Ticket", "Preis"],
            "Der Eintritt zur Veranstaltung ist mit {}."
        ))
        if results and results[0].label == "Eintritt frei":
            return ["kostenlos"]
        return prices

    def extract_description(self, data, title):
        """Return the description found by the description extractor.

        Args:
            data: Input passed to the description extractor.
            title: The already extracted title, forwarded unchanged.
        """
        return self.description_extractor.extract_description(data, title)
|