from wtpsplit import SaT
from transformers import pipeline
import json

from pymongo import MongoClient  # may also be provided by the wildcard import below; made explicit for clarity
from src.nlp.playground.textclassification import classify_paragraph
from src.persistence.db import *

# MongoDB connection for the scraped event data.
uri = "mongodb+srv://event_data_extraction_application:[email protected]/?retryWrites=true&w=majority&appName=Cluster0"
client = MongoClient(uri)
db = client.event_data

unsorted_urls = db.unsorted_urls
event_urls = db.event_urls

# German candidate labels for zero-shot classification:
# event description, title, other.
label_description = "Veranstaltungsbeschreibung"

# Sentence/paragraph segmentation model (wtpsplit SaT).
sat = SaT("sat-12l-sm")

text_class = [
    label_description,
    "Titel",
    "Sonstiges",
]

classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")
hypothesis_template = "Der Text ist {}."
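
# Note: classifier, text_class, and hypothesis_template are not used by the loop
# below, which calls classify_paragraph instead. Purely as an illustration (an
# assumption about how they are meant to be combined, not the original flow),
# the zero-shot pipeline could be invoked directly like this:
#
#   result = classifier(
#       "Das Konzert beginnt um 20 Uhr im Stadtpark.",
#       candidate_labels=text_class,
#       hypothesis_template=hypothesis_template,
#   )
#   print(result["labels"][0], result["scores"][0])  # top label and its score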

# Inspect the first few stored event pages: segment each text into paragraphs
# with SaT and run the paragraph classifier on every non-empty segment.
count = 10
for event in event_urls.find():
    if "data" in event:
        text = event["data"]
        if text != "":
            print("Original:")
            print(text)

            # Paragraph-level segmentation; each segment is a list of sentences.
            segments = sat.split(text, do_paragraph_segmentation=True, paragraph_threshold=0.1)

            title = ""
            description = ""
            for sequence in segments:
                # Join the sentences of a segment back into one string.
                sequence = " ".join(sequence)

                if sequence != "":
                    print("SEGMENTATION:")
                    print(sequence)

                    predictions = classify_paragraph(sequence)
                    print("Labels:")
                    print(predictions)
                    print("\n")

            print("Beschreibung:")
            print(description)

            count -= 1
            if count == 0:
                break
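
# Hypothetical sketch (an assumption about intent, not part of the original script):
# the loop above prints `description` without ever filling it. One way to build it
# would be to keep the segments whose top zero-shot label is the description class.
# This uses the `classifier` pipeline defined above rather than classify_paragraph,
# because that pipeline's return shape ({"labels": [...], "scores": [...]}) is known.
def build_description(raw_text):
    parts = []
    for segment in sat.split(raw_text, do_paragraph_segmentation=True, paragraph_threshold=0.1):
        segment_text = " ".join(segment).strip()
        if not segment_text:
            continue
        result = classifier(segment_text,
                            candidate_labels=text_class,
                            hypothesis_template=hypothesis_template)
        if result["labels"][0] == label_description:
            parts.append(segment_text)
    return "\n".join(parts)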