"""Exploratory script: segment stored event texts into paragraphs (wtpsplit SaT)
and run each non-empty segment through a zero-shot paragraph classifier.

Reads up to 10 events with non-empty ``data`` from the ``event_urls`` Mongo
collection and prints the original text, each segment, and its predicted labels.
"""
import os

from wtpsplit import SaT
from transformers import pipeline
import json
from src.nlp.playground.textclassification import classify_paragraph
from src.persistence.db import *

# NOTE(review): credentials were hard-coded in source. Prefer the environment;
# the literal fallback preserves the original behavior, but this secret should
# be rotated and the fallback removed. (Original also used a pointless f-string
# with no placeholders — plain literal now.)
uri = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://event_data_extraction_application:J1TRVDBbl4kSaxTD@cluster0.rtcz4.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0",
)
client = MongoClient(uri)  # MongoClient is in scope via the star import above
db = client.event_data
unsorted_urls = db.unsorted_urls
event_urls = db.event_urls

label_description = "Veranstaltungsbeschreibung"

# Paragraph/sentence segmentation model (wtpsplit "Segment any Text").
sat = SaT("sat-12l-sm")

# Candidate labels for zero-shot classification of each segment.
text_class = [
    label_description,
    "Titel",
    "Sonstiges",
]

# Zero-shot classifier. Since the model is monolingual it is sensitive to the
# hypothesis template — kept here for experimentation, though segments are
# currently classified via classify_paragraph() instead.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
hypothesis_template = "Der Text ist {}."

count = 10  # inspect at most 10 events that have non-empty data
for event in event_urls.find():
    # Guard clauses replace the original nested-if pyramid; behavior unchanged.
    if "data" not in event:
        continue
    text = event["data"]
    if not text:
        continue

    print("Original:")
    print(text)
    segments = sat.split(text, do_paragraph_segmentation=True, paragraph_threshold=0.1)

    # BUG FIX: the original wrote `title = "",` — the trailing comma made
    # `title` a one-element tuple ("",) rather than an empty string.
    title = ""
    description = ""
    for sequence in segments:
        # Each segment is a list of sentences; join into one paragraph string.
        sequence = " ".join(sequence)
        if sequence != "":
            print("SEGMENTATION:")
            print(sequence)
            # predictions = classifier(sequence, text_class, hypothesis_template=hypothesis_template)
            predictions = classify_paragraph(sequence)
            print("Labels:")
            print(predictions)
            print("\n")

    # NOTE(review): `description` (and `title`) are never populated from the
    # classified segments, so this always prints an empty string — presumably
    # segments predicted as label_description were meant to be accumulated
    # here; confirm the intent before relying on this output.
    print("Beschreibung:")
    print(description)

    count -= 1
    if count == 0:
        break