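"""
Experiment: read scraped event pages from MongoDB, split each page into
paragraphs with SaT (wtpsplit), and run a classifier over every paragraph
to locate the event description.
"""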
from pymongo import MongoClient  # explicit import; may also be provided by the wildcard import below
from transformers import pipeline
from wtpsplit import SaT

from src.nlp.playground.textclassification import classify_paragraph
from src.persistence.db import *
uri = f"mongodb+srv://event_data_extraction_application:[email protected]/?retryWrites=true&w=majority&appName=Cluster0"
client = MongoClient(uri)
db = client.event_data
unsorted_urls = db.unsorted_urls
event_urls = db.event_urls
label_description = "Veranstaltungsbeschreibung"  # German: "event description"

sat = SaT("sat-12l-sm")  # Segment any Text model, used for paragraph segmentation

# Candidate labels for zero-shot classification, kept in German to match the
# input text: "Veranstaltungsbeschreibung" = event description,
# "Titel" = title, "Sonstiges" = other.
text_class = [
    label_description,
    "Titel",
    "Sonstiges",
]

classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")
hypothesis_template = "Der Text ist {}."  # German: "The text is {}."

count = 10  # process at most the first 10 events with non-empty data
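# Shape note (assumption based on wtpsplit's documented behavior, and implied
# by the `" ".join(sequence)` below): with do_paragraph_segmentation=True,
# sat.split returns a list of paragraphs, each itself a list of sentence
# strings, e.g. [["Erster Satz. ", "Zweiter Satz."], ["Neuer Absatz."]].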
# Iterate over stored event pages, segment each text into paragraphs,
# and classify every paragraph.
for event in event_urls.find():
    if "data" in event:
        text = event["data"]
        if text != "":
            print("Original:")
            print(text)
            segments = sat.split(text, do_paragraph_segmentation=True, paragraph_threshold=0.1)
            # print(segments)
            title = ""
            description = ""
            for sequence in segments:
                # Each segment is a list of sentences; join it back into one paragraph.
                sequence = " ".join(sequence)
                # print(sequence)
                if sequence != "":
                    # Since the model is monolingual, it is sensitive to the
                    # hypothesis template; this can be experimented with.
                    print("SEGMENTATION:")
                    print(sequence)
                    # predictions = classifier(sequence, text_class, hypothesis_template=hypothesis_template)
                    predictions = classify_paragraph(sequence)
                    print("Labels:")
                    print(predictions)
                    print("\n")
            # NOTE: description is initialized above but never populated yet.
            print("Description:")
            print(description)
            count -= 1
            if count == 0:
                break
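
# A minimal sketch (not part of the original flow) of how the commented-out
# zero-shot pipeline above could stand in for classify_paragraph: for a single
# input, the transformers pipeline returns a dict with "labels" and "scores"
# sorted by descending score, so the top-ranked label decides whether a
# paragraph is an event description. The function name is hypothetical.
def is_description(paragraph: str) -> bool:
    prediction = classifier(paragraph, text_class, hypothesis_template=hypothesis_template)
    return prediction["labels"][0] == label_description

# Possible usage: collect all paragraphs classified as description, e.g.
#   description = " ".join(p for p in paragraphs if is_description(p))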