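"""
Experiment: read scraped event pages from MongoDB, split each page into
paragraphs with SaT (wtpsplit), and run a classifier over every paragraph
to locate the event description.
"""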
from pymongo import MongoClient  # explicit import; may also be provided by the wildcard import below
from transformers import pipeline
from wtpsplit import SaT

from src.nlp.playground.textclassification import classify_paragraph
from src.persistence.db import *
uri = f"mongodb+srv://event_data_extraction_application:[email protected]/?retryWrites=true&w=majority&appName=Cluster0"
client = MongoClient(uri)
db = client.event_data
unsorted_urls = db.unsorted_urls
event_urls = db.event_urls
label_description = "Veranstaltungsbeschreibung"  # German: "event description"

sat = SaT("sat-12l-sm")  # Segment any Text model, used for paragraph segmentation

# Candidate labels for zero-shot classification, kept in German to match the
# input text: "Veranstaltungsbeschreibung" = event description,
# "Titel" = title, "Sonstiges" = other.
text_class = [
    label_description,
    "Titel",
    "Sonstiges",
]

classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")
hypothesis_template = "Der Text ist {}."  # German: "The text is {}."

count = 10  # process at most the first 10 events with non-empty data
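# Shape note (assumption based on wtpsplit's documented behavior, and implied
# by the `" ".join(sequence)` below): with do_paragraph_segmentation=True,
# sat.split returns a list of paragraphs, each itself a list of sentence
# strings, e.g. [["Erster Satz. ", "Zweiter Satz."], ["Neuer Absatz."]].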
# Iterate over stored event pages, segment each text into paragraphs,
# and classify every paragraph.
for event in event_urls.find():
    if "data" in event:
        text = event["data"]
        if text != "":
            print("Original:")
            print(text)
            segments = sat.split(text, do_paragraph_segmentation=True, paragraph_threshold=0.1)
            # print(segments)
            title = ""
            description = ""
            for sequence in segments:
                # Each segment is a list of sentences; join it back into one paragraph.
                sequence = " ".join(sequence)
                # print(sequence)
                if sequence != "":
                    # Since the model is monolingual, it is sensitive to the
                    # hypothesis template; this can be experimented with.
                    print("SEGMENTATION:")
                    print(sequence)
                    # predictions = classifier(sequence, text_class, hypothesis_template=hypothesis_template)
                    predictions = classify_paragraph(sequence)
                    print("Labels:")
                    print(predictions)
                    print("\n")
            # NOTE: description is initialized above but never populated yet.
            print("Description:")
            print(description)
            count -= 1
            if count == 0:
                break
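
# A minimal sketch (not part of the original flow) of how the commented-out
# zero-shot pipeline above could stand in for classify_paragraph: for a single
# input, the transformers pipeline returns a dict with "labels" and "scores"
# sorted by descending score, so the top-ranked label decides whether a
# paragraph is an event description. The function name is hypothetical.
def is_description(paragraph: str) -> bool:
    prediction = classifier(paragraph, text_class, hypothesis_template=hypothesis_template)
    return prediction["labels"][0] == label_description

# Possible usage: collect all paragraphs classified as description, e.g.
#   description = " ".join(p for p in paragraphs if is_description(p))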