Spaces:

adojode
/

event-data-extraction-playground

Running

manaviel85370

add pages and all

da88570 2 months ago

1.44 kB

	import json
	import os
	import re
	import string

	import markdown
	from bs4 import BeautifulSoup

	from src.persistence.db import *
	from src.utils.helpers import normalize_data
	from src.utils.markdown_processing.md_preprocessing import convert_html_to_md
	# MongoDB connection string. It is taken from the MONGODB_URI environment
	# variable when that is set; otherwise the previously hard-coded value is
	# used so existing invocations keep working.
	# NOTE(review): the fallback embeds a credential in source control -- it
	# should be rotated and the default removed once deployments set MONGODB_URI.
	uri = os.environ.get(
	    "MONGODB_URI",
	    "mongodb+srv://event_data_extraction_application:[email protected]/?retryWrites=true&w=majority&appName=Cluster0",
	)
	# MongoClient is not imported by name in this file; presumably it is provided
	# by the wildcard import from src.persistence.db -- verify.
	client = MongoClient(uri)
	db = client.event_data

	# Collection handles. Only event_urls is read by the visible part of this
	# script; unsorted_urls is bound here but not used further below.
	unsorted_urls = db.unsorted_urls
	event_urls = db.event_urls

	# Inline Markdown link or image: optional "!", "[label]", "(target)".
	# Group 2 is the label. The "*" quantifiers are essential: without them the
	# pattern would only match a label of at most one character and a target of
	# exactly one character, so ordinary links would never be stripped.
	_MD_LINK_RE = re.compile(r'(!)?\[(.*?)\]\([^)]*\)')


	def _strip_markdown_links(text):
	    """Replace inline Markdown links and images in *text* with their label.

	    The substitution is applied twice on purpose: a linked image such as
	    ``[![alt](img)](url)`` is reduced to ``![alt](url)`` by the first pass
	    and to ``alt`` by the second.
	    """
	    for _ in range(2):
	        text = _MD_LINK_RE.sub(r'\2', text)
	    return text


	def _event_markdown_to_text(raw_markdown):
	    """Turn one event's Markdown payload into cleaned, link-free text.

	    Steps: render the Markdown to HTML, replace every ``<a>`` element with a
	    ``<p>`` holding only the anchor text, hand the prettified HTML to
	    ``convert_html_to_md`` (project helper; presumably converts it back to
	    Markdown -- verify), drop every non-printable character except newlines,
	    strip remaining inline link/image syntax, and finally pass the result
	    through ``normalize_data`` (project helper).
	    """
	    html = markdown.markdown(raw_markdown)
	    soup = BeautifulSoup(html, "lxml")
	    # Swap each anchor for a plain paragraph so only its visible text is kept.
	    for tag in soup.find_all("a"):
	        p_tag = soup.new_tag("p")
	        p_tag.string = tag.get_text()
	        tag.replace_with(p_tag)
	    text = convert_html_to_md(soup.prettify())
	    # Keep "\n" explicitly: str.isprintable() is False for newlines.
	    text = ''.join(ch for ch in text if ch.isprintable() or ch == "\n")
	    text = _strip_markdown_links(text)
	    return normalize_data(text)


	# Cleaned text of every event document that has a non-empty "data" field.
	texts = []

	for event in event_urls.find():
	    raw_markdown = event.get("data")
	    if raw_markdown:
	        texts.append(_event_markdown_to_text(raw_markdown))

	# Report how many texts were collected.
	print(len(texts))

	# Save the texts to a JSON file, wrapped in a single top-level "texts" key.
	output = {"texts": texts}

	# ensure_ascii=False keeps non-ASCII characters readable in the file, which
	# is why the file is opened with an explicit UTF-8 encoding.
	with open("texts.json", "w", encoding="utf-8") as out_file:
	    out_file.write(json.dumps(output, ensure_ascii=False, indent=4))

	# Message is German for: "The texts were successfully saved to texts.json."
	print("Die Texte wurden erfolgreich in texts.json gespeichert.")