"""Export cleaned event texts from MongoDB to ``texts.json``.

For every document in the ``event_urls`` collection that has a non-empty
``data`` field, this script strips markdown link/image syntax and
non-printable characters from the text, normalizes it, and writes all
resulting texts into a single JSON file.
"""

import json
import re

import markdown
from bs4 import BeautifulSoup

from src.persistence.db import *
from src.utils.helpers import normalize_data
from src.utils.markdown_processing.md_preprocessing import convert_html_to_md

# Matches a markdown image ``![alt](url)`` or link ``[text](url)``;
# group 2 captures the visible text to keep. Compiled once, used per event.
_MD_LINK_RE = re.compile(r'(!)?\[(.*?)\]\([^)]*\)')

# SECURITY: credentials are hard-coded in this connection string. Move them
# into an environment variable or a secrets store before sharing/deploying.
uri = "mongodb+srv://event_data_extraction_application:J1TRVDBbl4kSaxTD@cluster0.rtcz4.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
client = MongoClient(uri)
db = client.event_data
unsorted_urls = db.unsorted_urls
event_urls = db.event_urls


def _clean_markdown(raw: str) -> str:
    """Return the plain, normalized text extracted from *raw* markdown."""
    html = markdown.markdown(raw)
    soup = BeautifulSoup(html, "lxml")
    # Replace each <a> tag with a <p> holding only its visible text so the
    # HTML->markdown round-trip below cannot re-emit link syntax.
    for tag in soup.find_all("a"):
        p_tag = soup.new_tag("p")
        p_tag.string = tag.get_text()
        tag.replace_with(p_tag)
    text = convert_html_to_md(soup.prettify())
    # Drop non-printable characters but keep newlines.
    text = "".join(ch for ch in text if ch.isprintable() or ch == "\n")
    # Strip remaining markdown link/image syntax. Applied twice on purpose:
    # a nested image-inside-link ([![alt](img)](url)) needs two passes to
    # be fully unwrapped.
    text = _MD_LINK_RE.sub(r"\2", text)
    text = _MD_LINK_RE.sub(r"\2", text)
    return normalize_data(text)


# Collect cleaned texts; documents without a truthy "data" field are skipped.
texts = [
    _clean_markdown(event["data"])
    for event in event_urls.find()
    if event.get("data")
]

print(len(texts))

# Save the texts to a JSON file.
output = {"texts": texts}
with open("texts.json", "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=4)

print("Die Texte wurden erfolgreich in texts.json gespeichert.")