# NOTE(review): the three lines below were GitHub commit-header residue
# ("manaviel85370 / add pages and all / da88570") pasted into the file.
# They were not valid Python and made the script a SyntaxError, so they
# are preserved here as a comment instead of executable text.
import json
import os
import re
import string

import markdown
from bs4 import BeautifulSoup

from src.persistence.db import *
from src.utils.helpers import normalize_data
from src.utils.markdown_processing.md_preprocessing import convert_html_to_md
# --- MongoDB connection -------------------------------------------------
# SECURITY: the password used to be hard-coded in this URI. It can now be
# supplied via the MONGODB_PASSWORD environment variable; the old literal
# is kept only as a backward-compatible fallback — rotate the credential
# and delete the default.
_password = os.environ.get("MONGODB_PASSWORD", "nVTDbeSc7A6vlgXy")
uri = (
    "mongodb+srv://event_data_extraction_application:"
    f"{_password}"
    "@cluster0.4ecwsps.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
)
client = MongoClient(uri)
db = client.event_data
unsorted_urls = db.unsorted_urls  # NOTE(review): unused in this script — confirm before removing
event_urls = db.event_urls

# Collect the cleaned text of every event document that has non-empty "data".
texts = []
for event in event_urls.find():
    data = event.get("data")
    if not data:
        # Skip documents without usable markdown content.
        continue
    # Render the stored markdown to HTML so link tags can be rewritten.
    html = markdown.markdown(data)
    soup = BeautifulSoup(html, "lxml")
    # Replace each <a> with a plain <p> holding its visible text:
    # hrefs are dropped, link captions are kept.
    for tag in soup.find_all("a"):
        p_tag = soup.new_tag("p")
        p_tag.string = tag.get_text()
        tag.replace_with(p_tag)
    # Convert the cleaned HTML back to markdown.
    text = convert_html_to_md(soup.prettify())
    # Drop non-printable characters, but keep newlines.
    text = ''.join(i for i in text if i.isprintable() or i == "\n")
    # Strip residual markdown links/images, keeping only the caption text.
    # The substitution runs twice so a link nested inside an image caption
    # (one level deep) is also unwrapped.
    text = re.sub(r'(!)?\[(.*?)\]\([^)]*\)', r'\2', text)
    text = re.sub(r'(!)?\[(.*?)\]\([^)]*\)', r'\2', text)
    text = normalize_data(text)
    texts.append(text)

print(len(texts))

# Save the texts to a JSON file.
output = {"texts": texts}
with open("texts.json", "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=4)

print("Die Texte wurden erfolgreich in texts.json gespeichert.")