|
import json
import os
import re
import string

import markdown
from bs4 import BeautifulSoup

from src.persistence.db import *
from src.utils.helpers import normalize_data
from src.utils.markdown_processing.md_preprocessing import convert_html_to_md
|
# Connection string for the MongoDB cluster that stores the event data.
#
# SECURITY: the fallback literal below embeds credentials in source control.
# Supply the full connection string through the MONGODB_URI environment
# variable instead; the literal is kept only so existing setups keep working.
# NOTE(review): the "[email protected]" part of the literal looks like it was
# rewritten by an e-mail obfuscation tool -- confirm the real value.
_DEFAULT_MONGODB_URI = "mongodb+srv://event_data_extraction_application:[email protected]/?retryWrites=true&w=majority&appName=Cluster0"

# An unset or empty MONGODB_URI falls back to the original hard-coded value.
uri = os.environ.get("MONGODB_URI") or _DEFAULT_MONGODB_URI

# MongoClient is not imported by name in this file; presumably it is provided
# by the wildcard import from src.persistence.db -- verify.
client = MongoClient(uri)
db = client.event_data

# Collections read by this script.
unsorted_urls = db.unsorted_urls
event_urls = db.event_urls
|
|
|
# Matches Markdown links and images: an optional "!", "[label]", "(target)".
# Group 2 is the label, which is what the substitution keeps.
_MD_LINK_RE = re.compile(r'(!)?\[(.*?)\]\([^)]*\)')


def _markdown_to_plain_text(raw_markdown):
    """Return a cleaned-up version of one event's Markdown text.

    Steps, in order:
      1. Render the Markdown to HTML and parse it with BeautifulSoup (lxml).
      2. Replace every ``<a>`` element with a ``<p>`` holding only its text,
         so link targets are dropped.
      3. Convert the prettified HTML back with ``convert_html_to_md``.
      4. Drop every non-printable character except newlines.
      5. Replace remaining Markdown link/image syntax with its label.
      6. Pass the result through ``normalize_data`` (project helper; what it
         normalizes is not visible from this file).
    """
    soup = BeautifulSoup(markdown.markdown(raw_markdown), "lxml")

    for anchor in soup.find_all("a"):
        paragraph = soup.new_tag("p")
        paragraph.string = anchor.get_text()
        anchor.replace_with(paragraph)

    text = convert_html_to_md(soup.prettify())
    text = "".join(ch for ch in text if ch.isprintable() or ch == "\n")

    # The substitution is applied twice on purpose: for a nested construct
    # such as "[![alt](img)](href)" the first pass yields "![alt](href)" and
    # only the second pass reduces it to "alt".
    for _ in range(2):
        text = _MD_LINK_RE.sub(r'\2', text)

    return normalize_data(text)


# Cleaned texts of all events that carry non-empty "data".
texts = []

for event in event_urls.find():
    raw_markdown = event.get("data")
    if not raw_markdown:
        # Skip documents without a "data" field or with an empty one.
        continue
    texts.append(_markdown_to_plain_text(raw_markdown))
|
|
|
# Report how many texts were collected before they are written out.
text_count = len(texts)
print(text_count)

# The JSON root is an object with the list stored under the "texts" key.
output = dict(texts=texts)

# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes;
# the file is opened as UTF-8 to match.
with open("texts.json", mode="w", encoding="utf-8") as json_file:
    json.dump(output, json_file, indent=4, ensure_ascii=False)

print("Die Texte wurden erfolgreich in texts.json gespeichert.")
|
|
|
|