 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import json
import os
import re
import string

import markdown
from bs4 import BeautifulSoup

from src.persistence.db import *
from src.utils.helpers import normalize_data
from src.utils.markdown_processing.md_preprocessing import convert_html_to_md
# Connection string comes from the environment so credentials do not live in
# source control; the literal fallback preserves the original behaviour for
# existing deployments.
# NOTE(review): SECURITY — rotate this password and delete the fallback once
# MONGODB_URI is set everywhere; a credential committed to a repo is burned.
uri = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://event_data_extraction_application:[email protected]/?retryWrites=true&w=majority&appName=Cluster0",
)
# MongoClient is provided by the star-import from src.persistence.db.
client = MongoClient(uri)
db = client.event_data

unsorted_urls = db.unsorted_urls  # kept for parity with sibling scripts; unused below
event_urls = db.event_urls

# Matches a markdown link or image — optional '!', '[text]', '(target)' —
# and keeps only the link text (capture group 2). Compiled once, used in a loop.
_LINK_RE = re.compile(r'(!)?\[(.*?)\]\([^)]*\)')


def _extract_clean_text(raw_md: str) -> str:
    """Round-trip markdown through HTML to strip links, then normalize.

    Renders *raw_md* to HTML, replaces every <a> tag with a <p> holding just
    its visible text (so URLs do not survive the trip back to markdown),
    converts back to markdown, removes non-printable characters, unwraps
    remaining markdown links/images, and returns the normalized text.
    """
    html = markdown.markdown(raw_md)
    soup = BeautifulSoup(html, "lxml")
    for tag in soup.find_all("a"):
        p_tag = soup.new_tag("p")
        p_tag.string = tag.get_text()
        tag.replace_with(p_tag)
    text = convert_html_to_md(soup.prettify())
    # Drop control/unprintable characters but keep newlines intact.
    text = ''.join(ch for ch in text if ch.isprintable() or ch == "\n")
    # Applied twice on purpose: a second pass unwraps links nested one level
    # deep (e.g. a link inside an image caption) that the first pass exposes.
    text = _LINK_RE.sub(r'\2', text)
    text = _LINK_RE.sub(r'\2', text)
    return normalize_data(text)


texts = []
for event in event_urls.find():
    # Skip documents with a missing or empty "data" field.
    if event.get("data"):
        texts.append(_extract_clean_text(event["data"]))

print(len(texts))

# Save the collected texts to a JSON file.
output = {"texts": texts}

with open("texts.json", "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=4)

print("Die Texte wurden erfolgreich in texts.json gespeichert.")