File size: 2,160 Bytes
da88570
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import re

from pymongo import MongoClient

from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer
from src.utils.markdown_processing.CustomMarkdownAnalyzer.event_texts import TEXTS

def get_dates(text):
    """Return all date-shaped substrings of *text* in order of appearance.

    A date here is purely a textual shape: two digits, a dot, two digits,
    a dot and four digits, delimited by word boundaries on both sides.
    The values are not checked against the calendar.

    Args:
        text: The string to scan.

    Returns:
        A list with the matched date strings; empty if none are found.
    """
    date_regex = re.compile(r"\b(\d{2}\.\d{2}\.\d{4})\b")
    return [hit.group(1) for hit in date_regex.finditer(text)]

# Textual shape of a single date: two digits, dot, two digits, dot, four digits.
_DATE_SHAPE = r"\b(\d{2}\.\d{2}\.\d{4})\b"

# Two dates joined by a range separator. Besides the ASCII hyphen, the en dash
# (U+2013) and em dash (U+2014) are accepted, since ranges are frequently
# typeset with them; "bis" may optionally be followed by "zum".
_DATE_RANGE_RE = re.compile(
    _DATE_SHAPE + r"\s*(?:[-\u2013\u2014]|bis(?:\s*zum)?)\s*" + _DATE_SHAPE
)


def get_date_ranges(text):
    """Return all date ranges found in *text* as ``(start, end)`` tuples.

    A range is two date-shaped substrings (``\\d{2}.\\d{2}.\\d{4}``) separated
    by optional whitespace and one of: a hyphen, an en dash, an em dash,
    ``bis`` or ``bis zum``. The dates are not validated as calendar dates and
    the order of start and end is not checked.

    Args:
        text: The string to scan.

    Returns:
        A list of 2-tuples of strings, in order of appearance; empty if no
        range is found.
    """
    return _DATE_RANGE_RE.findall(text)



# for text in TEXTS:
#     analyzer = MarkdownAnalyzer(text)
#     result = analyzer.identify_all()
#     result = sorted(result, key=lambda el: el.line)
#     for element in result:
#         if get_date_ranges(element.text):
#             print(f"Found Date Range in {element.text}\n")
#         elif get_dates(element.text):
#             print(f"Found Date in {element.text}\n")

# For every sample text: segment it, collect the dates found per segment and
# print the text followed by the collected dates and a separator line.
for text in TEXTS:
    analyzer = MarkdownAnalyzer(text)
    segments = analyzer.segmentation()
    # One inner list per segment. For each element, date ranges take
    # precedence: single dates are only looked up when the element contains
    # no range at all.
    text_dates = [
        [
            found
            for element in segment
            for found in (get_date_ranges(element.text) or get_dates(element.text))
        ]
        for segment in segments
    ]
    print(text)
    print(text_dates)
    print("*" * 100)

# uri = f"mongodb+srv://event_data_extraction_application:[email protected]/?retryWrites=true&w=majority&appName=Cluster0"
# client = MongoClient(uri)
# db = client.event_data
#
# event_urls = db.event_urls
#
# for event in event_urls.find({"data": {"$ne": None}}, {"data": 1, "_id": 0}):
#     analyzer = MarkdownAnalyzer(event["data"])
#     result = analyzer.identify_all()
#     result = sorted(result, key=lambda el: el.line)
#     for element in result:
#         if get_dates(element.text):
#
#             print(f"Found Date in {element.text}\n")