import re | |
from pymongo import MongoClient | |
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer | |
from src.utils.markdown_processing.CustomMarkdownAnalyzer.event_texts import TEXTS | |
def get_dates(text):
    """Return every date written as ``DD.MM.YYYY`` that occurs in *text*.

    A date is two digits, a dot, two digits, a dot and four digits,
    delimited by word boundaries on both sides.

    Args:
        text: The string to search.

    Returns:
        A list of the matched date strings in order of appearance; an
        empty list when nothing matches.
    """
    date_regex = re.compile(r"\b(\d{2}\.\d{2}\.\d{4})\b")
    # Collect the single capture group of each match, left to right.
    return [match.group(1) for match in date_regex.finditer(text)]
def get_date_ranges(text):
    """Return all date ranges ``DD.MM.YYYY <sep> DD.MM.YYYY`` found in *text*.

    The separator between the two dates may be a hyphen, an en dash, an
    em dash, ``bis`` or ``bis zum``, optionally surrounded by whitespace.

    Args:
        text: The string to search.

    Returns:
        A list of ``(start, end)`` tuples of date strings in order of
        appearance; an empty list when no range is found.
    """
    # The typographic dashes are spelled as \u escapes (which ``re``
    # understands in str patterns) so the source stays pure ASCII. Without
    # them, ranges such as "01.02.2024 – 05.02.2024" would not be detected.
    pattern = (
        r"\b(\d{2}\.\d{2}\.\d{4})\b"
        r"\s*(?:[-\u2013\u2014]|bis(?:\s*zum)?)\s*"
        r"\b(\d{2}\.\d{2}\.\d{4})\b"
    )
    return re.findall(pattern, text)
# for text in TEXTS: | |
# analyzer = MarkdownAnalyzer(text) | |
# result = analyzer.identify_all() | |
# result = sorted(result, key=lambda el: el.line) | |
# for element in result: | |
# if get_date_ranges(element.text): | |
# print(f"Found Date Range in {element.text}\n") | |
# elif get_dates(element.text): | |
# print(f"Found Date in {element.text}\n") | |
# For every sample text: split it into segments and gather, per segment,
# the dates mentioned by its elements, then print the text next to the
# collected dates.
for text in TEXTS:
    analyzer = MarkdownAnalyzer(text)
    segments = analyzer.segmentation()
    text_dates = []
    for segment in segments:
        segment_dates = []
        for element in segment:
            # Range matches take precedence: single dates are only looked
            # up when the element contains no date range at all.
            found = get_date_ranges(element.text) or get_dates(element.text)
            segment_dates.extend(found)
        text_dates.append(segment_dates)
    print(text)
    print(text_dates)
    print("*" * 100)
# uri = f"mongodb+srv://event_data_extraction_application:[email protected]/?retryWrites=true&w=majority&appName=Cluster0" | |
# client = MongoClient(uri) | |
# db = client.event_data | |
# | |
# event_urls = db.event_urls | |
# | |
# for event in event_urls.find({"data": {"$ne": None}}, {"data": 1, "_id": 0}): | |
# analyzer = MarkdownAnalyzer(event["data"]) | |
# result = analyzer.identify_all() | |
# result = sorted(result, key=lambda el: el.line) | |
# for element in result: | |
# if has_date(element.text): | |
# | |
# print(f"Found Date in {element.text}\n") | |