"""Exploratory script: locate dotted numeric dates in segmented event texts.

For every sample text in ``TEXTS`` the text is segmented with
``MarkdownAnalyzer.segmentation()`` and each segment is scanned for dates
written like ``24.12.2024``.  The text, the per-segment findings and a
separator line are printed to stdout.

The scan runs at module level, so importing this module executes it.
"""
import re

from pymongo import MongoClient

from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer
from src.utils.markdown_processing.CustomMarkdownAnalyzer.event_texts import TEXTS

# Compiled once at import time instead of being rebuilt on every call.
_DATE_RE = re.compile(r"\b(\d{2}\.\d{2}\.\d{4})\b")
# Two dates joined by "-", "bis" or "bis zum" (optional whitespace around).
# NOTE(review): only the ASCII hyphen is accepted as a dash separator; a range
# written with an en dash falls through to get_dates() as two single dates.
_DATE_RANGE_RE = re.compile(
    r"\b(\d{2}\.\d{2}\.\d{4})\b\s*(?:\-|bis|bis\s*zum)\s*\b(\d{2}\.\d{2}\.\d{4})\b"
)


def get_dates(text):
    """Return every ``NN.NN.NNNN`` date string found in *text*.

    Args:
        text: The string to scan.

    Returns:
        A list of the matched date strings in order of appearance; empty
        when nothing matches.
    """
    return _DATE_RE.findall(text)


def get_date_ranges(text):
    """Return every date range found in *text*.

    A range is two ``NN.NN.NNNN`` dates separated by ``-``, ``bis`` or
    ``bis zum``.

    Args:
        text: The string to scan.

    Returns:
        A list of ``(start, end)`` string tuples in order of appearance;
        empty when nothing matches.
    """
    return _DATE_RANGE_RE.findall(text)


def _dates_in(text):
    """Return the date ranges in *text*, or the single dates if no range exists.

    Ranges take precedence: when at least one range matches, single dates in
    the same text are not reported.  The result is therefore either a list of
    2-tuples or a list of strings.
    """
    return get_date_ranges(text) or get_dates(text)


# Earlier variant, kept for reference: scan the flat element list instead of
# the segments.
# for text in TEXTS:
#     analyzer = MarkdownAnalyzer(text)
#     result = analyzer.identify_all()
#     result = sorted(result, key=lambda el: el.line)
#     for element in result:
#         if get_date_ranges(element.text):
#             print(f"Found Date Range in {element.text}\n")
#         elif get_dates(element.text):
#             print(f"Found Date in {element.text}\n")

for text in TEXTS:
    analyzer = MarkdownAnalyzer(text)
    segments = analyzer.segmentation()

    # One inner list per segment, holding whatever was found in its elements.
    text_dates = []
    for segment in segments:
        segment_dates = []
        for element in segment:
            segment_dates.extend(_dates_in(element.text))
        text_dates.append(segment_dates)

    print(text)
    print(text_dates)
    print("*" * 100)

# Alternative input source, kept for reference: scan event pages stored in
# MongoDB instead of the bundled sample texts.
# NOTE(security): a connection string with an embedded username and password
# used to be written out here.  It has been removed; that credential should be
# rotated, and the URI must be supplied via the environment instead.
# uri = os.environ["EVENT_DB_URI"]  # requires `import os`
# client = MongoClient(uri)
# db = client.event_data
#
# event_urls = db.event_urls
#
# for event in event_urls.find({"data": {"$ne": None}}, {"data": 1, "_id": 0}):
#     analyzer = MarkdownAnalyzer(event["data"])
#     result = analyzer.identify_all()
#     result = sorted(result, key=lambda el: el.line)
#     for element in result:
#         if get_dates(element.text):
#             print(f"Found Date in {element.text}\n")