manaviel85370
add pages and all
da88570
import re
from pymongo import MongoClient
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer
from src.utils.markdown_processing.CustomMarkdownAnalyzer.event_texts import TEXTS
def get_dates(text):
    """Return every ``DD.MM.YYYY`` date string found in *text*.

    Only zero-padded dates (two-digit day and month, four-digit year) that
    stand on word boundaries are matched. Results keep their order of
    appearance; an empty list is returned when nothing matches.
    """
    date_regex = r"\b(\d{2}\.\d{2}\.\d{4})\b"
    return [hit.group(1) for hit in re.finditer(date_regex, text)]
def get_date_ranges(text):
    """Return all date ranges found in *text* as ``(start, end)`` tuples.

    A range is two zero-padded ``DD.MM.YYYY`` dates joined by one of:

    * a hyphen ``-``, an en dash (U+2013) or an em dash (U+2014),
    * the word ``bis``,
    * the phrase ``bis zum``.

    Whitespace around the separator is optional. Both dates are returned as
    the strings that appeared in the text, in order of appearance. An empty
    list is returned when no range is present.
    """
    # The typographic dashes are included because formatted event texts often
    # use them instead of the ASCII hyphen; without them such a range would
    # not be recognised at all.
    pattern = (
        r"\b(\d{2}\.\d{2}\.\d{4})\b"
        r"\s*(?:[-\u2013\u2014]|bis(?:\s*zum)?)\s*"
        r"\b(\d{2}\.\d{2}\.\d{4})\b"
    )
    return re.findall(pattern, text)
# for text in TEXTS:
# analyzer = MarkdownAnalyzer(text)
# result = analyzer.identify_all()
# result = sorted(result, key=lambda el: el.line)
# for element in result:
# if get_date_ranges(element.text):
# print(f"Found Date Range in {element.text}\n")
# elif get_dates(element.text):
# print(f"Found Date in {element.text}\n")
# For every sample event text, collect the dates found in each markdown
# segment and print the text followed by its per-segment date lists.
for text in TEXTS:
    text_dates = []
    for segment in MarkdownAnalyzer(text).segmentation():
        segment_dates = []
        for element in segment:
            # Date ranges take precedence: single dates are only looked up
            # when the element contains no range at all.
            found = get_date_ranges(element.text) or get_dates(element.text)
            segment_dates.extend(found)
        text_dates.append(segment_dates)
    print(text)
    print(text_dates)
    print("*" * 100)
# uri = f"mongodb+srv://event_data_extraction_application:[email protected]/?retryWrites=true&w=majority&appName=Cluster0"
# client = MongoClient(uri)
# db = client.event_data
#
# event_urls = db.event_urls
#
# for event in event_urls.find({"data": {"$ne": None}}, {"data": 1, "_id": 0}):
# analyzer = MarkdownAnalyzer(event["data"])
# result = analyzer.identify_all()
# result = sorted(result, key=lambda el: el.line)
# for element in result:
# if has_date(element.text):
#
# print(f"Found Date in {element.text}\n")