# event-data-extraction-playground/src/utils/markdown_processing/CustomMarkdownAnalyzer/MarkdownAnalyzer.py
from collections import defaultdict | |
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownElements import Header, CodeBlock, OrderedList, \ | |
UnorderedList, Table, HTMLBlock, Paragraph, Blockquote, Emphasis | |
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownParser import MarkdownParser, InlineParser | |
class MarkdownAnalyzer:
    """Tokenize Markdown text and expose queries over the resulting tokens.

    On construction the text is parsed with ``MarkdownParser``; tokens of
    type paragraph, header or blockquote that have content are then passed
    through ``InlineParser`` and the inline data is merged into their
    ``meta``. The ``identify_*`` methods read from ``self.tokens``.
    """

    # Token types whose content is scanned for inline markup.
    _INLINE_TYPES = ('paragraph', 'header', 'blockquote')

    def __init__(self, text, encoding='utf-8'):
        """Parse *text* and pre-compute inline data for its tokens.

        Args:
            text: Markdown source to analyze.
            encoding: Accepted for interface compatibility; it is not used
                when the text is supplied directly.
        """
        self.text = text
        parser = MarkdownParser(self.text)
        self.tokens = parser.parse()
        self.references = parser.references
        self.footnotes = parser.footnotes
        self.inline_parser = InlineParser(references=self.references, footnotes=self.footnotes)
        self._parse_inline_tokens()

    @classmethod
    def from_file(cls, file_path, encoding='utf-8'):
        """Build an analyzer from the contents of *file_path*.

        This is an alternate constructor, so it is a classmethod and can be
        called as ``MarkdownAnalyzer.from_file(path)``. Calling it on an
        existing instance still works and returns a new analyzer; the
        instance it is called on is left untouched.

        Args:
            file_path: Path of the Markdown file to read.
            encoding: Text encoding used to open the file.

        Returns:
            A new analyzer for the file's text.
        """
        with open(file_path, 'r', encoding=encoding) as f:
            text = f.read()
        return cls(text, encoding=encoding)

    def identify_all(self):
        """Collect block and inline elements, each sorted by source line.

        Returns:
            A dict with two keys: ``"block_elements"`` (headers, paragraphs,
            blockquotes, code blocks, unordered/ordered lists, tables and
            HTML blocks) and ``"inline_elements"`` (emphasis). Both lists
            are ordered by each element's ``line`` attribute.
        """
        lists = self.identify_lists()  # scan the tokens once for both list kinds
        block_elements = [
            *self.identify_headers().get("Header", []),
            *self.identify_paragraphs().get("Paragraph", []),
            *self.identify_blockquotes().get("Blockquote", []),
            *self.identify_code_blocks().get("Code block", []),
            *lists.get("Unordered list", []),
            *lists.get("Ordered list", []),
            *self.identify_tables().get("Table", []),
            *self.identify_html_blocks(),
        ]
        # list.sort is stable, so elements sharing a line keep the
        # concatenation order above.
        block_elements.sort(key=lambda el: el.line)
        inline_elements = sorted(self.identify_emphasis(), key=lambda el: el.line)
        return {
            "block_elements": block_elements,
            "inline_elements": inline_elements,
        }

    def segmentation(self):
        """Split the document's block elements into heading-led segments.

        A paragraph whose text equals the text of an emphasis element is
        treated as a pseudo-heading: the emphasis element takes the
        paragraph's place in the block sequence. A new segment then starts
        at every ``Header`` or ``Emphasis`` element (except the very first
        element, which always opens the first segment).

        Returns:
            A list of segments, each a non-empty list of elements in
            document order. Empty when there are no block elements.
        """
        md_elements = self.identify_all()
        block_elements = self._promote_standalone_emphasis(
            md_elements["block_elements"], md_elements["inline_elements"]
        )
        if not block_elements:
            return []

        segments = []
        current_segment = []
        for element in block_elements:
            # An empty current_segment means this is the first element; it
            # opens the first segment whatever its type.
            if current_segment and isinstance(element, (Header, Emphasis)):
                segments.append(current_segment)
                current_segment = [element]
            else:
                current_segment.append(element)
        segments.append(current_segment)
        return segments

    @staticmethod
    def _promote_standalone_emphasis(block_elements, inline_elements):
        """Return block elements with emphasis-only paragraphs replaced in place.

        Each emphasis element is promoted at most once, and the replacement
        keeps the paragraph's position so segment order follows the document.
        """
        remaining = [el for el in inline_elements if isinstance(el, Emphasis)]
        promoted = []
        for element in block_elements:
            match_index = None
            if isinstance(element, Paragraph):
                match_index = next(
                    (i for i, em in enumerate(remaining) if em.text == element.text),
                    None,
                )
            if match_index is None:
                promoted.append(element)
            else:
                promoted.append(remaining.pop(match_index))
        return promoted

    def identify_headers(self):
        """Return ``{"Header": [...]}`` for header tokens, or ``{}`` if none."""
        headers = [
            Header(line=token.line, text=token.content, level=token.level)
            for token in self.tokens
            if token.type == 'header'
        ]
        return {"Header": headers} if headers else {}

    def identify_paragraphs(self):
        """Return ``{"Paragraph": [...]}`` for paragraph tokens, or ``{}`` if none."""
        paragraphs = [
            Paragraph(line=token.line, text=token.content)
            for token in self.tokens
            if token.type == 'paragraph'
        ]
        return {"Paragraph": paragraphs} if paragraphs else {}

    def identify_blockquotes(self):
        """Return ``{"Blockquote": [...]}`` for blockquote tokens, or ``{}`` if none."""
        blockquotes = [
            Blockquote(line=token.line, text=token.content)
            for token in self.tokens
            if token.type == 'blockquote'
        ]
        return {"Blockquote": blockquotes} if blockquotes else {}

    def identify_code_blocks(self):
        """Return ``{"Code block": [...]}`` for code tokens, or ``{}`` if none.

        The language is read from ``token.meta`` and is ``None`` when absent.
        """
        code_blocks = [
            CodeBlock(line=token.line, content=token.content, language=token.meta.get("language"))
            for token in self.tokens
            if token.type == 'code'
        ]
        return {"Code block": code_blocks} if code_blocks else {}

    def identify_lists(self):
        """Return list elements grouped under "Ordered list" / "Unordered list".

        A key is present only when at least one list of that kind exists.
        """
        result = defaultdict(list)
        for token in self.tokens:
            if token.type == 'ordered_list':
                result["Ordered list"].append(OrderedList(line=token.line, items=token.meta["items"]))
            elif token.type == 'unordered_list':
                result["Unordered list"].append(UnorderedList(line=token.line, items=token.meta["items"]))
        return dict(result)

    def identify_tables(self):
        """Return ``{"Table": [...]}`` for table tokens, or ``{}`` if none."""
        tables = [
            Table(line=token.line, header=token.meta["header"], rows=token.meta["rows"])
            for token in self.tokens
            if token.type == 'table'
        ]
        return {"Table": tables} if tables else {}

    def identify_html_blocks(self):
        """Return a list of ``HTMLBlock`` elements for html_block tokens."""
        return [
            HTMLBlock(line=token.line, text=token.content)
            for token in self.tokens
            if token.type == 'html_block'
        ]

    def _parse_inline_tokens(self):
        """Merge inline-parser output into the meta of inline-capable tokens."""
        for token in self.tokens:
            if token.type in self._INLINE_TYPES and token.content:
                inline_data = self.inline_parser.parse_inline(token.content)
                token.meta.update(inline_data)

    def identify_links(self):
        """Return text and image links grouped under "Text link" / "Image link".

        Each entry is a dict carrying the token's line plus the link fields.
        A key is present only when at least one link of that kind exists.
        """
        result = defaultdict(list)
        for token in self.tokens:
            if "text_links" in token.meta:
                for link in token.meta["text_links"]:
                    result["Text link"].append(
                        {"line": token.line, "text": link["text"], "url": link["url"]}
                    )
            if "image_links" in token.meta:
                for img in token.meta["image_links"]:
                    result["Image link"].append(
                        {"line": token.line, "alt_text": img["alt_text"], "url": img["url"]}
                    )
        return dict(result)

    def identify_footnotes(self):
        """Return used footnotes, de-duplicated on (id, content).

        The reported line is that of the first token using the footnote.
        """
        result = []
        seen = set()
        for token in self.tokens:
            if "footnotes_used" in token.meta:
                for fn in token.meta["footnotes_used"]:
                    key = (fn["id"], fn["content"])
                    if key not in seen:
                        seen.add(key)
                        result.append({"line": token.line, "id": fn["id"], "content": fn["content"]})
        return result

    def identify_inline_code(self):
        """Return ``{"line", "code"}`` dicts for every inline code span."""
        return [
            {"line": token.line, "code": code}
            for token in self.tokens
            if "inline_code" in token.meta
            for code in token.meta["inline_code"]
        ]

    def identify_emphasis(self):
        """Return an ``Emphasis`` element for every emphasis entry in token meta."""
        return [
            Emphasis(line=token.line, text=emphasized)
            for token in self.tokens
            if "emphasis" in token.meta
            for emphasized in token.meta["emphasis"]
        ]

    def identify_task_items(self):
        """Return ``{"line", "text", "checked"}`` dicts for task-list items."""
        return [
            {"line": token.line, "text": item["text"], "checked": item["checked"]}
            for token in self.tokens
            if token.type in ('ordered_list', 'unordered_list')
            for item in token.meta["items"]
            if item.get("task_item")
        ]

    def identify_html_inline(self):
        """Return ``{"line", "html"}`` dicts for inline HTML tags.

        Only tokens of an inline-capable type are inspected.
        """
        return [
            {"line": token.line, "html": html}
            for token in self.tokens
            if token.type in self._INLINE_TYPES and "html_inline" in token.meta
            for html in token.meta["html_inline"]
        ]

    def count_words(self):
        """Return the number of whitespace-separated words in the raw text."""
        return len(self.text.split())

    def count_characters(self):
        """Return the number of non-whitespace characters in the raw text."""
        return sum(1 for char in self.text if not char.isspace())

    def analyse(self):
        """Return a dict of element counts plus word and character totals."""
        headers = self.identify_headers().get("Header", [])
        paragraphs = self.identify_paragraphs().get("Paragraph", [])
        blockquotes = self.identify_blockquotes().get("Blockquote", [])
        code_blocks = self.identify_code_blocks().get("Code block", [])
        lists = self.identify_lists()
        ordered_lists = lists.get("Ordered list", [])
        unordered_lists = lists.get("Unordered list", [])
        tables = self.identify_tables().get("Table", [])
        html_blocks = self.identify_html_blocks()
        html_inline = self.identify_html_inline()

        return {
            'headers': len(headers),
            'paragraphs': len(paragraphs),
            'blockquotes': len(blockquotes),
            'code_blocks': len(code_blocks),
            # NOTE(review): len() is applied to the list element objects
            # themselves; this presumably relies on OrderedList/UnorderedList
            # defining __len__ -- confirm against MarkdownElements.
            'ordered_lists': sum(len(lst) for lst in ordered_lists),
            'unordered_lists': sum(len(lst) for lst in unordered_lists),
            'tables': len(tables),
            'html_blocks': len(html_blocks),
            'html_inline_count': len(html_inline),
            'words': self.count_words(),
            'characters': self.count_characters(),
        }