from collections import defaultdict
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownElements import (
    Header, CodeBlock, OrderedList, UnorderedList, Table, HTMLBlock, Paragraph, Blockquote, Emphasis
)
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownParser import MarkdownParser, InlineParser
class MarkdownAnalyzer:
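    """Analyzes a Markdown document from parsed tokens.

    Exposes identify_* helpers for block and inline elements, a segmentation()
    helper that splits the document at headers (and promoted emphasis lines),
    and analyse() for summary statistics.
    """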
def __init__(self, text, encoding='utf-8'):
self.text = text
parser = MarkdownParser(self.text)
self.tokens = parser.parse()
self.references = parser.references
self.footnotes = parser.footnotes
self.inline_parser = InlineParser(references=self.references, footnotes=self.footnotes)
self._parse_inline_tokens()
    @classmethod
    def from_file(cls, file_path, encoding='utf-8'):
        # Read the file and build an analyzer from its contents.
        with open(file_path, 'r', encoding=encoding) as f:
            text = f.read()
        return cls(text, encoding=encoding)
    def identify_all(self):
        lists = self.identify_lists()
        return {
            "block_elements": sorted([
                *self.identify_headers().get("Header", []),
                *self.identify_paragraphs().get("Paragraph", []),
                *self.identify_blockquotes().get("Blockquote", []),
                *self.identify_code_blocks().get("Code block", []),
                *lists.get("Unordered list", []),
                *lists.get("Ordered list", []),
                *self.identify_tables().get("Table", []),
                *self.identify_html_blocks(),
            ], key=lambda el: el.line),
            "inline_elements": sorted(self.identify_emphasis(), key=lambda el: el.line)
        }
    def segmentation(self):
        md_elements = self.identify_all()
        segments = []
        emphases = [el for el in md_elements["inline_elements"] if isinstance(el, Emphasis)]
        paragraphs = [el for el in md_elements["block_elements"] if isinstance(el, Paragraph)]
        # Promote paragraphs that consist solely of an emphasis to block level so they can open a segment.
        for emphasis in emphases:
            for paragraph in paragraphs:
                if paragraph.text == emphasis.text and paragraph in md_elements["block_elements"]:
                    md_elements["block_elements"].append(emphasis)
                    md_elements["inline_elements"].remove(emphasis)
                    md_elements["block_elements"].remove(paragraph)
                    break
        # Re-sort so promoted elements fall back into document order.
        block_elements = sorted(md_elements["block_elements"], key=lambda el: el.line)
        if not block_elements:
            return segments
        current_segment = [block_elements[0]]
        for element in block_elements[1:]:
            # Headers and promoted emphasis elements start a new segment.
            if isinstance(element, (Header, Emphasis)):
                segments.append(current_segment)
                current_segment = [element]
            else:
                current_segment.append(element)
        if current_segment:
            segments.append(current_segment)
        return segments
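    # Each identify_* helper below walks the parsed token stream and collects one element type.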
def identify_headers(self):
result = defaultdict(list)
for token in self.tokens:
if token.type == 'header':
result["Header"].append(Header(line=token.line, text=token.content, level=token.level))
return dict(result)
def identify_paragraphs(self):
result = defaultdict(list)
for token in self.tokens:
if token.type == 'paragraph':
result["Paragraph"].append(Paragraph(line=token.line, text=token.content))
return dict(result)
def identify_blockquotes(self):
result = defaultdict(list)
for token in self.tokens:
if token.type == 'blockquote':
result["Blockquote"].append(Blockquote(line=token.line, text=token.content))
return dict(result)
def identify_code_blocks(self):
result = defaultdict(list)
for token in self.tokens:
if token.type == 'code':
result["Code block"].append(
CodeBlock(line=token.line, content=token.content, language=token.meta.get("language")))
return dict(result)
def identify_lists(self):
result = defaultdict(list)
for token in self.tokens:
if token.type == 'ordered_list':
result["Ordered list"].append(OrderedList(line=token.line, items=token.meta["items"]))
elif token.type == 'unordered_list':
result["Unordered list"].append(UnorderedList(line=token.line, items=token.meta["items"]))
return dict(result)
def identify_tables(self):
result = defaultdict(list)
for token in self.tokens:
if token.type == 'table':
result["Table"].append(Table(line=token.line, header=token.meta["header"], rows=token.meta["rows"]))
return dict(result)
def identify_html_blocks(self):
result = []
for token in self.tokens:
if token.type == 'html_block':
result.append(HTMLBlock(line=token.line, text=token.content))
return result
def _parse_inline_tokens(self):
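        """Run the inline parser over text-bearing block tokens and merge the results into each token's meta dict."""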
inline_types = ('paragraph', 'header', 'blockquote')
for token in self.tokens:
if token.type in inline_types and token.content:
inline_data = self.inline_parser.parse_inline(token.content)
token.meta.update(inline_data)
def identify_links(self):
result = defaultdict(list)
for token in self.tokens:
if "text_links" in token.meta:
for l in token.meta["text_links"]:
result["Text link"].append({"line": token.line, "text": l["text"], "url": l["url"]})
if "image_links" in token.meta:
for img in token.meta["image_links"]:
result["Image link"].append({"line": token.line, "alt_text": img["alt_text"], "url": img["url"]})
return dict(result)
def identify_footnotes(self):
result = []
seen = set()
for token in self.tokens:
if "footnotes_used" in token.meta:
for fn in token.meta["footnotes_used"]:
key = (fn["id"], fn["content"])
if key not in seen:
seen.add(key)
result.append({"line": token.line, "id": fn["id"], "content": fn["content"]})
return result
def identify_inline_code(self):
codes = []
for token in self.tokens:
if "inline_code" in token.meta:
for c in token.meta["inline_code"]:
codes.append({"line": token.line, "code": c})
return codes
def identify_emphasis(self):
ems = []
for token in self.tokens:
if "emphasis" in token.meta:
for e in token.meta["emphasis"]:
ems.append(Emphasis(line=token.line, text=e))
return ems
def identify_task_items(self):
tasks = []
for token in self.tokens:
if token.type in ('ordered_list', 'unordered_list'):
for it in token.meta["items"]:
if it.get("task_item"):
tasks.append({
"line": token.line,
"text": it["text"],
"checked": it["checked"]
})
return tasks
def identify_html_inline(self):
        # Collect inline HTML tags from all inline-capable tokens
result = []
inline_types = ('paragraph', 'header', 'blockquote')
for token in self.tokens:
if token.type in inline_types and "html_inline" in token.meta:
for h in token.meta["html_inline"]:
result.append({"line": token.line, "html": h})
return result
def count_words(self):
words = self.text.split()
return len(words)
def count_characters(self):
characters = [char for char in self.text if not char.isspace()]
return len(characters)
def analyse(self):
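        """Return summary counts for block elements, inline HTML, words, and characters."""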
headers = self.identify_headers().get("Header", [])
paragraphs = self.identify_paragraphs().get("Paragraph", [])
blockquotes = self.identify_blockquotes().get("Blockquote", [])
code_blocks = self.identify_code_blocks().get("Code block", [])
lists = self.identify_lists()
ordered_lists = lists.get("Ordered list", [])
unordered_lists = lists.get("Unordered list", [])
tables = self.identify_tables().get("Table", [])
html_blocks = self.identify_html_blocks()
html_inline = self.identify_html_inline()
analysis = {
'headers': len(headers),
'paragraphs': len(paragraphs),
'blockquotes': len(blockquotes),
'code_blocks': len(code_blocks),
'ordered_lists': sum(len(l) for l in ordered_lists),
'unordered_lists': sum(len(l) for l in unordered_lists),
'tables': len(tables),
'html_blocks': len(html_blocks),
'html_inline_count': len(html_inline),
'words': self.count_words(),
'characters': self.count_characters()
}
return analysis
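
# Minimal usage sketch. Assumptions: this module is importable as-is and the sample
# markdown below is purely illustrative; it exercises only the public methods defined above.
if __name__ == "__main__":
    sample = (
        "# Title\n"
        "\n"
        "Some *emphasised* text with a [link](https://example.com).\n"
        "\n"
        "## Section\n"
        "\n"
        "- item one\n"
        "- item two\n"
    )
    analyzer = MarkdownAnalyzer(sample)
    # Summary counts (assumes the element classes behave as analyse() expects).
    print(analyzer.analyse())
    # Per-type lookups return dicts keyed by element name, e.g. {"Header": [...]}.
    print(analyzer.identify_headers())
    print(analyzer.identify_links())
    # Segments are lists of block elements, each starting at a header or promoted emphasis.
    for segment in analyzer.segmentation():
        print([el.__class__.__name__ for el in segment])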