Spaces:

adojode
/

event-data-extraction-playground

Running

event-data-extraction-playground / src /utils /markdown_processing /CustomMarkdownAnalyzer /MarkdownAnalyzer.py

manaviel85370

add pages and all

da88570 2 months ago

9.19 kB

	from collections import defaultdict

	from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownElements import Header, CodeBlock, OrderedList, \
	UnorderedList, Table, HTMLBlock, Paragraph, Blockquote, Emphasis
	from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownParser import MarkdownParser, InlineParser


	class MarkdownAnalyzer:
	    """Analyse a Markdown document tokenized by ``MarkdownParser``.

	    The constructor parses the text once. The ``identify_*`` methods then
	    walk ``self.tokens`` and report what they find, either as element objects
	    (``Header``, ``Paragraph``, ...) or as plain dictionaries.
	    """

	    # Token types whose content is run through the inline parser.
	    _INLINE_TYPES = ('paragraph', 'header', 'blockquote')

	    def __init__(self, text, encoding='utf-8'):
	        """Parse ``text`` and attach inline metadata to its tokens.

	        Args:
	            text: Markdown source to analyse.
	            encoding: Not used by the constructor itself; kept so existing
	                callers (and ``from_file``) can keep passing it.
	        """
	        self.text = text
	        parser = MarkdownParser(self.text)
	        self.tokens = parser.parse()
	        self.references = parser.references
	        self.footnotes = parser.footnotes
	        self.inline_parser = InlineParser(references=self.references, footnotes=self.footnotes)

	        self._parse_inline_tokens()

	    @classmethod
	    def from_file(cls, file_path, encoding='utf-8'):
	        """Build an analyzer from the contents of a file.

	        Args:
	            file_path: Path of the Markdown file to read.
	            encoding: Text encoding used to open the file.

	        Returns:
	            An instance of ``cls`` (so subclasses get their own type back).
	        """
	        # Keep the text in a local: assigning it to the first argument of a
	        # classmethod would store it on the class, shared by every instance.
	        with open(file_path, 'r', encoding=encoding) as f:
	            text = f.read()
	        return cls(text, encoding=encoding)

	    def identify_all(self):
	        """Collect block and inline elements, each list sorted by line.

	        Returns:
	            A dict with two keys: ``"block_elements"`` (headers, paragraphs,
	            blockquotes, code blocks, lists, tables, HTML blocks) and
	            ``"inline_elements"`` (emphasis elements).
	        """
	        # Scan the list tokens once and reuse the result for both list kinds.
	        lists = self.identify_lists()
	        block_elements = [
	            *self.identify_headers().get("Header", []),
	            *self.identify_paragraphs().get("Paragraph", []),
	            *self.identify_blockquotes().get("Blockquote", []),
	            *self.identify_code_blocks().get("Code block", []),
	            *lists.get("Unordered list", []),
	            *lists.get("Ordered list", []),
	            *self.identify_tables().get("Table", []),
	            *self.identify_html_blocks(),
	        ]
	        return {
	            "block_elements": sorted(block_elements, key=lambda el: el.line),
	            "inline_elements": sorted(self.identify_emphasis(), key=lambda el: el.line),
	        }

	    def segmentation(self):
	        """Split the block elements into segments opened by a heading.

	        A paragraph whose text equals the text of an emphasis element is
	        replaced, at its own position, by that emphasis element. Every
	        ``Header`` or ``Emphasis`` block after the first element then starts
	        a new segment.

	        Returns:
	            A list of segments, each a non-empty list of elements in line
	            order; an empty list when the document has no block elements.
	        """
	        md_elements = self.identify_all()
	        block_elements = md_elements["block_elements"]
	        if not block_elements:
	            return []

	        emphases = [el for el in md_elements["inline_elements"] if isinstance(el, Emphasis)]

	        # Swap emphasis-only paragraphs for their emphasis element in place,
	        # so the promoted element keeps the paragraph's position in the
	        # line-sorted sequence instead of drifting to the end.
	        promoted = []
	        for element in block_elements:
	            replacement = None
	            if isinstance(element, Paragraph):
	                replacement = next((em for em in emphases if em.text == element.text), None)
	            promoted.append(element if replacement is None else replacement)

	        segments = []
	        current_segment = []
	        for element in promoted:
	            # The very first element always opens the first segment.
	            if current_segment and isinstance(element, (Header, Emphasis)):
	                segments.append(current_segment)
	                current_segment = []
	            current_segment.append(element)
	        segments.append(current_segment)
	        return segments

	    def identify_headers(self):
	        """Return ``{"Header": [Header, ...]}``, or ``{}`` when none exist."""
	        headers = [
	            Header(line=token.line, text=token.content, level=token.level)
	            for token in self.tokens
	            if token.type == 'header'
	        ]
	        return {"Header": headers} if headers else {}

	    def identify_paragraphs(self):
	        """Return ``{"Paragraph": [Paragraph, ...]}``, or ``{}`` when none exist."""
	        paragraphs = [
	            Paragraph(line=token.line, text=token.content)
	            for token in self.tokens
	            if token.type == 'paragraph'
	        ]
	        return {"Paragraph": paragraphs} if paragraphs else {}

	    def identify_blockquotes(self):
	        """Return ``{"Blockquote": [Blockquote, ...]}``, or ``{}`` when none exist."""
	        blockquotes = [
	            Blockquote(line=token.line, text=token.content)
	            for token in self.tokens
	            if token.type == 'blockquote'
	        ]
	        return {"Blockquote": blockquotes} if blockquotes else {}

	    def identify_code_blocks(self):
	        """Return ``{"Code block": [CodeBlock, ...]}``, or ``{}`` when none exist."""
	        code_blocks = [
	            CodeBlock(line=token.line, content=token.content, language=token.meta.get("language"))
	            for token in self.tokens
	            if token.type == 'code'
	        ]
	        return {"Code block": code_blocks} if code_blocks else {}

	    def identify_lists(self):
	        """Group list tokens under ``"Ordered list"`` / ``"Unordered list"``.

	        Only the kinds that actually occur appear as keys.
	        """
	        result = defaultdict(list)
	        for token in self.tokens:
	            if token.type == 'ordered_list':
	                result["Ordered list"].append(OrderedList(line=token.line, items=token.meta["items"]))
	            elif token.type == 'unordered_list':
	                result["Unordered list"].append(UnorderedList(line=token.line, items=token.meta["items"]))
	        return dict(result)

	    def identify_tables(self):
	        """Return ``{"Table": [Table, ...]}``, or ``{}`` when none exist."""
	        tables = [
	            Table(line=token.line, header=token.meta["header"], rows=token.meta["rows"])
	            for token in self.tokens
	            if token.type == 'table'
	        ]
	        return {"Table": tables} if tables else {}

	    def identify_html_blocks(self):
	        """Return a list with one ``HTMLBlock`` per ``html_block`` token."""
	        return [
	            HTMLBlock(line=token.line, text=token.content)
	            for token in self.tokens
	            if token.type == 'html_block'
	        ]

	    def _parse_inline_tokens(self):
	        """Merge inline-parser results into the ``meta`` of text-bearing tokens."""
	        for token in self.tokens:
	            if token.type in self._INLINE_TYPES and token.content:
	                inline_data = self.inline_parser.parse_inline(token.content)
	                token.meta.update(inline_data)

	    def identify_links(self):
	        """Group links recorded in token metadata by kind.

	        Returns:
	            A dict that may contain ``"Text link"`` (dicts with ``line``,
	            ``text``, ``url``) and ``"Image link"`` (dicts with ``line``,
	            ``alt_text``, ``url``); a kind with no links is omitted.
	        """
	        result = defaultdict(list)
	        for token in self.tokens:
	            for link in token.meta.get("text_links", ()):
	                result["Text link"].append({"line": token.line, "text": link["text"], "url": link["url"]})
	            for img in token.meta.get("image_links", ()):
	                result["Image link"].append({"line": token.line, "alt_text": img["alt_text"], "url": img["url"]})
	        return dict(result)

	    def identify_footnotes(self):
	        """List each distinct ``(id, content)`` footnote at its first use."""
	        result = []
	        seen = set()
	        for token in self.tokens:
	            for fn in token.meta.get("footnotes_used", ()):
	                key = (fn["id"], fn["content"])
	                if key in seen:
	                    continue
	                seen.add(key)
	                result.append({"line": token.line, "id": fn["id"], "content": fn["content"]})
	        return result

	    def identify_inline_code(self):
	        """Return ``{"line", "code"}`` dicts for every inline code span."""
	        return [
	            {"line": token.line, "code": code}
	            for token in self.tokens
	            for code in token.meta.get("inline_code", ())
	        ]

	    def identify_emphasis(self):
	        """Return an ``Emphasis`` element for every recorded emphasis span."""
	        return [
	            Emphasis(line=token.line, text=text)
	            for token in self.tokens
	            for text in token.meta.get("emphasis", ())
	        ]

	    def identify_task_items(self):
	        """Return ``{"line", "text", "checked"}`` dicts for task-list items."""
	        tasks = []
	        for token in self.tokens:
	            if token.type not in ('ordered_list', 'unordered_list'):
	                continue
	            for item in token.meta["items"]:
	                if item.get("task_item"):
	                    tasks.append({
	                        "line": token.line,
	                        "text": item["text"],
	                        "checked": item["checked"],
	                    })
	        return tasks

	    def identify_html_inline(self):
	        """Return ``{"line", "html"}`` dicts for inline HTML tags.

	        Only tokens of the inline-parsed types are inspected.
	        """
	        return [
	            {"line": token.line, "html": tag}
	            for token in self.tokens
	            if token.type in self._INLINE_TYPES
	            for tag in token.meta.get("html_inline", ())
	        ]

	    def count_words(self):
	        """Return the number of whitespace-separated words in the raw text."""
	        return len(self.text.split())

	    def count_characters(self):
	        """Return the number of non-whitespace characters in the raw text."""
	        return sum(1 for char in self.text if not char.isspace())

	    def analyse(self):
	        """Summarise the document as a dict of counts.

	        ``ordered_lists`` and ``unordered_lists`` are total item counts
	        across all lists of that kind; the other structural entries are
	        element counts. Counts are taken straight from the tokens, so no
	        element objects are built just to be counted.
	        """
	        type_counts = defaultdict(int)
	        list_items = defaultdict(int)
	        for token in self.tokens:
	            type_counts[token.type] += 1
	            if token.type in ('ordered_list', 'unordered_list'):
	                list_items[token.type] += len(token.meta["items"])

	        return {
	            'headers': type_counts['header'],
	            'paragraphs': type_counts['paragraph'],
	            'blockquotes': type_counts['blockquote'],
	            'code_blocks': type_counts['code'],
	            'ordered_lists': list_items['ordered_list'],
	            'unordered_lists': list_items['unordered_list'],
	            'tables': type_counts['table'],
	            'html_blocks': type_counts['html_block'],
	            'html_inline_count': len(self.identify_html_inline()),
	            'words': self.count_words(),
	            'characters': self.count_characters(),
	        }