# event-data-extraction-playground/src/utils/markdown_processing/CustomMarkdownAnalyzer/MarkdownElements.py
from abc import abstractmethod | |
class MarkdownElement:
    """Base class for an element found in a Markdown document.

    Attributes:
        line: Position of the element in the source document
            (presumably a line number; verify against the caller).
        text: Text content of the element.
    """

    def __init__(self, line, text):
        """Store ``line`` and ``text`` on the instance."""
        self.line = line
        self.text = text

    def markdown(self):
        """Return the Markdown rendering of the element.

        The base implementation renders nothing and returns ``None``;
        subclasses override it.
        """
        return None

    def __repr__(self):
        # markdown() must be *called*: interpolating the bound method itself
        # prints "<bound method ...>" instead of the rendered text.
        return (
            f"{self.__class__.__name__}(line={self.line}, text={self.text}, "
            f"markdown={self.markdown()})"
        )

    def __str__(self):
        return f"[{self.__class__.__name__}] Line {self.line}: {self.text}"
class Header(MarkdownElement):
    """A Markdown heading.

    Attributes:
        level: Heading level, used as a 1-based index into ``annotations``.
    """

    # Heading prefixes, indexed by ``level - 1``.
    annotations = ["#", "##", "###", "####", "#####", "######"]

    def __init__(self, line, level, text):
        """Store the heading's ``line``, ``level`` and ``text``."""
        super().__init__(line, text)
        self.level = level

    def markdown(self):
        """Return the heading as ``"<hashes> <text>"``.

        Raises:
            IndexError: If ``level`` is greater than ``len(annotations)``.
        """
        return self.annotations[self.level - 1] + " " + self.text

    def __repr__(self):
        # markdown() must be *called*: interpolating the bound method itself
        # prints "<bound method ...>" instead of the rendered heading.
        return (
            f"Header(line={self.line}, level={self.level}, text={self.text}, "
            f"markdown={self.markdown()})"
        )

    def __str__(self):
        return f"[Header] Line {self.line}, Level {self.level}: {self.text}"
class Paragraph(MarkdownElement):
    """A plain paragraph; its Markdown rendering is its text unchanged."""

    def __init__(self, line, text):
        """Forward ``line`` and ``text`` to the base initializer."""
        super().__init__(line=line, text=text)

    def markdown(self):
        """Return the paragraph text as-is."""
        content = self.text
        return content
class Emphasis(MarkdownElement):
    """Emphasized text, rendered wrapped in double asterisks."""

    def __init__(self, line, text):
        """Forward ``line`` and ``text`` to the base initializer."""
        super().__init__(line=line, text=text)

    def markdown(self):
        """Return the text surrounded by ``**`` on both sides."""
        return "**{}**".format(self.text)
class Blockquote(MarkdownElement):
    """A block quote whose text may span several newline-separated lines."""

    def __init__(self, line, text):
        """Forward ``line`` and ``text`` to the base initializer."""
        super().__init__(line, text)

    def markdown(self):
        """Return the text with every line prefixed by ``"> "``.

        An empty text yields a single ``"> "`` line, because splitting an
        empty string on newlines produces one empty segment.
        """
        return "\n".join("> " + segment for segment in self.text.split("\n"))

    def __repr__(self):
        # markdown() must be *called*: interpolating the bound method itself
        # prints "<bound method ...>" instead of the rendered quote.
        return (
            f"[{self.__class__.__name__}](line={self.line}, "
            f"content={self.text}, markdown={self.markdown()})"
        )

    def __str__(self):
        return f"[{self.__class__.__name__}] Line {self.line}: {self.text}"
class CodeBlock(MarkdownElement):
    """A fenced code block.

    Attributes:
        language: Language tag recorded for the block. It is stored and shown
            by ``__repr__``/``__str__`` but is not emitted by ``markdown()``.
    """

    def __init__(self, line, content, language):
        """Store ``content`` as the element text and remember ``language``."""
        super().__init__(line, content)
        self.language = language

    def markdown(self):
        """Return the content wrapped in a plain triple-backtick fence."""
        return "```\n" + self.text + "\n```"

    def __repr__(self):
        # markdown() must be *called*: interpolating the bound method itself
        # prints "<bound method ...>" instead of the rendered block.
        return (
            f"CodeBlock(line={self.line}, language={self.language}, "
            f"content={self.text}, markdown={self.markdown()})"
        )

    def __str__(self):
        return f"[CodeBlock] Line {self.line}, Language {self.language}: {self.text}"
class UnorderedList(MarkdownElement):
    """A bullet list.

    Attributes:
        items: Sequence of mappings; ``markdown()`` reads the ``"text"`` and
            ``"level"`` keys of each entry.
    """

    def __init__(self, line, items):
        """Store ``items``; the element text is left empty."""
        super().__init__(line, "")
        self.items = items

    def markdown(self):
        """Return the list as ``"- "`` bullets, one item per line.

        The first item is emitted without indentation regardless of its
        level; every following item is indented by two spaces per level
        above the first. An empty list renders as ``""``.

        ``self.items`` is left untouched, so repeated calls return the same
        result.
        """
        if not self.items:
            return ""
        # Unpack instead of pop(0): popping removed the first item from the
        # stored list, so a second call rendered a shorter list.
        first, *rest = self.items
        lines = ["- " + first["text"]]
        for item in rest:
            indentation = " " * (item["level"] - 1) * 2
            lines.append(indentation + "- " + item["text"])
        return "\n".join(lines)

    def __repr__(self):
        # markdown() must be *called*: interpolating the bound method itself
        # prints "<bound method ...>" instead of the rendered list.
        return (
            f"UnorderedList(line={self.line}, items={self.items}, "
            f"markdown={self.markdown()})"
        )

    def __str__(self):
        return f"[UnorderedList] Line {self.line}: Items {self.items}"
class OrderedList(MarkdownElement):
    """A numbered list with per-level counters.

    Attributes:
        items: Sequence of mappings; ``markdown()`` reads the ``"text"`` and
            ``"level"`` keys of each entry (``level`` is passed through
            ``int()``).
    """

    def __init__(self, line, items):
        """Store ``items``; the element text is left empty."""
        super().__init__(line, "")
        self.items = items

    def markdown(self):
        """Return the list as numbered lines joined by newlines.

        Each nesting level keeps its own counter starting at 1. Going deeper
        opens fresh counters; returning to a shallower level discards the
        deeper ones, so a later sub-list restarts at 1. Items are indented by
        one space per level above the first. An empty list renders as ``""``.
        """
        lines = []
        # counters[d] is the next number to emit at depth d.
        counters = []
        for item in self.items:
            depth = int(item["level"]) - 1
            text = item["text"]
            # Open counters for any newly entered deeper levels.
            while len(counters) < depth + 1:
                counters.append(1)
            # Drop counters of levels that have been left.
            while len(counters) > depth + 1:
                counters.pop()
            number = counters[-1]
            counters[-1] += 1
            lines.append("{}{}. {}".format(" " * depth, number, text))
        return "\n".join(lines)

    def __repr__(self):
        # markdown() must be *called*: interpolating the bound method itself
        # prints "<bound method ...>" instead of the rendered list.
        return (
            f"OrderedList(line={self.line}, items={self.items}, "
            f"markdown={self.markdown()})"
        )

    def __str__(self):
        return f"[OrderedList] Line {self.line}: Items {self.items}"
class Table(MarkdownElement):
    """A pipe-delimited Markdown table.

    Attributes:
        header: Sequence of column title strings.
        rows: Sequence of rows, each a sequence of cell strings.
    """

    def __init__(self, line, header, rows):
        """Store ``header`` and ``rows``; the element text is left empty."""
        super().__init__(line, "")
        self.header = header
        self.rows = rows

    def markdown(self):
        """Return the table as header line, separator line and one line per row.

        Cells are joined with ``"|"`` and each line starts and ends with a
        pipe. The separator has one ``"----|"`` segment per header column.
        """
        header_line = "|".join(["", *self.header, ""])
        separator = "|" + ("----|" * len(self.header))
        row_lines = ["|".join(["", *row, ""]) for row in self.rows]
        return "\n".join([header_line, separator, *row_lines])

    def __repr__(self):
        # markdown() must be *called*: interpolating the bound method itself
        # prints "<bound method ...>" instead of the rendered table.
        return (
            f"Table(line={self.line}, header={self.header}, rows={self.rows}, "
            f"markdown={self.markdown()})"
        )

    def __str__(self):
        return f"[Table] Line {self.line}: Header {self.header}, Rows {len(self.rows)}"
class HTMLBlock(MarkdownElement):
    """A raw HTML block, rendered as an ``html`` fenced code block."""

    def __init__(self, line, text):
        """Forward ``line`` and ``text`` to the base initializer."""
        super().__init__(line=line, text=text)

    def markdown(self):
        """Return the text between an opening ``html`` fence and a closing fence."""
        opening_fence = "```html"
        closing_fence = "```"
        parts = (opening_fence, self.text, closing_fence)
        return "\n".join(parts)