|
import html2text |
|
import re |
|
from markdownify import markdownify |
|
|
|
|
|
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer |
|
|
|
def convert_html_to_md(html: str) -> str:
    """Convert an HTML string to Markdown using ``markdownify``.

    Anchor (``<a>``) tags are listed in ``strip``, so they are not rendered
    as Markdown link syntax.

    Args:
        html: HTML markup to convert.

    Returns:
        The Markdown text produced by ``markdownify``.
    """
    # ``strip`` is documented as a list of tag names. The bare string "a"
    # only worked because the membership test (``tag in strip``) degrades to
    # a substring check on a string; pass a real list to honour the contract.
    return markdownify(html, strip=["a"])
|
|
|
def convert_html_to_md_html2text(html: str):
    """Convert an HTML string to Markdown text using ``html2text``.

    The converter is run with both ``ignore_links`` and ``ignore_images``
    switched on.

    Args:
        html: HTML markup to convert.

    Returns:
        The result of ``HTML2Text.handle`` for ``html``.
    """
    converter = html2text.HTML2Text()
    # Configure the converter before rendering.
    converter.ignore_images = True
    converter.ignore_links = True
    return converter.handle(html)
|
|
|
def split_markdown_in_segments(md: str) -> list:
    """Split Markdown text into small segments, each tagged with a header.

    The text is scanned line by line. A line that contains the text of any
    header or emphasis entry reported by ``MarkdownAnalyzer`` closes the
    current segment and becomes the header of the next one. A blank line also
    closes the current segment; the segment that follows starts with an empty
    header. Lines made up solely of ``=`` characters are discarded before
    scanning.

    Args:
        md: Markdown source text.

    Returns:
        A list of ``{"header": str, "paragraphs": list}`` dicts in document
        order; ``paragraphs`` may be empty. ``header`` is ``""`` when the
        segment has no header, the bare header text when the very first line
        contains the first reported header, and the full matching line for
        every later header. An empty list is returned when the analyzer
        reports neither headers nor emphasis, or when no lines remain after
        filtering.
    """
    analyzer = MarkdownAnalyzer(md)
    found_headers = analyzer.identify_headers()
    found_emphasis = analyzer.identify_emphasis()

    # Copy into a new list rather than extending the analyzer's own "Header"
    # list in place, and tolerate a falsy/None emphasis result.
    # NOTE(review): assumes identify_emphasis() yields dicts with a "text"
    # key, like the entries under "Header" -- confirm against MarkdownAnalyzer.
    markers = list(found_headers["Header"]) if found_headers else []
    markers.extend(found_emphasis or [])
    if not markers:
        return []

    lines = [line for line in md.split("\n") if not re.fullmatch(r'=+', line)]
    if not lines:
        # Every line was an "=" underline; nothing left to segment.
        return []

    # Read the marker texts once instead of once per scanned line.
    marker_texts = [marker["text"] for marker in markers]

    first_line, remaining = lines[0], lines[1:]
    if marker_texts[0] in first_line:
        current_header = marker_texts[0]
        paragraphs = []
    else:
        current_header = ""
        # Keep a non-blank opening line as content instead of dropping it.
        paragraphs = [first_line] if first_line.strip() else []

    segments = []
    for line in remaining:
        if any(text in line for text in marker_texts):
            # Header-like line: close the running segment and start a new one.
            segments.append({
                "header": current_header,
                "paragraphs": paragraphs,
            })
            current_header, paragraphs = line, []
        elif line.strip() == "":
            # Blank line: close the running segment; the next has no header.
            segments.append({
                "header": current_header,
                "paragraphs": paragraphs,
            })
            current_header, paragraphs = "", []
        else:
            paragraphs.append(line)

    # Flush whatever was collected after the last boundary.
    segments.append({
        "header": current_header,
        "paragraphs": paragraphs,
    })
    return segments
|
|
|
def split_markdown_in_chunks(md: str) -> list:
    """Split Markdown text into header-delimited chunks.

    The text is scanned line by line. A line that contains the text of any
    header or emphasis entry reported by ``MarkdownAnalyzer`` closes the
    current chunk and becomes the header of the next one. Every other line,
    blank lines included, is appended to the current chunk. Lines made up
    solely of ``=`` characters are discarded before scanning.

    Args:
        md: Markdown source text.

    Returns:
        A list of ``{"header": str, "paragraphs": list}`` dicts in document
        order; ``paragraphs`` may be empty. ``header`` is ``""`` for content
        that precedes the first header, the bare header text when the very
        first line contains the first reported header, and the full matching
        line for every later header. An empty list is returned when the
        analyzer reports neither headers nor emphasis, or when no lines remain
        after filtering.
    """
    analyzer = MarkdownAnalyzer(md)
    found_headers = analyzer.identify_headers()
    found_emphasis = analyzer.identify_emphasis()

    # Copy into a new list rather than extending the analyzer's own "Header"
    # list in place, and tolerate a falsy/None emphasis result.
    # NOTE(review): assumes identify_emphasis() yields dicts with a "text"
    # key, like the entries under "Header" -- confirm against MarkdownAnalyzer.
    markers = list(found_headers["Header"]) if found_headers else []
    markers.extend(found_emphasis or [])
    if not markers:
        return []

    lines = [line for line in md.split("\n") if not re.fullmatch(r'=+', line)]
    if not lines:
        # Every line was an "=" underline; nothing left to chunk.
        return []

    # Read the marker texts once instead of once per scanned line.
    marker_texts = [marker["text"] for marker in markers]

    first_line, remaining = lines[0], lines[1:]
    if marker_texts[0] in first_line:
        current_header = marker_texts[0]
        paragraphs = []
    else:
        current_header = ""
        # Keep a non-blank opening line as content instead of dropping it.
        paragraphs = [first_line] if first_line.strip() else []

    chunks = []
    for line in remaining:
        if any(text in line for text in marker_texts):
            # Header-like line: close the running chunk and start a new one.
            chunks.append({
                "header": current_header,
                "paragraphs": paragraphs,
            })
            current_header, paragraphs = line, []
        else:
            paragraphs.append(line)

    # Flush whatever was collected after the last header.
    chunks.append({
        "header": current_header,
        "paragraphs": paragraphs,
    })
    return chunks
|
|
|
def match_and_extract_md(md: str, text: str):
    """Print every segment produced by ``split_markdown_in_segments``.

    Args:
        md: Markdown source text to segment.
        text: Accepted but not used by the current implementation.

    Returns:
        None. The segments are only written to standard output.
    """
    for piece in split_markdown_in_segments(md):
        print(piece)
|
|
|
|