manaviel85370
add pages and all
da88570
import html2text
import re
from markdownify import markdownify
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer
def convert_html_to_md(html: str) -> str:
    """Convert an HTML string to Markdown using ``markdownify``.

    Anchor (``<a>``) tags are passed in the ``strip`` option, so they are
    not converted into Markdown link syntax.

    Args:
        html: HTML markup to convert.

    Returns:
        The Markdown text returned by ``markdownify``.
    """
    # ``strip`` is documented as a list of tag names. The previous bare
    # string "a" presumably only behaved correctly because membership tests
    # on a one-character string happen to match the tag name "a".
    return markdownify(html, strip=["a"])
def convert_html_to_md_html2text(html: str):
    """Convert an HTML string to Markdown using ``html2text``.

    The converter is configured with ``ignore_links`` and ``ignore_images``
    both set to ``True`` before the HTML is handled.

    Args:
        html: HTML markup to convert.

    Returns:
        The result of ``HTML2Text.handle`` for the given markup.
    """
    converter = html2text.HTML2Text()
    # Configure the converter before running it on the input.
    converter.ignore_links = True
    converter.ignore_images = True
    return converter.handle(html)
def split_markdown_in_segments(md: str) -> list:
    """Split Markdown text into fine-grained segments tagged with a header.

    Header candidates are the entries under the ``"Header"`` key of
    ``MarkdownAnalyzer.identify_headers()`` followed by whatever
    ``MarkdownAnalyzer.identify_emphasis()`` returns. Each candidate is
    expected to be a mapping with a ``"text"`` key.

    A new segment is started whenever a line contains the text of any
    candidate (the whole line becomes the segment's header) or whenever a
    blank line is met (the next segment starts with an empty header).

    Args:
        md: Markdown text to split.

    Returns:
        A list of ``{"header": str, "paragraphs": list[str]}`` dicts, or an
        empty list when no header or emphasis candidates are found.
    """
    analyzer = MarkdownAnalyzer(md)
    headers = analyzer.identify_headers()
    emphasis = analyzer.identify_emphasis()

    # Build a fresh list instead of extending the analyzer's own list, and
    # tolerate a falsy (e.g. None) emphasis result when headers are present.
    candidates = list(headers["Header"]) if headers else []
    candidates.extend(emphasis or [])
    if not candidates:
        return []

    # Lines consisting solely of "=" characters are dropped before scanning.
    lines = [line for line in md.split("\n") if not re.fullmatch(r'=+', line)]
    if not lines:
        # Nothing left to scan; avoids an IndexError on lines[0] below.
        return []

    # Hoisted out of the loop: the candidate texts never change.
    candidate_texts = [candidate["text"] for candidate in candidates]

    first_text = candidate_texts[0]
    current_header = first_text if first_text in lines[0] else ""
    current_paragraphs = []
    segments = []

    # NOTE(review): the first line is always skipped, even when it does not
    # contain the first header -- kept as in the original; confirm intent.
    for line in lines[1:]:
        if any(text in line for text in candidate_texts):
            # Header line: close the running segment and start a new one
            # whose header is the full line (markup included).
            segments.append({
                "header": current_header,
                "paragraphs": current_paragraphs,
            })
            current_header = line
            current_paragraphs = []
        elif line.strip() == "":
            # Blank line: close the running segment; the next one has no
            # header until a header line is seen.
            segments.append({
                "header": current_header,
                "paragraphs": current_paragraphs,
            })
            current_header = ""
            current_paragraphs = []
        else:
            current_paragraphs.append(line)

    # Flush the segment that was still open at the end of the text.
    segments.append({
        "header": current_header,
        "paragraphs": current_paragraphs,
    })
    return segments
def split_markdown_in_chunks(md: str) -> list:
    """Split Markdown text into chunks, one per detected header line.

    Header candidates are the entries under the ``"Header"`` key of
    ``MarkdownAnalyzer.identify_headers()`` followed by whatever
    ``MarkdownAnalyzer.identify_emphasis()`` returns. Each candidate is
    expected to be a mapping with a ``"text"`` key.

    A new chunk is started whenever a line contains the text of any
    candidate; the whole line becomes the chunk's header. All other lines,
    blank ones included, are collected as paragraphs of the current chunk.

    Args:
        md: Markdown text to split.

    Returns:
        A list of ``{"header": str, "paragraphs": list[str]}`` dicts, or an
        empty list when no header or emphasis candidates are found.
    """
    analyzer = MarkdownAnalyzer(md)
    headers = analyzer.identify_headers()
    emphasis = analyzer.identify_emphasis()

    # Build a fresh list instead of extending the analyzer's own list, and
    # tolerate a falsy (e.g. None) emphasis result when headers are present.
    candidates = list(headers["Header"]) if headers else []
    candidates.extend(emphasis or [])
    if not candidates:
        return []

    # Lines consisting solely of "=" characters are dropped before scanning.
    lines = [line for line in md.split("\n") if not re.fullmatch(r'=+', line)]
    if not lines:
        # Nothing left to scan; avoids an IndexError on lines[0] below.
        return []

    # Hoisted out of the loop: the candidate texts never change.
    candidate_texts = [candidate["text"] for candidate in candidates]

    first_text = candidate_texts[0]
    current_header = first_text if first_text in lines[0] else ""
    current_paragraphs = []
    chunks = []

    # NOTE(review): the first line is always skipped, even when it does not
    # contain the first header -- kept as in the original; confirm intent.
    for line in lines[1:]:
        if any(text in line for text in candidate_texts):
            # Header line: close the running chunk and start a new one
            # whose header is the full line (markup included).
            chunks.append({
                "header": current_header,
                "paragraphs": current_paragraphs,
            })
            current_header = line
            current_paragraphs = []
        else:
            current_paragraphs.append(line)

    # Flush the chunk that was still open at the end of the text.
    chunks.append({
        "header": current_header,
        "paragraphs": current_paragraphs,
    })
    return chunks
def match_and_extract_md(md: str, text: str):
    """Segment the Markdown and print every resulting segment.

    Args:
        md: Markdown text handed to ``split_markdown_in_segments``.
        text: Not used by the code visible here.
    """
    # Debug-style output: one printed dict per segment.
    for piece in split_markdown_in_segments(md):
        print(piece)