"""Convert HTML to Markdown and split Markdown text into header-keyed sections."""

import re

import html2text
from markdownify import markdownify

from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer

# A line made only of "=" characters (the underline of a setext-style title).
_EQUALS_UNDERLINE = re.compile(r"=+")


def convert_html_to_md(html: str) -> str:
    """Convert *html* to Markdown with markdownify, stripping ``<a>`` tags."""
    # ``strip`` is documented as a list of tag names; a bare string only
    # worked through substring matching.
    return markdownify(html, strip=["a"])


def convert_html_to_md_html2text(html: str) -> str:
    """Convert *html* to Markdown with html2text, ignoring links and images."""
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    return converter.handle(html)


def _collect_marker_texts(md: str) -> list:
    """Return the texts of all headers and emphasis spans found in *md*.

    Header texts come first, followed by emphasis texts. An empty list is
    returned when the analyzer reports neither.
    """
    analyzer = MarkdownAnalyzer(md)
    headers = analyzer.identify_headers()
    emphasis = analyzer.identify_emphasis()
    # NOTE(review): assumes identify_headers() returns a mapping with a
    # "Header" list and identify_emphasis() returns a list, each entry being
    # a dict with a "text" key -- confirm against MarkdownAnalyzer.
    if headers:
        # Copy so the list owned by the analyzer is not mutated.
        markers = list(headers["Header"])
        markers.extend(emphasis or [])
    else:
        markers = list(emphasis) if emphasis else []
    return [marker["text"] for marker in markers]


def _split_by_markers(md: str, *, break_on_blank: bool) -> list:
    """Split *md* into ``{"header": str, "paragraphs": list[str]}`` sections.

    A new section starts on every line that contains a header or emphasis
    text; that whole line becomes the section's header. When
    *break_on_blank* is true, a blank line also closes the current section
    and opens a header-less one; otherwise blank lines are kept as
    paragraph entries.
    """
    sections = []
    marker_texts = _collect_marker_texts(md)
    if not marker_texts:
        return sections

    lines = [
        line for line in md.split("\n")
        if not _EQUALS_UNDERLINE.fullmatch(line)
    ]
    if not lines:
        # Every line was an "=" underline; nothing is left to split.
        return sections

    # NOTE(review): the first line is always consumed here, even when it does
    # not contain the first header text -- its content is then discarded.
    # Kept as in the original; confirm this is intended.
    first_line, *remaining = lines
    current_header = marker_texts[0] if marker_texts[0] in first_line else ""
    paragraphs = []

    for line in remaining:
        if any(text in line for text in marker_texts):
            sections.append({"header": current_header, "paragraphs": paragraphs})
            current_header, paragraphs = line, []
        elif break_on_blank and line.strip() == "":
            sections.append({"header": current_header, "paragraphs": paragraphs})
            current_header, paragraphs = "", []
        else:
            paragraphs.append(line)

    # Flush the section that was still open at the end of the text.
    sections.append({"header": current_header, "paragraphs": paragraphs})
    return sections


def split_markdown_in_segments(md: str) -> list:
    """Split *md* into sections, breaking on header lines and on blank lines.

    Returns a list of ``{"header": str, "paragraphs": list[str]}`` dicts.
    Sections opened by a blank line have an empty header. The list is empty
    when no headers or emphasis spans are found in *md*.
    """
    return _split_by_markers(md, break_on_blank=True)


def split_markdown_in_chunks(md: str) -> list:
    """Split *md* into sections, breaking only on header lines.

    Returns a list of ``{"header": str, "paragraphs": list[str]}`` dicts.
    Blank lines are kept inside ``paragraphs``. The list is empty when no
    headers or emphasis spans are found in *md*.
    """
    return _split_by_markers(md, break_on_blank=False)


def match_and_extract_md(md: str, text: str):
    """Print every segment of *md* produced by ``split_markdown_in_segments``.

    *text* is accepted but not used yet; the function returns ``None``.
    """
    for segment in split_markdown_in_segments(md):
        print(segment)