File size: 2,931 Bytes
da88570 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
import html2text
import re
from markdownify import markdownify
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer
def convert_html_to_md(html: str) -> str:
    """Convert an HTML string to Markdown with ``markdownify``.

    Anchor (``<a>``) tags are listed in ``strip``, so they are not converted
    to Markdown link syntax.

    Args:
        html: The HTML source to convert.

    Returns:
        The Markdown text produced by ``markdownify``.
    """
    # ``strip`` is documented as a list of tag names. A bare string such as
    # "a" only works because the membership test degrades to a substring
    # check, which would silently over-match for longer tag names.
    return markdownify(html, strip=["a"])
def convert_html_to_md_html2text(html:str):
    """Convert an HTML string to Markdown with ``html2text``.

    The converter is configured with ``ignore_links`` and ``ignore_images``
    both enabled before the input is handled.

    Args:
        html: The HTML source to convert.

    Returns:
        Whatever ``HTML2Text.handle`` returns for ``html``.
    """
    converter = html2text.HTML2Text()
    converter.ignore_images = True
    converter.ignore_links = True
    return converter.handle(html)
def _collect_header_entries(md: str) -> list:
    """Return the analyzer's header entries followed by its emphasis entries.

    Each entry is only ever read through its ``"text"`` key by the callers in
    this module.  An empty list means the analyzer found nothing usable as a
    section marker.
    """
    analyzer = MarkdownAnalyzer(md)
    headers = analyzer.identify_headers()
    emphasis = analyzer.identify_emphasis()
    # Copy instead of extending in place so the list owned by the analyzer is
    # never mutated, and tolerate a falsy/None emphasis result (the original
    # ``headers.extend(emphasis)`` raised TypeError on None).
    entries = list(headers["Header"]) if headers else []
    if emphasis:
        entries.extend(emphasis)
    return entries


def _group_lines_by_header(md: str, *, break_on_blank: bool) -> list:
    """Group the lines of *md* under the header line that precedes them.

    Args:
        md: Markdown text to split.
        break_on_blank: When true, a blank line closes the current group and
            starts a new one with an empty header; when false, blank lines
            are kept inside ``"paragraphs"`` like any other line.

    Returns:
        A list of ``{"header": str, "paragraphs": list[str]}`` dicts, or an
        empty list when no header/emphasis entries were found.
    """
    entries = _collect_header_entries(md)
    if not entries:
        return []

    # Lines made only of "=" are dropped (presumably setext-style heading
    # underlines emitted by the HTML converters - confirm).
    lines = [line for line in md.split("\n") if not re.fullmatch(r'=+', line)]
    if not lines:
        # Nothing left to group; the original indexed lines[0] here and
        # raised IndexError.
        return []

    first_line, remaining = lines[0], lines[1:]
    first_text = entries[0]["text"]
    # The initial header is the bare analyzer text, whereas later headers are
    # stored as the full matching line.
    # NOTE(review): the first line is consumed unconditionally, so when it is
    # not the first header its content is discarded - confirm this is intended.
    current_header = first_text if first_text in first_line else ""

    # Hoisted so the per-line check does not re-read every entry dict.
    header_texts = tuple(entry["text"] for entry in entries)

    groups = []
    paragraphs = []
    for line in remaining:
        if any(text in line for text in header_texts):
            # A line containing any header/emphasis text starts a new group.
            groups.append({"header": current_header, "paragraphs": paragraphs})
            current_header, paragraphs = line, []
        elif break_on_blank and line.strip() == "":
            groups.append({"header": current_header, "paragraphs": paragraphs})
            current_header, paragraphs = "", []
        else:
            paragraphs.append(line)
    # Flush whatever was being accumulated when the input ended.
    groups.append({"header": current_header, "paragraphs": paragraphs})
    return groups


def split_markdown_in_segments(md: str) -> list:
    """Split Markdown into header-led segments, also breaking on blank lines.

    A new segment starts at every line that contains the text of a header or
    emphasis entry reported by ``MarkdownAnalyzer`` and at every blank line
    (blank-line segments get an empty ``"header"``).

    Args:
        md: Markdown text to split.

    Returns:
        A list of ``{"header": str, "paragraphs": list[str]}`` dicts; empty
        when the analyzer reports no headers or emphasis.
    """
    return _group_lines_by_header(md, break_on_blank=True)


def split_markdown_in_chunks(md: str) -> list:
    """Split Markdown into header-led chunks, keeping blank lines in place.

    Works like ``split_markdown_in_segments`` except that blank lines do not
    end a chunk; they are appended to the chunk's ``"paragraphs"``.

    Args:
        md: Markdown text to split.

    Returns:
        A list of ``{"header": str, "paragraphs": list[str]}`` dicts; empty
        when the analyzer reports no headers or emphasis.
    """
    return _group_lines_by_header(md, break_on_blank=False)
def match_and_extract_md(md:str, text:str):
    """Print every segment that ``split_markdown_in_segments`` yields for *md*.

    NOTE(review): ``text`` is accepted but never used and nothing is
    returned, so this looks like an unfinished debugging stub - confirm the
    intended matching behaviour before relying on it.

    Args:
        md: Markdown text to split into segments.
        text: Currently unused.
    """
    for part in split_markdown_in_segments(md):
        print(part)
|