Spaces:
Sleeping
Sleeping
from typing import Any | |
from bs4 import BeautifulSoup | |
from langchain_core.documents import Document | |
from markdown import markdown | |
from pathlib import Path | |
from langchain.text_splitter import MarkdownTextSplitter, MarkdownHeaderTextSplitter, TextSplitter | |
from src.utils import batched | |
def read_markdown_file(path: str | Path) -> [str, str]: | |
path = Path(path) | |
with open(path, 'r', encoding="utf8") as f_r: | |
text = f_r.read() | |
# text = markdown(text) | |
# text = ''.join(BeautifulSoup(text).findAll(text=True)) | |
return text, str(path) | |
def split_markdown(md: str | list[str],
                   metadata: dict[str, Any] | list[dict[str, Any]] | None = None,
                   chunk_size=512,
                   overlap=64,
                   splitter: TextSplitter | None = None) -> list[Document]:
    """Split one or more Markdown texts into chunked documents.

    When no ``splitter`` is supplied, each text is first split on level 1-3
    headers (the header lines are kept in the content), the header metadata
    of every section is merged over the metadata supplied for its source
    text, and the sections are then chunked with a ``MarkdownTextSplitter``.
    When a ``splitter`` is supplied, the texts and metadata are handed to it
    unchanged and ``chunk_size`` / ``overlap`` are not used.

    Args:
        md: A single Markdown string or a list of Markdown strings.
        metadata: Metadata for the texts: a single dict when ``md`` is a
            string, or a list with one dict per text when ``md`` is a list.
            ``None`` (the default) gives every text an empty dict.
        chunk_size: Chunk size passed to the default ``MarkdownTextSplitter``.
        overlap: Chunk overlap passed to the default ``MarkdownTextSplitter``.
        splitter: Optional splitter whose ``create_documents`` is used
            instead of the default header-aware pipeline.

    Returns:
        The documents produced by the splitter's ``create_documents``.

    Raises:
        ValueError: If ``md`` is a single string but ``metadata`` is a list.
    """
    if isinstance(md, str):
        md = [md]
        if isinstance(metadata, list):
            raise ValueError("metadata should be a single dict")
        if metadata is not None:
            metadata = [metadata]
    if metadata is None:
        # No metadata given: pair every text with its own empty dict.
        metadata = [{} for _ in md]

    if splitter is None:
        headers_to_split_on = [
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ]
        # Build the header splitter once and reuse it for every text.
        header_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
        sections_per_text = [header_splitter.split_text(text) for text in md]
        # One metadata dict per section: the source text's metadata, with the
        # section's header metadata taking precedence on key collisions.
        metadata = [
            {**metadata[i], **section.metadata}
            for i, sections in enumerate(sections_per_text)
            for section in sections
        ]
        md = [section.page_content for sections in sections_per_text for section in sections]
        splitter = MarkdownTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)

    docs = splitter.create_documents(md, metadata)
    return docs
def process_markdown_files(paths: list[str | Path], batch_size=1, chunk_size=512, overlap=64):
    """Read Markdown files in batches and yield their chunks.

    The paths are grouped with ``batched(paths, batch_size)`` (presumably
    consecutive groups of up to ``batch_size`` paths; confirm against
    ``src.utils.batched``). Every file in a group is read, tagged with a
    ``{"path": ...}`` metadata dict, and the whole group is split in a single
    ``split_markdown`` call.

    Args:
        paths: Markdown files to process.
        batch_size: Group size passed to ``batched``.
        chunk_size: Forwarded to ``split_markdown``.
        overlap: Forwarded to ``split_markdown``.

    Yields:
        For each group, a ``(contents, metadatas)`` tuple of two parallel
        lists: the ``page_content`` and the ``metadata`` of every chunk.
    """
    for batch in batched(paths, batch_size):
        texts = []
        sources = []
        for file_path in batch:
            content, resolved_path = read_markdown_file(file_path)
            texts.append(content)
            sources.append({"path": resolved_path})

        chunks = split_markdown(texts, sources, chunk_size=chunk_size, overlap=overlap)

        contents = [chunk.page_content for chunk in chunks]
        metadatas = [chunk.metadata for chunk in chunks]
        yield contents, metadatas