from collections.abc import Iterator
from pathlib import Path
from typing import Any

from bs4 import BeautifulSoup
from langchain.text_splitter import MarkdownHeaderTextSplitter, MarkdownTextSplitter, TextSplitter
from langchain_core.documents import Document
from markdown import markdown

from src.utils import batched


def read_markdown_file(path: str | Path) -> tuple[str, str]:
    """Read a markdown file and return its raw text together with its path.

    Args:
        path: Location of the file to read.

    Returns:
        A ``(text, path)`` tuple where ``text`` is the file content decoded as
        UTF-8 and ``path`` is the given path converted to ``str``.
    """
    path = Path(path)
    text = path.read_text(encoding="utf8")
    # Disabled step: render the markdown to HTML and keep only the text nodes.
    # text = markdown(text)
    # text = ''.join(BeautifulSoup(text).findAll(text=True))
    return text, str(path)


def split_markdown(
    md: str | list[str],
    metadata: dict[str, Any] | list[dict[str, Any]] | None = None,
    chunk_size=512,
    overlap=64,
    splitter: TextSplitter | None = None,
) -> list[Document]:
    """Split one or more markdown texts into chunked ``Document`` objects.

    When no ``splitter`` is given, every text is first cut on its ``#``,
    ``##`` and ``###`` headers (the header lines stay in the section text and
    the header values are merged into that section's metadata under
    ``"Header 1"`` / ``"Header 2"`` / ``"Header 3"``), and the resulting
    sections are then chunked with a ``MarkdownTextSplitter``.

    Args:
        md: A single markdown text or a list of texts.
        metadata: Metadata attached to the produced documents. A single dict
            is applied to every text; a list supplies one dict per text
            (entries beyond ``len(md)`` are ignored). ``None`` means no
            metadata.
        chunk_size: Chunk size for the default splitter. Ignored when
            ``splitter`` is provided.
        overlap: Chunk overlap for the default splitter. Ignored when
            ``splitter`` is provided.
        splitter: Custom splitter. When given, the header pre-split is skipped
            and the texts are passed straight to ``splitter.create_documents``.

    Returns:
        The documents produced by the splitter.

    Raises:
        ValueError: If ``md`` is a single string but ``metadata`` is a list,
            or if a metadata list has fewer entries than there are texts.
    """
    if isinstance(md, str):
        if isinstance(metadata, list):
            raise ValueError("metadata should be a single dict")
        texts = [md]
        metadatas = [{} if metadata is None else metadata]
    else:
        texts = list(md)
        if metadata is None:
            metadatas = [{} for _ in texts]
        elif isinstance(metadata, dict):
            # One dict for many texts: attach the same metadata to each of them.
            metadatas = [metadata] * len(texts)
        else:
            metadatas = list(metadata)
            if len(metadatas) < len(texts):
                raise ValueError(
                    f"got {len(metadatas)} metadata entries for {len(texts)} texts"
                )

    if splitter is None:
        headers_to_split_on = [
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ]
        # Built once and reused for every text.
        header_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)

        sections: list[str] = []
        section_metadatas: list[dict[str, Any]] = []
        for text, text_metadata in zip(texts, metadatas):
            for section in header_splitter.split_text(text):
                sections.append(section.page_content)
                # Header values win over caller-provided keys of the same name.
                section_metadatas.append({**text_metadata, **section.metadata})

        texts, metadatas = sections, section_metadatas
        splitter = MarkdownTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)

    return splitter.create_documents(texts, metadatas)


def process_markdown_files(
    paths: list[str | Path],
    batch_size=1,
    chunk_size=512,
    overlap=64,
) -> Iterator[tuple[list[str], list[dict[str, Any]]]]:
    """Read markdown files in batches and yield their chunks with metadata.

    Args:
        paths: Files to process.
        batch_size: Batch size handed to ``batched`` to group ``paths``.
        chunk_size: Chunk size forwarded to ``split_markdown``.
        overlap: Chunk overlap forwarded to ``split_markdown``.

    Yields:
        For every batch, a ``(texts, metadatas)`` pair of equally long lists:
        the chunk contents and the matching metadata dicts. Each metadata dict
        carries the source file under ``"path"`` in addition to whatever
        ``split_markdown`` adds.
    """
    for files in batched(paths, batch_size):
        texts_with_paths = [read_markdown_file(file) for file in files]
        texts = [text for text, _ in texts_with_paths]
        metadatas = [{"path": md_path} for _, md_path in texts_with_paths]
        docs = split_markdown(texts, metadatas, chunk_size=chunk_size, overlap=overlap)
        yield [doc.page_content for doc in docs], [doc.metadata for doc in docs]