Spaces:
Paused
Paused
File size: 716 Bytes
8669df3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 |
from chainlit import AskFileMessage
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
def split_file(file: AskFileMessage):
    """Load an uploaded PDF and split it into overlapping text chunks.

    The upload's raw bytes are written to a temporary file because
    ``PyMuPDFLoader`` is constructed from a file path. The loaded documents
    are then split with a ``RecursiveCharacterTextSplitter`` configured with
    ``chunk_size=500`` and ``chunk_overlap=100``.

    Args:
        file: The uploaded file object. Only its ``content`` attribute is
            read, and it is written in binary mode, so it is expected to hold
            the PDF as ``bytes`` (NOTE(review): confirm against the chainlit
            version in use).

    Returns:
        The list of chunk documents produced by the splitter. Each chunk's
        ``metadata["source"]`` is overwritten with ``"source_<i>"``, where
        ``<i>`` is the chunk's zero-based position in the returned list.
    """
    import os
    import tempfile

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

    # Write the bytes once, in binary mode, and close the handle before the
    # loader opens the path. delete=False keeps the file on disk after the
    # `with` block exits so the loader can read it by name.
    with tempfile.NamedTemporaryFile(mode="wb", delete=False) as tmp:
        tmp.write(file.content)
        tmp_path = tmp.name

    try:
        documents = PyMuPDFLoader(tmp_path).load()
    finally:
        # The temporary file is only needed while loading; remove it so
        # repeated uploads do not accumulate files on disk.
        os.unlink(tmp_path)

    docs = text_splitter.split_documents(documents)
    for i, doc in enumerate(docs):
        # Label each chunk with its index (not the builtin `id`) so the
        # source labels are distinct per chunk.
        doc.metadata["source"] = f"source_{i}"
    return docs