File size: 716 Bytes
8669df3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
from chainlit import AskFileMessage
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader

def split_file(file: AskFileMessage):
    """Split an uploaded PDF into overlapping text chunks.

    The bytes in ``file.content`` are written to a temporary file so that
    ``PyMuPDFLoader`` can be given a path to load from. The loaded documents
    are then split with a ``RecursiveCharacterTextSplitter`` (chunk size 500,
    overlap 100) and each chunk is tagged with a positional ``source`` label.

    Args:
        file: Uploaded file object; only its ``content`` attribute is read.
            It is written in binary mode, so it is assumed to be ``bytes``
            (TODO confirm against the caller).

    Returns:
        The list of chunks produced by ``split_documents``, where the i-th
        chunk has ``metadata["source"] == f"source_{i}"``.
    """
    import os
    import tempfile

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

    # delete=False lets us close the handle before the loader opens the file
    # by name (an open NamedTemporaryFile cannot be reopened on Windows).
    # Cleanup is therefore done explicitly in the ``finally`` below.
    with tempfile.NamedTemporaryFile(mode="wb", delete=False) as tmp:
        tmp.write(file.content)
        tmp_path = tmp.name

    try:
        documents = PyMuPDFLoader(tmp_path).load()
    finally:
        # Remove the temp file whether or not loading succeeded, so repeated
        # uploads do not accumulate files on disk.
        os.remove(tmp_path)

    docs = text_splitter.split_documents(documents)
    for i, doc in enumerate(docs):
        # Use the chunk index; the original interpolated the builtin ``id``.
        doc.metadata["source"] = f"source_{i}"
    return docs