Spaces:
Paused
Paused
from chainlit import AskFileMessage | |
from langchain_text_splitters import RecursiveCharacterTextSplitter | |
from langchain_community.document_loaders import PyMuPDFLoader | |
def split_file(file: AskFileMessage):
    """Load an uploaded PDF and split it into overlapping text chunks.

    The raw bytes in ``file.content`` are written to a temporary file
    because ``PyMuPDFLoader`` is constructed from a file path. The loaded
    documents are then split with a ``RecursiveCharacterTextSplitter``
    (``chunk_size=500``, ``chunk_overlap=100``).

    Args:
        file: Uploaded file object. Only its ``content`` attribute is read,
            and it must hold the file's raw bytes.

    Returns:
        The chunked documents returned by ``split_documents``, each with
        ``metadata["source"]`` set to ``"source_<index>"``, where
        ``<index>`` is the chunk's position in the returned list.
    """
    import os
    import tempfile

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

    # Write the bytes through a single binary handle and close it before the
    # loader opens the path; re-opening a still-open NamedTemporaryFile by
    # name is not portable (it fails on Windows).
    # delete=False keeps the file on disk after the handle is closed so the
    # loader can read it; it is removed explicitly below.
    with tempfile.NamedTemporaryFile(mode="wb", delete=False) as tmp:
        tmp.write(file.content)
        tmp_path = tmp.name

    try:
        documents = PyMuPDFLoader(tmp_path).load()
    finally:
        # Always clean up, even if loading fails, so temp files do not pile up.
        os.remove(tmp_path)

    docs = text_splitter.split_documents(documents)
    for i, doc in enumerate(docs):
        # Use the loop index; the original formatted the builtin `id`
        # function, giving every chunk the same meaningless source string.
        doc.metadata["source"] = f"source_{i}"
    return docs