ProductionRAG / handle_files.py
Mdean77's picture
App refactored
8669df3
raw
history blame contribute delete
716 Bytes
from chainlit import AskFileMessage
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
def split_file(file: AskFileMessage):
    """Load an uploaded PDF and split it into overlapping text chunks.

    The upload's bytes are written to a temporary file because the loader is
    constructed from a file path. The loaded documents are then split with
    ``chunk_size=500`` and ``chunk_overlap=100``.

    Args:
        file: The uploaded file. Only its ``content`` attribute is read, and
            it is written out in binary mode.
            NOTE(review): presumably ``content`` holds the raw PDF bytes;
            confirm against the caller.

    Returns:
        The chunks returned by ``split_documents``, each with
        ``metadata["source"]`` set to ``"source_<index>"`` where ``<index>``
        is the chunk's position in the returned sequence.
    """
    import os
    import tempfile

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

    # delete=False so the file survives being closed: it must be closed
    # before the loader reopens it by name.
    tmp = tempfile.NamedTemporaryFile(mode="wb", delete=False)
    try:
        with tmp:
            tmp.write(file.content)
        # load() returns the parsed documents, so the file is no longer
        # needed once it has completed.
        documents = PyMuPDFLoader(tmp.name).load()
    finally:
        # Always remove the temporary file, even if writing or loading fails.
        os.unlink(tmp.name)

    docs = text_splitter.split_documents(documents)
    for i, doc in enumerate(docs):
        # Label each chunk with its index (the builtin `id` was used before).
        doc.metadata["source"] = f"source_{i}"
    return docs