"""Turn uploaded files into LangChain documents.

Each ``UploadFile`` is wrapped in an in-memory blob loader, paired with a
parser chosen from the upload's MIME type, and run through
``CustomGenericLoader``.
"""

from io import BytesIO
from typing import AsyncIterator, ByteString, Iterable, List, Optional

from langchain_community.document_loaders.blob_loaders.schema import Blob
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers.pdf import PyPDFParser
from langchain_core.document_loaders.blob_loaders import BlobLoader
from langchain_core.documents import Document
from starlette.datastructures import UploadFile

# Ensure this is the correct path for your custom loader
from custon_generic_loader import CustomGenericLoader
from parser.audio_parser import AudioParser
from parser.msword_parser import MsWordParser
from parser.pptx_parser import PptxParser
from parser.txt_parser import TxtParser
from parser.video_parser import VideoParser
from parser.xlsx_parser import XlsxParser

_WORD_MIME_TYPES = (
    'application/msword',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'application/vnd.openxmlformats-officedocument.themeManager+xml',
)
_POWERPOINT_MIME_TYPES = (
    'application/vnd.ms-powerpoint',
    'application/vnd.openxmlformats-officedocument.presentationml.presentation',
)
_EXCEL_MIME_TYPES = (
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    'application/vnd.ms-excel',
)


class InMemoryBlobLoader(BlobLoader):
    """Blob loader that exposes a single uploaded file as one ``Blob``."""

    def __init__(self, upload_file: UploadFile):
        """Remember the upload; nothing is read until ``yield_blobs`` runs."""
        self.upload_file = upload_file

    async def yield_blobs(self) -> AsyncIterator[Blob]:
        """Read the whole upload and yield it as a single ``Blob``.

        This is an async generator, so it must be consumed with
        ``async for``. The blob carries the upload's content type as its
        MIME type, and the filename (as both ``name`` and ``source``) and
        size as metadata.
        """
        data = await self.upload_file.read()
        yield Blob.from_data(
            data,
            mime_type=self.upload_file.content_type,
            metadata={
                'name': self.upload_file.filename,
                'size': self.upload_file.size,
                'source': self.upload_file.filename,
            },
        )


def _select_parser(content_type: Optional[str]):
    """Return ``(parser, label)`` for a MIME type.

    ``label`` is the human-readable kind used in the progress message.

    Raises:
        ValueError: If the content type is missing or not supported.
    """
    # A missing Content-Type header surfaces as None; treat it as
    # "unsupported" instead of failing on None.startswith(...).
    mime = content_type or ''

    if mime == 'application/pdf':
        return PyPDFParser(), 'PDF'
    if mime in _WORD_MIME_TYPES:
        return MsWordParser(), 'Word Document'
    if mime in _POWERPOINT_MIME_TYPES:
        return PptxParser(), 'PowerPoint'
    if mime == 'text/plain':
        return TxtParser(), 'Text File'
    if mime.startswith('audio/'):
        return AudioParser(), 'Audio File'
    if mime.startswith('video/'):
        return VideoParser(), 'Video File'
    # Suggested code may be subject to a license. Learn more: ~LicenseLog:3330720155.
    if mime in _EXCEL_MIME_TYPES:
        return XlsxParser(), 'Excel File'
    raise ValueError(f"Unsupported file type: {content_type}")


async def load_document(upload_file: UploadFile) -> List[Document]:
    """Load one uploaded file through the parser matching its MIME type.

    Args:
        upload_file: The upload to parse; its ``content_type`` selects the
            parser.

    Returns:
        A one-element list holding whatever ``CustomGenericLoader.load_all()``
        returned for this file.

    Raises:
        ValueError: If the content type is missing or unsupported, or if the
            loader produced nothing for the file.
    """
    blob_loader = InMemoryBlobLoader(upload_file)
    blob_parser, label = _select_parser(upload_file.content_type)
    print(f'Loading {label}: {upload_file.filename}')

    loader = CustomGenericLoader(blob_loader, blob_parser)
    # Everything is loaded in one awaited call rather than streamed through
    # the loader's lazy_load().
    loaded = await loader.load_all()

    # Check the loaded result itself: the wrapping list below always has one
    # element, so testing it could never detect an empty load.
    if not loaded:
        raise ValueError(
            f"No documents were loaded for file: {upload_file.filename}")

    # NOTE(review): the result is wrapped as a single element. If load_all()
    # returns a list of Documents, the effective shape here is
    # List[List[Document]] -- confirm and adjust the annotation if so.
    return [loaded]


async def load_all_documents(upload_files: List[UploadFile]) -> List[List[Document]]:
    """Load every upload, skipping files that cannot be loaded.

    A ``ValueError`` from ``load_document`` (unsupported type or empty
    result) is printed and that file is skipped; any other exception
    propagates.

    Args:
        upload_files: The uploads to process, in order.

    Returns:
        The per-file results of ``load_document`` concatenated in input
        order.

    Raises:
        ValueError: If no file produced any result.
    """
    all_documents = []
    for upload_file in upload_files:
        try:
            documents = await load_document(upload_file)
        except ValueError as e:
            print(f"Error loading {upload_file.filename}: {e}")
        else:
            all_documents.extend(documents)

    if not all_documents:
        raise ValueError("No documents were loaded from the provided files.")
    return all_documents


# Example usage:
# Note: this must run inside an async function or an async event loop.
# Example:
#   upload_files = [UploadFile1, UploadFile2, ...]
#   documents = await load_all_documents(upload_files)