Spaces:
Sleeping
Sleeping
from io import BytesIO
from typing import AsyncIterator, ByteString, Iterable, List

from langchain_community.document_loaders.blob_loaders.schema import Blob
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers.pdf import PyPDFParser
from langchain_core.document_loaders.blob_loaders import BlobLoader
from langchain_core.documents import Document
from starlette.datastructures import UploadFile

# Ensure this is the correct path for your custom loader
from custon_generic_loader import CustomGenericLoader
from parser.audio_parser import AudioParser
from parser.msword_parser import MsWordParser
from parser.pptx_parser import PptxParser
from parser.txt_parser import TxtParser
from parser.video_parser import VideoParser
from parser.xlsx_parser import XlsxParser
class InMemoryBlobLoader(BlobLoader):
    """Blob loader that exposes a single uploaded file as one in-memory ``Blob``.

    NOTE(review): ``yield_blobs`` is implemented here as an *async* generator.
    The consumer must therefore iterate it with ``async for``; confirm that
    ``CustomGenericLoader`` does so, since the stock ``BlobLoader`` interface
    is presumably synchronous.
    """

    def __init__(self, upload_file: UploadFile):
        """Remember the upload whose content will be served as a blob.

        Args:
            upload_file: The uploaded file to read when blobs are requested.
        """
        self.upload_file = upload_file

    async def yield_blobs(self) -> AsyncIterator[Blob]:
        """Read the whole upload and yield it as a single ``Blob``.

        The blob carries the upload's content type as its MIME type and the
        upload's filename/size as metadata (``name``, ``size``, ``source``).

        Yields:
            Exactly one ``Blob`` built from the bytes returned by
            ``upload_file.read()``.
        """
        data = await self.upload_file.read()
        yield Blob.from_data(
            data,
            mime_type=self.upload_file.content_type,
            metadata={
                'name': self.upload_file.filename,
                'size': self.upload_file.size,
                'source': self.upload_file.filename,
            },
        )
async def load_document(upload_file: UploadFile) -> List[Document]:
    """Parse one uploaded file with the parser matching its MIME type.

    The parser is chosen from ``upload_file.content_type``; the upload is then
    wrapped in an ``InMemoryBlobLoader`` and handed to ``CustomGenericLoader``.

    Args:
        upload_file: The uploaded file to parse.

    Returns:
        A one-element list holding whatever ``CustomGenericLoader.load_all()``
        returned for this file.
        NOTE(review): if ``load_all()`` itself returns a list of documents,
        the real shape is ``List[List[Document]]`` -- confirm against the
        loader and align the annotation.

    Raises:
        ValueError: If the content type is missing or unsupported, or if the
            loader produced nothing for the file.
    """
    # The content type may be absent (None) and may carry parameters or mixed
    # case, e.g. "Text/Plain; charset=utf-8". Match on the bare, lower-cased
    # media type so such uploads are routed correctly and a missing type ends
    # up in the ValueError branch instead of failing on ``None.startswith``.
    content_type = (upload_file.content_type or '').split(';', 1)[0].strip().lower()

    if content_type == 'application/pdf':
        blob_parser = PyPDFParser()
        print(f'Loading PDF: {upload_file.filename}')
    elif content_type in (
        'application/msword',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.openxmlformats-officedocument.themeManager+xml',
    ):
        blob_parser = MsWordParser()
        print(f'Loading Word Document: {upload_file.filename}')
    elif content_type in (
        'application/vnd.ms-powerpoint',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    ):
        blob_parser = PptxParser()
        print(f'Loading PowerPoint: {upload_file.filename}')
    elif content_type == 'text/plain':
        blob_parser = TxtParser()
        print(f'Loading Text File: {upload_file.filename}')
    elif content_type.startswith('audio/'):
        blob_parser = AudioParser()
        print(f'Loading Audio File: {upload_file.filename}')
    elif content_type.startswith('video/'):
        blob_parser = VideoParser()
        print(f'Loading Video File: {upload_file.filename}')
    # Suggested code may be subject to a license. Learn more: ~LicenseLog:3330720155.
    elif content_type in (
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.ms-excel',
    ):
        blob_parser = XlsxParser()
        print(f'Loading Excel File: {upload_file.filename}')
    else:
        # Report the value exactly as received so the message stays useful.
        raise ValueError(f"Unsupported file type: {upload_file.content_type}")

    blob_loader = InMemoryBlobLoader(upload_file)
    loader = CustomGenericLoader(blob_loader, blob_parser)
    loaded = await loader.load_all()

    # Check the loader's result itself: wrapping it in a list first (as the
    # return value requires) would make the list non-empty unconditionally
    # and the emptiness check could never fire.
    if not loaded:
        raise ValueError(
            f"No documents were loaded for file: {upload_file.filename}")

    # Keep the established return shape: the loader result as a single entry.
    return [loaded]
async def load_all_documents(upload_files: List[UploadFile]) -> List[List[Document]]:
    """Load every uploaded file in order and gather the results in one list.

    Each file is passed to ``load_document``. A file whose loading raises
    ``ValueError`` is reported on stdout and skipped; any other exception
    propagates to the caller.

    Args:
        upload_files: The uploaded files to process, in order.

    Returns:
        The concatenation of the lists returned by ``load_document`` for
        every file that loaded successfully.

    Raises:
        ValueError: If nothing at all was loaded from the given files.
    """
    collected = []
    for current in upload_files:
        try:
            per_file = await load_document(current)
        except ValueError as err:
            print(f"Error loading {current.filename}: {err}")
            continue
        collected += per_file

    if collected:
        return collected
    raise ValueError("No documents were loaded from the provided files.")
# Example usage:
# Note: `load_all_documents` is a coroutine, so call it from inside an async
# function or a running event loop.
# Example:
#     upload_files = [upload_file_1, upload_file_2, ...]
#     documents = await load_all_documents(upload_files)