import logging
from typing import Iterator

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob

logger = logging.getLogger(__name__)


class XlsxParser(BaseBlobParser):
    """Parse Microsoft Excel (.xlsx) spreadsheets from a blob."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse an Excel workbook into one Document per worksheet.

        Each worksheet becomes a single Document whose ``page_content`` holds
        the sheet's rows, one per line (every row is newline-terminated), with
        cells separated by tabs. Empty cells (``None``) are rendered as empty
        strings; all other values are converted with ``str``.

        Args:
            blob: The blob to parse. Its ``mimetype`` must be the .xlsx MIME
                type.

        Yields:
            A Document per worksheet, with metadata keys ``"source"`` (taken
            from ``blob.source``) and ``"sheet"`` (the worksheet title).

        Raises:
            ImportError: If ``openpyxl`` is not installed.
            ValueError: If the blob's MIME type is not supported.
        """
        try:
            from openpyxl import load_workbook
        except ImportError as e:
            raise ImportError(
                "Could not import openpyxl, please install with "
                "`pip install openpyxl`."
            ) from e

        supported_mime_types = [
            # .xlsx
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        ]

        # Diagnostics go through logging rather than stdout so that library
        # consumers decide whether they want to see them.
        logger.debug("Blob MIME type: %s", blob.mimetype)

        if blob.mimetype not in supported_mime_types:
            raise ValueError(
                "This blob type is not supported for this parser. "
                f"Supported types are: {supported_mime_types}"
            )

        with blob.as_bytes_io() as xlsx_file:
            # data_only=True asks openpyxl for the stored cell values instead
            # of formula text.
            workbook = load_workbook(xlsx_file, data_only=True)

            # Iterate ``worksheets`` rather than ``sheetnames``: the latter
            # also lists chart sheets, which expose no ``iter_rows`` and would
            # raise AttributeError.
            for worksheet in workbook.worksheets:
                lines = [
                    "\t".join(
                        "" if cell is None else str(cell) for cell in row
                    )
                    for row in worksheet.iter_rows(values_only=True)
                ]
                # Terminate every row with a newline; a single join avoids
                # repeated string concatenation in the loop.
                text = "".join(f"{line}\n" for line in lines)
                metadata = {"source": blob.source, "sheet": worksheet.title}
                yield Document(page_content=text, metadata=metadata)