Spaces:
Sleeping
Sleeping
from typing import Iterator | |
from langchain_core.documents import Document | |
from langchain_community.document_loaders.base import BaseBlobParser | |
from langchain_community.document_loaders.blob_loaders import Blob | |
class PptxParser(BaseBlobParser):
    """Parse Microsoft PowerPoint presentations from a blob."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Parse a Microsoft PowerPoint document into a Document iterator.

        The text of every shape that exposes a ``text`` attribute is
        collected, slide by slide, and emitted as a single ``Document`` whose
        ``page_content`` holds one shape text per line.

        Args:
            blob: The blob to parse. Its ``mimetype`` must be one of the
                supported PowerPoint MIME types.

        Returns:
            An iterator yielding exactly one ``Document`` for the whole
            presentation, with ``{"source": blob.source}`` as metadata.

        Raises:
            ImportError: If ``python-pptx`` is not installed.
            ValueError: If the blob's MIME type is not a supported
                PowerPoint type.
        """
        # Imported locally so the block does not depend on module-level
        # imports beyond those the file already declares.
        import logging

        try:
            from pptx import Presentation
        except ImportError as e:
            raise ImportError(
                "Could not import python-pptx, please install with `pip install python-pptx`."
            ) from e

        # NOTE(review): the legacy binary .ppt type is accepted here, but
        # python-pptx is documented as an Open XML (.pptx) library -- confirm
        # that .ppt blobs can actually be opened before relying on this.
        supported_mime_types = [
            "application/vnd.ms-powerpoint",  # .ppt
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",  # .pptx
        ]

        # Report the MIME type through logging rather than print() so that a
        # library parser never writes to the host application's stdout.
        logging.getLogger(__name__).debug("Blob MIME type: %s", blob.mimetype)

        if blob.mimetype not in supported_mime_types:  # type: ignore[attr-defined]
            raise ValueError(
                f"This blob type is not supported for this parser. Supported types are: {supported_mime_types}"
            )

        with blob.as_bytes_io() as pptx_file:  # type: ignore[attr-defined]
            presentation = Presentation(pptx_file)
            # Build the text in one join instead of repeated `+=`; shapes
            # without a `text` attribute are skipped.
            text = "".join(
                shape.text + "\n"
                for slide in presentation.slides
                for shape in slide.shapes
                if hasattr(shape, "text")
            )

        metadata = {"source": blob.source}  # type: ignore[attr-defined]
        yield Document(page_content=text, metadata=metadata)