Spaces:

jameszokah
/

marigen_api

Sleeping

App Files Community

jameszokah committed on Jul 27, 2024

Commit

17ec764

verified ·

1 Parent(s): a7571eb

Update parser/msword_parser.py

Browse files

Files changed (1) hide show

parser/msword_parser.py +45 -44

parser/msword_parser.py CHANGED Viewed

@@ -1,44 +1,45 @@
-from typing import Iterator
-from langchain_core.documents import Document
-from langchain_community.document_loaders.base import BaseBlobParser
-from langchain_community.document_loaders.blob_loaders import Blob
-class MsWordParser(BaseBlobParser):
-    """Parse Microsoft Word documents from a blob."""
-    # type: ignore[valid-type]
-    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
-        """Parse a Microsoft Word document into the Document iterator.
-        Args:
-            blob: The blob to parse.
-        Returns: An iterator of Documents.
-        """
-        try:
-            from docx import Document as DocxDocument
-        except ImportError as e:
-            raise ImportError(
-                "Could not import python-docx, please install with `pip install python-docx`."
-            ) from e
-        supported_mime_types = [
-            "application/msword",
-            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-        ]
-        # Debugging: Print MIME type
-        print(f"Blob MIME type: {blob.mimetype}")
-        # type: ignore[attr-defined]
-        if blob.mimetype not in supported_mime_types:
-            raise ValueError(
-                f"This blob type is not supported for this parser. Supported types are: {supported_mime_types}"
-            )
-        with blob.as_bytes_io() as word_document:  # type: ignore[attr-defined]
-            doc = DocxDocument(word_document)
-            text = "\n\n".join([para.text for para in doc.paragraphs])
-            metadata = {"source": blob.source}  # type: ignore[attr-defined]
-            yield Document(page_content=text, metadata=metadata)

+from typing import Iterator
+from langchain_core.documents import Document
+from langchain_community.document_loaders.base import BaseBlobParser
+from langchain_community.document_loaders.blob_loaders import Blob
+class MsWordParser(BaseBlobParser):
+    """Parse Microsoft Word documents from a blob."""
+    # type: ignore[valid-type]
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Parse a Microsoft Word document into the Document iterator.
+        Args:
+            blob: The blob to parse.
+        Returns: An iterator of Documents.
+        """
+        try:
+            from docx import Document as DocxDocument
+        except ImportError as e:
+            raise ImportError(
+                "Could not import python-docx, please install with `pip install python-docx`."
+            ) from e
+        supported_mime_types = [
+            "application/msword",
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "application/vnd.openxmlformats-officedocument.themeManager+xml",
+        ]
+        # Debugging: Print MIME type
+        print(f"Blob MIME type: {blob.mimetype}")
+        # type: ignore[attr-defined]
+        if blob.mimetype not in supported_mime_types:
+            raise ValueError(
+                f"This blob type is not supported for this parser. Supported types are: {supported_mime_types}"
+            )
+        with blob.as_bytes_io() as word_document:  # type: ignore[attr-defined]
+            doc = DocxDocument(word_document)
+            text = "\n\n".join([para.text for para in doc.paragraphs])
+            metadata = {"source": blob.source}  # type: ignore[attr-defined]
+            yield Document(page_content=text, metadata=metadata)