jameszokah committed on
Commit
17ec764
·
verified ·
1 Parent(s): a7571eb

Update parser/msword_parser.py

Browse files
Files changed (1) hide show
  1. parser/msword_parser.py +45 -44
parser/msword_parser.py CHANGED
@@ -1,44 +1,45 @@
1
- from typing import Iterator
2
- from langchain_core.documents import Document
3
- from langchain_community.document_loaders.base import BaseBlobParser
4
- from langchain_community.document_loaders.blob_loaders import Blob
5
-
6
-
7
- class MsWordParser(BaseBlobParser):
8
- """Parse Microsoft Word documents from a blob."""
9
-
10
- # type: ignore[valid-type]
11
- def lazy_parse(self, blob: Blob) -> Iterator[Document]:
12
- """Parse a Microsoft Word document into the Document iterator.
13
-
14
- Args:
15
- blob: The blob to parse.
16
-
17
- Returns: An iterator of Documents.
18
- """
19
- try:
20
- from docx import Document as DocxDocument
21
- except ImportError as e:
22
- raise ImportError(
23
- "Could not import python-docx, please install with `pip install python-docx`."
24
- ) from e
25
-
26
- supported_mime_types = [
27
- "application/msword",
28
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
29
- ]
30
-
31
- # Debugging: Print MIME type
32
- print(f"Blob MIME type: {blob.mimetype}")
33
-
34
- # type: ignore[attr-defined]
35
- if blob.mimetype not in supported_mime_types:
36
- raise ValueError(
37
- f"This blob type is not supported for this parser. Supported types are: {supported_mime_types}"
38
- )
39
-
40
- with blob.as_bytes_io() as word_document: # type: ignore[attr-defined]
41
- doc = DocxDocument(word_document)
42
- text = "\n\n".join([para.text for para in doc.paragraphs])
43
- metadata = {"source": blob.source} # type: ignore[attr-defined]
44
- yield Document(page_content=text, metadata=metadata)
 
 
1
+ from typing import Iterator
2
+ from langchain_core.documents import Document
3
+ from langchain_community.document_loaders.base import BaseBlobParser
4
+ from langchain_community.document_loaders.blob_loaders import Blob
5
+
6
+
7
+ class MsWordParser(BaseBlobParser):
8
+ """Parse Microsoft Word documents from a blob."""
9
+
10
+ # type: ignore[valid-type]
11
+ def lazy_parse(self, blob: Blob) -> Iterator[Document]:
12
+ """Parse a Microsoft Word document into the Document iterator.
13
+
14
+ Args:
15
+ blob: The blob to parse.
16
+
17
+ Returns: An iterator of Documents.
18
+ """
19
+ try:
20
+ from docx import Document as DocxDocument
21
+ except ImportError as e:
22
+ raise ImportError(
23
+ "Could not import python-docx, please install with `pip install python-docx`."
24
+ ) from e
25
+
26
+ supported_mime_types = [
27
+ "application/msword",
28
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
29
+ "application/vnd.openxmlformats-officedocument.themeManager+xml",
30
+ ]
31
+
32
+ # Debugging: Print MIME type
33
+ print(f"Blob MIME type: {blob.mimetype}")
34
+
35
+ # type: ignore[attr-defined]
36
+ if blob.mimetype not in supported_mime_types:
37
+ raise ValueError(
38
+ f"This blob type is not supported for this parser. Supported types are: {supported_mime_types}"
39
+ )
40
+
41
+ with blob.as_bytes_io() as word_document: # type: ignore[attr-defined]
42
+ doc = DocxDocument(word_document)
43
+ text = "\n\n".join([para.text for para in doc.paragraphs])
44
+ metadata = {"source": blob.source} # type: ignore[attr-defined]
45
+ yield Document(page_content=text, metadata=metadata)