# Spaces: Runtime error
# File size: 3,031 Bytes
# ed4d993
from typing import TYPE_CHECKING, Dict, List, Union
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
if TYPE_CHECKING:
from chm import chm
class UnstructuredCHMLoader(UnstructuredFileLoader):
    """Load `CHM` files using `Unstructured`.

    CHM means Microsoft Compiled HTML Help.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredCHMLoader

    loader = UnstructuredCHMLoader("example.chm")
    docs = loader.load()

    References
    ----------
    https://github.com/dottedmag/pychm
    http://www.jedrea.com/chmlib/
    """

    def _get_elements(self) -> List:
        """Partition every page listed in the CHM table of contents.

        Each page's HTML is passed to ``partition_html`` together with
        ``self.unstructured_kwargs``.

        Returns:
            A single flat list holding the elements of all pages, in
            table-of-contents order.
        """
        from unstructured.partition.html import partition_html

        # Every page is fully decoded to ``str`` by ``load_all``, so the
        # archive can be closed before the partitioning work starts.
        with CHMParser(self.file_path) as f:  # type: ignore[arg-type]
            pages = f.load_all()

        elements: List = []
        for page in pages:
            html = page["content"]
            if not html.strip():
                # An empty page cannot contribute any element.
                # NOTE(review): partition_html is believed to reject empty
                # ``text`` input -- confirm against the installed version.
                continue
            # ``extend`` rather than ``append``: partition_html yields a list
            # per page, and appending would produce a list of lists.
            # NOTE(review): assumes the base loader iterates individual
            # elements -- confirm against UnstructuredFileLoader.
            elements.extend(partition_html(text=html, **self.unstructured_kwargs))
        return elements
class CHMParser:
    """Microsoft Compiled HTML Help (CHM) Parser.

    Wraps a pychm ``chm.CHMFile`` and exposes the table of contents and the
    decoded page contents. Usable as a context manager; the archive is
    closed on exit.
    """

    path: str
    file: "chm.CHMFile"

    def __init__(self, path: str):
        """Open the CHM archive located at ``path``.

        Raises:
            OSError: If pychm reports that the archive could not be opened.
        """
        from chm import chm

        self.path = path
        self.file = chm.CHMFile()
        # LoadCHM reports failure through a falsy return value rather than an
        # exception; surface it here instead of failing obscurely later on.
        if not self.file.LoadCHM(path):
            raise OSError(f"Unable to open CHM file: {path!r}")

    def __enter__(self):  # type: ignore[no-untyped-def]
        """Return the parser itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):  # type: ignore[no-untyped-def]
        """Close the underlying archive; exceptions are not suppressed."""
        if self.file:
            self.file.CloseCHM()

    @property
    def encoding(self) -> str:
        """Name of the codec used to decode archive contents.

        Falls back to ``"utf-8"`` when the archive does not report an
        encoding.
        """
        raw = self.file.GetEncoding()
        if not raw:
            # No encoding reported: calling ``.decode`` on it would fail.
            return "utf-8"
        if isinstance(raw, bytes):
            return raw.decode("utf-8")
        return str(raw)

    def index(self) -> List[Dict[str, str]]:
        """Return the table-of-contents entries of the archive.

        Returns:
            One ``{"name": ..., "local": ...}`` dict per ``<object>`` in the
            topics tree that carries both a ``Name`` and a ``Local`` param.
            ``local`` is reduced to its URL path and always starts with
            ``"/"``. An archive without a topics tree yields an empty list.
        """
        topics = self.file.GetTopicsTree()
        if not topics:
            return []

        from urllib.parse import urlparse

        from bs4 import BeautifulSoup

        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(topics.decode(self.encoding), "html.parser")

        entries: List[Dict[str, str]] = []
        # <OBJECT ..>
        for obj in soup.find_all("object"):
            # <param name="Name" value="<...>">
            # <param name="Local" value="<...>">
            name = ""
            local = ""
            for param in obj.find_all("param"):
                # ``.get`` tolerates <param> tags lacking name/value.
                key = param.get("name")
                if key == "Name":
                    name = param.get("value", "")
                elif key == "Local":
                    local = param.get("value", "")
            if not name or not local:
                continue
            # Keep only the path component (drops fragment and query).
            local = urlparse(local).path
            if not local.startswith("/"):
                local = "/" + local
            entries.append({"name": name, "local": local})
        return entries

    def load(self, path: Union[str, bytes]) -> str:
        """Return the decoded content of the object stored at ``path``.

        Args:
            path: Path inside the archive; ``str`` values are UTF-8 encoded
                before lookup.

        Raises:
            FileNotFoundError: If ``path`` cannot be resolved in the archive.
        """
        if isinstance(path, str):
            path = path.encode("utf-8")
        resolved = self.file.ResolveObject(path)
        status, unit = resolved[0], resolved[1]
        # A status of 0 is chmlib's CHM_RESOLVE_SUCCESS.
        if status != 0 or unit is None:
            raise FileNotFoundError(
                f"Object {path!r} not found in CHM file {self.path!r}"
            )
        return self.file.RetrieveObject(unit)[1].decode(self.encoding)

    def load_all(self) -> List[Dict[str, str]]:
        """Load the content of every entry returned by ``index()``.

        Returns:
            One ``{"name": ..., "local": ..., "content": ...}`` dict per
            entry, in table-of-contents order.
        """
        return [
            {
                "name": item["name"],
                "local": item["local"],
                "content": self.load(item["local"]),
            }
            for item in self.index()
        ]
|