Spaces:
Runtime error
Runtime error
from typing import TYPE_CHECKING, Dict, List, Union | |
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader | |
if TYPE_CHECKING: | |
from chm import chm | |
class UnstructuredCHMLoader(UnstructuredFileLoader): | |
"""Load `CHM` files using `Unstructured`. | |
CHM means Microsoft Compiled HTML Help. | |
Examples | |
-------- | |
from langchain_community.document_loaders import UnstructuredCHMLoader | |
loader = UnstructuredCHMLoader("example.chm") | |
docs = loader.load() | |
References | |
---------- | |
https://github.com/dottedmag/pychm | |
http://www.jedrea.com/chmlib/ | |
""" | |
def _get_elements(self) -> List: | |
from unstructured.partition.html import partition_html | |
with CHMParser(self.file_path) as f: # type: ignore[arg-type] | |
return [ | |
partition_html(text=item["content"], **self.unstructured_kwargs) | |
for item in f.load_all() | |
] | |
class CHMParser(object): | |
"""Microsoft Compiled HTML Help (CHM) Parser.""" | |
path: str | |
file: "chm.CHMFile" | |
def __init__(self, path: str): | |
from chm import chm | |
self.path = path | |
self.file = chm.CHMFile() | |
self.file.LoadCHM(path) | |
def __enter__(self): # type: ignore[no-untyped-def] | |
return self | |
def __exit__(self, exc_type, exc_value, traceback): # type: ignore[no-untyped-def] | |
if self.file: | |
self.file.CloseCHM() | |
def encoding(self) -> str: | |
return self.file.GetEncoding().decode("utf-8") | |
def index(self) -> List[Dict[str, str]]: | |
from urllib.parse import urlparse | |
from bs4 import BeautifulSoup | |
res = [] | |
index = self.file.GetTopicsTree().decode(self.encoding) | |
soup = BeautifulSoup(index) | |
# <OBJECT ..> | |
for obj in soup.find_all("object"): | |
# <param name="Name" value="<...>"> | |
# <param name="Local" value="<...>"> | |
name = "" | |
local = "" | |
for param in obj.find_all("param"): | |
if param["name"] == "Name": | |
name = param["value"] | |
if param["name"] == "Local": | |
local = param["value"] | |
if not name or not local: | |
continue | |
local = urlparse(local).path | |
if not local.startswith("/"): | |
local = "/" + local | |
res.append({"name": name, "local": local}) | |
return res | |
def load(self, path: Union[str, bytes]) -> str: | |
if isinstance(path, str): | |
path = path.encode("utf-8") | |
obj = self.file.ResolveObject(path)[1] | |
return self.file.RetrieveObject(obj)[1].decode(self.encoding) | |
def load_all(self) -> List[Dict[str, str]]: | |
res = [] | |
index = self.index() | |
for item in index: | |
content = self.load(item["local"]) | |
res.append( | |
{"name": item["name"], "local": item["local"], "content": content} | |
) | |
return res | |