Spaces:
Runtime error
Runtime error
File size: 6,271 Bytes
ed4d993 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
# Prerequisites:
# 1. Create a Dropbox app.
# 2. Give the app these scope permissions: `files.metadata.read`
# and `files.content.read`.
# 3. Generate access token: https://www.dropbox.com/developers/apps/create.
# 4. `pip install dropbox`. Loading PDF files additionally requires
#    `pip install unstructured[pdf]`.
import os
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, root_validator
from langchain_community.document_loaders.base import BaseLoader
class DropboxLoader(BaseLoader, BaseModel):
    """Load files from `Dropbox`.

    In addition to common files such as text and PDF files, it also supports
    *Dropbox Paper* files.

    Exactly one of ``dropbox_folder_path`` and ``dropbox_file_paths`` must be
    provided; this is enforced by ``validate_inputs``.
    """

    dropbox_access_token: str
    """Dropbox access token."""
    dropbox_folder_path: Optional[str] = None
    """The folder path to load from."""
    dropbox_file_paths: Optional[List[str]] = None
    """The file paths to load from."""
    recursive: bool = False
    """Flag to indicate whether to load files recursively from subfolders."""

    @root_validator
    def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Validate that either folder_path or file_paths is set, but not both.

        Raises:
            ValueError: If both inputs are given, or if neither is given
                (an empty ``dropbox_file_paths`` list counts as not given
                when no folder path is set).
        """
        folder_path = values.get("dropbox_folder_path")
        file_paths = values.get("dropbox_file_paths")

        if folder_path is not None and file_paths is not None:
            raise ValueError("Cannot specify both folder_path and file_paths")
        if folder_path is None and not file_paths:
            raise ValueError("Must specify either folder_path or file_paths")
        return values

    def _create_dropbox_client(self) -> Any:
        """Create a Dropbox client and verify the access token.

        Returns:
            A ``dropbox.Dropbox`` client built from ``dropbox_access_token``.

        Raises:
            ImportError: If the ``dropbox`` package is not installed.
            ValueError: If Dropbox rejects the access token.
        """
        try:
            from dropbox import Dropbox, exceptions
        except ImportError as ex:
            raise ImportError("You must run `pip install dropbox`") from ex

        try:
            dbx = Dropbox(self.dropbox_access_token)
            # Round trip to Dropbox so a bad token fails here, not later.
            dbx.users_get_current_account()
        except exceptions.AuthError as ex:
            raise ValueError(
                "Invalid Dropbox access token. Please verify your token and try again."
            ) from ex
        return dbx

    def _load_documents_from_folder(self, folder_path: str) -> List[Document]:
        """Load documents from a Dropbox folder.

        Args:
            folder_path: Dropbox path of the folder to list. Subfolders are
                traversed only when ``recursive`` is true.

        Returns:
            One document per file that could be loaded; files that cannot be
            decoded or parsed are skipped.

        Raises:
            ImportError: If the ``dropbox`` package is not installed.
            ValueError: If the folder cannot be listed or a file cannot be
                fetched.
        """
        dbx = self._create_dropbox_client()

        try:
            from dropbox import exceptions
            from dropbox.files import FileMetadata
        except ImportError as ex:
            raise ImportError("You must run `pip install dropbox`") from ex

        try:
            results = dbx.files_list_folder(folder_path, recursive=self.recursive)
            entries = list(results.entries)
            # The listing is paginated: keep following the cursor until
            # Dropbox reports there are no more entries.
            while results.has_more:
                results = dbx.files_list_folder_continue(results.cursor)
                entries.extend(results.entries)
        except exceptions.ApiError as ex:
            raise ValueError(
                f"Could not list files in the folder: {folder_path}. "
                "Please verify the folder path and try again."
            ) from ex

        documents: List[Document] = []
        for entry in entries:
            # Folder and deleted entries carry no content to load.
            if not isinstance(entry, FileMetadata):
                continue
            # Reuse the already-validated client for every file.
            doc = self._load_file_from_path(entry.path_display, dbx)
            if doc is not None:
                documents.append(doc)
        return documents

    def _load_file_from_path(
        self, file_path: str, dbx: Optional[Any] = None
    ) -> Optional[Document]:
        """Load a file from a Dropbox path.

        Args:
            file_path: Dropbox path of the file to load.
            dbx: Optional existing Dropbox client to reuse. When omitted, a
                new client is created (and the token re-validated).

        Returns:
            A document with the file's text, or ``None`` when the entry has
            no retrievable content, is not UTF-8 text or a PDF, or the PDF
            could not be parsed. For PDFs the returned document (including
            its metadata) is whatever ``UnstructuredPDFLoader`` produced.

        Raises:
            ImportError: If the ``dropbox`` package is not installed.
            ValueError: If the Dropbox API reports an error for this path.
        """
        if dbx is None:
            dbx = self._create_dropbox_client()

        try:
            from dropbox import exceptions
        except ImportError as ex:
            raise ImportError("You must run `pip install dropbox`") from ex

        try:
            file_metadata = dbx.files_get_metadata(file_path)
            # Not every metadata type exposes these attributes (e.g. folder
            # metadata), so read them defensively.
            if getattr(file_metadata, "is_downloadable", False):
                _, response = dbx.files_download(file_path)
            # Some types such as Paper, need to be exported.
            elif getattr(file_metadata, "export_info", None):
                _, response = dbx.files_export(file_path, "markdown")
            else:
                print(  # noqa: T201
                    f"File {file_path} is neither downloadable nor exportable. "
                    "Skipping."
                )
                return None
        except exceptions.ApiError as ex:
            raise ValueError(
                f"Could not load file: {file_path}. Please verify the file path "
                "and try again."
            ) from ex

        content = response.content
        try:
            text = content.decode("utf-8")
        except UnicodeDecodeError:
            file_extension = os.path.splitext(file_path)[1].lower()

            if file_extension != ".pdf":
                print(  # noqa: T201
                    f"File {file_path} could not be decoded as pdf or text. Skipping."
                )
                return None

            print(f"File {file_path} type detected as .pdf")  # noqa: T201
            from langchain_community.document_loaders import UnstructuredPDFLoader

            # The PDF loader works on a file path, so spill the bytes to a
            # temporary directory that is removed as soon as parsing ends.
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_pdf = Path(temp_dir) / "tmp.pdf"
                with open(temp_pdf, mode="wb") as f:
                    f.write(content)

                try:
                    docs = UnstructuredPDFLoader(str(temp_pdf)).load()
                except Exception as pdf_ex:
                    # Best effort: a PDF that cannot be parsed is skipped.
                    print(f"Error while trying to parse PDF {file_path}: {pdf_ex}")  # noqa: T201
                    return None

            return docs[0] if docs else None

        metadata = {
            "source": f"dropbox://{file_path}",
            "title": os.path.basename(file_path),
        }
        return Document(page_content=text, metadata=metadata)

    def _load_documents_from_paths(self) -> List[Document]:
        """Load documents from a list of Dropbox file paths.

        Returns:
            One document per path in ``dropbox_file_paths`` that could be
            loaded; paths that yield no document are skipped.

        Raises:
            ValueError: If ``dropbox_file_paths`` is unset or empty, or a
                file cannot be fetched.
        """
        if not self.dropbox_file_paths:
            raise ValueError("file_paths must be set")

        # Create and validate the client once rather than once per file.
        dbx = self._create_dropbox_client()

        documents: List[Document] = []
        for file_path in self.dropbox_file_paths:
            doc = self._load_file_from_path(file_path, dbx)
            if doc is not None:
                documents.append(doc)
        return documents

    def load(self) -> List[Document]:
        """Load documents.

        Loads from ``dropbox_folder_path`` when it is set, otherwise from
        ``dropbox_file_paths``.
        """
        if self.dropbox_folder_path is not None:
            return self._load_documents_from_folder(self.dropbox_folder_path)
        return self._load_documents_from_paths()
|