|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json |
|
import re |
|
from abc import ABC, abstractmethod |
|
from copy import deepcopy |
|
from hashlib import md5 |
|
from io import BytesIO |
|
from typing import Any, Dict, List, Optional |
|
|
|
from camel.utils import dependencies_required |
|
|
|
|
|
class File(ABC):
    r"""Abstract base for an uploaded file and the documents parsed from it.

    Args:
        name (str): The name of the file.
        file_id (str): The unique identifier of the file.
        metadata (Dict[str, Any], optional): Additional metadata
            associated with the file. A falsy value is stored as an empty
            dict. Defaults to None.
        docs (List[Dict[str, Any]], optional): A list of documents
            contained within the file. A falsy value is stored as an empty
            list. Defaults to None.
        raw_bytes (bytes, optional): The raw bytes content of the file.
            Defaults to b"".
    """

    def __init__(
        self,
        name: str,
        file_id: str,
        metadata: Optional[Dict[str, Any]] = None,
        docs: Optional[List[Dict[str, Any]]] = None,
        raw_bytes: bytes = b"",
    ):
        self.name = name
        self.file_id = file_id
        # Falsy inputs (None or empty) are normalised to fresh containers.
        self.metadata = metadata if metadata else {}
        self.docs = docs if docs else []
        self.raw_bytes = raw_bytes

    @classmethod
    @abstractmethod
    def from_bytes(cls, file: BytesIO, filename: str) -> "File":
        r"""Builds a File object from a BytesIO object.

        Concrete subclasses must override this method.

        Args:
            file (BytesIO): A BytesIO object holding the contents of the
                file.
            filename (str): The name of the file.

        Returns:
            File: A File object.
        """

    @classmethod
    def from_raw_bytes(cls, raw_bytes: bytes, filename: str) -> "File":
        r"""Builds a File object from raw bytes.

        The bytes are wrapped in a BytesIO and handed to
        :meth:`from_bytes` of the class this is called on.

        Args:
            raw_bytes (bytes): The raw bytes content of the file.
            filename (str): The name of the file.

        Returns:
            File: A File object.
        """
        return cls.from_bytes(BytesIO(raw_bytes), filename)

    @staticmethod
    def create_file(file: BytesIO, filename: str) -> "File":
        r"""Reads an uploaded file and returns a File object.

        The concrete class is selected from the lower-cased text after the
        last ``.`` in ``filename``.

        Args:
            file (BytesIO): A BytesIO object holding the contents of the
                file.
            filename (str): The name of the file.

        Returns:
            File: A File object.

        Raises:
            NotImplementedError: If the extension has no registered class.
        """
        ext_to_cls = {
            "docx": DocxFile,
            "pdf": PdfFile,
            "txt": TxtFile,
            "json": JsonFile,
            "html": HtmlFile,
        }

        ext = filename.rsplit(".", 1)[-1].lower()
        file_cls = ext_to_cls.get(ext)
        if file_cls is None:
            raise NotImplementedError(f"File type {ext} not supported")
        return file_cls.from_bytes(file, filename)

    @staticmethod
    def create_file_from_raw_bytes(raw_bytes: bytes, filename: str) -> "File":
        r"""Reads raw bytes and returns a File object.

        Args:
            raw_bytes (bytes): The raw bytes content of the file.
            filename (str): The name of the file.

        Returns:
            File: A File object.
        """
        return File.create_file(BytesIO(raw_bytes), filename)

    def __repr__(self) -> str:
        head = f"File(name={self.name}, id={self.file_id}, "
        tail = f"metadata={self.metadata}, docs={self.docs})"
        return head + tail

    def __str__(self) -> str:
        # Same as ``__repr__`` but without the (potentially large) docs.
        head = f"File(name={self.name}, id={self.file_id}, metadata="
        tail = f"{self.metadata})"
        return head + tail

    def copy(self) -> "File":
        r"""Create a deep copy of this File.

        ``metadata`` and ``docs`` are deep-copied; the remaining fields are
        passed through unchanged to a new instance of the same class.
        """
        cloned_fields = {
            "name": self.name,
            "file_id": self.file_id,
            "metadata": deepcopy(self.metadata),
            "docs": deepcopy(self.docs),
            "raw_bytes": self.raw_bytes,
        }
        return self.__class__(**cloned_fields)
|
|
|
|
|
def strip_consecutive_newlines(text: str) -> str:
    r"""Collapses newline runs, and the whitespace around them, to one newline.

    Every stretch of whitespace that contains at least one ``\n`` is replaced
    by a single ``\n``. Whitespace that does not touch a newline is kept.

    Args:
        text (str): The string to strip.

    Returns:
        str: The string with consecutive newlines stripped.
    """
    newline_run = re.compile(r"\s*\n\s*")
    return newline_run.sub("\n", text)
|
|
|
|
|
class DocxFile(File):
    r"""File subclass for ``.docx`` input, holding a single document."""

    @classmethod
    @dependencies_required('docx2txt')
    def from_bytes(cls, file: BytesIO, filename: str) -> "DocxFile":
        r"""Builds a DocxFile object from a BytesIO object.

        Text is extracted with ``docx2txt.process``, newline runs are
        collapsed, and the result is stored as one document. The file id is
        the MD5 hex digest of the full buffer. The stream is rewound to
        position 0 before returning.

        Args:
            file (BytesIO): A BytesIO object holding the contents of the
                docx file.
            filename (str): The name of the file.

        Returns:
            DocxFile: A DocxFile object.
        """
        import docx2txt

        extracted = strip_consecutive_newlines(docx2txt.process(file))
        document = {"page_content": extracted.strip()}

        # One snapshot of the buffer serves both the hash and raw_bytes.
        payload = file.getvalue()
        digest = md5(payload).hexdigest()

        file.seek(0)
        return cls(
            name=filename,
            file_id=digest,
            docs=[document],
            raw_bytes=payload,
        )
|
|
|
|
|
class PdfFile(File):
    r"""File subclass for PDF input, holding one document per page."""

    @classmethod
    def from_bytes(cls, file: BytesIO, filename: str) -> "PdfFile":
        r"""Creates a PdfFile object from a BytesIO object.

        Each page becomes one document with its text under ``page_content``
        and its 1-based number under ``page``. Text is taken from
        ``page.get_text(sort=True)`` with newline runs collapsed. The file
        id is the MD5 hex digest of the full buffer. The stream is rewound
        to position 0 before returning.

        Args:
            file (BytesIO): A BytesIO object representing the contents of the
                pdf file.
            filename (str): The name of the file.

        Returns:
            PdfFile: A PdfFile object.

        Raises:
            ImportError: If ``PyMuPDF`` is not installed.
        """
        try:
            import fitz
        except ImportError as err:
            raise ImportError(
                "Please install `PyMuPDF` first. "
                "You can install it by running "
                "`pip install PyMuPDF`."
            ) from err

        # Parse the whole buffer rather than whatever is left after the
        # current stream position, so the pages always describe the same
        # bytes that are hashed and stored below.
        raw_bytes = file.getvalue()

        docs = []
        # The context manager closes the PyMuPDF document once the pages
        # have been read, instead of leaving it open.
        with fitz.open(stream=raw_bytes, filetype="pdf") as pdf:
            for page_number, page in enumerate(pdf, start=1):
                text = strip_consecutive_newlines(page.get_text(sort=True))
                docs.append(
                    {"page_content": text.strip(), "page": page_number}
                )

        file_id = md5(raw_bytes).hexdigest()

        file.seek(0)
        return cls(
            name=filename,
            file_id=file_id,
            docs=docs,
            raw_bytes=raw_bytes,
        )
|
|
|
|
|
class TxtFile(File):
    r"""File subclass for plain-text input, holding a single document."""

    @classmethod
    def from_bytes(cls, file: BytesIO, filename: str) -> "TxtFile":
        r"""Creates a TxtFile object from a BytesIO object.

        The buffer is decoded as UTF-8 (a leading byte-order mark, if any,
        is dropped), newline runs are collapsed, and the result is stored as
        one document. The file id is the MD5 hex digest of the full buffer.
        The stream is rewound to position 0 before returning.

        Args:
            file (BytesIO): A BytesIO object representing the contents of the
                txt file.
            filename (str): The name of the file.

        Returns:
            TxtFile: A TxtFile object.

        Raises:
            UnicodeDecodeError: If the content is not valid UTF-8.
        """
        # Decode the whole buffer rather than whatever is left after the
        # current stream position, so the text always matches the bytes
        # that are hashed and stored below.
        raw_bytes = file.getvalue()

        # "utf-8-sig" decodes exactly like "utf-8" but removes a leading
        # BOM, which ``str.strip`` would otherwise leave in the content.
        text = raw_bytes.decode("utf-8-sig")
        text = strip_consecutive_newlines(text)

        doc = {"page_content": text.strip()}

        file_id = md5(raw_bytes).hexdigest()

        file.seek(0)
        return cls(
            name=filename,
            file_id=file_id,
            docs=[doc],
            raw_bytes=raw_bytes,
        )
|
|
|
|
|
class JsonFile(File):
    r"""File subclass for JSON input, holding a single document."""

    @classmethod
    def from_bytes(cls, file: BytesIO, filename: str) -> "JsonFile":
        r"""Builds a JsonFile object from a BytesIO object.

        The stream is parsed as JSON and re-serialised with ``json.dumps``;
        that string is stored as the only document. The file id is the MD5
        hex digest of the full buffer. The stream is rewound to position 0
        before returning.

        Args:
            file (BytesIO): A BytesIO object holding the contents of the
                json file.
            filename (str): The name of the file.

        Returns:
            JsonFile: A JsonFile object.
        """
        parsed = json.loads(file.read())
        document = {"page_content": json.dumps(parsed)}

        # One snapshot of the buffer serves both the hash and raw_bytes.
        payload = file.getvalue()
        digest = md5(payload).hexdigest()

        file.seek(0)
        return cls(
            name=filename,
            file_id=digest,
            docs=[document],
            raw_bytes=payload,
        )
|
|
|
|
|
class HtmlFile(File):
    r"""File subclass for HTML input, holding a single document."""

    @classmethod
    def from_bytes(cls, file: BytesIO, filename: str) -> "HtmlFile":
        r"""Creates a HtmlFile object from a BytesIO object.

        The markup is parsed with BeautifulSoup's ``html.parser``, its text
        is taken from ``get_text()`` with newline runs collapsed, and the
        result is stored as one document. The file id is the MD5 hex digest
        of the full buffer. The stream is rewound to position 0 before
        returning.

        Args:
            file (BytesIO): A BytesIO object representing the contents of the
                html file.
            filename (str): The name of the file.

        Returns:
            HtmlFile: A HtmlFile object.

        Raises:
            ImportError: If ``beautifulsoup4`` is not installed.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError as err:
            raise ImportError(
                "Please install `beautifulsoup4` first. "
                "You can install it by running "
                "`pip install beautifulsoup4`."
            ) from err

        # Parse the whole buffer rather than whatever is left after the
        # current stream position, so the text always matches the bytes
        # that are hashed and stored below.
        raw_bytes = file.getvalue()

        soup = BeautifulSoup(raw_bytes, "html.parser")
        text = soup.get_text()
        text = strip_consecutive_newlines(text)

        doc = {"page_content": text.strip()}

        file_id = md5(raw_bytes).hexdigest()

        file.seek(0)
        return cls(
            name=filename,
            file_id=file_id,
            docs=[doc],
            raw_bytes=raw_bytes,
        )
|
|