|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import uuid |
|
import warnings |
|
from typing import ( |
|
IO, |
|
TYPE_CHECKING, |
|
Any, |
|
Dict, |
|
List, |
|
Literal, |
|
Optional, |
|
Tuple, |
|
Union, |
|
) |
|
if TYPE_CHECKING: |
|
from unstructured.documents.elements import Element |
|
import pdb |
|
|
|
class UnstructuredIO:
    r"""A class to handle various functionalities provided by the
    Unstructured library, including version checking, parsing, cleaning,
    extracting, staging, chunking data, and integrating with cloud
    services like S3 and Azure for data connection.

    References:
        https://docs.unstructured.io/
    """

    @staticmethod
    def create_element_from_text(
        text: str,
        element_id: Optional[str] = None,
        embeddings: Optional[List[float]] = None,
        filename: Optional[str] = None,
        file_directory: Optional[str] = None,
        last_modified: Optional[str] = None,
        filetype: Optional[str] = None,
        parent_id: Optional[str] = None,
    ) -> "Element":
        r"""Creates a Text element from a given text input, with optional
        metadata and embeddings.

        Args:
            text (str): The text content for the element.
            element_id (Optional[str], optional): Unique identifier for the
                element. A random UUID is generated when omitted.
                (default: :obj:`None`)
            embeddings (Optional[List[float]], optional): A list of float
                numbers representing the text embeddings.
                (default: :obj:`None`)
            filename (Optional[str], optional): The name of the file the
                element is associated with. (default: :obj:`None`)
            file_directory (Optional[str], optional): The directory path where
                the file is located. (default: :obj:`None`)
            last_modified (Optional[str], optional): The last modified date of
                the file. (default: :obj:`None`)
            filetype (Optional[str], optional): The type of the file.
                (default: :obj:`None`)
            parent_id (Optional[str], optional): The identifier of the parent
                element. (default: :obj:`None`)

        Returns:
            Element: An instance of Text with the provided content and
                metadata.
        """
        # Deferred import keeps `unstructured` an optional dependency.
        from unstructured.documents.elements import ElementMetadata, Text

        metadata = ElementMetadata(
            filename=filename,
            file_directory=file_directory,
            last_modified=last_modified,
            filetype=filetype,
            parent_id=parent_id,
        )

        return Text(
            text=text,
            element_id=element_id or str(uuid.uuid4()),
            metadata=metadata,
            embeddings=embeddings,
        )

    @staticmethod
    def parse_file_or_url(
        input_path: str,
        **kwargs: Any,
    ) -> Union[List["Element"], None]:
        r"""Loads a file or a URL and parses its contents into elements.

        Args:
            input_path (str): Path to the file or URL to be parsed.
            **kwargs: Extra kwargs passed to the partition function.

        Returns:
            Union[List[Element], None]: List of elements after parsing the
                file or URL if success, otherwise `None` (a warning is
                emitted describing the failure).

        Raises:
            FileNotFoundError: If the file does not exist at the path
                specified.

        Notes:
            Supported file types:
                "csv", "doc", "docx", "epub", "image", "md", "msg", "odt",
                "org", "pdf", "ppt", "pptx", "rtf", "rst", "tsv", "xlsx".

        References:
            https://unstructured-io.github.io/unstructured/
        """
        import os
        from urllib.parse import urlparse

        from unstructured.partition.auto import partition

        # An input is treated as a URL only when it has both a scheme
        # (e.g. "https") and a network location.
        parsed_url = urlparse(input_path)
        is_url = all([parsed_url.scheme, parsed_url.netloc])

        if is_url:
            try:
                elements = partition(url=input_path, **kwargs)
                return elements
            except Exception as e:
                # Best-effort parsing: report the cause instead of raising,
                # mirroring the behavior of `parse_bytes`.
                warnings.warn(f"Failed to parse the URL: {input_path}: {e}")
                return None
        else:
            if not os.path.exists(input_path):
                raise FileNotFoundError(
                    f"The file {input_path} was not found."
                )

            try:
                with open(input_path, "rb") as f:
                    elements = partition(file=f, **kwargs)
                    return elements
            except Exception as e:
                warnings.warn(
                    f"Failed to partition the file: {input_path}: {e}"
                )
                return None

    @staticmethod
    def parse_bytes(
        file: IO[bytes], **kwargs: Any
    ) -> Union[List["Element"], None]:
        r"""Parses a bytes stream and converts its contents into elements.

        Args:
            file (IO[bytes]): The file in bytes format to be parsed.
            **kwargs: Extra kwargs passed to the partition function.

        Returns:
            Union[List[Element], None]: List of elements after parsing the
                file if successful, otherwise `None`.

        Notes:
            Supported file types:
                "csv", "doc", "docx", "epub", "image", "md", "msg", "odt",
                "org", "pdf", "ppt", "pptx", "rtf", "rst", "tsv", "xlsx".

        References:
            https://docs.unstructured.io/open-source/core-functionality/partitioning
        """
        from unstructured.partition.auto import partition

        try:
            elements = partition(file=file, **kwargs)
            return elements
        except Exception as e:
            warnings.warn(f"Failed to partition the file stream: {e}")
            return None

    @staticmethod
    def clean_text_data(
        text: str,
        clean_options: Optional[List[Tuple[str, Dict[str, Any]]]] = None,
    ) -> str:
        r"""Cleans text data using a variety of cleaning functions provided by
        the `unstructured` library.

        This function applies multiple text cleaning utilities by calling the
        `unstructured` library's cleaning bricks for operations like
        replacing Unicode quotes, removing extra whitespace, dashes, non-ascii
        characters, and more.

        If no cleaning options are provided, a default set of cleaning
        operations is applied. These defaults including operations
        "replace_unicode_quotes", "clean_non_ascii_chars",
        "group_broken_paragraphs", and "clean_extra_whitespace".

        Args:
            text (str): The text to be cleaned.
            clean_options (Optional[List[Tuple[str, Dict[str, Any]]]]): A
                list of ``(function_name, params)`` pairs applied in order.
                Each name must match a cleaning function, and the paired
                dictionary supplies that function's keyword arguments.
                Supported types:
                'clean_extra_whitespace', 'clean_bullets',
                'clean_ordered_bullets', 'clean_postfix', 'clean_prefix',
                'clean_dashes', 'clean_trailing_punctuation',
                'clean_non_ascii_chars', 'group_broken_paragraphs',
                'remove_punctuation', 'replace_unicode_quotes',
                'bytes_string_to_string', 'translate_text'.

        Returns:
            str: The cleaned text.

        Raises:
            ValueError: If a cleaning option does not correspond to a
                valid cleaning function in `unstructured`.

        Notes:
            The `clean_options` names must correspond to valid cleaning
            brick names from the `unstructured` library.
            Each brick's parameters must be provided in a nested dictionary
            as the value for the key.

        References:
            https://unstructured-io.github.io/unstructured/
        """
        from unstructured.cleaners.core import (
            bytes_string_to_string,
            clean_bullets,
            clean_dashes,
            clean_extra_whitespace,
            clean_non_ascii_chars,
            clean_ordered_bullets,
            clean_postfix,
            clean_prefix,
            clean_trailing_punctuation,
            group_broken_paragraphs,
            remove_punctuation,
            replace_unicode_quotes,
        )
        from unstructured.cleaners.translate import translate_text

        cleaning_functions: Any = {
            "clean_extra_whitespace": clean_extra_whitespace,
            "clean_bullets": clean_bullets,
            "clean_ordered_bullets": clean_ordered_bullets,
            "clean_postfix": clean_postfix,
            "clean_prefix": clean_prefix,
            "clean_dashes": clean_dashes,
            "clean_trailing_punctuation": clean_trailing_punctuation,
            "clean_non_ascii_chars": clean_non_ascii_chars,
            "group_broken_paragraphs": group_broken_paragraphs,
            "remove_punctuation": remove_punctuation,
            "replace_unicode_quotes": replace_unicode_quotes,
            "bytes_string_to_string": bytes_string_to_string,
            "translate_text": translate_text,
        }

        # Sensible default pipeline when the caller gives no options.
        if clean_options is None:
            clean_options = [
                ("replace_unicode_quotes", {}),
                ("clean_non_ascii_chars", {}),
                ("group_broken_paragraphs", {}),
                ("clean_extra_whitespace", {}),
            ]

        cleaned_text = text
        for func_name, params in clean_options:
            if func_name in cleaning_functions:
                cleaned_text = cleaning_functions[func_name](
                    cleaned_text, **params
                )
            else:
                raise ValueError(
                    f"'{func_name}' is not a valid function in "
                    "`Unstructured IO`."
                )

        return cleaned_text

    @staticmethod
    def extract_data_from_text(
        text: str,
        extract_type: Literal[
            'extract_datetimetz',
            'extract_email_address',
            'extract_ip_address',
            'extract_ip_address_name',
            'extract_mapi_id',
            'extract_ordered_bullets',
            'extract_text_after',
            'extract_text_before',
            'extract_us_phone_number',
        ],
        **kwargs,
    ) -> Any:
        r"""Extracts various types of data from text using functions from
        unstructured.cleaners.extract.

        Args:
            text (str): Text to extract data from.
            extract_type (Literal['extract_datetimetz',
                'extract_email_address', 'extract_ip_address',
                'extract_ip_address_name', 'extract_mapi_id',
                'extract_ordered_bullets', 'extract_text_after',
                'extract_text_before', 'extract_us_phone_number']): Type of
                data to extract.
            **kwargs: Additional keyword arguments for specific
                extraction functions.

        Returns:
            Any: The extracted data, type depends on extract_type.

        Raises:
            ValueError: If `extract_type` is not a supported extraction
                function name.

        References:
            https://unstructured-io.github.io/unstructured/
        """
        from unstructured.cleaners.extract import (
            extract_datetimetz,
            extract_email_address,
            extract_ip_address,
            extract_ip_address_name,
            extract_mapi_id,
            extract_ordered_bullets,
            extract_text_after,
            extract_text_before,
            extract_us_phone_number,
        )

        extraction_functions: Any = {
            "extract_datetimetz": extract_datetimetz,
            "extract_email_address": extract_email_address,
            "extract_ip_address": extract_ip_address,
            "extract_ip_address_name": extract_ip_address_name,
            "extract_mapi_id": extract_mapi_id,
            "extract_ordered_bullets": extract_ordered_bullets,
            "extract_text_after": extract_text_after,
            "extract_text_before": extract_text_before,
            "extract_us_phone_number": extract_us_phone_number,
        }

        if extract_type not in extraction_functions:
            raise ValueError(f"Unsupported extract_type: {extract_type}")

        return extraction_functions[extract_type](text, **kwargs)

    @staticmethod
    def stage_elements(
        elements: List[Any],
        stage_type: Literal[
            'convert_to_csv',
            'convert_to_dataframe',
            'convert_to_dict',
            'dict_to_elements',
            'stage_csv_for_prodigy',
            'stage_for_prodigy',
            'stage_for_baseplate',
            'stage_for_datasaur',
            'stage_for_label_box',
            'stage_for_label_studio',
            'stage_for_weaviate',
        ],
        **kwargs,
    ) -> Union[str, List[Dict], Any]:
        r"""Stages elements for various platforms based on the
        specified staging type.

        This function applies multiple staging utilities to format data
        for different NLP annotation and machine learning tools. It uses
        the 'unstructured.staging' module's functions for operations like
        converting to CSV, DataFrame, dictionary, or formatting for
        specific platforms like Prodigy, etc.

        Args:
            elements (List[Any]): List of Element objects to be staged.
            stage_type (Literal['convert_to_csv', 'convert_to_dataframe',
                'convert_to_dict', 'dict_to_elements',
                'stage_csv_for_prodigy', 'stage_for_prodigy',
                'stage_for_baseplate', 'stage_for_datasaur',
                'stage_for_label_box', 'stage_for_label_studio',
                'stage_for_weaviate']): Type of staging to perform.
            **kwargs: Additional keyword arguments specific to
                the staging type.

        Returns:
            Union[str, List[Dict], Any]: Staged data in the
                format appropriate for the specified staging type.

        Raises:
            ValueError: If the staging type is not supported or a required
                argument is missing.
        References:
            https://unstructured-io.github.io/unstructured/
        """
        from unstructured.staging import (
            base,
            baseplate,
            datasaur,
            label_box,
            label_studio,
            prodigy,
            weaviate,
        )

        # The prodigy/datasaur stagers take positional secondary arguments,
        # so keyword-only kwargs are adapted through small lambdas.
        staging_functions: Any = {
            "convert_to_csv": base.convert_to_csv,
            "convert_to_dataframe": base.convert_to_dataframe,
            "convert_to_dict": base.convert_to_dict,
            "dict_to_elements": base.dict_to_elements,
            "stage_csv_for_prodigy": lambda els,
            **kw: prodigy.stage_csv_for_prodigy(els, kw.get('metadata', [])),
            "stage_for_prodigy": lambda els, **kw: prodigy.stage_for_prodigy(
                els, kw.get('metadata', [])
            ),
            "stage_for_baseplate": baseplate.stage_for_baseplate,
            "stage_for_datasaur": lambda els,
            **kw: datasaur.stage_for_datasaur(els, kw.get('entities', [])),
            "stage_for_label_box": lambda els,
            **kw: label_box.stage_for_label_box(els, **kw),
            "stage_for_label_studio": lambda els,
            **kw: label_studio.stage_for_label_studio(els, **kw),
            "stage_for_weaviate": weaviate.stage_for_weaviate,
        }

        if stage_type not in staging_functions:
            raise ValueError(f"Unsupported stage type: {stage_type}")

        return staging_functions[stage_type](elements, **kwargs)

    @staticmethod
    def chunk_elements(
        elements: List["Element"], chunk_type: str, **kwargs
    ) -> List["Element"]:
        r"""Chunks elements by titles.

        Args:
            elements (List[Element]): List of Element objects to be chunked.
            chunk_type (str): Type chunk going to apply. Supported types:
                'chunk_by_title'.
            **kwargs: Additional keyword arguments for chunking.

        Returns:
            List[Element]: List of chunked sections.

        Raises:
            ValueError: If `chunk_type` is not a supported chunking
                strategy.

        References:
            https://unstructured-io.github.io/unstructured/
        """
        from unstructured.chunking.title import chunk_by_title

        chunking_functions = {
            "chunk_by_title": chunk_by_title,
        }

        if chunk_type not in chunking_functions:
            raise ValueError(f"Unsupported chunk type: {chunk_type}")

        return chunking_functions[chunk_type](elements, **kwargs)
|