Spaces:
Running
Running
# This module loads converted document data (JSON and Markdown files) into LangChain Document objects.
import re | |
import os | |
import json | |
import docling | |
from langchain_core.documents import Document | |
from typing import List, Dict, Any, Optional, Tuple | |
import logging | |
logging.basicConfig(level=logging.INFO) | |
#============================ | |
# data loader from json and md files | |
#============================ | |
def load_json_file(file_path: str) -> dict:
    """
    Load a JSON file and return its parsed content.

    The file is always decoded as UTF-8 (the encoding JSON is exchanged in)
    instead of the platform default, so non-ASCII content loads the same way
    on every operating system and matches how load_md_file reads its input.

    Args:
        file_path (str): Path to the JSON file.
    Returns:
        dict: The parsed JSON data. The value is whatever the top-level JSON
        element is; no check is made that it is actually an object.
    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
def load_md_file(file_path: str) -> str:
    """
    Read a Markdown file and hand back its full text.

    The file is opened in text mode and decoded as UTF-8.

    Args:
        file_path (str): Location of the Markdown file on disk.
    Returns:
        str: The entire file content as a single string.
    """
    with open(file_path, encoding='utf-8') as md_handle:
        return md_handle.read()
def data_preprocess(folder_path: str) -> dict:
    """
    Load the converted data of one document folder.

    The function reads the following files from folder_path:
    - tables.json
    - images.json
    - text.json
    - chunks.json
    - {base_folder_name}-with-images.md

    where base_folder_name is the last component of folder_path. A trailing
    path separator on folder_path is tolerated.

    Args:
        folder_path (str): Path to the folder containing the JSON and Markdown files.
    Returns:
        dict: A dictionary with the keys "tables", "images", "text" and
        "chunks" (parsed JSON), "images_folder" (path of the
        {base_folder_name}-with-images_artifacts folder; only the path is
        built, its existence is not checked) and "markdown" (the Markdown
        file content as a string).
    Raises:
        OSError: If one of the expected files cannot be opened.
        json.JSONDecodeError: If one of the JSON files is malformed.
    """
    # os.path.basename() returns "" for a path that ends with a separator
    # (e.g. "docs/report/"), which would produce "-with-images.md".
    # Normalising first strips the trailing separator.
    base_folder_name = os.path.basename(os.path.normpath(folder_path))
    images_folder_path = os.path.join(folder_path, f"{base_folder_name}-with-images_artifacts")
    md_file_path = os.path.join(folder_path, f"{base_folder_name}-with-images.md")

    # Load JSON contents; each key doubles as the file stem.
    loaded = {
        name: load_json_file(os.path.join(folder_path, f"{name}.json"))
        for name in ("tables", "images", "text", "chunks")
    }
    loaded["images_folder"] = images_folder_path
    # Load Markdown content
    loaded["markdown"] = load_md_file(md_file_path)
    return loaded
def load_json_data_documents(converted_document: dict, data_type: str) -> List[Document]:
    """
    Build Document objects from one section of a converted document.

    Every entry found under converted_document[data_type] is turned into a
    Document whose page_content is the entry's "content" value and whose
    metadata is the entry's "metadata" value.

    Args:
        converted_document (dict): The converted document containing data.
        data_type (str): The type of data to load (e.g., "tables", "images", "text", "chunks").
    Returns:
        List[Document]: One Document per entry, in the original order. The
        list is empty when the section has no entries.
    Raises:
        KeyError: If data_type is not a key of converted_document, or an
            entry lacks the "content" or "metadata" key.
    """
    return [
        Document(page_content=entry["content"], metadata=entry["metadata"])
        for entry in converted_document[data_type]
    ]
#============================ | |
# dataloader for all the data | |
# from the folder | |
# containing json and md files | |
# and images | |
#============================ | |
def dataloader(folder_path: str) -> Tuple[list, list, list, list]:
    """
    Load the data of every converted document found under a folder.

    Each sub-folder of folder_path is passed to data_preprocess, and its
    "chunks", "images", "tables" and "text" sections are converted to
    Document objects with load_json_data_documents. Entries of folder_path
    that are not directories (stray files such as .DS_Store) are skipped,
    because data_preprocess can only read a folder. Sub-folders are visited
    in sorted name order so the result order is reproducible.

    Args:
        folder_path (str): Folder path containing all folders with JSON files and
            Markdown files.
    Returns:
        Tuple[list, list, list, list]: list of chunks, list of pictures, list of tables,
        and list of text of overall data.
    Raises:
        OSError: If folder_path cannot be listed or an expected file is missing
            in one of the sub-folders.
    """
    chunks_list = []
    pictures_list = []
    tables_list = []
    text_list = []
    logging.info(f"Loading data from folder: {folder_path}")
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        if not os.path.isdir(file_path):
            logging.info(f"Skipping non-directory entry: {file_name}")
            continue
        logging.info(f"Processing file: {file_name}")
        # load the data
        dict_data = data_preprocess(file_path)
        chunks_data = load_json_data_documents(dict_data, "chunks")
        pictures_data = load_json_data_documents(dict_data, "images")
        tables_data = load_json_data_documents(dict_data, "tables")
        text_data = load_json_data_documents(dict_data, "text")
        # adding the data to the list
        chunks_list.extend(chunks_data)
        pictures_list.extend(pictures_data)
        tables_list.extend(tables_data)
        text_list.extend(text_data)
        logging.info(f"Loaded {len(chunks_data)} chunks, {len(pictures_data)} pictures, "
                     f"{len(tables_data)} tables, and {len(text_data)} text documents from {file_name}")
    return chunks_list, pictures_list, tables_list, text_list
if __name__ == "__main__":
    # Example usage: load every converted document under the dataset folder
    # when this file is executed as a script.
    folder_path = "dataset/converted_json_docs"
    (chunks, pictures, tables, text) = dataloader(folder_path=folder_path)