# kap2403
# "added files"
# 5e433de
# This module loads converted document data (JSON and Markdown files) into LangChain Document objects.
import re
import os
import json
import docling
from langchain_core.documents import Document
from typing import List, Dict, Any, Optional, Tuple
import logging
logging.basicConfig(level=logging.INFO)
#============================
# data loader from json and md files
#============================
def load_json_file(file_path: str) -> dict:
    """
    Load a JSON file and return its parsed content.

    The file is opened explicitly as UTF-8 so that non-ASCII text is decoded
    the same way on every platform instead of depending on the locale's
    default encoding.

    Args:
        file_path (str): Path to the JSON file.

    Returns:
        dict: The parsed JSON data. Whatever the top-level JSON value is gets
            returned as-is, so a file whose top level is an array yields a list.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
def load_md_file(file_path: str) -> str:
    """
    Read a Markdown file and hand back its full text.

    The file is decoded as UTF-8.

    Args:
        file_path (str): Location of the Markdown file on disk.

    Returns:
        str: Entire content of the Markdown file.
    """
    with open(file_path, mode='r', encoding='utf-8') as md_handle:
        return md_handle.read()
def data_preprocess(folder_path: str) -> dict:
    """
    Load data from a folder containing JSON files and a Markdown file.

    The function reads the following files from ``folder_path``:
        - tables.json
        - images.json
        - text.json
        - chunks.json
        - {base_folder_name}-with-images.md

    ``base_folder_name`` is the last component of ``folder_path``. The path is
    normalized before the name is extracted, so a trailing separator
    (``"docs/report/"``) or ``"."`` resolves to the actual folder name.

    Args:
        folder_path (str): Path to the folder containing the JSON and Markdown files.

    Returns:
        dict: A dictionary with the keys ``"tables"``, ``"images"``, ``"text"``
            and ``"chunks"`` (parsed JSON content), ``"images_folder"`` (path of
            the ``{base_folder_name}-with-images_artifacts`` folder; the path is
            only built, its existence is not checked) and ``"markdown"`` (the
            Markdown file content as a string).

    Raises:
        FileNotFoundError: If any of the expected files is missing.
    """
    # os.path.basename("a/b/") is "", which would produce "-with-images.md";
    # abspath strips trailing separators and resolves "." first.
    base_folder_name = os.path.basename(os.path.abspath(folder_path))
    images_folder_path = os.path.join(folder_path, f"{base_folder_name}-with-images_artifacts")
    md_file_path = os.path.join(folder_path, f"{base_folder_name}-with-images.md")

    # Load JSON contents (same order as the keys of the returned dict)
    loaded = {
        name: load_json_file(os.path.join(folder_path, f"{name}.json"))
        for name in ("tables", "images", "text", "chunks")
    }
    loaded["images_folder"] = images_folder_path
    # Load Markdown content
    loaded["markdown"] = load_md_file(md_file_path)
    return loaded
def load_json_data_documents(converted_document: dict, data_type: str) -> List[Document]:
    """
    Build Document objects from one section of a converted document.

    Each entry found under ``converted_document[data_type]`` is turned into a
    Document whose ``page_content`` is the entry's ``"content"`` value and whose
    ``metadata`` is the entry's ``"metadata"`` value.

    Args:
        converted_document (dict): The converted document containing data.
        data_type (str): The type of data to load (e.g., "tables", "images", "text", "chunks").

    Returns:
        List[Document]: One Document per entry, in the original order. The list
            is empty when the section has no entries.

    Raises:
        KeyError: If ``data_type`` is not a key of ``converted_document`` or an
            entry lacks the ``"content"`` or ``"metadata"`` key.
    """
    return [
        Document(page_content=entry["content"], metadata=entry["metadata"])
        for entry in converted_document[data_type]
    ]
#============================
# Dataloader for all the data in a folder
# whose subfolders contain the JSON files,
# the Markdown file and the images
#============================
def dataloader(folder_path: str) -> Tuple[list, list, list, list]:
    """
    Load the data of every document folder found inside ``folder_path``.

    Each sub-folder is passed to ``data_preprocess`` and its "chunks",
    "images", "tables" and "text" sections are converted to Document objects
    and accumulated across all sub-folders. Entries of ``folder_path`` that are
    not directories (stray files) are skipped.

    Args:
        folder_path (str): Folder path containing all folders with JSON files and
            Markdown files.

    Returns:
        Tuple[list, list, list, list]: list of chunks, list of pictures, list of tables,
            and list of text of overall data.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist or a sub-folder is
            missing one of the files ``data_preprocess`` expects.
    """
    chunks_list = []
    pictures_list = []
    tables_list = []
    text_list = []

    logging.info(f"Loading data from folder: {folder_path}")
    # os.listdir returns entries in arbitrary order; sorting keeps the order of
    # the returned documents reproducible between runs.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        # Only directories hold a converted document; a plain file here would
        # make data_preprocess fail when it looks for tables.json inside it.
        if not os.path.isdir(file_path):
            logging.info(f"Skipping non-directory entry: {file_name}")
            continue

        logging.info(f"Processing file: {file_name}")
        # load the data
        dict_data = data_preprocess(file_path)
        chunks_data = load_json_data_documents(dict_data, "chunks")
        pictures_data = load_json_data_documents(dict_data, "images")
        tables_data = load_json_data_documents(dict_data, "tables")
        text_data = load_json_data_documents(dict_data, "text")

        # adding the data to the list
        chunks_list.extend(chunks_data)
        pictures_list.extend(pictures_data)
        tables_list.extend(tables_data)
        text_list.extend(text_data)
        logging.info(f"Loaded {len(chunks_data)} chunks, {len(pictures_data)} pictures, "
                     f"{len(tables_data)} tables, and {len(text_data)} text documents from {file_name}")

    return chunks_list, pictures_list, tables_list, text_list
if __name__ == "__main__":
    # Script entry point: load every converted document below the sample
    # dataset folder and unpack the four resulting document lists.
    folder_path = "dataset/converted_json_docs"
    chunks, pictures, tables, text = dataloader(folder_path)