File size: 5,339 Bytes
5e433de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# This module loads converted document data (JSON and Markdown files) into LangChain Document objects.

import re
import os
import json
import docling
from langchain_core.documents import Document
from typing import List, Dict, Any, Optional, Tuple
import logging

# Configure the root logger at import time with level INFO, so the
# logging.info progress messages emitted by this module are not filtered out.
logging.basicConfig(level=logging.INFO)

#============================
# data loader from json and md files
#============================

def load_json_file(file_path: str) -> dict:
    """
    Load a JSON file and return its parsed content.
    The function reads the file in UTF-8 encoding.

    Args:
        file_path (str): Path to the JSON file.

    Returns:
        dict: The parsed JSON data (whatever the top-level JSON value
        decodes to).

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which may differ (e.g. on Windows) and break non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def load_md_file(file_path: str) -> str:
    """
    Read a Markdown file and hand back its complete text.
    The file is opened in text mode and decoded as UTF-8.

    Args:
        file_path (str): Location of the Markdown file on disk.

    Returns:
        str: The entire file content.
    """
    with open(file_path, mode='r', encoding='utf-8') as md_handle:
        return md_handle.read()


def data_preprocess(folder_path: str) -> dict:
    """
    Load data from a folder containing JSON files and a Markdown file.
    The function reads the following files:
    - tables.json
    - images.json
    - text.json
    - chunks.json
    - {base_folder_name}-with-images.md

    ``base_folder_name`` is the last component of ``folder_path``; a trailing
    path separator on ``folder_path`` is ignored when deriving it.

    Args:
        folder_path (str): Path to the folder containing the JSON and Markdown files.

    Returns:
        dict: A dictionary with the keys "tables", "images", "text" and
        "chunks" (parsed JSON content), "images_folder" (path of the
        ``{base_folder_name}-with-images_artifacts`` folder; the path is only
        built, not checked for existence) and "markdown" (Markdown text).

    Raises:
        OSError: If any of the expected files cannot be opened.
        json.JSONDecodeError: If one of the JSON files is not valid JSON.
    """
    tables_path = os.path.join(folder_path, "tables.json")
    images_path = os.path.join(folder_path, "images.json")
    text_path = os.path.join(folder_path, "text.json")
    chunks_path = os.path.join(folder_path, "chunks.json")

    # Extract base folder name for md and images folder.
    # normpath drops a trailing separator first: basename("docs/report/")
    # would otherwise be "" and the derived file names would be wrong.
    base_folder_name = os.path.basename(os.path.normpath(folder_path))
    images_folder_path = os.path.join(folder_path, f"{base_folder_name}-with-images_artifacts")
    md_file_path = os.path.join(folder_path, f"{base_folder_name}-with-images.md")

    # Load JSON contents
    tables = load_json_file(tables_path)
    images = load_json_file(images_path)
    text = load_json_file(text_path)
    chunks = load_json_file(chunks_path)

    # Load Markdown content
    markdown = load_md_file(md_file_path)

    return {
        "tables": tables,
        "images": images,
        "text": text,
        "chunks": chunks,
        "images_folder": images_folder_path,
        "markdown": markdown,
    }


def load_json_data_documents(converted_document: dict, data_type: str) -> List[Document]:
    """
    Build Document objects from one section of a converted document.
    This function takes a converted document and a data type (e.g., "tables", "images", "text", "chunks")
    and returns a list of Document objects, one per entry of that section,
    in the same order as the entries.

    Args:
        converted_document (dict): The converted document containing data.
        data_type (str): The type of data to load (e.g., "tables", "images", "text", "chunks").

    Returns:
        List[Document]: One Document per entry, with the entry's "content" as
        ``page_content`` and its "metadata" as ``metadata``.

    Raises:
        KeyError: If ``data_type`` is not a key of ``converted_document``, or
        (for dict entries) an entry has no "content" or "metadata" key.
    """
    return [
        Document(page_content=entry["content"], metadata=entry["metadata"])
        for entry in converted_document[data_type]
    ]



#============================
#  dataloader for all the data
#  from the folder
#  containing json and md files
#  and images
#============================


def dataloader(folder_path: str) -> Tuple[list, list, list, list]:
    """
    Load the data of every converted document found under a root folder.

    Each sub-directory of ``folder_path`` is handed to ``data_preprocess``
    and its "chunks", "images", "tables" and "text" sections are converted
    to Document objects with ``load_json_data_documents``. Entries of
    ``folder_path`` that are not directories are skipped. Sub-directories
    are processed in sorted name order so the result order is deterministic.

    Args:
        folder_path (str): Folder path containing all folders with JSON files and
        Markdown files.

    Returns:
        Tuple[list, list, list, list]: list of chunks, list of pictures, list of tables,
        and list of text of overall data.

    Raises:
        OSError: If ``folder_path`` cannot be listed. Errors raised while
        loading an individual sub-directory are propagated unchanged.
    """

    chunks_list = []
    pictures_list = []
    tables_list = []
    text_list = []

    logging.info(f"Loading data from folder: {folder_path}")
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)

        # Only directories can hold a converted document; a stray regular
        # file would otherwise make data_preprocess fail.
        if not os.path.isdir(file_path):
            logging.info(f"Skipping non-directory entry: {file_name}")
            continue

        logging.info(f"Processing file: {file_name}")

        # load the data
        dict_data = data_preprocess(file_path)
        chunks_data = load_json_data_documents(dict_data, "chunks")
        pictures_data = load_json_data_documents(dict_data, "images")
        tables_data = load_json_data_documents(dict_data, "tables")
        text_data = load_json_data_documents(dict_data, "text")

        # adding the data to the list
        chunks_list.extend(chunks_data)
        pictures_list.extend(pictures_data)
        tables_list.extend(tables_data)
        text_list.extend(text_data)
        logging.info(f"Loaded {len(chunks_data)} chunks, {len(pictures_data)} pictures, "
                     f"{len(tables_data)} tables, and {len(text_data)} text documents from {file_name}")

    return chunks_list, pictures_list, tables_list, text_list



if __name__ == "__main__":
    # Demo run: load every converted document below the default dataset folder.
    chunks, pictures, tables, text = dataloader("dataset/converted_json_docs")