Spaces:
Running
Running
File size: 5,339 Bytes
5e433de |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
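# This module loads converted document data (JSON and Markdown files) and wraps it in LangChain Document objects.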
import re
import os
import json
import docling
from langchain_core.documents import Document
from typing import List, Dict, Any, Optional, Tuple
import logging
logging.basicConfig(level=logging.INFO)
#============================
# Data loaders for JSON and Markdown files
#============================
def load_json_file(file_path: str) -> Any:
    """
    Load a JSON file and return its parsed content.

    The file is read as UTF-8 (the encoding JSON text is exchanged in, and
    the same one ``load_md_file`` uses) so the result does not depend on the
    platform's default locale encoding.

    Args:
        file_path (str): Path to the JSON file.

    Returns:
        Any: The parsed JSON value. This is whatever the file's top-level
        value is (a dict for a JSON object, a list for a JSON array, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
def load_md_file(file_path: str) -> str:
    """
    Read a Markdown file and hand back its full text.

    The file is decoded as UTF-8.

    Args:
        file_path (str): Location of the Markdown file on disk.

    Returns:
        str: The entire content of the file.
    """
    with open(file_path, mode='r', encoding='utf-8') as md_handle:
        return md_handle.read()
def data_preprocess(folder_path: str) -> dict:
    """
    Load the converted data of one document folder.

    The function reads the following files from ``folder_path``:
        - tables.json
        - images.json
        - text.json
        - chunks.json
        - {base_folder_name}-with-images.md

    ``base_folder_name`` is the last component of ``folder_path``. A trailing
    path separator (e.g. ``"docs/report/"``) is ignored when deriving it.

    Args:
        folder_path (str): Path to the folder containing the JSON and Markdown files.

    Returns:
        dict: A dictionary with the keys ``"tables"``, ``"images"``, ``"text"``
        and ``"chunks"`` (parsed JSON content), ``"images_folder"`` (path of
        the ``{base_folder_name}-with-images_artifacts`` folder; the path is
        built but not checked for existence) and ``"markdown"`` (Markdown
        file content as a string).

    Raises:
        OSError: If one of the expected files cannot be opened.
        json.JSONDecodeError: If one of the JSON files is malformed.
    """
    # normpath drops a trailing separator; without it basename("a/b/") is ""
    # and the Markdown / artifacts names would be derived from an empty stem.
    base_folder_name = os.path.basename(os.path.normpath(folder_path))
    images_folder_path = os.path.join(folder_path, f"{base_folder_name}-with-images_artifacts")
    md_file_path = os.path.join(folder_path, f"{base_folder_name}-with-images.md")

    # Load JSON contents
    tables = load_json_file(os.path.join(folder_path, "tables.json"))
    images = load_json_file(os.path.join(folder_path, "images.json"))
    text = load_json_file(os.path.join(folder_path, "text.json"))
    chunks = load_json_file(os.path.join(folder_path, "chunks.json"))

    # Load Markdown content
    markdown = load_md_file(md_file_path)

    return {
        "tables": tables,
        "images": images,
        "text": text,
        "chunks": chunks,
        "images_folder": images_folder_path,
        "markdown": markdown,
    }
def load_json_data_documents(converted_document: dict, data_type: str) -> List[Document]:
    """
    Build Document objects from one section of a converted document.

    Each entry under ``converted_document[data_type]`` is turned into a
    ``Document`` whose ``page_content`` is the entry's ``"content"`` value and
    whose ``metadata`` is the entry's ``"metadata"`` value.

    Args:
        converted_document (dict): The converted document containing data.
        data_type (str): The type of data to load (e.g., "tables", "images", "text", "chunks").

    Returns:
        List[Document]: One Document per entry, in the order the entries appear.

    Raises:
        KeyError: If ``data_type`` is not present in ``converted_document`` or
            an entry lacks the ``"content"`` or ``"metadata"`` key.
    """
    return [
        Document(page_content=chunk["content"], metadata=chunk["metadata"])
        for chunk in converted_document[data_type]
    ]
#============================
# Dataloader for all the data in a folder
# whose subfolders contain the JSON files,
# the Markdown file,
# and the images
#============================
def dataloader(folder_path: str) -> Tuple[list, list, list, list]:
    """
    Load the data of every document folder found inside ``folder_path``.

    Each sub-directory of ``folder_path`` is passed to ``data_preprocess`` and
    its "chunks", "images", "tables" and "text" sections are converted to
    Document objects with ``load_json_data_documents``. Entries that are not
    directories (stray files such as ``.DS_Store``) are skipped, because
    ``data_preprocess`` can only read from a folder. Sub-directories are
    processed in sorted name order so the result is reproducible
    (``os.listdir`` itself returns entries in arbitrary order).

    Args:
        folder_path (str): Folder path containing all folders with JSON files and
            Markdown files.

    Returns:
        Tuple[list, list, list, list]: list of chunks, list of pictures, list of tables,
        and list of text of overall data.
    """
    chunks_list = []
    pictures_list = []
    tables_list = []
    text_list = []

    logging.info(f"Loading data from folder: {folder_path}")
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        if not os.path.isdir(file_path):
            # Not a document folder; data_preprocess would fail on it.
            logging.info(f"Skipping non-directory entry: {file_name}")
            continue
        logging.info(f"Processing file: {file_name}")

        # load the data
        dict_data = data_preprocess(file_path)
        chunks_data = load_json_data_documents(dict_data, "chunks")
        pictures_data = load_json_data_documents(dict_data, "images")
        tables_data = load_json_data_documents(dict_data, "tables")
        text_data = load_json_data_documents(dict_data, "text")

        # adding the data to the list
        chunks_list.extend(chunks_data)
        pictures_list.extend(pictures_data)
        tables_list.extend(tables_data)
        text_list.extend(text_data)
        logging.info(f"Loaded {len(chunks_data)} chunks, {len(pictures_data)} pictures, "
                     f"{len(tables_data)} tables, and {len(text_data)} text documents from {file_name}")

    return chunks_list, pictures_list, tables_list, text_list
if __name__ == "__main__":
    # Example usage: load every converted document under the dataset folder
    # and unpack the four resulting Document lists.
    folder_path = "dataset/converted_json_docs"
    chunks, pictures, tables, text = dataloader(folder_path=folder_path)
|