import hashlib
import logging

from langchain.text_splitter import RecursiveCharacterTextSplitter


def chunk_documents(page_list, doc_id, chunk_size=1000, chunk_overlap=200):
    """
    Chunk a list of page contents into smaller segments with document ID metadata.

    Args:
        page_list (list): List of strings, each string being the content of a page.
        doc_id (str): Unique identifier for the document.
        chunk_size (int): Maximum size of each chunk (default: 1000 characters).
        chunk_overlap (int): Overlap between chunks (default: 200 characters).

    Returns:
        list: List of dictionaries, each containing 'text', 'source', and 'doc_id'.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = []
    seen_hashes = set()  # Track hashes of chunks to avoid duplicates

    for page_num, page_content in enumerate(page_list, start=1):  # Start page numbering at 1
        if not page_content or not isinstance(page_content, str):
            continue  # Skip empty or invalid pages

        # Split the page content into chunks
        chunks = text_splitter.split_text(page_content)

        for i, chunk in enumerate(chunks):
            # Generate a unique hash for the chunk
            chunk_hash = hashlib.sha256(chunk.encode()).hexdigest()

            # Skip if the chunk is a duplicate
            if chunk_hash in seen_hashes:
                continue

            # Create source identifier (e.g., "doc_123_page_1_chunk_0")
            source = f"doc_{doc_id}_page_{page_num}_chunk_{i}"

            # Add the chunk with doc_id as metadata
            documents.append({
                'text': chunk,
                'source': source,
                'doc_id': doc_id
            })
            seen_hashes.add(chunk_hash)
logging.info(f"Chunking of documents is done. Chunked the document to {len(documents)} numbers of chunks") | |
return documents |
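

# A minimal usage sketch (not part of the original module): the sample pages,
# doc_id value, and chunk sizes below are hypothetical and only illustrate the
# expected input/output shape of chunk_documents().
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Two synthetic "pages", each long enough to be split into several chunks
    sample_pages = [
        " ".join(f"Sentence {n} of the first page." for n in range(60)),
        " ".join(f"Sentence {n} of the second page." for n in range(60)),
    ]

    chunks = chunk_documents(sample_pages, doc_id="123", chunk_size=500, chunk_overlap=50)

    # Each record carries the chunk text plus its 'source' and 'doc_id' metadata,
    # e.g. source == "doc_123_page_1_chunk_0"
    for record in chunks[:3]:
        print(record['source'], len(record['text']), record['doc_id'])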