Spaces:

chaaim123
/

finalproj01

Sleeping

App Files Files Community

chaaim123 commited on 19 days ago

Commit

4c0e6c8

verified ·

1 Parent(s): c0dc39c

Create retriever/chunk_documents.py

Browse files

Files changed (1) hide show

retriever/chunk_documents.py +49 -0

retriever/chunk_documents.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import logging
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+import hashlib
+def chunk_documents(page_list, doc_id, chunk_size=1000, chunk_overlap=200):
+    """
+    Chunk a list of page contents into smaller segments with document ID metadata.
+    Args:
+        page_list (list): List of strings, each string being the content of a page.
+        doc_id (str): Unique identifier for the document.
+        chunk_size (int): Maximum size of each chunk (default: 1000 characters).
+        chunk_overlap (int): Overlap between chunks (default: 200 characters).
+    Returns:
+        list: List of dictionaries, each containing 'text', 'source', and 'doc_id'.
+    """
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    documents = []
+    seen_hashes = set()  # Track hashes of chunks to avoid duplicates
+    for page_num, page_content in enumerate(page_list, start=1):  # Start page numbering at 1
+        if not page_content or not isinstance(page_content, str):
+            continue  # Skip empty or invalid pages
+        # Split the page content into chunks
+        chunks = text_splitter.split_text(page_content)
+        for i, chunk in enumerate(chunks):
+            # Generate a unique hash for the chunk
+            chunk_hash = hashlib.sha256(chunk.encode()).hexdigest()
+            # Skip if the chunk is a duplicate
+            if chunk_hash in seen_hashes:
+                continue
+            # Create source identifier (e.g., "doc_123_page_1_chunk_0")
+            source = f"doc_{doc_id}_page_{page_num}_chunk_{i}"
+            # Add the chunk with doc_id as metadata
+            documents.append({
+                'text': chunk,
+                'source': source,
+                'doc_id': doc_id
+            })
+            seen_hashes.add(chunk_hash)
+    logging.info(f"Chunking of documents is done. Chunked the document to {len(documents)} numbers of chunks")
+    return documents