chaaim123 committed
Commit 4c0e6c8 · verified · 1 Parent(s): c0dc39c

Create retriever/chunk_documents.py

Files changed (1)
  1. retriever/chunk_documents.py +49 -0
retriever/chunk_documents.py ADDED
@@ -0,0 +1,49 @@
+ import logging
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ import hashlib
+
+ def chunk_documents(page_list, doc_id, chunk_size=1000, chunk_overlap=200):
+     """
+     Chunk a list of page contents into smaller segments with document ID metadata.
+
+     Args:
+         page_list (list): List of strings, each string being the content of one page.
+         doc_id (str): Unique identifier for the document.
+         chunk_size (int): Maximum size of each chunk in characters (default: 1000).
+         chunk_overlap (int): Overlap between consecutive chunks in characters (default: 200).
+
+     Returns:
+         list: List of dictionaries, each containing 'text', 'source', and 'doc_id'.
+     """
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     documents = []
+     seen_hashes = set()  # Track hashes of chunks to avoid duplicates
+
+     for page_num, page_content in enumerate(page_list, start=1):  # Start page numbering at 1
+         if not page_content or not isinstance(page_content, str):
+             continue  # Skip empty or invalid pages
+
+         # Split the page content into chunks
+         chunks = text_splitter.split_text(page_content)
+
+         for i, chunk in enumerate(chunks):
+             # Generate a unique hash of the chunk content
+             chunk_hash = hashlib.sha256(chunk.encode()).hexdigest()
+
+             # Skip if the chunk is a duplicate
+             if chunk_hash in seen_hashes:
+                 continue
+
+             # Create a source identifier (e.g., "doc_123_page_1_chunk_0")
+             source = f"doc_{doc_id}_page_{page_num}_chunk_{i}"
+
+             # Add the chunk with doc_id as metadata
+             documents.append({
+                 'text': chunk,
+                 'source': source,
+                 'doc_id': doc_id
+             })
+             seen_hashes.add(chunk_hash)
+
+     logging.info(f"Chunking complete: document {doc_id} split into {len(documents)} unique chunks.")
+     return documents
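
For reference, a minimal usage sketch of the function added in this commit. It is not part of the commit itself; the sample pages and the doc_id value are hypothetical, and logging is configured only so the final info message is visible. Each sentence is distinct so no chunks are dropped by the deduplication step.

import logging
from retriever.chunk_documents import chunk_documents

logging.basicConfig(level=logging.INFO)

# Two hypothetical pages, each long enough to yield several chunks at chunk_size=500.
pages = [
    " ".join(f"Sentence {i} of page one." for i in range(60)),
    " ".join(f"Sentence {i} of page two." for i in range(60)),
]

chunks = chunk_documents(pages, doc_id="123", chunk_size=500, chunk_overlap=50)
for c in chunks[:3]:
    print(c['source'], len(c['text']))  # e.g. "doc_123_page_1_chunk_0" and the chunk length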