"""
Text chunking module for Norwegian RAG chatbot.

Splits documents into manageable chunks for embedding and retrieval.
"""

import re
import unicodedata
from typing import List, Optional, Tuple

from ..api.config import CHUNK_SIZE, CHUNK_OVERLAP


class TextChunker:
    """
    Splits documents into manageable chunks for embedding and retrieval.

    Three strategies are available: fixed-size windows, paragraph packing and
    sentence packing. The sentence splitter recognises the Norwegian capital
    letters Æ, Ø and Å as possible sentence starts.
    """

    # Blank line (optionally containing whitespace) between paragraphs.
    _PARAGRAPH_BOUNDARY = re.compile(r'\n\s*\n')

    # Whitespace that follows '.', '!' or '?' and precedes a capital letter.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+(?=[A-ZÆØÅ])')

    # Any run of whitespace characters (used by clean_chunk).
    _WHITESPACE_RUN = re.compile(r'\s+')

    @staticmethod
    def chunk_text(
        text: str,
        chunk_size: int = CHUNK_SIZE,
        chunk_overlap: int = CHUNK_OVERLAP,
        strategy: str = "paragraph"
    ) -> List[str]:
        """
        Split text into chunks using the specified strategy.

        Args:
            text: Text to split into chunks
            chunk_size: Maximum size of each chunk, in characters
            chunk_overlap: Overlap between consecutive chunks, in characters
            strategy: Chunking strategy ('fixed', 'paragraph', or 'sentence')

        Returns:
            List of text chunks; empty if ``text`` is empty

        Raises:
            ValueError: If ``strategy`` is unknown, or if the size/overlap
                combination is invalid (see ``fixed_size_chunks``)
        """
        if not text:
            return []

        if strategy == "fixed":
            return TextChunker.fixed_size_chunks(text, chunk_size, chunk_overlap)
        elif strategy == "paragraph":
            return TextChunker.paragraph_chunks(text, chunk_size, chunk_overlap)
        elif strategy == "sentence":
            return TextChunker.sentence_chunks(text, chunk_size, chunk_overlap)
        else:
            raise ValueError(f"Unknown chunking strategy: {strategy}")

    @staticmethod
    def _validate_sizes(chunk_size: int, chunk_overlap: int) -> None:
        """Reject size/overlap combinations that cannot make forward progress."""
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if chunk_overlap < 0:
            raise ValueError(
                f"chunk_overlap must not be negative, got {chunk_overlap}"
            )
        if chunk_overlap >= chunk_size:
            raise ValueError(
                f"chunk_overlap ({chunk_overlap}) must be smaller than "
                f"chunk_size ({chunk_size})"
            )

    @staticmethod
    def fixed_size_chunks(
        text: str,
        chunk_size: int = CHUNK_SIZE,
        chunk_overlap: int = CHUNK_OVERLAP
    ) -> List[str]:
        """
        Split text into fixed-size chunks with overlap.

        Every chunk after the first is shortened to end just after the last
        space inside its window, when such a space exists, so that words are
        not cut in half. The first chunk is always cut at exactly
        ``chunk_size`` characters.

        Args:
            text: Text to split into chunks
            chunk_size: Maximum size of each chunk, in characters
            chunk_overlap: Number of characters shared by consecutive chunks

        Returns:
            List of stripped, non-empty text chunks

        Raises:
            ValueError: If ``chunk_size`` is not positive, or ``chunk_overlap``
                is negative or not smaller than ``chunk_size``
        """
        if not text:
            return []

        TextChunker._validate_sizes(chunk_size, chunk_overlap)

        chunks = []
        start = 0
        text_length = len(text)

        while start < text_length:
            end = min(start + chunk_size, text_length)

            if start > 0 and end < text_length:
                # Only accept a break point past the overlap window. The next
                # start is ``end - chunk_overlap``; a break inside the overlap
                # window would put it at or before the current start and the
                # loop would never terminate.
                last_space = text.rfind(' ', start + chunk_overlap, end)
                if last_space != -1:
                    end = last_space + 1  # Include the space

            chunk = text[start:end].strip()
            if chunk:  # A window made only of whitespace carries no content
                chunks.append(chunk)

            if end >= text_length:
                break

            # Step back by the overlap so consecutive chunks share context
            start = end - chunk_overlap

        return chunks

    @staticmethod
    def _pack_units(
        units: List[str],
        separator: str,
        max_chunk_size: int,
        chunk_overlap: int
    ) -> List[str]:
        """Greedily join units into chunks no longer than ``max_chunk_size``."""
        sep_len = len(separator)
        chunks = []
        current = []      # Units of the chunk under construction
        current_size = 0  # Length of separator.join(current)

        for unit in units:
            unit_size = len(unit)

            if unit_size > max_chunk_size:
                # Flush pending content first. This happens before any overlap
                # is computed, so retained overlap is never emitted on its own
                # as a duplicate chunk.
                if current:
                    chunks.append(separator.join(current))
                    current = []
                    current_size = 0

                # A single unit that is too large is split into windows
                chunks.extend(
                    TextChunker.fixed_size_chunks(
                        unit, max_chunk_size, chunk_overlap
                    )
                )
                continue

            if current and current_size + sep_len + unit_size > max_chunk_size:
                chunks.append(separator.join(current))

                # Carry trailing units over as overlap, but never more than
                # still fits next to the incoming unit.
                limit = min(chunk_overlap, max_chunk_size - unit_size - sep_len)
                tail = []
                tail_size = 0
                for previous in reversed(current):
                    grown = tail_size + len(previous) + (sep_len if tail else 0)
                    if grown > limit:
                        break
                    tail.append(previous)
                    tail_size = grown
                tail.reverse()

                current = tail
                current_size = tail_size

            # Account for the separator that join() inserts before this unit
            current_size += unit_size + (sep_len if current else 0)
            current.append(unit)

        if current:
            chunks.append(separator.join(current))

        return chunks

    @staticmethod
    def paragraph_chunks(
        text: str,
        max_chunk_size: int = CHUNK_SIZE,
        chunk_overlap: int = CHUNK_OVERLAP
    ) -> List[str]:
        """
        Split text into chunks based on paragraphs.

        Paragraphs are separated by blank lines. Consecutive paragraphs are
        joined with a blank line until the next one would push the chunk past
        ``max_chunk_size``. A paragraph longer than ``max_chunk_size`` is
        split with ``fixed_size_chunks``.

        Args:
            text: Text to split into chunks
            max_chunk_size: Maximum size of each chunk, in characters
            chunk_overlap: Maximum amount of trailing text, in characters,
                repeated at the start of the next chunk

        Returns:
            List of text chunks

        Raises:
            ValueError: If ``max_chunk_size`` is not positive, or
                ``chunk_overlap`` is negative or not smaller than
                ``max_chunk_size``
        """
        if not text:
            return []

        TextChunker._validate_sizes(max_chunk_size, chunk_overlap)

        stripped = (p.strip() for p in TextChunker._PARAGRAPH_BOUNDARY.split(text))
        paragraphs = [p for p in stripped if p]

        return TextChunker._pack_units(
            paragraphs, '\n\n', max_chunk_size, chunk_overlap
        )

    @staticmethod
    def sentence_chunks(
        text: str,
        max_chunk_size: int = CHUNK_SIZE,
        chunk_overlap: int = CHUNK_OVERLAP
    ) -> List[str]:
        """
        Split text into chunks based on sentences.

        A sentence boundary is whitespace that follows '.', '!' or '?' and
        precedes a capital letter (A-Z, Æ, Ø, Å). Consecutive sentences are
        joined with a single space until the next one would push the chunk
        past ``max_chunk_size``. A sentence longer than ``max_chunk_size`` is
        split with ``fixed_size_chunks``.

        Args:
            text: Text to split into chunks
            max_chunk_size: Maximum size of each chunk, in characters
            chunk_overlap: Maximum amount of trailing text, in characters,
                repeated at the start of the next chunk

        Returns:
            List of text chunks

        Raises:
            ValueError: If ``max_chunk_size`` is not positive, or
                ``chunk_overlap`` is negative or not smaller than
                ``max_chunk_size``
        """
        if not text:
            return []

        TextChunker._validate_sizes(max_chunk_size, chunk_overlap)

        stripped = (s.strip() for s in TextChunker._SENTENCE_BOUNDARY.split(text))
        sentences = [s for s in stripped if s]

        return TextChunker._pack_units(
            sentences, ' ', max_chunk_size, chunk_overlap
        )

    @staticmethod
    def clean_chunk(chunk: str) -> str:
        """
        Clean a text chunk by removing excessive whitespace and normalizing.

        Args:
            chunk: Text chunk to clean

        Returns:
            Cleaned text chunk, or an empty string if ``chunk`` is empty
        """
        if not chunk:
            return ""

        # NFC composes base letter + combining mark pairs (for example 'a'
        # followed by U+030A becomes 'å'), so visually identical text always
        # has the same code points.
        normalized = unicodedata.normalize('NFC', chunk)

        # Replace multiple whitespace with a single space
        collapsed = TextChunker._WHITESPACE_RUN.sub(' ', normalized)

        return collapsed.strip()