"""
Text chunking module for Norwegian RAG chatbot.
Splits documents into manageable chunks for embedding and retrieval.
"""

import re
import unicodedata
from typing import List, Optional, Tuple

from ..api.config import CHUNK_SIZE, CHUNK_OVERLAP

class TextChunker:
    """
    Splits documents into manageable chunks for embedding and retrieval.
    Supports different chunking strategies optimized for Norwegian text.

    All sizes and overlaps are measured in characters of the emitted chunk,
    including the separators used to join paragraphs or sentences.
    """

    # Blank-line paragraph separator (a newline, optional whitespace, a newline).
    _PARAGRAPH_BREAK = re.compile(r'\n\s*\n')
    # Whitespace that follows '.', '!' or '?' and precedes an uppercase letter
    # (A-Z plus the Norwegian Æ, Ø, Å).
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+(?=[A-ZÆØÅ])')
    # Any run of whitespace characters.
    _WHITESPACE_RUN = re.compile(r'\s+')

    @staticmethod
    def chunk_text(
        text: str,
        chunk_size: int = CHUNK_SIZE,
        chunk_overlap: int = CHUNK_OVERLAP,
        strategy: str = "paragraph"
    ) -> List[str]:
        """
        Split text into chunks using the specified strategy.

        Args:
            text: Text to split into chunks
            chunk_size: Maximum size of each chunk
            chunk_overlap: Overlap between consecutive chunks
            strategy: Chunking strategy ('fixed', 'paragraph', or 'sentence')

        Returns:
            List of text chunks (empty if ``text`` is empty)

        Raises:
            ValueError: If ``strategy`` is not one of the supported names.
        """
        # Empty input short-circuits before the strategy name is checked.
        if not text:
            return []

        if strategy == "fixed":
            return TextChunker.fixed_size_chunks(text, chunk_size, chunk_overlap)
        if strategy == "paragraph":
            return TextChunker.paragraph_chunks(text, chunk_size, chunk_overlap)
        if strategy == "sentence":
            return TextChunker.sentence_chunks(text, chunk_size, chunk_overlap)
        raise ValueError(f"Unknown chunking strategy: {strategy}")

    @staticmethod
    def fixed_size_chunks(
        text: str,
        chunk_size: int = CHUNK_SIZE,
        chunk_overlap: int = CHUNK_OVERLAP
    ) -> List[str]:
        """
        Split text into fixed-size chunks with overlap.

        Every chunk after the first is shortened to end just after the last
        space in its window when such a space exists beyond the overlap
        region. The first chunk is always cut at exactly ``chunk_size``
        characters. Chunks are stripped of surrounding whitespace, and
        chunks that are empty after stripping are dropped.

        Args:
            text: Text to split into chunks
            chunk_size: Maximum size of each chunk
            chunk_overlap: Overlap between consecutive chunks

        Returns:
            List of text chunks

        Raises:
            ValueError: If ``text`` is non-empty and ``chunk_size`` is not
                positive, or ``chunk_overlap`` is negative or not smaller
                than ``chunk_size`` (the scan could never advance).
        """
        if not text:
            return []

        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap} "
                f"with chunk_size {chunk_size}"
            )

        chunks: List[str] = []
        text_length = len(text)
        start = 0

        while start < text_length:
            end = min(start + chunk_size, text_length)

            # If this is not the first chunk and we're not at the end,
            # try to find a good breaking point (a space).
            if start > 0 and end < text_length:
                # Only accept a space at or beyond the overlap region:
                # breaking earlier would make `end - chunk_overlap <= start`,
                # so the next window would not move forward.
                break_at = text.rfind(' ', start + chunk_overlap, end)
                if break_at != -1:
                    end = break_at + 1  # Include the space

            piece = text[start:end].strip()
            if piece:
                chunks.append(piece)

            if end >= text_length:
                break

            # Step back by the overlap; guaranteed to be > start (see above).
            start = end - chunk_overlap

        return chunks

    @staticmethod
    def _overlap_tail(
        units: List[str],
        separator_length: int,
        budget: int
    ) -> Tuple[List[str], int]:
        """
        Pick the longest run of trailing units whose joined length fits budget.

        Args:
            units: Units of the chunk that was just emitted, in order
            separator_length: Length of the separator placed between units
            budget: Maximum joined length (units plus separators) to keep

        Returns:
            Tuple of (kept units in original order, their joined length)
        """
        tail: List[str] = []
        tail_size = 0

        for unit in reversed(units):
            candidate_size = tail_size + len(unit)
            if tail:
                candidate_size += separator_length
            if candidate_size > budget:
                break
            tail.append(unit)
            tail_size = candidate_size

        # Units were collected back-to-front.
        tail.reverse()
        return tail, tail_size

    @staticmethod
    def _pack_units(
        units: List[str],
        separator: str,
        max_chunk_size: int,
        chunk_overlap: int
    ) -> List[str]:
        """
        Greedily pack units (paragraphs or sentences) into chunks.

        Units are joined with ``separator``. A chunk is closed when the next
        unit would push its joined length past ``max_chunk_size``; the next
        chunk then starts with trailing units of the closed one, limited to
        ``chunk_overlap`` characters and to the room left beside the new unit.
        A unit that is itself longer than ``max_chunk_size`` is split with
        ``fixed_size_chunks``.

        Args:
            units: Non-empty text units in document order
            separator: String used to join units inside a chunk
            max_chunk_size: Maximum size of each chunk
            chunk_overlap: Overlap between consecutive chunks

        Returns:
            List of text chunks
        """
        separator_length = len(separator)
        chunks: List[str] = []
        current: List[str] = []
        current_size = 0  # Always equals len(separator.join(current))

        for unit in units:
            unit_size = len(unit)

            if unit_size > max_chunk_size:
                # Flush pending content exactly once, then split the
                # oversized unit on its own. No overlap is carried over here,
                # so the flushed tail is never emitted a second time.
                if current:
                    chunks.append(separator.join(current))
                    current = []
                    current_size = 0
                chunks.extend(
                    TextChunker.fixed_size_chunks(unit, max_chunk_size, chunk_overlap)
                )
                continue

            if current and current_size + separator_length + unit_size > max_chunk_size:
                chunks.append(separator.join(current))
                # Cap the carried-over overlap so that overlap + separator +
                # unit still fits within max_chunk_size.
                budget = min(
                    chunk_overlap,
                    max_chunk_size - separator_length - unit_size
                )
                current, current_size = TextChunker._overlap_tail(
                    current, separator_length, budget
                )

            if current:
                current_size += separator_length
            current.append(unit)
            current_size += unit_size

        # Add the last chunk if it's not empty
        if current:
            chunks.append(separator.join(current))

        return chunks

    @staticmethod
    def paragraph_chunks(
        text: str,
        max_chunk_size: int = CHUNK_SIZE,
        chunk_overlap: int = CHUNK_OVERLAP
    ) -> List[str]:
        """
        Split text into chunks based on paragraphs.

        Paragraphs are separated by blank lines and re-joined inside a chunk
        with a blank line. Paragraphs longer than ``max_chunk_size`` are
        split with ``fixed_size_chunks``.

        Args:
            text: Text to split into chunks
            max_chunk_size: Maximum size of each chunk
            chunk_overlap: Overlap between consecutive chunks

        Returns:
            List of text chunks

        Raises:
            ValueError: Propagated from ``fixed_size_chunks`` when an
                oversized paragraph must be split with invalid sizes.
        """
        if not text:
            return []

        paragraphs = [
            stripped
            for stripped in (p.strip() for p in TextChunker._PARAGRAPH_BREAK.split(text))
            if stripped
        ]
        return TextChunker._pack_units(paragraphs, '\n\n', max_chunk_size, chunk_overlap)

    @staticmethod
    def sentence_chunks(
        text: str,
        max_chunk_size: int = CHUNK_SIZE,
        chunk_overlap: int = CHUNK_OVERLAP
    ) -> List[str]:
        """
        Split text into chunks based on sentences.

        A sentence boundary is whitespace that follows '.', '!' or '?' and
        is followed by an uppercase letter (A-Z, Æ, Ø, Å). Sentences are
        re-joined inside a chunk with a single space. Sentences longer than
        ``max_chunk_size`` are split with ``fixed_size_chunks``.

        Args:
            text: Text to split into chunks
            max_chunk_size: Maximum size of each chunk
            chunk_overlap: Overlap between consecutive chunks

        Returns:
            List of text chunks

        Raises:
            ValueError: Propagated from ``fixed_size_chunks`` when an
                oversized sentence must be split with invalid sizes.
        """
        if not text:
            return []

        sentences = [
            stripped
            for stripped in (s.strip() for s in TextChunker._SENTENCE_BOUNDARY.split(text))
            if stripped
        ]
        return TextChunker._pack_units(sentences, ' ', max_chunk_size, chunk_overlap)

    @staticmethod
    def clean_chunk(chunk: str) -> str:
        """
        Clean a text chunk by removing excessive whitespace and normalizing.

        Args:
            chunk: Text chunk to clean

        Returns:
            Cleaned text chunk ("" for empty input)
        """
        if not chunk:
            return ""

        # Replace multiple whitespace with a single space
        cleaned = TextChunker._WHITESPACE_RUN.sub(' ', chunk)

        # Compose decomposed characters (e.g. 'a' + combining ring -> 'å')
        # so that æ, ø, å are represented consistently.
        cleaned = unicodedata.normalize('NFC', cleaned)

        return cleaned.strip()