"""
PDF processing module for extracting and chunking text from PDF documents.
"""

import logging
from io import BytesIO
from typing import List, Tuple

import PyPDF2

from app.config import MAX_CHUNK_SIZE, OVERLAP_SIZE

logger = logging.getLogger('pdf')


class PDFProcessor:
    """Handles PDF document processing and text chunking."""

    @staticmethod
    def extract_text(pdf_file: BytesIO) -> str:
        """Extract the text content of every page of a PDF.

        Args:
            pdf_file: Binary file-like object containing the PDF data.

        Returns:
            The text of all pages, each page followed by a newline.

        Raises:
            Exception: Any error raised while reading the PDF is logged and
                re-raised unchanged.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # ``or ""`` guards against a page yielding None instead of a
            # string, which would otherwise raise TypeError on concatenation.
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
            # Single join instead of repeated ``+=``; every page keeps its
            # trailing newline, exactly as before.
            text = "".join(f"{page_text}\n" for page_text in page_texts)
            logger.info(
                "Successfully extracted text from PDF (%d characters)", len(text)
            )
            return text
        except Exception as e:
            logger.error("Error extracting text from PDF: %s", e)
            raise

    @staticmethod
    def create_chunks(
        text: str,
        chunk_size: int = MAX_CHUNK_SIZE,
        overlap: int = OVERLAP_SIZE,
    ) -> List[Tuple[str, dict]]:
        """Split text into overlapping chunks with metadata.

        Each chunk is at most ``chunk_size`` characters long. When a chunk is
        not the last one, it is shortened to end just after the last period or
        newline inside the window, provided that break point lies beyond the
        first ``overlap`` characters of the window. The next chunk then starts
        ``overlap`` characters before the end of the current one.

        Args:
            text: The text to split.
            chunk_size: Maximum number of characters per chunk; must be > 0.
            overlap: Number of characters shared between consecutive chunks;
                must be >= 0. If it is not smaller than the current chunk,
                the next chunk starts where the current one ends (no overlap)
                so that the loop always advances.

        Returns:
            A list of ``(chunk_text, metadata)`` tuples. ``chunk_text`` is the
            slice with surrounding whitespace stripped; empty chunks are
            skipped. ``metadata`` holds ``start_char`` and ``end_char`` (slice
            offsets into ``text``, before stripping) and ``chunk_size`` (the
            length of the stripped chunk).

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                negative. The error is logged and re-raised.
        """
        try:
            if chunk_size <= 0:
                raise ValueError("chunk_size must be a positive integer")
            if overlap < 0:
                raise ValueError("overlap must not be negative")

            chunks = []
            text_len = len(text)
            start = 0

            while start < text_len:
                # Clamp so the final chunk's ``end_char`` never points past
                # the end of the text.
                end = min(start + chunk_size, text_len)

                if end < text_len:
                    # Only accept break points beyond the overlap region (and
                    # beyond ``start`` itself). A break point inside the
                    # overlap would put the next start at or before the
                    # current one, and the loop would never terminate.
                    search_from = start + max(overlap, 1)
                    last_period = text.rfind('.', search_from, end)
                    last_newline = text.rfind('\n', search_from, end)
                    break_point = max(last_period, last_newline)
                    if break_point != -1:
                        end = break_point + 1

                chunk_text = text[start:end].strip()
                if chunk_text:  # Only add non-empty chunks
                    metadata = {
                        "start_char": start,
                        "end_char": end,
                        "chunk_size": len(chunk_text),
                    }
                    chunks.append((chunk_text, metadata))

                if end >= text_len:
                    break

                # Step back by ``overlap``, but never to or before the current
                # start (possible when overlap >= chunk_size); in that case
                # continue from ``end`` without overlap.
                next_start = end - overlap
                start = next_start if next_start > start else end

            logger.info("Created %d chunks from text", len(chunks))
            return chunks
        except Exception as e:
            logger.error("Error creating chunks: %s", e)
            raise

    @staticmethod
    def clean_text(text: str) -> str:
        """Clean and normalize extracted text.

        NUL characters are removed and every run of whitespace (spaces, tabs,
        ``\\r``, ``\\n``) is collapsed to a single space, so the result is a
        single line with no leading or trailing whitespace.

        Args:
            text: Raw text to clean.

        Returns:
            The cleaned text.

        Raises:
            Exception: Any error raised while cleaning is logged and re-raised
                unchanged.
        """
        try:
            # Remove NUL characters first: str.split() does not treat them as
            # whitespace, so removing them after the collapse would leave
            # doubled spaces behind (e.g. "a \x00 b" -> "a  b").
            text = text.replace('\x00', '')

            # Collapse all whitespace runs into single spaces. This also
            # covers "\r\n" and "\n", so no separate newline step is needed.
            text = ' '.join(text.split())

            logger.info("Text cleaned successfully")
            return text
        except Exception as e:
            logger.error("Error cleaning text: %s", e)
            raise

    def process_pdf(self, pdf_file: BytesIO) -> List[Tuple[str, dict]]:
        """Process a PDF file and return its text as chunks with metadata.

        Runs ``extract_text``, ``clean_text`` and ``create_chunks`` (with its
        default chunk size and overlap) in sequence.

        Args:
            pdf_file: Binary file-like object containing the PDF data.

        Returns:
            The list of ``(chunk_text, metadata)`` tuples produced by
            ``create_chunks``.

        Raises:
            Exception: Any error from the individual steps is logged and
                re-raised unchanged.
        """
        try:
            raw_text = self.extract_text(pdf_file)
            cleaned_text = self.clean_text(raw_text)
            chunks = self.create_chunks(cleaned_text)

            logger.info("PDF processed successfully: %d chunks created", len(chunks))
            return chunks
        except Exception as e:
            logger.error("Error processing PDF: %s", e)
            raise