"""
PDF processing module for extracting and chunking text from PDF documents.
"""
import logging
from typing import List, Tuple
import PyPDF2
from io import BytesIO
from app.config import MAX_CHUNK_SIZE, OVERLAP_SIZE
logger = logging.getLogger('pdf')


class PDFProcessor:
    """Handles PDF document processing and text chunking."""

    @staticmethod
    def extract_text(pdf_file: BytesIO) -> str:
        """Extract text content from a PDF file."""
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"
            logger.info(f"Successfully extracted text from PDF ({len(text)} characters)")
            return text
        except Exception as e:
            logger.error(f"Error extracting text from PDF: {str(e)}")
            raise

    @staticmethod
    def create_chunks(text: str, chunk_size: int = MAX_CHUNK_SIZE,
                      overlap: int = OVERLAP_SIZE) -> List[Tuple[str, dict]]:
        """Split text into overlapping chunks with metadata."""
        try:
            chunks = []
            start = 0
            while start < len(text):
                # Find the end of the chunk
                end = start + chunk_size
                # If we're not at the end of the text, try to find a good break point
                if end < len(text):
                    # Try to find the last period or newline in the chunk
                    last_period = text.rfind('.', start, end)
                    last_newline = text.rfind('\n', start, end)
                    break_point = max(last_period, last_newline)
                    if break_point > start:
                        end = break_point + 1
                # Create chunk with metadata
                chunk_text = text[start:end].strip()
                if chunk_text:  # Only add non-empty chunks
                    metadata = {
                        "start_char": start,
                        "end_char": end,
                        "chunk_size": len(chunk_text)
                    }
                    chunks.append((chunk_text, metadata))
                # Move the start position, accounting for overlap; always advance
                # by at least one character so a short chunk (smaller than the
                # overlap) cannot cause an infinite loop
                start = max(end - overlap, start + 1) if end < len(text) else len(text)
            logger.info(f"Created {len(chunks)} chunks from text")
            return chunks
        except Exception as e:
            logger.error(f"Error creating chunks: {str(e)}")
            raise

    @staticmethod
    def clean_text(text: str) -> str:
        """Clean and normalize extracted text."""
        try:
            # Normalize newlines first so later steps see consistent line breaks
            text = text.replace('\r\n', '\n')
            # Remove special characters that might cause issues
            text = text.replace('\x00', '')
            # Collapse runs of spaces and tabs within each line, keeping the
            # newlines so create_chunks can still break on them
            text = '\n'.join(' '.join(line.split()) for line in text.split('\n'))
            logger.info("Text cleaned successfully")
            return text
        except Exception as e:
            logger.error(f"Error cleaning text: {str(e)}")
            raise

    def process_pdf(self, pdf_file: BytesIO) -> List[Tuple[str, dict]]:
        """Process PDF file and return chunks with metadata."""
        try:
            # Extract text from PDF
            raw_text = self.extract_text(pdf_file)
            # Clean the extracted text
            cleaned_text = self.clean_text(raw_text)
            # Create chunks
            chunks = self.create_chunks(cleaned_text)
            logger.info(f"PDF processed successfully: {len(chunks)} chunks created")
            return chunks
        except Exception as e:
            logger.error(f"Error processing PDF: {str(e)}")
            raise
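

if __name__ == "__main__":
    # Minimal usage sketch: assumes a local file named "sample.pdf" exists
    # (hypothetical path) and that app.config provides MAX_CHUNK_SIZE and
    # OVERLAP_SIZE. Reads the PDF into memory, runs the full extract -> clean
    # -> chunk pipeline, and prints each chunk's metadata with a text preview.
    logging.basicConfig(level=logging.INFO)
    processor = PDFProcessor()
    with open("sample.pdf", "rb") as f:
        pdf_bytes = BytesIO(f.read())
    for chunk_text, metadata in processor.process_pdf(pdf_bytes):
        print(metadata, chunk_text[:80])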