File size: 3,979 Bytes
d16e9aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
PDF processing module for extracting and chunking text from PDF documents.
"""
import logging
from typing import List, Tuple
import PyPDF2
from io import BytesIO

from app.config import MAX_CHUNK_SIZE, OVERLAP_SIZE

# Module-wide logger shared by everything in this file; it is registered
# under the fixed name 'pdf' rather than the module's import path.
logger = logging.getLogger(name='pdf')

class PDFProcessor:
    """Handles PDF document processing and text chunking.

    :meth:`process_pdf` runs the full pipeline
    ``extract_text`` -> ``clean_text`` -> ``create_chunks``.  The three
    stages are static methods and can also be called on their own.
    """

    @staticmethod
    def extract_text(pdf_file: BytesIO) -> str:
        """Extract text content from a PDF file.

        Args:
            pdf_file: Binary stream holding the PDF document; it is handed
                to ``PyPDF2.PdfReader`` unchanged.

        Returns:
            The text of every page in document order, each page followed
            by a single newline.

        Raises:
            Exception: Any error raised while reading the PDF is logged
                and re-raised unchanged.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file)

            # Defensive: a page that yields no text (None or "") is treated
            # as empty so the concatenation cannot fail with a TypeError.
            # Building the result with one join also avoids growing a
            # string page by page.
            text = "".join(
                (page.extract_text() or "") + "\n"
                for page in pdf_reader.pages
            )

            logger.info(f"Successfully extracted text from PDF ({len(text)} characters)")
            return text

        except Exception as e:
            logger.error(f"Error extracting text from PDF: {str(e)}")
            raise

    @staticmethod
    def create_chunks(text: str, chunk_size: int = MAX_CHUNK_SIZE,
                     overlap: int = OVERLAP_SIZE) -> List[Tuple[str, dict]]:
        """Split text into overlapping chunks with metadata.

        Each window spans at most ``chunk_size`` characters.  When the
        window does not reach the end of the text it is shortened to end
        just after the last ``'.'`` or newline inside it, provided that
        doing so still lets the next window start further along.  The
        next window begins ``overlap`` characters before the end of the
        current one.

        Args:
            text: The text to split.
            chunk_size: Maximum number of characters per window; must be
                positive.
            overlap: Number of characters shared between consecutive
                windows.  If honouring it would stop the scan from moving
                forward, the next window starts at the end of the current
                one instead.

        Returns:
            A list of ``(chunk_text, metadata)`` tuples.  ``chunk_text`` is
            the window with surrounding whitespace stripped; windows that
            are empty after stripping are omitted.  ``metadata`` holds
            ``start_char`` / ``end_char`` (offsets of the unstripped window
            in ``text``, end exclusive and never beyond ``len(text)``) and
            ``chunk_size`` (length of the stripped chunk).

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        try:
            if chunk_size <= 0:
                # A non-positive window can never consume any text.
                raise ValueError("chunk_size must be a positive integer")

            chunks = []
            text_length = len(text)
            start = 0

            while start < text_length:
                # Clamp so the reported end offset never exceeds the text.
                end = min(start + chunk_size, text_length)

                # If we're not at the end of the text, try to find a good break point
                if end < text_length:
                    last_period = text.rfind('.', start, end)
                    last_newline = text.rfind('\n', start, end)
                    break_point = max(last_period, last_newline)

                    # Only accept a break point that keeps the next window
                    # strictly ahead of this one.  A break inside the first
                    # `overlap` characters (typically the period that ended
                    # the previous chunk) would otherwise make the scan
                    # stall or move backwards.
                    if break_point > start and break_point + 1 - overlap > start:
                        end = break_point + 1

                # Create chunk with metadata
                chunk_text = text[start:end].strip()
                if chunk_text:  # Only add non-empty chunks
                    metadata = {
                        "start_char": start,
                        "end_char": end,
                        "chunk_size": len(chunk_text)
                    }
                    chunks.append((chunk_text, metadata))

                if end >= text_length:
                    break

                # Move the start position, accounting for overlap.  When the
                # overlap is at least as large as the window, fall back to
                # no overlap so that progress is guaranteed.
                next_start = end - overlap
                if next_start <= start:
                    next_start = end
                start = next_start

            logger.info(f"Created {len(chunks)} chunks from text")
            return chunks

        except Exception as e:
            logger.error(f"Error creating chunks: {str(e)}")
            raise

    @staticmethod
    def clean_text(text: str) -> str:
        """Clean and normalize extracted text.

        NUL characters are removed first, then every run of whitespace
        (spaces, tabs, ``\\r``, ``\\n`` ...) is collapsed to a single space
        and leading/trailing whitespace is dropped.  The result therefore
        contains no line breaks.

        Args:
            text: Raw text, e.g. as returned by :meth:`extract_text`.

        Returns:
            The cleaned, single-line text.

        Raises:
            Exception: Any error raised while cleaning is logged and
                re-raised unchanged.
        """
        try:
            # Strip NULs before collapsing whitespace: a NUL standing
            # between two spaces would otherwise leave a double space.
            text = text.replace('\x00', '')

            # split() without arguments splits on any whitespace run, so
            # this also takes care of '\r\n' and other newline variants.
            text = ' '.join(text.split())

            logger.info("Text cleaned successfully")
            return text

        except Exception as e:
            logger.error(f"Error cleaning text: {str(e)}")
            raise

    def process_pdf(self, pdf_file: BytesIO) -> List[Tuple[str, dict]]:
        """Process PDF file and return chunks with metadata.

        Args:
            pdf_file: Binary stream holding the PDF document.

        Returns:
            The ``(chunk_text, metadata)`` tuples produced by
            :meth:`create_chunks` (called with its default chunk size and
            overlap) for the cleaned text of the document.

        Raises:
            Exception: Any error from extraction, cleaning or chunking is
                logged and re-raised unchanged.
        """
        try:
            # Extract text from PDF
            raw_text = self.extract_text(pdf_file)

            # Clean the extracted text
            cleaned_text = self.clean_text(raw_text)

            # Create chunks
            chunks = self.create_chunks(cleaned_text)

            logger.info(f"PDF processed successfully: {len(chunks)} chunks created")
            return chunks

        except Exception as e:
            logger.error(f"Error processing PDF: {str(e)}")
            raise