Spaces:

tony-42069
/

cre-chatbot-rag

Sleeping

App Files Files Community

cre-chatbot-rag / src /pdf_processor.py

tony-42069

Add source code and test files

d16e9aa 5 months ago

raw

history blame contribute delete

3.98 kB

	"""
	PDF processing module for extracting and chunking text from PDF documents.
	"""
	import logging
	from typing import List, Tuple
	import PyPDF2
	from io import BytesIO

	from app.config import MAX_CHUNK_SIZE, OVERLAP_SIZE

	# Module-level logger used by the PDFProcessor methods below; it is registered
	# under the fixed name 'pdf' rather than under the module's import path.
	logger = logging.getLogger('pdf')

	class PDFProcessor:
	    """Handles PDF document processing and text chunking.

	    ``process_pdf`` is the full pipeline; it chains ``extract_text``,
	    ``clean_text`` and ``create_chunks``. The three helpers are static and
	    can also be called on their own.
	    """

	    @staticmethod
	    def extract_text(pdf_file: BytesIO) -> str:
	        """Extract the text content of every page of a PDF.

	        Args:
	            pdf_file: Binary stream holding the PDF document.

	        Returns:
	            The text of all pages in order, each page followed by a newline.

	        Raises:
	            Exception: Whatever the PDF reader raises; the error is logged
	                and re-raised unchanged.
	        """
	        try:
	            pdf_reader = PyPDF2.PdfReader(pdf_file)

	            # Defensive: a falsy (e.g. None) page result is treated as empty
	            # text so that one unreadable page cannot break concatenation.
	            # A single join avoids rebuilding the string once per page.
	            text = "".join(
	                (page.extract_text() or "") + "\n"
	                for page in pdf_reader.pages
	            )

	            logger.info(
	                "Successfully extracted text from PDF (%d characters)",
	                len(text),
	            )
	            return text

	        except Exception as e:
	            logger.error("Error extracting text from PDF: %s", e)
	            raise

	    @staticmethod
	    def create_chunks(text: str, chunk_size: int = MAX_CHUNK_SIZE,
	                      overlap: int = OVERLAP_SIZE) -> List[Tuple[str, dict]]:
	        """Split text into overlapping chunks with metadata.

	        Each chunk covers at most ``chunk_size`` characters. When more text
	        follows, the chunk is shortened to end just after the last ``.`` or
	        newline inside the window, provided that cut still lets the next
	        chunk start further along than the current one. The next chunk
	        starts ``overlap`` characters before the end of the current one.

	        Args:
	            text: The text to split.
	            chunk_size: Maximum number of characters per chunk window; must
	                be positive.
	            overlap: Number of characters at the end of a chunk that are
	                repeated at the start of the next chunk.

	        Returns:
	            A list of ``(chunk_text, metadata)`` tuples. ``chunk_text`` is
	            the stripped slice; empty slices are skipped. ``metadata`` holds
	            ``start_char`` and ``end_char`` (offsets of the unstripped slice
	            within ``text``) and ``chunk_size`` (length of ``chunk_text``).

	        Raises:
	            ValueError: If ``chunk_size`` is not positive (the loop could
	                never advance). The error is logged before being re-raised.
	        """
	        try:
	            if chunk_size <= 0:
	                raise ValueError("chunk_size must be a positive integer")

	            chunks = []
	            text_length = len(text)
	            start = 0

	            while start < text_length:
	                # Clamp so that end_char never points past the text.
	                end = min(start + chunk_size, text_length)

	                # More text follows: prefer ending on a sentence/line break.
	                if end < text_length:
	                    break_point = max(
	                        text.rfind('.', start, end),
	                        text.rfind('\n', start, end),
	                    )
	                    # A break inside the overlap region would make the next
	                    # start land on or before the current one (stalling or
	                    # walking backwards), so such a break is ignored and the
	                    # window is cut at its full size instead.
	                    if break_point > start and break_point + 1 - overlap > start:
	                        end = break_point + 1

	                chunk_text = text[start:end].strip()
	                if chunk_text:  # Only add non-empty chunks
	                    chunks.append((chunk_text, {
	                        "start_char": start,
	                        "end_char": end,
	                        "chunk_size": len(chunk_text),
	                    }))

	                if end >= text_length:
	                    break

	                # Step back by the overlap, but always move forward: when
	                # overlap >= chunk_size, continue right after this chunk.
	                next_start = end - overlap
	                start = next_start if next_start > start else end

	            logger.info("Created %d chunks from text", len(chunks))
	            return chunks

	        except Exception as e:
	            logger.error("Error creating chunks: %s", e)
	            raise

	    @staticmethod
	    def clean_text(text: str) -> str:
	        """Clean and normalize extracted text.

	        NUL characters are removed and every run of whitespace (spaces,
	        tabs, ``\\r``, ``\\n``) is collapsed into a single space; leading
	        and trailing whitespace is dropped. The result contains no newlines.

	        Args:
	            text: Raw text to clean.

	        Returns:
	            The cleaned, single-line text.

	        Raises:
	            Exception: Any error is logged and re-raised unchanged.
	        """
	        try:
	            # Strip NULs before collapsing whitespace: a NUL standing alone
	            # between spaces would otherwise leave a doubled (or leading)
	            # space behind once it is removed.
	            text = text.replace('\x00', '')

	            # split() with no argument breaks on any whitespace run, which
	            # already covers "\r\n" and "\n"; no separate newline pass needed.
	            text = ' '.join(text.split())

	            logger.info("Text cleaned successfully")
	            return text

	        except Exception as e:
	            logger.error("Error cleaning text: %s", e)
	            raise

	    def process_pdf(self, pdf_file: BytesIO) -> List[Tuple[str, dict]]:
	        """Process PDF file and return chunks with metadata.

	        Runs ``extract_text``, then ``clean_text``, then ``create_chunks``
	        with its default ``chunk_size`` and ``overlap``.

	        Args:
	            pdf_file: Binary stream holding the PDF document.

	        Returns:
	            The ``(chunk_text, metadata)`` tuples produced by
	            ``create_chunks``.

	        Raises:
	            Exception: Any error from the three steps is logged here as
	                well and re-raised unchanged.
	        """
	        try:
	            raw_text = self.extract_text(pdf_file)
	            cleaned_text = self.clean_text(raw_text)
	            chunks = self.create_chunks(cleaned_text)

	            logger.info(
	                "PDF processed successfully: %d chunks created", len(chunks)
	            )
	            return chunks

	        except Exception as e:
	            logger.error("Error processing PDF: %s", e)
	            raise