Spaces:
Running
Running
# pdf_reader.py | |
import PyPDF2 | |
from typing import List | |
class PDFReader: | |
def __init__(self): | |
self.page_list = [] | |
def read_pdf(self, file_path: str) -> List[str]: | |
""" | |
Read PDF content and return list of pages | |
Each element in the list is the text content of a page | |
""" | |
try: | |
# Open and read the PDF file | |
with open(file_path, 'rb') as file: | |
pdf_reader = PyPDF2.PdfReader(file) | |
num_pages = len(pdf_reader.pages) | |
# Extract text from each page | |
self.page_list = [] | |
for page_num in range(num_pages): | |
page = pdf_reader.pages[page_num] | |
text = page.extract_text() | |
if text: # Only add non-empty pages | |
self.page_list.append(text.strip()) | |
return self.page_list | |
except Exception as e: | |
raise Exception(f"Error reading PDF: {str(e)}") |