#!/usr/bin/env python3
"""
PDFOCR - Module for processing PDF files with OCR and extracting structured data.
"""

import json
import sys
from pathlib import Path
from typing import List, Optional, Union

from structured_ocr import StructuredOCR

# Accepted spellings of a filesystem path in this module's public API.
PathLike = Union[str, Path]


class PDFOCR:
    """Class for processing PDF files with OCR and extracting structured data.

    All OCR work is delegated to a ``StructuredOCR`` instance created in
    ``__init__``; this class only validates the input path and optionally
    writes the result to disk as JSON.
    """

    def __init__(self, api_key: Optional[str] = None) -> None:
        """Initialize the PDF OCR processor.

        Args:
            api_key: Passed through unchanged to ``StructuredOCR``. How a
                value of ``None`` is handled is decided by ``StructuredOCR``.
        """
        self.processor = StructuredOCR(api_key=api_key)

    def process_pdf(self, pdf_path: PathLike, use_vision: bool = True):
        """Process a PDF file with OCR and extract structured data.

        Args:
            pdf_path: Path to the PDF file.
            use_vision: Whether to use vision model for improved analysis.
                Forwarded as-is to ``StructuredOCR.process_file``.

        Returns:
            Whatever ``StructuredOCR.process_file`` returns (documented by
            this module as a dictionary with structured OCR results).

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        pdf_path = Path(pdf_path)
        # Fail early with a clear message instead of letting the OCR backend
        # report a missing file in its own way.
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        return self.processor.process_file(
            pdf_path, file_type="pdf", use_vision=use_vision
        )

    def save_json_output(
        self,
        pdf_path: PathLike,
        output_path: PathLike,
        use_vision: bool = True,
    ) -> Path:
        """Process a PDF file and save the structured output as JSON.

        Args:
            pdf_path: Path to the PDF file.
            output_path: Path where to save the JSON output. Missing parent
                directories are created.
            use_vision: Whether to use vision model for improved analysis.

        Returns:
            Path to the saved JSON file.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        # Process first so that no output file or directory is created when
        # the input is missing or OCR fails.
        result = self.process_pdf(pdf_path, use_vision=use_vision)

        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding: do not depend on the platform's default locale
        # encoding when writing the JSON file.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2)

        return output_path


def main(argv: Optional[List[str]] = None) -> int:
    """Run the command-line entry point.

    Args:
        argv: Argument vector including the program name at index 0.
            Defaults to ``sys.argv``.

    Returns:
        Process exit status: 1 when the required PDF path is missing,
        0 otherwise.
    """
    args = sys.argv if argv is None else argv

    if len(args) < 2:
        # The PDF path is mandatory; only the output path is optional.
        print("Usage: python pdf_ocr.py <pdf_path> [output_path]")
        return 1

    pdf_path = args[1]
    output_path = args[2] if len(args) > 2 else None

    processor = PDFOCR()

    if output_path:
        result_path = processor.save_json_output(pdf_path, output_path)
        print(f"Results saved to: {result_path}")
    else:
        # No output path given: dump the structured result to stdout.
        result = processor.process_pdf(pdf_path)
        print(json.dumps(result, indent=2))

    return 0


# For testing directly
if __name__ == "__main__":
    sys.exit(main())