import sys import os import subprocess import shutil from pathlib import Path import urllib.request # Get the current directory current_dir = os.path.dirname(os.path.abspath(__file__)) # Run setup.sh at startup try: setup_script = os.path.join(current_dir, "setup.sh") if os.path.exists(setup_script): print("Running setup.sh...") subprocess.run(["bash", setup_script], check=False) print("setup.sh completed") except Exception as e: print(f"Error running setup.sh: {e}") # Try to load environment variables from .env file try: from dotenv import load_dotenv load_dotenv() print("Loaded environment variables from .env file") except ImportError: print("python-dotenv not installed, skipping .env file loading") # Function to setup Tesseract def setup_tesseract(): """Setup Tesseract OCR environment.""" print("Setting up Tesseract OCR environment...") # Create tessdata directory if it doesn't exist tessdata_dir = os.path.join(current_dir, "tessdata") os.makedirs(tessdata_dir, exist_ok=True) # Set TESSDATA_PREFIX environment variable if not already set if not os.environ.get('TESSDATA_PREFIX'): # Check multiple possible locations possible_tessdata_dirs = [ tessdata_dir, # Our local tessdata directory "/usr/share/tesseract-ocr/4.00/tessdata", # Common location in Hugging Face "/usr/share/tesseract-ocr/tessdata", # Another common location "/usr/local/share/tessdata", # Standard installation location ] # Use the first directory that exists for dir_path in possible_tessdata_dirs: if os.path.exists(dir_path): os.environ['TESSDATA_PREFIX'] = dir_path print(f"Set TESSDATA_PREFIX to {dir_path}") break else: # If none exist, use our local directory os.environ['TESSDATA_PREFIX'] = tessdata_dir print(f"No existing tessdata directory found, set TESSDATA_PREFIX to {tessdata_dir}") # Download eng.traineddata if it doesn't exist in our local tessdata eng_traineddata = os.path.join(tessdata_dir, "eng.traineddata") if not os.path.exists(eng_traineddata): try: print("Downloading eng.traineddata...") url = "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata" urllib.request.urlretrieve(url, eng_traineddata) print("Downloaded eng.traineddata") except Exception as e: print(f"Error downloading eng.traineddata: {e}") # Configure pytesseract try: import pytesseract # Check if tesseract is in PATH tesseract_cmd = shutil.which("tesseract") if tesseract_cmd: pytesseract.pytesseract.tesseract_cmd = tesseract_cmd print(f"Set pytesseract.tesseract_cmd to {tesseract_cmd}") else: # Try common locations common_locations = [ "/usr/bin/tesseract", "/usr/local/bin/tesseract", "/app/tesseract/tesseract" ] for location in common_locations: if os.path.isfile(location) and os.access(location, os.X_OK): pytesseract.pytesseract.tesseract_cmd = location print(f"Set pytesseract.tesseract_cmd to {location}") break else: print("Warning: Could not find tesseract executable") except ImportError: print("pytesseract not installed") # Try to import tesserocr to verify it's working try: import tesserocr print(f"tesserocr imported successfully, version: {tesserocr.tesseract_version()}") except ImportError: print("tesserocr not installed or not working") except Exception as e: print(f"Error importing tesserocr: {e}") # Load Gemini API key from environment variable gemini_api_key = os.getenv("GOOGLE_API_KEY") # Check if API key is available and print a message if not if not gemini_api_key: print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser may not work.") else: print(f"Found Gemini API key: {gemini_api_key[:5]}...{gemini_api_key[-5:] if len(gemini_api_key) > 10 else ''}") # Add the current directory to the Python path sys.path.append(current_dir) # Try different import approaches try: # First attempt - standard import from src.main import main except ModuleNotFoundError: try: # Second attempt - adjust path and try again sys.path.append(os.path.join(current_dir, "src")) from main import main except ModuleNotFoundError: # Third attempt - create __init__.py if it doesn't exist init_path = os.path.join(current_dir, "src", "__init__.py") if not os.path.exists(init_path): with open(init_path, "w") as f: pass # Create empty __init__.py file # Try import again from src.main import main # Call setup function at import time setup_tesseract() if __name__ == "__main__": main()