File size: 5,184 Bytes
4cfe522
 
6179695
 
 
 
4cfe522
67baccc
 
 
f89b538
 
67baccc
 
 
 
 
f89b538
 
 
06351e4
 
 
 
 
 
 
 
919e7d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05abb4e
06351e4
 
 
 
 
 
 
 
 
27722f3
 
 
b1d8341
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4cfe522
6179695
 
 
4cfe522
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import sys
import os
import subprocess
import shutil
from pathlib import Path
import urllib.request

# Directory containing this file; setup.sh is looked up relative to it.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Run setup.sh at startup. This is best effort: a failure is reported but
# never aborts startup.
setup_script = os.path.join(current_dir, "setup.sh")
if os.path.exists(setup_script):
    print("Running setup.sh...")
    try:
        # check=False: a non-zero exit must not raise; it is inspected below.
        setup_result = subprocess.run(["bash", setup_script], check=False)
    except Exception as e:
        # Raised when the process cannot be started at all (e.g. no bash).
        print(f"Error running setup.sh: {e}")
    else:
        # Only claim success when the script actually succeeded; previously
        # "completed" was printed regardless of the exit status.
        if setup_result.returncode == 0:
            print("setup.sh completed")
        else:
            print(f"setup.sh exited with code {setup_result.returncode}")

# Optionally pull environment variables from a .env file. The outcome is
# recorded first and reported once, after the attempt.
try:
    from dotenv import load_dotenv
    load_dotenv()
    _dotenv_status = "Loaded environment variables from .env file"
except ImportError:
    # python-dotenv is an optional dependency; carry on without it.
    _dotenv_status = "python-dotenv not installed, skipping .env file loading"
print(_dotenv_status)

# Tesseract setup: private helpers followed by the public entry point.
def _download_eng_traineddata(destination):
    """Download eng.traineddata to *destination*; return True on success.

    The data is written to a temporary ``.part`` sibling and moved into place
    only once the transfer has finished, so an interrupted download never
    leaves a truncated ``eng.traineddata`` that a later run would mistake for
    a complete file. Failures are printed and swallowed (best effort).
    """
    url = "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
    partial_path = destination + ".part"
    try:
        print("Downloading eng.traineddata...")
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, destination)
        print("Downloaded eng.traineddata")
        return True
    except Exception as e:
        print(f"Error downloading eng.traineddata: {e}")
        # Remove the incomplete file; it may not exist if the request failed
        # before anything was written, which is fine.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return False


def _configure_tessdata_prefix(local_tessdata_dir):
    """Point TESSDATA_PREFIX at the first directory that holds eng.traineddata.

    The local directory is preferred. It is created unconditionally by the
    caller, so testing for mere directory existence would always select it
    and make the system locations unreachable; the presence of the language
    file is checked instead. If no candidate has the file, the local
    directory is used as a last resort.
    """
    possible_tessdata_dirs = [
        local_tessdata_dir,  # Our local tessdata directory
        "/usr/share/tesseract-ocr/4.00/tessdata",  # Common location in Hugging Face
        "/usr/share/tesseract-ocr/tessdata",  # Another common location
        "/usr/local/share/tessdata",  # Standard installation location
    ]

    for dir_path in possible_tessdata_dirs:
        if os.path.isfile(os.path.join(dir_path, "eng.traineddata")):
            os.environ['TESSDATA_PREFIX'] = dir_path
            print(f"Set TESSDATA_PREFIX to {dir_path}")
            return

    os.environ['TESSDATA_PREFIX'] = local_tessdata_dir
    print(
        "No tessdata directory containing eng.traineddata found, "
        f"set TESSDATA_PREFIX to {local_tessdata_dir}"
    )


def _configure_pytesseract():
    """Tell pytesseract where the tesseract executable is, if both exist."""
    try:
        import pytesseract
    except ImportError:
        print("pytesseract not installed")
        return

    # Prefer whatever is on PATH.
    tesseract_cmd = shutil.which("tesseract")
    if tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
        print(f"Set pytesseract.tesseract_cmd to {tesseract_cmd}")
        return

    # Otherwise probe a few fixed locations for an executable file.
    common_locations = [
        "/usr/bin/tesseract",
        "/usr/local/bin/tesseract",
        "/app/tesseract/tesseract",
    ]
    for location in common_locations:
        if os.path.isfile(location) and os.access(location, os.X_OK):
            pytesseract.pytesseract.tesseract_cmd = location
            print(f"Set pytesseract.tesseract_cmd to {location}")
            return

    print("Warning: Could not find tesseract executable")


def _report_tesserocr():
    """Print whether tesserocr can be imported and the version it reports."""
    try:
        import tesserocr
        print(f"tesserocr imported successfully, version: {tesserocr.tesseract_version()}")
    except ImportError:
        print("tesserocr not installed or not working")
    except Exception as e:
        print(f"Error importing tesserocr: {e}")


def setup_tesseract():
    """Setup Tesseract OCR environment.

    Steps, all best effort (problems are printed, never raised, except for a
    failure to create the local ``tessdata`` directory):

    1. Create ``<current_dir>/tessdata`` and download ``eng.traineddata``
       into it when it is missing.
    2. If ``TESSDATA_PREFIX`` is unset or empty, set it to the first known
       directory that actually contains ``eng.traineddata``.
    3. Configure ``pytesseract`` with the path of the tesseract executable.
    4. Report whether ``tesserocr`` is importable.

    Returns:
        None.
    """
    print("Setting up Tesseract OCR environment...")

    # Create tessdata directory if it doesn't exist
    tessdata_dir = os.path.join(current_dir, "tessdata")
    os.makedirs(tessdata_dir, exist_ok=True)

    # Download first so the prefix selection below can see the result.
    eng_traineddata = os.path.join(tessdata_dir, "eng.traineddata")
    if not os.path.exists(eng_traineddata):
        _download_eng_traineddata(eng_traineddata)

    # Respect a TESSDATA_PREFIX that was already provided by the environment.
    if not os.environ.get('TESSDATA_PREFIX'):
        _configure_tessdata_prefix(tessdata_dir)

    _configure_pytesseract()
    _report_tesserocr()

# Load Gemini API key from environment variable
gemini_api_key = os.getenv("GOOGLE_API_KEY")

# Report whether a key is configured. No part of the secret is printed:
# the previous message echoed its first and last characters to stdout.
if gemini_api_key:
    print("Found Gemini API key (value hidden)")
else:
    print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser may not work.")

# Make modules located next to this file importable.
sys.path.append(current_dir)

# Resolve the application's `main` callable, falling back through
# progressively more invasive strategies when the module is not found.
try:
    # Strategy 1: import it from the `src` package.
    from src.main import main
except ModuleNotFoundError:
    src_dir = os.path.join(current_dir, "src")
    try:
        # Strategy 2: put src/ itself on the path and import `main` directly.
        sys.path.append(src_dir)
        from main import main
    except ModuleNotFoundError:
        # Strategy 3: make sure src/ has an (empty) __init__.py, then retry
        # the package import. A failure here propagates to the caller.
        package_marker = os.path.join(src_dir, "__init__.py")
        if not os.path.exists(package_marker):
            open(package_marker, "w").close()
        from src.main import main

# Configure Tesseract on import, not only when run as a script.
# NOTE(review): this runs after the application module has been imported; if
# that module initialises OCR at import time it would not yet see the
# settings applied here — confirm the ordering is intended.
setup_tesseract()

if __name__ == "__main__":
    main()