Spaces:
Sleeping
Sleeping
File size: 5,184 Bytes
4cfe522 6179695 4cfe522 67baccc f89b538 67baccc f89b538 06351e4 919e7d0 05abb4e 06351e4 27722f3 b1d8341 4cfe522 6179695 4cfe522 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import sys
import os
import subprocess
import shutil
from pathlib import Path
import urllib.request
# Directory that contains this script; bundled resources (setup.sh, tessdata,
# src) are resolved relative to it.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Run setup.sh at startup. This is best effort: any problem is printed and
# startup continues.
setup_script = os.path.join(current_dir, "setup.sh")
if os.path.exists(setup_script):
    print("Running setup.sh...")
    try:
        # check=False: a non-zero exit is reported below instead of raising.
        setup_result = subprocess.run(["bash", setup_script], check=False)
    except Exception as e:
        # Reached when the process cannot be started at all (e.g. no bash).
        print(f"Error running setup.sh: {e}")
    else:
        if setup_result.returncode == 0:
            print("setup.sh completed")
        else:
            # Do not claim success when the script itself failed.
            print(f"setup.sh exited with status {setup_result.returncode}")
# Try to load environment variables from a .env file. python-dotenv is an
# optional dependency, so only the import itself is guarded.
try:
    from dotenv import load_dotenv
except ImportError:
    print("python-dotenv not installed, skipping .env file loading")
else:
    # load_dotenv() returns False when it loaded nothing (no .env file, or an
    # empty one), so only report success when something was actually loaded.
    if load_dotenv():
        print("Loaded environment variables from .env file")
    else:
        print("No variables loaded from .env file (file missing or empty)")
def _download_eng_traineddata(tessdata_dir):
    """Download eng.traineddata into *tessdata_dir* unless it is already there.

    The file is fetched to a temporary ``.part`` path and renamed into place
    only after the download finished, so an interrupted transfer never leaves
    a truncated ``eng.traineddata`` that later runs would treat as valid.
    Failures are printed, not raised.
    """
    eng_traineddata = os.path.join(tessdata_dir, "eng.traineddata")
    if os.path.exists(eng_traineddata):
        return

    partial_path = eng_traineddata + ".part"
    try:
        print("Downloading eng.traineddata...")
        url = "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, eng_traineddata)
        print("Downloaded eng.traineddata")
    except Exception as e:
        print(f"Error downloading eng.traineddata: {e}")
        # Best-effort cleanup of the incomplete file; it may not exist at all.
        try:
            os.remove(partial_path)
        except OSError:
            pass


def _find_tessdata_dir(candidates):
    """Return the first directory in *candidates* holding eng.traineddata, or None."""
    for dir_path in candidates:
        if os.path.isfile(os.path.join(dir_path, "eng.traineddata")):
            return dir_path
    return None


def _configure_pytesseract():
    """Point pytesseract at the tesseract executable, if both are available."""
    try:
        import pytesseract
    except ImportError:
        print("pytesseract not installed")
        return

    # Check if tesseract is in PATH
    tesseract_cmd = shutil.which("tesseract")
    if not tesseract_cmd:
        # Try common locations
        common_locations = (
            "/usr/bin/tesseract",
            "/usr/local/bin/tesseract",
            "/app/tesseract/tesseract",
        )
        tesseract_cmd = next(
            (
                location
                for location in common_locations
                if os.path.isfile(location) and os.access(location, os.X_OK)
            ),
            None,
        )

    if tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
        print(f"Set pytesseract.tesseract_cmd to {tesseract_cmd}")
    else:
        print("Warning: Could not find tesseract executable")


# Function to setup Tesseract
def setup_tesseract():
    """Setup Tesseract OCR environment.

    Performs the following steps; download and import problems are printed
    rather than raised:

    1. Create ``<current_dir>/tessdata`` and download ``eng.traineddata`` into
       it when the file is missing.
    2. If ``TESSDATA_PREFIX`` is unset or empty, set it to the first known
       tessdata directory that actually contains ``eng.traineddata`` (the
       local directory is tried first), falling back to the local directory.
    3. Set ``pytesseract.pytesseract.tesseract_cmd`` when pytesseract is
       installed and a tesseract executable can be located.
    4. Import tesserocr and print the version it reports.
    """
    print("Setting up Tesseract OCR environment...")

    # Create tessdata directory if it doesn't exist
    tessdata_dir = os.path.join(current_dir, "tessdata")
    os.makedirs(tessdata_dir, exist_ok=True)

    # Download before choosing TESSDATA_PREFIX so the choice can be based on
    # where the language data really is.
    _download_eng_traineddata(tessdata_dir)

    # Set TESSDATA_PREFIX environment variable if not already set
    if not os.environ.get('TESSDATA_PREFIX'):
        possible_tessdata_dirs = [
            tessdata_dir,  # Our local tessdata directory
            "/usr/share/tesseract-ocr/4.00/tessdata",  # Common location in Hugging Face
            "/usr/share/tesseract-ocr/tessdata",  # Another common location
            "/usr/local/share/tessdata",  # Standard installation location
        ]
        # The local directory always exists at this point, so candidates are
        # judged by whether they contain the language data, not by existence.
        chosen_dir = _find_tessdata_dir(possible_tessdata_dirs)
        if chosen_dir is not None:
            os.environ['TESSDATA_PREFIX'] = chosen_dir
            print(f"Set TESSDATA_PREFIX to {chosen_dir}")
        else:
            # If none is usable, use our local directory
            os.environ['TESSDATA_PREFIX'] = tessdata_dir
            print(f"No existing tessdata directory found, set TESSDATA_PREFIX to {tessdata_dir}")

    # Configure pytesseract
    _configure_pytesseract()

    # Try to import tesserocr to verify it's working
    try:
        import tesserocr
        print(f"tesserocr imported successfully, version: {tesserocr.tesseract_version()}")
    except ImportError:
        print("tesserocr not installed or not working")
    except Exception as e:
        print(f"Error importing tesserocr: {e}")
# Load Gemini API key from environment variable
gemini_api_key = os.getenv("GOOGLE_API_KEY")

# Report only whether the key is present. Startup output may end up in logs,
# so no part of the secret itself is printed.
if not gemini_api_key:
    print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser may not work.")
else:
    print("Found Gemini API key (value hidden)")
# Add the current directory to the Python path
sys.path.append(current_dir)

# Try different import approaches
try:
    # First attempt - standard import
    from src.main import main
except ModuleNotFoundError:
    try:
        # Second attempt - adjust path and try again
        sys.path.append(os.path.join(current_dir, "src"))
        from main import main
    except ModuleNotFoundError:
        # Third attempt - create __init__.py if it doesn't exist
        src_dir = os.path.join(current_dir, "src")
        if not os.path.isdir(src_dir):
            # Without a src directory there is nothing to repair; re-raise the
            # import error instead of failing on the file creation below.
            raise
        init_path = os.path.join(src_dir, "__init__.py")
        if not os.path.exists(init_path):
            with open(init_path, "w"):
                pass  # Create empty __init__.py file
        # A module file was just created while the program is running, so
        # drop the import system's cached directory listings before retrying.
        import importlib
        importlib.invalidate_caches()
        # Try import again
        from src.main import main

# Call setup function at import time
setup_tesseract()

if __name__ == "__main__":
    main()