Spaces:
Sleeping
Sleeping
fixing the tesseract path issues for full page ocr
Browse files- app.py +77 -32
- fix_tesseract_huggingface.py +144 -0
- packages.txt +6 -0
- setup.sh +43 -24
app.py
CHANGED
@@ -13,12 +13,83 @@ try:
|
|
13 |
except ImportError:
|
14 |
print("python-dotenv not installed, skipping .env file loading")
|
15 |
|
16 |
-
#
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
# Load Gemini API key from environment variable
|
24 |
gemini_api_key = os.getenv("GOOGLE_API_KEY")
|
@@ -29,9 +100,6 @@ if not gemini_api_key:
|
|
29 |
else:
|
30 |
print(f"Found Gemini API key: {gemini_api_key[:5]}...{gemini_api_key[-5:] if len(gemini_api_key) > 10 else ''}")
|
31 |
|
32 |
-
# Get the current directory
|
33 |
-
current_dir = os.path.dirname(os.path.abspath(__file__))
|
34 |
-
|
35 |
# Add the current directory to the Python path
|
36 |
sys.path.append(current_dir)
|
37 |
|
@@ -53,29 +121,6 @@ except ModuleNotFoundError:
|
|
53 |
# Try import again
|
54 |
from src.main import main
|
55 |
|
56 |
-
# Function to setup Tesseract
|
57 |
-
def setup_tesseract():
|
58 |
-
"""Setup Tesseract OCR environment."""
|
59 |
-
# Create tessdata directory if it doesn't exist
|
60 |
-
tessdata_dir = os.path.join(current_dir, "tessdata")
|
61 |
-
os.makedirs(tessdata_dir, exist_ok=True)
|
62 |
-
|
63 |
-
# Set TESSDATA_PREFIX environment variable if not already set
|
64 |
-
if not os.environ.get('TESSDATA_PREFIX'):
|
65 |
-
os.environ['TESSDATA_PREFIX'] = tessdata_dir
|
66 |
-
print(f"Set TESSDATA_PREFIX to {tessdata_dir}")
|
67 |
-
|
68 |
-
# Download eng.traineddata if it doesn't exist
|
69 |
-
eng_traineddata = os.path.join(tessdata_dir, "eng.traineddata")
|
70 |
-
if not os.path.exists(eng_traineddata):
|
71 |
-
try:
|
72 |
-
print("Downloading eng.traineddata...")
|
73 |
-
url = "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
|
74 |
-
urllib.request.urlretrieve(url, eng_traineddata)
|
75 |
-
print("Downloaded eng.traineddata")
|
76 |
-
except Exception as e:
|
77 |
-
print(f"Error downloading eng.traineddata: {e}")
|
78 |
-
|
79 |
# Call setup function at import time
|
80 |
setup_tesseract()
|
81 |
|
|
|
13 |
except ImportError:
|
14 |
print("python-dotenv not installed, skipping .env file loading")
|
15 |
|
16 |
+
# Get the current directory
|
17 |
+
current_dir = os.path.dirname(os.path.abspath(__file__))
|
18 |
+
|
19 |
+
# Function to setup Tesseract
|
20 |
+
def _has_traineddata(directory, lang="eng"):
    """Return True if *directory* contains the ``<lang>.traineddata`` model file."""
    return os.path.isfile(os.path.join(directory, f"{lang}.traineddata"))


def setup_tesseract():
    """Prepare the Tesseract OCR environment for the current process.

    Steps, in order:

    1. Create a ``tessdata`` directory under the module-level ``current_dir``
       and download ``eng.traineddata`` into it when the file is missing.
    2. Unless ``TESSDATA_PREFIX`` is already set, point it at the first
       candidate directory that actually contains ``eng.traineddata``,
       falling back to the local directory when none does.
    3. If ``pytesseract`` is importable, set its ``tesseract_cmd`` to the
       executable found on PATH or in a few well-known locations.
    4. Try to import ``tesserocr`` and print its Tesseract version.

    Download and import failures are reported with ``print`` and are not
    raised. Returns ``None``.
    """
    # Imported locally so the function does not depend on the file header.
    import shutil
    import urllib.request

    print("Setting up Tesseract OCR environment...")

    # Create tessdata directory if it doesn't exist
    tessdata_dir = os.path.join(current_dir, "tessdata")
    os.makedirs(tessdata_dir, exist_ok=True)

    # Download eng.traineddata if it doesn't exist in our local tessdata.
    # This runs before TESSDATA_PREFIX is chosen so that the local directory
    # only qualifies once it really holds the model file.
    eng_traineddata = os.path.join(tessdata_dir, "eng.traineddata")
    if not os.path.exists(eng_traineddata):
        partial_path = eng_traineddata + ".part"
        try:
            print("Downloading eng.traineddata...")
            url = "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
            # Download to a temporary name and rename, so an interrupted
            # transfer never leaves a truncated eng.traineddata behind.
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, eng_traineddata)
            print("Downloaded eng.traineddata")
        except Exception as e:
            # Best effort: a system-wide tessdata directory may still work.
            print(f"Error downloading eng.traineddata: {e}")
            try:
                os.remove(partial_path)
            except OSError:
                pass  # nothing was written, or it cannot be cleaned up

    # Set TESSDATA_PREFIX environment variable if not already set
    if not os.environ.get('TESSDATA_PREFIX'):
        possible_tessdata_dirs = [
            tessdata_dir,  # Our local tessdata directory
            "/usr/share/tesseract-ocr/4.00/tessdata",  # Common location in Hugging Face
            "/usr/share/tesseract-ocr/tessdata",  # Another common location
            "/usr/local/share/tessdata",  # Standard installation location
        ]

        # Use the first directory that holds the English model. Checking only
        # for the directory is not enough: the local one was just created
        # above and would always match, even when it is empty.
        for dir_path in possible_tessdata_dirs:
            if _has_traineddata(dir_path):
                os.environ['TESSDATA_PREFIX'] = dir_path
                print(f"Set TESSDATA_PREFIX to {dir_path}")
                break
        else:
            # If none is usable, fall back to our local directory
            os.environ['TESSDATA_PREFIX'] = tessdata_dir
            print(f"No tessdata directory with eng.traineddata found, set TESSDATA_PREFIX to {tessdata_dir}")

    # Configure pytesseract
    try:
        import pytesseract
    except ImportError:
        print("pytesseract not installed")
    else:
        # Prefer whatever is on PATH, then fall back to common locations
        tesseract_cmd = shutil.which("tesseract")
        if not tesseract_cmd:
            common_locations = [
                "/usr/bin/tesseract",
                "/usr/local/bin/tesseract",
                "/app/tesseract/tesseract",
            ]
            for location in common_locations:
                if os.path.isfile(location) and os.access(location, os.X_OK):
                    tesseract_cmd = location
                    break

        if tesseract_cmd:
            pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
            print(f"Set pytesseract.tesseract_cmd to {tesseract_cmd}")
        else:
            print("Warning: Could not find tesseract executable")

    # Try to import tesserocr to verify it's working
    try:
        import tesserocr
        print(f"tesserocr imported successfully, version: {tesserocr.tesseract_version()}")
    except ImportError:
        print("tesserocr not installed or not working")
    except Exception as e:
        print(f"Error importing tesserocr: {e}")
|
93 |
|
94 |
# Load Gemini API key from environment variable
|
95 |
gemini_api_key = os.getenv("GOOGLE_API_KEY")
|
|
|
100 |
else:
|
101 |
print(f"Found Gemini API key: {gemini_api_key[:5]}...{gemini_api_key[-5:] if len(gemini_api_key) > 10 else ''}")
|
102 |
|
|
|
|
|
|
|
103 |
# Add the current directory to the Python path
|
104 |
sys.path.append(current_dir)
|
105 |
|
|
|
121 |
# Try import again
|
122 |
from src.main import main
|
123 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
# Call setup function at import time
|
125 |
setup_tesseract()
|
126 |
|
fix_tesseract_huggingface.py
ADDED
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
"""
|
3 |
+
Script to diagnose and fix Tesseract issues in Hugging Face environments.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import os
|
7 |
+
import sys
|
8 |
+
import shutil
|
9 |
+
import subprocess
|
10 |
+
import platform
|
11 |
+
from pathlib import Path
|
12 |
+
import urllib.request
|
13 |
+
|
14 |
+
def diagnose_tesseract():
    """Print a report on the Tesseract installation and configuration.

    The report covers, in order: the operating system, the ``tesseract``
    executable on PATH and its version line, a list of well-known install
    locations, the ``TESSDATA_PREFIX`` variable and the ``eng.traineddata``
    file under it, and whether ``pytesseract`` and ``tesserocr`` import.

    Nothing is returned; every finding is printed to stdout, prefixed with
    a check mark for success or a cross for a problem.
    """
    print("=== Tesseract Diagnostics ===")

    # Check OS
    print(f"Operating System: {platform.system()} {platform.release()}")

    # Check if tesseract is in PATH
    tesseract_path = shutil.which("tesseract")
    if tesseract_path:
        print(f"✅ Tesseract found in PATH: {tesseract_path}")
        try:
            # Run the exact executable that was resolved above; the timeout
            # keeps a broken binary from hanging the diagnostics.
            version = subprocess.check_output(
                [tesseract_path, "--version"],
                stderr=subprocess.STDOUT,
                universal_newlines=True,
                timeout=30,
            )
        except (subprocess.SubprocessError, OSError) as e:
            print(f"❌ Error running tesseract: {e}")
        else:
            version_lines = version.splitlines()
            # An empty output would otherwise raise IndexError
            first_line = version_lines[0] if version_lines else "(no output)"
            print(f"✅ Tesseract version info:\n{first_line}")
    else:
        print("❌ Tesseract not found in PATH")

    # Check common installation locations
    common_locations = [
        "/usr/bin/tesseract",
        "/usr/local/bin/tesseract",
        "/opt/conda/bin/tesseract",
        "/app/tesseract/tesseract",
        r"C:\Program Files\Tesseract-OCR\tesseract.exe",
    ]

    for location in common_locations:
        if os.path.isfile(location) and os.access(location, os.X_OK):
            print(f"✅ Tesseract executable found at: {location}")

    # Check TESSDATA_PREFIX
    tessdata_prefix = os.environ.get('TESSDATA_PREFIX')
    if tessdata_prefix:
        print(f"✅ TESSDATA_PREFIX is set to: {tessdata_prefix}")
        if os.path.exists(tessdata_prefix):
            print("✅ TESSDATA_PREFIX directory exists")
            eng_traineddata = os.path.join(tessdata_prefix, "eng.traineddata")
            if os.path.exists(eng_traineddata):
                print(f"✅ eng.traineddata found at: {eng_traineddata}")
            else:
                print(f"❌ eng.traineddata not found at: {eng_traineddata}")
        else:
            print(f"❌ TESSDATA_PREFIX directory does not exist: {tessdata_prefix}")
    else:
        print("❌ TESSDATA_PREFIX environment variable not set")

    # Check pytesseract
    try:
        import pytesseract
        print("✅ pytesseract is installed")
        print(f"✅ pytesseract.tesseract_cmd = {pytesseract.pytesseract.tesseract_cmd}")
    except ImportError:
        print("❌ pytesseract is not installed")

    # Check tesserocr
    try:
        import tesserocr
        print("✅ tesserocr is installed")
        print(f"✅ tesserocr version: {tesserocr.tesseract_version()}")
    except ImportError:
        print("❌ tesserocr is not installed")
    except Exception as e:
        print(f"❌ Error importing tesserocr: {e}")
|
81 |
+
|
82 |
+
def fix_tesseract():
    """Apply the common fixes for a broken Tesseract setup.

    Steps, in order:

    1. Create a ``tessdata`` directory next to this script and set
       ``TESSDATA_PREFIX`` to it for the current process.
    2. Download ``eng.traineddata`` into that directory when it is missing.
    3. If ``pytesseract`` is importable, set its ``tesseract_cmd`` to the
       executable found on PATH or in a few well-known locations.
    4. Record ``TESSDATA_PREFIX`` in the ``.env`` file of the current
       working directory, unless that exact entry is already there.

    Failures are reported with ``print`` and are not raised, except for a
    failure to create the directory. Returns ``None``.
    """
    print("\n=== Fixing Tesseract Issues ===")

    # Create local tessdata directory
    current_dir = os.path.dirname(os.path.abspath(__file__))
    tessdata_dir = os.path.join(current_dir, "tessdata")
    os.makedirs(tessdata_dir, exist_ok=True)
    print(f"✅ Created local tessdata directory: {tessdata_dir}")

    # Set TESSDATA_PREFIX to our local directory
    os.environ['TESSDATA_PREFIX'] = tessdata_dir
    print(f"✅ Set TESSDATA_PREFIX to: {tessdata_dir}")

    # Download eng.traineddata
    eng_traineddata = os.path.join(tessdata_dir, "eng.traineddata")
    if not os.path.exists(eng_traineddata):
        partial_path = eng_traineddata + ".part"
        try:
            print("Downloading eng.traineddata...")
            url = "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
            # Download to a temporary name and rename, so an interrupted
            # transfer never leaves a truncated eng.traineddata behind.
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, eng_traineddata)
            print("✅ Downloaded eng.traineddata")
        except Exception as e:
            print(f"❌ Error downloading eng.traineddata: {e}")
            try:
                os.remove(partial_path)
            except OSError:
                pass  # nothing was written, or it cannot be cleaned up
    else:
        print("✅ eng.traineddata already exists")

    # Configure pytesseract
    try:
        import pytesseract
    except ImportError:
        print("❌ pytesseract not installed, please install it with: pip install pytesseract")
    else:
        tesseract_path = shutil.which("tesseract")
        if not tesseract_path:
            # Try common locations
            common_locations = [
                "/usr/bin/tesseract",
                "/usr/local/bin/tesseract",
                "/app/tesseract/tesseract",
            ]
            for location in common_locations:
                if os.path.isfile(location) and os.access(location, os.X_OK):
                    tesseract_path = location
                    break

        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path
            print(f"✅ Set pytesseract.tesseract_cmd to {tesseract_path}")
        else:
            print("❌ Could not find tesseract executable")

    # Add TESSDATA_PREFIX to .env file for persistence.
    # The key must be spelled exactly TESSDATA_PREFIX to be picked up, and
    # the entry is appended only once so repeated runs do not grow the file.
    env_entry = f"TESSDATA_PREFIX={tessdata_dir}"
    try:
        existing_lines = []
        if os.path.exists(".env"):
            with open(".env", encoding="utf-8") as f:
                existing_lines = f.read().splitlines()
        if env_entry in existing_lines:
            print("✅ TESSDATA_PREFIX already present in .env file")
        else:
            with open(".env", "a", encoding="utf-8") as f:
                f.write(f"\n{env_entry}\n")
            print("✅ Added TESSDATA_PREFIX to .env file")
    except (OSError, UnicodeError) as e:
        print(f"❌ Error adding TESSDATA_PREFIX to .env file: {e}")

    print("\n=== Tesseract Fix Complete ===")
    print("Please restart your application for changes to take effect.")


# Script entry point: report the current state first, then apply the fixes.
if __name__ == "__main__":
    diagnose_tesseract()
    fix_tesseract()
|
packages.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
tesseract-ocr
|
2 |
+
tesseract-ocr-eng
|
3 |
+
libtesseract-dev
|
4 |
+
libleptonica-dev
|
5 |
+
imagemagick
|
6 |
+
poppler-utils
|
setup.sh
CHANGED
@@ -5,10 +5,28 @@ set -e
|
|
5 |
|
6 |
echo "Setting up Tesseract OCR environment..."
|
7 |
|
8 |
-
# Install
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
pip install -q -U google-genai
|
11 |
-
echo "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
# Create tessdata directory if it doesn't exist
|
14 |
mkdir -p tessdata
|
@@ -20,33 +38,34 @@ echo "TESSDATA_PREFIX set to: $TESSDATA_PREFIX"
|
|
20 |
# Download eng.traineddata if it doesn't exist
|
21 |
if [ ! -f "tessdata/eng.traineddata" ]; then
|
22 |
echo "Downloading eng.traineddata..."
|
23 |
-
wget -O tessdata/eng.traineddata https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata
|
|
|
24 |
echo "Downloaded eng.traineddata"
|
25 |
else
|
26 |
echo "eng.traineddata already exists"
|
27 |
fi
|
28 |
|
29 |
-
#
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
|
|
34 |
|
35 |
# Verify Tesseract installation
|
36 |
echo "Verifying Tesseract installation..."
|
37 |
-
tesseract --version || echo "Tesseract not found in PATH"
|
38 |
-
|
39 |
-
# Test
|
40 |
-
echo "Testing
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
cat test_output.txt
|
49 |
-
fi
|
50 |
-
fi
|
51 |
|
52 |
-
|
|
|
|
5 |
|
6 |
echo "Setting up Tesseract OCR environment..."
|
7 |
|
8 |
+
# Install required packages if not already installed
|
9 |
+
if ! command -v tesseract &> /dev/null; then
|
10 |
+
echo "Tesseract not found, attempting to install..."
|
11 |
+
apt-get update -y || echo "Failed to update apt, continuing anyway"
|
12 |
+
apt-get install -y tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev || echo "Failed to install tesseract via apt, continuing anyway"
|
13 |
+
fi
|
14 |
+
|
15 |
+
# Install Python dependencies
|
16 |
+
echo "Installing Python dependencies..."
|
17 |
+
pip install -q -U pytesseract pillow opencv-python-headless pdf2image
|
18 |
pip install -q -U google-genai
|
19 |
+
echo "Python dependencies installed successfully"
|
20 |
+
|
21 |
+
# Install tesserocr with pip
|
22 |
+
echo "Installing tesserocr..."
|
23 |
+
pip install -q -U tesserocr || echo "Failed to install tesserocr with pip, trying with specific compiler flags..."
|
24 |
+
|
25 |
+
# If tesserocr installation failed, try with specific compiler flags
|
26 |
+
if ! python -c "import tesserocr" &> /dev/null; then
|
27 |
+
echo "Trying alternative tesserocr installation..."
|
28 |
+
CPPFLAGS="-I/usr/local/include -I/usr/include" LDFLAGS="-L/usr/local/lib -L/usr/lib" pip install -q -U tesserocr || echo "Failed to install tesserocr with compiler flags, continuing anyway"
|
29 |
+
fi
|
30 |
|
31 |
# Create tessdata directory if it doesn't exist
|
32 |
mkdir -p tessdata
|
|
|
38 |
# Download eng.traineddata if it doesn't exist
|
39 |
if [ ! -f "tessdata/eng.traineddata" ]; then
|
40 |
echo "Downloading eng.traineddata..."
|
41 |
+
# wget follows redirects by itself. curl needs -L because the GitHub /raw/ URL
# answers with a redirect, and -f so an HTTP error fails the command instead
# of saving the error page as eng.traineddata.
wget -O tessdata/eng.traineddata https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata || \
curl -fL -o tessdata/eng.traineddata https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata
|
43 |
echo "Downloaded eng.traineddata"
|
44 |
else
|
45 |
echo "eng.traineddata already exists"
|
46 |
fi
|
47 |
|
48 |
+
# Try to copy to system locations (may fail in restricted environments)
|
49 |
+
for tessdata_dir in "/usr/share/tesseract-ocr/4.00/tessdata" "/usr/share/tesseract-ocr/tessdata" "/usr/local/share/tessdata"; do
|
50 |
+
if [ -d "$tessdata_dir" ]; then
|
51 |
+
echo "Copying eng.traineddata to $tessdata_dir..."
|
52 |
+
cp -f tessdata/eng.traineddata "$tessdata_dir/" 2>/dev/null || echo "Failed to copy to $tessdata_dir, continuing anyway"
|
53 |
+
fi
|
54 |
+
done
|
55 |
|
56 |
# Verify Tesseract installation
|
57 |
echo "Verifying Tesseract installation..."
|
58 |
+
tesseract --version || echo "Tesseract not found in PATH, but may still be available to Python"
|
59 |
+
|
60 |
+
# Test tesserocr if installed
|
61 |
+
echo "Testing tesserocr..."
|
62 |
+
python -c "import tesserocr; print(f'tesserocr version: {tesserocr.tesseract_version()}')" || echo "tesserocr not working, but may still be able to use pytesseract"
|
63 |
+
|
64 |
+
# Test pytesseract
|
65 |
+
echo "Testing pytesseract..."
|
66 |
+
python -c "import pytesseract; print(f'pytesseract path: {pytesseract.pytesseract.tesseract_cmd}')" || echo "pytesseract not working"
|
67 |
+
|
68 |
+
echo "Setup completed"
|
|
|
|
|
|
|
69 |
|
70 |
+
# Add TESSDATA_PREFIX to .env file for persistence.
# Append only when the exact entry is not there yet, so repeated runs of this
# script do not pile up duplicate lines.
env_entry="TESSDATA_PREFIX=$(pwd)/tessdata"
grep -qxF "$env_entry" .env 2>/dev/null || echo "$env_entry" >> .env
|