AnseMin committed on
Commit
919e7d0
·
1 Parent(s): 057e923

fixing the tesseract path issues for full page ocr

Browse files
Files changed (4) hide show
  1. app.py +77 -32
  2. fix_tesseract_huggingface.py +144 -0
  3. packages.txt +6 -0
  4. setup.sh +43 -24
app.py CHANGED
@@ -13,12 +13,83 @@ try:
13
  except ImportError:
14
  print("python-dotenv not installed, skipping .env file loading")
15
 
16
- # Set TESSDATA_PREFIX if not already set
17
- if not os.environ.get('TESSDATA_PREFIX'):
18
- tessdata_dir = "/usr/share/tesseract-ocr/4.00/tessdata"
19
- if os.path.exists(tessdata_dir):
20
- os.environ['TESSDATA_PREFIX'] = tessdata_dir
21
- print(f"Set TESSDATA_PREFIX to {tessdata_dir}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  # Load Gemini API key from environment variable
24
  gemini_api_key = os.getenv("GOOGLE_API_KEY")
@@ -29,9 +100,6 @@ if not gemini_api_key:
29
  else:
30
  print(f"Found Gemini API key: {gemini_api_key[:5]}...{gemini_api_key[-5:] if len(gemini_api_key) > 10 else ''}")
31
 
32
- # Get the current directory
33
- current_dir = os.path.dirname(os.path.abspath(__file__))
34
-
35
  # Add the current directory to the Python path
36
  sys.path.append(current_dir)
37
 
@@ -53,29 +121,6 @@ except ModuleNotFoundError:
53
  # Try import again
54
  from src.main import main
55
 
56
- # Function to setup Tesseract
57
- def setup_tesseract():
58
- """Setup Tesseract OCR environment."""
59
- # Create tessdata directory if it doesn't exist
60
- tessdata_dir = os.path.join(current_dir, "tessdata")
61
- os.makedirs(tessdata_dir, exist_ok=True)
62
-
63
- # Set TESSDATA_PREFIX environment variable if not already set
64
- if not os.environ.get('TESSDATA_PREFIX'):
65
- os.environ['TESSDATA_PREFIX'] = tessdata_dir
66
- print(f"Set TESSDATA_PREFIX to {tessdata_dir}")
67
-
68
- # Download eng.traineddata if it doesn't exist
69
- eng_traineddata = os.path.join(tessdata_dir, "eng.traineddata")
70
- if not os.path.exists(eng_traineddata):
71
- try:
72
- print("Downloading eng.traineddata...")
73
- url = "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
74
- urllib.request.urlretrieve(url, eng_traineddata)
75
- print("Downloaded eng.traineddata")
76
- except Exception as e:
77
- print(f"Error downloading eng.traineddata: {e}")
78
-
79
  # Call setup function at import time
80
  setup_tesseract()
81
 
 
13
  except ImportError:
14
  print("python-dotenv not installed, skipping .env file loading")
15
 
16
+ # Get the current directory
17
+ current_dir = os.path.dirname(os.path.abspath(__file__))
18
+
19
+ # Function to setup Tesseract
20
def setup_tesseract():
    """Set up the Tesseract OCR environment for this process.

    Steps, in order:

    1. Ensure a ``tessdata`` directory exists under the module-level
       ``current_dir`` and try to download ``eng.traineddata`` into it
       when the file is missing.
    2. If ``TESSDATA_PREFIX`` is not already set, point it at the first
       candidate directory that actually contains ``eng.traineddata``;
       fall back to the local directory when none does.
    3. Point ``pytesseract`` at a tesseract executable, if one is found.
    4. Try importing ``tesserocr`` and report its version.

    Every step is best effort: problems are printed, never raised.
    Returns ``None``.
    """
    # Imported here so the function does not depend on a file-level
    # ``import shutil`` being present.
    import shutil

    print("Setting up Tesseract OCR environment...")

    # Create tessdata directory if it doesn't exist
    tessdata_dir = os.path.join(current_dir, "tessdata")
    os.makedirs(tessdata_dir, exist_ok=True)

    # Download eng.traineddata if it doesn't exist in our local tessdata.
    # This runs before TESSDATA_PREFIX is chosen so the choice can be based
    # on where the model file really is.
    eng_traineddata = os.path.join(tessdata_dir, "eng.traineddata")
    if not os.path.exists(eng_traineddata):
        # Download to a temporary name and rename on success, so an
        # interrupted transfer is never mistaken for a complete model.
        partial_path = eng_traineddata + ".part"
        try:
            print("Downloading eng.traineddata...")
            url = "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, eng_traineddata)
            print("Downloaded eng.traineddata")
        except Exception as e:
            print(f"Error downloading eng.traineddata: {e}")
            try:
                os.remove(partial_path)
            except OSError:
                pass  # nothing was written, or it cannot be removed

    # Set TESSDATA_PREFIX environment variable if not already set
    if not os.environ.get('TESSDATA_PREFIX'):
        possible_tessdata_dirs = [
            tessdata_dir,  # Our local tessdata directory
            "/usr/share/tesseract-ocr/5/tessdata",
            "/usr/share/tesseract-ocr/4.00/tessdata",
            "/usr/share/tesseract-ocr/tessdata",
            "/usr/local/share/tessdata",
        ]

        # The local directory was created above, so a bare "exists" test
        # would always pick it; require the English model to be present.
        for dir_path in possible_tessdata_dirs:
            if os.path.isfile(os.path.join(dir_path, "eng.traineddata")):
                os.environ['TESSDATA_PREFIX'] = dir_path
                print(f"Set TESSDATA_PREFIX to {dir_path}")
                break
        else:
            # No candidate holds the model; use our local directory
            os.environ['TESSDATA_PREFIX'] = tessdata_dir
            print(f"No tessdata directory containing eng.traineddata found, set TESSDATA_PREFIX to {tessdata_dir}")

    # Configure pytesseract
    try:
        import pytesseract
    except ImportError:
        print("pytesseract not installed")
    else:
        # Prefer whatever is on PATH, then a few well-known locations
        tesseract_cmd = shutil.which("tesseract")
        if tesseract_cmd:
            pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
            print(f"Set pytesseract.tesseract_cmd to {tesseract_cmd}")
        else:
            common_locations = [
                "/usr/bin/tesseract",
                "/usr/local/bin/tesseract",
                "/app/tesseract/tesseract",
            ]
            for location in common_locations:
                if os.path.isfile(location) and os.access(location, os.X_OK):
                    pytesseract.pytesseract.tesseract_cmd = location
                    print(f"Set pytesseract.tesseract_cmd to {location}")
                    break
            else:
                print("Warning: Could not find tesseract executable")

    # Try to import tesserocr to verify it's working
    try:
        import tesserocr
        print(f"tesserocr imported successfully, version: {tesserocr.tesseract_version()}")
    except ImportError:
        print("tesserocr not installed or not working")
    except Exception as e:
        print(f"Error importing tesserocr: {e}")
93
 
94
  # Load Gemini API key from environment variable
95
  gemini_api_key = os.getenv("GOOGLE_API_KEY")
 
100
  else:
101
  print(f"Found Gemini API key: {gemini_api_key[:5]}...{gemini_api_key[-5:] if len(gemini_api_key) > 10 else ''}")
102
 
 
 
 
103
  # Add the current directory to the Python path
104
  sys.path.append(current_dir)
105
 
 
121
  # Try import again
122
  from src.main import main
123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  # Call setup function at import time
125
  setup_tesseract()
126
 
fix_tesseract_huggingface.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Script to diagnose and fix Tesseract issues in Hugging Face environments.
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import shutil
9
+ import subprocess
10
+ import platform
11
+ from pathlib import Path
12
+ import urllib.request
13
+
14
def diagnose_tesseract():
    """Print a diagnostic report about the local Tesseract setup.

    The report covers, in order: the operating system, whether a
    ``tesseract`` executable is on PATH (and its version line), which of a
    few well-known install paths hold an executable, the state of the
    ``TESSDATA_PREFIX`` environment variable and its ``eng.traineddata``
    file, and whether ``pytesseract`` and ``tesserocr`` can be imported.

    Nothing is modified; all findings are written to stdout.
    Returns ``None``.
    """
    print("=== Tesseract Diagnostics ===")

    # Check OS
    print(f"Operating System: {platform.system()} {platform.release()}")

    # Check if tesseract is in PATH
    tesseract_path = shutil.which("tesseract")
    if tesseract_path:
        print(f"✅ Tesseract found in PATH: {tesseract_path}")
        try:
            # Run the exact binary that was just resolved
            version = subprocess.check_output(
                [tesseract_path, "--version"],
                stderr=subprocess.STDOUT,
                universal_newlines=True,
            )
        except (subprocess.SubprocessError, OSError) as e:
            print(f"❌ Error running tesseract: {e}")
        else:
            # Guard against empty output so the report never crashes here
            version_lines = version.splitlines()
            first_line = version_lines[0] if version_lines else "(no version output)"
            print(f"✅ Tesseract version info:\n{first_line}")
    else:
        print("❌ Tesseract not found in PATH")

    # Check common installation locations
    common_locations = [
        "/usr/bin/tesseract",
        "/usr/local/bin/tesseract",
        "/opt/conda/bin/tesseract",
        "/app/tesseract/tesseract",
        r"C:\Program Files\Tesseract-OCR\tesseract.exe",
    ]

    for location in common_locations:
        if os.path.isfile(location) and os.access(location, os.X_OK):
            print(f"✅ Tesseract executable found at: {location}")

    # Check TESSDATA_PREFIX
    tessdata_prefix = os.environ.get('TESSDATA_PREFIX')
    if tessdata_prefix:
        print(f"✅ TESSDATA_PREFIX is set to: {tessdata_prefix}")
        if os.path.exists(tessdata_prefix):
            print("✅ TESSDATA_PREFIX directory exists")
            eng_traineddata = os.path.join(tessdata_prefix, "eng.traineddata")
            if os.path.exists(eng_traineddata):
                print(f"✅ eng.traineddata found at: {eng_traineddata}")
            else:
                print(f"❌ eng.traineddata not found at: {eng_traineddata}")
        else:
            print(f"❌ TESSDATA_PREFIX directory does not exist: {tessdata_prefix}")
    else:
        print("❌ TESSDATA_PREFIX environment variable not set")

    # Check pytesseract
    try:
        import pytesseract
        print("✅ pytesseract is installed")
        print(f"✅ pytesseract.tesseract_cmd = {pytesseract.pytesseract.tesseract_cmd}")
    except ImportError:
        print("❌ pytesseract is not installed")

    # Check tesserocr
    try:
        import tesserocr
        print("✅ tesserocr is installed")
        print(f"✅ tesserocr version: {tesserocr.tesseract_version()}")
    except ImportError:
        print("❌ tesserocr is not installed")
    except Exception as e:
        print(f"❌ Error importing tesserocr: {e}")
81
+
82
def fix_tesseract():
    """Apply fixes for common Tesseract configuration problems.

    Steps, in order:

    1. Create a ``tessdata`` directory next to this script and set
       ``TESSDATA_PREFIX`` to it for the current process.
    2. Download ``eng.traineddata`` into that directory if it is missing.
    3. Point ``pytesseract`` at a tesseract executable, if one is found.
    4. Record ``TESSDATA_PREFIX=<dir>`` in ``.env`` in the current working
       directory, unless that exact line is already there.

    Every step is best effort: problems are printed, never raised.
    Returns ``None``.
    """
    print("\n=== Fixing Tesseract Issues ===")

    # Create local tessdata directory
    current_dir = os.path.dirname(os.path.abspath(__file__))
    tessdata_dir = os.path.join(current_dir, "tessdata")
    os.makedirs(tessdata_dir, exist_ok=True)
    print(f"✅ Created local tessdata directory: {tessdata_dir}")

    # Set TESSDATA_PREFIX to our local directory
    os.environ['TESSDATA_PREFIX'] = tessdata_dir
    print(f"✅ Set TESSDATA_PREFIX to: {tessdata_dir}")

    # Download eng.traineddata
    eng_traineddata = os.path.join(tessdata_dir, "eng.traineddata")
    if not os.path.exists(eng_traineddata):
        # Download to a temporary name and rename on success, so an
        # interrupted transfer is never mistaken for a complete model.
        partial_path = eng_traineddata + ".part"
        try:
            print("Downloading eng.traineddata...")
            url = "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, eng_traineddata)
            print("✅ Downloaded eng.traineddata")
        except Exception as e:
            print(f"❌ Error downloading eng.traineddata: {e}")
            try:
                os.remove(partial_path)
            except OSError:
                pass  # nothing was written, or it cannot be removed
    else:
        print("✅ eng.traineddata already exists")

    # Configure pytesseract
    try:
        import pytesseract
    except ImportError:
        print("❌ pytesseract not installed, please install it with: pip install pytesseract")
    else:
        tesseract_path = shutil.which("tesseract")
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path
            print(f"✅ Set pytesseract.tesseract_cmd to {tesseract_path}")
        else:
            # Try common locations
            common_locations = [
                "/usr/bin/tesseract",
                "/usr/local/bin/tesseract",
                "/app/tesseract/tesseract",
            ]
            for location in common_locations:
                if os.path.isfile(location) and os.access(location, os.X_OK):
                    pytesseract.pytesseract.tesseract_cmd = location
                    print(f"✅ Set pytesseract.tesseract_cmd to {location}")
                    break
            else:
                print("❌ Could not find tesseract executable")

    # Add TESSDATA_PREFIX to .env file for persistence.
    # The key must be spelled exactly TESSDATA_PREFIX to have any effect.
    env_line = f"TESSDATA_PREFIX={tessdata_dir}"
    try:
        try:
            with open(".env", encoding="utf-8") as f:
                existing_lines = f.read().splitlines()
        except FileNotFoundError:
            existing_lines = []

        if env_line in existing_lines:
            # Avoid appending a duplicate entry on every run
            print("✅ TESSDATA_PREFIX already present in .env file")
        else:
            with open(".env", "a", encoding="utf-8") as f:
                f.write(f"\n{env_line}\n")
            print("✅ Added TESSDATA_PREFIX to .env file")
    except (OSError, UnicodeError) as e:
        print(f"❌ Error adding TESSDATA_PREFIX to .env file: {e}")

    print("\n=== Tesseract Fix Complete ===")
    print("Please restart your application for changes to take effect.")
141
+
142
+ if __name__ == "__main__":
143
+ diagnose_tesseract()
144
+ fix_tesseract()
packages.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ tesseract-ocr
2
+ tesseract-ocr-eng
3
+ libtesseract-dev
4
+ libleptonica-dev
5
+ imagemagick
6
+ poppler-utils
setup.sh CHANGED
@@ -5,10 +5,28 @@ set -e
5
 
6
  echo "Setting up Tesseract OCR environment..."
7
 
8
- # Install google-genai package
9
- echo "Installing Google Gemini API client..."
 
 
 
 
 
 
 
 
10
  pip install -q -U google-genai
11
- echo "Google Gemini API client installed successfully"
 
 
 
 
 
 
 
 
 
 
12
 
13
  # Create tessdata directory if it doesn't exist
14
  mkdir -p tessdata
@@ -20,33 +38,34 @@ echo "TESSDATA_PREFIX set to: $TESSDATA_PREFIX"
20
  # Download eng.traineddata if it doesn't exist
21
  if [ ! -f "tessdata/eng.traineddata" ]; then
22
  echo "Downloading eng.traineddata..."
23
- wget -O tessdata/eng.traineddata https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata
 
24
  echo "Downloaded eng.traineddata"
25
  else
26
  echo "eng.traineddata already exists"
27
  fi
28
 
29
- # Also copy to system location
30
- if [ -d "/usr/local/share/tessdata" ]; then
31
- echo "Copying eng.traineddata to system location..."
32
- sudo cp -f tessdata/eng.traineddata /usr/local/share/tessdata/ || echo "Failed to copy to system location, continuing anyway"
33
- fi
 
 
34
 
35
  # Verify Tesseract installation
36
  echo "Verifying Tesseract installation..."
37
- tesseract --version || echo "Tesseract not found in PATH"
38
-
39
- # Test Tesseract functionality
40
- echo "Testing Tesseract functionality..."
41
- echo "Hello World" > test.txt
42
- convert -size 100x30 xc:white -font Arial -pointsize 12 -fill black -annotate +10+20 "Hello World" test.png || echo "ImageMagick convert not available, skipping test image creation"
43
-
44
- if [ -f "test.png" ]; then
45
- tesseract test.png test_output || echo "Tesseract test failed, but continuing"
46
- if [ -f "test_output.txt" ]; then
47
- echo "Tesseract test output:"
48
- cat test_output.txt
49
- fi
50
- fi
51
 
52
- echo "Setup completed"
 
 
5
 
6
  echo "Setting up Tesseract OCR environment..."
7
 
8
# Install the Tesseract system packages when the binary is not on PATH.
# `>/dev/null 2>&1` is used instead of the bash-only `&>` so the check
# behaves the same under any POSIX shell.
if ! command -v tesseract >/dev/null 2>&1; then
    echo "Tesseract not found, attempting to install..."
    # Best effort: the `|| echo` keeps the script going when apt is
    # unavailable or the user lacks the privileges to install packages.
    apt-get update -y || echo "Failed to update apt, continuing anyway"
    apt-get install -y tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev || echo "Failed to install tesseract via apt, continuing anyway"
fi

# Install Python dependencies
echo "Installing Python dependencies..."
pip install -q -U pytesseract pillow opencv-python-headless pdf2image
pip install -q -U google-genai
echo "Python dependencies installed successfully"

# Install tesserocr with pip
echo "Installing tesserocr..."
pip install -q -U tesserocr || echo "Failed to install tesserocr with pip, trying with specific compiler flags..."

# If tesserocr still cannot be imported, retry the install with explicit
# include and library search paths.
if ! python -c "import tesserocr" >/dev/null 2>&1; then
    echo "Trying alternative tesserocr installation..."
    CPPFLAGS="-I/usr/local/include -I/usr/include" LDFLAGS="-L/usr/local/lib -L/usr/lib" pip install -q -U tesserocr || echo "Failed to install tesserocr with compiler flags, continuing anyway"
fi
30
 
31
  # Create tessdata directory if it doesn't exist
32
  mkdir -p tessdata
 
38
# Download eng.traineddata if it doesn't exist (or is an empty leftover
# from an earlier failed download).
if [ ! -s "tessdata/eng.traineddata" ]; then
    echo "Downloading eng.traineddata..."
    traineddata_url="https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
    # Download to a temporary name so a failed transfer never leaves a
    # truncated file behind. curl needs -L because the GitHub "raw" URL
    # redirects, and -f so HTTP errors are reported as failures.
    if wget -O tessdata/eng.traineddata.part "$traineddata_url" || \
       curl -fL -o tessdata/eng.traineddata.part "$traineddata_url"; then
        mv -f tessdata/eng.traineddata.part tessdata/eng.traineddata
        echo "Downloaded eng.traineddata"
    else
        rm -f tessdata/eng.traineddata.part
        echo "Failed to download eng.traineddata" >&2
        exit 1
    fi
else
    echo "eng.traineddata already exists"
fi

# Try to copy to system locations (may fail in restricted environments)
for tessdata_dir in "/usr/share/tesseract-ocr/5/tessdata" "/usr/share/tesseract-ocr/4.00/tessdata" "/usr/share/tesseract-ocr/tessdata" "/usr/local/share/tessdata"; do
    if [ -d "$tessdata_dir" ]; then
        echo "Copying eng.traineddata to $tessdata_dir..."
        cp -f tessdata/eng.traineddata "$tessdata_dir/" 2>/dev/null || echo "Failed to copy to $tessdata_dir, continuing anyway"
    fi
done

# Verify Tesseract installation
echo "Verifying Tesseract installation..."
tesseract --version || echo "Tesseract not found in PATH, but may still be available to Python"

# Test tesserocr if installed
echo "Testing tesserocr..."
python -c "import tesserocr; print(f'tesserocr version: {tesserocr.tesseract_version()}')" || echo "tesserocr not working, but may still be able to use pytesseract"

# Test pytesseract. The command path lives on the inner
# pytesseract.pytesseract module, not on the package itself.
echo "Testing pytesseract..."
python -c "import pytesseract; print(f'pytesseract path: {pytesseract.pytesseract.tesseract_cmd}')" || echo "pytesseract not working"

# Add TESSDATA_PREFIX to .env file for persistence, skipping the write
# when the exact line is already there so reruns do not pile up duplicates.
env_line="TESSDATA_PREFIX=$(pwd)/tessdata"
if ! grep -qxF "$env_line" .env 2>/dev/null; then
    echo "$env_line" >> .env
fi

echo "Setup completed"