AnseMin committed on
Commit
08e21aa
·
1 Parent(s): 9db4bb9

Improve Tesseract installation and configuration

Browse files
Files changed (2) hide show
  1. build.sh +42 -3
  2. requirements.txt +3 -0
build.sh CHANGED
@@ -1,24 +1,63 @@
1
  #!/bin/bash
2
 
 
 
 
 
 
3
  # Install system dependencies for tesseract with more specific packages
4
- apt-get update && apt-get install -y tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev pkg-config
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  # Set TESSDATA_PREFIX environment variable
7
  TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)
 
 
 
 
8
  echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
9
  export TESSDATA_PREFIX
10
 
 
 
 
 
 
 
 
 
 
 
 
11
  # Uninstall any existing tesserocr and install from source
12
- pip uninstall -y tesserocr
 
13
  pip install --no-binary :all: tesserocr
14
 
15
  # Install ocrmac
 
16
  pip install ocrmac
17
 
18
  # Install Python dependencies
 
19
  pip install -e .
20
 
21
  # Create .env file if it doesn't exist
22
  if [ ! -f .env ]; then
 
23
  cp .env.example .env
24
- fi
 
 
 
1
  #!/bin/bash
2
 
3
+ # Exit on error
4
+ set -e
5
+
6
+ echo "Starting build process..."
7
+
8
  # Install system dependencies for tesseract with more specific packages
9
+ echo "Installing Tesseract and dependencies..."
10
+ apt-get update && apt-get install -y \
11
+ tesseract-ocr \
12
+ tesseract-ocr-eng \
13
+ libtesseract-dev \
14
+ libleptonica-dev \
15
+ pkg-config
16
+
17
+ # Verify tesseract installation
18
+ if ! command -v tesseract &> /dev/null; then
19
+ echo "Tesseract installation failed!"
20
+ exit 1
21
+ fi
22
+ echo "Tesseract version: $(tesseract --version)"
23
 
24
  # Set TESSDATA_PREFIX environment variable
25
  TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)
26
+ if [ -z "$TESSDATA_PREFIX" ]; then
27
+ echo "Could not find tessdata directory!"
28
+ exit 1
29
+ fi
30
  echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
31
  export TESSDATA_PREFIX
32
 
33
+ # Add TESSDATA_PREFIX to environment for persistence
34
+ echo "TESSDATA_PREFIX=${TESSDATA_PREFIX}" >> /etc/environment
35
+
36
+ # Verify tessdata directory
37
+ if [ ! -d "$TESSDATA_PREFIX" ]; then
38
+ echo "Tessdata directory does not exist!"
39
+ exit 1
40
+ fi
41
+ echo "Tessdata directory contents:"
42
+ ls -l $TESSDATA_PREFIX
43
+
44
  # Uninstall any existing tesserocr and install from source
45
+ echo "Installing tesserocr from source..."
46
+ pip uninstall -y tesserocr || true
47
  pip install --no-binary :all: tesserocr
48
 
49
  # Install ocrmac
50
+ echo "Installing ocrmac..."
51
  pip install ocrmac
52
 
53
  # Install Python dependencies
54
+ echo "Installing Python dependencies..."
55
  pip install -e .
56
 
57
  # Create .env file if it doesn't exist
58
  if [ ! -f .env ]; then
59
+ echo "Creating .env file..."
60
  cp .env.example .env
61
+ fi
62
+
63
+ echo "Build process completed successfully!"
requirements.txt CHANGED
@@ -9,4 +9,7 @@ pipdeptree==2.25.0
9
  pytesseract==0.3.13
10
  semchunk==2.2.2
11
  tesseract==0.1.3
 
 
 
12
  # Use pytesseract instead of tesserocr for cross-platform compatibility
 
9
  pytesseract==0.3.13
10
  semchunk==2.2.2
11
  tesseract==0.1.3
12
+ tesserocr>=2.5.0; platform_system != "Windows" # Only install on non-Windows systems
13
+ Pillow>=9.0.0 # Required for image processing
14
+ numpy>=1.21.0 # Required for image processing
15
  # Use pytesseract instead of tesserocr for cross-platform compatibility