AnseMin committed on
Commit
46bc24b
·
1 Parent(s): 1920eee

fix: download tesseract traineddata files directly from source

Browse files
Files changed (1) hide show
  1. build.sh +45 -22
build.sh CHANGED
@@ -5,50 +5,73 @@ set -e
5
 
6
  echo "Starting build process..."
7
 
8
- # Install system dependencies for tesseract with more specific packages
9
  echo "Installing Tesseract and dependencies..."
10
  apt-get update && apt-get install -y \
11
  tesseract-ocr \
12
  tesseract-ocr-eng \
13
  libtesseract-dev \
14
  libleptonica-dev \
15
- pkg-config
 
16
 
17
- # Verify tesseract installation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  if ! command -v tesseract &> /dev/null; then
19
  echo "Tesseract installation failed!"
20
  exit 1
21
  fi
22
  echo "Tesseract version: $(tesseract --version)"
23
 
24
- # Set TESSDATA_PREFIX environment variable
25
- TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)
26
- if [ -z "$TESSDATA_PREFIX" ]; then
27
- echo "Could not find tessdata directory!"
 
 
 
 
28
  exit 1
29
  fi
30
- echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
31
- export TESSDATA_PREFIX
32
 
33
- # Add TESSDATA_PREFIX to environment for persistence
34
- echo "TESSDATA_PREFIX=${TESSDATA_PREFIX}" >> /etc/environment
35
 
36
- # Verify tessdata directory
37
- if [ ! -d "$TESSDATA_PREFIX" ]; then
38
- echo "Tessdata directory does not exist!"
 
 
39
  exit 1
40
  fi
41
- echo "Tessdata directory contents:"
42
- ls -l $TESSDATA_PREFIX
43
 
44
- # Uninstall any existing tesserocr and install from source
45
  echo "Installing tesserocr from source..."
46
  pip uninstall -y tesserocr || true
47
- pip install --no-binary :all: tesserocr
48
 
49
- # Install ocrmac
50
- echo "Installing ocrmac..."
51
- pip install ocrmac
 
 
 
 
 
52
 
53
  # Install Python dependencies
54
  echo "Installing Python dependencies..."
@@ -57,7 +80,7 @@ pip install -e .
57
  # Create .env file if it doesn't exist
58
  if [ ! -f .env ]; then
59
  echo "Creating .env file..."
60
- cp .env.example .env
61
  fi
62
 
63
  echo "Build process completed successfully!"
 
5
 
6
  echo "Starting build process..."
7
 
8
+ # Install system dependencies for tesseract
9
  echo "Installing Tesseract and dependencies..."
10
  apt-get update && apt-get install -y \
11
  tesseract-ocr \
12
  tesseract-ocr-eng \
13
  libtesseract-dev \
14
  libleptonica-dev \
15
+ pkg-config \
16
+ wget
17
 
18
+ # Create the tessdata directory if it is missing (mkdir -p); the path is hard-coded to the 4.00 layout
19
+ TESSDATA_DIR="/usr/share/tesseract-ocr/4.00/tessdata"
20
+ mkdir -p "$TESSDATA_DIR"
21
+
22
+ # Download eng/osd traineddata from the tesseract-ocr/tessdata repository (main branch); wget -O overwrites any existing copy
23
+ echo "Downloading Tesseract traineddata files..."
24
+ wget -O "$TESSDATA_DIR/eng.traineddata" "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
25
+ wget -O "$TESSDATA_DIR/osd.traineddata" "https://github.com/tesseract-ocr/tessdata/raw/main/osd.traineddata"
26
+
27
+ # Export TESSDATA_PREFIX for the rest of this script and append it to /etc/environment (each run appends another line; nothing is verified here)
28
+ export TESSDATA_PREFIX="$TESSDATA_DIR"
29
+ echo "TESSDATA_PREFIX=${TESSDATA_PREFIX}" >> /etc/environment
30
+
31
+ # Verify tesseract installation and data files
32
+ echo "Verifying Tesseract installation..."
33
  if ! command -v tesseract &> /dev/null; then
34
  echo "Tesseract installation failed!"
35
  exit 1
36
  fi
37
  echo "Tesseract version: $(tesseract --version)"
38
 
39
+ # Verify traineddata files
40
+ echo "Verifying traineddata files..."
41
+ if [ ! -f "$TESSDATA_DIR/eng.traineddata" ]; then
42
+ echo "eng.traineddata is missing!"
43
+ exit 1
44
+ fi
45
+ if [ ! -f "$TESSDATA_DIR/osd.traineddata" ]; then
46
+ echo "osd.traineddata is missing!"
47
  exit 1
48
  fi
 
 
49
 
50
+ echo "Traineddata files in $TESSDATA_DIR:"
51
+ ls -l "$TESSDATA_DIR"
52
 
53
+ # Smoke-test Tesseract by asking it to list the languages it can load from TESSDATA_PREFIX
54
+ echo "Testing Tesseract functionality..."
55
+ # A real OCR run needs a valid image fixture; the language listing exercises the data path without one
56
+ if ! tesseract --list-langs 2>&1 | grep -qx "eng"; then
57
+ echo "Tesseract test failed!"
58
  exit 1
59
  fi
60
+ # The check above creates no temporary files, so no cleanup is required
 
61
 
62
+ # Reinstall tesserocr from source (uninstall is best-effort via || true); the LDFLAGS library path is hard-coded for x86_64-linux-gnu
63
  echo "Installing tesserocr from source..."
64
  pip uninstall -y tesserocr || true
65
+ CPPFLAGS="-I/usr/include/tesseract" LDFLAGS="-L/usr/lib/x86_64-linux-gnu/" pip install --no-binary :all: tesserocr
66
 
67
+ # Verify tesserocr installation; get_languages() returns a (tessdata path, language list) tuple
68
+ echo "Verifying tesserocr installation..."
69
+ python3 -c "
70
+ import tesserocr
71
+ print(f'tesserocr version: {tesserocr.__version__}')
72
+ print(f'Available languages: {tesserocr.get_languages()[1]}')
73
+ print(f'TESSDATA_PREFIX: {tesserocr.get_languages()[0]}')
74
+ "
75
 
76
  # Install Python dependencies
77
  echo "Installing Python dependencies..."
 
80
  # Create .env file if it doesn't exist
81
  if [ ! -f .env ]; then
82
  echo "Creating .env file..."
83
+ cp .env.example .env || echo "Warning: .env.example not found"
84
  fi
85
 
86
  echo "Build process completed successfully!"