File size: 2,659 Bytes
4c35288
 
08e21aa
 
 
 
 
46bc24b
08e21aa
 
 
 
 
 
46bc24b
 
08e21aa
46bc24b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08e21aa
 
 
 
 
4c35288
46bc24b
 
 
 
 
 
 
 
1920eee
 
787c3f4
46bc24b
 
08e21aa
46bc24b
 
 
 
 
1920eee
08e21aa
46bc24b
08e21aa
46bc24b
b7d7d76
1920eee
46bc24b
b7d7d76
46bc24b
 
 
 
 
 
 
 
9cf505e
2dc4c21
 
 
 
 
4c35288
08e21aa
4c35288
 
 
 
08e21aa
46bc24b
08e21aa
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/bin/bash

# Build script: installs Tesseract OCR and its traineddata, builds the
# tesserocr Python binding, and installs the project's Python dependencies.
#
# Strict mode:
#   -e           abort on the first unhandled command failure
#   -u           treat expansion of an unset variable as an error
#   -o pipefail  a pipeline fails if any of its stages fails
set -euo pipefail

echo "Starting build process..."

# Install Tesseract, its English language pack, the development headers
# needed to compile against libtesseract/libleptonica, pkg-config, and wget
# (used later in this script to fetch traineddata files).
echo "Installing Tesseract and dependencies..."

# Run the two apt-get steps as separate commands: in "a && b" a failure of
# "a" is exempt from `set -e`, so a failed index refresh would be silently
# skipped and the script would carry on without installing the packages.
apt-get update

# DEBIAN_FRONTEND=noninteractive keeps debconf from prompting during an
# unattended build.
DEBIAN_FRONTEND=noninteractive apt-get install -y \
    tesseract-ocr \
    tesseract-ocr-eng \
    libtesseract-dev \
    libleptonica-dev \
    pkg-config \
    wget

# Directory that holds the Tesseract language models used by this build.
# NOTE(review): the "4.00" component matches Tesseract 4.x packaging; since
# TESSDATA_PREFIX is exported to this same directory further down, Tesseract
# is pointed here explicitly — confirm this is still wanted on newer distros.
TESSDATA_DIR="/usr/share/tesseract-ocr/4.00/tessdata"
mkdir -p "$TESSDATA_DIR"

# Download the English and orientation/script-detection (osd) models from
# the tesseract-ocr/tessdata repository on GitHub. A failed download aborts
# the script via `set -e`.
echo "Downloading Tesseract traineddata files..."
for lang in eng osd; do
    wget -O "$TESSDATA_DIR/${lang}.traineddata" \
        "https://github.com/tesseract-ocr/tessdata/raw/main/${lang}.traineddata"
done

# Point Tesseract at the directory for the rest of this script...
export TESSDATA_PREFIX="$TESSDATA_DIR"

# ...and record the setting in /etc/environment. Only append when the exact
# line is not already present, so re-running the build does not pile up
# duplicate entries. grep's stderr is silenced to cover a missing file, in
# which case the redirect below creates it.
env_line="TESSDATA_PREFIX=${TESSDATA_PREFIX}"
if ! grep -qxF -- "$env_line" /etc/environment 2>/dev/null; then
    echo "$env_line" >> /etc/environment
fi

# Make sure the tesseract executable can be found on PATH; bail out of the
# build if it cannot, otherwise report the version it announces.
echo "Verifying Tesseract installation..."
command -v tesseract > /dev/null 2>&1 || {
    echo "Tesseract installation failed!"
    exit 1
}
# The version query stays inside the argument list so that a non-zero exit
# from it does not trip `set -e`.
printf 'Tesseract version: %s\n' "$(tesseract --version)"

# Check that both language models exist as regular files in TESSDATA_DIR,
# stopping at the first one that is absent.
echo "Verifying traineddata files..."
for model in eng.traineddata osd.traineddata; do
    [ -f "$TESSDATA_DIR/$model" ] || {
        echo "$model is missing!"
        exit 1
    }
done

# Show the directory contents in the build log.
echo "Traineddata files in $TESSDATA_DIR:"
ls -l "$TESSDATA_DIR"

# Smoke-test Tesseract by asking it which languages it can see and requiring
# that both "eng" and "osd" are reported.
#
# This replaces an OCR run over a file called test.png that actually held the
# plain text "Hello World": that is not an image Tesseract can decode, so the
# check could not pass, and the file was left behind when it failed. Listing
# languages exercises the binary and its tessdata lookup without needing a
# fixture image or a temporary file.
echo "Testing Tesseract functionality..."
# Capture stdout and stderr together, since the stream used for the listing
# may differ between Tesseract versions.
if ! available_langs="$(tesseract --list-langs 2>&1)"; then
    echo "Tesseract test failed!"
    printf '%s\n' "$available_langs" >&2
    exit 1
fi
for required_lang in eng osd; do
    # -x: whole-line match, -F: literal string; each language is expected on
    # its own line of the listing.
    if ! grep -qxF -- "$required_lang" <<< "$available_langs"; then
        echo "Tesseract test failed!"
        echo "Language '${required_lang}' was not reported by 'tesseract --list-langs':" >&2
        printf '%s\n' "$available_langs" >&2
        exit 1
    fi
done

# Rebuild tesserocr from its source distribution.
echo "Installing tesserocr from source..."

# Drop any copy that is already installed; a failure here (for example when
# nothing is installed) is deliberately ignored.
pip uninstall --yes tesserocr || :

# Pass the header and library locations to the compiler/linker for this one
# pip invocation only.
# NOTE(review): ":all:" applies to every package pip handles in this command,
# not just tesserocr, and the library path is x86_64-specific — confirm both
# are intended.
CPPFLAGS="-I/usr/include/tesseract" \
LDFLAGS="-L/usr/lib/x86_64-linux-gnu/" \
    pip install --no-binary :all: tesserocr

# Import the freshly built tesserocr module and print its version, the
# languages it can see, and the tessdata path it resolved.
#
# tesserocr.get_languages() returns a (tessdata_path, languages) pair, so the
# path is element 0 and the language list is element 1. The quoted heredoc
# delimiter keeps the shell from expanding anything inside the Python code.
echo "Verifying tesserocr installation..."
python3 - <<'PY'
import tesserocr

tessdata_path, languages = tesserocr.get_languages()
print(f'tesserocr version: {tesserocr.__version__}')
print(f'Available languages: {languages}')
print(f'TESSDATA_PREFIX: {tessdata_path}')
PY

# Google Gemini API client: install quietly, upgrading to the newest release
# if an older one is already present.
echo "Installing Google Gemini API client..."
pip install --quiet --upgrade google-genai
echo "Google Gemini API client installed successfully"

# The project itself, installed in editable mode from the current directory.
echo "Installing Python dependencies..."
pip install --editable .

# Seed a .env from the example template unless one is already present.
# A failed copy only produces a warning; it does not abort the build.
[ -f .env ] || {
    echo "Creating .env file..."
    cp .env.example .env || echo "Warning: .env.example not found"
}

echo "Build process completed successfully!"