Spaces:
Sleeping
Sleeping
File size: 2,659 Bytes
4c35288 08e21aa 46bc24b 08e21aa 46bc24b 08e21aa 46bc24b 08e21aa 4c35288 46bc24b 1920eee 787c3f4 46bc24b 08e21aa 46bc24b 1920eee 08e21aa 46bc24b 08e21aa 46bc24b b7d7d76 1920eee 46bc24b b7d7d76 46bc24b 9cf505e 2dc4c21 4c35288 08e21aa 4c35288 08e21aa 46bc24b 08e21aa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
#!/bin/bash
# Build script: sets up Tesseract OCR and the project's Python dependencies.
#
# Strict mode:
#   -e           abort as soon as a command fails
#   -u           treat use of an unset variable as an error
#   -o pipefail  a pipeline fails if any of its stages fails
set -euo pipefail

echo "Starting build process..."

# Install system dependencies for tesseract.
# `apt-get update` is a statement of its own: inside an `a && b` list a failure
# of `a` does not trigger `set -e`, so a failed index refresh would be passed
# over silently and the build would continue without the packages.
echo "Installing Tesseract and dependencies..."
apt-get update
# DEBIAN_FRONTEND=noninteractive keeps package configuration from prompting,
# which would stall an unattended build.
DEBIAN_FRONTEND=noninteractive apt-get install -y \
  tesseract-ocr \
  tesseract-ocr-eng \
  libtesseract-dev \
  libleptonica-dev \
  pkg-config \
  wget
# Create tessdata directory
TESSDATA_DIR="/usr/share/tesseract-ocr/4.00/tessdata"
mkdir -p "$TESSDATA_DIR"

# Download traineddata files directly from the official repository.
# Both files come from the same location, so only the base name varies.
echo "Downloading Tesseract traineddata files..."
for traineddata_name in eng osd; do
  wget -O "$TESSDATA_DIR/${traineddata_name}.traineddata" \
    "https://github.com/tesseract-ocr/tessdata/raw/main/${traineddata_name}.traineddata"
done

# Export TESSDATA_PREFIX for the rest of this script and persist it in
# /etc/environment.
export TESSDATA_PREFIX="$TESSDATA_DIR"
tessdata_env_line="TESSDATA_PREFIX=${TESSDATA_PREFIX}"
# Append only when this exact line is absent, so re-running the build does not
# accumulate duplicate entries. grep's error output is discarded so that a
# missing /etc/environment simply falls through to the append.
if ! grep -qxF -- "$tessdata_env_line" /etc/environment 2> /dev/null; then
  echo "$tessdata_env_line" >> /etc/environment
fi
# The tesseract binary must be resolvable on PATH before anything else is checked
echo "Verifying Tesseract installation..."
command -v tesseract > /dev/null 2>&1 || {
  echo "Tesseract installation failed!"
  exit 1
}
echo "Tesseract version: $(tesseract --version)"

# Each required traineddata file has to exist in the tessdata directory
echo "Verifying traineddata files..."
for required_model in eng osd; do
  [ -f "$TESSDATA_DIR/${required_model}.traineddata" ] || {
    echo "${required_model}.traineddata is missing!"
    exit 1
  }
done

# Show the directory contents in the build log
echo "Traineddata files in $TESSDATA_DIR:"
ls -l "$TESSDATA_DIR"
# Test Tesseract functionality
# A plain-text file named test.png is not a decodable image, so running OCR on
# it cannot serve as a smoke test. Ask tesseract to enumerate its languages
# instead: this runs the real binary and confirms the English model is visible.
echo "Testing Tesseract functionality..."
# Capture the listing first and match afterwards rather than piping into
# `grep -q`; stderr is folded in in case the listing is written there.
if ! tesseract_langs="$(tesseract --list-langs 2>&1)" \
  || ! grep -qx "eng" <<< "$tesseract_langs"; then
  echo "Tesseract test failed!"
  exit 1
fi
# Clean and install tesserocr from source
echo "Installing tesserocr from source..."
# Best effort: a failed uninstall must not abort the build
pip uninstall -y tesserocr || true
# Point the source build at the system Tesseract headers and libraries
CPPFLAGS="-I/usr/include/tesseract" LDFLAGS="-L/usr/lib/x86_64-linux-gnu/" \
  pip install --no-binary :all: tesserocr

# Verify tesserocr installation
echo "Verifying tesserocr installation..."
# get_languages() returns a (tessdata path, language list) pair; unpack it so
# each value is printed under the right label. The quoted heredoc delimiter
# keeps the shell from expanding anything inside the Python source.
python3 - << 'PY'
import tesserocr

tessdata_path, languages = tesserocr.get_languages()
print(f'tesserocr version: {tesserocr.__version__}')
print(f'Available languages: {languages}')
print(f'TESSDATA_PREFIX: {tessdata_path}')
PY
# Google Gemini API client: quiet install, upgrading if already present
printf '%s\n' "Installing Google Gemini API client..."
pip install --quiet --upgrade google-genai
printf '%s\n' "Google Gemini API client installed successfully"

# The project in the current directory, installed in editable mode
printf '%s\n' "Installing Python dependencies..."
pip install --editable .
# Create .env file if it doesn't exist
if [ ! -f .env ]; then
  echo "Creating .env file..."
  # A missing template is only a warning; the build still completes
  cp .env.example .env || echo "Warning: .env.example not found"
fi

# Final status line. It must end the statement: a trailing `|` here would
# leave an unfinished pipeline and a syntax error at end of file.
echo "Build process completed successfully!"