Spaces:
Sleeping
Sleeping
fix: download tesseract traineddata files directly from source
Browse files
build.sh
CHANGED
@@ -5,50 +5,73 @@ set -e
|
|
5 |
|
6 |
echo "Starting build process..."
|
7 |
|
8 |
-
# Install system dependencies for tesseract
|
9 |
echo "Installing Tesseract and dependencies..."
|
10 |
apt-get update && apt-get install -y \
|
11 |
tesseract-ocr \
|
12 |
tesseract-ocr-eng \
|
13 |
libtesseract-dev \
|
14 |
libleptonica-dev \
|
15 |
-
pkg-config
|
|
|
16 |
|
17 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
if ! command -v tesseract &> /dev/null; then
|
19 |
echo "Tesseract installation failed!"
|
20 |
exit 1
|
21 |
fi
|
22 |
echo "Tesseract version: $(tesseract --version)"
|
23 |
|
24 |
-
#
|
25 |
-
|
26 |
-
if [ -
|
27 |
-
echo "
|
|
|
|
|
|
|
|
|
28 |
exit 1
|
29 |
fi
|
30 |
-
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
31 |
-
export TESSDATA_PREFIX
|
32 |
|
33 |
-
|
34 |
-
|
35 |
|
36 |
-
#
|
37 |
-
|
38 |
-
|
|
|
|
|
39 |
exit 1
|
40 |
fi
|
41 |
-
|
42 |
-
ls -l $TESSDATA_PREFIX
|
43 |
|
44 |
-
#
|
45 |
echo "Installing tesserocr from source..."
|
46 |
pip uninstall -y tesserocr || true
|
47 |
-
pip install --no-binary :all: tesserocr
|
48 |
|
49 |
-
#
|
50 |
-
echo "
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
# Install Python dependencies
|
54 |
echo "Installing Python dependencies..."
|
@@ -57,7 +80,7 @@ pip install -e .
|
|
57 |
# Create .env file if it doesn't exist
|
58 |
if [ ! -f .env ]; then
|
59 |
echo "Creating .env file..."
|
60 |
-
cp .env.example .env
|
61 |
fi
|
62 |
|
63 |
echo "Build process completed successfully!"
|
|
|
5 |
|
6 |
echo "Starting build process..."
|
7 |
|
8 |
+
# Install system dependencies for tesseract
|
9 |
echo "Installing Tesseract and dependencies..."
|
10 |
apt-get update && apt-get install -y \
|
11 |
tesseract-ocr \
|
12 |
tesseract-ocr-eng \
|
13 |
libtesseract-dev \
|
14 |
libleptonica-dev \
|
15 |
+
pkg-config \
|
16 |
+
wget
|
17 |
|
18 |
+
# Create tessdata directory
|
19 |
+
TESSDATA_DIR="/usr/share/tesseract-ocr/4.00/tessdata"
|
20 |
+
mkdir -p "$TESSDATA_DIR"
|
21 |
+
|
22 |
+
# Download traineddata files directly from the official repository
|
23 |
+
echo "Downloading Tesseract traineddata files..."
|
24 |
+
wget -O "$TESSDATA_DIR/eng.traineddata" "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
|
25 |
+
wget -O "$TESSDATA_DIR/osd.traineddata" "https://github.com/tesseract-ocr/tessdata/raw/main/osd.traineddata"
|
26 |
+
|
27 |
+
# Set and verify TESSDATA_PREFIX
|
28 |
+
export TESSDATA_PREFIX="$TESSDATA_DIR"
|
29 |
+
echo "TESSDATA_PREFIX=${TESSDATA_PREFIX}" >> /etc/environment
|
30 |
+
|
31 |
+
# Verify tesseract installation and data files
|
32 |
+
echo "Verifying Tesseract installation..."
|
33 |
if ! command -v tesseract &> /dev/null; then
|
34 |
echo "Tesseract installation failed!"
|
35 |
exit 1
|
36 |
fi
|
37 |
echo "Tesseract version: $(tesseract --version)"
|
38 |
|
39 |
+
# Verify traineddata files
|
40 |
+
echo "Verifying traineddata files..."
|
41 |
+
if [ ! -f "$TESSDATA_DIR/eng.traineddata" ]; then
|
42 |
+
echo "eng.traineddata is missing!"
|
43 |
+
exit 1
|
44 |
+
fi
|
45 |
+
if [ ! -f "$TESSDATA_DIR/osd.traineddata" ]; then
|
46 |
+
echo "osd.traineddata is missing!"
|
47 |
exit 1
|
48 |
fi
|
|
|
|
|
49 |
|
50 |
+
echo "Traineddata files in $TESSDATA_DIR:"
|
51 |
+
ls -l "$TESSDATA_DIR"
|
52 |
|
53 |
+
# Test Tesseract functionality
|
54 |
+
echo "Testing Tesseract functionality..."
|
55 |
+
# Write a real image for the smoke test: a 64x64 all-white binary PBM (P4).
# Plain text in a .png file is not decodable, so tesseract would always fail.
# The image is blank, so this only checks that tesseract can load and run.
{ printf 'P4\n64 64\n'; head -c 512 /dev/zero; } > test.png
|
56 |
+
if ! tesseract test.png stdout; then
|
57 |
+
echo "Tesseract test failed!"
|
58 |
exit 1
|
59 |
fi
|
60 |
+
rm test.png
|
|
|
61 |
|
62 |
+
# Clean and install tesserocr from source
|
63 |
echo "Installing tesserocr from source..."
|
64 |
pip uninstall -y tesserocr || true
|
65 |
+
CPPFLAGS="-I/usr/include/tesseract" LDFLAGS="-L/usr/lib/x86_64-linux-gnu/" pip install --no-binary :all: tesserocr
|
66 |
|
67 |
+
# Verify tesserocr installation
|
68 |
+
echo "Verifying tesserocr installation..."
|
69 |
+
python3 -c "
|
70 |
+
import tesserocr
|
71 |
+
print(f'tesserocr version: {tesserocr.__version__}')
|
72 |
+
# get_languages() returns (tessdata_path, [languages]); index 1 is the list.
print(f'Available languages: {tesserocr.get_languages()[1]}')
|
73 |
+
# get_languages() returns (tessdata_path, [languages]); index 0 is the path.
print(f'TESSDATA_PREFIX: {tesserocr.get_languages()[0]}')
|
74 |
+
"
|
75 |
|
76 |
# Install Python dependencies
|
77 |
echo "Installing Python dependencies..."
|
|
|
80 |
# Create .env file if it doesn't exist
|
81 |
if [ ! -f .env ]; then
|
82 |
echo "Creating .env file..."
|
83 |
+
cp .env.example .env || echo "Warning: .env.example not found"
|
84 |
fi
|
85 |
|
86 |
echo "Build process completed successfully!"
|