Spaces:
Sleeping
Sleeping
Improve Tesseract installation and configuration
Browse files
- build.sh +42 -3
- requirements.txt +3 -0
build.sh
CHANGED
@@ -1,24 +1,63 @@
|
|
1 |
#!/bin/bash
|
2 |
|
|
|
|
|
|
|
|
|
|
|
3 |
# Install system dependencies for tesseract with more specific packages
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
# Set TESSDATA_PREFIX environment variable
|
7 |
TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)
|
|
|
|
|
|
|
|
|
8 |
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
9 |
export TESSDATA_PREFIX
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
# Uninstall any existing tesserocr and install from source
|
12 |
-
|
|
|
13 |
pip install --no-binary :all: tesserocr
|
14 |
|
15 |
# Install ocrmac
|
|
|
16 |
pip install ocrmac
|
17 |
|
18 |
# Install Python dependencies
|
|
|
19 |
pip install -e .
|
20 |
|
21 |
# Create .env file if it doesn't exist
|
22 |
if [ ! -f .env ]; then
|
|
|
23 |
cp .env.example .env
|
24 |
-
fi
|
|
|
|
|
|
1 |
#!/bin/bash
|
2 |
|
3 |
+
# Exit on error
|
4 |
+
set -e
|
5 |
+
|
6 |
+
echo "Starting build process..."
|
7 |
+
|
8 |
# Install system dependencies for tesseract with more specific packages
|
9 |
+
echo "Installing Tesseract and dependencies..."
|
10 |
+
apt-get update && apt-get install -y \
|
11 |
+
tesseract-ocr \
|
12 |
+
tesseract-ocr-eng \
|
13 |
+
libtesseract-dev \
|
14 |
+
libleptonica-dev \
|
15 |
+
pkg-config
|
16 |
+
|
17 |
+
# Verify tesseract installation
|
18 |
+
if ! command -v tesseract &> /dev/null; then
|
19 |
+
echo "Tesseract installation failed!"
|
20 |
+
exit 1
|
21 |
+
fi
|
22 |
+
echo "Tesseract version: $(tesseract --version)"
|
23 |
|
24 |
# Set TESSDATA_PREFIX environment variable
|
25 |
# `|| true` keeps `set -e` from aborting here when grep finds no match,
# so the explicit empty-value check that follows can report the failure.
TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep 'tessdata$' || true)
|
26 |
+
if [ -z "$TESSDATA_PREFIX" ]; then
|
27 |
+
echo "Could not find tessdata directory!"
|
28 |
+
exit 1
|
29 |
+
fi
|
30 |
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
31 |
export TESSDATA_PREFIX
|
32 |
|
33 |
+
# Add TESSDATA_PREFIX to environment for persistence
|
34 |
+
# Append only when the exact entry is missing, so repeated builds do not
# accumulate duplicate TESSDATA_PREFIX lines in /etc/environment.
if ! grep -qxF -- "TESSDATA_PREFIX=${TESSDATA_PREFIX}" /etc/environment 2>/dev/null; then
  echo "TESSDATA_PREFIX=${TESSDATA_PREFIX}" >> /etc/environment
fi
|
35 |
+
|
36 |
+
# Verify tessdata directory
|
37 |
+
if [ ! -d "$TESSDATA_PREFIX" ]; then
|
38 |
+
echo "Tessdata directory does not exist!"
|
39 |
+
exit 1
|
40 |
+
fi
|
41 |
+
echo "Tessdata directory contents:"
|
42 |
+
# Quote the path (and end option parsing) so a directory containing spaces
# or glob characters is listed as a single argument.
ls -l -- "$TESSDATA_PREFIX"
|
43 |
+
|
44 |
# Uninstall any existing tesserocr and install from source
|
45 |
+
echo "Installing tesserocr from source..."
|
46 |
+
pip uninstall -y tesserocr || true
|
47 |
# Limit the source build to tesserocr itself; `:all:` would also force every
# other package pip installs in this command to be built from source.
pip install --no-binary tesserocr tesserocr
|
48 |
|
49 |
# Install ocrmac
|
50 |
+
echo "Installing ocrmac..."
|
51 |
pip install ocrmac
|
52 |
|
53 |
# Install Python dependencies
|
54 |
+
echo "Installing Python dependencies..."
|
55 |
pip install -e .
|
56 |
|
57 |
# Create .env file if it doesn't exist
|
58 |
if [ ! -f .env ]; then
|
59 |
+
echo "Creating .env file..."
|
60 |
cp .env.example .env
|
61 |
+
fi
|
62 |
+
|
63 |
+
echo "Build process completed successfully!"
|
requirements.txt
CHANGED
@@ -9,4 +9,7 @@ pipdeptree==2.25.0
|
|
9 |
pytesseract==0.3.13
|
10 |
semchunk==2.2.2
|
11 |
tesseract==0.1.3
|
|
|
|
|
|
|
12 |
# Use pytesseract instead of tesserocr for cross-platform compatibility
|
|
|
9 |
pytesseract==0.3.13
|
10 |
semchunk==2.2.2
|
11 |
tesseract==0.1.3
|
12 |
+
tesserocr>=2.5.0; platform_system != "Windows" # Only install on non-Windows systems
|
13 |
+
Pillow>=9.0.0 # Required for image processing
|
14 |
+
numpy>=1.21.0 # Required for image processing
|
15 |
# pytesseract is the cross-platform OCR binding; tesserocr (above) is installed only on non-Windows systems
|