Spaces:
Sleeping
Sleeping
fix: add Tesseract configuration and startup script
Browse files
README.md
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
|
2 |
---
|
3 |
title: Doc2Md
|
4 |
emoji: π
|
@@ -8,6 +7,7 @@ sdk: gradio
|
|
8 |
sdk_version: 5.14.0
|
9 |
app_file: app.py
|
10 |
build_script: build.sh
|
|
|
11 |
pinned: false
|
12 |
---
|
13 |
|
|
|
|
|
1 |
---
|
2 |
title: Doc2Md
|
3 |
emoji: π
|
|
|
7 |
sdk_version: 5.14.0
|
8 |
app_file: app.py
|
9 |
build_script: build.sh
|
10 |
+
startup_script: setup.sh  # NOTE(review): startup_script does not appear among the documented Spaces README config keys; confirm setup.sh is actually executed (system packages are normally declared in packages.txt)
|
11 |
pinned: false
|
12 |
---
|
13 |
|
app.py
CHANGED
@@ -1,6 +1,13 @@
|
|
1 |
import sys
|
2 |
import os
|
3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
# Get the current directory
|
5 |
current_dir = os.path.dirname(os.path.abspath(__file__))
|
6 |
|
|
|
1 |
import sys
|
2 |
import os
|
3 |
|
4 |
+
# Set TESSDATA_PREFIX if not already set
|
5 |
+
if not os.environ.get('TESSDATA_PREFIX'):
|
6 |
+
tessdata_dir = "/usr/share/tesseract-ocr/4.00/tessdata"
|
7 |
+
if os.path.exists(tessdata_dir):
|
8 |
+
os.environ['TESSDATA_PREFIX'] = tessdata_dir
|
9 |
+
print(f"Set TESSDATA_PREFIX to {tessdata_dir}")
|
10 |
+
|
11 |
# Get the current directory
|
12 |
current_dir = os.path.dirname(os.path.abspath(__file__))
|
13 |
|
setup.sh
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
|
3 |
+
# Exit immediately if any command returns a non-zero status
|
4 |
+
set -e
|
5 |
+
|
6 |
+
# Directory that holds the traineddata files; create it (and any missing parents) if it doesn't exist
|
7 |
+
TESSDATA_DIR="/usr/share/tesseract-ocr/4.00/tessdata"
|
8 |
+
mkdir -p "$TESSDATA_DIR"
|
9 |
+
|
10 |
+
# Download traineddata files if they are missing or empty (-s rather than -f: a failed 'wget -O' leaves a zero-byte file that must not count as downloaded; the file is also removed on failure before exiting)
|
11 |
+
if [ ! -s "$TESSDATA_DIR/eng.traineddata" ]; then
|
12 |
+
echo "Downloading eng.traineddata..."
|
13 |
+
wget -O "$TESSDATA_DIR/eng.traineddata" "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata" || { rm -f "$TESSDATA_DIR/eng.traineddata"; exit 1; }
|
14 |
+
fi
|
15 |
+
|
16 |
+
if [ ! -s "$TESSDATA_DIR/osd.traineddata" ]; then  # -s: a zero-byte leftover from a failed download counts as missing
|
17 |
+
echo "Downloading osd.traineddata..."
|
18 |
+
wget -O "$TESSDATA_DIR/osd.traineddata" "https://github.com/tesseract-ocr/tessdata/raw/main/osd.traineddata" || { rm -f "$TESSDATA_DIR/osd.traineddata"; exit 1; }  # drop the partial file so the next run retries
|
19 |
+
fi
|
20 |
+
|
21 |
+
# Set TESSDATA_PREFIX for this shell and persist it at most once; persisting is skipped when /etc/environment cannot be written (e.g. non-root) so 'set -e' does not abort the script
|
22 |
+
export TESSDATA_PREFIX="$TESSDATA_DIR"
|
23 |
+
if { [ -w /etc/environment ] || { [ ! -e /etc/environment ] && [ -w /etc ]; }; } && ! grep -qs '^TESSDATA_PREFIX=' /etc/environment; then echo "TESSDATA_PREFIX=${TESSDATA_PREFIX}" >> /etc/environment; fi
|
24 |
+
|
25 |
+
# Test Tesseract functionality (best effort: test.png holds plain text, not image data, so a failure here must not abort the script under 'set -e')
|
26 |
+
echo "Testing Tesseract..."
|
27 |
+
echo "Hello World" > test.png
|
28 |
+
tesseract test.png stdout || echo "Warning: Tesseract smoke test failed" >&2
|
29 |
+
rm -f test.png
|
30 |
+
|
31 |
+
# Report the installed Tesseract version, then the languages it can load
|
32 |
+
printf '%s\n' "Tesseract version:"
|
33 |
+
tesseract --version
|
34 |
+
printf '%s\n' "Available languages:"
|
35 |
+
tesseract --list-langs
|