Spaces:
Running
on
Zero
Running
on
Zero
Implement Markit_GOT files into Markit_v2
Browse files- .gitignore +87 -0
- README.md +214 -6
- app.py +138 -0
- build.sh +96 -0
- packages.txt +6 -0
- requirements.txt +35 -0
- setup.sh +76 -0
- src/__init__.py +1 -0
- src/core/__init__.py +1 -0
- src/core/converter.py +190 -0
- src/core/parser_factory.py +95 -0
- src/main.py +15 -0
- src/parsers/__init__.py +12 -0
- src/parsers/docling_parser.py +170 -0
- src/parsers/gemini_flash_parser.py +144 -0
- src/parsers/got_ocr_parser.py +172 -0
- src/parsers/marker_parser.py +61 -0
- src/parsers/parser_interface.py +47 -0
- src/parsers/parser_registry.py +74 -0
- src/parsers/pypdfium_parser.py +78 -0
- src/services/__init__.py +1 -0
- src/services/docling_chat.py +29 -0
- src/ui/__init__.py +1 -0
- src/ui/ui.py +301 -0
.gitignore
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Python
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
*.so
|
6 |
+
.Python
|
7 |
+
build/
|
8 |
+
develop-eggs/
|
9 |
+
dist/
|
10 |
+
downloads/
|
11 |
+
eggs/
|
12 |
+
.eggs/
|
13 |
+
lib/
|
14 |
+
lib64/
|
15 |
+
parts/
|
16 |
+
sdist/
|
17 |
+
var/
|
18 |
+
wheels/
|
19 |
+
*.egg-info/
|
20 |
+
.installed.cfg
|
21 |
+
*.egg
|
22 |
+
|
23 |
+
# Environment variables
|
24 |
+
.env
|
25 |
+
.env.local
|
26 |
+
.env.development.local
|
27 |
+
.env.test.local
|
28 |
+
.env.production.local
|
29 |
+
|
30 |
+
# Tesseract
|
31 |
+
tessdata/
|
32 |
+
|
33 |
+
# Temporary files
|
34 |
+
*.tmp
|
35 |
+
*.temp
|
36 |
+
temp/
|
37 |
+
tmp/
|
38 |
+
|
39 |
+
# Logs
|
40 |
+
logs/
|
41 |
+
*.log
|
42 |
+
npm-debug.log*
|
43 |
+
yarn-debug.log*
|
44 |
+
yarn-error.log*
|
45 |
+
|
46 |
+
# Editor directories and files
|
47 |
+
.idea/
|
48 |
+
.vscode/
|
49 |
+
*.swp
|
50 |
+
*.swo
|
51 |
+
*~
|
52 |
+
|
53 |
+
# OS specific
|
54 |
+
.DS_Store
|
55 |
+
Thumbs.db
|
56 |
+
|
57 |
+
# Virtual environment
|
58 |
+
.venv/
|
59 |
+
venv/
|
60 |
+
ENV/
|
61 |
+
|
62 |
+
# Distribution / packaging
|
63 |
+
*.egg-info/
|
64 |
+
|
65 |
+
# Local Gradio files
|
66 |
+
.gradio/
|
67 |
+
|
68 |
+
# IDE specific files
|
69 |
+
*.swp
|
70 |
+
*.swo
|
71 |
+
|
72 |
+
# Backup folder
|
73 |
+
/backup
|
74 |
+
|
75 |
+
# Specific files to ignore
|
76 |
+
README_HF.md
|
77 |
+
requirement.txt
|
78 |
+
.env_example
|
79 |
+
test_gemini_parser.py
|
80 |
+
|
81 |
+
# Ignore documents folder
|
82 |
+
/documents/
|
83 |
+
/documents/*
|
84 |
+
|
85 |
+
# Ignore tessdata folder
|
86 |
+
/tessdata/
|
87 |
+
/tessdata/*
|
README.md
CHANGED
@@ -1,12 +1,220 @@
|
|
1 |
---
|
2 |
-
title: Markit
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 5.
|
8 |
app_file: app.py
|
|
|
|
|
9 |
pinned: false
|
10 |
---
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: Markit
|
3 |
+
emoji: 📄
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: indigo
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 5.14.0
|
8 |
app_file: app.py
|
9 |
+
build_script: build.sh
|
10 |
+
startup_script: setup.sh
|
11 |
pinned: false
|
12 |
---
|
13 |
|
14 |
+
# Markit: Document to Markdown Converter
|
15 |
+
|
16 |
+
[](https://huggingface.co/spaces/Ansemin101/Markit)
|
17 |
+
|
18 |
+
**Author: Anse Min** | [GitHub](https://github.com/ansemin) | [LinkedIn](https://www.linkedin.com/in/ansemin/)
|
19 |
+
|
20 |
+
## Project Links
|
21 |
+
- **GitHub Repository**: [github.com/ansemin/Markit_HF](https://github.com/ansemin/Markit_HF)
|
22 |
+
- **Hugging Face Space**: [huggingface.co/spaces/Ansemin101/Markit](https://huggingface.co/spaces/Ansemin101/Markit)
|
23 |
+
|
24 |
+
## Overview
|
25 |
+
Markit is a powerful tool that converts various document formats (PDF, DOCX, images, etc.) to Markdown format. It uses different parsing engines and OCR methods to extract text from documents and convert them to clean, readable Markdown formats.
|
26 |
+
|
27 |
+
## Key Features
|
28 |
+
- **Multiple Document Formats**: Convert PDFs, Word documents, images, and other document formats
|
29 |
+
- **Versatile Output Formats**: Export to Markdown, JSON, plain text, or document tags format
|
30 |
+
- **Advanced Parsing Engines**:
|
31 |
+
- **PyPdfium**: Fast PDF parsing using the PDFium engine
|
32 |
+
- **Docling**: Advanced document structure analysis
|
33 |
+
- **Marker**: Specialized for markup and formatting
|
34 |
+
- **Gemini Flash**: AI-powered conversion using Google's Gemini API
|
35 |
+
- **GOT-OCR**: State-of-the-art OCR model for images (JPG/PNG only)
|
36 |
+
- **OCR Integration**: Extract text from images and scanned documents using Tesseract OCR
|
37 |
+
- **Interactive UI**: User-friendly Gradio interface with page navigation for large documents
|
38 |
+
- **AI-Powered Chat**: Interact with your documents using AI to ask questions about content
|
39 |
+
|
40 |
+
## System Architecture
|
41 |
+
The application is built with a modular architecture:
|
42 |
+
- **Core Engine**: Handles document conversion and processing workflows
|
43 |
+
- **Parser Registry**: Central registry for all document parsers
|
44 |
+
- **UI Layer**: Gradio-based web interface
|
45 |
+
- **Service Layer**: Handles AI chat functionality and external services integration
|
46 |
+
|
47 |
+
## Installation
|
48 |
+
|
49 |
+
### For Local Development
|
50 |
+
1. Clone the repository
|
51 |
+
2. Install dependencies:
|
52 |
+
```bash
|
53 |
+
pip install -r requirements.txt
|
54 |
+
```
|
55 |
+
3. Install Tesseract OCR (required for OCR functionality):
|
56 |
+
- Windows: Download and install from [GitHub](https://github.com/UB-Mannheim/tesseract/wiki)
|
57 |
+
- Linux: `sudo apt-get install tesseract-ocr libtesseract-dev`
|
58 |
+
- macOS: `brew install tesseract`
|
59 |
+
|
60 |
+
4. Run the application:
|
61 |
+
```bash
|
62 |
+
python app.py
|
63 |
+
```
|
64 |
+
|
65 |
+
### API Keys Setup
|
66 |
+
|
67 |
+
#### Gemini Flash Parser
|
68 |
+
To use the Gemini Flash parser, you need to:
|
69 |
+
1. Install the Google Generative AI client: `pip install google-genai`
|
70 |
+
2. Set the API key environment variable:
|
71 |
+
```bash
|
72 |
+
# On Windows
|
73 |
+
set GOOGLE_API_KEY=your_api_key_here
|
74 |
+
|
75 |
+
# On Linux/Mac
|
76 |
+
export GOOGLE_API_KEY=your_api_key_here
|
77 |
+
```
|
78 |
+
3. Alternatively, create a `.env` file in the project root with:
|
79 |
+
```
|
80 |
+
GOOGLE_API_KEY=your_api_key_here
|
81 |
+
```
|
82 |
+
4. Get your Gemini API key from [Google AI Studio](https://aistudio.google.com/app/apikey)
|
83 |
+
|
84 |
+
#### GOT-OCR Parser
|
85 |
+
The GOT-OCR parser requires:
|
86 |
+
1. CUDA-capable GPU with sufficient memory
|
87 |
+
2. The following dependencies will be installed automatically:
|
88 |
+
```bash
|
89 |
+
torch>=2.0.1
|
90 |
+
torchvision>=0.15.2
|
91 |
+
transformers>=4.37.2,<4.48.0 # Specific version range required
|
92 |
+
tiktoken>=0.6.0
|
93 |
+
verovio>=4.3.1
|
94 |
+
accelerate>=0.28.0
|
95 |
+
```
|
96 |
+
3. Note that GOT-OCR only supports JPG and PNG image formats
|
97 |
+
|
98 |
+
## Deploying to Hugging Face Spaces
|
99 |
+
|
100 |
+
### Environment Configuration
|
101 |
+
1. Go to your Space settings: `https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME/settings`
|
102 |
+
2. Add the following repository secrets:
|
103 |
+
- Name: `GOOGLE_API_KEY`
|
104 |
+
- Value: Your Gemini API key
|
105 |
+
|
106 |
+
### Space Configuration
|
107 |
+
Ensure your Hugging Face Space configuration includes:
|
108 |
+
```yaml
|
109 |
+
build:
|
110 |
+
dockerfile: Dockerfile
|
111 |
+
python_version: "3.10"
|
112 |
+
system_packages:
|
113 |
+
- "tesseract-ocr"
|
114 |
+
- "libtesseract-dev"
|
115 |
+
```
|
116 |
+
|
117 |
+
## How to Use
|
118 |
+
|
119 |
+
### Document Conversion
|
120 |
+
1. Upload your document using the file uploader
|
121 |
+
2. Select a parser provider:
|
122 |
+
- **PyPdfium**: Best for standard PDFs with selectable text
|
123 |
+
- **Docling**: Best for complex document layouts
|
124 |
+
- **Marker**: Best for preserving document formatting
|
125 |
+
- **Gemini Flash**: Best for AI-powered conversions (requires API key)
|
126 |
+
- **GOT-OCR**: Best for high-quality OCR on images (JPG/PNG only)
|
127 |
+
3. Choose an OCR option based on your selected parser:
|
128 |
+
- **None**: No OCR processing (for documents with selectable text)
|
129 |
+
- **Tesseract**: Basic OCR using Tesseract
|
130 |
+
- **Advanced**: Enhanced OCR with layout preservation (available with specific parsers)
|
131 |
+
4. Select your desired output format:
|
132 |
+
- **Markdown**: Clean, readable markdown format
|
133 |
+
- **JSON**: Structured data representation
|
134 |
+
- **Text**: Plain text extraction
|
135 |
+
- **Document Tags**: XML-like structure tags
|
136 |
+
5. Click "Convert" to process your document
|
137 |
+
6. Navigate through pages using the navigation buttons for multi-page documents
|
138 |
+
7. Download the converted content in your selected format
|
139 |
+
|
140 |
+
### Document Chat
|
141 |
+
1. After converting a document, switch to the "Chat with Document" tab
|
142 |
+
2. Type your questions about the document content
|
143 |
+
3. The AI will analyze the document and provide context-aware responses
|
144 |
+
4. Use the conversation history to track your Q&A session
|
145 |
+
5. Click "Clear" to start a new conversation
|
146 |
+
|
147 |
+
## Troubleshooting
|
148 |
+
|
149 |
+
### OCR Issues
|
150 |
+
- Ensure Tesseract is properly installed and in your system PATH
|
151 |
+
- Check the TESSDATA_PREFIX environment variable is set correctly
|
152 |
+
- Verify language files are available in the tessdata directory
|
153 |
+
|
154 |
+
### Gemini Flash Parser Issues
|
155 |
+
- Confirm your API key is set correctly as an environment variable
|
156 |
+
- Check for API usage limits or restrictions
|
157 |
+
- Verify the document format is supported by the Gemini API
|
158 |
+
|
159 |
+
### GOT-OCR Parser Issues
|
160 |
+
- Ensure you have a CUDA-capable GPU with sufficient memory
|
161 |
+
- Verify that all required dependencies are installed correctly
|
162 |
+
- Remember that GOT-OCR only supports JPG and PNG image formats
|
163 |
+
- If you encounter CUDA out-of-memory errors, try using a smaller image
|
164 |
+
- GOT-OCR requires transformers version <4.48.0 due to API changes in newer versions
|
165 |
+
- If you see errors about 'get_max_length', downgrade transformers to version 4.47.0
|
166 |
+
|
167 |
+
### General Issues
|
168 |
+
- Check the console logs for error messages
|
169 |
+
- Ensure all dependencies are installed correctly
|
170 |
+
- For large documents, try processing fewer pages at a time
|
171 |
+
|
172 |
+
## Development Guide
|
173 |
+
|
174 |
+
### Project Structure
|
175 |
+
|
176 |
+
```
|
177 |
+
markit/
|
178 |
+
├── app.py # Main application entry point
|
179 |
+
├── setup.sh # Setup script
|
180 |
+
├── build.sh # Build script
|
181 |
+
├── requirements.txt # Python dependencies
|
182 |
+
├── README.md # Project documentation
|
183 |
+
├── .env # Environment variables
|
184 |
+
├── .gitignore # Git ignore file
|
185 |
+
├── .gitattributes # Git attributes file
|
186 |
+
├── src/ # Source code
|
187 |
+
│ ├── __init__.py # Package initialization
|
188 |
+
│ ├── main.py # Main module
|
189 |
+
│ ├── core/ # Core functionality
|
190 |
+
│ │ ├── __init__.py # Package initialization
|
191 |
+
│ │ ├── converter.py # Document conversion logic
|
192 |
+
│ │ └── parser_factory.py # Parser factory
|
193 |
+
│ ├── parsers/ # Parser implementations
|
194 |
+
│ │ ├── __init__.py # Package initialization
|
195 |
+
│ │ ├── parser_interface.py # Parser interface
|
196 |
+
│ │ ├── parser_registry.py # Parser registry
|
197 |
+
│ │ ├── docling_parser.py # Docling parser
|
198 |
+
│ │ ├── marker_parser.py # Marker parser
|
199 |
+
│ │ └── pypdfium_parser.py # PyPDFium parser
|
200 |
+
│ ├── ui/ # User interface
|
201 |
+
│ │ ├── __init__.py # Package initialization
|
202 |
+
│ │ └── ui.py # Gradio UI implementation
|
203 |
+
│ └── services/ # External services
|
204 |
+
│ ├── __init__.py # Package initialization
|
205 |
+
│ └── docling_chat.py # Chat service
|
206 |
+
└── tests/ # Tests
|
207 |
+
└── __init__.py # Package initialization
|
208 |
+
```
|
209 |
+
|
210 |
+
### Adding a New Parser
|
211 |
+
1. Create a new parser class implementing the `DocumentParser` interface
|
212 |
+
2. Register the parser with the `ParserRegistry`
|
213 |
+
3. Implement the required methods: `get_name()`, `get_supported_ocr_methods()`, and `parse()`
|
214 |
+
4. Add your parser to the imports in `src/parsers/__init__.py`
|
215 |
+
|
216 |
+
## Contributing
|
217 |
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
218 |
+
|
219 |
+
## License
|
220 |
+
This project is open source and available under the MIT License.
|
app.py
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import os
|
3 |
+
import subprocess
|
4 |
+
import shutil
|
5 |
+
from pathlib import Path
|
6 |
+
import urllib.request
|
7 |
+
|
8 |
+
# Get the current directory
|
9 |
+
current_dir = os.path.dirname(os.path.abspath(__file__))
|
10 |
+
|
11 |
+
# Run setup.sh at startup
|
12 |
+
try:
|
13 |
+
setup_script = os.path.join(current_dir, "setup.sh")
|
14 |
+
if os.path.exists(setup_script):
|
15 |
+
print("Running setup.sh...")
|
16 |
+
subprocess.run(["bash", setup_script], check=False)
|
17 |
+
print("setup.sh completed")
|
18 |
+
except Exception as e:
|
19 |
+
print(f"Error running setup.sh: {e}")
|
20 |
+
|
21 |
+
# Try to load environment variables from .env file
|
22 |
+
try:
|
23 |
+
from dotenv import load_dotenv
|
24 |
+
load_dotenv()
|
25 |
+
print("Loaded environment variables from .env file")
|
26 |
+
except ImportError:
|
27 |
+
print("python-dotenv not installed, skipping .env file loading")
|
28 |
+
|
29 |
+
# Function to setup Tesseract
|
30 |
+
def setup_tesseract():
|
31 |
+
"""Setup Tesseract OCR environment."""
|
32 |
+
print("Setting up Tesseract OCR environment...")
|
33 |
+
|
34 |
+
# Create tessdata directory if it doesn't exist
|
35 |
+
tessdata_dir = os.path.join(current_dir, "tessdata")
|
36 |
+
os.makedirs(tessdata_dir, exist_ok=True)
|
37 |
+
|
38 |
+
# Set TESSDATA_PREFIX environment variable if not already set
|
39 |
+
if not os.environ.get('TESSDATA_PREFIX'):
|
40 |
+
# Check multiple possible locations
|
41 |
+
possible_tessdata_dirs = [
|
42 |
+
tessdata_dir, # Our local tessdata directory
|
43 |
+
"/usr/share/tesseract-ocr/4.00/tessdata", # Common location in Hugging Face
|
44 |
+
"/usr/share/tesseract-ocr/tessdata", # Another common location
|
45 |
+
"/usr/local/share/tessdata", # Standard installation location
|
46 |
+
]
|
47 |
+
|
48 |
+
# Use the first directory that exists
|
49 |
+
for dir_path in possible_tessdata_dirs:
|
50 |
+
if os.path.exists(dir_path):
|
51 |
+
os.environ['TESSDATA_PREFIX'] = dir_path
|
52 |
+
print(f"Set TESSDATA_PREFIX to {dir_path}")
|
53 |
+
break
|
54 |
+
else:
|
55 |
+
# If none exist, use our local directory
|
56 |
+
os.environ['TESSDATA_PREFIX'] = tessdata_dir
|
57 |
+
print(f"No existing tessdata directory found, set TESSDATA_PREFIX to {tessdata_dir}")
|
58 |
+
|
59 |
+
# Download eng.traineddata if it doesn't exist in our local tessdata
|
60 |
+
eng_traineddata = os.path.join(tessdata_dir, "eng.traineddata")
|
61 |
+
if not os.path.exists(eng_traineddata):
|
62 |
+
try:
|
63 |
+
print("Downloading eng.traineddata...")
|
64 |
+
url = "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
|
65 |
+
urllib.request.urlretrieve(url, eng_traineddata)
|
66 |
+
print("Downloaded eng.traineddata")
|
67 |
+
except Exception as e:
|
68 |
+
print(f"Error downloading eng.traineddata: {e}")
|
69 |
+
|
70 |
+
# Configure pytesseract
|
71 |
+
try:
|
72 |
+
import pytesseract
|
73 |
+
# Check if tesseract is in PATH
|
74 |
+
tesseract_cmd = shutil.which("tesseract")
|
75 |
+
if tesseract_cmd:
|
76 |
+
pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
|
77 |
+
print(f"Set pytesseract.tesseract_cmd to {tesseract_cmd}")
|
78 |
+
else:
|
79 |
+
# Try common locations
|
80 |
+
common_locations = [
|
81 |
+
"/usr/bin/tesseract",
|
82 |
+
"/usr/local/bin/tesseract",
|
83 |
+
"/app/tesseract/tesseract"
|
84 |
+
]
|
85 |
+
for location in common_locations:
|
86 |
+
if os.path.isfile(location) and os.access(location, os.X_OK):
|
87 |
+
pytesseract.pytesseract.tesseract_cmd = location
|
88 |
+
print(f"Set pytesseract.tesseract_cmd to {location}")
|
89 |
+
break
|
90 |
+
else:
|
91 |
+
print("Warning: Could not find tesseract executable")
|
92 |
+
except ImportError:
|
93 |
+
print("pytesseract not installed")
|
94 |
+
|
95 |
+
# Try to import tesserocr to verify it's working
|
96 |
+
try:
|
97 |
+
import tesserocr
|
98 |
+
print(f"tesserocr imported successfully, version: {tesserocr.tesseract_version()}")
|
99 |
+
except ImportError:
|
100 |
+
print("tesserocr not installed or not working")
|
101 |
+
except Exception as e:
|
102 |
+
print(f"Error importing tesserocr: {e}")
|
103 |
+
|
104 |
+
# Load Gemini API key from environment variable
|
105 |
+
gemini_api_key = os.getenv("GOOGLE_API_KEY")
|
106 |
+
|
107 |
+
# Check if API key is available and print a message if not
|
108 |
+
if not gemini_api_key:
|
109 |
+
print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser may not work.")
|
110 |
+
else:
|
111 |
+
print(f"Found Gemini API key: {gemini_api_key[:5]}...{gemini_api_key[-5:] if len(gemini_api_key) > 10 else ''}")
|
112 |
+
|
113 |
+
# Add the current directory to the Python path
|
114 |
+
sys.path.append(current_dir)
|
115 |
+
|
116 |
+
# Try different import approaches
|
117 |
+
try:
|
118 |
+
# First attempt - standard import
|
119 |
+
from src.main import main
|
120 |
+
except ModuleNotFoundError:
|
121 |
+
try:
|
122 |
+
# Second attempt - adjust path and try again
|
123 |
+
sys.path.append(os.path.join(current_dir, "src"))
|
124 |
+
from src.main import main
|
125 |
+
except ModuleNotFoundError:
|
126 |
+
# Third attempt - create __init__.py if it doesn't exist
|
127 |
+
init_path = os.path.join(current_dir, "src", "__init__.py")
|
128 |
+
if not os.path.exists(init_path):
|
129 |
+
with open(init_path, "w") as f:
|
130 |
+
pass # Create empty __init__.py file
|
131 |
+
# Try import again
|
132 |
+
from src.main import main
|
133 |
+
|
134 |
+
# Call setup function at import time
|
135 |
+
setup_tesseract()
|
136 |
+
|
137 |
+
if __name__ == "__main__":
|
138 |
+
main()
|
build.sh
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
|
3 |
+
# Exit on error
|
4 |
+
set -e
|
5 |
+
|
6 |
+
echo "Starting build process..."
|
7 |
+
|
8 |
+
# Install system dependencies for tesseract
|
9 |
+
echo "Installing Tesseract and dependencies..."
|
10 |
+
apt-get update && apt-get install -y \
|
11 |
+
tesseract-ocr \
|
12 |
+
tesseract-ocr-eng \
|
13 |
+
libtesseract-dev \
|
14 |
+
libleptonica-dev \
|
15 |
+
pkg-config \
|
16 |
+
wget
|
17 |
+
|
18 |
+
# Create tessdata directory
|
19 |
+
TESSDATA_DIR="/usr/share/tesseract-ocr/4.00/tessdata"
|
20 |
+
mkdir -p "$TESSDATA_DIR"
|
21 |
+
|
22 |
+
# Download traineddata files directly from the official repository
|
23 |
+
echo "Downloading Tesseract traineddata files..."
|
24 |
+
wget -O "$TESSDATA_DIR/eng.traineddata" "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
|
25 |
+
wget -O "$TESSDATA_DIR/osd.traineddata" "https://github.com/tesseract-ocr/tessdata/raw/main/osd.traineddata"
|
26 |
+
|
27 |
+
# Set and verify TESSDATA_PREFIX
|
28 |
+
export TESSDATA_PREFIX="$TESSDATA_DIR"
|
29 |
+
echo "TESSDATA_PREFIX=${TESSDATA_PREFIX}" >> /etc/environment
|
30 |
+
|
31 |
+
# Verify tesseract installation and data files
|
32 |
+
echo "Verifying Tesseract installation..."
|
33 |
+
if ! command -v tesseract &> /dev/null; then
|
34 |
+
echo "Tesseract installation failed!"
|
35 |
+
exit 1
|
36 |
+
fi
|
37 |
+
echo "Tesseract version: $(tesseract --version)"
|
38 |
+
|
39 |
+
# Verify traineddata files
|
40 |
+
echo "Verifying traineddata files..."
|
41 |
+
if [ ! -f "$TESSDATA_DIR/eng.traineddata" ]; then
|
42 |
+
echo "eng.traineddata is missing!"
|
43 |
+
exit 1
|
44 |
+
fi
|
45 |
+
if [ ! -f "$TESSDATA_DIR/osd.traineddata" ]; then
|
46 |
+
echo "osd.traineddata is missing!"
|
47 |
+
exit 1
|
48 |
+
fi
|
49 |
+
|
50 |
+
echo "Traineddata files in $TESSDATA_DIR:"
|
51 |
+
ls -l "$TESSDATA_DIR"
|
52 |
+
|
53 |
+
# Test Tesseract functionality
|
54 |
+
echo "Testing Tesseract functionality..."
|
55 |
+
echo "Hello World" > test.png
|
56 |
+
if ! tesseract test.png stdout; then
|
57 |
+
echo "Tesseract test failed!"
|
58 |
+
exit 1
|
59 |
+
fi
|
60 |
+
rm test.png
|
61 |
+
|
62 |
+
# Clean and install tesserocr from source
|
63 |
+
echo "Installing tesserocr from source..."
|
64 |
+
pip uninstall -y tesserocr || true
|
65 |
+
CPPFLAGS="-I/usr/include/tesseract" LDFLAGS="-L/usr/lib/x86_64-linux-gnu/" pip install --no-binary :all: tesserocr
|
66 |
+
|
67 |
+
# Verify tesserocr installation
|
68 |
+
echo "Verifying tesserocr installation..."
|
69 |
+
python3 -c "
|
70 |
+
import tesserocr
|
71 |
+
print(f'tesserocr version: {tesserocr.__version__}')
|
72 |
+
print(f'Available languages: {tesserocr.get_languages()}')
|
73 |
+
print(f'TESSDATA_PREFIX: {tesserocr.get_languages()[1]}')
|
74 |
+
"
|
75 |
+
|
76 |
+
# Install Google Gemini API client
|
77 |
+
echo "Installing Google Gemini API client..."
|
78 |
+
pip install -q -U google-genai
|
79 |
+
echo "Google Gemini API client installed successfully"
|
80 |
+
|
81 |
+
# Install GOT-OCR dependencies
|
82 |
+
echo "Installing GOT-OCR dependencies..."
|
83 |
+
pip install -q -U torch==2.0.1 torchvision==0.15.2 transformers==4.47.0 tiktoken==0.6.0 verovio==4.3.1 accelerate==0.28.0 safetensors==0.4.0
|
84 |
+
echo "GOT-OCR dependencies installed successfully"
|
85 |
+
|
86 |
+
# Install Python dependencies
|
87 |
+
echo "Installing Python dependencies..."
|
88 |
+
pip install -e .
|
89 |
+
|
90 |
+
# Create .env file if it doesn't exist
|
91 |
+
if [ ! -f .env ]; then
|
92 |
+
echo "Creating .env file..."
|
93 |
+
cp .env.example .env || echo "Warning: .env.example not found"
|
94 |
+
fi
|
95 |
+
|
96 |
+
echo "Build process completed successfully!"
|
packages.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
tesseract-ocr
|
2 |
+
tesseract-ocr-eng
|
3 |
+
libtesseract-dev
|
4 |
+
libleptonica-dev
|
5 |
+
imagemagick
|
6 |
+
poppler-utils
|
requirements.txt
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
docling==2.25.0
|
2 |
+
gradio==5.14.0
|
3 |
+
grpcio-status==1.70.0
|
4 |
+
markdown==3.7
|
5 |
+
marker-pdf==1.3.5
|
6 |
+
multiprocess==0.70.16
|
7 |
+
openai==1.61.1
|
8 |
+
pipdeptree==2.25.0
|
9 |
+
pytesseract==0.3.13
|
10 |
+
semchunk==2.2.2
|
11 |
+
Pillow>=9.0.0
|
12 |
+
numpy>=1.21.0
|
13 |
+
# Tesseract dependencies
|
14 |
+
tesseract==0.1.3
|
15 |
+
tesserocr>=2.5.0; platform_system != "Windows" # Only install on non-Windows systems
|
16 |
+
# Additional dependencies for image processing
|
17 |
+
opencv-python-headless>=4.5.0 # Headless version for server environments
|
18 |
+
pdf2image>=1.16.0 # For PDF processing
|
19 |
+
dill==0.3.8 # Downgraded to be compatible with datasets
|
20 |
+
# Gemini API client
|
21 |
+
google-genai>=0.1.0
|
22 |
+
# Environment variables
|
23 |
+
python-dotenv>=1.0.0
|
24 |
+
# Pin pydantic to resolve compatibility issues with gradio
|
25 |
+
pydantic==2.7.1
|
26 |
+
|
27 |
+
# GOT-OCR dependencies
|
28 |
+
torch>=2.0.1
|
29 |
+
torchvision>=0.15.2
|
30 |
+
transformers>=4.37.2,<4.48.0 # Pin to a compatible version for GOT-OCR
|
31 |
+
tiktoken>=0.6.0
|
32 |
+
verovio>=4.3.1
|
33 |
+
accelerate>=0.28.0
|
34 |
+
safetensors>=0.4.0
|
35 |
+
packaging>=21.0 # For version comparison
|
setup.sh
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
|
3 |
+
# Exit on error
|
4 |
+
set -e
|
5 |
+
|
6 |
+
echo "Setting up Tesseract OCR environment..."
|
7 |
+
|
8 |
+
# Install required packages if not already installed
|
9 |
+
if ! command -v tesseract &> /dev/null; then
|
10 |
+
echo "Tesseract not found, attempting to install..."
|
11 |
+
apt-get update -y || echo "Failed to update apt, continuing anyway"
|
12 |
+
apt-get install -y tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev || echo "Failed to install tesseract via apt, continuing anyway"
|
13 |
+
fi
|
14 |
+
|
15 |
+
# Install Python dependencies
|
16 |
+
echo "Installing Python dependencies..."
|
17 |
+
pip install -q -U pytesseract pillow opencv-python-headless pdf2image
|
18 |
+
pip install -q -U google-genai
|
19 |
+
echo "Python dependencies installed successfully"
|
20 |
+
|
21 |
+
# Install GOT-OCR dependencies
|
22 |
+
echo "Installing GOT-OCR dependencies..."
|
23 |
+
pip install -q -U torch==2.0.1 torchvision==0.15.2 transformers==4.47.0 tiktoken==0.6.0 verovio==4.3.1 accelerate==0.28.0 safetensors==0.4.0
|
24 |
+
echo "GOT-OCR dependencies installed successfully"
|
25 |
+
|
26 |
+
# Install tesserocr with pip
|
27 |
+
echo "Installing tesserocr..."
|
28 |
+
pip install -q -U tesserocr || echo "Failed to install tesserocr with pip, trying with specific compiler flags..."
|
29 |
+
|
30 |
+
# If tesserocr installation failed, try with specific compiler flags
|
31 |
+
if ! python -c "import tesserocr" &> /dev/null; then
|
32 |
+
echo "Trying alternative tesserocr installation..."
|
33 |
+
CPPFLAGS="-I/usr/local/include -I/usr/include" LDFLAGS="-L/usr/local/lib -L/usr/lib" pip install -q -U tesserocr || echo "Failed to install tesserocr with compiler flags, continuing anyway"
|
34 |
+
fi
|
35 |
+
|
36 |
+
# Create tessdata directory if it doesn't exist
|
37 |
+
mkdir -p tessdata
|
38 |
+
|
39 |
+
# Set TESSDATA_PREFIX environment variable
|
40 |
+
export TESSDATA_PREFIX="$(pwd)/tessdata"
|
41 |
+
echo "TESSDATA_PREFIX set to: $TESSDATA_PREFIX"
|
42 |
+
|
43 |
+
# Download eng.traineddata if it doesn't exist
|
44 |
+
if [ ! -f "tessdata/eng.traineddata" ]; then
|
45 |
+
echo "Downloading eng.traineddata..."
|
46 |
+
wget -O tessdata/eng.traineddata https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata || \
|
47 |
+
curl -o tessdata/eng.traineddata https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata
|
48 |
+
echo "Downloaded eng.traineddata"
|
49 |
+
else
|
50 |
+
echo "eng.traineddata already exists"
|
51 |
+
fi
|
52 |
+
|
53 |
+
# Try to copy to system locations (may fail in restricted environments)
|
54 |
+
for tessdata_dir in "/usr/share/tesseract-ocr/4.00/tessdata" "/usr/share/tesseract-ocr/tessdata" "/usr/local/share/tessdata"; do
|
55 |
+
if [ -d "$tessdata_dir" ]; then
|
56 |
+
echo "Copying eng.traineddata to $tessdata_dir..."
|
57 |
+
cp -f tessdata/eng.traineddata "$tessdata_dir/" 2>/dev/null || echo "Failed to copy to $tessdata_dir, continuing anyway"
|
58 |
+
fi
|
59 |
+
done
|
60 |
+
|
61 |
+
# Verify Tesseract installation
|
62 |
+
echo "Verifying Tesseract installation..."
|
63 |
+
tesseract --version || echo "Tesseract not found in PATH, but may still be available to Python"
|
64 |
+
|
65 |
+
# Test tesserocr if installed
|
66 |
+
echo "Testing tesserocr..."
|
67 |
+
python -c "import tesserocr; print(f'tesserocr version: {tesserocr.tesseract_version()}')" || echo "tesserocr not working, but may still be able to use pytesseract"
|
68 |
+
|
69 |
+
# Test pytesseract
|
70 |
+
echo "Testing pytesseract..."
|
71 |
+
python -c "import pytesseract; print(f'pytesseract path: {pytesseract.tesseract_cmd}')" || echo "pytesseract not working"
|
72 |
+
|
73 |
+
echo "Setup completed"
|
74 |
+
|
75 |
+
# Add TESSDATA_PREFIX to .env file for persistence
|
76 |
+
echo "TESSDATA_PREFIX=$(pwd)/tessdata" >> .env
|
src/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# This file makes the src directory a Python package
|
src/core/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
"""Core functionality module for document conversion."""
|
src/core/converter.py
ADDED
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import tempfile
|
2 |
+
import logging
|
3 |
+
import time
|
4 |
+
import os
|
5 |
+
from pathlib import Path
|
6 |
+
|
7 |
+
# Use relative imports instead of absolute imports
|
8 |
+
from src.core.parser_factory import ParserFactory
|
9 |
+
|
10 |
+
# Import all parsers to ensure they're registered
|
11 |
+
import parsers
|
12 |
+
|
13 |
+
# Reference to the cancellation flag from ui.py.
# The UI installs a threading.Event here when the cancel button is wired up.
conversion_cancelled = None  # threading.Event supplied by ui.py, or None
# True while convert_file() is executing.
_conversion_in_progress = False


def set_cancellation_flag(flag):
    """Store the threading.Event that ui.py uses to request cancellation."""
    global conversion_cancelled
    conversion_cancelled = flag


def is_conversion_in_progress():
    """Report whether a conversion is currently running."""
    global _conversion_in_progress
    return _conversion_in_progress


def check_cancellation():
    """Return True (and log it) when a cancellation has been requested."""
    flag = conversion_cancelled
    if not (flag and flag.is_set()):
        return False
    logging.info("Cancellation detected in check_cancellation")
    return True


def safe_delete_file(file_path):
    """Delete file_path if it exists, swallowing (but logging) any error."""
    if not file_path or not os.path.exists(file_path):
        return
    try:
        os.unlink(file_path)
    except Exception as e:
        logging.error(f"Error cleaning up temp file {file_path}: {e}")
|
43 |
+
|
44 |
+
def convert_file(file_path, parser_name, ocr_method_name, output_format):
    """
    Convert a file using the specified parser and OCR method.

    The upload is first copied to a freshly named temporary file
    (presumably to avoid non-ASCII filename issues downstream — the
    comment below says "English filename"), then parsed via
    ParserFactory, and the result is written to a second temporary file
    offered for download. The shared ``conversion_cancelled`` event is
    polled at every stage so the UI can abort long conversions.

    Args:
        file_path: Path to the file
        parser_name: Name of the parser to use
        ocr_method_name: Name of the OCR method to use
        output_format: Output format (Markdown, JSON, Text, Document Tags)

    Returns:
        tuple: (content, download_file_path). On error or cancellation,
        content is a human-readable message and download_file_path is None.
    """
    global conversion_cancelled, _conversion_in_progress

    # Set the conversion in progress flag
    _conversion_in_progress = True

    # Temporary file paths to clean up
    temp_input = None
    tmp_path = None

    # Ensure we clean up the flag when we're done
    try:
        if not file_path:
            return "Please upload a file.", None

        # Check for cancellation
        if check_cancellation():
            logging.info("Cancellation detected at start of convert_file")
            return "Conversion cancelled.", None

        # Create a temporary file with English filename
        try:
            original_ext = Path(file_path).suffix
            with tempfile.NamedTemporaryFile(suffix=original_ext, delete=False) as temp_file:
                temp_input = temp_file.name
                # Copy the content of original file to temp file
                with open(file_path, 'rb') as original:
                    # Read in smaller chunks and check for cancellation between chunks
                    chunk_size = 1024 * 1024  # 1MB chunks
                    while True:
                        # Check for cancellation frequently
                        if check_cancellation():
                            logging.info("Cancellation detected during file copy")
                            safe_delete_file(temp_input)
                            return "Conversion cancelled.", None

                        chunk = original.read(chunk_size)
                        if not chunk:
                            break
                        temp_file.write(chunk)
            # From here on, all work happens on the temp copy.
            file_path = temp_input
        except Exception as e:
            safe_delete_file(temp_input)
            return f"Error creating temporary file: {e}", None

        # Check for cancellation again
        if check_cancellation():
            logging.info("Cancellation detected after file preparation")
            safe_delete_file(temp_input)
            return "Conversion cancelled.", None

        content = None
        try:
            # Use the parser factory to parse the document
            start = time.time()

            # Pass the cancellation flag to the parser factory
            content = ParserFactory.parse_document(
                file_path=file_path,
                parser_name=parser_name,
                ocr_method_name=ocr_method_name,
                output_format=output_format.lower(),
                cancellation_flag=conversion_cancelled  # Pass the flag to parsers
            )

            # If content indicates cancellation, return early.
            # NOTE(review): relies on parsers returning this exact sentinel
            # string — keep in sync with parser implementations.
            if content == "Conversion cancelled.":
                logging.info("Parser reported cancellation")
                safe_delete_file(temp_input)
                return content, None

            duration = time.time() - start
            logging.info(f"Processed in {duration:.2f} seconds.")

            # Check for cancellation after processing
            if check_cancellation():
                logging.info("Cancellation detected after processing")
                safe_delete_file(temp_input)
                return "Conversion cancelled.", None

        except Exception as e:
            safe_delete_file(temp_input)
            return f"Error: {e}", None

        # Determine the file extension based on the output format
        if output_format == "Markdown":
            ext = ".md"
        elif output_format == "JSON":
            ext = ".json"
        elif output_format == "Text":
            ext = ".txt"
        elif output_format == "Document Tags":
            ext = ".doctags"
        else:
            # Unknown formats fall back to plain text.
            ext = ".txt"

        # Check for cancellation again
        if check_cancellation():
            logging.info("Cancellation detected before output file creation")
            safe_delete_file(temp_input)
            return "Conversion cancelled.", None

        try:
            # Create a temporary file for download
            with tempfile.NamedTemporaryFile(mode="w", suffix=ext, delete=False, encoding="utf-8") as tmp:
                tmp_path = tmp.name
                # Write in chunks and check for cancellation
                chunk_size = 10000  # characters
                for i in range(0, len(content), chunk_size):
                    # Check for cancellation
                    if check_cancellation():
                        logging.info("Cancellation detected during output file writing")
                        safe_delete_file(tmp_path)
                        safe_delete_file(temp_input)
                        return "Conversion cancelled.", None

                    tmp.write(content[i:i+chunk_size])

            # Clean up the temporary input file
            safe_delete_file(temp_input)
            temp_input = None  # Mark as cleaned up

            return content, tmp_path
        except Exception as e:
            safe_delete_file(tmp_path)
            safe_delete_file(temp_input)
            return f"Error: {e}", None
    finally:
        # Always clean up any remaining temp files. safe_delete_file is a
        # no-op for None, so this is safe after the success path cleared
        # temp_input above (the download file tmp_path is kept on success).
        safe_delete_file(temp_input)
        if check_cancellation() and tmp_path:
            safe_delete_file(tmp_path)

        # Always clear the conversion in progress flag when done
        _conversion_in_progress = False
|
src/core/parser_factory.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Optional, Dict, Any, Union
|
2 |
+
from pathlib import Path
|
3 |
+
import threading
|
4 |
+
import logging
|
5 |
+
import time
|
6 |
+
|
7 |
+
from src.parsers.parser_interface import DocumentParser
|
8 |
+
from src.parsers.parser_registry import ParserRegistry
|
9 |
+
|
10 |
+
|
11 |
+
class ParserFactory:
    """Factory for creating parser instances."""

    @classmethod
    def create_parser(cls, parser_name: str) -> Optional[DocumentParser]:
        """
        Create a parser instance.

        Args:
            parser_name: Name of the parser to create

        Returns:
            An instance of the requested parser or None if not found
        """
        registered_class = ParserRegistry.get_parser_class(parser_name)
        return registered_class() if registered_class else None

    @classmethod
    def parse_document(cls,
                       file_path: Union[str, Path],
                       parser_name: str,
                       ocr_method_name: str,
                       output_format: str = "markdown",
                       cancellation_flag: Optional[threading.Event] = None,
                       **kwargs) -> str:
        """
        Parse a document using the specified parser and OCR method.

        Args:
            file_path: Path to the document
            parser_name: Name of the parser to use
            ocr_method_name: Display name of the OCR method to use
            output_format: Output format (markdown, json, text, document_tags)
            cancellation_flag: Optional flag to check for cancellation
            **kwargs: Additional parser-specific options

        Returns:
            str: The parsed content, or the sentinel "Conversion cancelled."
        """
        def check_cancellation():
            """Return True (and log it) when the shared flag has been set."""
            if cancellation_flag and cancellation_flag.is_set():
                logging.info("Cancellation detected in parser_factory")
                return True
            return False

        # Bail out immediately if a cancellation is already pending.
        if check_cancellation():
            return "Conversion cancelled."

        parser = cls.create_parser(parser_name)
        if not parser:
            raise ValueError(f"Unknown parser: {parser_name}")

        # Translate the display name into the parser's internal OCR id.
        ocr_method_id = ParserRegistry.get_ocr_method_id(parser_name, ocr_method_name)
        if not ocr_method_id:
            raise ValueError(f"Unknown OCR method: {ocr_method_name} for parser {parser_name}")

        # Re-check before the (potentially long) parse starts.
        if check_cancellation():
            return "Conversion cancelled."

        def should_check_cancellation():
            """Function that parsers can call to check if they should check cancellation"""
            return True

        # Hand the cancellation machinery and output format down to the parser.
        kwargs.update(
            cancellation_flag=cancellation_flag,
            check_cancellation=check_cancellation,
            should_check_cancellation=should_check_cancellation,
            output_format=output_format,
        )

        result = parser.parse(file_path, ocr_method=ocr_method_id, **kwargs)

        # One final check after parsing completes.
        return "Conversion cancelled." if check_cancellation() else result
|
src/main.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import parsers # Import all parsers to ensure they're registered
|
2 |
+
|
3 |
+
from src.ui.ui import launch_ui
|
4 |
+
|
5 |
+
|
6 |
+
def main():
    """Entry point: launch the Gradio UI on all interfaces, port 7860."""
    launch_ui(
        server_name="0.0.0.0",
        server_port=7860,
        # Explicitly disable sharing on Hugging Face
        share=False,
    )


if __name__ == "__main__":
    main()
|
src/parsers/__init__.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Parser implementations for document conversion."""
|
2 |
+
|
3 |
+
# Import all parsers to ensure they're registered
|
4 |
+
from src.parsers.docling_parser import DoclingParser
|
5 |
+
from src.parsers.marker_parser import MarkerParser
|
6 |
+
from src.parsers.pypdfium_parser import PyPdfiumParser
|
7 |
+
from src.parsers.gemini_flash_parser import GeminiFlashParser
|
8 |
+
from src.parsers.got_ocr_parser import GotOcrParser
|
9 |
+
|
10 |
+
# You can add new parsers here in the future
|
11 |
+
|
12 |
+
# This file makes the parsers directory a Python package
|
src/parsers/docling_parser.py
ADDED
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
from typing import Dict, List, Optional, Any, Union
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
import shutil
|
6 |
+
|
7 |
+
from src.parsers.parser_interface import DocumentParser
|
8 |
+
from src.parsers.parser_registry import ParserRegistry
|
9 |
+
from docling.document_converter import DocumentConverter, PdfFormatOption
|
10 |
+
from docling.datamodel.base_models import InputFormat
|
11 |
+
from docling.datamodel.pipeline_options import (
|
12 |
+
AcceleratorDevice,
|
13 |
+
AcceleratorOptions,
|
14 |
+
PdfPipelineOptions,
|
15 |
+
)
|
16 |
+
from docling.models.tesseract_ocr_model import TesseractOcrOptions
|
17 |
+
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
|
18 |
+
from docling.models.ocr_mac_model import OcrMacOptions
|
19 |
+
|
20 |
+
|
21 |
+
class DoclingParser(DocumentParser):
    """Parser implementation using Docling."""

    @classmethod
    def get_name(cls) -> str:
        # Display name used by the UI and the parser registry.
        return "Docling"

    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        # Each entry pairs the internal "id" consumed by parse() with a
        # human-readable "name" and default parameters for that engine.
        return [
            {
                "id": "no_ocr",
                "name": "No OCR",
                "default_params": {}
            },
            {
                "id": "easyocr",
                "name": "EasyOCR",
                "default_params": {"languages": ["en"]}
            },
            {
                "id": "easyocr_cpu",
                "name": "EasyOCR (CPU only)",
                "default_params": {"languages": ["en"], "use_gpu": False}
            },
            {
                "id": "tesseract",
                "name": "Tesseract",
                "default_params": {}
            },
            {
                "id": "tesseract_cli",
                "name": "Tesseract CLI",
                "default_params": {}
            },
            {
                "id": "ocrmac",
                "name": "ocrmac",
                "default_params": {}
            },
            {
                "id": "full_force_ocr",
                "name": "Full Force OCR",
                "default_params": {}
            }
        ]

    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """Parse a document using Docling.

        Args:
            file_path: Path to the document to convert.
            ocr_method: One of the ids from get_supported_ocr_methods().
            **kwargs: "languages" for the EasyOCR variants; "output_format"
                ("markdown" default, "json", "text", "document_tags").

        Returns:
            str: The converted document in the requested output format.
        """
        # Special case for full force OCR
        if ocr_method == "full_force_ocr":
            return self._apply_full_force_ocr(file_path)

        # Regular Docling parsing
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True

        # Configure OCR based on the method
        if ocr_method == "no_ocr":
            pipeline_options.do_ocr = False
        elif ocr_method == "easyocr":
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options.lang = kwargs.get("languages", ["en"])
            pipeline_options.accelerator_options = AcceleratorOptions(
                num_threads=4, device=AcceleratorDevice.AUTO
            )
        elif ocr_method == "easyocr_cpu":
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options.lang = kwargs.get("languages", ["en"])
            pipeline_options.ocr_options.use_gpu = False
        elif ocr_method == "tesseract":
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options = TesseractOcrOptions()
        elif ocr_method == "tesseract_cli":
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options = TesseractCliOcrOptions()
        elif ocr_method == "ocrmac":
            # macOS-only OCR backend.
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options = OcrMacOptions()
        # NOTE(review): an unrecognised ocr_method falls through with the
        # default pipeline OCR settings — confirm this is intended.

        # Create the converter
        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options
                )
            }
        )

        # Convert the document
        result = converter.convert(Path(file_path))
        doc = result.document

        # Return the content in the requested format
        output_format = kwargs.get("output_format", "markdown")
        if output_format.lower() == "json":
            return json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
        elif output_format.lower() == "text":
            return doc.export_to_text()
        elif output_format.lower() == "document_tags":
            return doc.export_to_document_tokens()
        else:
            return doc.export_to_markdown()

    def _apply_full_force_ocr(self, file_path: Union[str, Path]) -> str:
        """Apply full force OCR to a document.

        Runs Tesseract (CLI backend) over every page, ignoring any existing
        text layer, and falls back to the regular "tesseract_cli" path if
        that fails.
        """
        input_doc = Path(file_path)
        file_extension = input_doc.suffix.lower()

        # Debug information
        print(f"Applying full force OCR to file: {input_doc} (type: {file_extension})")

        # Basic pipeline setup
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = True
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True

        # Find tesseract executable. Logged for diagnostics only — the path
        # is not passed into the docling options below.
        tesseract_path = shutil.which("tesseract") or "/usr/bin/tesseract"
        print(f"Using tesseract at: {tesseract_path}")

        # Configure OCR options: CLI-based Tesseract, with OCR forced on
        # every page even where a text layer already exists.
        ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
        pipeline_options.ocr_options = ocr_options

        # Set up format options based on file type
        format_options = {
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }

        # Handle image files
        if file_extension in ['.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp']:
            print(f"Processing as image file: {file_extension}")
            format_options[InputFormat.IMAGE] = PdfFormatOption(pipeline_options=pipeline_options)

        # Try full force OCR; on any failure, fall back to the regular
        # tesseract_cli parse path.
        try:
            converter = DocumentConverter(format_options=format_options)
            result = converter.convert(input_doc)
            return result.document.export_to_markdown()
        except Exception as e:
            print(f"Error with standard OCR: {e}")
            print(f"Attempting fallback to tesseract_cli OCR...")
            return self.parse(file_path, ocr_method="tesseract_cli")


# Register the parser with the registry
ParserRegistry.register(DoclingParser)
|
src/parsers/gemini_flash_parser.py
ADDED
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
from typing import Dict, List, Optional, Any, Union
|
3 |
+
import os
|
4 |
+
import json
|
5 |
+
import tempfile
|
6 |
+
import base64
|
7 |
+
from PIL import Image
|
8 |
+
import io
|
9 |
+
|
10 |
+
from src.parsers.parser_interface import DocumentParser
|
11 |
+
from src.parsers.parser_registry import ParserRegistry
|
12 |
+
|
13 |
+
# Import the Google Gemini API client
|
14 |
+
try:
|
15 |
+
import google.generativeai as genai
|
16 |
+
GEMINI_AVAILABLE = True
|
17 |
+
except ImportError:
|
18 |
+
GEMINI_AVAILABLE = False
|
19 |
+
|
20 |
+
# Load API key from environment variable
api_key = os.getenv("GOOGLE_API_KEY")

# Warn at import time so a missing key is visible in the logs early.
if not api_key:
    print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser may not work.")


class GeminiFlashParser(DocumentParser):
    """Parser that uses Google's Gemini Flash 2.0 to convert documents to markdown."""

    @classmethod
    def get_name(cls) -> str:
        return "Gemini Flash"

    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        # Gemini performs recognition itself, so no separate OCR step is offered.
        return [
            {
                "id": "none",
                "name": "None",
                "default_params": {}
            }
        ]

    @classmethod
    def get_description(cls) -> str:
        return "Gemini Flash 2.0 parser for converting documents and images to markdown"

    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """Parse a document using Gemini Flash 2.0.

        Args:
            file_path: Path to the document or image to convert.
            ocr_method: Ignored — Gemini handles recognition itself.
            **kwargs: Extra options from the parser factory (unused here).

        Returns:
            str: Markdown produced by the model, or a markdown-formatted
            error report if the request fails.

        Raises:
            ImportError: If the google.generativeai client is not installed.
            ValueError: If the GOOGLE_API_KEY environment variable is unset.
        """
        if not GEMINI_AVAILABLE:
            # FIX: the module imported above is `google.generativeai`, which is
            # shipped by the `google-generativeai` package — the previous
            # message pointed users at the wrong package name (`google-genai`).
            raise ImportError(
                "The Google Gemini API client is not installed. "
                "Please install it with 'pip install google-generativeai'."
            )

        # Use the globally loaded API key
        if not api_key:
            raise ValueError(
                "GOOGLE_API_KEY environment variable is not set. "
                "Please set it to your Gemini API key."
            )

        try:
            # Configure the Gemini API with the API key
            genai.configure(api_key=api_key)

            # Determine file type based on extension
            file_path = Path(file_path)
            file_extension = file_path.suffix.lower()

            # Read the file content as raw bytes for the multipart request
            file_content = file_path.read_bytes()

            # Determine MIME type based on file extension
            mime_type = self._get_mime_type(file_extension)

            # Create the model handle
            model = genai.GenerativeModel('gemini-2.0-flash')

            # Set up the conversion prompt
            prompt = """
            Convert this document to markdown format.
            Preserve the structure, headings, lists, tables, and formatting as much as possible.
            For images, include a brief description in markdown image syntax.
            """

            # Generate the response: prompt text plus the raw document blob
            response = model.generate_content(
                contents=[
                    prompt,
                    {
                        "mime_type": mime_type,
                        "data": file_content
                    }
                ],
                generation_config={
                    "temperature": 0.2,  # low temperature for faithful transcription
                    "top_p": 0.95,
                    "top_k": 40,
                    "max_output_tokens": 8192,
                }
            )

            # Extract the markdown text from the response
            markdown_text = response.text

            return markdown_text

        except Exception as e:
            # Surface API/parse failures as a markdown error page rather than
            # crashing the UI.
            error_message = f"Error parsing document with Gemini Flash: {str(e)}"
            print(error_message)
            return f"# Error\n\n{error_message}\n\nPlease check your API key and try again."

    def _get_mime_type(self, file_extension: str) -> str:
        """Get the MIME type for a file extension (octet-stream fallback)."""
        mime_types = {
            ".pdf": "application/pdf",
            ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            ".doc": "application/msword",
            ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            ".ppt": "application/vnd.ms-powerpoint",
            ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            ".xls": "application/vnd.ms-excel",
            ".txt": "text/plain",
            ".md": "text/markdown",
            ".html": "text/html",
            ".htm": "text/html",
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".png": "image/png",
            ".gif": "image/gif",
            ".bmp": "image/bmp",
            ".tiff": "image/tiff",
            ".tif": "image/tiff",
        }

        return mime_types.get(file_extension, "application/octet-stream")


# Register the parser with the registry
if GEMINI_AVAILABLE:
    ParserRegistry.register(GeminiFlashParser)
else:
    print("Gemini Flash parser not registered: google-generativeai package not installed")
|
src/parsers/got_ocr_parser.py
ADDED
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
from typing import Dict, List, Optional, Any, Union
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
import tempfile
|
6 |
+
import logging
|
7 |
+
|
8 |
+
from src.parsers.parser_interface import DocumentParser
|
9 |
+
from src.parsers.parser_registry import ParserRegistry
|
10 |
+
|
11 |
+
# Configure logging
|
12 |
+
logger = logging.getLogger(__name__)
|
13 |
+
|
14 |
+
# Check if required packages are installed
|
15 |
+
try:
|
16 |
+
import torch
|
17 |
+
import transformers
|
18 |
+
from transformers import AutoModel, AutoTokenizer
|
19 |
+
|
20 |
+
# Check if transformers version is compatible
|
21 |
+
from packaging import version
|
22 |
+
if version.parse(transformers.__version__) >= version.parse("4.48.0"):
|
23 |
+
logger.warning(
|
24 |
+
f"Transformers version {transformers.__version__} may not be compatible with GOT-OCR. "
|
25 |
+
"Consider downgrading to version <4.48.0"
|
26 |
+
)
|
27 |
+
|
28 |
+
GOT_AVAILABLE = True
|
29 |
+
except ImportError:
|
30 |
+
GOT_AVAILABLE = False
|
31 |
+
logger.warning("GOT-OCR dependencies not installed. The parser will not be available.")
|
32 |
+
|
33 |
+
class GotOcrParser(DocumentParser):
    """Parser implementation using GOT-OCR 2.0.

    CUDA-only (see parse()) and restricted to JPG/PNG inputs. The model and
    tokenizer are cached at class level so repeated parses reuse the loaded
    weights.
    """

    # Class-level cache: None until _load_model() has run.
    _model = None
    _tokenizer = None

    @classmethod
    def get_name(cls) -> str:
        # Display name; the suffix warns users about the format restriction.
        return "GOT-OCR (jpg,png only)"

    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        # "format" maps to GOT's formatted mode; anything else uses plain
        # OCR (see the ocr_type mapping in parse()).
        return [
            {
                "id": "plain",
                "name": "Plain Text",
                "default_params": {}
            },
            {
                "id": "format",
                "name": "Formatted Text",
                "default_params": {}
            }
        ]

    @classmethod
    def _load_model(cls):
        """Load the GOT-OCR model and tokenizer if not already loaded.

        Raises:
            RuntimeError: If downloading or initialising the model fails.
        """
        if cls._model is None or cls._tokenizer is None:
            try:
                logger.info("Loading GOT-OCR model and tokenizer...")
                cls._tokenizer = AutoTokenizer.from_pretrained(
                    'stepfun-ai/GOT-OCR2_0',
                    trust_remote_code=True
                )
                cls._model = AutoModel.from_pretrained(
                    'stepfun-ai/GOT-OCR2_0',
                    trust_remote_code=True,
                    low_cpu_mem_usage=True,
                    device_map='cuda',
                    use_safetensors=True,
                    pad_token_id=cls._tokenizer.eos_token_id
                )
                # Eval mode and explicit .cuda() so inference runs on the GPU.
                cls._model = cls._model.eval().cuda()
                logger.info("GOT-OCR model loaded successfully")
            except Exception as e:
                # Reset the cache so a later call can retry the load cleanly.
                cls._model = None
                cls._tokenizer = None
                logger.error(f"Failed to load GOT-OCR model: {str(e)}")
                raise RuntimeError(f"Failed to load GOT-OCR model: {str(e)}")

    @classmethod
    def release_model(cls):
        """Release the model from memory."""
        if cls._model is not None:
            del cls._model
            cls._model = None
        if cls._tokenizer is not None:
            del cls._tokenizer
            cls._tokenizer = None
        if torch.cuda.is_available():
            # Return cached CUDA allocations to the driver.
            torch.cuda.empty_cache()
        logger.info("GOT-OCR model released from memory")

    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """Parse a document using GOT-OCR 2.0.

        Args:
            file_path: Path to a .jpg/.jpeg/.png image.
            ocr_method: "format" for formatted output; any other value uses
                plain OCR mode.
            **kwargs: "output_format" ("markdown" default, "json", "text",
                "document_tags").

        Returns:
            str: The OCR result in the requested output format.

        Raises:
            ImportError: If torch/transformers are not installed.
            ValueError: If the file extension is not JPG/PNG.
            RuntimeError: If CUDA is unavailable, the GPU runs out of
                memory, or inference fails.
        """
        if not GOT_AVAILABLE:
            raise ImportError(
                "GOT-OCR dependencies not installed. Please install required packages: "
                "torch, transformers, tiktoken, verovio, accelerate"
            )

        # Check if CUDA is available
        if not torch.cuda.is_available():
            raise RuntimeError("GOT-OCR requires CUDA. CPU-only mode is not supported.")

        # Check file extension
        file_path = Path(file_path)
        if file_path.suffix.lower() not in ['.jpg', '.jpeg', '.png']:
            raise ValueError(
                "GOT-OCR only supports JPG and PNG formats. "
                f"Received file with extension: {file_path.suffix}"
            )

        # Determine OCR type based on method
        ocr_type = "format" if ocr_method == "format" else "ocr"

        try:
            # Load the model (cached after the first call)
            self._load_model()

            # Use the model's chat method as shown in the documentation
            logger.info(f"Processing image with GOT-OCR: {file_path}")
            result = self._model.chat(
                self._tokenizer,
                str(file_path),
                ocr_type=ocr_type
            )

            # Format the output based on the requested format
            output_format = kwargs.get("output_format", "markdown").lower()
            if output_format == "json":
                return json.dumps({"content": result}, ensure_ascii=False, indent=2)
            elif output_format == "text":
                # Simple markdown to text conversion
                return result.replace("#", "").replace("*", "").replace("_", "")
            elif output_format == "document_tags":
                return f"<doc>\n{result}\n</doc>"
            else:
                return result

        except torch.cuda.OutOfMemoryError:
            self.release_model()  # Release memory
            logger.error("GPU out of memory while processing with GOT-OCR")
            raise RuntimeError(
                "GPU out of memory while processing with GOT-OCR. "
                "Try using a smaller image or a different parser."
            )
        except AttributeError as e:
            # Known incompatibility: transformers >= 4.48 removed APIs the
            # remote GOT-OCR code relies on (get_max_length).
            if "get_max_length" in str(e):
                logger.error(f"Transformers version compatibility error: {str(e)}")
                self.release_model()  # Release memory
                raise RuntimeError(
                    "Transformers version compatibility error with GOT-OCR. "
                    "Please downgrade transformers to version <4.48.0. "
                    f"Error: {str(e)}"
                )
            else:
                logger.error(f"Error processing document with GOT-OCR: {str(e)}")
                raise RuntimeError(f"Error processing document with GOT-OCR: {str(e)}")
        except Exception as e:
            logger.error(f"Error processing document with GOT-OCR: {str(e)}")
            raise RuntimeError(f"Error processing document with GOT-OCR: {str(e)}")

# Register the parser with the registry if GOT is available
if GOT_AVAILABLE:
    ParserRegistry.register(GotOcrParser)
    logger.info("GOT-OCR parser registered successfully")
else:
    logger.warning("GOT-OCR parser not registered: required dependencies not installed")
|
src/parsers/marker_parser.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
from typing import Dict, List, Optional, Any, Union
|
3 |
+
import subprocess
|
4 |
+
import tempfile
|
5 |
+
import os
|
6 |
+
import json
|
7 |
+
|
8 |
+
from src.parsers.parser_interface import DocumentParser
|
9 |
+
from src.parsers.parser_registry import ParserRegistry
|
10 |
+
from marker.converters.pdf import PdfConverter
|
11 |
+
from marker.models import create_model_dict
|
12 |
+
from marker.output import text_from_rendered
|
13 |
+
|
14 |
+
|
15 |
+
class MarkerParser(DocumentParser):
    """Parser implementation using Marker."""

    @classmethod
    def get_name(cls) -> str:
        return "Marker"

    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        # Marker either trusts the embedded text layer or forces OCR on every page.
        return [
            {"id": "no_ocr", "name": "No OCR", "default_params": {}},
            {"id": "force_ocr", "name": "Force OCR", "default_params": {}},
        ]

    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """Convert a document with Marker and return it in the requested format.

        Args:
            file_path: Path to the document to convert.
            ocr_method: OCR method id; "force_ocr" enables forced OCR.
            **kwargs: Supports ``output_format`` ("markdown", "json", "text",
                or "document_tags"; defaults to "markdown").
        """
        converter = PdfConverter(
            artifact_dict=create_model_dict(),
            config={"force_ocr": ocr_method == "force_ocr"},
        )
        rendered = converter(str(file_path))
        content, _, _ = text_from_rendered(rendered)

        # Shape the extracted text according to the requested output format.
        fmt = kwargs.get("output_format", "markdown").lower()
        if fmt == "json":
            return json.dumps({"content": content}, ensure_ascii=False, indent=2)
        if fmt == "text":
            # Crude markdown -> plain-text strip.
            return content.replace("#", "").replace("*", "").replace("_", "")
        if fmt == "document_tags":
            return f"<doc>\n{content}\n</doc>"
        return content


# Register the parser with the registry
ParserRegistry.register(MarkerParser)
|
src/parsers/parser_interface.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from abc import ABC, abstractmethod
|
2 |
+
from pathlib import Path
|
3 |
+
from typing import Dict, List, Optional, Any, Union
|
4 |
+
|
5 |
+
|
6 |
+
class DocumentParser(ABC):
    """Abstract base class that every document parser must implement."""

    @abstractmethod
    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """Parse *file_path* and return the extracted content.

        Args:
            file_path: Path to the document on disk.
            ocr_method: Identifier of the OCR method to apply, when supported.
            **kwargs: Extra parser-specific options (e.g. ``output_format``).

        Returns:
            str: The parsed document content.
        """

    @classmethod
    @abstractmethod
    def get_name(cls) -> str:
        """Return the human-readable display name of this parser."""

    @classmethod
    @abstractmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        """Describe the OCR methods this parser supports.

        Returns:
            A list of dictionaries, each with:
            - ``id``: unique identifier for the OCR method
            - ``name``: display name for the OCR method
            - ``default_params``: default parameters for this OCR method
        """

    @classmethod
    def get_description(cls) -> str:
        """Return a short description of this parser."""
        return f"{cls.get_name()} document parser"
|
src/parsers/parser_registry.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import logging

from typing import Dict, List, Type, Any, Optional

from src.parsers.parser_interface import DocumentParser
3 |
+
|
4 |
+
|
5 |
+
class ParserRegistry:
    """Central registry for all document parsers in the system.

    Parser modules self-register at import time via :meth:`register`; UI code
    then queries the registry for parser names and their OCR options.

    Annotations use string forward references so the class object itself has
    no hard def-time dependency on the interface module.
    """

    # Maps parser display name -> parser class.
    _parsers: Dict[str, Type["DocumentParser"]] = {}

    @classmethod
    def register(cls, parser_class: Type["DocumentParser"]) -> None:
        """Register a parser with the system.

        Args:
            parser_class: The parser class to register.
        """
        parser_name = parser_class.get_name()
        cls._parsers[parser_name] = parser_class
        # Use logging (like the rest of the codebase) rather than print, so
        # the message honors the application's logging configuration.
        logging.getLogger(__name__).info("Registered parser: %s", parser_name)

    @classmethod
    def get_available_parsers(cls) -> Dict[str, Type["DocumentParser"]]:
        """Return all registered parsers, keyed by display name."""
        return cls._parsers

    @classmethod
    def get_parser_class(cls, name: str) -> Optional[Type["DocumentParser"]]:
        """Get a specific parser class by name, or ``None`` if unknown."""
        return cls._parsers.get(name)

    @classmethod
    def get_parser_names(cls) -> List[str]:
        """Get a list of all registered parser names."""
        return list(cls._parsers.keys())

    @classmethod
    def get_ocr_options(cls, parser_name: str) -> List[str]:
        """Get OCR methods supported by a parser.

        Args:
            parser_name: Name of the parser.

        Returns:
            List of OCR method display names (empty if the parser is unknown).
        """
        parser_class = cls.get_parser_class(parser_name)
        if not parser_class:
            return []
        return [method["name"] for method in parser_class.get_supported_ocr_methods()]

    @classmethod
    def get_ocr_method_id(cls, parser_name: str, ocr_display_name: str) -> Optional[str]:
        """Get the internal ID for an OCR method based on its display name.

        Args:
            parser_name: Name of the parser.
            ocr_display_name: Display name of the OCR method.

        Returns:
            Internal ID of the OCR method, or ``None`` if not found.
        """
        parser_class = cls.get_parser_class(parser_name)
        if not parser_class:
            return None
        for method in parser_class.get_supported_ocr_methods():
            if method["name"] == ocr_display_name:
                return method["id"]
        return None
|
src/parsers/pypdfium_parser.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
from typing import Dict, List, Optional, Any, Union
|
3 |
+
import json
|
4 |
+
import pypdfium2 as pdfium
|
5 |
+
|
6 |
+
from src.parsers.parser_interface import DocumentParser
|
7 |
+
from src.parsers.parser_registry import ParserRegistry
|
8 |
+
from docling.document_converter import DocumentConverter, PdfFormatOption
|
9 |
+
from docling.datamodel.base_models import InputFormat
|
10 |
+
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
11 |
+
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
12 |
+
|
13 |
+
|
14 |
+
class PyPdfiumParser(DocumentParser):
    """Parser implementation using PyPdfium."""

    @classmethod
    def get_name(cls) -> str:
        return "PyPdfium"

    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        return [
            {"id": "no_ocr", "name": "No OCR", "default_params": {}},
            {"id": "easyocr", "name": "EasyOCR", "default_params": {"languages": ["en"]}},
        ]

    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """Convert a document via docling's PyPdfium backend.

        Args:
            file_path: Path to the document to convert.
            ocr_method: "easyocr" enables OCR; anything else disables it.
            **kwargs: Supports ``languages`` (OCR language list) and
                ``output_format`` ("markdown", "json", "text", "document_tags").
        """
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True

        # OCR is only enabled for the EasyOCR method.
        pipeline_options.do_ocr = ocr_method == "easyocr"
        if pipeline_options.do_ocr and "languages" in kwargs:
            pipeline_options.ocr_options.lang = kwargs["languages"]

        # Build a converter wired to the PyPdfium backend.
        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                    backend=PyPdfiumDocumentBackend,
                )
            }
        )

        doc = converter.convert(Path(file_path)).document

        # Export in the requested format (markdown by default).
        fmt = kwargs.get("output_format", "markdown").lower()
        if fmt == "json":
            return json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
        if fmt == "text":
            return doc.export_to_text()
        if fmt == "document_tags":
            return doc.export_to_document_tokens()
        return doc.export_to_markdown()


# Register the parser with the registry
ParserRegistry.register(PyPdfiumParser)
|
src/services/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
"""External services module for the application."""
|
src/services/docling_chat.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import openai
|
2 |
+
import os
|
3 |
+
|
4 |
+
# Load API key from environment variable.
# NOTE(review): this relies on the module-level `openai.api_key` shim while the
# calls below use the v1-style `openai.chat.completions` API — confirm the
# installed openai SDK version supports both.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Check if API key is available and print a message if not.
if not openai.api_key:
    print("Warning: OPENAI_API_KEY environment variable not found. Chat functionality may not work.")
|
10 |
+
|
11 |
+
def chat_with_document(message, history, document_text_state):
    """Append the user message, query OpenAI with the document as context,
    and return the updated chat history (twice, for the two Gradio outputs).

    Args:
        message: The user's new chat message.
        history: Existing message list in OpenAI role/content format (or None).
        document_text_state: The parsed document text used as context.
    """
    if not history:
        history = []
    history.append({"role": "user", "content": message})

    # The parsed document is injected through a system message.
    context = f"Document: {document_text_state}\n\nUser: {message}"

    # API failures must not crash the UI; report them as the assistant reply.
    try:
        completion = openai.chat.completions.create(
            model="gpt-4o-2024-08-06",
            messages=[{"role": "system", "content": context}] + history
        )
        reply = completion.choices[0].message.content
    except Exception as e:
        print(f"OpenAI API error: {str(e)}")
        reply = f"Error: Could not generate response. Please check your OpenAI API key. Details: {str(e)}"

    history.append({"role": "assistant", "content": reply})
    return history, history
|
src/ui/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
"""User interface module for the application."""
|
src/ui/ui.py
ADDED
@@ -0,0 +1,301 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import markdown
|
3 |
+
import threading
|
4 |
+
import time
|
5 |
+
import logging
|
6 |
+
from pathlib import Path
|
7 |
+
from src.core.converter import convert_file, set_cancellation_flag, is_conversion_in_progress
|
8 |
+
from src.services.docling_chat import chat_with_document
|
9 |
+
from src.parsers.parser_registry import ParserRegistry
|
10 |
+
|
11 |
+
# Configure logging for the UI module.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Module-wide event used to signal that the in-flight conversion should stop.
# Both the UI handlers below and the converter module observe this flag.
conversion_cancelled = threading.Event()

# Hand the same Event to the converter module so it can poll it mid-conversion.
set_cancellation_flag(conversion_cancelled)
|
20 |
+
|
21 |
+
# Background thread that watches the cancellation flag while a conversion runs.
def monitor_cancellation():
    """Background thread to monitor cancellation and update UI if needed.

    Polls every 100 ms while a conversion is in progress. Note: this thread
    only logs when cancellation is detected — the actual stop is handled by
    the converter and by handle_convert's wait loop.
    """
    logger.info("Starting cancellation monitor thread")
    while is_conversion_in_progress():
        if conversion_cancelled.is_set():
            logger.info("Cancellation detected by monitor thread")
        time.sleep(0.1)  # Check every 100ms
    logger.info("Cancellation monitor thread ending")
|
30 |
+
|
31 |
+
def validate_file_for_parser(file_path, parser_name):
    """Check whether *file_path* has an extension the chosen parser accepts.

    Args:
        file_path: Path of the uploaded file (may be falsy when none selected).
        parser_name: Display name of the selected parser provider.

    Returns:
        Tuple ``(is_valid, error_message)``; the message is empty when the
        file is acceptable or when no file has been selected yet.
    """
    if not file_path:
        # Nothing uploaded yet — nothing to validate.
        return True, ""

    if "GOT-OCR" in parser_name:
        # GOT-OCR works on images only.
        allowed = {'.jpg', '.jpeg', '.png'}
        if Path(file_path).suffix.lower() not in allowed:
            return False, "GOT-OCR only supports JPG and PNG formats."
    return True, ""
|
41 |
+
|
42 |
+
def format_markdown_content(content):
    """Render markdown *content* to HTML; falsy values pass through unchanged."""
    if not content:
        return content
    # The markdown library does the conversion; 'tables' enables pipe tables.
    return markdown.markdown(str(content), extensions=['tables'])
|
49 |
+
|
50 |
+
# Runs the (potentially slow) conversion off the UI thread.
def run_conversion_thread(file_path, parser_name, ocr_method_name, output_format):
    """Run the conversion in a separate daemon thread.

    Args:
        file_path: Path of the uploaded document.
        parser_name: Display name of the parser provider to use.
        ocr_method_name: Display name of the selected OCR method.
        output_format: Requested output format (e.g. "Markdown").

    Returns:
        Tuple ``(thread, results)``: the already-started daemon thread and a
        dict that the worker mutates in place with keys "content",
        "download_file" and "error" (all initially None).
    """
    global conversion_cancelled

    # Reset the cancellation flag so a previous cancel does not abort this run.
    conversion_cancelled.clear()

    # Shared container the worker closure writes its outcome into.
    results = {"content": None, "download_file": None, "error": None}

    def conversion_worker():
        # Any exception is captured into results["error"] rather than raised,
        # since this runs on a background thread with no caller to catch it.
        try:
            content, download_file = convert_file(file_path, parser_name, ocr_method_name, output_format)
            results["content"] = content
            results["download_file"] = download_file
        except Exception as e:
            logger.error(f"Error during conversion: {str(e)}")
            results["error"] = str(e)

    # Daemon thread so a hung conversion cannot keep the process alive on exit.
    thread = threading.Thread(target=conversion_worker)
    thread.daemon = True
    thread.start()

    return thread, results
|
76 |
+
|
77 |
+
def handle_convert(file_path, parser_name, ocr_method_name, output_format, is_cancelled):
    """Handle file conversion for the Convert button.

    Validates the input, runs the conversion on a background thread while
    polling for cancellation, and returns the values for the Gradio outputs.

    NOTE(review): this returns a 6-tuple, but the `.then(...)` wiring in
    create_ui lists only 5 outputs — confirm which element the extra
    gr.update / None is meant for.
    """
    global conversion_cancelled

    # Check if we should cancel before starting.
    if is_cancelled:
        logger.info("Conversion cancelled before starting")
        return "Conversion cancelled.", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), None

    # Validate file type for the selected parser.
    is_valid, error_msg = validate_file_for_parser(file_path, parser_name)
    if not is_valid:
        logger.error(f"File validation error: {error_msg}")
        return f"Error: {error_msg}", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), None

    logger.info("Starting conversion with cancellation flag cleared")

    # Start the conversion in a separate thread; `results` is filled in place.
    thread, results = run_conversion_thread(file_path, parser_name, ocr_method_name, output_format)

    # Start the monitoring thread (log-only observer of the cancel flag).
    monitor_thread = threading.Thread(target=monitor_cancellation)
    monitor_thread.daemon = True
    monitor_thread.start()

    # Wait for the worker to complete or for the user to cancel.
    while thread.is_alive():
        # Check if cancellation was requested.
        if conversion_cancelled.is_set():
            logger.info("Cancellation detected, waiting for thread to finish")
            # Give the thread a chance to clean up.
            thread.join(timeout=0.5)
            if thread.is_alive():
                logger.warning("Thread did not finish within timeout")
            return "Conversion cancelled.", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), None

        # Sleep briefly to avoid busy waiting.
        time.sleep(0.1)

    # Thread has completed, check results.
    if results["error"]:
        return f"Error: {results['error']}", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), None

    content = results["content"]
    download_file = results["download_file"]

    # The converter itself may also report cancellation via its return value.
    if content == "Conversion cancelled.":
        logger.info("Converter returned cancellation message")
        return content, None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), None

    # Format the content and wrap it in the scrollable container.
    formatted_content = format_markdown_content(str(content))
    html_output = f"<div class='output-container'>{formatted_content}</div>"

    logger.info("Conversion completed successfully")
    return html_output, download_file, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), None
|
134 |
+
|
135 |
+
def create_ui():
    """Build and return the Gradio Blocks app (convert tab + chat tab)."""
    with gr.Blocks(css="""
        /* Simple output container with only one scrollbar */
        .output-container {
            max-height: 420px; /* Changed from 600px to 70% of original height */
            overflow-y: auto;
            border: 1px solid #ddd; /* Added border for better visual definition */
            padding: 10px; /* Added padding for better content spacing */
        }

        /* Hide any scrollbars from parent containers */
        .gradio-container .prose {
            overflow: visible;
        }

        .processing-controls {
            display: flex;
            justify-content: center;
            gap: 10px;
            margin-top: 10px;
        }

        /* Add margin above the provider/OCR options row */
        .provider-options-row {
            margin-top: 15px;
            margin-bottom: 15px;
        }
    """) as demo:
        gr.Markdown("Markit: Convert any documents to Markdown")

        # State to track if cancellation is requested.
        cancel_requested = gr.State(False)
        # State to store the conversion thread.
        conversion_thread = gr.State(None)
        # State to store the output format (fixed to Markdown).
        output_format_state = gr.State("Markdown")

        with gr.Tabs():
            with gr.Tab("Upload and Convert"):
                # File input first.
                file_input = gr.File(label="Upload PDF", type="filepath")

                # Provider and OCR options below the file input.
                with gr.Row(elem_classes=["provider-options-row"]):
                    with gr.Column(scale=1):
                        parser_names = ParserRegistry.get_parser_names()
                        default_parser = parser_names[0] if parser_names else "PyPdfium"

                        provider_dropdown = gr.Dropdown(
                            label="Provider",
                            choices=parser_names,
                            value=default_parser,
                            interactive=True
                        )
                    with gr.Column(scale=1):
                        default_ocr_options = ParserRegistry.get_ocr_options(default_parser)
                        default_ocr = default_ocr_options[0] if default_ocr_options else "No OCR"

                        ocr_dropdown = gr.Dropdown(
                            label="OCR Options",
                            choices=default_ocr_options,
                            value=default_ocr,
                            interactive=True
                        )

                # Simple output container with just one scrollbar.
                file_display = gr.HTML(
                    value="<div class='output-container'></div>",
                    label="Converted Content"
                )

                file_download = gr.File(label="Download File")

                # Processing controls row.
                with gr.Row(elem_classes=["processing-controls"]):
                    convert_button = gr.Button("Convert", variant="primary")
                    cancel_button = gr.Button("Cancel", variant="stop", visible=False)

            with gr.Tab("Chat with Document"):
                document_text_state = gr.State("")
                chatbot = gr.Chatbot(label="Chat", type="messages")
                text_input = gr.Textbox(placeholder="Type here...")
                clear_btn = gr.Button("Clear")

        # When the provider changes, swap the OCR options; GOT-OCR gets its own
        # fixed "Plain Text"/"Formatted Text" pair instead of registry options.
        provider_dropdown.change(
            lambda p: gr.Dropdown(
                choices=["Plain Text", "Formatted Text"] if "GOT-OCR" in p else ParserRegistry.get_ocr_options(p),
                value="Plain Text" if "GOT-OCR" in p else (ParserRegistry.get_ocr_options(p)[0] if ParserRegistry.get_ocr_options(p) else None)
            ),
            inputs=[provider_dropdown],
            outputs=[ocr_dropdown]
        )

        # Reset cancel flag when starting conversion.
        def start_conversion():
            global conversion_cancelled
            conversion_cancelled.clear()
            logger.info("Starting conversion with cancellation flag cleared")
            return gr.update(visible=False), gr.update(visible=True), False

        # Set cancel flag and try to join the worker when Cancel is clicked.
        def request_cancellation(thread):
            global conversion_cancelled
            conversion_cancelled.set()
            logger.info("Cancel button clicked, cancellation flag set")

            # Try to join the thread with a timeout.
            if thread is not None:
                logger.info(f"Attempting to join conversion thread: {thread}")
                thread.join(timeout=0.5)
                if thread.is_alive():
                    logger.warning("Thread did not finish within timeout")

            # Add immediate feedback to the user.
            return gr.update(visible=True), gr.update(visible=False), True, None

        # Start conversion sequence: toggle buttons first, then convert.
        # NOTE(review): handle_convert returns 6 values but only 5 outputs are
        # wired here — confirm the intended output list.
        convert_button.click(
            fn=start_conversion,
            inputs=[],
            outputs=[convert_button, cancel_button, cancel_requested],
            queue=False  # Execute immediately
        ).then(
            fn=handle_convert,
            inputs=[file_input, provider_dropdown, ocr_dropdown, output_format_state, cancel_requested],
            outputs=[file_display, file_download, convert_button, cancel_button, conversion_thread]
        )

        # Handle cancel button click.
        cancel_button.click(
            fn=request_cancellation,
            inputs=[conversion_thread],
            outputs=[convert_button, cancel_button, cancel_requested, conversion_thread],
            queue=False  # Execute immediately
        )

        # Mirror the converted HTML into the chat tab's document context state.
        file_display.change(
            lambda text: text,
            inputs=[file_display],
            outputs=[document_text_state]
        )

        text_input.submit(
            fn=chat_with_document,
            inputs=[text_input, chatbot, document_text_state],
            outputs=[chatbot, chatbot]
        )

        clear_btn.click(
            lambda: ([], []),
            None,
            [chatbot, chatbot]
        )

    return demo
|
291 |
+
|
292 |
+
|
293 |
+
def launch_ui(server_name="0.0.0.0", server_port=7860, share=False):
    """Build the Gradio app and start serving it (blocking).

    Args:
        server_name: Interface to bind; "0.0.0.0" listens on all interfaces.
        server_port: TCP port to listen on.
        share: Whether to create a public Gradio share link.
    """
    demo = create_ui()
    demo.launch(
        server_name=server_name,
        server_port=server_port,
        root_path="",
        show_error=True,
        share=share
    )
|