Spaces:
Running
on
Zero
Running
on
Zero
Remove chat-with-document feature, update UI, remove ocrmac, pin transformers version to 4.37.2 for GOT-OCR
Browse files
- build.sh +1 -1
- requirements.txt +6 -7
- setup.sh +1 -1
- src/parsers/__init__.py +0 -1
- src/parsers/docling_parser.py +0 -9
- src/parsers/marker_parser.py +0 -61
- src/services/docling_chat.py +0 -29
- src/ui/ui.py +35 -62
build.sh
CHANGED
@@ -80,7 +80,7 @@ echo "Google Gemini API client installed successfully"
|
|
80 |
|
81 |
# Install GOT-OCR dependencies
|
82 |
echo "Installing GOT-OCR dependencies..."
|
83 |
-
pip install -q -U torch==2.0.1 torchvision==0.15.2 transformers==4.
|
84 |
echo "GOT-OCR dependencies installed successfully"
|
85 |
|
86 |
# Install Python dependencies
|
|
|
80 |
|
81 |
# Install GOT-OCR dependencies
|
82 |
echo "Installing GOT-OCR dependencies..."
|
83 |
+
pip install -q -U torch==2.0.1 torchvision==0.15.2 transformers==4.37.2 tiktoken==0.6.0 verovio==4.3.1 accelerate==0.28.0 safetensors==0.4.0
|
84 |
echo "GOT-OCR dependencies installed successfully"
|
85 |
|
86 |
# Install Python dependencies
|
requirements.txt
CHANGED
@@ -2,7 +2,6 @@ docling==2.25.0
|
|
2 |
gradio==5.14.0
|
3 |
grpcio-status==1.70.0
|
4 |
markdown==3.7
|
5 |
-
marker-pdf==1.3.5
|
6 |
multiprocess==0.70.16
|
7 |
openai==1.61.1
|
8 |
pipdeptree==2.25.0
|
@@ -25,11 +24,11 @@ python-dotenv>=1.0.0
|
|
25 |
pydantic==2.7.1
|
26 |
|
27 |
# GOT-OCR dependencies
|
28 |
-
torch
|
29 |
-
torchvision
|
30 |
-
transformers
|
31 |
-
tiktoken
|
32 |
-
verovio
|
33 |
-
accelerate
|
34 |
safetensors>=0.4.0
|
35 |
packaging>=21.0 # For version comparison
|
|
|
2 |
gradio==5.14.0
|
3 |
grpcio-status==1.70.0
|
4 |
markdown==3.7
|
|
|
5 |
multiprocess==0.70.16
|
6 |
openai==1.61.1
|
7 |
pipdeptree==2.25.0
|
|
|
24 |
pydantic==2.7.1
|
25 |
|
26 |
# GOT-OCR dependencies
|
27 |
+
torch==2.0.1
|
28 |
+
torchvision==0.15.2
|
29 |
+
transformers==4.37.2 # Exact version for GOT-OCR2
|
30 |
+
tiktoken==0.6.0
|
31 |
+
verovio==4.3.1
|
32 |
+
accelerate==0.28.0
|
33 |
safetensors>=0.4.0
|
34 |
packaging>=21.0 # For version comparison
|
setup.sh
CHANGED
@@ -20,7 +20,7 @@ echo "Python dependencies installed successfully"
|
|
20 |
|
21 |
# Install GOT-OCR dependencies
|
22 |
echo "Installing GOT-OCR dependencies..."
|
23 |
-
pip install -q -U torch==2.0.1 torchvision==0.15.2 transformers==4.
|
24 |
echo "GOT-OCR dependencies installed successfully"
|
25 |
|
26 |
# Install tesserocr with pip
|
|
|
20 |
|
21 |
# Install GOT-OCR dependencies
|
22 |
echo "Installing GOT-OCR dependencies..."
|
23 |
+
pip install -q -U torch==2.0.1 torchvision==0.15.2 transformers==4.37.2 tiktoken==0.6.0 verovio==4.3.1 accelerate==0.28.0 safetensors==0.4.0
|
24 |
echo "GOT-OCR dependencies installed successfully"
|
25 |
|
26 |
# Install tesserocr with pip
|
src/parsers/__init__.py
CHANGED
@@ -2,7 +2,6 @@
|
|
2 |
|
3 |
# Import all parsers to ensure they're registered
|
4 |
from src.parsers.docling_parser import DoclingParser
|
5 |
-
from src.parsers.marker_parser import MarkerParser
|
6 |
from src.parsers.pypdfium_parser import PyPdfiumParser
|
7 |
from src.parsers.gemini_flash_parser import GeminiFlashParser
|
8 |
from src.parsers.got_ocr_parser import GotOcrParser
|
|
|
2 |
|
3 |
# Import all parsers to ensure they're registered
|
4 |
from src.parsers.docling_parser import DoclingParser
|
|
|
5 |
from src.parsers.pypdfium_parser import PyPdfiumParser
|
6 |
from src.parsers.gemini_flash_parser import GeminiFlashParser
|
7 |
from src.parsers.got_ocr_parser import GotOcrParser
|
src/parsers/docling_parser.py
CHANGED
@@ -15,7 +15,6 @@ from docling.datamodel.pipeline_options import (
|
|
15 |
)
|
16 |
from docling.models.tesseract_ocr_model import TesseractOcrOptions
|
17 |
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
|
18 |
-
from docling.models.ocr_mac_model import OcrMacOptions
|
19 |
|
20 |
|
21 |
class DoclingParser(DocumentParser):
|
@@ -53,11 +52,6 @@ class DoclingParser(DocumentParser):
|
|
53 |
"name": "Tesseract CLI",
|
54 |
"default_params": {}
|
55 |
},
|
56 |
-
{
|
57 |
-
"id": "ocrmac",
|
58 |
-
"name": "ocrmac",
|
59 |
-
"default_params": {}
|
60 |
-
},
|
61 |
{
|
62 |
"id": "full_force_ocr",
|
63 |
"name": "Full Force OCR",
|
@@ -95,9 +89,6 @@ class DoclingParser(DocumentParser):
|
|
95 |
elif ocr_method == "tesseract_cli":
|
96 |
pipeline_options.do_ocr = True
|
97 |
pipeline_options.ocr_options = TesseractCliOcrOptions()
|
98 |
-
elif ocr_method == "ocrmac":
|
99 |
-
pipeline_options.do_ocr = True
|
100 |
-
pipeline_options.ocr_options = OcrMacOptions()
|
101 |
|
102 |
# Create the converter
|
103 |
converter = DocumentConverter(
|
|
|
15 |
)
|
16 |
from docling.models.tesseract_ocr_model import TesseractOcrOptions
|
17 |
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
|
|
|
18 |
|
19 |
|
20 |
class DoclingParser(DocumentParser):
|
|
|
52 |
"name": "Tesseract CLI",
|
53 |
"default_params": {}
|
54 |
},
|
|
|
|
|
|
|
|
|
|
|
55 |
{
|
56 |
"id": "full_force_ocr",
|
57 |
"name": "Full Force OCR",
|
|
|
89 |
elif ocr_method == "tesseract_cli":
|
90 |
pipeline_options.do_ocr = True
|
91 |
pipeline_options.ocr_options = TesseractCliOcrOptions()
|
|
|
|
|
|
|
92 |
|
93 |
# Create the converter
|
94 |
converter = DocumentConverter(
|
src/parsers/marker_parser.py
DELETED
@@ -1,61 +0,0 @@
|
|
1 |
-
from pathlib import Path
|
2 |
-
from typing import Dict, List, Optional, Any, Union
|
3 |
-
import subprocess
|
4 |
-
import tempfile
|
5 |
-
import os
|
6 |
-
import json
|
7 |
-
|
8 |
-
from src.parsers.parser_interface import DocumentParser
|
9 |
-
from src.parsers.parser_registry import ParserRegistry
|
10 |
-
from marker.converters.pdf import PdfConverter
|
11 |
-
from marker.models import create_model_dict
|
12 |
-
from marker.output import text_from_rendered
|
13 |
-
|
14 |
-
|
15 |
-
class MarkerParser(DocumentParser):
|
16 |
-
"""Parser implementation using Marker."""
|
17 |
-
|
18 |
-
@classmethod
|
19 |
-
def get_name(cls) -> str:
|
20 |
-
return "Marker"
|
21 |
-
|
22 |
-
@classmethod
|
23 |
-
def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
|
24 |
-
return [
|
25 |
-
{
|
26 |
-
"id": "no_ocr",
|
27 |
-
"name": "No OCR",
|
28 |
-
"default_params": {}
|
29 |
-
},
|
30 |
-
{
|
31 |
-
"id": "force_ocr",
|
32 |
-
"name": "Force OCR",
|
33 |
-
"default_params": {}
|
34 |
-
}
|
35 |
-
]
|
36 |
-
|
37 |
-
def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
|
38 |
-
"""Parse a document using Marker."""
|
39 |
-
force_ocr = ocr_method == "force_ocr"
|
40 |
-
|
41 |
-
converter = PdfConverter(
|
42 |
-
artifact_dict=create_model_dict(),
|
43 |
-
config={"force_ocr": force_ocr}
|
44 |
-
)
|
45 |
-
rendered = converter(str(file_path))
|
46 |
-
content, _, _ = text_from_rendered(rendered)
|
47 |
-
|
48 |
-
# Format the content based on the requested output format
|
49 |
-
output_format = kwargs.get("output_format", "markdown")
|
50 |
-
if output_format.lower() == "json":
|
51 |
-
return json.dumps({"content": content}, ensure_ascii=False, indent=2)
|
52 |
-
elif output_format.lower() == "text":
|
53 |
-
return content.replace("#", "").replace("*", "").replace("_", "")
|
54 |
-
elif output_format.lower() == "document_tags":
|
55 |
-
return f"<doc>\n{content}\n</doc>"
|
56 |
-
else:
|
57 |
-
return content
|
58 |
-
|
59 |
-
|
60 |
-
# Register the parser with the registry
|
61 |
-
ParserRegistry.register(MarkerParser)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/services/docling_chat.py
DELETED
@@ -1,29 +0,0 @@
|
|
1 |
-
import openai
|
2 |
-
import os
|
3 |
-
|
4 |
-
# Load API key from environment variable
|
5 |
-
openai.api_key = os.getenv("OPENAI_API_KEY")
|
6 |
-
|
7 |
-
# Check if API key is available and print a message if not
|
8 |
-
if not openai.api_key:
|
9 |
-
print("Warning: OPENAI_API_KEY environment variable not found. Chat functionality may not work.")
|
10 |
-
|
11 |
-
def chat_with_document(message, history, document_text_state):
|
12 |
-
history = history or []
|
13 |
-
history.append({"role": "user", "content": message})
|
14 |
-
|
15 |
-
context = f"Document: {document_text_state}\n\nUser: {message}"
|
16 |
-
|
17 |
-
# Add error handling for API calls
|
18 |
-
try:
|
19 |
-
response = openai.chat.completions.create(
|
20 |
-
model="gpt-4o-2024-08-06",
|
21 |
-
messages=[{"role": "system", "content": context}] + history
|
22 |
-
)
|
23 |
-
reply = response.choices[0].message.content
|
24 |
-
except Exception as e:
|
25 |
-
reply = f"Error: Could not generate response. Please check your OpenAI API key. Details: {str(e)}"
|
26 |
-
print(f"OpenAI API error: {str(e)}")
|
27 |
-
|
28 |
-
history.append({"role": "assistant", "content": reply})
|
29 |
-
return history, history
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/ui/ui.py
CHANGED
@@ -5,7 +5,6 @@ import time
|
|
5 |
import logging
|
6 |
from pathlib import Path
|
7 |
from src.core.converter import convert_file, set_cancellation_flag, is_conversion_in_progress
|
8 |
-
from src.services.docling_chat import chat_with_document
|
9 |
from src.parsers.parser_registry import ParserRegistry
|
10 |
|
11 |
# Configure logging
|
@@ -169,52 +168,44 @@ def create_ui():
|
|
169 |
# State to store the output format (fixed to Markdown)
|
170 |
output_format_state = gr.State("Markdown")
|
171 |
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
parser_names = ParserRegistry.get_parser_names()
|
181 |
-
default_parser = parser_names[0] if parser_names else "PyPdfium"
|
182 |
-
|
183 |
-
provider_dropdown = gr.Dropdown(
|
184 |
-
label="Provider",
|
185 |
-
choices=parser_names,
|
186 |
-
value=default_parser,
|
187 |
-
interactive=True
|
188 |
-
)
|
189 |
-
with gr.Column(scale=1):
|
190 |
-
default_ocr_options = ParserRegistry.get_ocr_options(default_parser)
|
191 |
-
default_ocr = default_ocr_options[0] if default_ocr_options else "No OCR"
|
192 |
-
|
193 |
-
ocr_dropdown = gr.Dropdown(
|
194 |
-
label="OCR Options",
|
195 |
-
choices=default_ocr_options,
|
196 |
-
value=default_ocr,
|
197 |
-
interactive=True
|
198 |
-
)
|
199 |
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
|
|
204 |
)
|
|
|
|
|
|
|
205 |
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
|
219 |
# Event handlers
|
220 |
provider_dropdown.change(
|
@@ -269,24 +260,6 @@ def create_ui():
|
|
269 |
queue=False # Execute immediately
|
270 |
)
|
271 |
|
272 |
-
file_display.change(
|
273 |
-
lambda text: text,
|
274 |
-
inputs=[file_display],
|
275 |
-
outputs=[document_text_state]
|
276 |
-
)
|
277 |
-
|
278 |
-
text_input.submit(
|
279 |
-
fn=chat_with_document,
|
280 |
-
inputs=[text_input, chatbot, document_text_state],
|
281 |
-
outputs=[chatbot, chatbot]
|
282 |
-
)
|
283 |
-
|
284 |
-
clear_btn.click(
|
285 |
-
lambda: ([], []),
|
286 |
-
None,
|
287 |
-
[chatbot, chatbot]
|
288 |
-
)
|
289 |
-
|
290 |
return demo
|
291 |
|
292 |
|
|
|
5 |
import logging
|
6 |
from pathlib import Path
|
7 |
from src.core.converter import convert_file, set_cancellation_flag, is_conversion_in_progress
|
|
|
8 |
from src.parsers.parser_registry import ParserRegistry
|
9 |
|
10 |
# Configure logging
|
|
|
168 |
# State to store the output format (fixed to Markdown)
|
169 |
output_format_state = gr.State("Markdown")
|
170 |
|
171 |
+
# File input first
|
172 |
+
file_input = gr.File(label="Upload PDF", type="filepath")
|
173 |
+
|
174 |
+
# Provider and OCR options below the file input
|
175 |
+
with gr.Row(elem_classes=["provider-options-row"]):
|
176 |
+
with gr.Column(scale=1):
|
177 |
+
parser_names = ParserRegistry.get_parser_names()
|
178 |
+
default_parser = parser_names[0] if parser_names else "PyPdfium"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
|
180 |
+
provider_dropdown = gr.Dropdown(
|
181 |
+
label="Provider",
|
182 |
+
choices=parser_names,
|
183 |
+
value=default_parser,
|
184 |
+
interactive=True
|
185 |
)
|
186 |
+
with gr.Column(scale=1):
|
187 |
+
default_ocr_options = ParserRegistry.get_ocr_options(default_parser)
|
188 |
+
default_ocr = default_ocr_options[0] if default_ocr_options else "No OCR"
|
189 |
|
190 |
+
ocr_dropdown = gr.Dropdown(
|
191 |
+
label="OCR Options",
|
192 |
+
choices=default_ocr_options,
|
193 |
+
value=default_ocr,
|
194 |
+
interactive=True
|
195 |
+
)
|
196 |
+
|
197 |
+
# Simple output container with just one scrollbar
|
198 |
+
file_display = gr.HTML(
|
199 |
+
value="<div class='output-container'></div>",
|
200 |
+
label="Converted Content"
|
201 |
+
)
|
202 |
+
|
203 |
+
file_download = gr.File(label="Download File")
|
204 |
+
|
205 |
+
# Processing controls row
|
206 |
+
with gr.Row(elem_classes=["processing-controls"]):
|
207 |
+
convert_button = gr.Button("Convert", variant="primary")
|
208 |
+
cancel_button = gr.Button("Cancel", variant="stop", visible=False)
|
209 |
|
210 |
# Event handlers
|
211 |
provider_dropdown.change(
|
|
|
260 |
queue=False # Execute immediately
|
261 |
)
|
262 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
return demo
|
264 |
|
265 |
|