AnseMin committed on
Commit
e66626c
·
1 Parent(s): 08189e1

Remove chat-with-document feature, update UI, remove ocrmac and the Marker parser, pin transformers to 4.37.2 for GOT-OCR

Browse files
build.sh CHANGED
@@ -80,7 +80,7 @@ echo "Google Gemini API client installed successfully"
80
 
81
  # Install GOT-OCR dependencies
82
  echo "Installing GOT-OCR dependencies..."
83
- pip install -q -U torch==2.0.1 torchvision==0.15.2 transformers==4.47.0 tiktoken==0.6.0 verovio==4.3.1 accelerate==0.28.0 safetensors==0.4.0
84
  echo "GOT-OCR dependencies installed successfully"
85
 
86
  # Install Python dependencies
 
80
 
81
  # Install GOT-OCR dependencies
82
  echo "Installing GOT-OCR dependencies..."
83
+ pip install -q -U torch==2.0.1 torchvision==0.15.2 transformers==4.37.2 tiktoken==0.6.0 verovio==4.3.1 accelerate==0.28.0 safetensors==0.4.0
84
  echo "GOT-OCR dependencies installed successfully"
85
 
86
  # Install Python dependencies
requirements.txt CHANGED
@@ -2,7 +2,6 @@ docling==2.25.0
2
  gradio==5.14.0
3
  grpcio-status==1.70.0
4
  markdown==3.7
5
- marker-pdf==1.3.5
6
  multiprocess==0.70.16
7
  openai==1.61.1
8
  pipdeptree==2.25.0
@@ -25,11 +24,11 @@ python-dotenv>=1.0.0
25
  pydantic==2.7.1
26
 
27
  # GOT-OCR dependencies
28
- torch>=2.0.1
29
- torchvision>=0.15.2
30
- transformers>=4.37.2,<4.48.0 # Pin to a compatible version for GOT-OCR
31
- tiktoken>=0.6.0
32
- verovio>=4.3.1
33
- accelerate>=0.28.0
34
  safetensors>=0.4.0
35
  packaging>=21.0 # For version comparison
 
2
  gradio==5.14.0
3
  grpcio-status==1.70.0
4
  markdown==3.7
 
5
  multiprocess==0.70.16
6
  openai==1.61.1
7
  pipdeptree==2.25.0
 
24
  pydantic==2.7.1
25
 
26
  # GOT-OCR dependencies
27
+ torch==2.0.1
28
+ torchvision==0.15.2
29
+ transformers==4.37.2 # Exact version for GOT-OCR2
30
+ tiktoken==0.6.0
31
+ verovio==4.3.1
32
+ accelerate==0.28.0
33
  safetensors>=0.4.0
34
  packaging>=21.0 # For version comparison
setup.sh CHANGED
@@ -20,7 +20,7 @@ echo "Python dependencies installed successfully"
20
 
21
  # Install GOT-OCR dependencies
22
  echo "Installing GOT-OCR dependencies..."
23
- pip install -q -U torch==2.0.1 torchvision==0.15.2 transformers==4.47.0 tiktoken==0.6.0 verovio==4.3.1 accelerate==0.28.0 safetensors==0.4.0
24
  echo "GOT-OCR dependencies installed successfully"
25
 
26
  # Install tesserocr with pip
 
20
 
21
  # Install GOT-OCR dependencies
22
  echo "Installing GOT-OCR dependencies..."
23
+ pip install -q -U torch==2.0.1 torchvision==0.15.2 transformers==4.37.2 tiktoken==0.6.0 verovio==4.3.1 accelerate==0.28.0 safetensors==0.4.0
24
  echo "GOT-OCR dependencies installed successfully"
25
 
26
  # Install tesserocr with pip
src/parsers/__init__.py CHANGED
@@ -2,7 +2,6 @@
2
 
3
  # Import all parsers to ensure they're registered
4
  from src.parsers.docling_parser import DoclingParser
5
- from src.parsers.marker_parser import MarkerParser
6
  from src.parsers.pypdfium_parser import PyPdfiumParser
7
  from src.parsers.gemini_flash_parser import GeminiFlashParser
8
  from src.parsers.got_ocr_parser import GotOcrParser
 
2
 
3
  # Import all parsers to ensure they're registered
4
  from src.parsers.docling_parser import DoclingParser
 
5
  from src.parsers.pypdfium_parser import PyPdfiumParser
6
  from src.parsers.gemini_flash_parser import GeminiFlashParser
7
  from src.parsers.got_ocr_parser import GotOcrParser
src/parsers/docling_parser.py CHANGED
@@ -15,7 +15,6 @@ from docling.datamodel.pipeline_options import (
15
  )
16
  from docling.models.tesseract_ocr_model import TesseractOcrOptions
17
  from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
18
- from docling.models.ocr_mac_model import OcrMacOptions
19
 
20
 
21
  class DoclingParser(DocumentParser):
@@ -53,11 +52,6 @@ class DoclingParser(DocumentParser):
53
  "name": "Tesseract CLI",
54
  "default_params": {}
55
  },
56
- {
57
- "id": "ocrmac",
58
- "name": "ocrmac",
59
- "default_params": {}
60
- },
61
  {
62
  "id": "full_force_ocr",
63
  "name": "Full Force OCR",
@@ -95,9 +89,6 @@ class DoclingParser(DocumentParser):
95
  elif ocr_method == "tesseract_cli":
96
  pipeline_options.do_ocr = True
97
  pipeline_options.ocr_options = TesseractCliOcrOptions()
98
- elif ocr_method == "ocrmac":
99
- pipeline_options.do_ocr = True
100
- pipeline_options.ocr_options = OcrMacOptions()
101
 
102
  # Create the converter
103
  converter = DocumentConverter(
 
15
  )
16
  from docling.models.tesseract_ocr_model import TesseractOcrOptions
17
  from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
 
18
 
19
 
20
  class DoclingParser(DocumentParser):
 
52
  "name": "Tesseract CLI",
53
  "default_params": {}
54
  },
 
 
 
 
 
55
  {
56
  "id": "full_force_ocr",
57
  "name": "Full Force OCR",
 
89
  elif ocr_method == "tesseract_cli":
90
  pipeline_options.do_ocr = True
91
  pipeline_options.ocr_options = TesseractCliOcrOptions()
 
 
 
92
 
93
  # Create the converter
94
  converter = DocumentConverter(
src/parsers/marker_parser.py DELETED
@@ -1,61 +0,0 @@
1
- from pathlib import Path
2
- from typing import Dict, List, Optional, Any, Union
3
- import subprocess
4
- import tempfile
5
- import os
6
- import json
7
-
8
- from src.parsers.parser_interface import DocumentParser
9
- from src.parsers.parser_registry import ParserRegistry
10
- from marker.converters.pdf import PdfConverter
11
- from marker.models import create_model_dict
12
- from marker.output import text_from_rendered
13
-
14
-
15
- class MarkerParser(DocumentParser):
16
- """Parser implementation using Marker."""
17
-
18
- @classmethod
19
- def get_name(cls) -> str:
20
- return "Marker"
21
-
22
- @classmethod
23
- def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
24
- return [
25
- {
26
- "id": "no_ocr",
27
- "name": "No OCR",
28
- "default_params": {}
29
- },
30
- {
31
- "id": "force_ocr",
32
- "name": "Force OCR",
33
- "default_params": {}
34
- }
35
- ]
36
-
37
- def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
38
- """Parse a document using Marker."""
39
- force_ocr = ocr_method == "force_ocr"
40
-
41
- converter = PdfConverter(
42
- artifact_dict=create_model_dict(),
43
- config={"force_ocr": force_ocr}
44
- )
45
- rendered = converter(str(file_path))
46
- content, _, _ = text_from_rendered(rendered)
47
-
48
- # Format the content based on the requested output format
49
- output_format = kwargs.get("output_format", "markdown")
50
- if output_format.lower() == "json":
51
- return json.dumps({"content": content}, ensure_ascii=False, indent=2)
52
- elif output_format.lower() == "text":
53
- return content.replace("#", "").replace("*", "").replace("_", "")
54
- elif output_format.lower() == "document_tags":
55
- return f"<doc>\n{content}\n</doc>"
56
- else:
57
- return content
58
-
59
-
60
- # Register the parser with the registry
61
- ParserRegistry.register(MarkerParser)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/services/docling_chat.py DELETED
@@ -1,29 +0,0 @@
1
- import openai
2
- import os
3
-
4
- # Load API key from environment variable
5
- openai.api_key = os.getenv("OPENAI_API_KEY")
6
-
7
- # Check if API key is available and print a message if not
8
- if not openai.api_key:
9
- print("Warning: OPENAI_API_KEY environment variable not found. Chat functionality may not work.")
10
-
11
- def chat_with_document(message, history, document_text_state):
12
- history = history or []
13
- history.append({"role": "user", "content": message})
14
-
15
- context = f"Document: {document_text_state}\n\nUser: {message}"
16
-
17
- # Add error handling for API calls
18
- try:
19
- response = openai.chat.completions.create(
20
- model="gpt-4o-2024-08-06",
21
- messages=[{"role": "system", "content": context}] + history
22
- )
23
- reply = response.choices[0].message.content
24
- except Exception as e:
25
- reply = f"Error: Could not generate response. Please check your OpenAI API key. Details: {str(e)}"
26
- print(f"OpenAI API error: {str(e)}")
27
-
28
- history.append({"role": "assistant", "content": reply})
29
- return history, history
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/ui/ui.py CHANGED
@@ -5,7 +5,6 @@ import time
5
  import logging
6
  from pathlib import Path
7
  from src.core.converter import convert_file, set_cancellation_flag, is_conversion_in_progress
8
- from src.services.docling_chat import chat_with_document
9
  from src.parsers.parser_registry import ParserRegistry
10
 
11
  # Configure logging
@@ -169,52 +168,44 @@ def create_ui():
169
  # State to store the output format (fixed to Markdown)
170
  output_format_state = gr.State("Markdown")
171
 
172
- with gr.Tabs():
173
- with gr.Tab("Upload and Convert"):
174
- # File input first
175
- file_input = gr.File(label="Upload PDF", type="filepath")
176
-
177
- # Provider and OCR options below the file input
178
- with gr.Row(elem_classes=["provider-options-row"]):
179
- with gr.Column(scale=1):
180
- parser_names = ParserRegistry.get_parser_names()
181
- default_parser = parser_names[0] if parser_names else "PyPdfium"
182
-
183
- provider_dropdown = gr.Dropdown(
184
- label="Provider",
185
- choices=parser_names,
186
- value=default_parser,
187
- interactive=True
188
- )
189
- with gr.Column(scale=1):
190
- default_ocr_options = ParserRegistry.get_ocr_options(default_parser)
191
- default_ocr = default_ocr_options[0] if default_ocr_options else "No OCR"
192
-
193
- ocr_dropdown = gr.Dropdown(
194
- label="OCR Options",
195
- choices=default_ocr_options,
196
- value=default_ocr,
197
- interactive=True
198
- )
199
 
200
- # Simple output container with just one scrollbar
201
- file_display = gr.HTML(
202
- value="<div class='output-container'></div>",
203
- label="Converted Content"
 
204
  )
 
 
 
205
 
206
- file_download = gr.File(label="Download File")
207
-
208
- # Processing controls row
209
- with gr.Row(elem_classes=["processing-controls"]):
210
- convert_button = gr.Button("Convert", variant="primary")
211
- cancel_button = gr.Button("Cancel", variant="stop", visible=False)
212
-
213
- with gr.Tab("Chat with Document"):
214
- document_text_state = gr.State("")
215
- chatbot = gr.Chatbot(label="Chat", type="messages")
216
- text_input = gr.Textbox(placeholder="Type here...")
217
- clear_btn = gr.Button("Clear")
 
 
 
 
 
 
 
218
 
219
  # Event handlers
220
  provider_dropdown.change(
@@ -269,24 +260,6 @@ def create_ui():
269
  queue=False # Execute immediately
270
  )
271
 
272
- file_display.change(
273
- lambda text: text,
274
- inputs=[file_display],
275
- outputs=[document_text_state]
276
- )
277
-
278
- text_input.submit(
279
- fn=chat_with_document,
280
- inputs=[text_input, chatbot, document_text_state],
281
- outputs=[chatbot, chatbot]
282
- )
283
-
284
- clear_btn.click(
285
- lambda: ([], []),
286
- None,
287
- [chatbot, chatbot]
288
- )
289
-
290
  return demo
291
 
292
 
 
5
  import logging
6
  from pathlib import Path
7
  from src.core.converter import convert_file, set_cancellation_flag, is_conversion_in_progress
 
8
  from src.parsers.parser_registry import ParserRegistry
9
 
10
  # Configure logging
 
168
  # State to store the output format (fixed to Markdown)
169
  output_format_state = gr.State("Markdown")
170
 
171
+ # File input first
172
+ file_input = gr.File(label="Upload PDF", type="filepath")
173
+
174
+ # Provider and OCR options below the file input
175
+ with gr.Row(elem_classes=["provider-options-row"]):
176
+ with gr.Column(scale=1):
177
+ parser_names = ParserRegistry.get_parser_names()
178
+ default_parser = parser_names[0] if parser_names else "PyPdfium"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
+ provider_dropdown = gr.Dropdown(
181
+ label="Provider",
182
+ choices=parser_names,
183
+ value=default_parser,
184
+ interactive=True
185
  )
186
+ with gr.Column(scale=1):
187
+ default_ocr_options = ParserRegistry.get_ocr_options(default_parser)
188
+ default_ocr = default_ocr_options[0] if default_ocr_options else "No OCR"
189
 
190
+ ocr_dropdown = gr.Dropdown(
191
+ label="OCR Options",
192
+ choices=default_ocr_options,
193
+ value=default_ocr,
194
+ interactive=True
195
+ )
196
+
197
+ # Simple output container with just one scrollbar
198
+ file_display = gr.HTML(
199
+ value="<div class='output-container'></div>",
200
+ label="Converted Content"
201
+ )
202
+
203
+ file_download = gr.File(label="Download File")
204
+
205
+ # Processing controls row
206
+ with gr.Row(elem_classes=["processing-controls"]):
207
+ convert_button = gr.Button("Convert", variant="primary")
208
+ cancel_button = gr.Button("Cancel", variant="stop", visible=False)
209
 
210
  # Event handlers
211
  provider_dropdown.change(
 
260
  queue=False # Execute immediately
261
  )
262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  return demo
264
 
265