AnseMin committed on
Commit
66f3c4d
·
1 Parent(s): 98f25ae
app.py CHANGED
@@ -134,26 +134,5 @@ except ModuleNotFoundError:
134
  # Call setup function at import time
135
  setup_tesseract()
136
 
137
- # Add this near the top of app.py after imports
138
- # Handle potential import conflicts
139
- try:
140
- import transformers
141
- print(f"Transformers version: {transformers.__version__}")
142
- except ImportError:
143
- print("Warning: Transformers not installed or not working")
144
-
145
- try:
146
- import torch
147
- print(f"Torch version: {torch.__version__}")
148
- print(f"CUDA available: {torch.cuda.is_available()}")
149
- except ImportError:
150
- print("Warning: PyTorch not installed or not working")
151
-
152
- try:
153
- import docling
154
- print(f"Docling version: {docling.__version__ if hasattr(docling, '__version__') else 'unknown'}")
155
- except ImportError:
156
- print("Warning: Docling not installed or not working")
157
-
158
  if __name__ == "__main__":
159
  main()
 
134
  # Call setup function at import time
135
  setup_tesseract()
136
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  if __name__ == "__main__":
138
  main()
build.sh CHANGED
@@ -78,20 +78,14 @@ echo "Installing Google Gemini API client..."
78
  pip install -q -U google-genai
79
  echo "Google Gemini API client installed successfully"
80
 
81
- # Install GOT-OCR dependencies first
82
  echo "Installing GOT-OCR dependencies..."
83
- pip install -q -U torch==2.0.1 torchvision==0.15.2 --no-deps
84
- pip install -q -U transformers==4.37.2 tiktoken==0.6.0 verovio==4.3.1 accelerate==0.28.0 safetensors==0.4.0
85
  echo "GOT-OCR dependencies installed successfully"
86
 
87
- # Install docling separately with --no-deps to avoid conflicts
88
- echo "Installing docling..."
89
- pip install -q -U docling==2.25.0 --no-deps
90
- echo "Docling installed successfully"
91
-
92
- # Install remaining Python dependencies
93
- echo "Installing remaining Python dependencies..."
94
- pip install -e . --no-deps
95
 
96
  # Create .env file if it doesn't exist
97
  if [ ! -f .env ]; then
 
78
  pip install -q -U google-genai
79
  echo "Google Gemini API client installed successfully"
80
 
81
+ # Install GOT-OCR dependencies
82
  echo "Installing GOT-OCR dependencies..."
83
+ pip install -q -U torch==2.0.1 torchvision==0.15.2 transformers==4.47.0 tiktoken==0.6.0 verovio==4.3.1 accelerate==0.28.0 safetensors==0.4.0
 
84
  echo "GOT-OCR dependencies installed successfully"
85
 
86
+ # Install Python dependencies
87
+ echo "Installing Python dependencies..."
88
+ pip install -e .
 
 
 
 
 
89
 
90
  # Create .env file if it doesn't exist
91
  if [ ! -f .env ]; then
requirements.txt CHANGED
@@ -1,7 +1,8 @@
1
- # Core dependencies
2
  gradio==5.14.0
3
  grpcio-status==1.70.0
4
  markdown==3.7
 
5
  multiprocess==0.70.16
6
  openai==1.61.1
7
  pipdeptree==2.25.0
@@ -9,35 +10,26 @@ pytesseract==0.3.13
9
  semchunk==2.2.2
10
  Pillow>=9.0.0
11
  numpy>=1.21.0
12
-
13
  # Tesseract dependencies
14
  tesseract==0.1.3
15
  tesserocr>=2.5.0; platform_system != "Windows" # Only install on non-Windows systems
16
-
17
  # Additional dependencies for image processing
18
  opencv-python-headless>=4.5.0 # Headless version for server environments
19
  pdf2image>=1.16.0 # For PDF processing
20
  dill==0.3.8 # Downgraded to be compatible with datasets
21
-
22
  # Gemini API client
23
  google-genai>=0.1.0
24
-
25
  # Environment variables
26
  python-dotenv>=1.0.0
27
-
28
  # Pin pydantic to resolve compatibility issues with gradio
29
  pydantic==2.7.1
30
 
31
- # Common dependencies - not pinned to allow resolution
32
- packaging>=21.0 # For version comparison
 
 
 
 
 
33
  safetensors>=0.4.0
34
-
35
- # Note: The following packages will be installed separately in setup.sh and build.sh
36
- # to avoid dependency conflicts:
37
- # - docling
38
- # - transformers
39
- # - torch
40
- # - torchvision
41
- # - tiktoken
42
- # - verovio
43
- # - accelerate
 
1
+ docling==2.25.0
2
  gradio==5.14.0
3
  grpcio-status==1.70.0
4
  markdown==3.7
5
+ marker-pdf==1.3.5
6
  multiprocess==0.70.16
7
  openai==1.61.1
8
  pipdeptree==2.25.0
 
10
  semchunk==2.2.2
11
  Pillow>=9.0.0
12
  numpy>=1.21.0
 
13
  # Tesseract dependencies
14
  tesseract==0.1.3
15
  tesserocr>=2.5.0; platform_system != "Windows" # Only install on non-Windows systems
 
16
  # Additional dependencies for image processing
17
  opencv-python-headless>=4.5.0 # Headless version for server environments
18
  pdf2image>=1.16.0 # For PDF processing
19
  dill==0.3.8 # Downgraded to be compatible with datasets
 
20
  # Gemini API client
21
  google-genai>=0.1.0
 
22
  # Environment variables
23
  python-dotenv>=1.0.0
 
24
  # Pin pydantic to resolve compatibility issues with gradio
25
  pydantic==2.7.1
26
 
27
+ # GOT-OCR dependencies
28
+ torch>=2.0.1
29
+ torchvision>=0.15.2
30
+ transformers>=4.37.2,<4.48.0 # Pin to a compatible version for GOT-OCR
31
+ tiktoken>=0.6.0
32
+ verovio>=4.3.1
33
+ accelerate>=0.28.0
34
  safetensors>=0.4.0
35
+ packaging>=21.0 # For version comparison
 
 
 
 
 
 
 
 
 
setup.sh CHANGED
@@ -18,17 +18,11 @@ pip install -q -U pytesseract pillow opencv-python-headless pdf2image
18
  pip install -q -U google-genai
19
  echo "Python dependencies installed successfully"
20
 
21
- # Install GOT-OCR dependencies first
22
  echo "Installing GOT-OCR dependencies..."
23
- pip install -q -U torch==2.0.1 torchvision==0.15.2 --no-deps
24
- pip install -q -U transformers==4.37.2 tiktoken==0.6.0 verovio==4.3.1 accelerate==0.28.0 safetensors==0.4.0
25
  echo "GOT-OCR dependencies installed successfully"
26
 
27
- # Install docling separately with --no-deps to avoid conflicts
28
- echo "Installing docling..."
29
- pip install -q -U docling==2.25.0 --no-deps
30
- echo "Docling installed successfully"
31
-
32
  # Install tesserocr with pip
33
  echo "Installing tesserocr..."
34
  pip install -q -U tesserocr || echo "Failed to install tesserocr with pip, trying with specific compiler flags..."
 
18
  pip install -q -U google-genai
19
  echo "Python dependencies installed successfully"
20
 
21
+ # Install GOT-OCR dependencies
22
  echo "Installing GOT-OCR dependencies..."
23
+ pip install -q -U torch==2.0.1 torchvision==0.15.2 transformers==4.47.0 tiktoken==0.6.0 verovio==4.3.1 accelerate==0.28.0 safetensors==0.4.0
 
24
  echo "GOT-OCR dependencies installed successfully"
25
 
 
 
 
 
 
26
  # Install tesserocr with pip
27
  echo "Installing tesserocr..."
28
  pip install -q -U tesserocr || echo "Failed to install tesserocr with pip, trying with specific compiler flags..."
src/parsers/__init__.py CHANGED
@@ -2,6 +2,7 @@
2
 
3
  # Import all parsers to ensure they're registered
4
  from src.parsers.docling_parser import DoclingParser
 
5
  from src.parsers.pypdfium_parser import PyPdfiumParser
6
  from src.parsers.gemini_flash_parser import GeminiFlashParser
7
  from src.parsers.got_ocr_parser import GotOcrParser
 
2
 
3
  # Import all parsers to ensure they're registered
4
  from src.parsers.docling_parser import DoclingParser
5
+ from src.parsers.marker_parser import MarkerParser
6
  from src.parsers.pypdfium_parser import PyPdfiumParser
7
  from src.parsers.gemini_flash_parser import GeminiFlashParser
8
  from src.parsers.got_ocr_parser import GotOcrParser
src/parsers/marker_parser.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Dict, List, Optional, Any, Union
3
+ import subprocess
4
+ import tempfile
5
+ import os
6
+ import json
7
+
8
+ from src.parsers.parser_interface import DocumentParser
9
+ from src.parsers.parser_registry import ParserRegistry
10
+ from marker.converters.pdf import PdfConverter
11
+ from marker.models import create_model_dict
12
+ from marker.output import text_from_rendered
13
+
14
+
15
class MarkerParser(DocumentParser):
    """Parser implementation using Marker.

    Converts a document with Marker's ``PdfConverter`` and returns the
    extracted content in the requested output format.
    """

    # Marker model artifacts, built on first use and then shared by every
    # parse() call instead of being rebuilt for each document.
    _artifact_dict: Optional[Dict[str, Any]] = None

    # Characters removed from the content to produce the "text" output format.
    _MARKDOWN_MARKERS = str.maketrans("", "", "#*_")

    @classmethod
    def get_name(cls) -> str:
        """Return the human-readable name of this parser."""
        return "Marker"

    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        """Return the OCR modes this parser offers: ``no_ocr`` and ``force_ocr``."""
        return [
            {
                "id": "no_ocr",
                "name": "No OCR",
                "default_params": {}
            },
            {
                "id": "force_ocr",
                "name": "Force OCR",
                "default_params": {}
            }
        ]

    @classmethod
    def _get_artifact_dict(cls) -> Dict[str, Any]:
        """Return the shared Marker model dictionary, creating it on first use."""
        if MarkerParser._artifact_dict is None:
            MarkerParser._artifact_dict = create_model_dict()
        return MarkerParser._artifact_dict

    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """Parse a document using Marker.

        Args:
            file_path: Path of the document to convert.
            ocr_method: ``"force_ocr"`` turns on Marker's ``force_ocr``
                config option; any other value (including ``None``) leaves
                it off.
            **kwargs: ``output_format`` selects the shape of the result:
                ``"json"``, ``"text"`` or ``"document_tags"``
                (case-insensitive). Any other value, or a missing/``None``
                value, returns the content unchanged.

        Returns:
            The converted document as a string in the requested format.
        """
        force_ocr = ocr_method == "force_ocr"

        converter = PdfConverter(
            artifact_dict=self._get_artifact_dict(),
            config={"force_ocr": force_ocr}
        )
        rendered = converter(str(file_path))
        content, _, _ = text_from_rendered(rendered)

        # Format the content based on the requested output format.
        # `or` also covers an explicit output_format=None from the caller.
        output_format = str(kwargs.get("output_format") or "markdown").lower()
        if output_format == "json":
            return json.dumps({"content": content}, ensure_ascii=False, indent=2)
        if output_format == "text":
            # Single pass that drops '#', '*' and '_' characters.
            return content.translate(self._MARKDOWN_MARKERS)
        if output_format == "document_tags":
            return f"<doc>\n{content}\n</doc>"
        return content
58
+
59
+
60
+ # Register the parser with the registry
61
+ ParserRegistry.register(MarkerParser)
src/services/docling_chat.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import os
3
+
4
+ # Load API key from environment variable
5
+ openai.api_key = os.getenv("OPENAI_API_KEY")
6
+
7
+ # Check if API key is available and print a message if not
8
+ if not openai.api_key:
9
+ print("Warning: OPENAI_API_KEY environment variable not found. Chat functionality may not work.")
10
+
11
def chat_with_document(message, history, document_text_state):
    """Answer a user message about the converted document via the OpenAI API.

    Args:
        message: The user's new chat message.
        history: Previous chat messages as a list of dicts with ``role`` and
            ``content`` keys, or ``None``/empty for a new conversation. The
            list passed in is never modified.
        document_text_state: Document text placed in the system prompt.

    Returns:
        A 2-tuple holding the same updated history list twice; the list ends
        with the new user message followed by the assistant reply. If the
        request fails, the reply is an error message instead.
    """
    # Work on a copy so the caller's list is never mutated, whether or not it
    # was empty.
    history = list(history or [])
    history.append({"role": "user", "content": message})

    context = f"Document: {document_text_state}\n\nUser: {message}"

    # Add error handling for API calls
    try:
        # Send only role/content: history entries coming from the chat UI may
        # carry extra keys that are not part of the API message format.
        api_messages = [{"role": "system", "content": context}]
        api_messages.extend(
            {"role": entry["role"], "content": entry["content"]}
            for entry in history
        )
        response = openai.chat.completions.create(
            model="gpt-4o-2024-08-06",
            messages=api_messages
        )
        reply = response.choices[0].message.content
    except Exception as e:
        # Broad catch is deliberate: this is the UI boundary, and any failure
        # is shown to the user as the assistant's reply.
        reply = f"Error: Could not generate response. Please check your OpenAI API key. Details: {str(e)}"
        print(f"OpenAI API error: {str(e)}")

    history.append({"role": "assistant", "content": reply})
    return history, history
src/ui/ui.py CHANGED
@@ -5,6 +5,7 @@ import time
5
  import logging
6
  from pathlib import Path
7
  from src.core.converter import convert_file, set_cancellation_flag, is_conversion_in_progress
 
8
  from src.parsers.parser_registry import ParserRegistry
9
 
10
  # Configure logging
@@ -168,44 +169,52 @@ def create_ui():
168
  # State to store the output format (fixed to Markdown)
169
  output_format_state = gr.State("Markdown")
170
 
171
- # File input first
172
- file_input = gr.File(label="Upload PDF", type="filepath")
173
-
174
- # Provider and OCR options below the file input
175
- with gr.Row(elem_classes=["provider-options-row"]):
176
- with gr.Column(scale=1):
177
- parser_names = ParserRegistry.get_parser_names()
178
- default_parser = parser_names[0] if parser_names else "PyPdfium"
179
 
180
- provider_dropdown = gr.Dropdown(
181
- label="Provider",
182
- choices=parser_names,
183
- value=default_parser,
184
- interactive=True
185
- )
186
- with gr.Column(scale=1):
187
- default_ocr_options = ParserRegistry.get_ocr_options(default_parser)
188
- default_ocr = default_ocr_options[0] if default_ocr_options else "No OCR"
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
- ocr_dropdown = gr.Dropdown(
191
- label="OCR Options",
192
- choices=default_ocr_options,
193
- value=default_ocr,
194
- interactive=True
195
  )
196
-
197
- # Simple output container with just one scrollbar
198
- file_display = gr.HTML(
199
- value="<div class='output-container'></div>",
200
- label="Converted Content"
201
- )
202
-
203
- file_download = gr.File(label="Download File")
204
-
205
- # Processing controls row
206
- with gr.Row(elem_classes=["processing-controls"]):
207
- convert_button = gr.Button("Convert", variant="primary")
208
- cancel_button = gr.Button("Cancel", variant="stop", visible=False)
209
 
210
  # Event handlers
211
  provider_dropdown.change(
@@ -260,6 +269,24 @@ def create_ui():
260
  queue=False # Execute immediately
261
  )
262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  return demo
264
 
265
 
 
5
  import logging
6
  from pathlib import Path
7
  from src.core.converter import convert_file, set_cancellation_flag, is_conversion_in_progress
8
+ from src.services.docling_chat import chat_with_document
9
  from src.parsers.parser_registry import ParserRegistry
10
 
11
  # Configure logging
 
169
  # State to store the output format (fixed to Markdown)
170
  output_format_state = gr.State("Markdown")
171
 
172
+ with gr.Tabs():
173
+ with gr.Tab("Upload and Convert"):
174
+ # File input first
175
+ file_input = gr.File(label="Upload PDF", type="filepath")
 
 
 
 
176
 
177
+ # Provider and OCR options below the file input
178
+ with gr.Row(elem_classes=["provider-options-row"]):
179
+ with gr.Column(scale=1):
180
+ parser_names = ParserRegistry.get_parser_names()
181
+ default_parser = parser_names[0] if parser_names else "PyPdfium"
182
+
183
+ provider_dropdown = gr.Dropdown(
184
+ label="Provider",
185
+ choices=parser_names,
186
+ value=default_parser,
187
+ interactive=True
188
+ )
189
+ with gr.Column(scale=1):
190
+ default_ocr_options = ParserRegistry.get_ocr_options(default_parser)
191
+ default_ocr = default_ocr_options[0] if default_ocr_options else "No OCR"
192
+
193
+ ocr_dropdown = gr.Dropdown(
194
+ label="OCR Options",
195
+ choices=default_ocr_options,
196
+ value=default_ocr,
197
+ interactive=True
198
+ )
199
 
200
+ # Simple output container with just one scrollbar
201
+ file_display = gr.HTML(
202
+ value="<div class='output-container'></div>",
203
+ label="Converted Content"
 
204
  )
205
+
206
+ file_download = gr.File(label="Download File")
207
+
208
+ # Processing controls row
209
+ with gr.Row(elem_classes=["processing-controls"]):
210
+ convert_button = gr.Button("Convert", variant="primary")
211
+ cancel_button = gr.Button("Cancel", variant="stop", visible=False)
212
+
213
+ with gr.Tab("Chat with Document"):
214
+ document_text_state = gr.State("")
215
+ chatbot = gr.Chatbot(label="Chat", type="messages")
216
+ text_input = gr.Textbox(placeholder="Type here...")
217
+ clear_btn = gr.Button("Clear")
218
 
219
  # Event handlers
220
  provider_dropdown.change(
 
269
  queue=False # Execute immediately
270
  )
271
 
272
+ file_display.change(
273
+ lambda text: text,
274
+ inputs=[file_display],
275
+ outputs=[document_text_state]
276
+ )
277
+
278
+ text_input.submit(
279
+ fn=chat_with_document,
280
+ inputs=[text_input, chatbot, document_text_state],
281
+ outputs=[chatbot, chatbot]
282
+ )
283
+
284
+ clear_btn.click(
285
+ lambda: ([], []),
286
+ None,
287
+ [chatbot, chatbot]
288
+ )
289
+
290
  return demo
291
 
292