Spaces:
Sleeping
Sleeping
file structure change
Browse files- README.md +36 -0
- src/core/__init__.py +1 -0
- src/{converter.py → core/converter.py} +1 -1
- src/{parser_factory.py → core/parser_factory.py} +2 -2
- src/main.py +1 -1
- src/parsers/__init__.py +6 -4
- src/parsers/docling_parser.py +2 -2
- src/parsers/marker_parser.py +5 -2
- src/{parser_interface.py → parsers/parser_interface.py} +0 -0
- src/{parser_registry.py → parsers/parser_registry.py} +1 -1
- src/parsers/pypdfium_parser.py +4 -3
- src/services/__init__.py +1 -0
- src/{docling_chat.py → services/docling_chat.py} +0 -0
- src/ui/__init__.py +1 -0
- src/{ui.py → ui/ui.py} +3 -3
- tests/__init__.py +1 -0
README.md
CHANGED
@@ -82,3 +82,39 @@ build:
|
|
82 |
## Development
|
83 |
|
84 |
For local development, ensure you have Tesseract OCR installed and the TESSDATA_PREFIX environment variable set correctly.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
## Development
|
83 |
|
84 |
For local development, ensure you have Tesseract OCR installed and the TESSDATA_PREFIX environment variable set correctly.
|
85 |
+
|
86 |
+
## Recommended Folder Structure
|
87 |
+
|
88 |
+
```
|
89 |
+
markit/
|
90 |
+
├── app.py # Main application entry point
|
91 |
+
├── setup.sh # Setup script
|
92 |
+
├── build.sh # Build script
|
93 |
+
├── requirements.txt # Python dependencies
|
94 |
+
├── README.md # Project documentation
|
95 |
+
├── .env # Environment variables
|
96 |
+
├── .gitignore # Git ignore file
|
97 |
+
├── .gitattributes # Git attributes file
|
98 |
+
├── src/ # Source code
|
99 |
+
│   ├── __init__.py # Package initialization
|
100 |
+
│   ├── main.py # Main module
|
101 |
+
│   ├── core/ # Core functionality
|
102 |
+
│   │   ├── __init__.py # Package initialization
|
103 |
+
│   │   ├── converter.py # Document conversion logic
|
104 |
+
│   │   └── parser_factory.py # Parser factory
|
105 |
+
│   ├── parsers/ # Parser implementations
|
106 |
+
│   │   ├── __init__.py # Package initialization
|
107 |
+
│   │   ├── parser_interface.py # Parser interface
|
108 |
+
│   │   ├── parser_registry.py # Parser registry
|
109 |
+
│   │   ├── docling_parser.py # Docling parser
|
110 |
+
│   │   ├── marker_parser.py # Marker parser
|
111 |
+
│   │   └── pypdfium_parser.py # PyPDFium parser
|
112 |
+
│   ├── ui/ # User interface
|
113 |
+
│   │   ├── __init__.py # Package initialization
|
114 |
+
│   │   └── ui.py # Gradio UI implementation
|
115 |
+
│   └── services/ # External services
|
116 |
+
│       ├── __init__.py # Package initialization
|
117 |
+
│       └── docling_chat.py # Chat service
|
118 |
+
└── tests/ # Tests
|
119 |
+
    └── __init__.py # Package initialization
|
120 |
+
```
|
src/core/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
"""Core functionality module for document conversion."""
|
src/{converter.py → core/converter.py}
RENAMED
@@ -5,7 +5,7 @@ import os
|
|
5 |
from pathlib import Path
|
6 |
|
7 |
# Use relative imports instead of absolute imports
|
8 |
-
from parser_factory import ParserFactory
|
9 |
|
10 |
# Import all parsers to ensure they're registered
|
11 |
import parsers
|
|
|
5 |
from pathlib import Path
|
6 |
|
7 |
# Use relative imports instead of absolute imports
|
8 |
+
from src.core.parser_factory import ParserFactory
|
9 |
|
10 |
# Import all parsers to ensure they're registered
|
11 |
import parsers
|
src/{parser_factory.py → core/parser_factory.py}
RENAMED
@@ -4,8 +4,8 @@ import threading
|
|
4 |
import logging
|
5 |
import time
|
6 |
|
7 |
-
from parser_interface import DocumentParser
|
8 |
-
from parser_registry import ParserRegistry
|
9 |
|
10 |
|
11 |
class ParserFactory:
|
|
|
4 |
import logging
|
5 |
import time
|
6 |
|
7 |
+
from src.parsers.parser_interface import DocumentParser
|
8 |
+
from src.parsers.parser_registry import ParserRegistry
|
9 |
|
10 |
|
11 |
class ParserFactory:
|
src/main.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import parsers # Import all parsers to ensure they're registered
|
2 |
|
3 |
-
from ui import launch_ui
|
4 |
|
5 |
|
6 |
def main():
|
|
|
1 |
import parsers # Import all parsers to ensure they're registered
|
2 |
|
3 |
+
from src.ui.ui import launch_ui
|
4 |
|
5 |
|
6 |
def main():
|
src/parsers/__init__.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
from parsers.
|
|
|
|
|
5 |
|
6 |
# You can add new parsers here in the future
|
7 |
|
|
|
1 |
+
"""Parser implementations for document conversion."""
|
2 |
+
|
3 |
+
# Import all parsers to ensure they're registered
|
4 |
+
from src.parsers.docling_parser import DoclingParser
|
5 |
+
from src.parsers.marker_parser import MarkerParser
|
6 |
+
from src.parsers.pypdfium_parser import PyPdfiumParser
|
7 |
|
8 |
# You can add new parsers here in the future
|
9 |
|
src/parsers/docling_parser.py
CHANGED
@@ -2,8 +2,8 @@ from pathlib import Path
|
|
2 |
from typing import Dict, List, Optional, Any, Union
|
3 |
import json
|
4 |
|
5 |
-
from parser_interface import DocumentParser
|
6 |
-
from parser_registry import ParserRegistry
|
7 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
8 |
from docling.datamodel.base_models import InputFormat
|
9 |
from docling.datamodel.pipeline_options import (
|
|
|
2 |
from typing import Dict, List, Optional, Any, Union
|
3 |
import json
|
4 |
|
5 |
+
from src.parsers.parser_interface import DocumentParser
|
6 |
+
from src.parsers.parser_registry import ParserRegistry
|
7 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
8 |
from docling.datamodel.base_models import InputFormat
|
9 |
from docling.datamodel.pipeline_options import (
|
src/parsers/marker_parser.py
CHANGED
@@ -1,9 +1,12 @@
|
|
1 |
from pathlib import Path
|
2 |
from typing import Dict, List, Optional, Any, Union
|
|
|
|
|
|
|
3 |
import json
|
4 |
|
5 |
-
from parser_interface import DocumentParser
|
6 |
-
from parser_registry import ParserRegistry
|
7 |
from marker.converters.pdf import PdfConverter
|
8 |
from marker.models import create_model_dict
|
9 |
from marker.output import text_from_rendered
|
|
|
1 |
from pathlib import Path
|
2 |
from typing import Dict, List, Optional, Any, Union
|
3 |
+
import subprocess
|
4 |
+
import tempfile
|
5 |
+
import os
|
6 |
import json
|
7 |
|
8 |
+
from src.parsers.parser_interface import DocumentParser
|
9 |
+
from src.parsers.parser_registry import ParserRegistry
|
10 |
from marker.converters.pdf import PdfConverter
|
11 |
from marker.models import create_model_dict
|
12 |
from marker.output import text_from_rendered
|
src/{parser_interface.py → parsers/parser_interface.py}
RENAMED
File without changes
|
src/{parser_registry.py → parsers/parser_registry.py}
RENAMED
@@ -1,5 +1,5 @@
|
|
1 |
from typing import Dict, List, Type, Any, Optional
|
2 |
-
from parser_interface import DocumentParser
|
3 |
|
4 |
|
5 |
class ParserRegistry:
|
|
|
1 |
from typing import Dict, List, Type, Any, Optional
|
2 |
+
from src.parsers.parser_interface import DocumentParser
|
3 |
|
4 |
|
5 |
class ParserRegistry:
|
src/parsers/pypdfium_parser.py
CHANGED
@@ -1,8 +1,10 @@
|
|
1 |
from pathlib import Path
|
2 |
from typing import Dict, List, Optional, Any, Union
|
|
|
|
|
3 |
|
4 |
-
from parser_interface import DocumentParser
|
5 |
-
from parser_registry import ParserRegistry
|
6 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
7 |
from docling.datamodel.base_models import InputFormat
|
8 |
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
@@ -63,7 +65,6 @@ class PyPdfiumParser(DocumentParser):
|
|
63 |
# Return the content in the requested format
|
64 |
output_format = kwargs.get("output_format", "markdown")
|
65 |
if output_format.lower() == "json":
|
66 |
-
import json
|
67 |
return json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
|
68 |
elif output_format.lower() == "text":
|
69 |
return doc.export_to_text()
|
|
|
1 |
from pathlib import Path
|
2 |
from typing import Dict, List, Optional, Any, Union
|
3 |
+
import json
|
4 |
+
import pypdfium2 as pdfium
|
5 |
|
6 |
+
from src.parsers.parser_interface import DocumentParser
|
7 |
+
from src.parsers.parser_registry import ParserRegistry
|
8 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
9 |
from docling.datamodel.base_models import InputFormat
|
10 |
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
|
|
65 |
# Return the content in the requested format
|
66 |
output_format = kwargs.get("output_format", "markdown")
|
67 |
if output_format.lower() == "json":
|
|
|
68 |
return json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
|
69 |
elif output_format.lower() == "text":
|
70 |
return doc.export_to_text()
|
src/services/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
"""External services module for the application."""
|
src/{docling_chat.py → services/docling_chat.py}
RENAMED
File without changes
|
src/ui/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
"""User interface module for the application."""
|
src/{ui.py → ui/ui.py}
RENAMED
@@ -3,9 +3,9 @@ import markdown
|
|
3 |
import threading
|
4 |
import time
|
5 |
import logging
|
6 |
-
from converter import convert_file, set_cancellation_flag, is_conversion_in_progress
|
7 |
-
from docling_chat import chat_with_document
|
8 |
-
from parser_registry import ParserRegistry
|
9 |
|
10 |
# Configure logging
|
11 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
3 |
import threading
|
4 |
import time
|
5 |
import logging
|
6 |
+
from src.core.converter import convert_file, set_cancellation_flag, is_conversion_in_progress
|
7 |
+
from src.services.docling_chat import chat_with_document
|
8 |
+
from src.parsers.parser_registry import ParserRegistry
|
9 |
|
10 |
# Configure logging
|
11 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
tests/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
"""Test suite for the application."""
|