AnseMin committed on
Commit
a370b95
·
1 Parent(s): dab47f5

file structure change

Browse files
README.md CHANGED
@@ -82,3 +82,39 @@ build:
82
  ## Development
83
 
84
  For local development, ensure you have Tesseract OCR installed and the TESSDATA_PREFIX environment variable set correctly.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  ## Development
83
 
84
  For local development, ensure you have Tesseract OCR installed and the TESSDATA_PREFIX environment variable set correctly.
85
+
86
+ ## Recommended Folder Structure
87
+
88
+ ```
89
+ markit/
90
+ ├── app.py # Main application entry point
91
+ ├── setup.sh # Setup script
92
+ ├── build.sh # Build script
93
+ ├── requirements.txt # Python dependencies
94
+ ├── README.md # Project documentation
95
+ ├── .env # Environment variables
96
+ ├── .gitignore # Git ignore file
97
+ ├── .gitattributes # Git attributes file
98
+ ├── src/ # Source code
99
+ │   ├── __init__.py # Package initialization
100
+ │   ├── main.py # Main module
101
+ │   ├── core/ # Core functionality
102
+ │   │   ├── __init__.py # Package initialization
103
+ │   │   ├── converter.py # Document conversion logic
104
+ │   │   └── parser_factory.py # Parser factory
105
+ │   ├── parsers/ # Parser implementations
106
+ │   │   ├── __init__.py # Package initialization
107
+ │   │   ├── parser_interface.py # Parser interface
108
+ │   │   ├── parser_registry.py # Parser registry
109
+ │   │   ├── docling_parser.py # Docling parser
110
+ │   │   ├── marker_parser.py # Marker parser
111
+ │   │   └── pypdfium_parser.py # PyPDFium parser
112
+ │   ├── ui/ # User interface
113
+ │   │   ├── __init__.py # Package initialization
114
+ │   │   └── ui.py # Gradio UI implementation
115
+ │   └── services/ # External services
116
+ │       ├── __init__.py # Package initialization
117
+ │       └── docling_chat.py # Chat service
118
+ └── tests/ # Tests
119
+     └── __init__.py # Package initialization
120
+ ```
src/core/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Core functionality module for document conversion."""
src/{converter.py → core/converter.py} RENAMED
@@ -5,7 +5,7 @@ import os
5
  from pathlib import Path
6
 
7
  # Use relative imports instead of absolute imports
8
- from parser_factory import ParserFactory
9
 
10
  # Import all parsers to ensure they're registered
11
  import parsers
 
5
  from pathlib import Path
6
 
7
  # Use relative imports instead of absolute imports
8
+ from src.core.parser_factory import ParserFactory
9
 
10
  # Import all parsers to ensure they're registered
11
  import parsers
src/{parser_factory.py → core/parser_factory.py} RENAMED
@@ -4,8 +4,8 @@ import threading
4
  import logging
5
  import time
6
 
7
- from parser_interface import DocumentParser
8
- from parser_registry import ParserRegistry
9
 
10
 
11
  class ParserFactory:
 
4
  import logging
5
  import time
6
 
7
+ from src.parsers.parser_interface import DocumentParser
8
+ from src.parsers.parser_registry import ParserRegistry
9
 
10
 
11
  class ParserFactory:
src/main.py CHANGED
@@ -1,6 +1,6 @@
1
  import parsers # Import all parsers to ensure they're registered
2
 
3
- from ui import launch_ui
4
 
5
 
6
  def main():
 
1
  import parsers # Import all parsers to ensure they're registered
2
 
3
+ from src.ui.ui import launch_ui
4
 
5
 
6
  def main():
src/parsers/__init__.py CHANGED
@@ -1,7 +1,9 @@
1
- # Import all parsers to ensure they register themselves
2
- from parsers.pypdfium_parser import PyPdfiumParser
3
- from parsers.docling_parser import DoclingParser
4
- from parsers.marker_parser import MarkerParser
 
 
5
 
6
  # You can add new parsers here in the future
7
 
 
1
+ """Parser implementations for document conversion."""
2
+
3
+ # Import all parsers to ensure they're registered
4
+ from src.parsers.docling_parser import DoclingParser
5
+ from src.parsers.marker_parser import MarkerParser
6
+ from src.parsers.pypdfium_parser import PyPdfiumParser
7
 
8
  # You can add new parsers here in the future
9
 
src/parsers/docling_parser.py CHANGED
@@ -2,8 +2,8 @@ from pathlib import Path
2
  from typing import Dict, List, Optional, Any, Union
3
  import json
4
 
5
- from parser_interface import DocumentParser
6
- from parser_registry import ParserRegistry
7
  from docling.document_converter import DocumentConverter, PdfFormatOption
8
  from docling.datamodel.base_models import InputFormat
9
  from docling.datamodel.pipeline_options import (
 
2
  from typing import Dict, List, Optional, Any, Union
3
  import json
4
 
5
+ from src.parsers.parser_interface import DocumentParser
6
+ from src.parsers.parser_registry import ParserRegistry
7
  from docling.document_converter import DocumentConverter, PdfFormatOption
8
  from docling.datamodel.base_models import InputFormat
9
  from docling.datamodel.pipeline_options import (
src/parsers/marker_parser.py CHANGED
@@ -1,9 +1,12 @@
1
  from pathlib import Path
2
  from typing import Dict, List, Optional, Any, Union
 
 
 
3
  import json
4
 
5
- from parser_interface import DocumentParser
6
- from parser_registry import ParserRegistry
7
  from marker.converters.pdf import PdfConverter
8
  from marker.models import create_model_dict
9
  from marker.output import text_from_rendered
 
1
  from pathlib import Path
2
  from typing import Dict, List, Optional, Any, Union
3
+ import subprocess
4
+ import tempfile
5
+ import os
6
  import json
7
 
8
+ from src.parsers.parser_interface import DocumentParser
9
+ from src.parsers.parser_registry import ParserRegistry
10
  from marker.converters.pdf import PdfConverter
11
  from marker.models import create_model_dict
12
  from marker.output import text_from_rendered
src/{parser_interface.py → parsers/parser_interface.py} RENAMED
File without changes
src/{parser_registry.py → parsers/parser_registry.py} RENAMED
@@ -1,5 +1,5 @@
1
  from typing import Dict, List, Type, Any, Optional
2
- from parser_interface import DocumentParser
3
 
4
 
5
  class ParserRegistry:
 
1
  from typing import Dict, List, Type, Any, Optional
2
+ from src.parsers.parser_interface import DocumentParser
3
 
4
 
5
  class ParserRegistry:
src/parsers/pypdfium_parser.py CHANGED
@@ -1,8 +1,10 @@
1
  from pathlib import Path
2
  from typing import Dict, List, Optional, Any, Union
 
 
3
 
4
- from parser_interface import DocumentParser
5
- from parser_registry import ParserRegistry
6
  from docling.document_converter import DocumentConverter, PdfFormatOption
7
  from docling.datamodel.base_models import InputFormat
8
  from docling.datamodel.pipeline_options import PdfPipelineOptions
@@ -63,7 +65,6 @@ class PyPdfiumParser(DocumentParser):
63
  # Return the content in the requested format
64
  output_format = kwargs.get("output_format", "markdown")
65
  if output_format.lower() == "json":
66
- import json
67
  return json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
68
  elif output_format.lower() == "text":
69
  return doc.export_to_text()
 
1
  from pathlib import Path
2
  from typing import Dict, List, Optional, Any, Union
3
+ import json
4
+ import pypdfium2 as pdfium
5
 
6
+ from src.parsers.parser_interface import DocumentParser
7
+ from src.parsers.parser_registry import ParserRegistry
8
  from docling.document_converter import DocumentConverter, PdfFormatOption
9
  from docling.datamodel.base_models import InputFormat
10
  from docling.datamodel.pipeline_options import PdfPipelineOptions
 
65
  # Return the content in the requested format
66
  output_format = kwargs.get("output_format", "markdown")
67
  if output_format.lower() == "json":
 
68
  return json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
69
  elif output_format.lower() == "text":
70
  return doc.export_to_text()
src/services/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """External services module for the application."""
src/{docling_chat.py → services/docling_chat.py} RENAMED
File without changes
src/ui/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """User interface module for the application."""
src/{ui.py → ui/ui.py} RENAMED
@@ -3,9 +3,9 @@ import markdown
3
  import threading
4
  import time
5
  import logging
6
- from converter import convert_file, set_cancellation_flag, is_conversion_in_progress
7
- from docling_chat import chat_with_document
8
- from parser_registry import ParserRegistry
9
 
10
  # Configure logging
11
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
3
  import threading
4
  import time
5
  import logging
6
+ from src.core.converter import convert_file, set_cancellation_flag, is_conversion_in_progress
7
+ from src.services.docling_chat import chat_with_document
8
+ from src.parsers.parser_registry import ParserRegistry
9
 
10
  # Configure logging
11
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
tests/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Test suite for the application."""