Upload folder using huggingface_hub

- .DS_Store +0 -0
- .gitignore +16 -0
- README.md +81 -7
- gradio_app.py +221 -0
- install_requirement.sh +5 -0
- process/__init__.py +2 -0
- process/process_excel.py +215 -0
- process/process_pdf_with_headers.py +251 -0
- process_invoice.py +219 -0
- requirements.txt +30 -0
- src/__init__.py +4 -0
- src/docx_to_pdf.py +34 -0
- src/excel_to_pdf.py +246 -0
- src/txt_to_pdf.py +43 -0
.DS_Store
ADDED
Binary file (6.15 kB)
.gitignore
ADDED
@@ -0,0 +1,16 @@
.env
result/
*.pdf
*.xlsx
*.xls
*.doc
*.docx
expiry_invoice/
ignore_code/
test.ipynb
__pycache__/
content/
.gradio/
*.json
invoiceprocessing/
invoiceprocessing/*
README.md
CHANGED
@@ -1,12 +1,86 @@
 ---
-title:
-
-colorFrom: red
-colorTo: red
+title: expiryprocess
+app_file: gradio_app.py
 sdk: gradio
 sdk_version: 5.20.1
-app_file: app.py
-pinned: false
 ---
-
+
+# Invoice Processing System with Gradio UI
+
+This system processes invoice files (PDF, Excel, Word, Text) and extracts structured data using a combination of OCR, regex patterns, and LLM-based extraction. The extracted data can be downloaded as CSV.
+
+## Features
+
+- **Multiple File Formats**: Supports PDF, Excel (.xlsx, .xls), Word (.doc, .docx), and Text (.txt) files
+- **Document Conversion**: Automatically converts Word and Text files to PDF for processing
+- **LLM-Enhanced Extraction**: Uses Google's Generative AI for improved extraction accuracy (optional)
+- **Web Interface**: Easy-to-use Gradio UI for uploading files and downloading results
+- **CSV Export**: Download extracted data as CSV for further analysis
+
+## Installation
+
+1. Clone this repository:
+```bash
+git clone <repository-url>
+cd invoice-processing-system
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+3. Set up environment variables:
+   - Create a `.env` file in the project root
+   - Add your API keys for LLM processing (the PDF and Excel extractors read `GEMINI_API_KEY`, while the entry-point script reads `GOOGLE_API_KEY`, so set both):
+```
+GOOGLE_API_KEY=your_api_key_here
+GEMINI_API_KEY=your_api_key_here
+```
+
+## Usage
+
+### Web Interface (Gradio UI)
+
+1. Start the Gradio web interface:
+```bash
+python gradio_app.py
+```
+
+2. Open your browser and navigate to the URL shown in the terminal (typically http://127.0.0.1:7860)
+
+3. Upload an invoice file using the file upload button
+
+4. Click "Process Invoice" to extract data from the file
+
+5. View the extracted data in the table and download it as CSV using the download button
+
+### Command Line Interface
+
+You can also use the command line interface:
+
+```bash
+# Process a file; the LLM is used automatically when an API key is configured
+python process_invoice.py --file_path path/to/invoice.pdf
+```
+
+## Requirements
+
+- Python 3.8+
+- Google Gemini API key (for LLM-enhanced extraction)
+- LibreOffice (for converting .doc/.docx files to PDF)
+- Tesseract OCR (for PDF processing)
+
+## Troubleshooting
+
+- **LLM Processing Not Available**: Ensure `GOOGLE_API_KEY` and `GEMINI_API_KEY` are correctly set in the `.env` file
+- **PDF Conversion Issues**: Make sure LibreOffice is installed and accessible in your PATH
+- **OCR Quality Issues**: Ensure Tesseract OCR is properly installed and configured
+
+## License
+
+[MIT License](LICENSE)
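Beyond the CLI documented above, the processing entry point can also be called directly from Python. A minimal sketch, assuming the repository root is the working directory, the requirements are installed, and the API keys above are set; the input path is hypothetical:

```python
# Minimal programmatic sketch of the process_file entry point added in this
# commit; "expiry_invoice/sample.pdf" is a hypothetical input path.
from process_invoice import process_file

json_path = process_file("expiry_invoice/sample.pdf")
if json_path:
    print(f"Structured data written to {json_path}")  # e.g. result/sample.json
```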
gradio_app.py
ADDED
@@ -0,0 +1,221 @@
#!/usr/bin/env python3
"""
Gradio web interface for the invoice processing system.
This UI lets users upload invoice files (PDF, DOCX, TXT, etc.) and download the results as CSV.
"""

import os
import csv
import tempfile
import logging
import pandas as pd
from pathlib import Path
from typing import Dict, Optional, Tuple

import gradio as gr
from gradio_pdf import PDF  # Enhanced PDF viewer component
from dotenv import load_dotenv

# Import the invoice processing functionality
from process_invoice import process_file, setup_google_client

# Load environment variables
load_dotenv()

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Check whether the Google API is available
GOOGLE_API_AVAILABLE = setup_google_client() is not None

def convert_to_csv(invoice_data: Dict) -> str:
    """
    Convert invoice data to CSV format.

    Args:
        invoice_data: Dictionary containing invoice data

    Returns:
        Path to the generated CSV file
    """
    # Create a temporary file for the CSV
    fd, temp_csv_path = tempfile.mkstemp(suffix='.csv')
    os.close(fd)

    # Extract items from the invoice data
    items = invoice_data.get('items', [])

    if not items:
        logger.warning("No items found in invoice data")
        return temp_csv_path

    # Get all unique keys from all items to use as headers
    all_keys = set()
    for item in items:
        all_keys.update(item.keys())

    # Write to CSV
    with open(temp_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=sorted(all_keys))
        writer.writeheader()
        writer.writerows(items)

    logger.info(f"CSV file created at {temp_csv_path}")
    return temp_csv_path

def process_invoice_file(
    file_obj,
    use_llm: bool = True
) -> Tuple[Dict, str, str, Optional[str], Optional[str]]:
    """
    Process an uploaded invoice file and return the results.

    Args:
        file_obj: The uploaded file (a path string or a file wrapper, depending on the Gradio version)
        use_llm: Whether to use the LLM for processing

    Returns:
        Tuple containing:
        - Dictionary of extracted data
        - HTML table for display
        - Status message
        - Path to CSV file (or None if processing failed)
        - Path to PDF file for display (or None if not a PDF)
    """
    if not file_obj:
        return {}, "", "No file uploaded", None, None

    # Get the file path; gr.File may pass either a path string or a wrapper with .name
    file_path = file_obj if isinstance(file_obj, str) else file_obj.name
    file_ext = os.path.splitext(file_path)[1].lower()

    # Check whether the file format is supported
    supported_formats = ['.pdf', '.xlsx', '.xls', '.doc', '.docx', '.txt']
    if file_ext not in supported_formats:
        return {}, "", f"Unsupported file format: {file_ext}. Supported formats: {', '.join(supported_formats)}", None, None

    # Process the file
    logger.info(f"Processing file: {file_path}")

    # Create a directory for JSON output
    result_dir = Path("result")
    result_dir.mkdir(exist_ok=True)

    # For PDF display
    pdf_path = file_path

    # If the file is not a PDF, convert it to PDF for display
    if file_ext != '.pdf':
        temp_pdf = None
        try:
            if file_ext in ['.xlsx', '.xls']:
                from src.excel_to_pdf import excel_to_pdf, convert_xls_to_xlsx
                if file_ext == '.xls':
                    xlsx_path = convert_xls_to_xlsx(file_path, tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx').name)
                    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf').name
                    pdf_path = excel_to_pdf(xlsx_path, pdf_path=temp_pdf)
                else:
                    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf').name
                    pdf_path = excel_to_pdf(file_path, pdf_path=temp_pdf)
            elif file_ext in ['.doc', '.docx']:
                from src.docx_to_pdf import docx_to_pdf
                temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf').name
                pdf_path = docx_to_pdf(file_path, temp_pdf)
            elif file_ext == '.txt':
                from src.txt_to_pdf import txt_to_pdf
                temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf').name
                pdf_path = txt_to_pdf(file_path, temp_pdf)

            logger.info(f"Converted {file_ext} file to PDF: {pdf_path}")
        except Exception as e:
            logger.error(f"Error converting file to PDF: {str(e)}")
            pdf_path = None

    json_path = process_file(file_path)

    # Try to read the JSON file that was created
    if json_path and os.path.exists(json_path):
        import json
        with open(json_path, 'r', encoding='utf-8') as f:
            invoice_data = json.load(f)
    else:
        return {}, "", "Failed to process file. No output data found.", None, pdf_path

    # Create a table for display
    items = invoice_data.get('items', [])
    if invoice_data.get('error'):
        html_table = f"<p class='error' style='color: red; font-weight: bold;'>{invoice_data['error']}</p>"
        status = f"Error: {invoice_data['error']}"
        # Still create a CSV with any available items
        csv_path = convert_to_csv(invoice_data)
        return invoice_data, html_table, status, csv_path, pdf_path
    elif items:
        df = pd.DataFrame(items)
        html_table = df.to_html(classes='table table-striped')
        status = f"Successfully processed {len(items)} items from {os.path.basename(file_path)}"
        # Convert to CSV
        csv_path = convert_to_csv(invoice_data)
    else:
        html_table = "<p>No items found in the invoice</p>"
        status = "No items extracted from the file"
        # Create an empty CSV
        csv_path = convert_to_csv({"items": []})

    return invoice_data, html_table, status, csv_path, pdf_path


def create_ui() -> gr.Blocks:
    """Create and return the Gradio UI."""
    with gr.Blocks(title="Invoice Processing System") as app:
        gr.Markdown("# Invoice Processing System")
        gr.Markdown("Upload an invoice file (PDF, Excel, Word, or Text) to extract and download the data as CSV.")

        with gr.Row():
            with gr.Column(scale=1):
                file_input = gr.File(label="Upload Invoice File")
                process_button = gr.Button("Process Invoice", variant="primary")
                status_output = gr.Textbox(label="Status", interactive=False)
                csv_output = gr.File(label="Download CSV", interactive=False)

            with gr.Column(scale=2):
                with gr.Tabs():
                    with gr.TabItem("Extracted Data"):
                        results_html = gr.HTML(label="Extracted Data")
                    with gr.TabItem("PDF View"):
                        # Use the enhanced PDF component from gradio_pdf
                        pdf_viewer = PDF(label="Invoice PDF", interactive=False)

        # Define the process flow
        process_button.click(
            fn=process_invoice_file,
            inputs=[file_input],
            outputs=[gr.State(), results_html, status_output, csv_output, pdf_viewer]
        )

        # Add examples if available
        example_dir = Path("examples")
        if example_dir.exists():
            example_files = list(example_dir.glob("*.pdf")) + list(example_dir.glob("*.xlsx"))
            if example_files:
                gr.Examples(
                    examples=[[str(f)] for f in example_files],
                    inputs=[file_input]
                )

    return app

def main():
    """Launch the Gradio app."""
    app = create_ui()
    app.launch(
        server_name="0.0.0.0",  # Make accessible from other machines
        share=True,             # Create a public link
        inbrowser=True          # Open in browser
    )

if __name__ == "__main__":
    main()
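A small sketch of the `convert_to_csv` contract with made-up invoice data: it writes a temporary CSV whose header row is the sorted union of all item keys. Importing `gradio_app` pulls in the Gradio dependencies, so this assumes the listed requirements are installed:

```python
# Sketch of convert_to_csv with made-up invoice data.
from gradio_app import convert_to_csv

sample = {"items": [
    {"product_name": "Paracetamol 500mg", "batch_number": "B123",
     "expiry_date": "09/25", "mrp": "35.00", "quantity": 10},
]}
csv_path = convert_to_csv(sample)
with open(csv_path, encoding="utf-8") as f:
    print(f.read())  # header: batch_number,expiry_date,mrp,product_name,quantity
```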
install_requirement.sh
ADDED
@@ -0,0 +1,5 @@
sudo apt-get update
sudo apt-get install -y poppler-utils
sudo apt-get install -y tesseract-ocr  # needed by pytesseract (see README requirements)
sudo apt-get install -y libreoffice
sudo apt-get install -y python3-pip
pip install -r requirements.txt
process/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .process_pdf_with_headers import InvoiceItem, InvoiceData
from .process_excel import process_excel_file  # Entry point for processing Excel files
process/process_excel.py
ADDED
@@ -0,0 +1,215 @@
import pandas as pd
import os
import json
import re
import concurrent.futures
from dotenv import load_dotenv
from google import genai
from typing import List, Dict, Any, Tuple
import logging
from pathlib import Path

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def setup_environment() -> None:
    """Load environment variables from the .env file."""
    load_dotenv()


def get_gemini_client() -> genai.Client:
    """
    Initialize and return a Gemini API client.

    Returns:
        genai.Client: Configured Gemini client
    """
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise ValueError("GEMINI_API_KEY environment variable not set")
    return genai.Client(api_key=api_key)


def process_chunk(chunk_info: Tuple[int, pd.DataFrame, int, int], client: genai.Client) -> List[Dict[str, Any]]:
    """
    Process a single chunk of data using the Gemini API.

    Args:
        chunk_info: Tuple containing (chunk_index, dataframe_chunk, start_index, end_index)
        client: Gemini API client

    Returns:
        List of extracted items from the chunk
    """
    i, chunk_df, start_idx, end_idx = chunk_info

    # Create a structured extraction prompt for this chunk
    extraction_prompt = f"""
    Extract product information from rows {start_idx} to {end_idx-1} in this Excel data.

    For each product row, extract:
    1. Product name
    2. Batch number
    3. Expiry date (MM/YY format)
    4. MRP (Maximum Retail Price)
    5. Quantity (as integer)

    Return ONLY a JSON array of objects, one for each product, with these properties:
    [
      {{
        "product_name": "...",
        "batch_number": "...",
        "expiry_date": "...",
        "mrp": "...",
        "quantity": ...
      }},
      ...
    ]

    Use null for any value you cannot extract. Return ONLY the JSON array.
    """

    chunk_items = []

    # Process the chunk
    try:
        chunk_response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[extraction_prompt, chunk_df.to_string()],
            config={
                'response_mime_type': 'application/json',
                'temperature': 0.1,
                'max_output_tokens': 8192,
            }
        )

        # Extract items
        chunk_text = chunk_response.text
        # Fix common JSON issues
        chunk_text = re.sub(r'[\n\r\t]', '', chunk_text)
        chunk_text = re.sub(r',\s*]', ']', chunk_text)

        # Extract the JSON array
        match = re.search(r'\[(.*)\]', chunk_text, re.DOTALL)
        if match:
            try:
                chunk_items = json.loads('[' + match.group(1) + ']')
                logger.info(f"Successfully processed chunk {i+1} with {len(chunk_items)} items")
            except json.JSONDecodeError:
                logger.error(f"Error parsing JSON in chunk {i+1}")

    except Exception as e:
        logger.error(f"Error processing chunk {i+1}: {str(e)}")

    return chunk_items


def prepare_chunks(df: pd.DataFrame, chunk_size: int) -> List[Tuple[int, pd.DataFrame, int, int]]:
    """
    Prepare dataframe chunks for processing.

    Args:
        df: Input dataframe
        chunk_size: Size of each chunk

    Returns:
        List of chunk information tuples
    """
    num_chunks = (len(df) + chunk_size - 1) // chunk_size
    chunks_to_process = []

    for i in range(num_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, len(df))
        chunk_df = df.iloc[start_idx:end_idx]
        chunks_to_process.append((i, chunk_df, start_idx, end_idx))

    return chunks_to_process


def process_excel_file(file_path: str, output_path: str, chunk_size: int = 20, max_workers: int = 2) -> Dict[str, Any]:
    """
    Process an Excel file to extract product information using the Gemini API.

    Args:
        file_path: Path to the Excel file
        output_path: Path to save the extracted data
        chunk_size: Size of each chunk for processing
        max_workers: Maximum number of parallel workers

    Returns:
        Dict containing the extraction results
    """
    # Set up the environment
    setup_environment()
    client = get_gemini_client()

    # Read the Excel file
    logger.info(f"Reading Excel file: {file_path}")
    df = pd.read_excel(file_path)

    # Prepare chunks for processing
    chunks_to_process = prepare_chunks(df, chunk_size)
    num_chunks = len(chunks_to_process)

    # Process chunks in parallel
    logger.info(f"Processing {num_chunks} chunks with {max_workers} workers")
    all_items = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Pass the client to each process_chunk call
        results = list(executor.map(
            lambda chunk: process_chunk(chunk, client),
            chunks_to_process
        ))

    # Combine results
    for chunk_items in results:
        all_items.extend(chunk_items)

    # Create the final result
    final_result = {
        "items": all_items,
        "extraction_status": "COMPLETE" if all_items else "INCOMPLETE",
        "total_items": len(all_items)
    }

    # Save the final result
    with open(output_path, "w") as f:
        json.dump(final_result, f, indent=2)

    logger.info(f"Extraction complete. Total items extracted: {len(all_items)}")
    return final_result


def main() -> None:
    """Run the Excel processing script on a sample file."""
    input_file = 'expiry_invoice/SAC01000975.xls'
    output_file = "extracted_invoice_data.json"

    # Ensure the output directory exists
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Process the Excel file
    result = process_excel_file(
        file_path=input_file,
        output_path=output_file,
        chunk_size=20,
        max_workers=2
    )

    print(f"Extraction complete. Total items extracted: {result['total_items']}")


if __name__ == "__main__":
    main()
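The chunking arithmetic in `prepare_chunks` can be checked standalone with a toy dataframe (importing the module requires the google-genai package to be installed):

```python
# Toy check of the chunk boundaries produced by prepare_chunks.
import pandas as pd
from process.process_excel import prepare_chunks

df = pd.DataFrame({"product": [f"item-{i}" for i in range(45)]})
for idx, chunk_df, start, end in prepare_chunks(df, chunk_size=20):
    print(idx, start, end, len(chunk_df))
# 45 rows with chunk_size=20 -> (0, 0, 20, 20), (1, 20, 40, 20), (2, 40, 45, 5)
```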
process/process_pdf_with_headers.py
ADDED
@@ -0,0 +1,251 @@
from google import genai
from pydantic import BaseModel, Field
from typing import List, Tuple
import pdf2image
import os
from pathlib import Path
import concurrent.futures
from dataclasses import dataclass
import logging
from PIL import Image
from dotenv import load_dotenv


load_dotenv()


class InvoiceItem(BaseModel):
    """Represents a single item in an invoice."""
    product_name: str = Field(description="The name of the product")
    batch_number: str = Field(description="The batch number of the product")
    expiry_date: str = Field(description="The expiry date (format: MM/YY)")
    mrp: str = Field(description="Maximum Retail Price")
    quantity: int = Field(description="Product quantity")

class InvoiceData(BaseModel):
    """Represents the complete invoice data, including headers."""
    headers: List[str] = Field(
        description="Column headers from the invoice table",
        default_factory=list
    )
    items: List[InvoiceItem] = Field(
        description="List of extracted invoice items",
        default_factory=list
    )

class HeaderExtraction(BaseModel):
    """Model for extracting headers separately."""
    headers: List[str] = Field(
        description="The column headers found in the invoice table"
    )

@dataclass
class PageData:
    """Container for page processing data."""
    idx: int
    image_path: str
    headers: List[str]
    items: List[InvoiceItem]

def extract_headers(client: genai.Client, image_path: str, model_id: str) -> List[str]:
    """
    Extract column headers from the first page of the invoice.

    Args:
        client: The Gemini API client
        image_path: Path to the image file
        model_id: The model ID to use for extraction

    Returns:
        List of column headers
    """
    header_prompt = """
    Extract only the column headers from this invoice table.
    Return them exactly as they appear, maintaining their order from left to right.
    Only extract the headers, not any data from the rows.
    """

    image_file = client.files.upload(
        file=image_path,
        config={'display_name': 'invoice_header_page'}
    )

    response = client.models.generate_content(
        model=model_id,
        contents=[header_prompt, image_file],
        config={
            'response_mime_type': 'application/json',
            'response_schema': HeaderExtraction
        }
    )

    return response.parsed.headers if response.parsed else []

def setup_client() -> genai.Client:
    """Create and return a Gemini API client."""
    return genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

def save_image(image: Image.Image, temp_dir: Path, idx: int) -> str:
    """
    Save a single page image to disk.

    Args:
        image: The PDF page image (PIL Image)
        temp_dir: Directory to save the image
        idx: Page index

    Returns:
        Path to the saved image
    """
    image_path = str(temp_dir / f"page_{idx+1}.jpg")
    image.save(image_path, "JPEG")
    return image_path

def process_single_page(
    page_data: Tuple[int, Image.Image, Path, List[str], genai.Client, str]
) -> PageData:
    """
    Process a single page of the PDF.

    Args:
        page_data: Tuple containing (page_index, page_image, temp_dir, headers, client, model_id)

    Returns:
        PageData object containing the extracted information
    """
    idx, image, temp_dir, headers, client, model_id = page_data

    # Save the image
    image_path = save_image(image, temp_dir, idx)

    # First page: extract headers
    if idx == 0:
        headers = extract_headers(client, image_path, model_id)
        prompt = """
        Extract product details from this invoice table.
        Use the exact column headers you see in the table.
        """
    else:
        headers_str = ", ".join(headers)
        prompt = f"""
        Extract product details from this invoice table.
        This is page {idx + 1} of the same invoice.
        Use these column headers: {headers_str}
        Ensure the extracted data aligns with these columns in order.
        """

    # Process the image
    image_file = client.files.upload(
        file=image_path,
        config={'display_name': f'invoice_page_{idx+1}'}
    )

    response = client.models.generate_content(
        model=model_id,
        contents=[prompt, image_file],
        config={
            'response_mime_type': 'application/json',
            'response_schema': InvoiceData
        }
    )

    items = response.parsed.items if response.parsed and response.parsed.items else []
    return PageData(idx=idx, image_path=image_path, headers=headers, items=items)

def process_pdf_with_headers(pdf_path: str, max_workers: int = 3) -> InvoiceData:
    """
    Process a PDF invoice while preserving column header context, using parallel processing.

    Args:
        pdf_path: Path to the PDF file
        max_workers: Maximum number of concurrent workers

    Returns:
        InvoiceData object containing headers and extracted items
    """
    # Convert PDF pages to images
    images = pdf2image.convert_from_path(pdf_path)

    # Create a temp directory
    temp_dir = Path("content/temp")
    temp_dir.mkdir(parents=True, exist_ok=True)

    # Initialize shared resources
    client = setup_client()
    model_id = "gemini-2.0-flash"
    headers: List[str] = []

    try:
        # Process the first page separately to get the headers
        first_page = process_single_page((0, images[0], temp_dir, headers, client, model_id))
        headers = first_page.headers
        all_items = first_page.items

        # Prepare the remaining pages for parallel processing
        remaining_pages = [
            (i, img, temp_dir, headers, client, model_id)
            for i, img in enumerate(images[1:], start=1)
        ]

        # Process the remaining pages in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_page = {
                executor.submit(process_single_page, page): page[0]
                for page in remaining_pages
            }

            # Collect results as they complete
            for future in concurrent.futures.as_completed(future_to_page):
                page_idx = future_to_page[future]
                try:
                    page_result = future.result()
                    all_items.extend(page_result.items)
                except Exception as e:
                    logging.error(f"Error processing page {page_idx}: {str(e)}")

    finally:
        # Clean up temporary files
        for file in temp_dir.glob("*.jpg"):
            try:
                file.unlink()
            except Exception as e:
                logging.warning(f"Failed to delete temporary file {file}: {str(e)}")

    return InvoiceData(headers=headers, items=all_items)

def main():
    """Demonstrate usage on a sample invoice."""
    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    try:
        invoice_data = process_pdf_with_headers(
            "/Users/krishnaadithya/Desktop/dev/invoice_processing_2.0/pdf_only/expiry_invoice/DR REDDYS PE 1194.pdf",
            max_workers=3  # Adjust based on your system and API limits
        )

        # Print headers
        print("Column Headers:", ", ".join(invoice_data.headers))
        print("\nExtracted Items:")

        # Print results
        for item in invoice_data.items:
            print(f"Product: {item.product_name}")
            print(f"Batch: {item.batch_number}")
            print(f"Expiry: {item.expiry_date}")
            print(f"MRP: {item.mrp}")
            print(f"Quantity: {item.quantity}")
            print("-" * 50)

    except Exception as e:
        logging.error(f"Error processing invoice: {str(e)}")

if __name__ == "__main__":
    main()
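The Pydantic models above define the response schema handed to Gemini; a quick round-trip with made-up values, assuming Pydantic v2 (which provides `model_dump_json`):

```python
# Round-trip of the InvoiceItem/InvoiceData schema with made-up values.
from process.process_pdf_with_headers import InvoiceItem, InvoiceData

item = InvoiceItem(product_name="Amoxicillin 250mg", batch_number="AMX42",
                   expiry_date="11/26", mrp="120.00", quantity=5)
data = InvoiceData(headers=["Product", "Batch", "Expiry", "MRP", "Qty"], items=[item])
print(data.model_dump_json(indent=2))  # Pydantic v2 API
```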
process_invoice.py
ADDED
@@ -0,0 +1,219 @@
#!/usr/bin/env python3
"""
Unified invoice processing script that handles PDF, Excel, and document files.
"""

import os
import sys
# Add the project root directory to the Python path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import json
import logging
from typing import Optional
import argparse
import tempfile
from dotenv import load_dotenv

# Import document processing functions
from process.process_pdf_with_headers import process_pdf_with_headers
from process.process_excel import process_excel_file
from src.excel_to_pdf import excel_to_pdf, convert_xls_to_xlsx
from src.docx_to_pdf import docx_to_pdf
from src.txt_to_pdf import txt_to_pdf

# Load environment variables from a .env file if it exists
load_dotenv()

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

def setup_google_client():
    """Set up and return the Google Generative AI client, or None if unavailable."""
    try:
        from google import genai
        api_key = os.environ.get("GOOGLE_API_KEY")
        if not api_key:
            logger.warning("GOOGLE_API_KEY environment variable not set. PDF processing with the LLM will not be available.")
            return None

        return genai.Client(api_key=api_key)
    except ImportError:
        logger.warning("google-genai package not installed. PDF processing with the LLM will not be available.")
        return None
    except Exception as e:
        logger.error(f"Error setting up Google client: {str(e)}")
        return None

def save_to_json(invoice_data, input_file_path: str) -> str:
    """
    Save the invoice data to a JSON file in the 'result' directory.

    Args:
        invoice_data: The invoice data to save (a dictionary or a Pydantic object)
        input_file_path: The path to the input file

    Returns:
        The path to the saved JSON file
    """
    # Create the result directory if it doesn't exist
    result_dir = "result"
    os.makedirs(result_dir, exist_ok=True)

    # Get the base filename without extension
    base_filename = os.path.splitext(os.path.basename(input_file_path))[0]

    # Create the output JSON file path
    output_file_path = os.path.join(result_dir, f"{base_filename}.json")

    # Convert the invoice data to a JSON-serializable format
    if isinstance(invoice_data, dict):
        # It's already a dictionary; items are assumed serializable
        json_data = invoice_data
    else:
        # It's an object; convert it to a dictionary
        json_data = {
            "headers": invoice_data.headers if hasattr(invoice_data, 'headers') else [],
            "items": [item.model_dump() if hasattr(item, 'model_dump') else item.dict()
                      for item in invoice_data.items]
        }

    # Write to the JSON file
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, indent=2, ensure_ascii=False)

    logger.info(f"Saved invoice data to {output_file_path}")
    return output_file_path

def process_file(file_path: str) -> Optional[str]:
    """
    Process an invoice file (PDF, Excel, or document) and print the extracted data.

    Args:
        file_path: Path to the invoice file

    Returns:
        Path to the saved JSON file, or None if processing failed
    """
    file_path = os.path.abspath(file_path)
    if not os.path.exists(file_path):
        logger.error(f"File not found: {file_path}")
        return None

    file_ext = os.path.splitext(file_path)[1].lower()

    llm_client = setup_google_client()

    temp_pdf_path = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf').name

    if file_ext in ['.xlsx', '.xls']:
        # Process an Excel file.
        # For .xls files, convert to .xlsx format first.
        if file_ext == '.xls':
            xlsx_path = convert_xls_to_xlsx(file_path)
            file_path = xlsx_path

        # Create the output JSON path
        output_json_path = os.path.join("result", f"{os.path.splitext(os.path.basename(file_path))[0]}.json")

        result = process_excel_file(
            file_path=file_path,
            output_path=output_json_path,
            chunk_size=20,
            max_workers=2
        )

        # Create the expected invoice_data format
        invoice_data = {
            "headers": ["Product Name", "Batch Number", "Expiry Date", "MRP", "Quantity"],
            "items": result["items"]
        }

    elif file_ext == '.pdf':
        try:
            logger.info(f"Processing PDF file with header context: {file_path}")

            # Process the PDF using process_pdf_with_headers
            invoice_data_obj = process_pdf_with_headers(file_path)

            # Convert the InvoiceData object to the format expected by the rest of the code
            invoice_data = {
                "headers": invoice_data_obj.headers,
                "items": [item.model_dump() if hasattr(item, 'model_dump') else item.dict() for item in invoice_data_obj.items]
            }

        except Exception as e:
            logger.error(f"Error processing PDF with headers: {str(e)}")
            # Fall back to an empty result so downstream code has a consistent shape
            invoice_data = {"headers": [], "items": [], "error": str(e)}

    elif file_ext in ['.doc', '.docx', '.txt']:
        # Process a document file by first converting it to PDF
        if file_ext == '.txt':
            temp_pdf_path = txt_to_pdf(file_path, temp_pdf_path)
            logger.info(f"Converted text file to PDF: {temp_pdf_path}")
        elif file_ext in ['.doc', '.docx']:
            temp_pdf_path = docx_to_pdf(file_path, temp_pdf_path)
            logger.info(f"Converted document file to PDF: {temp_pdf_path}")

        invoice_data_obj = process_pdf_with_headers(temp_pdf_path)

        # Convert the InvoiceData object to the format expected by the rest of the code
        invoice_data = {
            "headers": invoice_data_obj.headers,
            "items": [item.model_dump() if hasattr(item, 'model_dump') else item.dict() for item in invoice_data_obj.items]
        }

    else:
        logger.error(f"Unsupported file format: {file_ext}")
        logger.error("Supported formats: .pdf, .xlsx, .xls, .doc, .docx, .txt")
        return None

    json_path = save_to_json(invoice_data, file_path)
    print(f"Results saved to: {json_path}")

    # Print the results
    if isinstance(invoice_data, dict):
        items = invoice_data.get('items', [])
        print(f"\nExtracted {len(items)} items from {file_path}:")
        for i, item in enumerate(items, 1):
            print(f"\nItem {i}:")
            print(f"  Product: {item.get('product_name', 'N/A')}")
            print(f"  Batch Number: {item.get('batch_number', 'N/A')}")
            print(f"  Expiry: {item.get('expiry_date', 'N/A')}")
            print(f"  MRP: {item.get('mrp', 'N/A')}")
            print(f"  Quantity: {item.get('quantity', 'N/A')}")
    else:
        # It's an object (likely a Pydantic model)
        items = invoice_data.items if hasattr(invoice_data, 'items') else []
        print(f"\nExtracted {len(items)} items from {file_path}:")
        for i, item in enumerate(items, 1):
            print(f"\nItem {i}:")
            print(f"  Product: {getattr(item, 'product_name', 'N/A')}")
            print(f"  Batch Number: {getattr(item, 'batch_number', 'N/A')}")
            print(f"  Expiry: {getattr(item, 'expiry_date', 'N/A')}")
            print(f"  MRP: {getattr(item, 'mrp', 'N/A')}")
            print(f"  Quantity: {getattr(item, 'quantity', 'N/A')}")
    return json_path

def main():
    """Parse arguments and process the given file."""
    parser = argparse.ArgumentParser(description="Process invoice files (PDF, Excel, Word, Text)")
    parser.add_argument("--file_path", required=True, help="Path to the invoice file")

    args = parser.parse_args()

    try:
        process_file(args.file_path)
    except Exception as e:
        logger.error(f"Error processing file: {str(e)}")
        sys.exit(1)

if __name__ == "__main__":
    main()
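`save_to_json` derives the output path from the input file's stem; a minimal illustration, assuming the repo's requirements are installed. The input name is hypothetical and does not need to exist, since only its basename is used:

```python
# Illustrates the result/<stem>.json naming convention of save_to_json.
from process_invoice import save_to_json

path = save_to_json({"headers": [], "items": []}, "expiry_invoice/INV001.xlsx")
print(path)  # result/INV001.json
```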
requirements.txt
ADDED
@@ -0,0 +1,30 @@
# Core dependencies
python-dotenv
pandas
numpy
pydantic

# PDF processing
pdf2image
PyMuPDF
pytesseract
Pillow

# Document processing
python-docx
reportlab
aspose-words

# Excel processing
openpyxl
xlrd
pyexcel
pyexcel-xls
pyexcel-xlsx

# LLM integration
google-genai

# Web UI
gradio
gradio_pdf
src/__init__.py
ADDED
@@ -0,0 +1,4 @@
# Import functions to make them available at the package level
from .excel_to_pdf import excel_to_pdf, convert_xls_to_xlsx
from .docx_to_pdf import docx_to_pdf
from .txt_to_pdf import txt_to_pdf
src/docx_to_pdf.py
ADDED
@@ -0,0 +1,34 @@
import os
import subprocess
import aspose.words as aw

def docx_to_pdf(input_file, output_file=None):
    """Convert a .doc/.docx file to PDF using headless LibreOffice."""
    input_path = os.path.abspath(input_file)
    output_dir = os.path.dirname(input_path)  # LibreOffice writes next to the input

    # Run the LibreOffice command; check=True surfaces conversion failures
    command = ["libreoffice", "--headless", "--convert-to", "pdf", input_path, "--outdir", output_dir]
    subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)

    # LibreOffice always writes <basename>.pdf into output_dir; move the result
    # if the caller asked for a different output path.
    generated = os.path.join(output_dir, os.path.splitext(os.path.basename(input_file))[0] + ".pdf")
    if output_file is None:
        return generated
    if os.path.abspath(output_file) != generated:
        os.replace(generated, output_file)
    return output_file


def docx_to_pdf_(input_file, output_file=None):
    """Alternative converter using Aspose.Words instead of LibreOffice."""
    input_path = os.path.abspath(input_file)
    output_dir = os.path.dirname(input_path)  # Save in the same directory

    # Load the .doc/.docx file
    doc = aw.Document(input_path)

    if output_file is None:
        output_file = os.path.join(output_dir, os.path.splitext(os.path.basename(input_file))[0] + ".pdf")

    doc.save(output_file)

    return output_file
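A usage sketch for the LibreOffice-based converter; it assumes `libreoffice` is on PATH and `invoice.docx` is a hypothetical input file in the working directory:

```python
# Usage sketch for docx_to_pdf; requires LibreOffice on PATH.
from src.docx_to_pdf import docx_to_pdf

pdf = docx_to_pdf("invoice.docx")  # hypothetical input; writes invoice.pdf next to it
print(pdf)
```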
src/excel_to_pdf.py
ADDED
@@ -0,0 +1,246 @@
import os
import math
from openpyxl import load_workbook
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter, A4, A3, landscape, portrait
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, PageBreak
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_LEFT, TA_CENTER
from reportlab.lib.units import inch
import pyexcel as p


def convert_xls_to_xlsx(xls_path, xlsx_path=None):
    """Convert an old .xls file to .xlsx format."""
    if xlsx_path is None:
        xlsx_path = os.path.splitext(xls_path)[0] + '.xlsx'
    p.save_book_as(file_name=xls_path, dest_file_name=xlsx_path)
    return xlsx_path


def determine_page_format(num_columns, max_column_width=None):
    """
    Determine the optimal page size and orientation based on table dimensions.

    Args:
        num_columns (int): Number of columns in the table.
        max_column_width (float, optional): Maximum column width if available.

    Returns:
        tuple: (pagesize, orientation function)
    """
    # Thresholds for the decision
    if num_columns <= 5:
        # Few columns, likely to fit on portrait A4
        return A4, portrait
    elif num_columns <= 8:
        # Medium number of columns, use landscape A4
        return A4, landscape
    elif num_columns <= 12:
        # Many columns, use portrait A3
        return A3, portrait
    else:
        # Lots of columns, use landscape A3
        return A3, landscape


def is_effectively_empty(value):
    """
    Return True if the cell value is considered empty.

    Empty means:
    - The value is None.
    - The value is a float and math.isnan(value) is True.
    - The value is a string that is empty (after stripping whitespace).
    """
    if value is None:
        return True
    if isinstance(value, float) and math.isnan(value):
        return True
    if isinstance(value, str) and not value.strip():
        return True
    return False


def excel_to_pdf(excel_path, pdf_path=None, sheet_name=None, max_rows_per_table=50):
    """
    Convert an Excel file to PDF with an adaptive page size based on content,
    removing columns that contain only NaN (or empty) values.

    Args:
        excel_path (str): Path to the Excel file.
        pdf_path (str, optional): Path for the output PDF file.
        sheet_name (str, optional): Name of the sheet to convert.
        max_rows_per_table (int): Maximum rows per table before splitting.

    Returns:
        str: Path to the created PDF file.
    """
    if excel_path.endswith('.xls'):
        excel_path = convert_xls_to_xlsx(excel_path)

    if pdf_path is None:
        pdf_path = os.path.splitext(excel_path)[0] + '.pdf'

    # Load the Excel file
    wb = load_workbook(excel_path)
    sheets = [sheet_name] if sheet_name else wb.sheetnames

    # Create paragraph styles for cell content
    styles = getSampleStyleSheet()
    header_style = ParagraphStyle(
        name='HeaderStyle',
        parent=styles['Normal'],
        fontName='Helvetica-Bold',
        fontSize=9,
        alignment=TA_CENTER,
        textColor=colors.white,
        leading=12
    )
    cell_style = ParagraphStyle(
        name='CellStyle',
        parent=styles['Normal'],
        fontName='Helvetica',
        fontSize=8,
        alignment=TA_LEFT,
        leading=10  # Line spacing
    )

    elements = []

    # Determine the effective maximum number of columns among all sheets (after filtering out empty ones)
    global_effective_max_columns = 0
    for sh in sheets:
        sheet = wb[sh]
        effective_cols = 0
        for col in range(1, sheet.max_column + 1):
            # Check if any cell in the column is non-empty
            for row in range(1, sheet.max_row + 1):
                if not is_effectively_empty(sheet.cell(row=row, column=col).value):
                    effective_cols += 1
                    break
        global_effective_max_columns = max(global_effective_max_columns, effective_cols)

    # Determine the optimal page format based on the effective column count
    pagesize, orientation_func = determine_page_format(global_effective_max_columns)

    # Create the document with the determined format
    doc = SimpleDocTemplate(
        pdf_path,
        pagesize=orientation_func(pagesize),
        leftMargin=10,
        rightMargin=10,
        topMargin=15,
        bottomMargin=15
    )

    # Process each sheet
    for sheet_idx, current_sheet in enumerate(sheets):
        sheet = wb[current_sheet]

        # Determine which columns to keep (those with at least one non-empty cell)
        columns_to_keep = []
        for col in range(1, sheet.max_column + 1):
            for row in range(1, sheet.max_row + 1):
                if not is_effectively_empty(sheet.cell(row=row, column=col).value):
                    columns_to_keep.append(col)
                    break

        # If no columns have valid data, skip this sheet.
        if not columns_to_keep:
            continue

        # Calculate appropriate column widths (only for kept columns)
        max_col_width = 130  # Maximum column width in points
        min_col_width = 40   # Minimum column width in points
        if pagesize == A3:
            max_col_width = 150  # Allow wider columns on A3

        col_widths = []
        for col in columns_to_keep:
            max_length = 0
            # Sample the first 100 rows for efficiency
            for row in range(1, min(100, sheet.max_row) + 1):
                cell = sheet.cell(row=row, column=col)
                if cell.value:
                    content_length = len(str(cell.value))
                    # Cap the length for width calculation at 30 characters
                    max_length = max(max_length, min(content_length, 30))
            # Adjust the multiplier based on page format (narrower columns for A4, wider for A3)
            multiplier = 5.5 if pagesize == A4 else 6.0
            width = min(max(min_col_width, max_length * multiplier), max_col_width)
            col_widths.append(width)

        # Build the header row from the kept columns
        header_row = []
        # Using row 1 as the header (adjust if your header is in another row)
        for col in columns_to_keep:
            cell_value = sheet.cell(row=1, column=col).value
            header_row.append(Paragraph(str(cell_value or ""), header_style))

        # Process data rows in chunks to avoid huge tables that might get chopped
        row_count = sheet.max_row
        # Start after the header row
        start_row = 2
        while start_row <= row_count:
            end_row = min(start_row + max_rows_per_table - 1, row_count)

            # Create the data for this chunk, starting with the header row
            chunk_data = [header_row]
            for row_idx in range(start_row, end_row + 1):
                data_row = []
                for col in columns_to_keep:
                    cell = sheet.cell(row=row_idx, column=col)
                    cell_value = cell.value or ""
                    data_row.append(Paragraph(str(cell_value), cell_style))
                chunk_data.append(data_row)

            # Create the table for this chunk
            table = Table(chunk_data, colWidths=col_widths, repeatRows=1)

            # Style the table
            table_style = TableStyle([
                # Header styling
                ('BACKGROUND', (0, 0), (-1, 0), colors.darkblue),
                ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
                ('ALIGN', (0, 0), (-1, 0), 'CENTER'),

                # Grid
                ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
                ('VALIGN', (0, 0), (-1, -1), 'TOP'),

                # Row background colors
                ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.lightgrey]),

                # Cell padding
                ('LEFTPADDING', (0, 0), (-1, -1), 3),
                ('RIGHTPADDING', (0, 0), (-1, -1), 3),
                ('TOPPADDING', (0, 0), (-1, -1), 3),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 3)
            ])

            table.setStyle(table_style)
            table.hAlign = 'LEFT'
            table.spaceBefore = 5
            table.spaceAfter = 15

            elements.append(table)

            # Uncomment below to add a continuation note when splitting tables
            # if end_row < row_count:
            #     continuation = Paragraph(f"Table continues... (Rows {start_row}-{end_row} of {row_count})", styles['Italic'])
            #     elements.append(continuation)
            #     elements.append(Spacer(1, 0.2 * inch))

            start_row = end_row + 1

        # Add a page break between sheets (except after the last sheet)
        if sheet_idx < len(sheets) - 1:
            elements.append(PageBreak())

    # Build the PDF
    doc.build(elements)

    return pdf_path
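The page-format thresholds in `determine_page_format` can be spot-checked directly without touching any Excel file:

```python
# Spot-check of determine_page_format's column-count thresholds.
from reportlab.lib.pagesizes import A4
from src.excel_to_pdf import determine_page_format

for cols in (4, 7, 10, 15):
    size, orient = determine_page_format(cols)
    print(cols, "A4" if size == A4 else "A3", orient.__name__)
# Expected: 4 A4 portrait, 7 A4 landscape, 10 A3 portrait, 15 A3 landscape
```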
src/txt_to_pdf.py
ADDED
@@ -0,0 +1,43 @@
import os
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas


def txt_to_pdf(input_txt, output_pdf=None):
    if output_pdf is None:
        output_pdf = os.path.splitext(input_txt)[0] + '.pdf'

    # Read the text file without modifying spacing
    with open(input_txt, "r", encoding="utf-8") as file:
        lines = file.readlines()

    c = canvas.Canvas(output_pdf, pagesize=letter)
    width, height = letter
    left_margin = 10
    top_margin = 10
    bottom_margin = 10
    line_height = 10  # Adjust based on desired spacing

    # Use a text object for more control
    text_object = c.beginText(left_margin, height - top_margin)
    text_object.setFont("Courier", 8)  # Use a monospaced font to keep spacing intact

    for line in lines:
        # Skip the line if it's empty (after stripping all whitespace),
        # then remove the trailing newline while preserving other whitespace
        if not line.strip():
            continue
        line = line.rstrip("\n")
        text_object.textLine(line)

        # Check whether we have reached the bottom margin
        if text_object.getY() < bottom_margin:
            c.drawText(text_object)
            c.showPage()
            text_object = c.beginText(left_margin, height - top_margin)
            text_object.setFont("Courier", 8)

    # Draw any remaining text
    c.drawText(text_object)
    c.save()
    return output_pdf
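A round-trip sketch for `txt_to_pdf`, writing a small sample file first; note that blank lines are intentionally skipped by the converter:

```python
# Round-trip sketch for txt_to_pdf; sample.txt is created on the fly.
from src.txt_to_pdf import txt_to_pdf

with open("sample.txt", "w", encoding="utf-8") as f:
    f.write("INVOICE 001\nProduct   Batch   Expiry\nParacetamol   B1   09/25\n")
print(txt_to_pdf("sample.txt"))  # writes and returns sample.pdf
```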