krishnavadithya committed
Commit aacdfd5 · verified · 1 Parent(s): a37298c

Upload folder using huggingface_hub

.DS_Store ADDED
Binary file (6.15 kB).
 
.gitignore ADDED
@@ -0,0 +1,16 @@
+ .env
+ result/
+ *.pdf
+ *.xlsx
+ *.xls
+ *.doc
+ *.docx
+ expiry_invoice/
+ ignore_code/
+ test.ipynb
+ __pycache__/
+ content/
+ .gradio/
+ *.json
+ invoiceprocessing/
+ invoiceprocessing/*
README.md CHANGED
@@ -1,12 +1,86 @@
---
- title: Expiryprocess
- emoji: 🦀
- colorFrom: red
- colorTo: red
+ title: expiryprocess
+ app_file: gradio_app.py
sdk: gradio
sdk_version: 5.20.1
- app_file: app.py
- pinned: false
---
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Invoice Processing System with Gradio UI
+
+ This system processes invoice files (PDF, Excel, Word, Text) and extracts structured data using a combination of OCR, regex patterns, and LLM-based extraction. The extracted data can be downloaded as CSV.
+
+ ## Features
+
+ - **Multiple File Formats**: Supports PDF, Excel (.xlsx, .xls), Word (.doc, .docx), and Text (.txt) files
+ - **Document Conversion**: Automatically converts Word and Text files to PDF for processing
+ - **LLM-Enhanced Extraction**: Uses Google's Gemini models for improved extraction accuracy (requires an API key)
+ - **Web Interface**: Easy-to-use Gradio UI for uploading files and downloading results
+ - **CSV Export**: Download extracted data as CSV for further analysis
+
+ ## Installation
+
+ 1. Clone this repository:
+ ```bash
+ git clone <repository-url>
+ cd invoice-processing-system
+ ```
+
+ 2. Install dependencies:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ 3. Set up environment variables:
+ - Create a `.env` file in the project root
+ - Add your Gemini API key for LLM processing. The extraction modules read `GEMINI_API_KEY` and the app's availability check reads `GOOGLE_API_KEY`, so set both to the same key:
+ ```
+ GOOGLE_API_KEY=your_api_key_here
+ GEMINI_API_KEY=your_api_key_here
+ ```
+
+ ## Usage
+
+ ### Web Interface (Gradio UI)
+
+ 1. Start the Gradio web interface:
+ ```bash
+ python gradio_app.py
+ ```
+
+ 2. Open your browser and navigate to the URL shown in the terminal (typically http://127.0.0.1:7860)
+
+ 3. Upload an invoice file using the file upload button
+
+ 4. Click "Process Invoice" to extract data from the file
+
+ 5. View the extracted data in the table and download it as CSV using the download button
+
+ ### Command Line Interface
+
+ You can also process a file from the command line:
+
+ ```bash
+ # Process a PDF invoice
+ python process_invoice.py --file_path path/to/invoice.pdf
+
+ # Process an Excel invoice
+ python process_invoice.py --file_path path/to/invoice.xlsx
+
+ # Process a Word document
+ python process_invoice.py --file_path path/to/invoice.docx
+ ```
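+
+ Each run writes a JSON file to the `result/` directory, named after the input file. The field names below match the extraction models in `process/`; the values are illustrative only:
+
+ ```json
+ {
+   "headers": ["Product Name", "Batch Number", "Expiry Date", "MRP", "Quantity"],
+   "items": [
+     {
+       "product_name": "Sample Tablet 500mg",
+       "batch_number": "B1234",
+       "expiry_date": "08/26",
+       "mrp": "120.00",
+       "quantity": 10
+     }
+   ]
+ }
+ ```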
+
+ ## Requirements
+
+ - Python 3.8+
+ - Google Gemini API key (for LLM-based extraction)
+ - LibreOffice (for converting .doc/.docx files to PDF)
+ - Tesseract OCR (for PDF processing)
+
+ ## Troubleshooting
+
+ - **LLM Processing Not Available**: Ensure your API key is set correctly in the `.env` file (both `GOOGLE_API_KEY` and `GEMINI_API_KEY`)
+ - **PDF Conversion Issues**: Make sure LibreOffice is installed and accessible in your PATH
+ - **OCR Quality Issues**: Ensure Tesseract OCR is properly installed and configured
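+
+ ## Programmatic Use
+
+ Both the Gradio app and the CLI delegate to `process_file` in `process_invoice.py`, so you can call it from your own scripts. A minimal sketch (run from the repository root with the environment variables above set; the path is a placeholder):
+
+ ```python
+ import json
+ from process_invoice import process_file
+
+ # Processes the invoice and writes result/<input-name>.json, returning its path
+ json_path = process_file("path/to/invoice.pdf")
+
+ with open(json_path, "r", encoding="utf-8") as f:
+     data = json.load(f)
+ print(f"Extracted {len(data['items'])} items")
+ ```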
+
+ ## License
+
+ [MIT License](LICENSE)
gradio_app.py ADDED
@@ -0,0 +1,221 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Gradio web interface for invoice processing system.
4
+ This UI allows users to upload invoice files (PDF, DOCX, TXT, etc.) and download the results as CSV.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import csv
10
+ import tempfile
11
+ import logging
12
+ import pandas as pd
13
+ from pathlib import Path
14
+ from typing import Dict, List, Optional, Tuple, Union
15
+
16
+ import gradio as gr
17
+ from gradio_pdf import PDF # Import the enhanced PDF component
18
+ from dotenv import load_dotenv
19
+
20
+ # Import the invoice processing functionality
21
+ from process_invoice import process_file, setup_google_client
22
+ # Load environment variables
23
+ load_dotenv()
24
+
25
+ # Configure logging
26
+ logging.basicConfig(
27
+ level=logging.INFO,
28
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
29
+ )
30
+ logger = logging.getLogger(__name__)
31
+
32
+ # Check if Google API is available
33
+ GOOGLE_API_AVAILABLE = setup_google_client() is not None
34
+
35
+ def convert_to_csv(invoice_data: Dict) -> str:
36
+ """
37
+ Convert invoice data to CSV format.
38
+
39
+ Args:
40
+ invoice_data: Dictionary containing invoice data
41
+
42
+ Returns:
43
+ Path to the generated CSV file
44
+ """
45
+ # Create a temporary file for the CSV
46
+ fd, temp_csv_path = tempfile.mkstemp(suffix='.csv')
47
+ os.close(fd)
48
+
49
+ # Extract items from invoice data
50
+ items = invoice_data.get('items', [])
51
+
52
+ if not items:
53
+ logger.warning("No items found in invoice data")
54
+ return temp_csv_path
55
+
56
+ # Get all unique keys from all items to use as headers
57
+ all_keys = set()
58
+ for item in items:
59
+ all_keys.update(item.keys())
60
+
61
+ # Write to CSV
62
+ with open(temp_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
63
+ writer = csv.DictWriter(csvfile, fieldnames=sorted(all_keys))
64
+ writer.writeheader()
65
+ writer.writerows(items)
66
+
67
+ logger.info(f"CSV file created at {temp_csv_path}")
68
+ return temp_csv_path
69
+
70
+ def process_invoice_file(
71
+ file_obj: tempfile._TemporaryFileWrapper,
72
+ use_llm: bool = True
73
+ ) -> Tuple[Dict, str, str, Optional[str], Optional[str]]:
74
+ """
75
+ Process an uploaded invoice file and return the results.
76
+
77
+ Args:
78
+ file_obj: The uploaded file object
79
+ use_llm: Whether to use LLM for processing
80
+
81
+ Returns:
82
+ Tuple containing:
83
+ - Dictionary of extracted data
84
+ - HTML table for display
85
+ - Status message
86
+ - Path to CSV file (or None if processing failed)
87
+ - Path to PDF file for display (or None if not a PDF)
88
+ """
89
+ if not file_obj:
90
+ return {}, "", "No file uploaded", None, None
91
+
92
+ # Get the file extension
93
+ file_path = file_obj.name
94
+ file_ext = os.path.splitext(file_path)[1].lower()
95
+
96
+ # Check if file format is supported
97
+ supported_formats = ['.pdf', '.xlsx', '.xls', '.doc', '.docx', '.txt']
98
+ if file_ext not in supported_formats:
99
+ return {}, "", f"Unsupported file format: {file_ext}. Supported formats: {', '.join(supported_formats)}", None, None
100
+
101
+ # Process the file
102
+ logger.info(f"Processing file: {file_path}")
103
+
104
+ # Create a temporary directory for JSON output
105
+ result_dir = Path("result")
106
+ result_dir.mkdir(exist_ok=True)
107
+
108
+ # For PDF display
109
+ pdf_path = file_path
110
+
111
+ # If the file is not a PDF, convert it to PDF for display
112
+ if file_ext != '.pdf':
113
+ temp_pdf = None
114
+ try:
115
+ if file_ext in ['.xlsx', '.xls']:
116
+ from src.excel_to_pdf import excel_to_pdf, convert_xls_to_xlsx
117
+ if file_ext == '.xls':
118
+ xlsx_path = convert_xls_to_xlsx(file_path, tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx').name)
119
+ temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf').name
120
+ pdf_path = excel_to_pdf(xlsx_path, pdf_path=temp_pdf)
121
+ else:
122
+ temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf').name
123
+ pdf_path = excel_to_pdf(file_path, pdf_path=temp_pdf)
124
+ elif file_ext in ['.doc', '.docx']:
125
+ from src.docx_to_pdf import docx_to_pdf
126
+ temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf').name
127
+ pdf_path = docx_to_pdf(file_path, temp_pdf)
128
+ elif file_ext == '.txt':
129
+ from src.txt_to_pdf import txt_to_pdf
130
+ temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf').name
131
+ pdf_path = txt_to_pdf(file_path, temp_pdf)
132
+
133
+ logger.info(f"Converted {file_ext} file to PDF: {pdf_path}")
134
+ except Exception as e:
135
+ logger.error(f"Error converting file to PDF: {str(e)}")
136
+ pdf_path = None
137
+
138
+ json_path = process_file(file_path)
139
+
140
+ # Try to read the JSON file that was created
141
+ if os.path.exists(json_path):
142
+ import json
143
+ with open(json_path, 'r', encoding='utf-8') as f:
144
+ invoice_data = json.load(f)
145
+ else:
146
+ return {}, "", "Failed to process file. No output data found.", None, pdf_path
147
+
148
+ # Create a DataFrame for display
149
+ items = invoice_data.get('items', [])
150
+ if 'error' in invoice_data and invoice_data['error']:
151
+ html_table = f"<p class='error' style='color: red; font-weight: bold;'>{invoice_data['error']}</p>"
152
+ status = f"Error: {invoice_data['error']}"
153
+ # Still create CSV with any available items
154
+ csv_path = convert_to_csv(invoice_data)
155
+ return invoice_data, html_table, status, csv_path, pdf_path
156
+ elif items:
157
+ df = pd.DataFrame(items)
158
+ html_table = df.to_html(classes='table table-striped')
159
+ status = f"Successfully processed {len(items)} items from {os.path.basename(file_path)}"
160
+ # Convert to CSV
161
+ csv_path = convert_to_csv(invoice_data)
162
+ else:
163
+ html_table = "<p>No items found in the invoice</p>"
164
+ status = "No items extracted from the file"
165
+ # Create empty CSV
166
+ csv_path = convert_to_csv({"items": []})
167
+
168
+ return invoice_data, html_table, status, csv_path, pdf_path
169
+
170
+
171
+ def create_ui() -> gr.Blocks:
172
+ """Create and return the Gradio UI."""
173
+ with gr.Blocks(title="Invoice Processing System") as app:
174
+ gr.Markdown("# Invoice Processing System")
175
+ gr.Markdown("Upload an invoice file (PDF, Excel, Word, or Text) to extract and download the data as CSV.")
176
+
177
+ with gr.Row():
178
+ with gr.Column(scale=1):
179
+ file_input = gr.File(label="Upload Invoice File")
180
+ process_button = gr.Button("Process Invoice", variant="primary")
181
+ status_output = gr.Textbox(label="Status", interactive=False)
182
+ csv_output = gr.File(label="Download CSV", interactive=False)
183
+
184
+ with gr.Column(scale=2):
185
+ with gr.Tabs():
186
+ with gr.TabItem("Extracted Data"):
187
+ results_html = gr.HTML(label="Extracted Data")
188
+ with gr.TabItem("PDF View"):
189
+ # Use the enhanced PDF component from gradio_pdf
190
+ pdf_viewer = PDF(label="Invoice PDF", interactive=False)
191
+
192
+ # Define the process flow
193
+ process_button.click(
194
+ fn=process_invoice_file,
195
+ inputs=[file_input],
196
+ outputs=[gr.State(), results_html, status_output, csv_output, pdf_viewer]
197
+ )
198
+
199
+ # Add examples if available
200
+ example_dir = Path("examples")
201
+ if example_dir.exists():
202
+ example_files = list(example_dir.glob("*.pdf")) + list(example_dir.glob("*.xlsx"))
203
+ if example_files:
204
+ gr.Examples(
205
+ examples=[[str(f)] for f in example_files],
206
+ inputs=[file_input]
207
+ )
208
+
209
+ return app
210
+
211
+ def main():
212
+ """Main function to launch the Gradio app."""
213
+ app = create_ui()
214
+ app.launch(
215
+ server_name="0.0.0.0", # Make accessible from other computers
216
+ share=True, # Create a public link
217
+ inbrowser=True # Open in browser
218
+ )
219
+
220
+ if __name__ == "__main__":
221
+ main()
install_requirement.sh ADDED
@@ -0,0 +1,6 @@
+ sudo apt-get update
+ sudo apt-get install -y poppler-utils
+ sudo apt-get install -y tesseract-ocr
+ sudo apt-get install -y libreoffice
+ sudo apt-get install -y python3-pip
+ pip install -r requirements.txt
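A quick way to confirm the system packages installed by this script are on the PATH (illustrative commands, not part of the repository):

```bash
libreoffice --version
pdftoppm -v          # provided by poppler-utils, used by pdf2image
tesseract --version
```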
process/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .process_pdf_with_headers import InvoiceItem, InvoiceData
+ from .process_excel import process_excel_file  # Import the function for processing Excel files
process/process_excel.py ADDED
@@ -0,0 +1,215 @@
1
+ import pandas as pd
2
+ import os
3
+ import json
4
+ import re
5
+ import concurrent.futures
6
+ from dotenv import load_dotenv
7
+ from google import genai
8
+ from typing import List, Dict, Any, Optional, Tuple
9
+ import logging
10
+ from pathlib import Path
11
+
12
+ # Configure logging
13
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ def setup_environment() -> None:
18
+ """
19
+ Load environment variables from the .env file (the Gemini client itself is configured in get_gemini_client).
20
+
21
+ Returns:
22
+ None
23
+ """
24
+ load_dotenv()
25
+
26
+
27
+ def get_gemini_client() -> genai.Client:
28
+ """
29
+ Initialize and return a Gemini API client.
30
+
31
+ Returns:
32
+ genai.Client: Configured Gemini client
33
+ """
34
+ api_key = os.getenv("GEMINI_API_KEY")
35
+ if not api_key:
36
+ raise ValueError("GEMINI_API_KEY environment variable not set")
37
+ return genai.Client(api_key=api_key)
38
+
39
+
40
+ def process_chunk(chunk_info: Tuple[int, pd.DataFrame, int, int], client: genai.Client) -> List[Dict[str, Any]]:
41
+ """
42
+ Process a single chunk of data using Gemini API.
43
+
44
+ Args:
45
+ chunk_info: Tuple containing (chunk_index, dataframe_chunk, start_index, end_index)
46
+ client: Gemini API client
47
+
48
+ Returns:
49
+ List of extracted items from the chunk
50
+ """
51
+ i, chunk_df, start_idx, end_idx = chunk_info
52
+
53
+ # Create a structured extraction prompt for the specific chunk
54
+ extraction_prompt = f"""
55
+ Extract product information from rows {start_idx} to {end_idx-1} in this Excel data.
56
+
57
+ For each product row, extract:
58
+ 1. Product name
59
+ 2. Batch number
60
+ 3. Expiry date (MM/YY format)
61
+ 4. MRP (Maximum Retail Price)
62
+ 5. Quantity (as integer)
63
+
64
+ Return ONLY a JSON array of objects, one for each product, with these properties:
65
+ [
66
+ {{
67
+ "product_name": "...",
68
+ "batch_number": "...",
69
+ "expiry_date": "...",
70
+ "mrp": "...",
71
+ "quantity": ...
72
+ }},
73
+ ...
74
+ ]
75
+
76
+ Use null for any value you cannot extract. Return ONLY the JSON array.
77
+ """
78
+
79
+ chunk_items = []
80
+
81
+ # Process chunk
82
+ try:
83
+ chunk_response = client.models.generate_content(
84
+ model="gemini-2.0-flash",
85
+ contents=[extraction_prompt, chunk_df.to_string()],
86
+ config={
87
+ 'response_mime_type': 'application/json',
88
+ 'temperature': 0.1,
89
+ 'max_output_tokens': 8192,
90
+ }
91
+ )
92
+
93
+ # Extract items
94
+ chunk_text = chunk_response.text
95
+ # Fix common JSON issues
96
+ chunk_text = re.sub(r'[\n\r\t]', '', chunk_text)
97
+ chunk_text = re.sub(r',\s*]', ']', chunk_text)
98
+
99
+ # Extract JSON array
100
+ match = re.search(r'\[(.*)\]', chunk_text, re.DOTALL)
101
+ if match:
102
+ try:
103
+ chunk_items = json.loads('[' + match.group(1) + ']')
104
+ logger.info(f"Successfully processed chunk {i+1} with {len(chunk_items)} items")
105
+ except json.JSONDecodeError:
106
+ logger.error(f"Error parsing JSON in chunk {i+1}")
107
+
108
+ except Exception as e:
109
+ logger.error(f"Error processing chunk {i+1}: {str(e)}")
110
+
111
+ return chunk_items
112
+
113
+
114
+ def prepare_chunks(df: pd.DataFrame, chunk_size: int) -> List[Tuple[int, pd.DataFrame, int, int]]:
115
+ """
116
+ Prepare dataframe chunks for processing.
117
+
118
+ Args:
119
+ df: Input dataframe
120
+ chunk_size: Size of each chunk
121
+
122
+ Returns:
123
+ List of chunk information tuples
124
+ """
125
+ num_chunks = (len(df) + chunk_size - 1) // chunk_size
126
+ chunks_to_process = []
127
+
128
+ for i in range(num_chunks):
129
+ start_idx = i * chunk_size
130
+ end_idx = min((i + 1) * chunk_size, len(df))
131
+ chunk_df = df.iloc[start_idx:end_idx]
132
+ chunks_to_process.append((i, chunk_df, start_idx, end_idx))
133
+
134
+ return chunks_to_process
135
+
136
+
137
+ def process_excel_file(file_path: str, output_path: str, chunk_size: int = 20, max_workers: int = 2) -> Dict[str, Any]:
138
+ """
139
+ Process an Excel file to extract product information using Gemini API.
140
+
141
+ Args:
142
+ file_path: Path to the Excel file
143
+ output_path: Path to save the extracted data
144
+ chunk_size: Size of each chunk for processing
145
+ max_workers: Maximum number of parallel workers
146
+
147
+ Returns:
148
+ Dict containing the extraction results
149
+ """
150
+ # Setup environment
151
+ setup_environment()
152
+ client = get_gemini_client()
153
+
154
+ # Read Excel file
155
+ logger.info(f"Reading Excel file: {file_path}")
156
+ df = pd.read_excel(file_path)
157
+
158
+ # Prepare chunks for processing
159
+ chunks_to_process = prepare_chunks(df, chunk_size)
160
+ num_chunks = len(chunks_to_process)
161
+
162
+ # Process chunks in parallel
163
+ logger.info(f"Processing {num_chunks} chunks with {max_workers} workers")
164
+ all_items = []
165
+
166
+ with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
167
+ # Pass client to each process_chunk call
168
+ results = list(executor.map(
169
+ lambda chunk: process_chunk(chunk, client),
170
+ chunks_to_process
171
+ ))
172
+
173
+ # Combine results
174
+ for chunk_items in results:
175
+ all_items.extend(chunk_items)
176
+
177
+ # Create final result
178
+ final_result = {
179
+ "items": all_items,
180
+ "extraction_status": "COMPLETE" if all_items else "INCOMPLETE",
181
+ "total_items": len(all_items)
182
+ }
183
+
184
+ # Save the final result
185
+ with open(output_path, "w") as f:
186
+ json.dump(final_result, f, indent=2)
187
+
188
+ logger.info(f"Extraction complete. Total items extracted: {len(all_items)}")
189
+ return final_result
190
+
191
+
192
+ def main() -> None:
193
+ """
194
+ Main function to run the Excel processing script.
195
+ """
196
+ input_file = 'expiry_invoice/SAC01000975.xls'
197
+ output_file = "extracted_invoice_data.json"
198
+
199
+ # Ensure the output directory exists
200
+ output_path = Path(output_file)
201
+ output_path.parent.mkdir(parents=True, exist_ok=True)
202
+
203
+ # Process the Excel file
204
+ result = process_excel_file(
205
+ file_path=input_file,
206
+ output_path=output_file,
207
+ chunk_size=20,
208
+ max_workers=2
209
+ )
210
+
211
+ print(f"Extraction complete. Total items extracted: {result['total_items']}")
212
+
213
+
214
+ if __name__ == "__main__":
215
+ main()
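The chunking above slices the spreadsheet into fixed-size row windows before each window is sent to Gemini. A minimal sketch of how `prepare_chunks` splits a DataFrame (assumes the repository root is on `PYTHONPATH` and the dependencies in `requirements.txt` are installed):

```python
import pandas as pd

from process.process_excel import prepare_chunks

df = pd.DataFrame({"product": range(45)})  # 45 rows with chunk_size=20 -> 3 chunks
for i, chunk_df, start, end in prepare_chunks(df, chunk_size=20):
    print(f"chunk {i}: rows {start}..{end - 1} ({len(chunk_df)} rows)")
# chunk 0: rows 0..19 (20 rows)
# chunk 1: rows 20..39 (20 rows)
# chunk 2: rows 40..44 (5 rows)
```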
process/process_pdf_with_headers.py ADDED
@@ -0,0 +1,251 @@
1
+ from google import genai
2
+ from pydantic import BaseModel, Field
3
+ from typing import List, Optional, Dict, Tuple
4
+ import pdf2image
5
+ import os
6
+ from pathlib import Path
7
+ import concurrent.futures
8
+ from dataclasses import dataclass
9
+ from functools import partial
10
+ import logging
11
+ from PIL import Image
12
+ from dotenv import load_dotenv
13
+
14
+
15
+ load_dotenv()
16
+
17
+
18
+ class InvoiceItem(BaseModel):
19
+ """Represents a single item in an invoice."""
20
+ product_name: str = Field(description="The name of the product")
21
+ batch_number: str = Field(description="The batch number of the product")
22
+ expiry_date: str = Field(description="The expiry date (format: MM/YY)")
23
+ mrp: str = Field(description="Maximum Retail Price")
24
+ quantity: int = Field(description="Product quantity")
25
+
26
+ class InvoiceData(BaseModel):
27
+ """Represents the complete invoice data including headers."""
28
+ headers: List[str] = Field(
29
+ description="Column headers from the invoice table",
30
+ default_factory=list
31
+ )
32
+ items: List[InvoiceItem] = Field(
33
+ description="List of extracted invoice items",
34
+ default_factory=list
35
+ )
36
+
37
+ class HeaderExtraction(BaseModel):
38
+ """Model for extracting headers separately."""
39
+ headers: List[str] = Field(
40
+ description="The column headers found in the invoice table"
41
+ )
42
+
43
+ @dataclass
44
+ class PageData:
45
+ """Container for page processing data."""
46
+ idx: int
47
+ image_path: str
48
+ headers: List[str]
49
+ items: List[InvoiceItem]
50
+
51
+ def extract_headers(client: genai.Client, image_path: str, model_id: str) -> List[str]:
52
+ """
53
+ Extract column headers from the first page of the invoice.
54
+
55
+ Args:
56
+ client: The Gemini API client
57
+ image_path: Path to the image file
58
+ model_id: The model ID to use for extraction
59
+
60
+ Returns:
61
+ List of column headers
62
+ """
63
+ header_prompt = """
64
+ Extract only the column headers from this invoice table.
65
+ Return them exactly as they appear, maintaining their order from left to right.
66
+ Only extract the headers, not any data from the rows.
67
+ """
68
+
69
+ image_file = client.files.upload(
70
+ file=image_path,
71
+ config={'display_name': 'invoice_header_page'}
72
+ )
73
+
74
+ response = client.models.generate_content(
75
+ model=model_id,
76
+ contents=[header_prompt, image_file],
77
+ config={
78
+ 'response_mime_type': 'application/json',
79
+ 'response_schema': HeaderExtraction
80
+ }
81
+ )
82
+
83
+ return response.parsed.headers if response.parsed else []
84
+
85
+ def setup_client() -> genai.Client:
86
+ """Create and return a Gemini API client."""
87
+ return genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
88
+
89
+ def save_image(image: Image.Image, temp_dir: Path, idx: int) -> str:
90
+ """
91
+ Save a single page image to disk.
92
+
93
+ Args:
94
+ image: The PDF page image (PIL Image)
95
+ temp_dir: Directory to save the image
96
+ idx: Page index
97
+
98
+ Returns:
99
+ Path to the saved image
100
+ """
101
+ image_path = str(temp_dir / f"page_{idx+1}.jpg")
102
+ image.save(image_path, "JPEG")
103
+ return image_path
104
+
105
+ def process_single_page(
106
+ page_data: Tuple[int, Image.Image, Path, List[str], genai.Client, str]
107
+ ) -> PageData:
108
+ """
109
+ Process a single page of the PDF.
110
+
111
+ Args:
112
+ page_data: Tuple containing (page_index, page_image, temp_dir, headers, client, model_id)
113
+
114
+ Returns:
115
+ PageData object containing extracted information
116
+ """
117
+ idx, image, temp_dir, headers, client, model_id = page_data
118
+
119
+ # Save image
120
+ image_path = save_image(image, temp_dir, idx)
121
+
122
+ # First page: extract headers
123
+ if idx == 0:
124
+ headers = extract_headers(client, image_path, model_id)
125
+ prompt = """
126
+ Extract product details from this invoice table.
127
+ Use the exact column headers you see in the table.
128
+ """
129
+ else:
130
+ headers_str = ", ".join(headers)
131
+ prompt = f"""
132
+ Extract product details from this invoice table.
133
+ This is page {idx + 1} of the same invoice.
134
+ Use these column headers: {headers_str}
135
+ Ensure the extracted data aligns with these columns in order.
136
+ """
137
+
138
+ # Process image
139
+ image_file = client.files.upload(
140
+ file=image_path,
141
+ config={'display_name': f'invoice_page_{idx+1}'}
142
+ )
143
+
144
+ response = client.models.generate_content(
145
+ model=model_id,
146
+ contents=[prompt, image_file],
147
+ config={
148
+ 'response_mime_type': 'application/json',
149
+ 'response_schema': InvoiceData
150
+ }
151
+ )
152
+
153
+ items = response.parsed.items if response.parsed and response.parsed.items else []
154
+ return PageData(idx=idx, image_path=image_path, headers=headers, items=items)
155
+
156
+ def process_pdf_with_headers(pdf_path: str, max_workers: int = 3) -> InvoiceData:
157
+ """
158
+ Process a PDF invoice while preserving column header context using parallel processing.
159
+
160
+ Args:
161
+ pdf_path: Path to the PDF file
162
+ max_workers: Maximum number of concurrent workers
163
+
164
+ Returns:
165
+ InvoiceData object containing headers and extracted items
166
+ """
167
+ # Convert PDF pages to images
168
+ images = pdf2image.convert_from_path(pdf_path)
169
+
170
+ # Create temp directory
171
+ temp_dir = Path("content/temp")
172
+ temp_dir.mkdir(parents=True, exist_ok=True)
173
+
174
+ # Initialize shared resources
175
+ client = setup_client()
176
+ model_id = "gemini-2.0-flash"
177
+ headers: List[str] = []
178
+
179
+ # Prepare data for parallel processing
180
+ page_data = []
181
+
182
+ try:
183
+ # Process first page separately to get headers
184
+ first_page = process_single_page((0, images[0], temp_dir, headers, client, model_id))
185
+ headers = first_page.headers
186
+ all_items = first_page.items
187
+
188
+ # Prepare remaining pages for parallel processing
189
+ remaining_pages = [
190
+ (i, img, temp_dir, headers, client, model_id)
191
+ for i, img in enumerate(images[1:], start=1)
192
+ ]
193
+
194
+ # Process remaining pages in parallel
195
+ with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
196
+ future_to_page = {
197
+ executor.submit(process_single_page, page): page[0]
198
+ for page in remaining_pages
199
+ }
200
+
201
+ # Collect results as they complete
202
+ for future in concurrent.futures.as_completed(future_to_page):
203
+ page_idx = future_to_page[future]
204
+ try:
205
+ page_result = future.result()
206
+ all_items.extend(page_result.items)
207
+ except Exception as e:
208
+ logging.error(f"Error processing page {page_idx}: {str(e)}")
209
+
210
+ finally:
211
+ # Cleanup temporary files
212
+ for file in temp_dir.glob("*.jpg"):
213
+ try:
214
+ file.unlink()
215
+ except Exception as e:
216
+ logging.warning(f"Failed to delete temporary file {file}: {str(e)}")
217
+
218
+ return InvoiceData(headers=headers, items=all_items)
219
+
220
+ def main():
221
+ """Main function to demonstrate usage."""
222
+ # Configure logging
223
+ logging.basicConfig(
224
+ level=logging.INFO,
225
+ format='%(asctime)s - %(levelname)s - %(message)s'
226
+ )
227
+
228
+ try:
229
+ invoice_data = process_pdf_with_headers(
230
+ "/Users/krishnaadithya/Desktop/dev/invoice_processing_2.0/pdf_only/expiry_invoice/DR REDDYS PE 1194.pdf",
231
+ max_workers=3 # Adjust based on your system and API limits
232
+ )
233
+
234
+ # Print headers
235
+ print("Column Headers:", ", ".join(invoice_data.headers))
236
+ print("\nExtracted Items:")
237
+
238
+ # Print results
239
+ for item in invoice_data.items:
240
+ print(f"Product: {item.product_name}")
241
+ print(f"Batch: {item.batch_number}")
242
+ print(f"Expiry: {item.expiry_date}")
243
+ print(f"MRP: {item.mrp}")
244
+ print(f"Quantity: {item.quantity}")
245
+ print("-" * 50)
246
+
247
+ except Exception as e:
248
+ logging.error(f"Error processing invoice: {str(e)}")
249
+
250
+ if __name__ == "__main__":
251
+ main()
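For reference, the Pydantic models defined above serialize directly to the dictionaries that `process_invoice.py` later writes to JSON. A minimal sketch (assumes Pydantic v2, which provides `model_dump`; the values are made up):

```python
from process.process_pdf_with_headers import InvoiceData, InvoiceItem

item = InvoiceItem(product_name="Sample Tablet 500mg", batch_number="B1234",
                   expiry_date="08/26", mrp="120.00", quantity=10)
data = InvoiceData(headers=["Product", "Batch", "Expiry", "MRP", "Qty"], items=[item])
print(data.model_dump())  # {'headers': [...], 'items': [{'product_name': ..., ...}]}
```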
process_invoice.py ADDED
@@ -0,0 +1,219 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Unified invoice processing script that handles both PDF and Excel files.
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ # Add the project root directory to the Python path
9
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
10
+ import json
11
+ import logging
12
+ from typing import Optional
13
+ from pathlib import Path
14
+ import argparse
15
+ import tempfile
16
+ from dotenv import load_dotenv
17
+
18
+ # Import document processing functions
19
+ from process.process_pdf_with_headers import process_pdf_with_headers
20
+ from process.process_excel import process_excel_file
21
+ from src.excel_to_pdf import excel_to_pdf, convert_xls_to_xlsx
22
+ from src.docx_to_pdf import docx_to_pdf
23
+ from src.txt_to_pdf import txt_to_pdf
24
+
25
+ # Load environment variables from .env file if it exists
26
+ load_dotenv()
27
+
28
+ # Configure logging
29
+ logging.basicConfig(
30
+ level=logging.INFO,
31
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
32
+ )
33
+ logger = logging.getLogger(__name__)
34
+
35
+ def setup_google_client():
36
+ """Set up and return the Google Generative AI client."""
37
+ try:
38
+ from google import genai
39
+ api_key = os.environ.get("GOOGLE_API_KEY")
40
+ if not api_key:
41
+ logger.warning("GOOGLE_API_KEY environment variable not set. PDF processing with LLM will not be available.")
42
+ return None
43
+
44
+ return genai.Client(api_key=api_key)
45
+ except ImportError:
46
+ logger.warning("google-generativeai package not installed. PDF processing with LLM will not be available.")
47
+ return None
48
+ except Exception as e:
49
+ logger.error(f"Error setting up Google client: {str(e)}")
50
+ return None
51
+
52
+ def save_to_json(invoice_data, input_file_path: str) -> str:
53
+ """
54
+ Save the invoice data to a JSON file in the 'result' directory.
55
+
56
+ Args:
57
+ invoice_data: The invoice data to save (can be a dictionary or an object)
58
+ input_file_path: The path to the input file
59
+
60
+ Returns:
61
+ The path to the saved JSON file
62
+ """
63
+ # Create result directory if it doesn't exist
64
+ result_dir = "result"
65
+ os.makedirs(result_dir, exist_ok=True)
66
+
67
+ # Get the base filename without extension
68
+ base_filename = os.path.splitext(os.path.basename(input_file_path))[0]
69
+
70
+ # Create the output JSON file path
71
+ output_file_path = os.path.join(result_dir, f"{base_filename}.json")
72
+
73
+ # Convert invoice data to JSON-serializable format
74
+ # Check if invoice_data is a dictionary or an object
75
+ if isinstance(invoice_data, dict):
76
+ # It's already a dictionary, just ensure items are serializable
77
+ json_data = invoice_data
78
+ else:
79
+ # It's an object, convert to dictionary
80
+ json_data = {
81
+ "headers": invoice_data.headers if hasattr(invoice_data, 'headers') else [],
82
+ "items": [item.model_dump() if hasattr(item, 'model_dump') else item.dict()
83
+ for item in invoice_data.items]
84
+ }
85
+
86
+ # Write to JSON file
87
+ with open(output_file_path, 'w', encoding='utf-8') as f:
88
+ json.dump(json_data, f, indent=2, ensure_ascii=False)
89
+
90
+ logger.info(f"Saved invoice data to {output_file_path}")
91
+ return output_file_path
92
+
93
+ def process_file(file_path: str) -> Optional[str]:
94
+ """
95
+ Process an invoice file (PDF, Excel, or Document), print the extracted data, and return the path to the saved JSON file.
96
+
97
+ Args:
98
+ file_path: Path to the invoice file
99
+ """
100
+ file_path = os.path.abspath(file_path)
101
+ if not os.path.exists(file_path):
102
+ logger.error(f"File not found: {file_path}")
103
+ return
104
+
105
+ file_ext = os.path.splitext(file_path)[1].lower()
106
+
107
+ llm_client = setup_google_client()
108
+
109
+ temp_pdf_path = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf').name
110
+
111
+
112
+ if file_ext in ['.xlsx', '.xls']:
113
+ # Process Excel file
114
+ # For .xls files, convert to .xlsx format first
115
+ if file_ext == '.xls':
116
+ xlsx_path = convert_xls_to_xlsx(file_path)
117
+ file_path = xlsx_path
118
+
119
+ # Create output JSON path
120
+ output_json_path = os.path.join("result", f"{os.path.splitext(os.path.basename(file_path))[0]}.json")
121
+
122
+ result = process_excel_file(
123
+ file_path=file_path,
124
+ output_path=output_json_path,
125
+ chunk_size=20,
126
+ max_workers=2
127
+ )
128
+
129
+ # Create the expected invoice_data format
130
+ invoice_data = {
131
+ "headers": ["Product Name", "Batch Number", "Expiry Date", "MRP", "Quantity"],
132
+ "items": result["items"]
133
+ }
134
+
135
+
136
+ elif file_ext == '.pdf':
137
+
138
+ try:
139
+ logger.info(f"Processing PDF file with header context: {file_path}")
140
+
141
+ # Process the PDF using process_pdf_with_headers
142
+ invoice_data_obj = process_pdf_with_headers(file_path)
143
+
144
+ # Convert the InvoiceData object to the format expected by the rest of the code
145
+ invoice_data = {
146
+ "headers": invoice_data_obj.headers,
147
+ "items": [item.model_dump() if hasattr(item, 'model_dump') else item.dict() for item in invoice_data_obj.items]
148
+ }
149
+
150
+ except Exception as e:
151
+ logger.error(f"Error processing PDF with headers: {str(e)}")
152
+
153
+ elif file_ext in ['.doc', '.docx', '.txt']:
154
+ # Process Document file by first converting to PDF
155
+ # Ensure the required modules are imported
156
+ if file_ext == '.txt':
157
+ temp_pdf_path = txt_to_pdf(file_path, temp_pdf_path)
158
+ logger.info(f"Converted text file to PDF: {temp_pdf_path}")
159
+ elif file_ext in ['.doc', '.docx']:
160
+ temp_pdf_path = docx_to_pdf(file_path, temp_pdf_path)
161
+ logger.info(f"Converted document file to PDF: {temp_pdf_path}")
162
+
163
+ invoice_data_obj = process_pdf_with_headers(temp_pdf_path)
164
+
165
+ # Convert the InvoiceData object to the format expected by the rest of the code
166
+ invoice_data = {
167
+ "headers": invoice_data_obj.headers,
168
+ "items": [item.model_dump() if hasattr(item, 'model_dump') else item.dict() for item in invoice_data_obj.items]
169
+ }
170
+
171
+ else:
172
+ logger.error(f"Unsupported file format: {file_ext}")
173
+ logger.error("Supported formats: .pdf, .xlsx, .xls, .doc, .docx, .txt")
174
+ return
175
+
176
+ json_path = save_to_json(invoice_data, file_path)
177
+ print(f"Results saved to: {json_path}")
178
+
179
+ # Print results
180
+ if isinstance(invoice_data, dict):
181
+ # It's a dictionary
182
+ items_count = len(invoice_data.get('items', []))
183
+ items = invoice_data.get('items', [])
184
+ print(f"\nExtracted {items_count} items from {file_path}:")
185
+ for i, item in enumerate(items, 1):
186
+ print(f"\nItem {i}:")
187
+ print(f" Product: {item.get('product_name', 'N/A')}")
188
+ print(f" Batch Number: {item.get('batch_number', 'N/A')}")
189
+ print(f" Expiry: {item.get('expiry_date', 'N/A')}")
190
+ print(f" MRP: {item.get('mrp', 'N/A')}")
191
+ print(f" Quantity: {item.get('quantity', 'N/A')}")
192
+ else:
193
+ # It's an object (likely a Pydantic model)
194
+ items_count = len(invoice_data.items) if hasattr(invoice_data, 'items') else 0
195
+ print(f"\nExtracted {items_count} items from {file_path}:")
196
+ for i, item in enumerate(invoice_data.items if hasattr(invoice_data, 'items') else [], 1):
197
+ print(f"\nItem {i}:")
198
+ print(f" Product: {getattr(item, 'product_name', 'N/A')}")
199
+ print(f" Batch Number: {getattr(item, 'batch_number', 'N/A')}")
200
+ print(f" Expiry: {getattr(item, 'expiry_date', 'N/A')}")
201
+ print(f" MRP: {getattr(item, 'mrp', 'N/A')}")
202
+ print(f" Quantity: {getattr(item, 'quantity', 'N/A')}")
203
+ return json_path
204
+
205
+ def main():
206
+ """Main function to parse arguments and process files."""
207
+ parser = argparse.ArgumentParser(description="Process invoice files (PDF, Excel, Word, or Text)")
208
+ parser.add_argument("--file_path", help="Path to the invoice file")
209
+
210
+ args = parser.parse_args()
211
+
212
+ try:
213
+ process_file(args.file_path)
214
+ except Exception as e:
215
+ logger.error(f"Error processing file: {str(e)}")
216
+ sys.exit(1)
217
+
218
+ if __name__ == "__main__":
219
+ main()
requirements.txt ADDED
@@ -0,0 +1,30 @@
+ # Core dependencies
+ python-dotenv
+ pandas
+ numpy
+ pydantic
+
+ # PDF processing
+ pdf2image
+ PyMuPDF
+ pytesseract
+ Pillow
+
+ # Document processing
+ python-docx
+ reportlab
+ aspose-words
+
+ # Excel processing
+ openpyxl
+ xlrd
+ pyexcel
+ pyexcel-xls
+ pyexcel-xlsx
+
+ # LLM integration
+ google-genai
+
+ # Web UI
+ gradio
+ gradio_pdf
src/__init__.py ADDED
@@ -0,0 +1,4 @@
+ # Import functions to make them available at the package level
+ from .excel_to_pdf import excel_to_pdf, convert_xls_to_xlsx
+ from .docx_to_pdf import docx_to_pdf
+ from .txt_to_pdf import txt_to_pdf
src/docx_to_pdf.py ADDED
@@ -0,0 +1,35 @@
+ import os
+ import subprocess
+ import aspose.words as aw
+
+ def docx_to_pdf(input_file, output_file=None):
+     """Convert a .doc/.docx file to PDF using headless LibreOffice."""
+     input_path = os.path.abspath(input_file)
+     output_dir = os.path.dirname(input_path)  # LibreOffice writes the PDF next to the input file
+
+     # Run LibreOffice command
+     command = ["libreoffice", "--headless", "--convert-to", "pdf", input_path, "--outdir", output_dir]
+     subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+     # LibreOffice names the output <input name>.pdf inside output_dir
+     generated = os.path.join(output_dir, os.path.splitext(os.path.basename(input_file))[0] + ".pdf")
+     if output_file is None:
+         output_file = generated
+     elif os.path.exists(generated) and os.path.abspath(output_file) != os.path.abspath(generated):
+         # Move the converted file to the caller-supplied path so the returned path actually exists
+         os.replace(generated, output_file)
+     return output_file
+
+
+ def docx_to_pdf_(input_file, output_file=None):
+     """Alternative converter using Aspose.Words (no LibreOffice required)."""
+     input_path = os.path.abspath(input_file)
+     output_dir = os.path.dirname(input_path)  # Save in the same directory
+
+     # Load the .doc/.docx file
+     doc = aw.Document(input_path)
+
+     if output_file is None:
+         output_file = os.path.join(output_dir, os.path.splitext(os.path.basename(input_file))[0] + ".pdf")
+
+     doc.save(output_file)
+     return output_file
src/excel_to_pdf.py ADDED
@@ -0,0 +1,246 @@
1
+
2
+
3
+ import os
4
+ import math
5
+ from openpyxl import load_workbook
6
+ from reportlab.lib import colors
7
+ from reportlab.lib.pagesizes import letter, A4, A3, landscape, portrait
8
+ from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, PageBreak
9
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
10
+ from reportlab.lib.enums import TA_LEFT, TA_CENTER
11
+ from reportlab.lib.units import inch
12
+ import pyexcel as p
13
+
14
+
15
+ def convert_xls_to_xlsx(xls_path, xlsx_path=None):
16
+ """Convert the old .xls file to .xlsx format"""
17
+ if xlsx_path is None:
18
+ xlsx_path = os.path.splitext(xls_path)[0] + '.xlsx'
19
+ p.save_book_as(file_name=xls_path, dest_file_name=xlsx_path)
20
+ return xlsx_path
21
+
22
+
23
+ def determine_page_format(num_columns, max_column_width=None):
24
+ """
25
+ Determine the optimal page size and orientation based on table dimensions.
26
+
27
+ Args:
28
+ num_columns (int): Number of columns in the table.
29
+ max_column_width (float, optional): Maximum column width if available.
30
+
31
+ Returns:
32
+ tuple: (pagesize, orientation function)
33
+ """
34
+ # Define thresholds for decision making
35
+ if num_columns <= 5:
36
+ # Few columns, likely to fit on portrait A4
37
+ return A4, portrait
38
+ elif num_columns <= 8:
39
+ # Medium number of columns, use landscape A4
40
+ return A4, landscape
41
+ elif num_columns <= 12:
42
+ # Many columns, use portrait A3
43
+ return A3, portrait
44
+ else:
45
+ # Lots of columns, use landscape A3
46
+ return A3, landscape
47
+
48
+
49
+ def is_effectively_empty(value):
50
+ """
51
+ Return True if the cell value is considered empty.
52
+
53
+ Empty means:
54
+ - The value is None.
55
+ - The value is a float and math.isnan(value) is True.
56
+ - The value is a string that is empty (after stripping whitespace).
57
+ """
58
+ if value is None:
59
+ return True
60
+ if isinstance(value, float) and math.isnan(value):
61
+ return True
62
+ if isinstance(value, str) and not value.strip():
63
+ return True
64
+ return False
65
+
66
+
67
+ def excel_to_pdf(excel_path, pdf_path=None, sheet_name=None, max_rows_per_table=50):
68
+ """
69
+ Convert Excel file to PDF with adaptive page size based on content,
70
+ removing columns that contain only NaN (or empty) values.
71
+
72
+ Args:
73
+ excel_path (str): Path to the Excel file.
74
+ pdf_path (str, optional): Path for the output PDF file.
75
+ sheet_name (str, optional): Name of the sheet to convert.
76
+ max_rows_per_table (int): Maximum rows per table before splitting.
77
+
78
+ Returns:
79
+ str: Path to the created PDF file.
80
+ """
81
+ if excel_path.endswith('.xls'):
82
+ excel_path = convert_xls_to_xlsx(excel_path)
83
+
84
+ if pdf_path is None:
85
+ pdf_path = os.path.splitext(excel_path)[0] + '.pdf'
86
+
87
+ # Load Excel file
88
+ wb = load_workbook(excel_path)
89
+ sheets = [sheet_name] if sheet_name else wb.sheetnames
90
+
91
+ # Create paragraph styles for cell content
92
+ styles = getSampleStyleSheet()
93
+ header_style = ParagraphStyle(
94
+ name='HeaderStyle',
95
+ parent=styles['Normal'],
96
+ fontName='Helvetica-Bold',
97
+ fontSize=9,
98
+ alignment=TA_CENTER,
99
+ textColor=colors.white,
100
+ leading=12
101
+ )
102
+ cell_style = ParagraphStyle(
103
+ name='CellStyle',
104
+ parent=styles['Normal'],
105
+ fontName='Helvetica',
106
+ fontSize=8,
107
+ alignment=TA_LEFT,
108
+ leading=10 # Line spacing
109
+ )
110
+
111
+ elements = []
112
+
113
+ # Determine the effective maximum number of columns among all sheets (after filtering out empty ones)
114
+ global_effective_max_columns = 0
115
+ for sh in sheets:
116
+ sheet = wb[sh]
117
+ effective_cols = 0
118
+ for col in range(1, sheet.max_column + 1):
119
+ # Check if any cell in the column is non-empty
120
+ for row in range(1, sheet.max_row + 1):
121
+ if not is_effectively_empty(sheet.cell(row=row, column=col).value):
122
+ effective_cols += 1
123
+ break
124
+ global_effective_max_columns = max(global_effective_max_columns, effective_cols)
125
+
126
+ # Determine optimal page format based on effective column count
127
+ pagesize, orientation_func = determine_page_format(global_effective_max_columns)
128
+
129
+ # Create the document with determined format
130
+ doc = SimpleDocTemplate(
131
+ pdf_path,
132
+ pagesize=orientation_func(pagesize),
133
+ leftMargin=10,
134
+ rightMargin=10,
135
+ topMargin=15,
136
+ bottomMargin=15
137
+ )
138
+
139
+ # Process each sheet
140
+ for sheet_idx, current_sheet in enumerate(sheets):
141
+ sheet = wb[current_sheet]
142
+
143
+ # Determine which columns to keep (those with at least one non-empty cell)
144
+ columns_to_keep = []
145
+ for col in range(1, sheet.max_column + 1):
146
+ for row in range(1, sheet.max_row + 1):
147
+ if not is_effectively_empty(sheet.cell(row=row, column=col).value):
148
+ columns_to_keep.append(col)
149
+ break
150
+
151
+ # If no columns have valid data, skip this sheet.
152
+ if not columns_to_keep:
153
+ continue
154
+
155
+ # Calculate appropriate column widths (only for kept columns)
156
+ max_col_width = 130 # Maximum column width in points
157
+ min_col_width = 40 # Minimum column width in points
158
+ if pagesize == A3:
159
+ max_col_width = 150 # Allow wider columns on A3
160
+
161
+ col_widths = []
162
+ for col in columns_to_keep:
163
+ max_length = 0
164
+ # Sample first 100 rows for efficiency
165
+ for row in range(1, min(100, sheet.max_row) + 1):
166
+ cell = sheet.cell(row=row, column=col)
167
+ if cell.value:
168
+ content_length = len(str(cell.value))
169
+ # Cap the length for width calculation at 30 characters
170
+ max_length = max(max_length, min(content_length, 30))
171
+ # Adjust multiplier based on page format (narrower columns for A4, wider for A3)
172
+ multiplier = 5.5 if pagesize == A4 else 6.0
173
+ width = min(max(min_col_width, max_length * multiplier), max_col_width)
174
+ col_widths.append(width)
175
+
176
+ # Build the header row from the kept columns
177
+ header_row = []
178
+ # Using row 1 as header (or adjust if your header is in another row)
179
+ for col in columns_to_keep:
180
+ cell_value = sheet.cell(row=1, column=col).value
181
+ header_row.append(Paragraph(str(cell_value or ""), header_style))
182
+
183
+ # Process data rows in chunks to avoid huge tables that might get chopped
184
+ row_count = sheet.max_row
185
+ # Start after header row
186
+ start_row = 2
187
+ while start_row <= row_count:
188
+ end_row = min(start_row + max_rows_per_table - 1, row_count)
189
+
190
+ # Create data for this chunk, starting with the header row
191
+ chunk_data = [header_row]
192
+ for row_idx in range(start_row, end_row + 1):
193
+ data_row = []
194
+ for col in columns_to_keep:
195
+ cell = sheet.cell(row=row_idx, column=col)
196
+ cell_value = cell.value or ""
197
+ data_row.append(Paragraph(str(cell_value), cell_style))
198
+ chunk_data.append(data_row)
199
+
200
+ # Create table for this chunk
201
+ table = Table(chunk_data, colWidths=col_widths, repeatRows=1)
202
+
203
+ # Style the table
204
+ table_style = TableStyle([
205
+ # Header styling
206
+ ('BACKGROUND', (0, 0), (-1, 0), colors.darkblue),
207
+ ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
208
+ ('ALIGN', (0, 0), (-1, 0), 'CENTER'),
209
+
210
+ # Grid
211
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
212
+ ('VALIGN', (0, 0), (-1, -1), 'TOP'),
213
+
214
+ # Row background colors
215
+ ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.lightgrey]),
216
+
217
+ # Cell padding
218
+ ('LEFTPADDING', (0, 0), (-1, -1), 3),
219
+ ('RIGHTPADDING', (0, 0), (-1, -1), 3),
220
+ ('TOPPADDING', (0, 0), (-1, -1), 3),
221
+ ('BOTTOMPADDING', (0, 0), (-1, -1), 3)
222
+ ])
223
+
224
+ table.setStyle(table_style)
225
+ table.hAlign = 'LEFT'
226
+ table.spaceBefore = 5
227
+ table.spaceAfter = 15
228
+
229
+ elements.append(table)
230
+
231
+ # Uncomment below if you wish to add a continuation note when splitting tables
232
+ # if end_row < row_count:
233
+ # continuation = Paragraph(f"Table continues... (Rows {start_row}-{end_row} of {row_count})", styles['Italic'])
234
+ # elements.append(continuation)
235
+ # elements.append(Spacer(1, 0.2 * inch))
236
+
237
+ start_row = end_row + 1
238
+
239
+ # Add page break between sheets (except for the last sheet)
240
+ if sheet_idx < len(sheets) - 1:
241
+ elements.append(PageBreak())
242
+
243
+ # Build PDF
244
+ doc.build(elements)
245
+
246
+ return pdf_path
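The page-format heuristic above keys off the number of non-empty columns. A small illustrative check of `determine_page_format` (assumes the repository root is on `PYTHONPATH`):

```python
from reportlab.lib.pagesizes import A4, landscape, portrait

from src.excel_to_pdf import determine_page_format

for cols in (4, 7, 10, 15):
    size, orient = determine_page_format(cols)
    label = "A4" if size == A4 else "A3"
    mode = "portrait" if orient is portrait else "landscape"
    print(f"{cols} columns -> {label} {mode}")
# 4 -> A4 portrait, 7 -> A4 landscape, 10 -> A3 portrait, 15 -> A3 landscape
```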
src/txt_to_pdf.py ADDED
@@ -0,0 +1,43 @@
1
+ import os
2
+ from reportlab.lib.pagesizes import letter
3
+ from reportlab.pdfgen import canvas
4
+
5
+
6
+ def txt_to_pdf(input_txt, output_pdf=None):
7
+ if output_pdf is None:
8
+ output_pdf = os.path.splitext(input_txt)[0] + '.pdf'
9
+
10
+ # Read the text file without modifying spacing
11
+ with open(input_txt, "r", encoding="utf-8") as file:
12
+ lines = file.readlines()
13
+
14
+ c = canvas.Canvas(output_pdf, pagesize=letter)
15
+ width, height = letter
16
+ left_margin = 10
17
+ top_margin = 10
18
+ bottom_margin = 10
19
+ line_height = 10 # Adjust based on desired spacing
20
+
21
+ # Use a text object for more control
22
+ text_object = c.beginText(left_margin, height - top_margin)
23
+ text_object.setFont("Courier", 8) # Use a monospaced font to keep spacing intact
24
+
25
+ for line in lines:
26
+ # Remove the newline, preserving other whitespace
27
+ # And skip the line if it's empty (after stripping all whitespace)
28
+ if not line.strip():
29
+ continue
30
+ line = line.rstrip("\n")
31
+ text_object.textLine(line)
32
+
33
+ # Check if we have reached the bottom margin
34
+ if text_object.getY() < bottom_margin:
35
+ c.drawText(text_object)
36
+ c.showPage()
37
+ text_object = c.beginText(left_margin, height - top_margin)
38
+ text_object.setFont("Courier", 8)
39
+
40
+ # Draw any remaining text
41
+ c.drawText(text_object)
42
+ c.save()
43
+ return output_pdf