File size: 9,140 Bytes
aacdfd5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246


import os
import math
from openpyxl import load_workbook
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter, A4, A3, landscape, portrait
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, PageBreak
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_LEFT, TA_CENTER
from reportlab.lib.units import inch
import pyexcel as p


def convert_xls_to_xlsx(xls_path, xlsx_path=None):
    """Convert the old .xls file to .xlsx format"""
    if xlsx_path is None:
        xlsx_path = os.path.splitext(xls_path)[0] + '.xlsx'
    p.save_book_as(file_name=xls_path, dest_file_name=xlsx_path)
    return xlsx_path


def determine_page_format(num_columns, max_column_width=None):
    """
    Determine the optimal page size and orientation based on table dimensions.
    
    Args:
        num_columns (int): Number of columns in the table.
        max_column_width (float, optional): Maximum column width if available.
        
    Returns:
        tuple: (pagesize, orientation function)
    """
    # Define thresholds for decision making
    if num_columns <= 5:
        # Few columns, likely to fit on portrait A4
        return A4, portrait
    elif num_columns <= 8:
        # Medium number of columns, use landscape A4
        return A4, landscape
    elif num_columns <= 12:
        # Many columns, use portrait A3
        return A3, portrait
    else:
        # Lots of columns, use landscape A3
        return A3, landscape


def is_effectively_empty(value):
    """
    Return True if the cell value is considered empty.
    
    Empty means:
      - The value is None.
      - The value is a float and math.isnan(value) is True.
      - The value is a string that is empty (after stripping whitespace).
    """
    if value is None:
        return True
    if isinstance(value, float) and math.isnan(value):
        return True
    if isinstance(value, str) and not value.strip():
        return True
    return False


def excel_to_pdf(excel_path, pdf_path=None, sheet_name=None, max_rows_per_table=50):
    """
    Convert Excel file to PDF with adaptive page size based on content,
    removing columns that contain only NaN (or empty) values.
    
    Args:
        excel_path (str): Path to the Excel file.
        pdf_path (str, optional): Path for the output PDF file.
        sheet_name (str, optional): Name of the sheet to convert.
        max_rows_per_table (int): Maximum rows per table before splitting.
        
    Returns:
        str: Path to the created PDF file.
    """
    if excel_path.endswith('.xls'):
        excel_path = convert_xls_to_xlsx(excel_path)

    if pdf_path is None:
        pdf_path = os.path.splitext(excel_path)[0] + '.pdf'
    
    # Load Excel file
    wb = load_workbook(excel_path)
    sheets = [sheet_name] if sheet_name else wb.sheetnames

    # Create paragraph styles for cell content
    styles = getSampleStyleSheet()
    header_style = ParagraphStyle(
        name='HeaderStyle',
        parent=styles['Normal'],
        fontName='Helvetica-Bold',
        fontSize=9,
        alignment=TA_CENTER,
        textColor=colors.white,
        leading=12
    )
    cell_style = ParagraphStyle(
        name='CellStyle',
        parent=styles['Normal'],
        fontName='Helvetica',
        fontSize=8,
        alignment=TA_LEFT,
        leading=10  # Line spacing
    )
    
    elements = []
    
    # Determine the effective maximum number of columns among all sheets (after filtering out empty ones)
    global_effective_max_columns = 0
    for sh in sheets:
        sheet = wb[sh]
        effective_cols = 0
        for col in range(1, sheet.max_column + 1):
            # Check if any cell in the column is non-empty
            for row in range(1, sheet.max_row + 1):
                if not is_effectively_empty(sheet.cell(row=row, column=col).value):
                    effective_cols += 1
                    break
        global_effective_max_columns = max(global_effective_max_columns, effective_cols)
    
    # Determine optimal page format based on effective column count
    pagesize, orientation_func = determine_page_format(global_effective_max_columns)
    
    # Create the document with determined format
    doc = SimpleDocTemplate(
        pdf_path,
        pagesize=orientation_func(pagesize),
        leftMargin=10,
        rightMargin=10,
        topMargin=15,
        bottomMargin=15
    )
    
    # Process each sheet
    for sheet_idx, current_sheet in enumerate(sheets):
        sheet = wb[current_sheet]
        
        # Determine which columns to keep (those with at least one non-empty cell)
        columns_to_keep = []
        for col in range(1, sheet.max_column + 1):
            for row in range(1, sheet.max_row + 1):
                if not is_effectively_empty(sheet.cell(row=row, column=col).value):
                    columns_to_keep.append(col)
                    break
        
        # If no columns have valid data, skip this sheet.
        if not columns_to_keep:
            continue
        
        # Calculate appropriate column widths (only for kept columns)
        max_col_width = 130  # Maximum column width in points
        min_col_width = 40   # Minimum column width in points
        if pagesize == A3:
            max_col_width = 150  # Allow wider columns on A3
        
        col_widths = []
        for col in columns_to_keep:
            max_length = 0
            # Sample first 100 rows for efficiency
            for row in range(1, min(100, sheet.max_row) + 1):
                cell = sheet.cell(row=row, column=col)
                if cell.value:
                    content_length = len(str(cell.value))
                    # Cap the length for width calculation at 30 characters
                    max_length = max(max_length, min(content_length, 30))
            # Adjust multiplier based on page format (narrower columns for A4, wider for A3)
            multiplier = 5.5 if pagesize == A4 else 6.0
            width = min(max(min_col_width, max_length * multiplier), max_col_width)
            col_widths.append(width)
        
        # Build the header row from the kept columns
        header_row = []
        # Using row 1 as header (or adjust if your header is in another row)
        for col in columns_to_keep:
            cell_value = sheet.cell(row=1, column=col).value
            header_row.append(Paragraph(str(cell_value or ""), header_style))
        
        # Process data rows in chunks to avoid huge tables that might get chopped
        row_count = sheet.max_row
        # Start after header row
        start_row = 2  
        while start_row <= row_count:
            end_row = min(start_row + max_rows_per_table - 1, row_count)
            
            # Create data for this chunk, starting with the header row
            chunk_data = [header_row]
            for row_idx in range(start_row, end_row + 1):
                data_row = []
                for col in columns_to_keep:
                    cell = sheet.cell(row=row_idx, column=col)
                    cell_value = cell.value or ""
                    data_row.append(Paragraph(str(cell_value), cell_style))
                chunk_data.append(data_row)
            
            # Create table for this chunk
            table = Table(chunk_data, colWidths=col_widths, repeatRows=1)
            
            # Style the table
            table_style = TableStyle([
                # Header styling
                ('BACKGROUND', (0, 0), (-1, 0), colors.darkblue),
                ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
                ('ALIGN', (0, 0), (-1, 0), 'CENTER'),
                
                # Grid
                ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
                ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                
                # Row background colors
                ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.lightgrey]),
                
                # Cell padding
                ('LEFTPADDING', (0, 0), (-1, -1), 3),
                ('RIGHTPADDING', (0, 0), (-1, -1), 3),
                ('TOPPADDING', (0, 0), (-1, -1), 3),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 3)
            ])
            
            table.setStyle(table_style)
            table.hAlign = 'LEFT'
            table.spaceBefore = 5
            table.spaceAfter = 15
            
            elements.append(table)
            
            # Uncomment below if you wish to add a continuation note when splitting tables
            # if end_row < row_count:
            #     continuation = Paragraph(f"Table continues... (Rows {start_row}-{end_row} of {row_count})", styles['Italic'])
            #     elements.append(continuation)
            #     elements.append(Spacer(1, 0.2 * inch))
            
            start_row = end_row + 1
        
        # Add page break between sheets (except for the last sheet)
        if sheet_idx < len(sheets) - 1:
            elements.append(PageBreak())
    
    # Build PDF
    doc.build(elements)
    
    return pdf_path