Spaces:
Sleeping
Sleeping
File size: 9,140 Bytes
aacdfd5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 |
import os
import math
from openpyxl import load_workbook
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter, A4, A3, landscape, portrait
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, PageBreak
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_LEFT, TA_CENTER
from reportlab.lib.units import inch
import pyexcel as p
def convert_xls_to_xlsx(xls_path, xlsx_path=None):
"""Convert the old .xls file to .xlsx format"""
if xlsx_path is None:
xlsx_path = os.path.splitext(xls_path)[0] + '.xlsx'
p.save_book_as(file_name=xls_path, dest_file_name=xlsx_path)
return xlsx_path
def determine_page_format(num_columns, max_column_width=None):
"""
Determine the optimal page size and orientation based on table dimensions.
Args:
num_columns (int): Number of columns in the table.
max_column_width (float, optional): Maximum column width if available.
Returns:
tuple: (pagesize, orientation function)
"""
# Define thresholds for decision making
if num_columns <= 5:
# Few columns, likely to fit on portrait A4
return A4, portrait
elif num_columns <= 8:
# Medium number of columns, use landscape A4
return A4, landscape
elif num_columns <= 12:
# Many columns, use portrait A3
return A3, portrait
else:
# Lots of columns, use landscape A3
return A3, landscape
def is_effectively_empty(value):
"""
Return True if the cell value is considered empty.
Empty means:
- The value is None.
- The value is a float and math.isnan(value) is True.
- The value is a string that is empty (after stripping whitespace).
"""
if value is None:
return True
if isinstance(value, float) and math.isnan(value):
return True
if isinstance(value, str) and not value.strip():
return True
return False
def excel_to_pdf(excel_path, pdf_path=None, sheet_name=None, max_rows_per_table=50):
"""
Convert Excel file to PDF with adaptive page size based on content,
removing columns that contain only NaN (or empty) values.
Args:
excel_path (str): Path to the Excel file.
pdf_path (str, optional): Path for the output PDF file.
sheet_name (str, optional): Name of the sheet to convert.
max_rows_per_table (int): Maximum rows per table before splitting.
Returns:
str: Path to the created PDF file.
"""
if excel_path.endswith('.xls'):
excel_path = convert_xls_to_xlsx(excel_path)
if pdf_path is None:
pdf_path = os.path.splitext(excel_path)[0] + '.pdf'
# Load Excel file
wb = load_workbook(excel_path)
sheets = [sheet_name] if sheet_name else wb.sheetnames
# Create paragraph styles for cell content
styles = getSampleStyleSheet()
header_style = ParagraphStyle(
name='HeaderStyle',
parent=styles['Normal'],
fontName='Helvetica-Bold',
fontSize=9,
alignment=TA_CENTER,
textColor=colors.white,
leading=12
)
cell_style = ParagraphStyle(
name='CellStyle',
parent=styles['Normal'],
fontName='Helvetica',
fontSize=8,
alignment=TA_LEFT,
leading=10 # Line spacing
)
elements = []
# Determine the effective maximum number of columns among all sheets (after filtering out empty ones)
global_effective_max_columns = 0
for sh in sheets:
sheet = wb[sh]
effective_cols = 0
for col in range(1, sheet.max_column + 1):
# Check if any cell in the column is non-empty
for row in range(1, sheet.max_row + 1):
if not is_effectively_empty(sheet.cell(row=row, column=col).value):
effective_cols += 1
break
global_effective_max_columns = max(global_effective_max_columns, effective_cols)
# Determine optimal page format based on effective column count
pagesize, orientation_func = determine_page_format(global_effective_max_columns)
# Create the document with determined format
doc = SimpleDocTemplate(
pdf_path,
pagesize=orientation_func(pagesize),
leftMargin=10,
rightMargin=10,
topMargin=15,
bottomMargin=15
)
# Process each sheet
for sheet_idx, current_sheet in enumerate(sheets):
sheet = wb[current_sheet]
# Determine which columns to keep (those with at least one non-empty cell)
columns_to_keep = []
for col in range(1, sheet.max_column + 1):
for row in range(1, sheet.max_row + 1):
if not is_effectively_empty(sheet.cell(row=row, column=col).value):
columns_to_keep.append(col)
break
# If no columns have valid data, skip this sheet.
if not columns_to_keep:
continue
# Calculate appropriate column widths (only for kept columns)
max_col_width = 130 # Maximum column width in points
min_col_width = 40 # Minimum column width in points
if pagesize == A3:
max_col_width = 150 # Allow wider columns on A3
col_widths = []
for col in columns_to_keep:
max_length = 0
# Sample first 100 rows for efficiency
for row in range(1, min(100, sheet.max_row) + 1):
cell = sheet.cell(row=row, column=col)
if cell.value:
content_length = len(str(cell.value))
# Cap the length for width calculation at 30 characters
max_length = max(max_length, min(content_length, 30))
# Adjust multiplier based on page format (narrower columns for A4, wider for A3)
multiplier = 5.5 if pagesize == A4 else 6.0
width = min(max(min_col_width, max_length * multiplier), max_col_width)
col_widths.append(width)
# Build the header row from the kept columns
header_row = []
# Using row 1 as header (or adjust if your header is in another row)
for col in columns_to_keep:
cell_value = sheet.cell(row=1, column=col).value
header_row.append(Paragraph(str(cell_value or ""), header_style))
# Process data rows in chunks to avoid huge tables that might get chopped
row_count = sheet.max_row
# Start after header row
start_row = 2
while start_row <= row_count:
end_row = min(start_row + max_rows_per_table - 1, row_count)
# Create data for this chunk, starting with the header row
chunk_data = [header_row]
for row_idx in range(start_row, end_row + 1):
data_row = []
for col in columns_to_keep:
cell = sheet.cell(row=row_idx, column=col)
cell_value = cell.value or ""
data_row.append(Paragraph(str(cell_value), cell_style))
chunk_data.append(data_row)
# Create table for this chunk
table = Table(chunk_data, colWidths=col_widths, repeatRows=1)
# Style the table
table_style = TableStyle([
# Header styling
('BACKGROUND', (0, 0), (-1, 0), colors.darkblue),
('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
('ALIGN', (0, 0), (-1, 0), 'CENTER'),
# Grid
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
('VALIGN', (0, 0), (-1, -1), 'TOP'),
# Row background colors
('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.lightgrey]),
# Cell padding
('LEFTPADDING', (0, 0), (-1, -1), 3),
('RIGHTPADDING', (0, 0), (-1, -1), 3),
('TOPPADDING', (0, 0), (-1, -1), 3),
('BOTTOMPADDING', (0, 0), (-1, -1), 3)
])
table.setStyle(table_style)
table.hAlign = 'LEFT'
table.spaceBefore = 5
table.spaceAfter = 15
elements.append(table)
# Uncomment below if you wish to add a continuation note when splitting tables
# if end_row < row_count:
# continuation = Paragraph(f"Table continues... (Rows {start_row}-{end_row} of {row_count})", styles['Italic'])
# elements.append(continuation)
# elements.append(Spacer(1, 0.2 * inch))
start_row = end_row + 1
# Add page break between sheets (except for the last sheet)
if sheet_idx < len(sheets) - 1:
elements.append(PageBreak())
# Build PDF
doc.build(elements)
return pdf_path |