Spaces:
Sleeping
Sleeping
import os | |
import math | |
from openpyxl import load_workbook | |
from reportlab.lib import colors | |
from reportlab.lib.pagesizes import letter, A4, A3, landscape, portrait | |
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, PageBreak | |
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle | |
from reportlab.lib.enums import TA_LEFT, TA_CENTER | |
from reportlab.lib.units import inch | |
import pyexcel as p | |
def convert_xls_to_xlsx(xls_path, xlsx_path=None): | |
"""Convert the old .xls file to .xlsx format""" | |
if xlsx_path is None: | |
xlsx_path = os.path.splitext(xls_path)[0] + '.xlsx' | |
p.save_book_as(file_name=xls_path, dest_file_name=xlsx_path) | |
return xlsx_path | |
def determine_page_format(num_columns, max_column_width=None): | |
""" | |
Determine the optimal page size and orientation based on table dimensions. | |
Args: | |
num_columns (int): Number of columns in the table. | |
max_column_width (float, optional): Maximum column width if available. | |
Returns: | |
tuple: (pagesize, orientation function) | |
""" | |
# Define thresholds for decision making | |
if num_columns <= 5: | |
# Few columns, likely to fit on portrait A4 | |
return A4, portrait | |
elif num_columns <= 8: | |
# Medium number of columns, use landscape A4 | |
return A4, landscape | |
elif num_columns <= 12: | |
# Many columns, use portrait A3 | |
return A3, portrait | |
else: | |
# Lots of columns, use landscape A3 | |
return A3, landscape | |
def is_effectively_empty(value): | |
""" | |
Return True if the cell value is considered empty. | |
Empty means: | |
- The value is None. | |
- The value is a float and math.isnan(value) is True. | |
- The value is a string that is empty (after stripping whitespace). | |
""" | |
if value is None: | |
return True | |
if isinstance(value, float) and math.isnan(value): | |
return True | |
if isinstance(value, str) and not value.strip(): | |
return True | |
return False | |
def excel_to_pdf(excel_path, pdf_path=None, sheet_name=None, max_rows_per_table=50): | |
""" | |
Convert Excel file to PDF with adaptive page size based on content, | |
removing columns that contain only NaN (or empty) values. | |
Args: | |
excel_path (str): Path to the Excel file. | |
pdf_path (str, optional): Path for the output PDF file. | |
sheet_name (str, optional): Name of the sheet to convert. | |
max_rows_per_table (int): Maximum rows per table before splitting. | |
Returns: | |
str: Path to the created PDF file. | |
""" | |
if excel_path.endswith('.xls'): | |
excel_path = convert_xls_to_xlsx(excel_path) | |
if pdf_path is None: | |
pdf_path = os.path.splitext(excel_path)[0] + '.pdf' | |
# Load Excel file | |
wb = load_workbook(excel_path) | |
sheets = [sheet_name] if sheet_name else wb.sheetnames | |
# Create paragraph styles for cell content | |
styles = getSampleStyleSheet() | |
header_style = ParagraphStyle( | |
name='HeaderStyle', | |
parent=styles['Normal'], | |
fontName='Helvetica-Bold', | |
fontSize=9, | |
alignment=TA_CENTER, | |
textColor=colors.white, | |
leading=12 | |
) | |
cell_style = ParagraphStyle( | |
name='CellStyle', | |
parent=styles['Normal'], | |
fontName='Helvetica', | |
fontSize=8, | |
alignment=TA_LEFT, | |
leading=10 # Line spacing | |
) | |
elements = [] | |
# Determine the effective maximum number of columns among all sheets (after filtering out empty ones) | |
global_effective_max_columns = 0 | |
for sh in sheets: | |
sheet = wb[sh] | |
effective_cols = 0 | |
for col in range(1, sheet.max_column + 1): | |
# Check if any cell in the column is non-empty | |
for row in range(1, sheet.max_row + 1): | |
if not is_effectively_empty(sheet.cell(row=row, column=col).value): | |
effective_cols += 1 | |
break | |
global_effective_max_columns = max(global_effective_max_columns, effective_cols) | |
# Determine optimal page format based on effective column count | |
pagesize, orientation_func = determine_page_format(global_effective_max_columns) | |
# Create the document with determined format | |
doc = SimpleDocTemplate( | |
pdf_path, | |
pagesize=orientation_func(pagesize), | |
leftMargin=10, | |
rightMargin=10, | |
topMargin=15, | |
bottomMargin=15 | |
) | |
# Process each sheet | |
for sheet_idx, current_sheet in enumerate(sheets): | |
sheet = wb[current_sheet] | |
# Determine which columns to keep (those with at least one non-empty cell) | |
columns_to_keep = [] | |
for col in range(1, sheet.max_column + 1): | |
for row in range(1, sheet.max_row + 1): | |
if not is_effectively_empty(sheet.cell(row=row, column=col).value): | |
columns_to_keep.append(col) | |
break | |
# If no columns have valid data, skip this sheet. | |
if not columns_to_keep: | |
continue | |
# Calculate appropriate column widths (only for kept columns) | |
max_col_width = 130 # Maximum column width in points | |
min_col_width = 40 # Minimum column width in points | |
if pagesize == A3: | |
max_col_width = 150 # Allow wider columns on A3 | |
col_widths = [] | |
for col in columns_to_keep: | |
max_length = 0 | |
# Sample first 100 rows for efficiency | |
for row in range(1, min(100, sheet.max_row) + 1): | |
cell = sheet.cell(row=row, column=col) | |
if cell.value: | |
content_length = len(str(cell.value)) | |
# Cap the length for width calculation at 30 characters | |
max_length = max(max_length, min(content_length, 30)) | |
# Adjust multiplier based on page format (narrower columns for A4, wider for A3) | |
multiplier = 5.5 if pagesize == A4 else 6.0 | |
width = min(max(min_col_width, max_length * multiplier), max_col_width) | |
col_widths.append(width) | |
# Build the header row from the kept columns | |
header_row = [] | |
# Using row 1 as header (or adjust if your header is in another row) | |
for col in columns_to_keep: | |
cell_value = sheet.cell(row=1, column=col).value | |
header_row.append(Paragraph(str(cell_value or ""), header_style)) | |
# Process data rows in chunks to avoid huge tables that might get chopped | |
row_count = sheet.max_row | |
# Start after header row | |
start_row = 2 | |
while start_row <= row_count: | |
end_row = min(start_row + max_rows_per_table - 1, row_count) | |
# Create data for this chunk, starting with the header row | |
chunk_data = [header_row] | |
for row_idx in range(start_row, end_row + 1): | |
data_row = [] | |
for col in columns_to_keep: | |
cell = sheet.cell(row=row_idx, column=col) | |
cell_value = cell.value or "" | |
data_row.append(Paragraph(str(cell_value), cell_style)) | |
chunk_data.append(data_row) | |
# Create table for this chunk | |
table = Table(chunk_data, colWidths=col_widths, repeatRows=1) | |
# Style the table | |
table_style = TableStyle([ | |
# Header styling | |
('BACKGROUND', (0, 0), (-1, 0), colors.darkblue), | |
('TEXTCOLOR', (0, 0), (-1, 0), colors.white), | |
('ALIGN', (0, 0), (-1, 0), 'CENTER'), | |
# Grid | |
('GRID', (0, 0), (-1, -1), 0.5, colors.grey), | |
('VALIGN', (0, 0), (-1, -1), 'TOP'), | |
# Row background colors | |
('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.lightgrey]), | |
# Cell padding | |
('LEFTPADDING', (0, 0), (-1, -1), 3), | |
('RIGHTPADDING', (0, 0), (-1, -1), 3), | |
('TOPPADDING', (0, 0), (-1, -1), 3), | |
('BOTTOMPADDING', (0, 0), (-1, -1), 3) | |
]) | |
table.setStyle(table_style) | |
table.hAlign = 'LEFT' | |
table.spaceBefore = 5 | |
table.spaceAfter = 15 | |
elements.append(table) | |
# Uncomment below if you wish to add a continuation note when splitting tables | |
# if end_row < row_count: | |
# continuation = Paragraph(f"Table continues... (Rows {start_row}-{end_row} of {row_count})", styles['Italic']) | |
# elements.append(continuation) | |
# elements.append(Spacer(1, 0.2 * inch)) | |
start_row = end_row + 1 | |
# Add page break between sheets (except for the last sheet) | |
if sheet_idx < len(sheets) - 1: | |
elements.append(PageBreak()) | |
# Build PDF | |
doc.build(elements) | |
return pdf_path |