Spaces:

krishnavadithya
/

expiryprocess

Sleeping

File size: 9,140 Bytes

aacdfd5



import os
import math
from openpyxl import load_workbook
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter, A4, A3, landscape, portrait
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, PageBreak
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_LEFT, TA_CENTER
from reportlab.lib.units import inch
import pyexcel as p


def convert_xls_to_xlsx(xls_path, xlsx_path=None):
    """Convert the old .xls file to .xlsx format"""
    if xlsx_path is None:
        xlsx_path = os.path.splitext(xls_path)[0] + '.xlsx'
    p.save_book_as(file_name=xls_path, dest_file_name=xlsx_path)
    return xlsx_path


def determine_page_format(num_columns, max_column_width=None):
    """
    Determine the optimal page size and orientation based on table dimensions.
    
    Args:
        num_columns (int): Number of columns in the table.
        max_column_width (float, optional): Maximum column width if available.
        
    Returns:
        tuple: (pagesize, orientation function)
    """
    # Define thresholds for decision making
    if num_columns <= 5:
        # Few columns, likely to fit on portrait A4
        return A4, portrait
    elif num_columns <= 8:
        # Medium number of columns, use landscape A4
        return A4, landscape
    elif num_columns <= 12:
        # Many columns, use portrait A3
        return A3, portrait
    else:
        # Lots of columns, use landscape A3
        return A3, landscape


def is_effectively_empty(value):
    """
    Return True if the cell value is considered empty.
    
    Empty means:
      - The value is None.
      - The value is a float and math.isnan(value) is True.
      - The value is a string that is empty (after stripping whitespace).
    """
    if value is None:
        return True
    if isinstance(value, float) and math.isnan(value):
        return True
    if isinstance(value, str) and not value.strip():
        return True
    return False


def excel_to_pdf(excel_path, pdf_path=None, sheet_name=None, max_rows_per_table=50):
    """
    Convert Excel file to PDF with adaptive page size based on content,
    removing columns that contain only NaN (or empty) values.
    
    Args:
        excel_path (str): Path to the Excel file.
        pdf_path (str, optional): Path for the output PDF file.
        sheet_name (str, optional): Name of the sheet to convert.
        max_rows_per_table (int): Maximum rows per table before splitting.
        
    Returns:
        str: Path to the created PDF file.
    """
    if excel_path.endswith('.xls'):
        excel_path = convert_xls_to_xlsx(excel_path)

    if pdf_path is None:
        pdf_path = os.path.splitext(excel_path)[0] + '.pdf'
    
    # Load Excel file
    wb = load_workbook(excel_path)
    sheets = [sheet_name] if sheet_name else wb.sheetnames

    # Create paragraph styles for cell content
    styles = getSampleStyleSheet()
    header_style = ParagraphStyle(
        name='HeaderStyle',
        parent=styles['Normal'],
        fontName='Helvetica-Bold',
        fontSize=9,
        alignment=TA_CENTER,
        textColor=colors.white,
        leading=12
    )
    cell_style = ParagraphStyle(
        name='CellStyle',
        parent=styles['Normal'],
        fontName='Helvetica',
        fontSize=8,
        alignment=TA_LEFT,
        leading=10  # Line spacing
    )
    
    elements = []
    
    # Determine the effective maximum number of columns among all sheets (after filtering out empty ones)
    global_effective_max_columns = 0
    for sh in sheets:
        sheet = wb[sh]
        effective_cols = 0
        for col in range(1, sheet.max_column + 1):
            # Check if any cell in the column is non-empty
            for row in range(1, sheet.max_row + 1):
                if not is_effectively_empty(sheet.cell(row=row, column=col).value):
                    effective_cols += 1
                    break
        global_effective_max_columns = max(global_effective_max_columns, effective_cols)
    
    # Determine optimal page format based on effective column count
    pagesize, orientation_func = determine_page_format(global_effective_max_columns)
    
    # Create the document with determined format
    doc = SimpleDocTemplate(
        pdf_path,
        pagesize=orientation_func(pagesize),
        leftMargin=10,
        rightMargin=10,
        topMargin=15,
        bottomMargin=15
    )
    
    # Process each sheet
    for sheet_idx, current_sheet in enumerate(sheets):
        sheet = wb[current_sheet]
        
        # Determine which columns to keep (those with at least one non-empty cell)
        columns_to_keep = []
        for col in range(1, sheet.max_column + 1):
            for row in range(1, sheet.max_row + 1):
                if not is_effectively_empty(sheet.cell(row=row, column=col).value):
                    columns_to_keep.append(col)
                    break
        
        # If no columns have valid data, skip this sheet.
        if not columns_to_keep:
            continue
        
        # Calculate appropriate column widths (only for kept columns)
        max_col_width = 130  # Maximum column width in points
        min_col_width = 40   # Minimum column width in points
        if pagesize == A3:
            max_col_width = 150  # Allow wider columns on A3
        
        col_widths = []
        for col in columns_to_keep:
            max_length = 0
            # Sample first 100 rows for efficiency
            for row in range(1, min(100, sheet.max_row) + 1):
                cell = sheet.cell(row=row, column=col)
                if cell.value:
                    content_length = len(str(cell.value))
                    # Cap the length for width calculation at 30 characters
                    max_length = max(max_length, min(content_length, 30))
            # Adjust multiplier based on page format (narrower columns for A4, wider for A3)
            multiplier = 5.5 if pagesize == A4 else 6.0
            width = min(max(min_col_width, max_length * multiplier), max_col_width)
            col_widths.append(width)
        
        # Build the header row from the kept columns
        header_row = []
        # Using row 1 as header (or adjust if your header is in another row)
        for col in columns_to_keep:
            cell_value = sheet.cell(row=1, column=col).value
            header_row.append(Paragraph(str(cell_value or ""), header_style))
        
        # Process data rows in chunks to avoid huge tables that might get chopped
        row_count = sheet.max_row
        # Start after header row
        start_row = 2  
        while start_row <= row_count:
            end_row = min(start_row + max_rows_per_table - 1, row_count)
            
            # Create data for this chunk, starting with the header row
            chunk_data = [header_row]
            for row_idx in range(start_row, end_row + 1):
                data_row = []
                for col in columns_to_keep:
                    cell = sheet.cell(row=row_idx, column=col)
                    cell_value = cell.value or ""
                    data_row.append(Paragraph(str(cell_value), cell_style))
                chunk_data.append(data_row)
            
            # Create table for this chunk
            table = Table(chunk_data, colWidths=col_widths, repeatRows=1)
            
            # Style the table
            table_style = TableStyle([
                # Header styling
                ('BACKGROUND', (0, 0), (-1, 0), colors.darkblue),
                ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
                ('ALIGN', (0, 0), (-1, 0), 'CENTER'),
                
                # Grid
                ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
                ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                
                # Row background colors
                ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.lightgrey]),
                
                # Cell padding
                ('LEFTPADDING', (0, 0), (-1, -1), 3),
                ('RIGHTPADDING', (0, 0), (-1, -1), 3),
                ('TOPPADDING', (0, 0), (-1, -1), 3),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 3)
            ])
            
            table.setStyle(table_style)
            table.hAlign = 'LEFT'
            table.spaceBefore = 5
            table.spaceAfter = 15
            
            elements.append(table)
            
            # Uncomment below if you wish to add a continuation note when splitting tables
            # if end_row < row_count:
            #     continuation = Paragraph(f"Table continues... (Rows {start_row}-{end_row} of {row_count})", styles['Italic'])
            #     elements.append(continuation)
            #     elements.append(Spacer(1, 0.2 * inch))
            
            start_row = end_row + 1
        
        # Add page break between sheets (except for the last sheet)
        if sheet_idx < len(sheets) - 1:
            elements.append(PageBreak())
    
    # Build PDF
    doc.build(elements)
    
    return pdf_path