File size: 4,836 Bytes
62da328
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from camel.toolkits.base import BaseToolkit
from camel.toolkits.function_tool import FunctionTool
from retry import retry
from typing import List, Dict, Any, Optional, Tuple
from loguru import logger
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from tabulate import tabulate
from xls2xlsx import XLS2XLSX
import os
import pandas as pd


class ExcelToolkit(BaseToolkit):
    r"""A class representing a toolkit for extract detailed cell information from an Excel file.

    This class provides method for processing docx, pdf, pptx, etc. It cannot process excel files.
    """

    def _convert_to_markdown(self, df: pd.DataFrame) -> str:
        """
        Convert DataFrame to Markdown format table.
        
        Args:
            df (pd.DataFrame): DataFrame containing the Excel data.
        
        Returns:
            str: Markdown formatted table.
        """
        md_table = tabulate(df, headers='keys', tablefmt='pipe')
        return str(md_table)
    

    def extract_excel_content(self, document_path: str) -> str:
        r"""Extract detailed cell information from an Excel file, including multiple sheets.
        
        Args:
            document_path (str): The path of the Excel file.
        
        Returns:
            str: Extracted excel information, including details of each sheet.
        """
        logger.debug(f"Calling extract_excel_content with document_path: {document_path}")

        if not (document_path.endswith("xls") or document_path.endswith("xlsx") or document_path.endswith("csv")):
            logger.error("Only xls, xlsx, csv files are supported.")
            return f"Failed to process file {document_path}: It is not excel format. Please try other ways."

        if document_path.endswith("csv"):
            try:
                df = pd.read_csv(document_path)
                md_table = self._convert_to_markdown(df)
                return f"CSV File Processed:\n{md_table}"
            except Exception as e:
                logger.error(f"Failed to process file {document_path}: {e}")
                return f"Failed to process file {document_path}: {e}"


        if document_path.endswith("xls"):
            output_path = document_path.replace(".xls", ".xlsx")
            x2x = XLS2XLSX(document_path)
            x2x.to_xlsx(output_path)
            document_path = output_path

        # Load the Excel workbook
        wb = load_workbook(document_path, data_only=True)
        sheet_info_list = [] 

        # Iterate through all sheets
        for sheet in wb.sheetnames:
            ws = wb[sheet]
            cell_info_list = []

            for row in ws.iter_rows():
                for cell in row:
                    row_num = cell.row
                    col_letter = cell.column_letter

                    cell_value = cell.value

                    font_color = None
                    if cell.font and cell.font.color and "rgb=None" not in str(cell.font.color):  # Handle font color
                        font_color = cell.font.color.rgb

                    fill_color = None
                    if cell.fill and cell.fill.fgColor and "rgb=None" not in str(cell.fill.fgColor):  # Handle fill color
                        fill_color = cell.fill.fgColor.rgb

                    cell_info_list.append({
                        "index": f"{row_num}{col_letter}",
                        "value": cell_value,
                        "font_color": font_color,
                        "fill_color": fill_color,
                    })

            # Convert the sheet to a DataFrame and then to markdown
            sheet_df = pd.read_excel(document_path, sheet_name=sheet, engine='openpyxl')
            markdown_content = self._convert_to_markdown(sheet_df)

            # Collect all information for the sheet
            sheet_info = {
                "sheet_name": sheet,
                "cell_info_list": cell_info_list,
                "markdown_content": markdown_content,
            }
            sheet_info_list.append(sheet_info)

        result_str = ""
        for sheet_info in sheet_info_list:
            result_str += f"""
            Sheet Name: {sheet_info['sheet_name']}
            Cell information list:
            {sheet_info['cell_info_list']}
            
            Markdown View of the content:
            {sheet_info['markdown_content']}
            
            {'-'*40}
            """

        return result_str

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the functions in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects representing the functions in the toolkit.
        """
        return [
            FunctionTool(self.extract_excel_content),
        ]