|
from camel.toolkits.base import BaseToolkit |
|
from camel.toolkits.function_tool import FunctionTool |
|
from retry import retry |
|
from typing import List, Dict, Any, Optional, Tuple |
|
from loguru import logger |
|
from openpyxl import load_workbook |
|
from openpyxl.utils.dataframe import dataframe_to_rows |
|
from tabulate import tabulate |
|
from xls2xlsx import XLS2XLSX |
|
import os |
|
import pandas as pd |
|
|
|
|
|
class ExcelToolkit(BaseToolkit): |
|
r"""A class representing a toolkit for extract detailed cell information from an Excel file. |
|
|
|
This class provides method for processing docx, pdf, pptx, etc. It cannot process excel files. |
|
""" |
|
|
|
def _convert_to_markdown(self, df: pd.DataFrame) -> str: |
|
""" |
|
Convert DataFrame to Markdown format table. |
|
|
|
Args: |
|
df (pd.DataFrame): DataFrame containing the Excel data. |
|
|
|
Returns: |
|
str: Markdown formatted table. |
|
""" |
|
md_table = tabulate(df, headers='keys', tablefmt='pipe') |
|
return str(md_table) |
|
|
|
|
|
def extract_excel_content(self, document_path: str) -> str: |
|
r"""Extract detailed cell information from an Excel file, including multiple sheets. |
|
|
|
Args: |
|
document_path (str): The path of the Excel file. |
|
|
|
Returns: |
|
str: Extracted excel information, including details of each sheet. |
|
""" |
|
logger.debug(f"Calling extract_excel_content with document_path: {document_path}") |
|
|
|
if not (document_path.endswith("xls") or document_path.endswith("xlsx") or document_path.endswith("csv")): |
|
logger.error("Only xls, xlsx, csv files are supported.") |
|
return f"Failed to process file {document_path}: It is not excel format. Please try other ways." |
|
|
|
if document_path.endswith("csv"): |
|
try: |
|
df = pd.read_csv(document_path) |
|
md_table = self._convert_to_markdown(df) |
|
return f"CSV File Processed:\n{md_table}" |
|
except Exception as e: |
|
logger.error(f"Failed to process file {document_path}: {e}") |
|
return f"Failed to process file {document_path}: {e}" |
|
|
|
|
|
if document_path.endswith("xls"): |
|
output_path = document_path.replace(".xls", ".xlsx") |
|
x2x = XLS2XLSX(document_path) |
|
x2x.to_xlsx(output_path) |
|
document_path = output_path |
|
|
|
|
|
wb = load_workbook(document_path, data_only=True) |
|
sheet_info_list = [] |
|
|
|
|
|
for sheet in wb.sheetnames: |
|
ws = wb[sheet] |
|
cell_info_list = [] |
|
|
|
for row in ws.iter_rows(): |
|
for cell in row: |
|
row_num = cell.row |
|
col_letter = cell.column_letter |
|
|
|
cell_value = cell.value |
|
|
|
font_color = None |
|
if cell.font and cell.font.color and "rgb=None" not in str(cell.font.color): |
|
font_color = cell.font.color.rgb |
|
|
|
fill_color = None |
|
if cell.fill and cell.fill.fgColor and "rgb=None" not in str(cell.fill.fgColor): |
|
fill_color = cell.fill.fgColor.rgb |
|
|
|
cell_info_list.append({ |
|
"index": f"{row_num}{col_letter}", |
|
"value": cell_value, |
|
"font_color": font_color, |
|
"fill_color": fill_color, |
|
}) |
|
|
|
|
|
sheet_df = pd.read_excel(document_path, sheet_name=sheet, engine='openpyxl') |
|
markdown_content = self._convert_to_markdown(sheet_df) |
|
|
|
|
|
sheet_info = { |
|
"sheet_name": sheet, |
|
"cell_info_list": cell_info_list, |
|
"markdown_content": markdown_content, |
|
} |
|
sheet_info_list.append(sheet_info) |
|
|
|
result_str = "" |
|
for sheet_info in sheet_info_list: |
|
result_str += f""" |
|
Sheet Name: {sheet_info['sheet_name']} |
|
Cell information list: |
|
{sheet_info['cell_info_list']} |
|
|
|
Markdown View of the content: |
|
{sheet_info['markdown_content']} |
|
|
|
{'-'*40} |
|
""" |
|
|
|
return result_str |
|
|
|
def get_tools(self) -> List[FunctionTool]: |
|
r"""Returns a list of FunctionTool objects representing the functions in the toolkit. |
|
|
|
Returns: |
|
List[FunctionTool]: A list of FunctionTool objects representing the functions in the toolkit. |
|
""" |
|
return [ |
|
FunctionTool(self.extract_excel_content), |
|
] |
|
|