File size: 4,836 Bytes
62da328 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
from camel.toolkits.base import BaseToolkit
from camel.toolkits.function_tool import FunctionTool
from retry import retry
from typing import List, Dict, Any, Optional, Tuple
from loguru import logger
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from tabulate import tabulate
from xls2xlsx import XLS2XLSX
import os
import pandas as pd
class ExcelToolkit(BaseToolkit):
r"""A class representing a toolkit for extract detailed cell information from an Excel file.
This class provides method for processing docx, pdf, pptx, etc. It cannot process excel files.
"""
def _convert_to_markdown(self, df: pd.DataFrame) -> str:
"""
Convert DataFrame to Markdown format table.
Args:
df (pd.DataFrame): DataFrame containing the Excel data.
Returns:
str: Markdown formatted table.
"""
md_table = tabulate(df, headers='keys', tablefmt='pipe')
return str(md_table)
def extract_excel_content(self, document_path: str) -> str:
r"""Extract detailed cell information from an Excel file, including multiple sheets.
Args:
document_path (str): The path of the Excel file.
Returns:
str: Extracted excel information, including details of each sheet.
"""
logger.debug(f"Calling extract_excel_content with document_path: {document_path}")
if not (document_path.endswith("xls") or document_path.endswith("xlsx") or document_path.endswith("csv")):
logger.error("Only xls, xlsx, csv files are supported.")
return f"Failed to process file {document_path}: It is not excel format. Please try other ways."
if document_path.endswith("csv"):
try:
df = pd.read_csv(document_path)
md_table = self._convert_to_markdown(df)
return f"CSV File Processed:\n{md_table}"
except Exception as e:
logger.error(f"Failed to process file {document_path}: {e}")
return f"Failed to process file {document_path}: {e}"
if document_path.endswith("xls"):
output_path = document_path.replace(".xls", ".xlsx")
x2x = XLS2XLSX(document_path)
x2x.to_xlsx(output_path)
document_path = output_path
# Load the Excel workbook
wb = load_workbook(document_path, data_only=True)
sheet_info_list = []
# Iterate through all sheets
for sheet in wb.sheetnames:
ws = wb[sheet]
cell_info_list = []
for row in ws.iter_rows():
for cell in row:
row_num = cell.row
col_letter = cell.column_letter
cell_value = cell.value
font_color = None
if cell.font and cell.font.color and "rgb=None" not in str(cell.font.color): # Handle font color
font_color = cell.font.color.rgb
fill_color = None
if cell.fill and cell.fill.fgColor and "rgb=None" not in str(cell.fill.fgColor): # Handle fill color
fill_color = cell.fill.fgColor.rgb
cell_info_list.append({
"index": f"{row_num}{col_letter}",
"value": cell_value,
"font_color": font_color,
"fill_color": fill_color,
})
# Convert the sheet to a DataFrame and then to markdown
sheet_df = pd.read_excel(document_path, sheet_name=sheet, engine='openpyxl')
markdown_content = self._convert_to_markdown(sheet_df)
# Collect all information for the sheet
sheet_info = {
"sheet_name": sheet,
"cell_info_list": cell_info_list,
"markdown_content": markdown_content,
}
sheet_info_list.append(sheet_info)
result_str = ""
for sheet_info in sheet_info_list:
result_str += f"""
Sheet Name: {sheet_info['sheet_name']}
Cell information list:
{sheet_info['cell_info_list']}
Markdown View of the content:
{sheet_info['markdown_content']}
{'-'*40}
"""
return result_str
def get_tools(self) -> List[FunctionTool]:
r"""Returns a list of FunctionTool objects representing the functions in the toolkit.
Returns:
List[FunctionTool]: A list of FunctionTool objects representing the functions in the toolkit.
"""
return [
FunctionTool(self.extract_excel_content),
]
|