Spaces:

rote1
/

IAGO

Sleeping

App Files Files Community

IAGO / deep-swarm /camel /toolkits /excel_toolkit.py

zyh-ralph

initial update

62da328 3 months ago

raw

history blame

4.84 kB

	from camel.toolkits.base import BaseToolkit
	from camel.toolkits.function_tool import FunctionTool
	from retry import retry
	from typing import List, Dict, Any, Optional, Tuple
	from loguru import logger
	from openpyxl import load_workbook
	from openpyxl.utils.dataframe import dataframe_to_rows
	from tabulate import tabulate
	from xls2xlsx import XLS2XLSX
	import os
	import pandas as pd


	class ExcelToolkit(BaseToolkit):
	r"""A class representing a toolkit for extract detailed cell information from an Excel file.

	This class provides method for processing docx, pdf, pptx, etc. It cannot process excel files.
	"""

	def _convert_to_markdown(self, df: pd.DataFrame) -> str:
	"""
	Convert DataFrame to Markdown format table.

	Args:
	df (pd.DataFrame): DataFrame containing the Excel data.

	Returns:
	str: Markdown formatted table.
	"""
	md_table = tabulate(df, headers='keys', tablefmt='pipe')
	return str(md_table)


	def extract_excel_content(self, document_path: str) -> str:
	r"""Extract detailed cell information from an Excel file, including multiple sheets.

	Args:
	document_path (str): The path of the Excel file.

	Returns:
	str: Extracted excel information, including details of each sheet.
	"""
	logger.debug(f"Calling extract_excel_content with document_path: {document_path}")

	if not (document_path.endswith("xls") or document_path.endswith("xlsx") or document_path.endswith("csv")):
	logger.error("Only xls, xlsx, csv files are supported.")
	return f"Failed to process file {document_path}: It is not excel format. Please try other ways."

	if document_path.endswith("csv"):
	try:
	df = pd.read_csv(document_path)
	md_table = self._convert_to_markdown(df)
	return f"CSV File Processed:\n{md_table}"
	except Exception as e:
	logger.error(f"Failed to process file {document_path}: {e}")
	return f"Failed to process file {document_path}: {e}"


	if document_path.endswith("xls"):
	output_path = document_path.replace(".xls", ".xlsx")
	x2x = XLS2XLSX(document_path)
	x2x.to_xlsx(output_path)
	document_path = output_path

	# Load the Excel workbook
	wb = load_workbook(document_path, data_only=True)
	sheet_info_list = []

	# Iterate through all sheets
	for sheet in wb.sheetnames:
	ws = wb[sheet]
	cell_info_list = []

	for row in ws.iter_rows():
	for cell in row:
	row_num = cell.row
	col_letter = cell.column_letter

	cell_value = cell.value

	font_color = None
	if cell.font and cell.font.color and "rgb=None" not in str(cell.font.color): # Handle font color
	font_color = cell.font.color.rgb

	fill_color = None
	if cell.fill and cell.fill.fgColor and "rgb=None" not in str(cell.fill.fgColor): # Handle fill color
	fill_color = cell.fill.fgColor.rgb

	cell_info_list.append({
	"index": f"{row_num}{col_letter}",
	"value": cell_value,
	"font_color": font_color,
	"fill_color": fill_color,
	})

	# Convert the sheet to a DataFrame and then to markdown
	sheet_df = pd.read_excel(document_path, sheet_name=sheet, engine='openpyxl')
	markdown_content = self._convert_to_markdown(sheet_df)

	# Collect all information for the sheet
	sheet_info = {
	"sheet_name": sheet,
	"cell_info_list": cell_info_list,
	"markdown_content": markdown_content,
	}
	sheet_info_list.append(sheet_info)

	result_str = ""
	for sheet_info in sheet_info_list:
	result_str += f"""
	Sheet Name: {sheet_info['sheet_name']}
	Cell information list:
	{sheet_info['cell_info_list']}

	Markdown View of the content:
	{sheet_info['markdown_content']}

	{'-'*40}
	"""

	return result_str

	def get_tools(self) -> List[FunctionTool]:
	r"""Returns a list of FunctionTool objects representing the functions in the toolkit.

	Returns:
	List[FunctionTool]: A list of FunctionTool objects representing the functions in the toolkit.
	"""
	return [
	FunctionTool(self.extract_excel_content),
	]