|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from dotenv import load_dotenv |
|
|
|
|
|
import os |
|
|
|
from camel.models import ModelFactory |
|
from camel.logger import get_logger |
|
from camel.toolkits import ( |
|
AudioAnalysisToolkit, |
|
CodeExecutionToolkit, |
|
ExcelToolkit, |
|
ImageAnalysisToolkit, |
|
SearchToolkit, |
|
VideoAnalysisToolkit, |
|
BrowserToolkit, |
|
FileWriteToolkit, |
|
) |
|
from camel.types import ModelPlatformType, ModelType |
|
from camel.configs import ChatGPTConfig |
|
|
|
from owl.utils import GAIABenchmark |
|
from camel.logger import set_log_level |
|
|
|
import pathlib |
|
|
|
# Resolve the repository root relative to this file so the script works
# regardless of the current working directory.
base_dir = pathlib.Path(__file__).parent.parent

# Environment variables (API keys etc.) live in owl/.env inside the repo.
env_path = base_dir / "owl" / ".env"

load_dotenv(dotenv_path=str(env_path))


# Verbose logging for benchmark debugging.
set_log_level(level="DEBUG")


logger = get_logger(__name__)


# Benchmark run configuration.
LEVEL: int = 1  # GAIA difficulty level to run
SAVE_RESULT: bool = True  # persist per-task results to the save_to path
test_idx: list[int] = [0]  # indices of the tasks to run within the level
|
|
|
|
|
def main():
    """Run the GAIA benchmark on the configured validation tasks.

    Creates the cache/result directories, builds one deterministic GPT-4o
    model per agent role, assembles the assistant's toolkits, runs the
    benchmark for ``LEVEL``/``test_idx``, and logs the resulting accuracy.
    """

    cache_dir = "tmp/"
    os.makedirs(cache_dir, exist_ok=True)
    result_dir = "results/"
    os.makedirs(result_dir, exist_ok=True)

    def _make_gpt4o_model():
        """Build a deterministic (temperature=0, top_p=1) GPT-4o model."""
        return ModelFactory.create(
            model_platform=ModelPlatformType.OPENAI,
            model_type=ModelType.GPT_4O,
            model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(),
        )

    # One model per agent role. All roles use the same deterministic config;
    # building them through a single helper keeps the config in one place.
    roles = ("user", "assistant", "browsing", "planning", "video", "image")
    models = {role: _make_gpt4o_model() for role in roles}

    # Tools exposed to the assistant agent.
    tools = [
        *BrowserToolkit(
            headless=False,  # headful so pages render fully during browsing
            web_agent_model=models["browsing"],
            planning_agent_model=models["planning"],
        ).get_tools(),
        *VideoAnalysisToolkit(model=models["video"]).get_tools(),
        *AudioAnalysisToolkit().get_tools(),
        *CodeExecutionToolkit(sandbox="subprocess", verbose=True).get_tools(),
        *ImageAnalysisToolkit(model=models["image"]).get_tools(),
        *SearchToolkit().get_tools(),
        *ExcelToolkit().get_tools(),
        *FileWriteToolkit(output_dir="./").get_tools(),
    ]

    user_agent_kwargs = {"model": models["user"]}
    assistant_agent_kwargs = {"model": models["assistant"], "tools": tools}

    # Derive the save path from result_dir so the directory created above
    # and the output location cannot drift apart.
    benchmark = GAIABenchmark(
        data_dir="data/gaia",
        save_to=os.path.join(result_dir, "result.json"),
    )

    print(f"Number of validation examples: {len(benchmark.valid)}")
    print(f"Number of test examples: {len(benchmark.test)}")

    result = benchmark.run(
        on="valid",
        level=LEVEL,
        idx=test_idx,
        save_result=SAVE_RESULT,
        user_role_name="user",
        user_agent_kwargs=user_agent_kwargs,
        assistant_role_name="assistant",
        assistant_agent_kwargs=assistant_agent_kwargs,
    )

    logger.info(f"Correct: {result['correct']}, Total: {result['total']}")
    logger.info(f"Accuracy: {result['accuracy']}")
|
|
|
|
|
# Script entry point: run the benchmark only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|
|