File size: 4,040 Bytes
62da328 38255bb 62da328 38255bb 62da328 38255bb 62da328 38255bb 62da328 38255bb 62da328 38255bb 35ee5f2 38255bb 35ee5f2 38255bb 35ee5f2 3863783 38255bb 35ee5f2 38255bb 62da328 38255bb 62da328 38255bb 62da328 38255bb 62da328 38255bb 62da328 38255bb 62da328 38255bb 62da328 38255bb 62da328 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
from dotenv import load_dotenv
load_dotenv()
import os
from loguru import logger
from camel.models import ModelFactory
from camel.toolkits import (
AudioAnalysisToolkit,
CodeExecutionToolkit,
DocumentProcessingToolkit,
ExcelToolkit,
ImageAnalysisToolkit,
SearchToolkit,
VideoAnalysisToolkit,
WebToolkit,
)
from camel.types import ModelPlatformType, ModelType
from camel.configs import ChatGPTConfig
from utils import GAIABenchmark
# Configuration
LEVEL = 1
SAVE_RESULT = True
test_idx = [0]
def main():
"""Main function to run the GAIA benchmark."""
# Create cache directory
cache_dir = "tmp/"
os.makedirs(cache_dir, exist_ok=True)
# Create models for different components
models = {
"user": ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.GPT_4O,
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(),
),
"assistant": ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.GPT_4O,
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(),
),
"web": ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.GPT_4O,
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(),
),
"planning": ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.GPT_4O,
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(),
),
"video": ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.GPT_4O,
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(),
),
"image": ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.GPT_4O,
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(),
),
"search": ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.GPT_4O,
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(),
),
}
# Configure toolkits
tools = [
*WebToolkit(
headless=False, # Set to True for headless mode (e.g., on remote servers)
web_agent_model=models["web"],
planning_agent_model=models["planning"],
).get_tools(),
*DocumentProcessingToolkit().get_tools(),
*VideoAnalysisToolkit(model=models["video"]).get_tools(), # This requires OpenAI Key
*AudioAnalysisToolkit().get_tools(), # This requires OpenAI Key
*CodeExecutionToolkit(sandbox="subprocess", verbose=True).get_tools(),
*ImageAnalysisToolkit(model=models["image"]).get_tools(),
*SearchToolkit(model=models["search"]).get_tools(),
*ExcelToolkit().get_tools(),
]
# Configure agent roles and parameters
user_agent_kwargs = {"model": models["user"]}
assistant_agent_kwargs = {"model": models["assistant"], "tools": tools}
# Initialize benchmark
benchmark = GAIABenchmark(
data_dir="data/gaia",
save_to=f"results/result.json"
)
# Print benchmark information
print(f"Number of validation examples: {len(benchmark.valid)}")
print(f"Number of test examples: {len(benchmark.test)}")
# Run benchmark
result = benchmark.run(
on="valid",
level=LEVEL,
idx=test_idx,
save_result=SAVE_RESULT,
user_role_name="user",
user_agent_kwargs=user_agent_kwargs,
assistant_role_name="assistant",
assistant_agent_kwargs=assistant_agent_kwargs,
)
# Output results
logger.success(f"Correct: {result['correct']}, Total: {result['total']}")
logger.success(f"Accuracy: {result['accuracy']}")
if __name__ == "__main__":
main()
|