Review agent added
Browse files- agents/llama_index_agent.py +114 -42
- app.py +35 -12
- tools/multimedia_tools.py +1 -1
agents/llama_index_agent.py
CHANGED
@@ -197,7 +197,6 @@ class GaiaAgent(ReActAgent):
|
|
197 |
|
198 |
query: What is the first name of the scientist who discovered penicillin?
|
199 |
research_notes: After researching, I found that Sir Alexander Fleming discovered penicillin in 1928. The full answer is "Alexander Fleming" but the question only asks for the first name, which is "Alexander".
|
200 |
-
answer_format: Return ONLY the first name, with no additional text, punctuation, or explanation.
|
201 |
```
|
202 |
|
203 |
IMPORTANT: NEVER provide the final answer directly to the user. ALWAYS hand off to the writer_agent for proper formatting.
|
@@ -222,21 +221,20 @@ def create_writer_agent(model_config: Dict[str, Any]) -> ReActAgent:
|
|
222 |
llm = OpenAI(
|
223 |
model=model_name,
|
224 |
api_key=api_key or os.getenv("OPENAI_API_KEY"),
|
225 |
-
max_tokens=
|
226 |
temperature=0.1,
|
227 |
-
|
228 |
additional_kwargs={
|
229 |
-
"max_tokens":
|
230 |
-
"temperature": 0.
|
231 |
-
|
|
|
232 |
elif model_provider.lower() == "anthropic":
|
233 |
llm = Anthropic(
|
234 |
model=model_name,
|
235 |
api_key=api_key or os.getenv("ANTHROPIC_API_KEY"),
|
236 |
-
temperature=
|
237 |
-
thinking_dict={"type": "enabled", "budget_tokens":
|
238 |
-
max_tokens=
|
239 |
-
|
240 |
)
|
241 |
else:
|
242 |
raise ValueError(f"Unsupported model provider for writer agent: {model_provider}")
|
@@ -244,50 +242,124 @@ def create_writer_agent(model_config: Dict[str, Any]) -> ReActAgent:
|
|
244 |
# Create and return the writer agent
|
245 |
return ReActAgent(
|
246 |
name="writer_agent",
|
247 |
-
description="Formats the final answer
|
248 |
system_prompt="""
|
249 |
-
You are a specialized formatting agent for the GAIA benchmark. Your
|
250 |
|
251 |
## YOUR ROLE
|
252 |
You will receive:
|
253 |
- query: The original question
|
254 |
- research_notes: The main agent's complete analysis and reasoning
|
255 |
-
- answer_format: Specific formatting instructions for the final answer
|
256 |
|
257 |
-
##
|
258 |
-
1.
|
259 |
-
2.
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
7. NEVER include your own thoughts or analysis
|
265 |
-
8. NEVER add preamble or conclusion text
|
266 |
|
267 |
-
##
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
When asked for "opposite of word 'right'": left
|
272 |
-
When asked for "How many ...": eleven
|
273 |
-
When asked for "What says Yoda": "May the force be with you"
|
274 |
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
|
|
|
|
|
|
|
|
279 |
|
280 |
-
|
281 |
-
|
282 |
-
|
|
|
|
|
283 |
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
289 |
|
290 |
-
|
291 |
""",
|
292 |
llm=llm
|
293 |
)
|
|
|
197 |
|
198 |
query: What is the first name of the scientist who discovered penicillin?
|
199 |
research_notes: After researching, I found that Sir Alexander Fleming discovered penicillin in 1928. The full answer is "Alexander Fleming" but the question only asks for the first name, which is "Alexander".
|
|
|
200 |
```
|
201 |
|
202 |
IMPORTANT: NEVER provide the final answer directly to the user. ALWAYS hand off to the writer_agent for proper formatting.
|
|
|
221 |
llm = OpenAI(
|
222 |
model=model_name,
|
223 |
api_key=api_key or os.getenv("OPENAI_API_KEY"),
|
224 |
+
max_tokens=256,
|
225 |
temperature=0.1,
|
|
|
226 |
additional_kwargs={
|
227 |
+
"max_tokens": 256,
|
228 |
+
"temperature": 0.1
|
229 |
+
}
|
230 |
+
)
|
231 |
elif model_provider.lower() == "anthropic":
|
232 |
llm = Anthropic(
|
233 |
model=model_name,
|
234 |
api_key=api_key or os.getenv("ANTHROPIC_API_KEY"),
|
235 |
+
temperature=0.1,
|
236 |
+
thinking_dict={"type": "enabled", "budget_tokens": 1024} if "3-7" in model_name else None,
|
237 |
+
max_tokens=1024
|
|
|
238 |
)
|
239 |
else:
|
240 |
raise ValueError(f"Unsupported model provider for writer agent: {model_provider}")
|
|
|
242 |
# Create and return the writer agent
|
243 |
return ReActAgent(
|
244 |
name="writer_agent",
|
245 |
+
description="Formats the final answer based on research notes for GAIA benchmark questions",
|
246 |
system_prompt="""
|
247 |
+
You are a specialized formatting agent for the GAIA benchmark. Your job is to take the research from the main agent and format the answer according to the benchmark requirements.
|
248 |
|
249 |
## YOUR ROLE
|
250 |
You will receive:
|
251 |
- query: The original question
|
252 |
- research_notes: The main agent's complete analysis and reasoning
|
|
|
253 |
|
254 |
+
## FORMATTING RULES
|
255 |
+
1. Format the answer according to the instructions in the `query` received
|
256 |
+
2. Your answers will be always as minimal as necessary to answer the question
|
257 |
+
2. Try to remove unnecessary characters, spaces, or wording
|
258 |
+
3. If asked for a name, provide **ONLY** the name
|
259 |
+
4. If asked for a number, provide the **ONLY** number
|
260 |
+
5. If asked for a list, format it exactly as specified
|
|
|
|
|
261 |
|
262 |
+
## DELEGATION TO REVIEW AGENT
|
263 |
+
After formatting your answer, ALWAYS delegate to the review_agent with:
|
264 |
+
- query: The original question
|
265 |
+
- formatted_answer: Your formatted answer
|
|
|
|
|
|
|
266 |
|
267 |
+
Example handoff to review_agent:
|
268 |
+
```
|
269 |
+
I'll delegate to review_agent for final review.
|
270 |
+
|
271 |
+
query: What is the first name of the scientist who discovered penicillin?
|
272 |
+
formatted_answer: Alexander
|
273 |
+
format_requirements: Return ONLY the first name, with no additional text.
|
274 |
+
```
|
275 |
|
276 |
+
IMPORTANT: ALWAYS hand off to the review_agent for final verification and cleanup.
|
277 |
+
""",
|
278 |
+
llm=llm,
|
279 |
+
can_handoff_to=["review_agent"]
|
280 |
+
)
|
281 |
|
282 |
+
def create_review_agent(model_config: Dict[str, Any]) -> ReActAgent:
    """
    Create a review agent that ensures the final answer follows exact formatting requirements.

    The agent is prompted to take the original query plus the writer agent's
    formatted answer and reply with nothing but the bare final answer.

    Args:
        model_config: Dictionary containing model_provider, model_name, and api_key.
            Missing model_provider / model_name fall back to "openai" /
            "gpt-4o-mini"; a missing or empty api_key falls back to the
            provider's environment variable (OPENAI_API_KEY / ANTHROPIC_API_KEY).

    Returns:
        A configured ReActAgent for final answer review and formatting

    Raises:
        ValueError: If model_provider is neither "openai" nor "anthropic"
            (compared case-insensitively).
    """
    # Initialize LLM based on the provided configuration
    model_provider = model_config.get("model_provider", "openai")
    model_name = model_config.get("model_name", "gpt-4o-mini")
    api_key = model_config.get("api_key")

    # Normalize once so both branches compare against the same value.
    provider = model_provider.lower()

    if provider == "openai":
        llm = OpenAI(
            model=model_name,
            api_key=api_key or os.getenv("OPENAI_API_KEY"),
            max_tokens=128,
            temperature=0.0,  # Use 0 temperature for deterministic output
            additional_kwargs={
                "max_tokens": 128,
                "temperature": 0.0,
            },
        )
    elif provider == "anthropic":
        # Extended thinking is intentionally NOT enabled for the review agent.
        # Anthropic requires thinking.budget_tokens to be smaller than
        # max_tokens and does not allow a modified temperature while thinking
        # is on, so the previous budget_tokens=1024 combined with
        # max_tokens=128 / temperature=0.0 was rejected for every "3-7" model.
        # This step only cleans up an already formatted answer, so the low
        # token limit and deterministic temperature are what matter here.
        llm = Anthropic(
            model=model_name,
            api_key=api_key or os.getenv("ANTHROPIC_API_KEY"),
            temperature=0.0,  # Use 0 temperature for deterministic output
            max_tokens=128,  # Keep token limit low for final answers
        )
    else:
        raise ValueError(f"Unsupported model provider for review agent: {model_provider}")

    # Create and return the review agent
    return ReActAgent(
        name="review_agent",
        description="Ensures the final answer is formatted exactly as required, removing any unnecessary information",
        system_prompt="""
You are the final review agent for the GAIA benchmark. Your ONLY job is to ensure the answer is in the EXACT format required. This is EXTREMELY important for benchmark scoring.

## YOUR ROLE
You will receive:
- query: The original question
- formatted_answer: The answer formatted by the writer agent

## CRITICAL RULES
1. Your ENTIRE response must be ONLY the final answer - NOTHING ELSE
2. Remove ALL of the following:
- Explanations like "The answer is..." or "I found that..."
- Quotation marks (unless explicitly required)
- Punctuation at the end (unless explicitly required)
- Unnecessary whitespace
3. If no specific format is mentioned, make the answer as minimal as possible:
- For names/words: just the name/word (e.g., "Paris")
- For numbers: just the number (e.g., "42")
- For lists: comma-separated values (e.g., "apple, banana, cherry")
4. NEVER add ANY commentary, explanation, or additional information
5. Double-check for exact formatting requirements like:
- Numerical format (e.g., "42" vs "forty-two")
- Case sensitivity (e.g., "PARIS" vs "Paris")
- List formatting (e.g., comma-separated vs numbered)

## OUTPUT EXAMPLES
- Input: "The answer is Alexander."
Output: Alexander
- Input: "The result is 42 because..."
Output: 42
- Input: "The capital of France is Paris."
Output: Paris
- Input: "I found that it's eleven."
Output: eleven
- Input: "These actors starred in the film: Tom Hanks, Meg Ryan, and Bill Pullman."
Output: Tom Hanks, Meg Ryan, Bill Pullman
- Input: "She published studio albums "Album 1", "Album 2", "Album 3", so in total 3."
Output: 3

REMEMBER: Your ENTIRE response should be just the bare answer with NOTHING else.
""",
        llm=llm,
    )
|
app.py
CHANGED
@@ -28,17 +28,18 @@ OPENAI = {
|
|
28 |
class BasicAgent:
|
29 |
def __init__(
|
30 |
self,
|
31 |
-
|
32 |
-
|
33 |
-
model_provider="openai",
|
34 |
-
model_name="o4-mini",
|
35 |
api_key=None,
|
36 |
use_separate_writer_model=True,
|
37 |
writer_model_provider="openai",
|
38 |
-
writer_model_name="gpt-4o-mini"
|
|
|
|
|
|
|
39 |
):
|
40 |
"""
|
41 |
-
Initialize the BasicAgent with a
|
42 |
|
43 |
Args:
|
44 |
model_provider: LLM provider for main agent
|
@@ -47,6 +48,9 @@ class BasicAgent:
|
|
47 |
use_separate_writer_model: Whether to use a different model for the writer agent
|
48 |
writer_model_provider: LLM provider for writer agent (if separate)
|
49 |
writer_model_name: Model name for writer agent (if separate)
|
|
|
|
|
|
|
50 |
"""
|
51 |
# Configure the main reasoning agent
|
52 |
main_model_config = {
|
@@ -64,22 +68,37 @@ class BasicAgent:
|
|
64 |
}
|
65 |
else:
|
66 |
writer_model_config = main_model_config
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
|
68 |
-
# Create the
|
69 |
self.main_agent = GaiaAgent(**main_model_config)
|
70 |
-
|
71 |
-
# Create the writer agent
|
72 |
self.writer_agent = create_writer_agent(writer_model_config)
|
|
|
|
|
|
|
|
|
73 |
|
74 |
# Set up the agent workflow with shared context
|
75 |
self.agent_workflow = AgentWorkflow(
|
76 |
-
agents=[self.main_agent, self.writer_agent],
|
77 |
root_agent=self.main_agent.name,
|
78 |
initial_state={
|
79 |
"original_question": "",
|
|
|
|
|
80 |
"analysis_notes": "",
|
81 |
"format_requirements": "",
|
82 |
"next_agent": "",
|
|
|
83 |
"final_answer": ""
|
84 |
}
|
85 |
)
|
@@ -89,7 +108,11 @@ class BasicAgent:
|
|
89 |
print(f"Writer agent using: {writer_model_provider} {writer_model_name}")
|
90 |
else:
|
91 |
print(f"Writer agent using same model as main agent")
|
92 |
-
|
|
|
|
|
|
|
|
|
93 |
def __call__(self, question_data: dict) -> str:
|
94 |
"""Process a GAIA benchmark question and return the formatted answer."""
|
95 |
# Extract question text and task_id
|
@@ -141,7 +164,7 @@ class BasicAgent:
|
|
141 |
|
142 |
# Extract the final answer from the writer agent's response
|
143 |
final_answer = response.response.blocks[-1].text
|
144 |
-
print(f"Agent returning answer: {final_answer}")
|
145 |
return final_answer
|
146 |
|
147 |
def download_task_file(self, question_data: dict) -> str:
|
|
|
28 |
class BasicAgent:
|
29 |
def __init__(
|
30 |
self,
|
31 |
+
model_provider="anthropic",
|
32 |
+
model_name="claude-3-7-sonnet-latest",
|
|
|
|
|
33 |
api_key=None,
|
34 |
use_separate_writer_model=True,
|
35 |
writer_model_provider="openai",
|
36 |
+
writer_model_name="gpt-4o-mini",
|
37 |
+
use_separate_review_model=True,
|
38 |
+
review_model_provider="openai",
|
39 |
+
review_model_name="gpt-4o-mini"
|
40 |
):
|
41 |
"""
|
42 |
+
Initialize the BasicAgent with a three-agent workflow.
|
43 |
|
44 |
Args:
|
45 |
model_provider: LLM provider for main agent
|
|
|
48 |
use_separate_writer_model: Whether to use a different model for the writer agent
|
49 |
writer_model_provider: LLM provider for writer agent (if separate)
|
50 |
writer_model_name: Model name for writer agent (if separate)
|
51 |
+
use_separate_review_model: Whether to use a different model for the review agent
|
52 |
+
review_model_provider: LLM provider for review agent (if separate)
|
53 |
+
review_model_name: Model name for review agent (if separate)
|
54 |
"""
|
55 |
# Configure the main reasoning agent
|
56 |
main_model_config = {
|
|
|
68 |
}
|
69 |
else:
|
70 |
writer_model_config = main_model_config
|
71 |
+
|
72 |
+
# Configure the review agent (either same as main or different)
|
73 |
+
if use_separate_review_model:
|
74 |
+
review_model_config = {
|
75 |
+
"model_provider": review_model_provider,
|
76 |
+
"model_name": review_model_name,
|
77 |
+
"api_key": api_key # Use same API key for simplicity
|
78 |
+
}
|
79 |
+
else:
|
80 |
+
review_model_config = main_model_config
|
81 |
|
82 |
+
# Create the agents
|
83 |
self.main_agent = GaiaAgent(**main_model_config)
|
|
|
|
|
84 |
self.writer_agent = create_writer_agent(writer_model_config)
|
85 |
+
self.review_agent = create_review_agent(review_model_config)
|
86 |
+
|
87 |
+
# Update the GaiaAgent's can_handoff_to to include review_agent
|
88 |
+
self.main_agent.can_handoff_to = ["writer_agent", "review_agent"]
|
89 |
|
90 |
# Set up the agent workflow with shared context
|
91 |
self.agent_workflow = AgentWorkflow(
|
92 |
+
agents=[self.main_agent, self.writer_agent, self.review_agent],
|
93 |
root_agent=self.main_agent.name,
|
94 |
initial_state={
|
95 |
"original_question": "",
|
96 |
+
"task_id": "",
|
97 |
+
"audio_file_path": "",
|
98 |
"analysis_notes": "",
|
99 |
"format_requirements": "",
|
100 |
"next_agent": "",
|
101 |
+
"formatted_answer": "",
|
102 |
"final_answer": ""
|
103 |
}
|
104 |
)
|
|
|
108 |
print(f"Writer agent using: {writer_model_provider} {writer_model_name}")
|
109 |
else:
|
110 |
print(f"Writer agent using same model as main agent")
|
111 |
+
if use_separate_review_model:
|
112 |
+
print(f"Review agent using: {review_model_provider} {review_model_name}")
|
113 |
+
else:
|
114 |
+
print(f"Review agent using same model as main agent")
|
115 |
+
|
116 |
def __call__(self, question_data: dict) -> str:
|
117 |
"""Process a GAIA benchmark question and return the formatted answer."""
|
118 |
# Extract question text and task_id
|
|
|
164 |
|
165 |
# Extract the final answer from the writer agent's response
|
166 |
final_answer = response.response.blocks[-1].text
|
167 |
+
print(f"Agent returning final answer: {final_answer}")
|
168 |
return final_answer
|
169 |
|
170 |
def download_task_file(self, question_data: dict) -> str:
|
tools/multimedia_tools.py
CHANGED
@@ -195,7 +195,7 @@ class VisionAnalyzerAgent:
|
|
195 |
try:
|
196 |
response = self.client.chat.completions.create(
|
197 |
model=self.model_name,
|
198 |
-
max_tokens=1024,
|
199 |
messages=[
|
200 |
{
|
201 |
"role": "user",
|
|
|
195 |
try:
|
196 |
response = self.client.chat.completions.create(
|
197 |
model=self.model_name,
|
198 |
+
max_tokens=1024*20,
|
199 |
messages=[
|
200 |
{
|
201 |
"role": "user",
|