hgmiya committed on
Commit
f6a50f3
·
1 Parent(s): b1628e5

Implement GAIA Solver: Add agent tools for code execution, YouTube analysis, image understanding, audio transcription, and Excel conversion. Initialize agents and set up asynchronous processing for question handling.

Browse files
.gitattributes CHANGED
@@ -33,5 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- GAIA_resource/1f975693-876d-457b-a649-393859e79bf3.mp3 filter=lfs diff=lfs merge=lfs -text
37
- GAIA_resource/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3 filter=lfs diff=lfs merge=lfs -text
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+
 
GAIA_resource/1f975693-876d-457b-a649-393859e79bf3.mp3 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:200f767e732b49efef5c05d128903ee4d2c34e66fdce7f5593ac123b2e637673
3
- size 280868
 
 
 
 
GAIA_resource/7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx DELETED
Binary file (5.29 kB)
 
GAIA_resource/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b218c951c1f888f0bbe6f46c080f57afc7c9348fffc7ba4da35749ff1e2ac40f
3
- size 179304
 
 
 
 
GAIA_resource/cca530fc-4052-43b2-b130-b30968d8aa44.png DELETED
Binary file (63.1 kB)
 
GAIA_resource/f918266a-b3e0-4914-865d-4faa564f1aef.py DELETED
@@ -1,35 +0,0 @@
1
- from random import randint
2
- import time
3
-
4
- class UhOh(Exception):
5
- pass
6
-
7
- class Hmm:
8
- def __init__(self):
9
- self.value = randint(-100, 100)
10
-
11
- def Yeah(self):
12
- if self.value == 0:
13
- return True
14
- else:
15
- raise UhOh()
16
-
17
- def Okay():
18
- while True:
19
- yield Hmm()
20
-
21
- def keep_trying(go, first_try=True):
22
- maybe = next(go)
23
- try:
24
- if maybe.Yeah():
25
- return maybe.value
26
- except UhOh:
27
- if first_try:
28
- print("Working...")
29
- print("Please wait patiently...")
30
- time.sleep(0.1)
31
- return keep_trying(go, first_try=False)
32
-
33
- if __name__ == "__main__":
34
- go = Okay()
35
- print(f"{keep_trying(go)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
__init__.py CHANGED
@@ -1 +1 @@
1
- from . import agent
 
1
+ from .adk_web import agent
agent.py → agent_dev.py RENAMED
@@ -16,6 +16,7 @@ from google.adk.agents import Agent
16
  from google.adk.tools import google_search, built_in_code_execution
17
  from google.adk.agents import LlmAgent
18
 
 
19
  from openpyxl import load_workbook
20
 
21
  import warnings
@@ -102,7 +103,9 @@ def understand_youtube_video(video_url: str, question: str) -> str:
102
  )
103
 
104
  print("--- Gemini Response Received ---")
 
105
  if hasattr(response, 'text'):
 
106
  return response.text
107
  elif response.parts:
108
  return "".join(part.text for part in response.parts if hasattr(part, 'text'))
@@ -122,41 +125,39 @@ def understand_image(image_file_name: str) -> str:
122
  Given an image file , this will analyze the image in detail and describe its contents in as much detail as possible.
123
 
124
  Args:
125
- image_file_name (str): The file name of the image to analyze. Which given as "file_name" parameter in the question.
126
 
127
  Returns:
128
  str: The response text generated by the Gemini model.
129
  """
130
- image_url = os.path.join("./GAIA_resource/" , image_file_name)
131
  print("--- Analyzing Image ---")
132
- print(f"Image URL/Path: {image_url}")
133
 
134
  prompt = """
135
  Analyze the image in detail and describe its contents in as much detail as possible.
136
  For example, give someone a chess board and describe where each piece is.
137
 
138
- The description should include the following information:
139
- - General overview of the image
140
- - Details of important elements and features (e.g., location relationships, attributes, etc.)
141
- - Identification of specific objects or characters (e.g., game piece names, positions, people, etc.)
142
 
143
- # Steps
144
- 1. Examine the image as a whole and identify the main elements.
145
- 2. Examine each element in detail and identify what it is.
146
- 3. Develop a description of each element based on its characteristic relationships and positions.
147
- 4. Finally, summarize the overall scene or situation.
148
-
149
- # Output Format
150
- Provide detailed descriptions in paragraphs of text, using bullet points where necessary.
151
 
 
 
152
  """
153
 
154
  try:
155
  # Fetch the image data
156
- if image_url.startswith("http"):
157
- image_bytes = requests.get(image_url).content
158
  else:
159
- with open(image_url, "rb") as f:
160
  image_bytes = f.read()
161
 
162
  # Create image part
@@ -177,8 +178,10 @@ Provide detailed descriptions in paragraphs of text, using bullet points where n
177
  )
178
 
179
  print("--- Gemini Response Received ---")
 
180
  # Extract text from the response
181
  if hasattr(response, 'text'):
 
182
  return response.text
183
  elif getattr(response, 'parts', None):
184
  return "".join(part.text for part in response.parts if hasattr(part, 'text'))
@@ -189,7 +192,7 @@ Provide detailed descriptions in paragraphs of text, using bullet points where n
189
  return f"Model did not return text content.{block_reason}"
190
 
191
  except Exception as e:
192
- print(f"Error processing image '{image_url}' with Gemini: {e}")
193
  return f"Sorry, an error occurred while analyzing the image. Please check the image URL or path. Error details: {str(e)}"
194
 
195
  # Audio Tool
@@ -205,7 +208,6 @@ def transcribe_audio(audio_path: str) -> str:
205
  """
206
  print("--- Transcribing Audio ---")
207
  print(f"Audio Path: {audio_path}")
208
- audio_path = os.path.join("./GAIA_resource/", audio_path)
209
 
210
  try:
211
  # Initialize Gemini client
@@ -229,6 +231,8 @@ def transcribe_audio(audio_path: str) -> str:
229
  else:
230
  transcript = "Model did not return text content."
231
 
 
 
232
  # Format as Markdown
233
  markdown_transcript = (
234
  "## Audio Transcription Result\n"
@@ -258,14 +262,7 @@ def excel_to_csv(excel_path: str) -> str:
258
  excel_path = os.path.join("./GAIA_resource/", excel_path)
259
 
260
  try:
261
- # Load workbook from URL or local file
262
- if excel_path.startswith("http"):
263
- response = requests.get(excel_path)
264
- response.raise_for_status()
265
- data_stream = BytesIO(response.content)
266
- wb = load_workbook(filename=data_stream, data_only=True)
267
- else:
268
- wb = load_workbook(filename=excel_path, data_only=True)
269
 
270
  # Select worksheet
271
  ws = wb.active
@@ -286,70 +283,56 @@ def excel_to_csv(excel_path: str) -> str:
286
  except Exception as e:
287
  return f"Error converting Excel to CSV: {e}"
288
 
289
- data_analyzer_agent = LlmAgent(
290
- model="gemini-2.5-flash-preview-04-17",
291
- name="data_analyzer_agent",
292
- description="When data is provided, analyze it and derive an appropriate answer.",
293
- instruction="""
294
- # Steps
295
- 1. **Data Review**: Understand the data provided and understand what it shows.
296
- 2. **Prepare for Analysis**: If necessary, clean the data and prepare it for analysis.
297
- 3. **Data Analysis**: Analyze the data using appropriate methods to find meaningful information and trends.
298
- 4. **Interpretation**: Interpret the analysis results to answer questions and doubts.
299
- 5. **Present Conclusions**: Present your conclusions and insights in a logical summary.
300
-
301
- # Output Format
302
- - State your conclusions in a short sentence, but make sure they are clear and specific.
303
- - If necessary, use tables and graphs to provide additional information.
304
-
305
- # Examples
306
- - **Input Data**:
307
- - Survey data on age, gender, occupation, and annual income
308
- - **Analysis Results**:
309
- - The older the person, the higher the annual income tends to be.
310
- - **Statement of conclusion**:
311
- - "The survey data shows that the older you are, the higher your average annual income is."
312
-
313
- # Notes
314
- - If your data set is very large, consider using sample data or segmenting your data for analysis.
315
- - Distinguish between qualitative and quantitative data and choose the appropriate analysis method for each.
316
- """,
317
- tools=[excel_to_csv] # Provide the function directly
318
- )
319
-
320
-
321
- # Read file ascii
322
- def read_file_ascii(file_path: str) -> str:
323
  """
324
- Given a file URL or local file path, reads the file content and returns it as an ASCII string.
325
 
326
  Args:
327
- file_path (str): The URL or local file path of the file to read.
328
 
329
  Returns:
330
- str: The ASCII-decoded content of the file, or an error message on failure.
331
  """
 
332
  print("File Path : ", file_path)
333
- file_path = os.path.join("./GAIA_resource/", file_path)
334
 
335
  try:
336
- # Load data from URL or local file
337
- if file_path.startswith("http"):
338
- response = requests.get(file_path)
339
- response.raise_for_status()
340
- data_bytes = response.content
341
- else:
342
- with open(file_path, "rb") as f:
343
- data_bytes = f.read()
344
-
345
  # Decode bytes to ASCII string, replacing errors
346
- ascii_str = data_bytes.decode("ascii", errors="replace")
347
- return ascii_str
348
 
349
  except Exception as e:
350
- return f"Error reading file as ASCII: {e}"
351
 
352
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
  # Call Agent Async
354
  async def call_agent_async(query: str, runner, user_id, session_id):
355
  """Sends a query to the agent and prints the final response."""
@@ -405,8 +388,8 @@ Thinking Process:
405
  1. **Analyze Question & Identify Files:** Carefully read the question. Determine the core task and the **exact final answer format**. Check if the question explicitly mentions an attached file (image, Excel, audio, code).
406
  2. **Identify Filename:** If a file is mentioned, identify its filename from the text (e.g., "Homework.mp3", "image.png"). If no specific filename is given for a required file type, state that you need the filename. **Do not guess filenames.**
407
  3. **Plan:** Create a step-by-step plan using tools. If a file is needed, include the correct tool call with the identified filename.
408
- 4. **Execute & Refine:** Execute the plan. Pass correct arguments (especially filenames). Evaluate tool outputs. If errors occur (e.g., file not found, API errors) or info is insufficient, revise the plan (e.g., use `web_search`, different tool prompts).
409
- 5. **Synthesize Answer:** Combine information. Use `execute_python_code` for final formatting/calculations.
410
  6. **Final Output:** Generate **only the final answer** in the requested format. No extra text. If the answer cannot be found or a required filename was missing/invalid, output: "I could not find the answer."
411
 
412
  Constraints:
@@ -435,8 +418,9 @@ async def main():
435
  understand_youtube_video,
436
  understand_image,
437
  transcribe_audio,
438
- agent_tool.AgentTool(agent=data_analyzer_agent),
439
- read_file_ascii,
 
440
  ]
441
  )
442
  except Exception as e:
@@ -469,17 +453,14 @@ async def main():
469
  results_log = []
470
  answers_payload = []
471
  print(f"Running agent on {len(questions_data)} questions...")
472
- i = 0
473
  for item in questions_data:
474
- i += 1
475
- if i < 12:
476
- continue
477
- elif i > 12:
478
- break
479
  task_id = item.get("task_id")
480
  question_text = item.get("question")
481
- question_file_name = item.get("file_name")
482
- question_all = question_text + " file_name = " + question_file_name
 
 
 
483
  if not task_id or question_text is None:
484
  print(f"Skipping item with missing task_id or question: {item}")
485
  continue
@@ -500,16 +481,19 @@ async def main():
500
  app_name=APP_NAME, # Associates runs with our app
501
  session_service=session_service # Uses our session manager
502
  )
503
- submitted_answer = await call_agent_async(question_all,
504
  runner=runner,
505
  user_id=USER_ID,
506
- session_id=SESSION_ID)
 
507
 
508
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
509
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
510
  except Exception as e:
511
  print(f"Error running agent on task {task_id}: {e}")
512
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
 
 
513
 
514
  if not answers_payload:
515
  print("Agent did not produce any answers to submit.")
 
16
  from google.adk.tools import google_search, built_in_code_execution
17
  from google.adk.agents import LlmAgent
18
 
19
+ from huggingface_hub import snapshot_download
20
  from openpyxl import load_workbook
21
 
22
  import warnings
 
103
  )
104
 
105
  print("--- Gemini Response Received ---")
106
+
107
  if hasattr(response, 'text'):
108
+ print("Video Description : ", response.text)
109
  return response.text
110
  elif response.parts:
111
  return "".join(part.text for part in response.parts if hasattr(part, 'text'))
 
125
  Given an image file , this will analyze the image in detail and describe its contents in as much detail as possible.
126
 
127
  Args:
128
+ image_file_name (str): The file name of the image to analyze.
129
 
130
  Returns:
131
  str: The response text generated by the Gemini model.
132
  """
 
133
  print("--- Analyzing Image ---")
134
+ print(f"Image URL/Path: {image_file_name}")
135
 
136
  prompt = """
137
  Analyze the image in detail and describe its contents in as much detail as possible.
138
  For example, give someone a chess board and describe where each piece is.
139
 
140
+ The description should include the following information:
141
+ - General overview of the image
142
+ - Details of important elements and features (e.g., location relationships, attributes, etc.)
143
+ - Identification of specific objects or characters (e.g., game piece names, positions, people, etc.)
144
 
145
+ # Steps
146
+ 1. Examine the image as a whole and identify the main elements.
147
+ 2. Examine each element in detail and identify what it is.
148
+ 3. Develop a description of each element based on its characteristic relationships and positions.
149
+ 4. Finally, summarize the overall scene or situation.
 
 
 
150
 
151
+ # Output Format
152
+ Provide detailed descriptions in paragraphs of text, using bullet points where necessary.
153
  """
154
 
155
  try:
156
  # Fetch the image data
157
+ if image_file_name.startswith("http"):
158
+ image_bytes = requests.get(image_file_name).content
159
  else:
160
+ with open(image_file_name, "rb") as f:
161
  image_bytes = f.read()
162
 
163
  # Create image part
 
178
  )
179
 
180
  print("--- Gemini Response Received ---")
181
+
182
  # Extract text from the response
183
  if hasattr(response, 'text'):
184
+ print("Image Description : ", response.text)
185
  return response.text
186
  elif getattr(response, 'parts', None):
187
  return "".join(part.text for part in response.parts if hasattr(part, 'text'))
 
192
  return f"Model did not return text content.{block_reason}"
193
 
194
  except Exception as e:
195
+ print(f"Error processing image '{image_file_name}' with Gemini: {e}")
196
  return f"Sorry, an error occurred while analyzing the image. Please check the image URL or path. Error details: {str(e)}"
197
 
198
  # Audio Tool
 
208
  """
209
  print("--- Transcribing Audio ---")
210
  print(f"Audio Path: {audio_path}")
 
211
 
212
  try:
213
  # Initialize Gemini client
 
231
  else:
232
  transcript = "Model did not return text content."
233
 
234
+ print("Transcript : ", transcript)
235
+
236
  # Format as Markdown
237
  markdown_transcript = (
238
  "## Audio Transcription Result\n"
 
262
  excel_path = os.path.join("./GAIA_resource/", excel_path)
263
 
264
  try:
265
+ wb = load_workbook(filename=excel_path, data_only=True)
 
 
 
 
 
 
 
266
 
267
  # Select worksheet
268
  ws = wb.active
 
283
  except Exception as e:
284
  return f"Error converting Excel to CSV: {e}"
285
 
286
+ # Read text file
287
def LoadTextFileTool(file_path: str) -> str:
    """
    Load a text file and return its contents as a string.

    The file is decoded as UTF-8. Bytes that are not valid UTF-8 are replaced
    with U+FFFD instead of aborting the read, so a file in another encoding
    still yields its readable portion rather than only an error message.

    Args:
        file_path (str): Path of the file to read.

    Returns:
        str: The file contents, or a message starting with
            "Error reading text file:" if the file could not be read.
    """
    print("---Load Text File Tool---")
    print("File Path : ", file_path)

    try:
        # Decode as UTF-8, replacing undecodable bytes so that one stray byte
        # does not turn the whole file into an error string.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            return file.read()

    except Exception as e:
        # Tool boundary: report the failure as text instead of raising, so the
        # caller always receives a string.
        return f"Error reading text file: {e}"
307
 
308
 
309
+ # Get task file
310
def GetTaskFileTool(file_name: str, task_id: str) -> str:
    """
    Download the file attached to a task and save it in the working directory.

    Requests ``{DEFAULT_API_URL}/files/{task_id}`` and writes the response
    body to a local file named after the final path component of
    ``file_name``.

    Args:
        file_name (str): Name to save the downloaded file under. Any directory
            components are discarded.
        task_id (str): Id of the task whose file should be downloaded.

    Returns:
        str: Absolute path of the saved file, or an error message string if
            an argument is missing or the download/write fails.
    """
    print("---Get Task File Tool---")
    print("File Name : ", file_name)

    try:
        # Keep only the last path component so a name such as "../x" or an
        # absolute path cannot write outside the working directory.
        local_name = os.path.basename(file_name)
        if not local_name or not task_id:
            # Fail before touching the network when either argument is empty.
            return f"Error GetTaskFileTool '{file_name}' : file_name and task_id are required"

        response = requests.get(f"{DEFAULT_API_URL}/files/{task_id}", timeout=15)
        response.raise_for_status()
        with open(local_name, 'wb') as file:
            file.write(response.content)
        return os.path.abspath(local_name)
    except TypeError as e:
        # Raised for non-string arguments (e.g. file_name is None).
        return f"Error GetTaskFileTool '{file_name}' : {str(e)}"
    except Exception as e:
        # Covers HTTP errors, timeouts and file-system errors alike.
        return f"Error reading file: {e}"
334
+
335
+
336
  # Call Agent Async
337
  async def call_agent_async(query: str, runner, user_id, session_id):
338
  """Sends a query to the agent and prints the final response."""
 
388
  1. **Analyze Question & Identify Files:** Carefully read the question. Determine the core task and the **exact final answer format**. Check if the question explicitly mentions an attached file (image, Excel, audio, code).
389
  2. **Identify Filename:** If a file is mentioned, identify its filename from the text (e.g., "Homework.mp3", "image.png"). If no specific filename is given for a required file type, state that you need the filename. **Do not guess filenames.**
390
  3. **Plan:** Create a step-by-step plan using tools. If a file is needed, include the correct tool call with the identified filename.
391
+ 4. **Execute & Refine:** Execute the plan. Pass correct arguments (especially filenames). Evaluate tool outputs. If errors occur (e.g., file not found, API errors) or info is insufficient, revise the plan (e.g., use different tool prompts).
392
+ 5. **Synthesize Answer:** Combine information. Use `coding_agent` for final formatting/calculations.
393
  6. **Final Output:** Generate **only the final answer** in the requested format. No extra text. If the answer cannot be found or a required filename was missing/invalid, output: "I could not find the answer."
394
 
395
  Constraints:
 
418
  understand_youtube_video,
419
  understand_image,
420
  transcribe_audio,
421
+ excel_to_csv,
422
+ GetTaskFileTool,
423
+ LoadTextFileTool,
424
  ]
425
  )
426
  except Exception as e:
 
453
  results_log = []
454
  answers_payload = []
455
  print(f"Running agent on {len(questions_data)} questions...")
 
456
  for item in questions_data:
 
 
 
 
 
457
  task_id = item.get("task_id")
458
  question_text = item.get("question")
459
+ file_name = item.get("file_name")
460
+ if task_id:
461
+ question_text += " task_id = " + task_id
462
+ if file_name:
463
+ question_text += " file_name = " + file_name
464
  if not task_id or question_text is None:
465
  print(f"Skipping item with missing task_id or question: {item}")
466
  continue
 
481
  app_name=APP_NAME, # Associates runs with our app
482
  session_service=session_service # Uses our session manager
483
  )
484
+ submitted_answer = await call_agent_async(question_text,
485
  runner=runner,
486
  user_id=USER_ID,
487
+ session_id=SESSION_ID
488
+ )
489
 
490
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
491
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
492
  except Exception as e:
493
  print(f"Error running agent on task {task_id}: {e}")
494
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
495
+ if os.path.exists(file_name):
496
+ os.remove(file_name)
497
 
498
  if not answers_payload:
499
  print("Agent did not produce any answers to submit.")
app.py CHANGED
@@ -104,7 +104,9 @@ def understand_youtube_video(video_url: str, question: str) -> str:
104
  )
105
 
106
  print("--- Gemini Response Received ---")
 
107
  if hasattr(response, 'text'):
 
108
  return response.text
109
  elif response.parts:
110
  return "".join(part.text for part in response.parts if hasattr(part, 'text'))
@@ -124,41 +126,39 @@ def understand_image(image_file_name: str) -> str:
124
  Given an image file , this will analyze the image in detail and describe its contents in as much detail as possible.
125
 
126
  Args:
127
- image_file_name (str): The file name of the image to analyze. Which given as "file_name" parameter in the question.
128
 
129
  Returns:
130
  str: The response text generated by the Gemini model.
131
  """
132
- image_url = os.path.join("./GAIA_resource/" , image_file_name)
133
  print("--- Analyzing Image ---")
134
- print(f"Image URL/Path: {image_url}")
135
 
136
  prompt = """
137
  Analyze the image in detail and describe its contents in as much detail as possible.
138
  For example, give someone a chess board and describe where each piece is.
139
 
140
- The description should include the following information:
141
- - General overview of the image
142
- - Details of important elements and features (e.g., location relationships, attributes, etc.)
143
- - Identification of specific objects or characters (e.g., game piece names, positions, people, etc.)
144
 
145
- # Steps
146
- 1. Examine the image as a whole and identify the main elements.
147
- 2. Examine each element in detail and identify what it is.
148
- 3. Develop a description of each element based on its characteristic relationships and positions.
149
- 4. Finally, summarize the overall scene or situation.
150
-
151
- # Output Format
152
- Provide detailed descriptions in paragraphs of text, using bullet points where necessary.
153
 
 
 
154
  """
155
 
156
  try:
157
  # Fetch the image data
158
- if image_url.startswith("http"):
159
- image_bytes = requests.get(image_url).content
160
  else:
161
- with open(image_url, "rb") as f:
162
  image_bytes = f.read()
163
 
164
  # Create image part
@@ -179,8 +179,10 @@ Provide detailed descriptions in paragraphs of text, using bullet points where n
179
  )
180
 
181
  print("--- Gemini Response Received ---")
 
182
  # Extract text from the response
183
  if hasattr(response, 'text'):
 
184
  return response.text
185
  elif getattr(response, 'parts', None):
186
  return "".join(part.text for part in response.parts if hasattr(part, 'text'))
@@ -191,7 +193,7 @@ Provide detailed descriptions in paragraphs of text, using bullet points where n
191
  return f"Model did not return text content.{block_reason}"
192
 
193
  except Exception as e:
194
- print(f"Error processing image '{image_url}' with Gemini: {e}")
195
  return f"Sorry, an error occurred while analyzing the image. Please check the image URL or path. Error details: {str(e)}"
196
 
197
  # Audio Tool
@@ -207,7 +209,6 @@ def transcribe_audio(audio_path: str) -> str:
207
  """
208
  print("--- Transcribing Audio ---")
209
  print(f"Audio Path: {audio_path}")
210
- audio_path = os.path.join("./GAIA_resource/", audio_path)
211
 
212
  try:
213
  # Initialize Gemini client
@@ -231,6 +232,8 @@ def transcribe_audio(audio_path: str) -> str:
231
  else:
232
  transcript = "Model did not return text content."
233
 
 
 
234
  # Format as Markdown
235
  markdown_transcript = (
236
  "## Audio Transcription Result\n"
@@ -260,14 +263,7 @@ def excel_to_csv(excel_path: str) -> str:
260
  excel_path = os.path.join("./GAIA_resource/", excel_path)
261
 
262
  try:
263
- # Load workbook from URL or local file
264
- if excel_path.startswith("http"):
265
- response = requests.get(excel_path)
266
- response.raise_for_status()
267
- data_stream = BytesIO(response.content)
268
- wb = load_workbook(filename=data_stream, data_only=True)
269
- else:
270
- wb = load_workbook(filename=excel_path, data_only=True)
271
 
272
  # Select worksheet
273
  ws = wb.active
@@ -320,37 +316,54 @@ data_analyzer_agent = LlmAgent(
320
  )
321
 
322
 
323
- # Read file ascii
324
- def read_file_ascii(file_path: str) -> str:
325
  """
326
- Given a file URL or local file path, reads the file content and returns it as an ASCII string.
327
 
328
  Args:
329
- file_path (str): The URL or local file path of the file to read.
330
 
331
  Returns:
332
- str: The ASCII-decoded content of the file, or an error message on failure.
333
  """
 
334
  print("File Path : ", file_path)
335
- file_path = os.path.join("./GAIA_resource/", file_path)
336
 
337
  try:
338
- # Load data from URL or local file
339
- if file_path.startswith("http"):
340
- response = requests.get(file_path)
341
- response.raise_for_status()
342
- data_bytes = response.content
343
- else:
344
- with open(file_path, "rb") as f:
345
- data_bytes = f.read()
346
-
347
  # Decode bytes to ASCII string, replacing errors
348
- ascii_str = data_bytes.decode("ascii", errors="replace")
349
- return ascii_str
350
 
351
  except Exception as e:
352
- return f"Error reading file as ASCII: {e}"
353
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
 
355
  # Call Agent Async
356
  async def call_agent_async(query: str, runner, user_id, session_id):
@@ -382,36 +395,8 @@ async def call_agent_async(query: str, runner, user_id, session_id):
382
  # (Keep Constants as is)
383
  # --- Constants ---
384
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
385
- # for GAIA Repo
386
- GAIA_REPO_ID = "gaia-benchmark/GAIA"
387
- GAIA_VALIDATION_DIR = "2023/validation"
388
- LOCAL_GAIA_DIR = "GAIA_resource"
389
 
390
 
391
- # --- GAIA Data Download Utility ---
392
- def download_gaia_validation(local_dir: str = LOCAL_GAIA_DIR):
393
- """
394
- Download only the validation part of the Hugging Face GAIA dataset to
395
- local_dir/2023/validation/.
396
- If it has already been downloaded, it will not be downloaded again.
397
- """
398
- target_path = os.path.join(local_dir, GAIA_VALIDATION_DIR)
399
- if os.path.isdir(target_path) and os.listdir(target_path):
400
- print(f"GAIA validation data already exists at {target_path}")
401
- return
402
-
403
- os.makedirs(local_dir, exist_ok=True)
404
- print(f"Downloading GAIA validation data into {local_dir} ...")
405
- snapshot_download(
406
- repo_id=GAIA_REPO_ID,
407
- repo_type="dataset",
408
- allow_patterns=[f"{GAIA_VALIDATION_DIR}/*"],
409
- local_dir=local_dir,
410
- local_dir_use_symlinks=False,
411
- use_auth_token=True
412
- )
413
- print(f"Downloaded GAIA validation data to {target_path}")
414
-
415
  # --- Basic Agent Definition ---
416
  # ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
417
  #class BasicAgent:
@@ -435,8 +420,8 @@ Thinking Process:
435
  1. **Analyze Question & Identify Files:** Carefully read the question. Determine the core task and the **exact final answer format**. Check if the question explicitly mentions an attached file (image, Excel, audio, code).
436
  2. **Identify Filename:** If a file is mentioned, identify its filename from the text (e.g., "Homework.mp3", "image.png"). If no specific filename is given for a required file type, state that you need the filename. **Do not guess filenames.**
437
  3. **Plan:** Create a step-by-step plan using tools. If a file is needed, include the correct tool call with the identified filename.
438
- 4. **Execute & Refine:** Execute the plan. Pass correct arguments (especially filenames). Evaluate tool outputs. If errors occur (e.g., file not found, API errors) or info is insufficient, revise the plan (e.g., use `web_search`, different tool prompts).
439
- 5. **Synthesize Answer:** Combine information. Use `execute_python_code` for final formatting/calculations.
440
  6. **Final Output:** Generate **only the final answer** in the requested format. No extra text. If the answer cannot be found or a required filename was missing/invalid, output: "I could not find the answer."
441
 
442
  Constraints:
@@ -444,7 +429,6 @@ Constraints:
444
  - Adhere strictly to the requested output format.
445
  """
446
 
447
-
448
  async def run_and_submit_all( profile: gr.OAuthProfile | None):
449
  """
450
  Fetches all questions, runs the BasicAgent on them, submits all answers,
@@ -486,8 +470,9 @@ async def run_and_submit_all( profile: gr.OAuthProfile | None):
486
  understand_youtube_video,
487
  understand_image,
488
  transcribe_audio,
489
- agent_tool.AgentTool(agent=data_analyzer_agent),
490
- read_file_ascii,
 
491
  ]
492
  )
493
  except Exception as e:
@@ -525,8 +510,11 @@ async def run_and_submit_all( profile: gr.OAuthProfile | None):
525
  for item in questions_data:
526
  task_id = item.get("task_id")
527
  question_text = item.get("question")
528
- question_file_name = item.get("file_name")
529
- question_all = question_text + " file_name = " + question_file_name
 
 
 
530
  if not task_id or question_text is None:
531
  print(f"Skipping item with missing task_id or question: {item}")
532
  continue
@@ -547,7 +535,7 @@ async def run_and_submit_all( profile: gr.OAuthProfile | None):
547
  app_name=APP_NAME, # Associates runs with our app
548
  session_service=session_service # Uses our session manager
549
  )
550
- submitted_answer = await call_agent_async(question_all,
551
  runner=runner,
552
  user_id=USER_ID,
553
  session_id=SESSION_ID)
 
104
  )
105
 
106
  print("--- Gemini Response Received ---")
107
+
108
  if hasattr(response, 'text'):
109
+ print("Video Description : ", response.text)
110
  return response.text
111
  elif response.parts:
112
  return "".join(part.text for part in response.parts if hasattr(part, 'text'))
 
126
  Given an image file , this will analyze the image in detail and describe its contents in as much detail as possible.
127
 
128
  Args:
129
+ image_file_name (str): The file name of the image to analyze.
130
 
131
  Returns:
132
  str: The response text generated by the Gemini model.
133
  """
 
134
  print("--- Analyzing Image ---")
135
+ print(f"Image URL/Path: {image_file_name}")
136
 
137
  prompt = """
138
  Analyze the image in detail and describe its contents in as much detail as possible.
139
  For example, give someone a chess board and describe where each piece is.
140
 
141
+ The description should include the following information:
142
+ - General overview of the image
143
+ - Details of important elements and features (e.g., location relationships, attributes, etc.)
144
+ - Identification of specific objects or characters (e.g., game piece names, positions, people, etc.)
145
 
146
+ # Steps
147
+ 1. Examine the image as a whole and identify the main elements.
148
+ 2. Examine each element in detail and identify what it is.
149
+ 3. Develop a description of each element based on its characteristic relationships and positions.
150
+ 4. Finally, summarize the overall scene or situation.
 
 
 
151
 
152
+ # Output Format
153
+ Provide detailed descriptions in paragraphs of text, using bullet points where necessary.
154
  """
155
 
156
  try:
157
  # Fetch the image data
158
+ if image_file_name.startswith("http"):
159
+ image_bytes = requests.get(image_file_name).content
160
  else:
161
+ with open(image_file_name, "rb") as f:
162
  image_bytes = f.read()
163
 
164
  # Create image part
 
179
  )
180
 
181
  print("--- Gemini Response Received ---")
182
+
183
  # Extract text from the response
184
  if hasattr(response, 'text'):
185
+ print("Image Description : ", response.text)
186
  return response.text
187
  elif getattr(response, 'parts', None):
188
  return "".join(part.text for part in response.parts if hasattr(part, 'text'))
 
193
  return f"Model did not return text content.{block_reason}"
194
 
195
  except Exception as e:
196
+ print(f"Error processing image '{image_file_name}' with Gemini: {e}")
197
  return f"Sorry, an error occurred while analyzing the image. Please check the image URL or path. Error details: {str(e)}"
198
 
199
  # Audio Tool
 
209
  """
210
  print("--- Transcribing Audio ---")
211
  print(f"Audio Path: {audio_path}")
 
212
 
213
  try:
214
  # Initialize Gemini client
 
232
  else:
233
  transcript = "Model did not return text content."
234
 
235
+ print("Transcript : ", transcript)
236
+
237
  # Format as Markdown
238
  markdown_transcript = (
239
  "## Audio Transcription Result\n"
 
263
  excel_path = os.path.join("./GAIA_resource/", excel_path)
264
 
265
  try:
266
+ wb = load_workbook(filename=excel_path, data_only=True)
 
 
 
 
 
 
 
267
 
268
  # Select worksheet
269
  ws = wb.active
 
316
  )
317
 
318
 
319
+ # Read text file
320
def LoadTextFileTool(file_path: str) -> str:
    """
    Load a text file and return its contents as a string.

    The file is decoded as UTF-8. Bytes that are not valid UTF-8 are replaced
    with U+FFFD instead of aborting the read, so a file in another encoding
    still yields usable text.

    Args:
        file_path (str): Path of the text file to read.

    Returns:
        str: The file contents, or a message starting with
        "Error reading text file:" if the file could not be read.
    """
    print("---Load Text File Tool---")
    print("File Path : ", file_path)

    try:
        # errors='replace' substitutes undecodable bytes rather than raising
        # UnicodeDecodeError and losing the whole file.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            return file.read()

    except Exception as e:
        # Failures are reported as text instead of raised: this function's
        # return value is the only thing the caller gets back.
        return f"Error reading text file: {e}"
340
 
341
+ # Get task file
342
def GetTaskFileTool(file_name: str, task_id: str) -> str:
    """
    Download the file attached to a task and save it in the current working directory.

    The content is fetched from ``{DEFAULT_API_URL}/files/{task_id}`` and
    written under the base name of ``file_name``. Directory components in
    ``file_name`` are dropped so the download cannot be written outside the
    working directory.

    Args:
        file_name (str): File name to save the downloaded content as.
        task_id (str): Task id whose file should be downloaded.

    Returns:
        str: Absolute path of the saved file, or a message starting with
        "Error" if the name is unusable, the download fails, or the file
        cannot be written.
    """
    print("---Get Task File Tool---")
    print("File Name : ", file_name)

    try:
        # file_name is supplied by the caller (it originates from the question
        # payload), so keep only its final path component before writing.
        safe_name = os.path.basename(file_name)
        if safe_name in ("", ".", ".."):
            # Reject before downloading: there is nowhere valid to store it.
            return f"Error GetTaskFileTool '{file_name}' : invalid file name"

        response = requests.get(f"{DEFAULT_API_URL}/files/{task_id}", timeout=15)
        # Turn HTTP 4xx/5xx into an exception so an error page is never saved
        # as if it were the task file.
        response.raise_for_status()
        with open(safe_name, 'wb') as file:
            file.write(response.content)
        return os.path.abspath(safe_name)
    except TypeError as e:
        # Raised when file_name is not a string/path-like value (e.g. None).
        return f"Error GetTaskFileTool '{file_name}' : {str(e)}"
    except Exception as e:
        # Network failures, HTTP errors and write errors all end up here.
        return f"Error reading file: {e}"
366
+
367
 
368
  # Call Agent Async
369
  async def call_agent_async(query: str, runner, user_id, session_id):
 
395
  # (Keep Constants as is)
396
  # --- Constants ---
397
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
 
 
 
398
 
399
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
  # --- Basic Agent Definition ---
401
  # ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
402
  #class BasicAgent:
 
420
  1. **Analyze Question & Identify Files:** Carefully read the question. Determine the core task and the **exact final answer format**. Check if the question explicitly mentions an attached file (image, Excel, audio, code).
421
  2. **Identify Filename:** If a file is mentioned, identify its filename from the text (e.g., "Homework.mp3", "image.png"). If no specific filename is given for a required file type, state that you need the filename. **Do not guess filenames.**
422
  3. **Plan:** Create a step-by-step plan using tools. If a file is needed, include the correct tool call with the identified filename.
423
+ 4. **Execute & Refine:** Execute the plan. Pass correct arguments (especially filenames). Evaluate tool outputs. If errors occur (e.g., file not found, API errors) or info is insufficient, revise the plan (e.g., use different tool prompts).
424
+ 5. **Synthesize Answer:** Combine information. Use `coding_agent` for final formatting/calculations.
425
  6. **Final Output:** Generate **only the final answer** in the requested format. No extra text. If the answer cannot be found or a required filename was missing/invalid, output: "I could not find the answer."
426
 
427
  Constraints:
 
429
  - Adhere strictly to the requested output format.
430
  """
431
 
 
432
  async def run_and_submit_all( profile: gr.OAuthProfile | None):
433
  """
434
  Fetches all questions, runs the BasicAgent on them, submits all answers,
 
470
  understand_youtube_video,
471
  understand_image,
472
  transcribe_audio,
473
+ excel_to_csv,
474
+ GetTaskFileTool,
475
+ LoadTextFileTool,
476
  ]
477
  )
478
  except Exception as e:
 
510
  for item in questions_data:
511
  task_id = item.get("task_id")
512
  question_text = item.get("question")
513
+ file_name = item.get("file_name")
514
+ if task_id:
515
+ question_text += " task_id = " + task_id
516
+ if file_name:
517
+ question_text += " file_name = " + file_name
518
  if not task_id or question_text is None:
519
  print(f"Skipping item with missing task_id or question: {item}")
520
  continue
 
535
  app_name=APP_NAME, # Associates runs with our app
536
  session_service=session_service # Uses our session manager
537
  )
538
+ submitted_answer = await call_agent_async(question_text,
539
  runner=runner,
540
  user_id=USER_ID,
541
  session_id=SESSION_ID)