Spaces:
Running
Running
Implement GAIA Solver: Add agent tools for code execution, YouTube analysis, image understanding, audio transcription, and Excel conversion. Initialize agents and set up asynchronous processing for question handling.
Browse files
- .gitattributes +1 -2
- GAIA_resource/1f975693-876d-457b-a649-393859e79bf3.mp3 +0 -3
- GAIA_resource/7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx +0 -0
- GAIA_resource/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3 +0 -3
- GAIA_resource/cca530fc-4052-43b2-b130-b30968d8aa44.png +0 -0
- GAIA_resource/f918266a-b3e0-4914-865d-4faa564f1aef.py +0 -35
- __init__.py +1 -1
- agent.py → agent_dev.py +76 -92
- app.py +70 -82
.gitattributes
CHANGED
@@ -33,5 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
-
|
37 |
-
GAIA_resource/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3 filter=lfs diff=lfs merge=lfs -text
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
|
|
GAIA_resource/1f975693-876d-457b-a649-393859e79bf3.mp3
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:200f767e732b49efef5c05d128903ee4d2c34e66fdce7f5593ac123b2e637673
|
3 |
-
size 280868
|
|
|
|
|
|
|
|
GAIA_resource/7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx
DELETED
Binary file (5.29 kB)
|
|
GAIA_resource/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:b218c951c1f888f0bbe6f46c080f57afc7c9348fffc7ba4da35749ff1e2ac40f
|
3 |
-
size 179304
|
|
|
|
|
|
|
|
GAIA_resource/cca530fc-4052-43b2-b130-b30968d8aa44.png
DELETED
Binary file (63.1 kB)
|
|
GAIA_resource/f918266a-b3e0-4914-865d-4faa564f1aef.py
DELETED
@@ -1,35 +0,0 @@
|
|
1 |
-
from random import randint
|
2 |
-
import time
|
3 |
-
|
4 |
-
class UhOh(Exception):
|
5 |
-
pass
|
6 |
-
|
7 |
-
class Hmm:
|
8 |
-
def __init__(self):
|
9 |
-
self.value = randint(-100, 100)
|
10 |
-
|
11 |
-
def Yeah(self):
|
12 |
-
if self.value == 0:
|
13 |
-
return True
|
14 |
-
else:
|
15 |
-
raise UhOh()
|
16 |
-
|
17 |
-
def Okay():
|
18 |
-
while True:
|
19 |
-
yield Hmm()
|
20 |
-
|
21 |
-
def keep_trying(go, first_try=True):
|
22 |
-
maybe = next(go)
|
23 |
-
try:
|
24 |
-
if maybe.Yeah():
|
25 |
-
return maybe.value
|
26 |
-
except UhOh:
|
27 |
-
if first_try:
|
28 |
-
print("Working...")
|
29 |
-
print("Please wait patiently...")
|
30 |
-
time.sleep(0.1)
|
31 |
-
return keep_trying(go, first_try=False)
|
32 |
-
|
33 |
-
if __name__ == "__main__":
|
34 |
-
go = Okay()
|
35 |
-
print(f"{keep_trying(go)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__init__.py
CHANGED
@@ -1 +1 @@
|
|
1 |
-
from . import agent
|
|
|
1 |
+
from .adk_web import agent
|
agent.py → agent_dev.py
RENAMED
@@ -16,6 +16,7 @@ from google.adk.agents import Agent
|
|
16 |
from google.adk.tools import google_search, built_in_code_execution
|
17 |
from google.adk.agents import LlmAgent
|
18 |
|
|
|
19 |
from openpyxl import load_workbook
|
20 |
|
21 |
import warnings
|
@@ -102,7 +103,9 @@ def understand_youtube_video(video_url: str, question: str) -> str:
|
|
102 |
)
|
103 |
|
104 |
print("--- Gemini Response Received ---")
|
|
|
105 |
if hasattr(response, 'text'):
|
|
|
106 |
return response.text
|
107 |
elif response.parts:
|
108 |
return "".join(part.text for part in response.parts if hasattr(part, 'text'))
|
@@ -122,41 +125,39 @@ def understand_image(image_file_name: str) -> str:
|
|
122 |
Given an image file , this will analyze the image in detail and describe its contents in as much detail as possible.
|
123 |
|
124 |
Args:
|
125 |
-
image_file_name (str): The file name of the image to analyze.
|
126 |
|
127 |
Returns:
|
128 |
str: The response text generated by the Gemini model.
|
129 |
"""
|
130 |
-
image_url = os.path.join("./GAIA_resource/" , image_file_name)
|
131 |
print("--- Analyzing Image ---")
|
132 |
-
print(f"Image URL/Path: {
|
133 |
|
134 |
prompt = """
|
135 |
Analyze the image in detail and describe its contents in as much detail as possible.
|
136 |
For example, give someone a chess board and describe where each piece is.
|
137 |
|
138 |
-
The description should include the following information:
|
139 |
-
- General overview of the image
|
140 |
-
- Details of important elements and features (e.g., location relationships, attributes, etc.)
|
141 |
-
- Identification of specific objects or characters (e.g., game piece names, positions, people, etc.)
|
142 |
|
143 |
-
# Steps
|
144 |
-
1. Examine the image as a whole and identify the main elements.
|
145 |
-
2. Examine each element in detail and identify what it is.
|
146 |
-
3. Develop a description of each element based on its characteristic relationships and positions.
|
147 |
-
4. Finally, summarize the overall scene or situation.
|
148 |
-
|
149 |
-
# Output Format
|
150 |
-
Provide detailed descriptions in paragraphs of text, using bullet points where necessary.
|
151 |
|
|
|
|
|
152 |
"""
|
153 |
|
154 |
try:
|
155 |
# Fetch the image data
|
156 |
-
if
|
157 |
-
image_bytes = requests.get(
|
158 |
else:
|
159 |
-
with open(
|
160 |
image_bytes = f.read()
|
161 |
|
162 |
# Create image part
|
@@ -177,8 +178,10 @@ Provide detailed descriptions in paragraphs of text, using bullet points where n
|
|
177 |
)
|
178 |
|
179 |
print("--- Gemini Response Received ---")
|
|
|
180 |
# Extract text from the response
|
181 |
if hasattr(response, 'text'):
|
|
|
182 |
return response.text
|
183 |
elif getattr(response, 'parts', None):
|
184 |
return "".join(part.text for part in response.parts if hasattr(part, 'text'))
|
@@ -189,7 +192,7 @@ Provide detailed descriptions in paragraphs of text, using bullet points where n
|
|
189 |
return f"Model did not return text content.{block_reason}"
|
190 |
|
191 |
except Exception as e:
|
192 |
-
print(f"Error processing image '{
|
193 |
return f"Sorry, an error occurred while analyzing the image. Please check the image URL or path. Error details: {str(e)}"
|
194 |
|
195 |
# Audio Tool
|
@@ -205,7 +208,6 @@ def transcribe_audio(audio_path: str) -> str:
|
|
205 |
"""
|
206 |
print("--- Transcribing Audio ---")
|
207 |
print(f"Audio Path: {audio_path}")
|
208 |
-
audio_path = os.path.join("./GAIA_resource/", audio_path)
|
209 |
|
210 |
try:
|
211 |
# Initialize Gemini client
|
@@ -229,6 +231,8 @@ def transcribe_audio(audio_path: str) -> str:
|
|
229 |
else:
|
230 |
transcript = "Model did not return text content."
|
231 |
|
|
|
|
|
232 |
# Format as Markdown
|
233 |
markdown_transcript = (
|
234 |
"## Audio Transcription Result\n"
|
@@ -258,14 +262,7 @@ def excel_to_csv(excel_path: str) -> str:
|
|
258 |
excel_path = os.path.join("./GAIA_resource/", excel_path)
|
259 |
|
260 |
try:
|
261 |
-
|
262 |
-
if excel_path.startswith("http"):
|
263 |
-
response = requests.get(excel_path)
|
264 |
-
response.raise_for_status()
|
265 |
-
data_stream = BytesIO(response.content)
|
266 |
-
wb = load_workbook(filename=data_stream, data_only=True)
|
267 |
-
else:
|
268 |
-
wb = load_workbook(filename=excel_path, data_only=True)
|
269 |
|
270 |
# Select worksheet
|
271 |
ws = wb.active
|
@@ -286,70 +283,56 @@ def excel_to_csv(excel_path: str) -> str:
|
|
286 |
except Exception as e:
|
287 |
return f"Error converting Excel to CSV: {e}"
|
288 |
|
289 |
-
|
290 |
-
|
291 |
-
name="data_analyzer_agent",
|
292 |
-
description="When data is provided, analyze it and derive an appropriate answer.",
|
293 |
-
instruction="""
|
294 |
-
# Steps
|
295 |
-
1. **Data Review**: Understand the data provided and understand what it shows.
|
296 |
-
2. **Prepare for Analysis**: If necessary, clean the data and prepare it for analysis.
|
297 |
-
3. **Data Analysis**: Analyze the data using appropriate methods to find meaningful information and trends.
|
298 |
-
4. **Interpretation**: Interpret the analysis results to answer questions and doubts.
|
299 |
-
5. **Present Conclusions**: Present your conclusions and insights in a logical summary.
|
300 |
-
|
301 |
-
# Output Format
|
302 |
-
- State your conclusions in a short sentence, but make sure they are clear and specific.
|
303 |
-
- If necessary, use tables and graphs to provide additional information.
|
304 |
-
|
305 |
-
# Examples
|
306 |
-
- **Input Data**:
|
307 |
-
- Survey data on age, gender, occupation, and annual income
|
308 |
-
- **Analysis Results**:
|
309 |
-
- The older the person, the higher the annual income tends to be.
|
310 |
-
- **Statement of conclusion**:
|
311 |
-
- "The survey data shows that the older you are, the higher your average annual income is."
|
312 |
-
|
313 |
-
# Notes
|
314 |
-
- If your data set is very large, consider using sample data or segmenting your data for analysis.
|
315 |
-
- Distinguish between qualitative and quantitative data and choose the appropriate analysis method for each.
|
316 |
-
""",
|
317 |
-
tools=[excel_to_csv] # Provide the function directly
|
318 |
-
)
|
319 |
-
|
320 |
-
|
321 |
-
# Read file ascii
|
322 |
-
def read_file_ascii(file_path: str) -> str:
|
323 |
"""
|
324 |
-
|
325 |
|
326 |
Args:
|
327 |
-
file_path (str):
|
328 |
|
329 |
Returns:
|
330 |
-
str:
|
331 |
"""
|
|
|
332 |
print("File Path : ", file_path)
|
333 |
-
file_path = os.path.join("./GAIA_resource/", file_path)
|
334 |
|
335 |
try:
|
336 |
-
# Load data from URL or local file
|
337 |
-
if file_path.startswith("http"):
|
338 |
-
response = requests.get(file_path)
|
339 |
-
response.raise_for_status()
|
340 |
-
data_bytes = response.content
|
341 |
-
else:
|
342 |
-
with open(file_path, "rb") as f:
|
343 |
-
data_bytes = f.read()
|
344 |
-
|
345 |
# Decode bytes to ASCII string, replacing errors
|
346 |
-
|
347 |
-
|
348 |
|
349 |
except Exception as e:
|
350 |
-
return f"Error reading file
|
351 |
|
352 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
353 |
# Call Agent Async
|
354 |
async def call_agent_async(query: str, runner, user_id, session_id):
|
355 |
"""Sends a query to the agent and prints the final response."""
|
@@ -405,8 +388,8 @@ Thinking Process:
|
|
405 |
1. **Analyze Question & Identify Files:** Carefully read the question. Determine the core task and the **exact final answer format**. Check if the question explicitly mentions an attached file (image, Excel, audio, code).
|
406 |
2. **Identify Filename:** If a file is mentioned, identify its filename from the text (e.g., "Homework.mp3", "image.png"). If no specific filename is given for a required file type, state that you need the filename. **Do not guess filenames.**
|
407 |
3. **Plan:** Create a step-by-step plan using tools. If a file is needed, include the correct tool call with the identified filename.
|
408 |
-
4. **Execute & Refine:** Execute the plan. Pass correct arguments (especially filenames). Evaluate tool outputs. If errors occur (e.g., file not found, API errors) or info is insufficient, revise the plan (e.g., use
|
409 |
-
5. **Synthesize Answer:** Combine information. Use `
|
410 |
6. **Final Output:** Generate **only the final answer** in the requested format. No extra text. If the answer cannot be found or a required filename was missing/invalid, output: "I could not find the answer."
|
411 |
|
412 |
Constraints:
|
@@ -435,8 +418,9 @@ async def main():
|
|
435 |
understand_youtube_video,
|
436 |
understand_image,
|
437 |
transcribe_audio,
|
438 |
-
|
439 |
-
|
|
|
440 |
]
|
441 |
)
|
442 |
except Exception as e:
|
@@ -469,17 +453,14 @@ async def main():
|
|
469 |
results_log = []
|
470 |
answers_payload = []
|
471 |
print(f"Running agent on {len(questions_data)} questions...")
|
472 |
-
i = 0
|
473 |
for item in questions_data:
|
474 |
-
i += 1
|
475 |
-
if i < 12:
|
476 |
-
continue
|
477 |
-
elif i > 12:
|
478 |
-
break
|
479 |
task_id = item.get("task_id")
|
480 |
question_text = item.get("question")
|
481 |
-
|
482 |
-
|
|
|
|
|
|
|
483 |
if not task_id or question_text is None:
|
484 |
print(f"Skipping item with missing task_id or question: {item}")
|
485 |
continue
|
@@ -500,16 +481,19 @@ async def main():
|
|
500 |
app_name=APP_NAME, # Associates runs with our app
|
501 |
session_service=session_service # Uses our session manager
|
502 |
)
|
503 |
-
submitted_answer = await call_agent_async(
|
504 |
runner=runner,
|
505 |
user_id=USER_ID,
|
506 |
-
session_id=SESSION_ID
|
|
|
507 |
|
508 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
509 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
510 |
except Exception as e:
|
511 |
print(f"Error running agent on task {task_id}: {e}")
|
512 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
|
|
|
|
|
513 |
|
514 |
if not answers_payload:
|
515 |
print("Agent did not produce any answers to submit.")
|
|
|
16 |
from google.adk.tools import google_search, built_in_code_execution
|
17 |
from google.adk.agents import LlmAgent
|
18 |
|
19 |
+
from huggingface_hub import snapshot_download
|
20 |
from openpyxl import load_workbook
|
21 |
|
22 |
import warnings
|
|
|
103 |
)
|
104 |
|
105 |
print("--- Gemini Response Received ---")
|
106 |
+
|
107 |
if hasattr(response, 'text'):
|
108 |
+
print("Video Description : ", response.text)
|
109 |
return response.text
|
110 |
elif response.parts:
|
111 |
return "".join(part.text for part in response.parts if hasattr(part, 'text'))
|
|
|
125 |
Given an image file , this will analyze the image in detail and describe its contents in as much detail as possible.
|
126 |
|
127 |
Args:
|
128 |
+
image_file_name (str): The file name of the image to analyze.
|
129 |
|
130 |
Returns:
|
131 |
str: The response text generated by the Gemini model.
|
132 |
"""
|
|
|
133 |
print("--- Analyzing Image ---")
|
134 |
+
print(f"Image URL/Path: {image_file_name}")
|
135 |
|
136 |
prompt = """
|
137 |
Analyze the image in detail and describe its contents in as much detail as possible.
|
138 |
For example, give someone a chess board and describe where each piece is.
|
139 |
|
140 |
+
The description should include the following information:
|
141 |
+
- General overview of the image
|
142 |
+
- Details of important elements and features (e.g., location relationships, attributes, etc.)
|
143 |
+
- Identification of specific objects or characters (e.g., game piece names, positions, people, etc.)
|
144 |
|
145 |
+
# Steps
|
146 |
+
1. Examine the image as a whole and identify the main elements.
|
147 |
+
2. Examine each element in detail and identify what it is.
|
148 |
+
3. Develop a description of each element based on its characteristic relationships and positions.
|
149 |
+
4. Finally, summarize the overall scene or situation.
|
|
|
|
|
|
|
150 |
|
151 |
+
# Output Format
|
152 |
+
Provide detailed descriptions in paragraphs of text, using bullet points where necessary.
|
153 |
"""
|
154 |
|
155 |
try:
|
156 |
# Fetch the image data
|
157 |
+
if image_file_name.startswith("http"):
|
158 |
+
image_bytes = requests.get(image_file_name).content
|
159 |
else:
|
160 |
+
with open(image_file_name, "rb") as f:
|
161 |
image_bytes = f.read()
|
162 |
|
163 |
# Create image part
|
|
|
178 |
)
|
179 |
|
180 |
print("--- Gemini Response Received ---")
|
181 |
+
|
182 |
# Extract text from the response
|
183 |
if hasattr(response, 'text'):
|
184 |
+
print("Image Description : ", response.text)
|
185 |
return response.text
|
186 |
elif getattr(response, 'parts', None):
|
187 |
return "".join(part.text for part in response.parts if hasattr(part, 'text'))
|
|
|
192 |
return f"Model did not return text content.{block_reason}"
|
193 |
|
194 |
except Exception as e:
|
195 |
+
print(f"Error processing image '{image_file_name}' with Gemini: {e}")
|
196 |
return f"Sorry, an error occurred while analyzing the image. Please check the image URL or path. Error details: {str(e)}"
|
197 |
|
198 |
# Audio Tool
|
|
|
208 |
"""
|
209 |
print("--- Transcribing Audio ---")
|
210 |
print(f"Audio Path: {audio_path}")
|
|
|
211 |
|
212 |
try:
|
213 |
# Initialize Gemini client
|
|
|
231 |
else:
|
232 |
transcript = "Model did not return text content."
|
233 |
|
234 |
+
print("Transcript : ", transcript)
|
235 |
+
|
236 |
# Format as Markdown
|
237 |
markdown_transcript = (
|
238 |
"## Audio Transcription Result\n"
|
|
|
262 |
excel_path = os.path.join("./GAIA_resource/", excel_path)
|
263 |
|
264 |
try:
|
265 |
+
wb = load_workbook(filename=excel_path, data_only=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
266 |
|
267 |
# Select worksheet
|
268 |
ws = wb.active
|
|
|
283 |
except Exception as e:
|
284 |
return f"Error converting Excel to CSV: {e}"
|
285 |
|
286 |
+
# Read text file
|
287 |
+
def LoadTextFileTool(file_path: str) -> str:
    """
    Load a text file and return its contents as a string.

    The file is decoded as UTF-8. Bytes that are not valid UTF-8 are replaced
    with U+FFFD instead of aborting the read, so a file containing a few
    undecodable bytes still yields its readable content.

    Args:
        file_path (str): Path of the file to read.

    Returns:
        str: The file contents, or a message starting with
            "Error reading text file:" if the file could not be read.
    """
    print("---Load Text File Tool---")
    print("File Path : ", file_path)

    try:
        # errors='replace' keeps one undecodable byte from failing the whole read.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            return file.read()

    except Exception as e:
        # Failures are returned as text rather than raised, so the caller
        # always receives a string.
        return f"Error reading text file: {e}"
|
307 |
|
308 |
|
309 |
+
# Get task file
|
310 |
+
def GetTaskFileTool(file_name: str, task_id: str) -> str:
    """
    Download the file attached to a task and save it in the current working directory.

    The content is fetched from ``{DEFAULT_API_URL}/files/{task_id}``. Only the
    final path component of ``file_name`` is used as the destination, so a name
    containing directory parts (for example ``../x``) cannot write outside the
    working directory.

    Args:
        file_name (str): Name to save the downloaded file under.
        task_id (str): Id of the task whose file should be downloaded.

    Returns:
        str: Absolute path of the saved file, or a message starting with
            "Error" if the name is unusable or the download/write failed.
    """
    print("---Get Task File Tool---")
    print("File Name : ", file_name)

    # Reject unusable names before spending a network request on them.
    safe_name = os.path.basename(file_name) if isinstance(file_name, str) else ""
    if not safe_name:
        return f"Error GetTaskFileTool '{file_name}' : a non-empty file name is required"

    try:
        response = requests.get(f"{DEFAULT_API_URL}/files/{task_id}", timeout=15)
        response.raise_for_status()
        with open(safe_name, 'wb') as file:
            file.write(response.content)
        return os.path.abspath(safe_name)
    except TypeError as e:
        return f"Error GetTaskFileTool '{file_name}' : {str(e)}"
    except Exception as e:
        # Covers HTTP errors, timeouts and filesystem failures alike; the
        # message text is kept as-is for anything matching on it.
        return f"Error reading file: {e}"
|
334 |
+
|
335 |
+
|
336 |
# Call Agent Async
|
337 |
async def call_agent_async(query: str, runner, user_id, session_id):
|
338 |
"""Sends a query to the agent and prints the final response."""
|
|
|
388 |
1. **Analyze Question & Identify Files:** Carefully read the question. Determine the core task and the **exact final answer format**. Check if the question explicitly mentions an attached file (image, Excel, audio, code).
|
389 |
2. **Identify Filename:** If a file is mentioned, identify its filename from the text (e.g., "Homework.mp3", "image.png"). If no specific filename is given for a required file type, state that you need the filename. **Do not guess filenames.**
|
390 |
3. **Plan:** Create a step-by-step plan using tools. If a file is needed, include the correct tool call with the identified filename.
|
391 |
+
4. **Execute & Refine:** Execute the plan. Pass correct arguments (especially filenames). Evaluate tool outputs. If errors occur (e.g., file not found, API errors) or info is insufficient, revise the plan (e.g., use different tool prompts).
|
392 |
+
5. **Synthesize Answer:** Combine information. Use `coding_agent` for final formatting/calculations.
|
393 |
6. **Final Output:** Generate **only the final answer** in the requested format. No extra text. If the answer cannot be found or a required filename was missing/invalid, output: "I could not find the answer."
|
394 |
|
395 |
Constraints:
|
|
|
418 |
understand_youtube_video,
|
419 |
understand_image,
|
420 |
transcribe_audio,
|
421 |
+
excel_to_csv,
|
422 |
+
GetTaskFileTool,
|
423 |
+
LoadTextFileTool,
|
424 |
]
|
425 |
)
|
426 |
except Exception as e:
|
|
|
453 |
results_log = []
|
454 |
answers_payload = []
|
455 |
print(f"Running agent on {len(questions_data)} questions...")
|
|
|
456 |
for item in questions_data:
|
|
|
|
|
|
|
|
|
|
|
457 |
task_id = item.get("task_id")
|
458 |
question_text = item.get("question")
|
459 |
+
file_name = item.get("file_name")
|
460 |
+
if task_id:
|
461 |
+
question_text += " task_id = " + task_id
|
462 |
+
if file_name:
|
463 |
+
question_text += " file_name = " + file_name
|
464 |
if not task_id or question_text is None:
|
465 |
print(f"Skipping item with missing task_id or question: {item}")
|
466 |
continue
|
|
|
481 |
app_name=APP_NAME, # Associates runs with our app
|
482 |
session_service=session_service # Uses our session manager
|
483 |
)
|
484 |
+
submitted_answer = await call_agent_async(question_text,
|
485 |
runner=runner,
|
486 |
user_id=USER_ID,
|
487 |
+
session_id=SESSION_ID
|
488 |
+
)
|
489 |
|
490 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
491 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
492 |
except Exception as e:
|
493 |
print(f"Error running agent on task {task_id}: {e}")
|
494 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
|
495 |
+
if os.path.exists(file_name):
|
496 |
+
os.remove(file_name)
|
497 |
|
498 |
if not answers_payload:
|
499 |
print("Agent did not produce any answers to submit.")
|
app.py
CHANGED
@@ -104,7 +104,9 @@ def understand_youtube_video(video_url: str, question: str) -> str:
|
|
104 |
)
|
105 |
|
106 |
print("--- Gemini Response Received ---")
|
|
|
107 |
if hasattr(response, 'text'):
|
|
|
108 |
return response.text
|
109 |
elif response.parts:
|
110 |
return "".join(part.text for part in response.parts if hasattr(part, 'text'))
|
@@ -124,41 +126,39 @@ def understand_image(image_file_name: str) -> str:
|
|
124 |
Given an image file , this will analyze the image in detail and describe its contents in as much detail as possible.
|
125 |
|
126 |
Args:
|
127 |
-
image_file_name (str): The file name of the image to analyze.
|
128 |
|
129 |
Returns:
|
130 |
str: The response text generated by the Gemini model.
|
131 |
"""
|
132 |
-
image_url = os.path.join("./GAIA_resource/" , image_file_name)
|
133 |
print("--- Analyzing Image ---")
|
134 |
-
print(f"Image URL/Path: {
|
135 |
|
136 |
prompt = """
|
137 |
Analyze the image in detail and describe its contents in as much detail as possible.
|
138 |
For example, give someone a chess board and describe where each piece is.
|
139 |
|
140 |
-
The description should include the following information:
|
141 |
-
- General overview of the image
|
142 |
-
- Details of important elements and features (e.g., location relationships, attributes, etc.)
|
143 |
-
- Identification of specific objects or characters (e.g., game piece names, positions, people, etc.)
|
144 |
|
145 |
-
# Steps
|
146 |
-
1. Examine the image as a whole and identify the main elements.
|
147 |
-
2. Examine each element in detail and identify what it is.
|
148 |
-
3. Develop a description of each element based on its characteristic relationships and positions.
|
149 |
-
4. Finally, summarize the overall scene or situation.
|
150 |
-
|
151 |
-
# Output Format
|
152 |
-
Provide detailed descriptions in paragraphs of text, using bullet points where necessary.
|
153 |
|
|
|
|
|
154 |
"""
|
155 |
|
156 |
try:
|
157 |
# Fetch the image data
|
158 |
-
if
|
159 |
-
image_bytes = requests.get(
|
160 |
else:
|
161 |
-
with open(
|
162 |
image_bytes = f.read()
|
163 |
|
164 |
# Create image part
|
@@ -179,8 +179,10 @@ Provide detailed descriptions in paragraphs of text, using bullet points where n
|
|
179 |
)
|
180 |
|
181 |
print("--- Gemini Response Received ---")
|
|
|
182 |
# Extract text from the response
|
183 |
if hasattr(response, 'text'):
|
|
|
184 |
return response.text
|
185 |
elif getattr(response, 'parts', None):
|
186 |
return "".join(part.text for part in response.parts if hasattr(part, 'text'))
|
@@ -191,7 +193,7 @@ Provide detailed descriptions in paragraphs of text, using bullet points where n
|
|
191 |
return f"Model did not return text content.{block_reason}"
|
192 |
|
193 |
except Exception as e:
|
194 |
-
print(f"Error processing image '{
|
195 |
return f"Sorry, an error occurred while analyzing the image. Please check the image URL or path. Error details: {str(e)}"
|
196 |
|
197 |
# Audio Tool
|
@@ -207,7 +209,6 @@ def transcribe_audio(audio_path: str) -> str:
|
|
207 |
"""
|
208 |
print("--- Transcribing Audio ---")
|
209 |
print(f"Audio Path: {audio_path}")
|
210 |
-
audio_path = os.path.join("./GAIA_resource/", audio_path)
|
211 |
|
212 |
try:
|
213 |
# Initialize Gemini client
|
@@ -231,6 +232,8 @@ def transcribe_audio(audio_path: str) -> str:
|
|
231 |
else:
|
232 |
transcript = "Model did not return text content."
|
233 |
|
|
|
|
|
234 |
# Format as Markdown
|
235 |
markdown_transcript = (
|
236 |
"## Audio Transcription Result\n"
|
@@ -260,14 +263,7 @@ def excel_to_csv(excel_path: str) -> str:
|
|
260 |
excel_path = os.path.join("./GAIA_resource/", excel_path)
|
261 |
|
262 |
try:
|
263 |
-
|
264 |
-
if excel_path.startswith("http"):
|
265 |
-
response = requests.get(excel_path)
|
266 |
-
response.raise_for_status()
|
267 |
-
data_stream = BytesIO(response.content)
|
268 |
-
wb = load_workbook(filename=data_stream, data_only=True)
|
269 |
-
else:
|
270 |
-
wb = load_workbook(filename=excel_path, data_only=True)
|
271 |
|
272 |
# Select worksheet
|
273 |
ws = wb.active
|
@@ -320,37 +316,54 @@ data_analyzer_agent = LlmAgent(
|
|
320 |
)
|
321 |
|
322 |
|
323 |
-
# Read file
|
324 |
-
def
|
325 |
"""
|
326 |
-
|
327 |
|
328 |
Args:
|
329 |
-
file_path (str):
|
330 |
|
331 |
Returns:
|
332 |
-
str:
|
333 |
"""
|
|
|
334 |
print("File Path : ", file_path)
|
335 |
-
file_path = os.path.join("./GAIA_resource/", file_path)
|
336 |
|
337 |
try:
|
338 |
-
# Load data from URL or local file
|
339 |
-
if file_path.startswith("http"):
|
340 |
-
response = requests.get(file_path)
|
341 |
-
response.raise_for_status()
|
342 |
-
data_bytes = response.content
|
343 |
-
else:
|
344 |
-
with open(file_path, "rb") as f:
|
345 |
-
data_bytes = f.read()
|
346 |
-
|
347 |
# Decode bytes to ASCII string, replacing errors
|
348 |
-
|
349 |
-
|
350 |
|
351 |
except Exception as e:
|
352 |
-
return f"Error reading file
|
353 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
354 |
|
355 |
# Call Agent Async
|
356 |
async def call_agent_async(query: str, runner, user_id, session_id):
|
@@ -382,36 +395,8 @@ async def call_agent_async(query: str, runner, user_id, session_id):
|
|
382 |
# (Keep Constants as is)
|
383 |
# --- Constants ---
|
384 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
385 |
-
# for GAIA Repo
|
386 |
-
GAIA_REPO_ID = "gaia-benchmark/GAIA"
|
387 |
-
GAIA_VALIDATION_DIR = "2023/validation"
|
388 |
-
LOCAL_GAIA_DIR = "GAIA_resource"
|
389 |
|
390 |
|
391 |
-
# --- GAIA Data Download Utility ---
|
392 |
-
def download_gaia_validation(local_dir: str = LOCAL_GAIA_DIR):
|
393 |
-
"""
|
394 |
-
Download only the validation part of the Hugging Face GAIA dataset to
|
395 |
-
local_dir/2023/validation/.
|
396 |
-
If it has already been downloaded, it will not be downloaded again.
|
397 |
-
"""
|
398 |
-
target_path = os.path.join(local_dir, GAIA_VALIDATION_DIR)
|
399 |
-
if os.path.isdir(target_path) and os.listdir(target_path):
|
400 |
-
print(f"GAIA validation data already exists at {target_path}")
|
401 |
-
return
|
402 |
-
|
403 |
-
os.makedirs(local_dir, exist_ok=True)
|
404 |
-
print(f"Downloading GAIA validation data into {local_dir} ...")
|
405 |
-
snapshot_download(
|
406 |
-
repo_id=GAIA_REPO_ID,
|
407 |
-
repo_type="dataset",
|
408 |
-
allow_patterns=[f"{GAIA_VALIDATION_DIR}/*"],
|
409 |
-
local_dir=local_dir,
|
410 |
-
local_dir_use_symlinks=False,
|
411 |
-
use_auth_token=True
|
412 |
-
)
|
413 |
-
print(f"Downloaded GAIA validation data to {target_path}")
|
414 |
-
|
415 |
# --- Basic Agent Definition ---
|
416 |
# ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
|
417 |
#class BasicAgent:
|
@@ -435,8 +420,8 @@ Thinking Process:
|
|
435 |
1. **Analyze Question & Identify Files:** Carefully read the question. Determine the core task and the **exact final answer format**. Check if the question explicitly mentions an attached file (image, Excel, audio, code).
|
436 |
2. **Identify Filename:** If a file is mentioned, identify its filename from the text (e.g., "Homework.mp3", "image.png"). If no specific filename is given for a required file type, state that you need the filename. **Do not guess filenames.**
|
437 |
3. **Plan:** Create a step-by-step plan using tools. If a file is needed, include the correct tool call with the identified filename.
|
438 |
-
4. **Execute & Refine:** Execute the plan. Pass correct arguments (especially filenames). Evaluate tool outputs. If errors occur (e.g., file not found, API errors) or info is insufficient, revise the plan (e.g., use
|
439 |
-
5. **Synthesize Answer:** Combine information. Use `
|
440 |
6. **Final Output:** Generate **only the final answer** in the requested format. No extra text. If the answer cannot be found or a required filename was missing/invalid, output: "I could not find the answer."
|
441 |
|
442 |
Constraints:
|
@@ -444,7 +429,6 @@ Constraints:
|
|
444 |
- Adhere strictly to the requested output format.
|
445 |
"""
|
446 |
|
447 |
-
|
448 |
async def run_and_submit_all( profile: gr.OAuthProfile | None):
|
449 |
"""
|
450 |
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
@@ -486,8 +470,9 @@ async def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
486 |
understand_youtube_video,
|
487 |
understand_image,
|
488 |
transcribe_audio,
|
489 |
-
|
490 |
-
|
|
|
491 |
]
|
492 |
)
|
493 |
except Exception as e:
|
@@ -525,8 +510,11 @@ async def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
525 |
for item in questions_data:
|
526 |
task_id = item.get("task_id")
|
527 |
question_text = item.get("question")
|
528 |
-
|
529 |
-
|
|
|
|
|
|
|
530 |
if not task_id or question_text is None:
|
531 |
print(f"Skipping item with missing task_id or question: {item}")
|
532 |
continue
|
@@ -547,7 +535,7 @@ async def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
547 |
app_name=APP_NAME, # Associates runs with our app
|
548 |
session_service=session_service # Uses our session manager
|
549 |
)
|
550 |
-
submitted_answer = await call_agent_async(
|
551 |
runner=runner,
|
552 |
user_id=USER_ID,
|
553 |
session_id=SESSION_ID)
|
|
|
104 |
)
|
105 |
|
106 |
print("--- Gemini Response Received ---")
|
107 |
+
|
108 |
if hasattr(response, 'text'):
|
109 |
+
print("Video Description : ", response.text)
|
110 |
return response.text
|
111 |
elif response.parts:
|
112 |
return "".join(part.text for part in response.parts if hasattr(part, 'text'))
|
|
|
126 |
Given an image file , this will analyze the image in detail and describe its contents in as much detail as possible.
|
127 |
|
128 |
Args:
|
129 |
+
image_file_name (str): The file name of the image to analyze.
|
130 |
|
131 |
Returns:
|
132 |
str: The response text generated by the Gemini model.
|
133 |
"""
|
|
|
134 |
print("--- Analyzing Image ---")
|
135 |
+
print(f"Image URL/Path: {image_file_name}")
|
136 |
|
137 |
prompt = """
|
138 |
Analyze the image in detail and describe its contents in as much detail as possible.
|
139 |
For example, give someone a chess board and describe where each piece is.
|
140 |
|
141 |
+
The description should include the following information:
|
142 |
+
- General overview of the image
|
143 |
+
- Details of important elements and features (e.g., location relationships, attributes, etc.)
|
144 |
+
- Identification of specific objects or characters (e.g., game piece names, positions, people, etc.)
|
145 |
|
146 |
+
# Steps
|
147 |
+
1. Examine the image as a whole and identify the main elements.
|
148 |
+
2. Examine each element in detail and identify what it is.
|
149 |
+
3. Develop a description of each element based on its characteristic relationships and positions.
|
150 |
+
4. Finally, summarize the overall scene or situation.
|
|
|
|
|
|
|
151 |
|
152 |
+
# Output Format
|
153 |
+
Provide detailed descriptions in paragraphs of text, using bullet points where necessary.
|
154 |
"""
|
155 |
|
156 |
try:
|
157 |
# Fetch the image data
|
158 |
+
if image_file_name.startswith("http"):
|
159 |
+
image_bytes = requests.get(image_file_name).content
|
160 |
else:
|
161 |
+
with open(image_file_name, "rb") as f:
|
162 |
image_bytes = f.read()
|
163 |
|
164 |
# Create image part
|
|
|
179 |
)
|
180 |
|
181 |
print("--- Gemini Response Received ---")
|
182 |
+
|
183 |
# Extract text from the response
|
184 |
if hasattr(response, 'text'):
|
185 |
+
print("Image Description : ", response.text)
|
186 |
return response.text
|
187 |
elif getattr(response, 'parts', None):
|
188 |
return "".join(part.text for part in response.parts if hasattr(part, 'text'))
|
|
|
193 |
return f"Model did not return text content.{block_reason}"
|
194 |
|
195 |
except Exception as e:
|
196 |
+
print(f"Error processing image '{image_file_name}' with Gemini: {e}")
|
197 |
return f"Sorry, an error occurred while analyzing the image. Please check the image URL or path. Error details: {str(e)}"
|
198 |
|
199 |
# Audio Tool
|
|
|
209 |
"""
|
210 |
print("--- Transcribing Audio ---")
|
211 |
print(f"Audio Path: {audio_path}")
|
|
|
212 |
|
213 |
try:
|
214 |
# Initialize Gemini client
|
|
|
232 |
else:
|
233 |
transcript = "Model did not return text content."
|
234 |
|
235 |
+
print("Transcript : ", transcript)
|
236 |
+
|
237 |
# Format as Markdown
|
238 |
markdown_transcript = (
|
239 |
"## Audio Transcription Result\n"
|
|
|
263 |
excel_path = os.path.join("./GAIA_resource/", excel_path)
|
264 |
|
265 |
try:
|
266 |
+
wb = load_workbook(filename=excel_path, data_only=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
267 |
|
268 |
# Select worksheet
|
269 |
ws = wb.active
|
|
|
316 |
)
|
317 |
|
318 |
|
319 |
+
# Read text file
|
320 |
+
def LoadTextFileTool(file_path: str) -> str:
    """
    This tool loads any text file and returns its contents as a string.

    The file is decoded as UTF-8. Bytes that are not valid UTF-8 are replaced
    with the Unicode replacement character (U+FFFD) rather than failing the
    whole read, so files saved in another encoding still yield usable text.

    Args:
        file_path (str): Path of the text file to read.

    Returns:
        str: Text file contents, or a message starting with
            "Error reading text file:" if the file could not be opened or read.
    """
    print("---Load Text File Tool---")
    print("File Path : ", file_path)

    try:
        # Decode as UTF-8, replacing undecodable bytes instead of raising, so
        # the tool keeps its "loads any text file" promise.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            return file.read()

    except Exception as e:
        # Tool boundary: report the failure as text so the agent can react.
        return f"Error reading text file: {e}"
|
340 |
|
341 |
+
# Get task file
|
342 |
+
def GetTaskFileTool(file_name: str, task_id: str) -> str:
    """
    This tool downloads the file content associated with the given task_id if exists. Returns absolute file path.

    The content is fetched from ``{DEFAULT_API_URL}/files/{task_id}`` and
    written to the current working directory.

    Args:
        file_name (str): File name to save the download under. Only the final
            path component is used, so the file is always written into the
            current working directory.
        task_id (str): Task id

    Returns:
        str: absolute file path of the saved file, or a message starting with
            "Error" if the name is unusable or the download/write failed.
    """
    print("---Get Task File Tool---")
    print("File Name : ", file_name)

    try:
        # The name is supplied by the agent; drop any directory components so
        # a value such as "../../x" cannot write outside the working directory.
        local_name = os.path.basename(file_name)
        if not local_name:
            # Fail before making a network request that cannot be saved.
            return f"Error GetTaskFileTool '{file_name}' : file name is empty"

        response = requests.get(f"{DEFAULT_API_URL}/files/{task_id}", timeout=15)
        response.raise_for_status()
        with open(local_name, 'wb') as file:
            file.write(response.content)
        return os.path.abspath(local_name)
    except TypeError as e:
        # Raised e.g. when file_name is not a string/path-like value.
        return f"Error GetTaskFileTool '{file_name}' : {str(e)}"
    except Exception as e:
        # Tool boundary: HTTP and filesystem failures are reported as text.
        return f"Error reading file: {e}"
|
366 |
+
|
367 |
|
368 |
# Call Agent Async
|
369 |
async def call_agent_async(query: str, runner, user_id, session_id):
|
|
|
395 |
# (Keep Constants as is)
|
396 |
# --- Constants ---
|
397 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
|
|
|
|
|
|
|
|
398 |
|
399 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
400 |
# --- Basic Agent Definition ---
|
401 |
# ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
|
402 |
#class BasicAgent:
|
|
|
420 |
1. **Analyze Question & Identify Files:** Carefully read the question. Determine the core task and the **exact final answer format**. Check if the question explicitly mentions an attached file (image, Excel, audio, code).
|
421 |
2. **Identify Filename:** If a file is mentioned, identify its filename from the text (e.g., "Homework.mp3", "image.png"). If no specific filename is given for a required file type, state that you need the filename. **Do not guess filenames.**
|
422 |
3. **Plan:** Create a step-by-step plan using tools. If a file is needed, include the correct tool call with the identified filename.
|
423 |
+
4. **Execute & Refine:** Execute the plan. Pass correct arguments (especially filenames). Evaluate tool outputs. If errors occur (e.g., file not found, API errors) or info is insufficient, revise the plan (e.g., use different tool prompts).
|
424 |
+
5. **Synthesize Answer:** Combine information. Use `coding_agent` for final formatting/calculations.
|
425 |
6. **Final Output:** Generate **only the final answer** in the requested format. No extra text. If the answer cannot be found or a required filename was missing/invalid, output: "I could not find the answer."
|
426 |
|
427 |
Constraints:
|
|
|
429 |
- Adhere strictly to the requested output format.
|
430 |
"""
|
431 |
|
|
|
432 |
async def run_and_submit_all( profile: gr.OAuthProfile | None):
|
433 |
"""
|
434 |
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
|
|
470 |
understand_youtube_video,
|
471 |
understand_image,
|
472 |
transcribe_audio,
|
473 |
+
excel_to_csv,
|
474 |
+
GetTaskFileTool,
|
475 |
+
LoadTextFileTool,
|
476 |
]
|
477 |
)
|
478 |
except Exception as e:
|
|
|
510 |
for item in questions_data:
|
511 |
task_id = item.get("task_id")
|
512 |
question_text = item.get("question")
|
513 |
+
file_name = item.get("file_name")
|
514 |
+
if task_id:
|
515 |
+
question_text += " task_id = " + task_id
|
516 |
+
if file_name:
|
517 |
+
question_text += " file_name = " + file_name
|
518 |
if not task_id or question_text is None:
|
519 |
print(f"Skipping item with missing task_id or question: {item}")
|
520 |
continue
|
|
|
535 |
app_name=APP_NAME, # Associates runs with our app
|
536 |
session_service=session_service # Uses our session manager
|
537 |
)
|
538 |
+
submitted_answer = await call_agent_async(question_text,
|
539 |
runner=runner,
|
540 |
user_id=USER_ID,
|
541 |
session_id=SESSION_ID)
|