Adding audio tools with whisper
Files changed:
- agents/llama_index_agent.py +6 -4
- app.py +62 -10
- tools/text_tools.py +1 -0
agents/llama_index_agent.py (CHANGED)

@@ -186,10 +186,12 @@ class GaiaAgent(ReActAgent):
 
 ## HANDLING AUDIO TASKS
 When dealing with audio files:
-1.
-2.
-3.
-4.
+1. Check if an audio file path is available in the context's "audio_file_path" field
+2. Always use the transcribe_audio tool with the exact file path provided in the context
+3. Extract the specific information requested from the transcript (e.g., ingredients, page numbers, names)
+4. Follow any special formatting instructions (e.g., comma-separated list, alphabetical order)
+5. Make sure to provide exactly what is asked for (e.g., "only list ingredients, not measurements")
+6. For audio tasks, ensure you've captured all relevant spoken content, including names, facts, or quotes as needed
 
 ## DELEGATION TO WRITER AGENT
 After completing your analysis, ALWAYS delegate the final answer preparation to the writer_agent with:
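The new prompt instructions refer to a transcribe_audio tool, which is not included in this diff. Assuming it follows the same FunctionTool pattern used in tools/text_tools.py and wraps the open-source whisper package (per the commit title), a minimal sketch could look like the following; the module name tools/audio_tools.py and the "base" model size are assumptions, not part of this commit:

# tools/audio_tools.py (hypothetical sketch, not part of this commit)
import whisper
from llama_index.core.tools import FunctionTool

# Load the whisper model once at import time; "base" is an assumed model size.
_whisper_model = whisper.load_model("base")

def transcribe_audio(file_path: str) -> str:
    """Transcribe the audio file at file_path and return the spoken text."""
    result = _whisper_model.transcribe(file_path)
    return result["text"]

transcribe_audio_tool = FunctionTool.from_defaults(
    fn=transcribe_audio,
    name="transcribe_audio",
    description="Transcribes an audio file at the given local path and returns the text.",
)

Such a tool would then be registered with GaiaAgent alongside the existing text tools; running whisper locally also requires the openai-whisper package and ffmpeg to be installed.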
app.py (CHANGED)

@@ -90,23 +90,49 @@ class BasicAgent:
         else:
             print(f"Writer agent using same model as main agent")
 
-    def __call__(self,
+    def __call__(self, question_data: dict) -> str:
         """Process a GAIA benchmark question and return the formatted answer."""
-
+        # Extract question text and task_id
+        question_text = question_data.get("question", "")
+        task_id = question_data.get("task_id", "")
+        file_name = question_data.get("file_name", "")
+
+        print(f"Agent received question (first 50 chars): {question_text[:50]}...")
+
+        # Download audio file if present
+        local_file_path = None
+        if file_name and task_id:
+            try:
+                local_file_path = self.download_task_file(task_id)
+                print(f"Downloaded audio file to {local_file_path}")
+            except Exception as e:
+                print(f"Error downloading audio file: {e}")
 
         async def agentic_main():
-            # Initialize context with the question
+            # Initialize context with the question and file path
            initial_state = {
-                "original_question":
+                "original_question": question_text,
+                "task_id": task_id,
+                "audio_file_path": local_file_path,
                "analysis_notes": "",
                "format_requirements": "",
                "next_agent": "",
                "final_answer": ""
            }
 
-            #
+            # MODIFY THIS PART - Instead of just passing the question text,
+            # create a more detailed input that includes the audio file path information
+            enhanced_input = f"""Task ID: {task_id}
+Question: {question_text}
+
+"""
+            # Add audio file information if available
+            if local_file_path:
+                enhanced_input += f"Audio File Path: {local_file_path}\n\nPlease analyze this question. If it involves an audio file, use the transcribe_audio tool with the provided path."
+
+            # Use the workflow to process the question with enhanced input
            workflow_response = await self.agent_workflow.run(
-
+                enhanced_input,
                initial_state=initial_state
            )
            return workflow_response

@@ -117,7 +143,32 @@ class BasicAgent:
         final_answer = response.response.blocks[-1].text
         print(f"Agent returning answer: {final_answer}")
         return final_answer
-
+
+    def download_task_file(self, task_id: str) -> str:
+        """Download a task file from the API and return the local file path."""
+        api_url = DEFAULT_API_URL
+        file_url = f"{api_url}/files/{task_id}"
+
+        print(f"Downloading file from: {file_url}")
+
+        try:
+            response = requests.get(file_url, stream=True)
+            response.raise_for_status()
+
+            # Create a directory for downloaded files if it doesn't exist
+            downloads_dir = Path("downloads")
+            downloads_dir.mkdir(exist_ok=True)
+
+            # Save the file to the downloads directory
+            file_path = downloads_dir / f"{task_id}.mp3"
+            with open(file_path, "wb") as f:
+                for chunk in response.iter_content(chunk_size=8192):
+                    f.write(chunk)
+
+            return str(file_path)
+        except Exception as e:
+            print(f"Error downloading file: {e}")
+            raise
 
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """

@@ -180,12 +231,13 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
-
+            # Pass the entire item instead of just the question text
+            submitted_answer = agent(item)
            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
        except Exception as e:
-
-
+            print(f"Error running agent on task {task_id}: {e}")
+            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
 
    if not answers_payload:
        print("Agent did not produce any answers to submit.")
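With this change, BasicAgent is called with the full task dictionary rather than the bare question string, so the file download and the enhanced prompt are built from the same item. A rough usage sketch, assuming BasicAgent can be constructed with its defaults and using placeholder task values:

# Hypothetical task item in the shape returned by the questions endpoint.
agent = BasicAgent()
item = {
    "task_id": "example-task-id",  # placeholder id, used to build the /files/<task_id> URL
    "question": "List the ingredients mentioned in the recording.",
    "file_name": "recording.mp3",  # a non-empty file_name triggers download_task_file
}

# __call__ downloads downloads/<task_id>.mp3, builds the enhanced input containing the
# audio file path, runs the agent workflow, and returns the final answer text.
answer = agent(item)
print(answer)

Note that download_task_file always saves the payload as <task_id>.mp3, which fits the audio-only scope of this commit but would mislabel other file types.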
tools/text_tools.py (CHANGED)

@@ -11,3 +11,4 @@ reverse_text_tool = FunctionTool.from_defaults(
     name="reverse_text_tool",
     description="It returns the reversed string of text in the input.",
 )
+