drAbreu committed on
Commit
bcb1c6b
·
1 Parent(s): 09a77ad

Adding audio tools with whisper

Browse files
Files changed (3) hide show
  1. agents/llama_index_agent.py +6 -4
  2. app.py +62 -10
  3. tools/text_tools.py +1 -0
agents/llama_index_agent.py CHANGED
@@ -186,10 +186,12 @@ class GaiaAgent(ReActAgent):
186
 
187
  ## HANDLING AUDIO TASKS
188
  When dealing with audio files:
189
- 1. Use the transcribe_audio tool to get a full transcript of the audio content
190
- 2. Extract the specific information requested from the transcript
191
- 3. Format your answer exactly as requested in the question
192
- 4. For audio tasks, ensure you've captured all relevant spoken content, including names, facts, or quotes as needed
 
 
193
 
194
  ## DELEGATION TO WRITER AGENT
195
  After completing your analysis, ALWAYS delegate the final answer preparation to the writer_agent with:
 
186
 
187
  ## HANDLING AUDIO TASKS
188
  When dealing with audio files:
189
+ 1. Check if an audio file path is available in the context's "audio_file_path" field
190
+ 2. Always use the transcribe_audio tool with the exact file path provided in the context
191
+ 3. Extract the specific information requested from the transcript (e.g., ingredients, page numbers, names)
192
+ 4. Follow any special formatting instructions (e.g., comma-separated list, alphabetical order)
193
+ 5. Make sure to provide exactly what is asked for (e.g., "only list ingredients, not measurements")
194
+ 6. For audio tasks, ensure you've captured all relevant spoken content, including names, facts, or quotes as needed
195
 
196
  ## DELEGATION TO WRITER AGENT
197
  After completing your analysis, ALWAYS delegate the final answer preparation to the writer_agent with:
app.py CHANGED
@@ -90,23 +90,49 @@ class BasicAgent:
90
  else:
91
  print(f"Writer agent using same model as main agent")
92
 
93
- def __call__(self, question: str) -> str:
94
  """Process a GAIA benchmark question and return the formatted answer."""
95
- print(f"Agent received question (first 50 chars): {question[:50]}...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  async def agentic_main():
98
- # Initialize context with the question
99
  initial_state = {
100
- "original_question": question,
 
 
101
  "analysis_notes": "",
102
  "format_requirements": "",
103
  "next_agent": "",
104
  "final_answer": ""
105
  }
106
 
107
- # Use the workflow to process the question
 
 
 
 
 
 
 
 
 
 
108
  workflow_response = await self.agent_workflow.run(
109
- question,
110
  initial_state=initial_state
111
  )
112
  return workflow_response
@@ -117,7 +143,32 @@ class BasicAgent:
117
  final_answer = response.response.blocks[-1].text
118
  print(f"Agent returning answer: {final_answer}")
119
  return final_answer
120
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
  def run_and_submit_all( profile: gr.OAuthProfile | None):
123
  """
@@ -180,12 +231,13 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
180
  print(f"Skipping item with missing task_id or question: {item}")
181
  continue
182
  try:
183
- submitted_answer = agent(question_text)
 
184
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
185
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
186
  except Exception as e:
187
- print(f"Error running agent on task {task_id}: {e}")
188
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
189
 
190
  if not answers_payload:
191
  print("Agent did not produce any answers to submit.")
 
90
  else:
91
  print(f"Writer agent using same model as main agent")
92
 
93
+ def __call__(self, question_data: dict) -> str:
94
  """Process a GAIA benchmark question and return the formatted answer."""
95
+ # Extract question text and task_id
96
+ question_text = question_data.get("question", "")
97
+ task_id = question_data.get("task_id", "")
98
+ file_name = question_data.get("file_name", "")
99
+
100
+ print(f"Agent received question (first 50 chars): {question_text[:50]}...")
101
+
102
+ # Download audio file if present
103
+ local_file_path = None
104
+ if file_name and task_id:
105
+ try:
106
+ local_file_path = self.download_task_file(task_id)
107
+ print(f"Downloaded audio file to {local_file_path}")
108
+ except Exception as e:
109
+ print(f"Error downloading audio file: {e}")
110
 
111
  async def agentic_main():
112
+ # Initialize context with the question and file path
113
  initial_state = {
114
+ "original_question": question_text,
115
+ "task_id": task_id,
116
+ "audio_file_path": local_file_path,
117
  "analysis_notes": "",
118
  "format_requirements": "",
119
  "next_agent": "",
120
  "final_answer": ""
121
  }
122
 
123
+ # MODIFY THIS PART - Instead of just passing the question text,
124
+ # create a more detailed input that includes the audio file path information
125
+ enhanced_input = f"""Task ID: {task_id}
126
+ Question: {question_text}
127
+
128
+ """
129
+ # Add audio file information if available
130
+ if local_file_path:
131
+ enhanced_input += f"Audio File Path: {local_file_path}\n\nPlease analyze this question. If it involves an audio file, use the transcribe_audio tool with the provided path."
132
+
133
+ # Use the workflow to process the question with enhanced input
134
  workflow_response = await self.agent_workflow.run(
135
+ enhanced_input,
136
  initial_state=initial_state
137
  )
138
  return workflow_response
 
143
  final_answer = response.response.blocks[-1].text
144
  print(f"Agent returning answer: {final_answer}")
145
  return final_answer
146
+
147
+ def download_task_file(self, task_id: str) -> str:
148
+ """Download a task file from the API and return the local file path."""
149
+ api_url = DEFAULT_API_URL
150
+ file_url = f"{api_url}/files/{task_id}"
151
+
152
+ print(f"Downloading file from: {file_url}")
153
+
154
+ try:
155
+ response = requests.get(file_url, stream=True)
156
+ response.raise_for_status()
157
+
158
+ # Create a directory for downloaded files if it doesn't exist
159
+ downloads_dir = Path("downloads")
160
+ downloads_dir.mkdir(exist_ok=True)
161
+
162
+ # Save the file to the downloads directory
163
+ file_path = downloads_dir / f"{task_id}.mp3"
164
+ with open(file_path, "wb") as f:
165
+ for chunk in response.iter_content(chunk_size=8192):
166
+ f.write(chunk)
167
+
168
+ return str(file_path)
169
+ except Exception as e:
170
+ print(f"Error downloading file: {e}")
171
+ raise
172
 
173
  def run_and_submit_all( profile: gr.OAuthProfile | None):
174
  """
 
231
  print(f"Skipping item with missing task_id or question: {item}")
232
  continue
233
  try:
234
+ # Pass the entire item instead of just the question text
235
+ submitted_answer = agent(item)
236
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
237
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
238
  except Exception as e:
239
+ print(f"Error running agent on task {task_id}: {e}")
240
+ results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
241
 
242
  if not answers_payload:
243
  print("Agent did not produce any answers to submit.")
tools/text_tools.py CHANGED
@@ -11,3 +11,4 @@ reverse_text_tool = FunctionTool.from_defaults(
11
  name="reverse_text_tool",
12
  description="It returns the reversed string of text in the input.",
13
  )
 
 
11
  name="reverse_text_tool",
12
  description="It returns the reversed string of text in the input.",
13
  )
14
+