drAbreu committed
Commit f042db0 · 1 Parent(s): adf2ae1

Added cache to the answers

Files changed (2):
  1. README.md +4 -1
  2. app.py +233 -57
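
In short, the new `ResponseCache` in app.py keys answers by an MD5 hash of the question text, persists them to `agent_cache.json`, and only reuses entries previously marked correct. A minimal sketch of that lookup flow, with `run_agent` standing in for the `BasicAgent` call (the helper names here are illustrative, not the committed API):

```python
# Condensed sketch of the cache lookup added in app.py; run_agent stands in for BasicAgent.
import hashlib
import json
import os

CACHE_FILE = "agent_cache.json"

def load_cache():
    # The committed ResponseCache also tolerates a corrupt file; this sketch keeps it simple.
    return json.load(open(CACHE_FILE)) if os.path.exists(CACHE_FILE) else {}

def answer_with_cache(question, run_agent):
    cache = load_cache()
    key = hashlib.md5(question.encode("utf-8")).hexdigest()
    entry = cache.get(key)
    if entry and entry.get("is_correct"):
        return entry["answer"]      # cache hit: reuse an answer already marked correct
    return run_agent(question)      # cache miss: fall back to the agent
```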
README.md CHANGED
@@ -20,5 +20,8 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
 
 ## Tag 1.1.0
 
-- Adding web and wikipedia tools to single agent obtains 5 / 20 correct answers using claude 3.7
+- Adding web and wikipedia tools to single agent obtains 5 / 20 correct answers using claude 3.7 and gpt-4o
 
+## Tag 1.2.0
+
+- Adding a `writer_agent` obtains 7 / 20 correct answers using claude 3.7 for research and gpt-4o for writing the answers
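
The Tag 1.2.0 note describes a two-step pipeline: a research agent (claude 3.7) gathers evidence and a writer agent (gpt-4o) formats the final answer. A hypothetical sketch of that hand-off; `call_claude`, `call_gpt4o`, and the prompts are placeholders for illustration, not the repo's `create_writer_agent`:

```python
# Hypothetical illustration of the research -> writer hand-off described in Tag 1.2.0.
def answer_question(question: str, call_claude, call_gpt4o) -> str:
    # Step 1: the research agent collects notes and candidate facts.
    notes = call_claude(
        f"Research the following question and list the key facts with sources:\n{question}"
    )
    # Step 2: the writer agent turns the notes into a short, exact answer.
    return call_gpt4o(
        "Using only these notes, answer the question as concisely as possible.\n"
        f"Question: {question}\nNotes:\n{notes}"
    )
```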
app.py CHANGED
@@ -6,6 +6,9 @@ import pandas as pd
 import asyncio
 from llama_index.core.agent.workflow import AgentWorkflow
 from agents.llama_index_agent import GaiaAgent, create_writer_agent
+import json
+import hashlib
+from pathlib import Path
 
 # (Keep Constants as is)
 # --- Constants ---
@@ -114,16 +117,104 @@ class BasicAgent:
         return final_answer
 
 
-def run_and_submit_all( profile: gr.OAuthProfile | None):
+class ResponseCache:
+    """Cache manager for storing and retrieving agent responses."""
+
+    def __init__(self, cache_file="agent_cache.json"):
+        """Initialize the cache manager.
+
+        Args:
+            cache_file: Path to the JSON file for storing the cache
+        """
+        self.cache_file = cache_file
+        self.cache = self._load_cache()
+
+        # Stats for the current session
+        self.cache_hits = 0
+        self.cache_misses = 0
+
+    def _load_cache(self):
+        """Load the cache from disk."""
+        try:
+            if os.path.exists(self.cache_file):
+                with open(self.cache_file, 'r') as f:
+                    return json.load(f)
+            return {}
+        except Exception as e:
+            print(f"Error loading cache: {e}. Starting with empty cache.")
+            return {}
+
+    def _save_cache(self):
+        """Save the cache to disk."""
+        try:
+            with open(self.cache_file, 'w') as f:
+                json.dump(self.cache, f)
+        except Exception as e:
+            print(f"Error saving cache: {e}")
+
+    def get_hash(self, question):
+        """Create a consistent hash for a question."""
+        return hashlib.md5(question.encode('utf-8')).hexdigest()
+
+    def get(self, question):
+        """Get a cached response if available.
+
+        Returns:
+            tuple: (cached_answer, hit_status)
+                - cached_answer: The cached answer or None if not found
+                - hit_status: True if cache hit, False if miss
+        """
+        question_hash = self.get_hash(question)
+        if question_hash in self.cache:
+            # Only return answers marked as correct
+            entry = self.cache[question_hash]
+            if entry.get("is_correct", False):
+                self.cache_hits += 1
+                return entry["answer"], True
+
+        self.cache_misses += 1
+        return None, False
+
+    def update(self, question, answer, is_correct=False):
+        """Update the cache with a new response.
+
+        Args:
+            question: The question text
+            answer: The agent's answer
+            is_correct: Whether the answer was correct
+        """
+        question_hash = self.get_hash(question)
+        self.cache[question_hash] = {
+            "question": question,
+            "answer": answer,
+            "is_correct": is_correct
+        }
+        self._save_cache()
+
+    def get_stats(self):
+        """Get cache statistics."""
+        total_entries = len(self.cache)
+        correct_entries = sum(1 for entry in self.cache.values() if entry.get("is_correct", False))
+
+        return {
+            "total_cached": total_entries,
+            "correct_cached": correct_entries,
+            "session_hits": self.cache_hits,
+            "session_misses": self.cache_misses
+        }
+
+
+def run_and_submit_all(profile: gr.OAuthProfile | None):
     """
     Fetches all questions, runs the BasicAgent on them, submits all answers,
-    and displays the results.
+    and displays the results. Uses caching to avoid re-processing questions
+    with known correct answers.
     """
     # --- Determine HF Space Runtime URL and Repo URL ---
     space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
 
     if profile:
-        username= f"{profile.username}"
+        username = f"{profile.username}"
         print(f"User logged in: {username}")
     else:
         print("User not logged in.")
@@ -132,17 +223,17 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
     api_url = DEFAULT_API_URL
     questions_url = f"{api_url}/questions"
     submit_url = f"{api_url}/submit"
+
+    # Initialize the cache
+    cache = ResponseCache()
+    print(f"Cache loaded. Stats: {cache.get_stats()}")
 
-    # 1. Instantiate Agent ( modify this part to create your agent)
-    try:
-        agent = BasicAgent()
-    except Exception as e:
-        print(f"Error instantiating agent: {e}")
-        return f"Error initializing agent: {e}", None
-    # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
-    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
-    print(agent_code)
+    # 1. Instantiate Agent (only if needed)
+    agent = None  # We'll lazily initialize the agent only if needed
 
+    # In the case of an app running as a Hugging Face Space, this link points toward your codebase
+    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
+
     # 2. Fetch Questions
     print(f"Fetching questions from: {questions_url}")
     try:
@@ -164,72 +255,144 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
         print(f"An unexpected error occurred fetching questions: {e}")
         return f"An unexpected error occurred fetching questions: {e}", None
 
-    # 3. Run your Agent
+    # 3. Run your Agent (with cache)
     results_log = []
     answers_payload = []
-    print(f"Running agent on {len(questions_data)} questions...")
+    cache_usage = {"hits": 0, "misses": 0}
+
+    print(f"Processing {len(questions_data)} questions...")
+
     for item in questions_data:
         task_id = item.get("task_id")
        question_text = item.get("question")
+
         if not task_id or question_text is None:
             print(f"Skipping item with missing task_id or question: {item}")
             continue
-        try:
-            submitted_answer = agent(question_text)
-            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
-            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
-        except Exception as e:
-            print(f"Error running agent on task {task_id}: {e}")
-            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
+
+        # Try to get the answer from cache
+        cached_answer, is_cache_hit = cache.get(question_text)
+
+        if is_cache_hit:
+            # Use cached answer
+            submitted_answer = cached_answer
+            cache_usage["hits"] += 1
+            print(f"✅ Cache hit for task {task_id}. Using cached answer.")
+        else:
+            # Cache miss - run the agent
+            cache_usage["misses"] += 1
+            print(f"🔄 Cache miss for task {task_id}. Running agent...")
+
+            # Lazy initialization of agent
+            if agent is None:
+                try:
+                    print("Initializing agent...")
+                    agent = BasicAgent()
+                except Exception as e:
+                    print(f"Error instantiating agent: {e}")
+                    return f"Error initializing agent: {e}", None
+
+            try:
+                submitted_answer = agent(question_text)
+            except Exception as e:
+                print(f"Error running agent on task {task_id}: {e}")
+                submitted_answer = f"AGENT ERROR: {e}"
+
+        # Add to results and submission payload
+        answers_payload.append({
+            "task_id": task_id,
+            "submitted_answer": submitted_answer
+        })
+
+        results_log.append({
+            "Task ID": task_id,
+            "Question": question_text,
+            "Submitted Answer": submitted_answer,
+            "From Cache": is_cache_hit
+        })
 
     if not answers_payload:
-        print("Agent did not produce any answers to submit.")
-        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
+        print("No answers to submit.")
+        return "No answers to submit.", pd.DataFrame(results_log)
 
     # 4. Prepare Submission
-    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
-    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
+    submission_data = {
+        "username": username.strip(),
+        "agent_code": agent_code,
+        "answers": answers_payload
+    }
+
+    status_update = (
+        f"Finished processing questions. "
+        f"Cache: {cache_usage['hits']} hits, {cache_usage['misses']} misses. "
+        f"Submitting {len(answers_payload)} answers for user '{username}'..."
+    )
     print(status_update)
 
-    # 5. Submit
+    # 5. Submit and update cache with results
     print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
     try:
         response = requests.post(submit_url, json=submission_data, timeout=60)
         response.raise_for_status()
         result_data = response.json()
+
+        # Update cache with correct answers
+        if "task_results" in result_data:
+            cache_updates = 0
+            for task_result in result_data["task_results"]:
+                task_id = task_result.get("task_id")
+                is_correct = task_result.get("is_correct", False)
+
+                # Find the matching question and answer
+                for item in questions_data:
+                    if item.get("task_id") == task_id:
+                        question = item.get("question")
+
+                        # Find the matching submitted answer
+                        for answer_item in answers_payload:
+                            if answer_item.get("task_id") == task_id:
+                                answer = answer_item.get("submitted_answer")
+
+                                # Only cache correct answers
+                                if is_correct:
+                                    cache.update(question, answer, is_correct=True)
+                                    cache_updates += 1
+                                break
+
+            print(f"Updated cache with {cache_updates} correct answers.")
+
+        # Prepare final status message
+        cache_stats = cache.get_stats()
         final_status = (
             f"Submission Successful!\n"
             f"User: {result_data.get('username')}\n"
             f"Overall Score: {result_data.get('score', 'N/A')}% "
             f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
+            f"Cache Performance: {cache_usage['hits']} hits, {cache_usage['misses']} misses\n"
+            f"Total Cached Correct Answers: {cache_stats['correct_cached']}\n"
             f"Message: {result_data.get('message', 'No message received.')}"
         )
-        print("Submission successful.")
+
+        # Add cache information to results dataframe
         results_df = pd.DataFrame(results_log)
+
+        # If the response includes detailed results, add correctness to the DataFrame
+        if "task_results" in result_data:
+            # Create a mapping of task_id to correctness
+            correctness_map = {
+                result["task_id"]: result["is_correct"]
+                for result in result_data["task_results"]
+            }
+
+            # Add a column for correctness
+            results_df["Is Correct"] = results_df["Task ID"].map(
+                lambda x: correctness_map.get(x, "Unknown")
            )
+
         return final_status, results_df
-    except requests.exceptions.HTTPError as e:
-        error_detail = f"Server responded with status {e.response.status_code}."
-        try:
-            error_json = e.response.json()
-            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
-        except requests.exceptions.JSONDecodeError:
-            error_detail += f" Response: {e.response.text[:500]}"
-        status_message = f"Submission Failed: {error_detail}"
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
-    except requests.exceptions.Timeout:
-        status_message = "Submission Failed: The request timed out."
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
-    except requests.exceptions.RequestException as e:
-        status_message = f"Submission Failed: Network error - {e}"
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
+
     except Exception as e:
-        status_message = f"An unexpected error occurred during submission: {e}"
+        status_message = f"Submission Failed: {str(e)}"
         print(status_message)
         results_df = pd.DataFrame(results_log)
         return status_message, results_df
@@ -237,19 +400,21 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
 
 # --- Build Gradio Interface using Blocks ---
 with gr.Blocks() as demo:
-    gr.Markdown("# Basic Agent Evaluation Runner")
+    gr.Markdown("# Basic Agent Evaluation Runner (with Caching)")
     gr.Markdown(
         """
        **Instructions:**
 
-        1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
-        2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
-        3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
+        1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc...
+        2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
+        3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
 
+        **Caching Enabled**: Correct answers are cached between runs to speed up evaluation.
+
         ---
        **Disclaimers:**
-        Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
-        This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
+        Once clicking on the "submit" button, it can take quite some time (this is the time for the agent to go through all the questions).
+        This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution.
         """
     )
 
@@ -258,14 +423,24 @@ with gr.Blocks() as demo:
     run_button = gr.Button("Run Evaluation & Submit All Answers")
 
     status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
-    # Removed max_rows=10 from DataFrame constructor
     results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
 
+    # Display current cache status
+    cache = ResponseCache()
+    cache_stats = cache.get_stats()
+
+    gr.Markdown(
+        f"**Cache Status**: {cache_stats['correct_cached']} correct answers cached out of {cache_stats['total_cached']} total entries."
+    )
+
     run_button.click(
         fn=run_and_submit_all,
         outputs=[status_output, results_table]
     )
 
+
+
+# Add these imports to your existing imports
 if __name__ == "__main__":
     print("\n" + "-"*30 + " App Starting " + "-"*30)
     # Check for SPACE_HOST and SPACE_ID at startup for information
@@ -288,4 +463,5 @@ if __name__ == "__main__":
     print("-"*(60 + len(" App Starting ")) + "\n")
 
     print("Launching Gradio Interface for Basic Agent Evaluation...")
-    demo.launch(debug=True, share=False)
+    demo.launch(debug=True, share=False)
+
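
For reference, a minimal sketch of how the `ResponseCache` added above would be exercised on its own (the question and answer here are made up; the file name and stats keys follow the diff):

```python
# Minimal usage sketch of the ResponseCache introduced in this commit.
cache = ResponseCache(cache_file="agent_cache.json")

answer, hit = cache.get("What is 2 + 2?")
if not hit:
    answer = "4"  # would normally come from BasicAgent
    # Entries are only reused on later runs if they were marked correct.
    cache.update("What is 2 + 2?", answer, is_correct=True)

print(cache.get_stats())
# {'total_cached': 1, 'correct_cached': 1, 'session_hits': 0, 'session_misses': 1}
```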