drAbreu committed on
Commit f289100 · 1 Parent(s): e67eadd

Changed writer to gpt-4o-mini. Back to non-cached due to issues.

Files changed (2)
  1. agents/llama_index_agent.py +9 -0
  2. app.py +56 -257
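The writer-model switch named in the commit message is not itself visible in the hunks below (they only add prompt text), so here is a rough, hypothetical sketch of what passing gpt-4o-mini through `create_writer_agent`'s `model_config` could look like; the dict keys are assumptions, not code from this commit.

```python
# Hypothetical sketch only: the commit message says the writer now uses
# gpt-4o-mini. The config keys below are assumptions, not repo code.
from agents.llama_index_agent import create_writer_agent

writer_config = {
    "model": "gpt-4o-mini",  # writer model named in the commit message
    "temperature": 0.0,      # assumed: keep the answer-only output deterministic
}

writer = create_writer_agent(writer_config)
```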
agents/llama_index_agent.py CHANGED
@@ -260,8 +260,17 @@ def create_writer_agent(model_config: Dict[str, Any]) -> ReActAgent:
     When asked for "comma-separated list in alphabetical order": apple, banana, cherry
     When asked for "single number": 42
     When asked for "opposite of word 'right'": left
+    When asked for "How many ...": eleven
+    When asked for "What says Yoda": "May the force be with you"
+
+    ## CONCRETE EXAMPLE:
+    When asked "The answer to the question of Universe, life and everything"
+    - WRONG ANSWER: The answer to the question is 42.
+    - RIGHT ANSWER: 42
 
     REMEMBER: Your ENTIRE response should be just the answer - nothing more, nothing less.
+
+    DO NOT EXPLAIN THE ANSWER. SIMPLY WRITE BACK THE ANSWER.
     """,
     llm=llm
 )
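The added prompt lines all push the writer to return the bare answer with no explanation. A small sketch of the intended behaviour, assuming the usual llama_index `ReActAgent.chat()` interface and the hypothetical config above:

```python
# Sketch of how the stricter writer prompt is meant to behave.
# Assumes the standard llama_index ReActAgent.chat() interface; the
# config dict is the same hypothetical one shown earlier.
from agents.llama_index_agent import create_writer_agent

writer = create_writer_agent({"model": "gpt-4o-mini"})

reply = writer.chat("What is the opposite of the word 'right'?")
print(str(reply))  # expected output: left  (just the answer, no explanation)
```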
app.py CHANGED
@@ -119,124 +119,16 @@ class BasicAgent:
         return final_answer
 
 
-class ResponseCache:
-    """Cache manager for storing and retrieving agent responses with persistence across HF rebuilds."""
-
-    def __init__(self, cache_name="agent_responses"):
-        """Initialize the cache manager.
-
-        Args:
-            cache_name: Base name for the cache file
-        """
-        # Use /data directory for persistence in HF Spaces
-        # Fall back to local directory if running locally
-        if os.path.exists("/data") and os.access("/data", os.W_OK):
-            self.cache_dir = Path("/data")
-            print("Using HF Spaces persistent storage in /data directory")
-        else:
-            self.cache_dir = Path(".")
-            print("Using local directory for cache (not persistent across HF rebuilds)")
-
-        # Ensure directory exists
-        os.makedirs(self.cache_dir, exist_ok=True)
-
-        # Full path to cache file
-        self.cache_file = self.cache_dir / f"{cache_name}.json"
-        print(f"Cache file location: {self.cache_file}")
-
-        # Load the cache
-        self.cache = self._load_cache()
-
-        # Stats for the current session
-        self.cache_hits = 0
-        self.cache_misses = 0
-
-    def _load_cache(self):
-        """Load the cache from disk."""
-        try:
-            if os.path.exists(self.cache_file):
-                with open(self.cache_file, 'r') as f:
-                    cache_data = json.load(f)
-                print(f"Cache loaded with {len(cache_data)} entries")
-                return cache_data
-            print("No existing cache found, starting with empty cache")
-            return {}
-        except Exception as e:
-            print(f"Error loading cache: {e}. Starting with empty cache.")
-            return {}
-
-    def _save_cache(self):
-        """Save the cache to disk."""
-        try:
-            with open(self.cache_file, 'w') as f:
-                json.dump(self.cache, f)
-            print(f"Cache saved with {len(self.cache)} entries")
-        except Exception as e:
-            print(f"Error saving cache: {e}")
-
-    def get_hash(self, question):
-        """Create a consistent hash for a question."""
-        return hashlib.md5(question.encode('utf-8')).hexdigest()
-
-    def get(self, question):
-        """Get a cached response if available.
-
-        Returns:
-            tuple: (cached_answer, hit_status)
-                - cached_answer: The cached answer or None if not found
-                - hit_status: True if cache hit, False if miss
-        """
-        question_hash = self.get_hash(question)
-        if question_hash in self.cache:
-            # Only return answers marked as correct
-            entry = self.cache[question_hash]
-            if entry.get("is_correct", False):
-                self.cache_hits += 1
-                return entry["answer"], True
-
-        self.cache_misses += 1
-        return None, False
-
-    def update(self, question, answer, is_correct=False):
-        """Update the cache with a new response.
-
-        Args:
-            question: The question text
-            answer: The agent's answer
-            is_correct: Whether the answer was correct
-        """
-        question_hash = self.get_hash(question)
-        self.cache[question_hash] = {
-            "question": question,
-            "answer": answer,
-            "is_correct": is_correct
-        }
-        self._save_cache()
-
-    def get_stats(self):
-        """Get cache statistics."""
-        total_entries = len(self.cache)
-        correct_entries = sum(1 for entry in self.cache.values() if entry.get("is_correct", False))
-
-        return {
-            "total_cached": total_entries,
-            "correct_cached": correct_entries,
-            "session_hits": self.cache_hits,
-            "session_misses": self.cache_misses
-        }
-
-
-def run_and_submit_all(profile: gr.OAuthProfile | None):
+def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
     Fetches all questions, runs the BasicAgent on them, submits all answers,
-    and displays the results. Uses caching to avoid re-processing questions
-    with known correct answers.
+    and displays the results.
     """
     # --- Determine HF Space Runtime URL and Repo URL ---
     space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
 
     if profile:
-        username = f"{profile.username}"
+        username= f"{profile.username}"
         print(f"User logged in: {username}")
     else:
         print("User not logged in.")
@@ -245,17 +137,17 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     api_url = DEFAULT_API_URL
     questions_url = f"{api_url}/questions"
     submit_url = f"{api_url}/submit"
-
-    # Initialize the cache
-    cache = ResponseCache()
-    print(f"Cache loaded. Stats: {cache.get_stats()}")
 
-    # 1. Instantiate Agent (only if needed)
-    agent = None # We'll lazily initialize the agent only if needed
-
-    # In the case of an app running as a hugging Face space, this link points toward your codebase
+    # 1. Instantiate Agent ( modify this part to create your agent)
+    try:
+        agent = BasicAgent()
+    except Exception as e:
+        print(f"Error instantiating agent: {e}")
+        return f"Error initializing agent: {e}", None
+    # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
-
+    print(agent_code)
+
     # 2. Fetch Questions
     print(f"Fetching questions from: {questions_url}")
     try:
@@ -277,144 +169,72 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         print(f"An unexpected error occurred fetching questions: {e}")
         return f"An unexpected error occurred fetching questions: {e}", None
 
-    # 3. Run your Agent (with cache)
+    # 3. Run your Agent
     results_log = []
     answers_payload = []
-    cache_usage = {"hits": 0, "misses": 0}
-
-    print(f"Processing {len(questions_data)} questions...")
-
+    print(f"Running agent on {len(questions_data)} questions...")
     for item in questions_data:
         task_id = item.get("task_id")
        question_text = item.get("question")
-
        if not task_id or question_text is None:
            print(f"Skipping item with missing task_id or question: {item}")
            continue
-
-        # Try to get the answer from cache
-        cached_answer, is_cache_hit = cache.get(question_text)
-
-        if is_cache_hit:
-            # Use cached answer
-            submitted_answer = cached_answer
-            cache_usage["hits"] += 1
-            print(f"✅ Cache hit for task {task_id}. Using cached answer.")
-        else:
-            # Cache miss - run the agent
-            cache_usage["misses"] += 1
-            print(f"🔄 Cache miss for task {task_id}. Running agent...")
-
-            # Lazy initialization of agent
-            if agent is None:
-                try:
-                    print("Initializing agent...")
-                    agent = BasicAgent()
-                except Exception as e:
-                    print(f"Error instantiating agent: {e}")
-                    return f"Error initializing agent: {e}", None
-
-            try:
-                submitted_answer = agent(question_text)
-            except Exception as e:
-                print(f"Error running agent on task {task_id}: {e}")
-                submitted_answer = f"AGENT ERROR: {e}"
-
-        # Add to results and submission payload
-        answers_payload.append({
-            "task_id": task_id,
-            "submitted_answer": submitted_answer
-        })
-
-        results_log.append({
-            "Task ID": task_id,
-            "Question": question_text,
-            "Submitted Answer": submitted_answer,
-            "From Cache": is_cache_hit
-        })
+        try:
+            submitted_answer = agent(question_text)
+            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
+            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
+        except Exception as e:
+            print(f"Error running agent on task {task_id}: {e}")
+            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
 
     if not answers_payload:
-        print("No answers to submit.")
-        return "No answers to submit.", pd.DataFrame(results_log)
+        print("Agent did not produce any answers to submit.")
+        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
 
     # 4. Prepare Submission
-    submission_data = {
-        "username": username.strip(),
-        "agent_code": agent_code,
-        "answers": answers_payload
-    }
-
-    status_update = (
-        f"Finished processing questions. "
-        f"Cache: {cache_usage['hits']} hits, {cache_usage['misses']} misses. "
-        f"Submitting {len(answers_payload)} answers for user '{username}'..."
-    )
+    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
+    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
     print(status_update)
 
-    # 5. Submit and update cache with results
+    # 5. Submit
     print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
     try:
         response = requests.post(submit_url, json=submission_data, timeout=60)
         response.raise_for_status()
         result_data = response.json()
-
-        # Update cache with correct answers
-        if "task_results" in result_data:
-            cache_updates = 0
-            for task_result in result_data["task_results"]:
-                task_id = task_result.get("task_id")
-                is_correct = task_result.get("is_correct", False)
-
-                # Find the matching question and answer
-                for item in questions_data:
-                    if item.get("task_id") == task_id:
-                        question = item.get("question")
-
-                        # Find the matching submitted answer
-                        for answer_item in answers_payload:
-                            if answer_item.get("task_id") == task_id:
-                                answer = answer_item.get("submitted_answer")
-
-                                # Only cache correct answers
-                                if is_correct:
-                                    cache.update(question, answer, is_correct=True)
-                                    cache_updates += 1
-                                break
-
-            print(f"Updated cache with {cache_updates} correct answers.")
-
-        # Prepare final status message
-        cache_stats = cache.get_stats()
         final_status = (
             f"Submission Successful!\n"
             f"User: {result_data.get('username')}\n"
             f"Overall Score: {result_data.get('score', 'N/A')}% "
             f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
-            f"Cache Performance: {cache_usage['hits']} hits, {cache_usage['misses']} misses\n"
-            f"Total Cached Correct Answers: {cache_stats['correct_cached']}\n"
             f"Message: {result_data.get('message', 'No message received.')}"
         )
-
-        # Add cache information to results dataframe
+        print("Submission successful.")
         results_df = pd.DataFrame(results_log)
-
-        # If the response includes detailed results, add correctness to the DataFrame
-        if "task_results" in result_data:
-            # Create a mapping of task_id to correctness
-            correctness_map = {
-                result["task_id"]: result["is_correct"]
-                for result in result_data["task_results"]
-            }
-
-            # Add a column for correctness
-            results_df["Is Correct"] = results_df["Task ID"].map(
-                lambda x: correctness_map.get(x, "Unknown")
-            )
-
         return final_status, results_df
-
+    except requests.exceptions.HTTPError as e:
+        error_detail = f"Server responded with status {e.response.status_code}."
+        try:
+            error_json = e.response.json()
+            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
+        except requests.exceptions.JSONDecodeError:
+            error_detail += f" Response: {e.response.text[:500]}"
+        status_message = f"Submission Failed: {error_detail}"
+        print(status_message)
+        results_df = pd.DataFrame(results_log)
+        return status_message, results_df
+    except requests.exceptions.Timeout:
+        status_message = "Submission Failed: The request timed out."
+        print(status_message)
+        results_df = pd.DataFrame(results_log)
+        return status_message, results_df
+    except requests.exceptions.RequestException as e:
+        status_message = f"Submission Failed: Network error - {e}"
+        print(status_message)
+        results_df = pd.DataFrame(results_log)
+        return status_message, results_df
     except Exception as e:
-        status_message = f"Submission Failed: {str(e)}"
+        status_message = f"An unexpected error occurred during submission: {e}"
         print(status_message)
         results_df = pd.DataFrame(results_log)
         return status_message, results_df
@@ -422,21 +242,17 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
 
 # --- Build Gradio Interface using Blocks ---
 with gr.Blocks() as demo:
-    gr.Markdown("# Basic Agent Evaluation Runner (with Caching)")
+    gr.Markdown("# Basic Agent Evaluation Runner")
     gr.Markdown(
         """
         **Instructions:**
-
-        1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc...
-        2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
-        3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
-
-        **Caching Enabled**: Correct answers are cached between runs to speed up evaluation.
-
+        1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
+        2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
+        3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
         ---
         **Disclaimers:**
-        Once clicking on the "submit button, it can take quite some time (this is the time for the agent to go through all the questions).
-        This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution.
+        Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
+        This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
         """
     )
 
@@ -445,24 +261,14 @@ with gr.Blocks() as demo:
     run_button = gr.Button("Run Evaluation & Submit All Answers")
 
     status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
+    # Removed max_rows=10 from DataFrame constructor
     results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
 
-    # Display current cache status
-    cache = ResponseCache()
-    cache_stats = cache.get_stats()
-
-    gr.Markdown(
-        f"**Cache Status**: {cache_stats['correct_cached']} correct answers cached out of {cache_stats['total_cached']} total entries."
-    )
-
     run_button.click(
         fn=run_and_submit_all,
         outputs=[status_output, results_table]
    )
 
-
-
-# Add these imports to your existing imports
 if __name__ == "__main__":
     print("\n" + "-"*30 + " App Starting " + "-"*30)
     # Check for SPACE_HOST and SPACE_ID at startup for information
@@ -484,12 +290,5 @@ if __name__ == "__main__":
 
     print("-"*(60 + len(" App Starting ")) + "\n")
 
-    # Check cache persistence
-    cache = ResponseCache()
-    stats = cache.get_stats()
-    print(f"Cache loaded with {stats['correct_cached']} correct answers out of {stats['total_cached']} total entries")
-
-
     print("Launching Gradio Interface for Basic Agent Evaluation...")
     demo.launch(debug=True, share=False)
-
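The disclaimer above suggests caching the answers or answering the questions asynchronously rather than blocking the submit button. A minimal sketch of the async variant, assuming `BasicAgent` stays a synchronous callable as in app.py; everything except `BasicAgent` and the payload keys is illustrative only.

```python
# Hedged sketch of the "answer the questions in async" idea from the
# disclaimer: run the blocking agent in worker threads instead of a
# sequential loop. Names other than BasicAgent and the payload keys
# are illustrative, not part of this commit.
import asyncio

async def answer_all(agent, questions_data):
    """Answer every question concurrently using worker threads."""
    async def answer_one(item):
        task_id = item.get("task_id")
        question = item.get("question")
        # agent(question) is blocking, so push it off the event loop
        answer = await asyncio.to_thread(agent, question)
        return {"task_id": task_id, "submitted_answer": answer}

    return await asyncio.gather(*(answer_one(item) for item in questions_data))

# Inside run_and_submit_all, for example:
# answers_payload = asyncio.run(answer_all(agent, questions_data))
```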