AdnanElAssadi committed on
Commit ac98842 · verified · 1 Parent(s): 2ce9eb5

Update app.py

Files changed (1)
  1. app.py +348 -299
app.py CHANGED
@@ -97,7 +97,7 @@ def create_reranking_interface(task_data):
         """.format(instructions=task_data.get("instructions", "Rank documents by their relevance to the query.")))
 
         current_sample_id = gr.State(value=samples[0]["id"])
-        current_state = gr.State(value={"auto_save_enabled": True, "last_saved": time.time()})
+        auto_save_enabled = gr.State(value=True)
 
         with gr.Row():
             progress_text = gr.Textbox(label="Progress", value=f"Progress: 0/{len(samples)}", interactive=False)
@@ -135,13 +135,11 @@ def create_reranking_interface(task_data):
                     doc_containers.append(doc_box)
 
                 with gr.Column(scale=1):
-                    # Use Radio buttons for ranking rather than dropdowns
-                    # This provides a more visual and error-resistant interface
-                    rank_input = gr.Radio(
+                    # Use Dropdown instead of Radio for compatibility with Gradio 3.x
+                    rank_input = gr.Dropdown(
                         choices=[str(j) for j in range(1, len(samples[0]["candidates"])+1)],
                         label=f"Rank",
-                        value="",
-                        interactive=True
+                        value=""
                     )
                     ranking_inputs.append(rank_input)
 
@@ -170,13 +168,13 @@ def create_reranking_interface(task_data):
                 else:
                     results.append("✓")
 
-            return results, all_valid
+            return results + [all_valid]  # Return validation indicators and validity flag
 
         def load_sample(sample_id):
             """Load a specific sample into the interface."""
            sample = next((s for s in samples if s["id"] == sample_id), None)
             if not sample:
-                return [query_text.value] + [d.value for d in doc_containers] + [""] * len(ranking_inputs) + validation_indicators + [sample_id, progress_text.value, status_box.value]
+                return [query_text.value] + [d.value for d in doc_containers] + [""] * len(ranking_inputs) + [""] * len(validation_indicators) + [sample_id, progress_text.value, status_box.value]
 
             # Update query
             new_query = sample["query"]
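Context for the hunk above: a Gradio callback wired to several output components must return one value per component, so validate_rankings now returns a single flat list — one indicator string per document plus the overall validity flag as the last element — instead of a (list, flag) tuple. A minimal sketch of the convention, with hypothetical data rather than the app's full logic:

    # Indicators first, validity flag last, so one return value can drive
    # N indicator components and still carry the overall flag.
    def validate_rankings(*rankings):
        results = ["❌ Missing" if not r else "✓" for r in rankings]
        all_valid = all(bool(r) for r in rankings)
        return results + [all_valid]

    values = validate_rankings("1", "2", "")
    indicators, all_valid = values[:-1], values[-1]  # unpack as the callers below do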
@@ -207,9 +205,10 @@ def create_reranking_interface(task_data):
                 new_status += " (already completed)"
 
             # Initialize validation indicators
-            validation_results, _ = validate_rankings(*new_rankings)
+            validation_results = validate_rankings(*new_rankings)
+            validation_indicators_values = validation_results[:-1]  # Remove validity flag
 
-            return [new_query] + new_docs + new_rankings + validation_results + [sample_id, new_progress, new_status]
+            return [new_query] + new_docs + new_rankings + validation_indicators_values + [sample_id, new_progress, new_status]
 
         def auto_save_and_navigate(direction, current_id, auto_save, *rankings):
             """Save rankings if auto-save is enabled, then navigate."""
@@ -222,7 +221,8 @@ def create_reranking_interface(task_data):
 
             if auto_save:
                 # Only save if all rankings are provided
-                validation_results, all_valid = validate_rankings(*actual_rankings)
+                validation_results = validate_rankings(*actual_rankings)
+                all_valid = validation_results[-1]  # Last item is validity flag
                 if all_valid:
                     status_msg, progress_msg = save_ranking(actual_rankings, current_id)
 
@@ -265,7 +265,6 @@ def create_reranking_interface(task_data):
             try:
                 with open(output_path, "w") as f:
                     json.dump(results, f, indent=2)
-                current_state.value["last_saved"] = time.time()
                 return f"✅ Results saved to {output_path} ({len(results['annotations'])} annotations)"
             except Exception as e:
                 return f"Error saving results: {str(e)}"
@@ -281,7 +280,7 @@ def create_reranking_interface(task_data):
 
         # Function to clear all rankings
         def clear_rankings():
-            return ["" for _ in range(len(samples[0]["candidates"]))]
+            return [""] * len(samples[0]["candidates"])
 
         # Define a function that collects all ranking values and validates them
         def submit_rankings(*args):
@@ -297,11 +296,14 @@ def create_reranking_interface(task_data):
             rankings = args[:len(ranking_inputs)]
 
             # First validate the rankings
-            validation_results, all_valid = validate_rankings(*rankings)
+            validation_results = validate_rankings(*rankings)
+            all_valid = validation_results[-1]  # Last item is validity flag
+            validation_indicators_values = validation_results[:-1]  # Remove validity flag
 
             # Update validation indicators
-            for i, result in enumerate(validation_results):
-                validation_indicators[i].update(value=result)
+            for i, result in enumerate(validation_indicators_values):
+                if i < len(validation_indicators):
+                    validation_indicators[i].update(value=result)
 
             # If not all valid, return error message
             if not all_valid:
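The final hunk below rewires every event to the explicit fn=/inputs=/outputs= keyword form and replaces the chained .then() navigation with single handler functions. For reference, a minimal sketch of that wiring style, with hypothetical components:

    import gradio as gr

    with gr.Blocks() as demo:
        name = gr.Textbox(label="Name")
        greet_btn = gr.Button("Greet")
        greeting = gr.Textbox(label="Greeting")

        greet_btn.click(
            fn=lambda n: f"Hello, {n}!",  # one return value per output component
            inputs=[name],
            outputs=[greeting],
        )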
@@ -311,347 +313,394 @@ def create_reranking_interface(task_data):
             status, progress = save_ranking(rankings, sample_id)
             return status, progress
 
-        # Connect events - Direct input/output connections for reliability
+        # Wire up events (Gradio 3.x syntax)
         submit_btn.click(
-            submit_rankings,
+            fn=submit_rankings,
             inputs=ranking_inputs + [current_sample_id],
             outputs=[status_box, progress_text]
         )
 
-        # Apply auto-save before navigation if enabled
+        # Auto-save and navigate events
+        def handle_next(current_id, auto_save, *rankings):
+            # First, handle auto-save
+            new_id, status, progress = auto_save_and_navigate("next", current_id, auto_save, *rankings)
+            # Then, load the new sample
+            outputs = load_sample(new_id)
+            # Add the status and progress
+            outputs[-2] = progress if status else outputs[-2]
+            outputs[-1] = status if status else outputs[-1]
+            return outputs
+
+        def handle_prev(current_id, auto_save, *rankings):
+            # First, handle auto-save
+            new_id, status, progress = auto_save_and_navigate("prev", current_id, auto_save, *rankings)
+            # Then, load the new sample
+            outputs = load_sample(new_id)
+            # Add the status and progress
+            outputs[-2] = progress if status else outputs[-2]
+            outputs[-1] = status if status else outputs[-1]
+            return outputs
+
+        # Connect navigation with Gradio 3.x syntax
         next_btn.click(
-            auto_save_and_navigate,
-            inputs=["next", current_sample_id, auto_save_toggle] + ranking_inputs,
-            outputs=[current_sample_id, status_box, progress_text]
-        ).then(
-            load_sample,
-            inputs=[current_sample_id],
+            fn=handle_next,
+            inputs=[current_sample_id, auto_save_toggle] + ranking_inputs,
             outputs=[query_text] + doc_containers + ranking_inputs + validation_indicators + [current_sample_id, progress_text, status_box]
         )
 
         prev_btn.click(
-            auto_save_and_navigate,
-            inputs=["prev", current_sample_id, auto_save_toggle] + ranking_inputs,
-            outputs=[current_sample_id, status_box, progress_text]
-        ).then(
-            load_sample,
-            inputs=[current_sample_id],
+            fn=handle_prev,
+            inputs=[current_sample_id, auto_save_toggle] + ranking_inputs,
             outputs=[query_text] + doc_containers + ranking_inputs + validation_indicators + [current_sample_id, progress_text, status_box]
         )
 
         # Connect quick ranking buttons
         sequential_btn.click(
-            assign_sequential_ranks,
+            fn=assign_sequential_ranks,
+            inputs=None,
             outputs=ranking_inputs
         )
 
         reverse_btn.click(
-            assign_reverse_ranks,
+            fn=assign_reverse_ranks,
+            inputs=None,
             outputs=ranking_inputs
         )
 
         clear_btn.click(
-            clear_rankings,
+            fn=clear_rankings,
+            inputs=None,
             outputs=ranking_inputs
         )
 
         # Connect save button
-        save_btn.click(save_results, outputs=[status_box])
-
-        # Add validation on ranking changes
-        for i, ranking in enumerate(ranking_inputs):
-            ranking.change(
-                validate_rankings,
-                inputs=ranking_inputs,
-                outputs=validation_indicators + [gr.State(value=None)] # Add dummy output to match function return
-            )
+        save_btn.click(
+            fn=save_results,
+            inputs=None,
+            outputs=[status_box]
+        )
 
-        # Set up auto-save feature
+        # Connect auto-save toggle
+        def update_auto_save(enabled):
+            return enabled
+
         auto_save_toggle.change(
-            lambda x: {"auto_save_enabled": x},
+            fn=update_auto_save,
             inputs=[auto_save_toggle],
-            outputs=[current_state]
+            outputs=[auto_save_enabled]
         )
 
     return demo
 
 # Main app with file upload capability and improved task management
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# MTEB Human Evaluation Demo")
-
-    with gr.Tabs():
-        with gr.TabItem("Demo"):
-            gr.Markdown("""
-            ## MTEB Human Evaluation Interface
-
-            This interface allows you to evaluate the relevance of documents for reranking tasks.
-            """)
-
-            # Function to get the most recent task file
-            def get_latest_task_file():
-                # Check first in uploaded_tasks directory
-                os.makedirs("uploaded_tasks", exist_ok=True)
-                uploaded_tasks = [f for f in os.listdir("uploaded_tasks") if f.endswith(".json")]
-
-                if uploaded_tasks:
-                    # Sort by modification time, newest first
-                    uploaded_tasks.sort(key=lambda x: os.path.getmtime(os.path.join("uploaded_tasks", x)), reverse=True)
-                    task_path = os.path.join("uploaded_tasks", uploaded_tasks[0])
-
-                    # Verify this is a valid task file
-                    try:
-                        with open(task_path, "r") as f:
-                            task_data = json.load(f)
-                        if "task_name" in task_data and "samples" in task_data:
-                            return task_path
-                    except:
-                        pass
-
-                # Look for task files in the current directory
-                current_dir_tasks = [f for f in os.listdir(".") if f.endswith("_human_eval.json")]
-                if current_dir_tasks:
-                    # Sort by modification time, newest first
-                    current_dir_tasks.sort(key=lambda x: os.path.getmtime(x), reverse=True)
-                    return current_dir_tasks[0]
-
-                # Fall back to fixed example if available
-                if os.path.exists("AskUbuntuDupQuestions_human_eval.json"):
-                    return "AskUbuntuDupQuestions_human_eval.json"
-
-                # No valid task file found
-                return None
-
-            # Load the task file
-            task_file = get_latest_task_file()
-
-            if task_file:
-                try:
-                    with open(task_file, "r") as f:
-                        task_data = json.load(f)
-
-                    # Show which task is currently loaded
-                    gr.Markdown(f"**Current Task: {task_data['task_name']}** ({len(task_data['samples'])} samples)")
-
-                    # Display the interface
-                    reranking_demo = create_reranking_interface(task_data)
-                except Exception as e:
-                    gr.Markdown(f"**Error loading task: {str(e)}**")
-                    gr.Markdown("Please upload a valid task file in the 'Upload & Evaluate' tab.")
-            else:
-                gr.Markdown("**No task file found**")
-                gr.Markdown("Please upload a valid task file in the 'Upload & Evaluate' tab.")
-
- with gr.TabItem("Upload & Evaluate"):
445
- gr.Markdown("""
446
- ## Upload Your Own Task File
447
-
448
- If you have a prepared task file, you can upload it here to create an evaluation interface.
449
- """)
450
-
451
- with gr.Row():
452
- with gr.Column(scale=1):
453
- file_input = gr.File(label="Upload a task file (JSON)")
454
- load_btn = gr.Button("Load Task")
455
- message = gr.Textbox(label="Status", interactive=False)
456
-
457
- # Add task list for previously uploaded tasks
458
- gr.Markdown("### Previous Uploads")
459
-
460
- # Function to list existing task files in the tasks directory
461
- def list_task_files():
462
- os.makedirs("uploaded_tasks", exist_ok=True)
463
- tasks = [f for f in os.listdir("uploaded_tasks") if f.endswith(".json")]
464
- if not tasks:
465
- return "No task files uploaded yet."
466
- return "\n".join([f"- {t}" for t in tasks])
467
-
468
- task_list = gr.Markdown(list_task_files())
469
- refresh_btn = gr.Button("Refresh List")
470
-
471
- # Add results management section
472
- gr.Markdown("### Results Management")
473
-
474
- # Function to list existing result files
475
- def list_result_files():
476
- results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
477
- if not results:
478
- return "No result files available yet."
479
-
480
- result_links = []
481
- for r in results:
482
- # Calculate completion stats
483
- try:
484
- with open(r, "r") as f:
485
- result_data = json.load(f)
486
- annotation_count = len(result_data.get("annotations", []))
487
- task_name = result_data.get("task_name", "Unknown")
488
- result_links.append(f"- {r} ({annotation_count} annotations for {task_name})")
489
- except:
490
- result_links.append(f"- {r}")
491
-
492
- return "\n".join(result_links)
493
-
494
- results_list = gr.Markdown(list_result_files())
495
- download_results_btn = gr.Button("Download Results")
496
-
497
- # Right side - will contain the actual interface
498
- with gr.Column(scale=2):
499
- task_container = gr.HTML()
500
- loaded_task_info = gr.JSON(label="Loaded Task Information", visible=False)
501
-
502
- # Handle file upload and storage
503
- def handle_upload(file):
504
- if not file:
505
- return "Please upload a task file", task_list.value, task_container.value, loaded_task_info.value
506
 
507
- try:
508
- # Create directory if it doesn't exist
 
509
  os.makedirs("uploaded_tasks", exist_ok=True)
 
510
 
511
- # Read the uploaded file
512
- with open(file.name, "r") as f:
513
- task_data = json.load(f)
514
-
515
- # Validate task format
516
- if "task_name" not in task_data or "samples" not in task_data:
517
- return "Invalid task file format. Must contain 'task_name' and 'samples' fields.", task_list.value, task_container.value, loaded_task_info.value
 
 
 
 
 
 
518
 
519
- # Save to a consistent location
520
- task_filename = f"uploaded_tasks/{task_data['task_name']}_task.json"
521
- with open(task_filename, "w") as f:
522
- json.dump(task_data, f, indent=2)
 
 
523
 
524
- # Show task info
525
- task_info = {
526
- "task_name": task_data["task_name"],
527
- "samples": len(task_data["samples"]),
528
- "file_path": task_filename
529
- }
530
 
531
- return f"Task '{task_data['task_name']}' uploaded successfully with {len(task_data['samples'])} samples. Please refresh the app and use the Demo tab to evaluate it.", list_task_files(), f"""
532
- <div style="padding: 20px; background-color: #f0f0f0; border-radius: 10px;">
533
- <h3>Task uploaded successfully!</h3>
534
- <p>Task Name: {task_data['task_name']}</p>
535
- <p>Samples: {len(task_data['samples'])}</p>
536
- <p>To evaluate this task:</p>
537
- <ol>
538
- <li>Refresh the app</li>
539
- <li>The Demo tab will now use your uploaded task</li>
540
- <li>Complete your evaluations</li>
541
- <li>Results will be saved as {task_data['task_name']}_human_results.json</li>
542
- </ol>
543
- </div>
544
- """, task_info
545
- except Exception as e:
546
- return f"Error processing task file: {str(e)}", task_list.value, task_container.value, loaded_task_info.value
547
-
548
- # Function to prepare results for download
549
- def prepare_results_for_download():
550
- results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
551
- if not results:
552
  return None
553
 
554
- # Create a zip file with all results
555
- import zipfile
556
- zip_path = "mteb_human_eval_results.zip"
557
- with zipfile.ZipFile(zip_path, 'w') as zipf:
558
- for r in results:
559
- zipf.write(r)
560
 
561
- return zip_path
562
-
563
- # Connect events
564
- load_btn.click(handle_upload, inputs=[file_input], outputs=[message, task_list, task_container, loaded_task_info])
565
- refresh_btn.click(list_task_files, outputs=[task_list])
566
- download_results_btn.click(prepare_results_for_download, outputs=[gr.File(label="Download Results")])
567
-
568
- with gr.TabItem("Results Management"):
569
- gr.Markdown("""
570
- ## Manage Evaluation Results
571
-
572
- View, download, and analyze your evaluation results.
573
- """)
 
 
 
 
574
 
575
- # Function to load and display result stats
576
- def get_result_stats():
577
- results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
578
- if not results:
579
- return "No result files available yet."
 
580
 
581
- stats = []
582
- for r in results:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
583
  try:
584
- with open(r, "r") as f:
585
- result_data = json.load(f)
586
 
587
- task_name = result_data.get("task_name", "Unknown")
588
- annotations = result_data.get("annotations", [])
589
- annotation_count = len(annotations)
590
 
591
- # Calculate completion percentage
592
- sample_ids = set(a.get("sample_id") for a in annotations)
 
593
 
594
- # Try to get the total sample count from the corresponding task file
595
- total_samples = 0
 
 
596
 
597
- # Try uploaded_tasks directory first
598
- task_file = f"uploaded_tasks/{task_name}_task.json"
599
- if os.path.exists(task_file):
600
- with open(task_file, "r") as f:
601
- task_data = json.load(f)
602
- total_samples = len(task_data.get("samples", []))
603
- else:
604
- # Try human_eval file in current directory
605
- task_file = f"{task_name}_human_eval.json"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
606
  if os.path.exists(task_file):
607
  with open(task_file, "r") as f:
608
  task_data = json.load(f)
609
  total_samples = len(task_data.get("samples", []))
610
-
611
- completion = f"{len(sample_ids)}/{total_samples}" if total_samples else f"{len(sample_ids)} samples"
612
-
613
- stats.append(f"### {task_name}\n- Annotations: {annotation_count}\n- Completion: {completion}\n- File: {r}")
614
- except Exception as e:
615
- stats.append(f"### {r}\n- Error loading results: {str(e)}")
 
 
 
 
 
 
 
 
 
616
 
617
- return "\n\n".join(stats)
618
-
619
- result_stats = gr.Markdown(get_result_stats())
620
- refresh_results_btn = gr.Button("Refresh Results")
621
-
622
- # Add download options
623
- with gr.Row():
624
- download_all_btn = gr.Button("Download All Results (ZIP)")
625
- result_select = gr.Dropdown(choices=[f for f in os.listdir(".") if f.endswith("_human_results.json")], label="Select Result to Download")
626
- download_selected_btn = gr.Button("Download Selected")
627
-
628
- # Connect events
629
- refresh_results_btn.click(get_result_stats, outputs=[result_stats])
630
-
631
- # Function to prepare all results for download as ZIP
632
- def prepare_all_results():
633
- import zipfile
634
- zip_path = "mteb_human_eval_results.zip"
635
- with zipfile.ZipFile(zip_path, 'w') as zipf:
636
- for r in [f for f in os.listdir(".") if f.endswith("_human_results.json")]:
637
- zipf.write(r)
638
- return zip_path
639
-
640
- # Function to return a single result file
641
- def get_selected_result(filename):
642
- if not filename:
643
  return None
644
- if os.path.exists(filename):
645
- return filename
646
- return None
647
-
648
- # Update dropdown when refreshing results
649
- def update_result_dropdown():
650
- return gr.Dropdown.update(choices=[f for f in os.listdir(".") if f.endswith("_human_results.json")])
651
-
652
- refresh_results_btn.click(update_result_dropdown, outputs=[result_select])
653
- download_all_btn.click(prepare_all_results, outputs=[gr.File(label="Download All Results")])
654
- download_selected_btn.click(get_selected_result, inputs=[result_select], outputs=[gr.File(label="Download Selected Result")])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
655
 
656
  if __name__ == "__main__":
657
  demo.launch()
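For reference, handle_upload and get_latest_task_file only validate the 'task_name' and 'samples' keys; judging from the fields the reranking interface reads (id, query, candidates, and an optional instructions string), an accepted task file presumably looks like this hypothetical example:

    # Hypothetical minimal task file content, inferred from the fields the app reads.
    task = {
        "task_name": "AskUbuntuDupQuestions",
        "instructions": "Rank documents by their relevance to the query.",
        "samples": [
            {
                "id": "sample-1",
                "query": "How do I upgrade to the latest Ubuntu release?",
                "candidates": [
                    "First candidate document...",
                    "Second candidate document...",
                ],
            },
        ],
    }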
 