Files changed (8)
  1. app.py +458 -664
  2. common.py +39 -14
  3. data/models.jsonl +16 -28
  4. gen_api_answer.py +51 -414
  5. leaderboard.py +0 -116
  6. prompts.py +0 -210
  7. random_sample_generation.py +0 -183
  8. requirements.txt +0 -3
app.py CHANGED
@@ -2,57 +2,36 @@ import json
2
  import re
3
  import random
4
  from collections import defaultdict
5
- from datetime import datetime
6
  import hashlib
7
- import gradio as gr
8
 
9
  from dotenv import load_dotenv
10
- load_dotenv()
11
 
12
- from gen_api_answer import (
13
- get_model_response,
14
- parse_model_response,
15
- prometheus_parse_model_response,
16
- salesforce_parse_model_response,
17
- flow_judge_parse_model_response
18
- )
19
 
20
- from random_sample_generation import (
21
- get_random_human_ai_pair,
22
- get_random_human_ai_ground_truth_pair,
23
- generate_ai_response
24
- )
25
  from db import add_vote, create_db_connection, get_votes
26
-
27
  from utils import Vote
28
-
29
  from common import (
30
  POLICY_CONTENT,
31
  ACKNOWLEDGEMENTS,
 
 
 
32
  CSS_STYLES,
33
  MAIN_TITLE,
34
  HOW_IT_WORKS,
 
 
 
35
  )
36
- from prompts import (
37
- DEFAULT_EVAL_PROMPT,
38
- DEFAULT_EVAL_PROMPT_EDITABLE,
39
- FIXED_EVAL_SUFFIX,
40
- DEFAULT_EVAL_CRITERIA,
41
- DEFAULT_SCORE_1,
42
- DEFAULT_SCORE_2,
43
- DEFAULT_SCORE_3,
44
- DEFAULT_SCORE_4,
45
- DEFAULT_SCORE_5,
46
- )
47
- from leaderboard import (
48
- get_leaderboard,
49
- get_leaderboard_stats,
50
- get_model_rankings,
51
- DEFAULT_ELO,
52
- K_FACTOR
53
- )
54
 
55
 
 
 
 
56
  elo_scores = defaultdict(lambda: DEFAULT_ELO)
57
  vote_counts = defaultdict(int)
58
 
@@ -73,7 +52,6 @@ def load_model_data():
73
  "organization": model["organization"],
74
  "license": model["license"],
75
  "api_model": model["api_model"],
76
- "active": model["active"]
77
  }
78
  except FileNotFoundError:
79
  print("Warning: models.jsonl not found")
@@ -84,11 +62,9 @@ def load_model_data():
84
  model_data = load_model_data()
85
 
86
  def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
87
- prompt_value = prompt.value if hasattr(prompt, 'value') else prompt
88
-
89
  vote = Vote(
90
  timestamp=datetime.now().isoformat(),
91
- prompt=prompt_value,
92
  response_a=response_a,
93
  response_b=response_b,
94
  model_a=model_a,
@@ -117,6 +93,40 @@ def get_final_prompt(eval_prompt, variable_values):
117
  return eval_prompt
118
 
119
 
120
 
121
  def get_ip(request: gr.Request) -> str:
122
  """Get and hash the IP address from the request."""
@@ -133,26 +143,6 @@ def get_ip(request: gr.Request) -> str:
133
  return hashlib.sha256(ip.encode()).hexdigest()[:16]
134
 
135
 
136
- def get_vote_message(choice: str, model_a: str, model_b: str) -> tuple[str, str]:
137
- """Generate appropriate message based on vote and model rankings.
138
- Returns (title, message) tuple."""
139
- # Get current rankings
140
- voting_data = get_current_votes()
141
- leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
142
- rankings = get_model_rankings(leaderboard)
143
- pos_a = rankings.get(model_a, 0)
144
- pos_b = rankings.get(model_b, 0)
145
-
146
- if choice == "Tie":
147
- return "It's a tie!", "Keep voting responsibly 🤗"
148
-
149
- # Check if vote aligns with leaderboard
150
- if (choice == "A" and pos_a < pos_b) or (choice == "B" and pos_b < pos_a):
151
- return "The favourite wins!", "Keep voting responsibly 🤗"
152
- else:
153
- return "The underdog wins!", "Keep voting responsibly 🤗"
154
-
155
-
156
  def vote(
157
  choice,
158
  model_a,
@@ -202,39 +192,16 @@ def vote(
202
  store_vote_data(
203
  final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
204
  )
205
-
206
- # Get model positions for display
207
- voting_data = get_current_votes()
208
- leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
209
- rankings = get_model_rankings(leaderboard)
210
- pos_a = rankings.get(model_a, 0)
211
- pos_b = rankings.get(model_b, 0)
212
-
213
- # Format model names with positions and win/loss indicators
214
- if choice == "Tie":
215
- model_a_display = f"*Model: {model_a} (Position #{pos_a})*"
216
- model_b_display = f"*Model: {model_b} (Position #{pos_b})*"
217
- else:
218
- winner = model_a if choice == "A" else model_b
219
- loser = model_b if choice == "A" else model_a
220
- winner_pos = pos_a if choice == "A" else pos_b
221
- loser_pos = pos_b if choice == "A" else pos_a
222
-
223
- model_a_display = f"*Model: {model_a} {'✅' if choice == 'A' else '❌'} (Position #{pos_a})*"
224
- model_b_display = f"*Model: {model_b} {'✅' if choice == 'B' else '❌'} (Position #{pos_b})*"
225
-
226
- # Generate vote message
227
- title, message = get_vote_message(choice, model_a, model_b)
228
-
229
  return [
230
- gr.update(interactive=False, variant="primary" if choice == "A" else "secondary"), # vote_a
231
- gr.update(interactive=False, variant="primary" if choice == "B" else "secondary"), # vote_b
232
- gr.update(interactive=False, variant="primary" if choice == "Tie" else "secondary"), # vote_tie
233
- gr.update(value=model_a_display), # model_name_a
234
- gr.update(value=model_b_display), # model_name_b
235
- gr.update(interactive=True, value="Regenerate judges", variant="secondary"), # send_btn
236
- gr.update(value="🎲 New round", variant="primary"), # random_btn
237
- gr.Info(message, title=title), # success message
238
  ]
239
 
240
 
@@ -243,24 +210,150 @@ def get_current_votes():
243
  return get_votes(db)
244
 
245
 
246
- # Update the refresh_leaderboard function
247
- def refresh_leaderboard(show_preliminary):
248
- """Refresh the leaderboard data and stats."""
249
  voting_data = get_current_votes()
250
- leaderboard = get_leaderboard(model_data, voting_data, show_preliminary)
251
- data = [
252
- [
253
- entry["Model"],
254
- float(entry["ELO Score"]),
255
- entry["95% CI"],
256
- entry["# Votes"],
257
- entry["Organization"],
258
- entry["License"],
259
- ]
260
- for entry in leaderboard
261
- ]
262
- stats = get_leaderboard_stats(model_data, voting_data)
263
- return [gr.update(value=data), gr.update(value=stats)]
 
264
 
265
 
266
  # Update the leaderboard table definition in the UI
@@ -270,30 +363,63 @@ leaderboard_table = gr.Dataframe(
270
  )
271
 
272
 
273
- def populate_random_example(request: gr.Request, compatible_mode: bool):
274
- """Generate a random human-AI conversation example and reset judge outputs."""
275
- if compatible_mode:
276
- # Generate all three components when compatible mode is enabled
277
- human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
278
- else:
279
- # Generate only human and AI messages when compatible mode is disabled
280
- human_msg, ai_msg = get_random_human_ai_pair()
281
- ground_truth_msg = ""
282
-
 
283
  return [
284
  gr.update(value=human_msg),
285
- gr.update(value=ai_msg),
286
- gr.update(value="🎲", variant="secondary"), # Reset random button appearance
287
- gr.update(value=""), # Clear score A
288
- gr.update(value=""), # Clear critique A
289
- gr.update(value=""), # Clear score B
290
- gr.update(value=""), # Clear critique B
291
- gr.update(interactive=False, variant="primary"), # Reset vote A
292
- gr.update(interactive=False, variant="primary"), # Reset vote B
293
- gr.update(interactive=False, variant="primary"), # Reset vote tie
294
- gr.update(value="*Model: Hidden*"), # Reset model name A
295
- gr.update(value="*Model: Hidden*"), # Reset model name B
296
- gr.update(value=ground_truth_msg, visible=compatible_mode), # Set ground truth and visibility
297
  ]
298
 
299
 
@@ -309,43 +435,27 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
309
 
310
  with gr.Tabs():
311
  with gr.TabItem("Judge Arena"):
 
312
  with gr.Row():
313
  # Left side - Input section
314
  with gr.Column(scale=1):
315
  with gr.Group():
316
  human_input = gr.TextArea(
317
- label="👩 User Input",
318
- lines=10,
319
  placeholder="Enter the human message here..."
320
  )
321
- with gr.Row():
322
- generate_btn = gr.Button(
323
- "Generate AI Response",
324
- size="sm",
325
- interactive=False
326
- )
327
 
328
  ai_response = gr.TextArea(
329
  label="🤖 AI Response",
330
- lines=15,
331
- placeholder="Enter the AI response here..."
332
- )
333
-
334
- # Ground truth response (initially hidden)
335
- ground_truth = gr.TextArea(
336
- label="🎯 Ground truth response",
337
  lines=12,
338
- placeholder="Enter the ground truth response here...",
339
- visible=False
340
  )
341
 
342
- with gr.Row():
343
- random_btn = gr.Button("🎲", scale=2)
344
  send_btn = gr.Button(
345
- value="Run judges",
346
  variant="primary",
347
- size="lg",
348
- scale=8
349
  )
350
 
351
  # Right side - Model outputs
@@ -355,15 +465,18 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
355
  model_name_a = gr.Markdown("*Model: Hidden*")
356
  with gr.Row():
357
  with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
358
- score_a = gr.Textbox(label="Score", lines=6, interactive=False)
359
- vote_a = gr.Button("Vote A", variant="primary", interactive=False)
360
  with gr.Column(scale=9, min_width=400): # Wider width for critique
361
- critique_a = gr.TextArea(label="Critique", lines=8, interactive=False)
 
 
 
362
 
363
  # Tie button row
364
- with gr.Row() as tie_button_row:
365
  with gr.Column():
366
- vote_tie = gr.Button("Tie", variant="primary", interactive=False)
367
 
368
 
369
  gr.Markdown("### 🧑‍⚖️ Judge B")
@@ -371,90 +484,16 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
371
  model_name_b = gr.Markdown("*Model: Hidden*")
372
  with gr.Row():
373
  with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
374
- score_b = gr.Textbox(label="Score", lines=6, interactive=False)
375
- vote_b = gr.Button("Vote B", variant="primary", interactive=False)
376
  with gr.Column(scale=9, min_width=400): # Wider width for critique
377
- critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
378
- # Place Vote B button directly under Judge B
379
 
380
  gr.Markdown("<br>")
381
-
382
-
383
- # Replace the "Edit Judge Prompt" Accordion section with:
384
- with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
385
- gr.Markdown("<br>")
386
- use_reference_toggle = gr.Checkbox(
387
- label="Use a reference response",
388
- value=False
389
- )
390
-
391
- # Hide the default prompt editor
392
- with gr.Column(visible=False) as default_prompt_editor:
393
- eval_prompt_editable = gr.TextArea(
394
- value=DEFAULT_EVAL_PROMPT_EDITABLE,
395
- label="Evaluation Criteria",
396
- lines=12
397
- )
398
 
399
- with gr.Row(visible=False) as edit_buttons_row:
400
- cancel_prompt_btn = gr.Button("Cancel")
401
- save_prompt_btn = gr.Button("Save", variant="primary")
402
- gr.Markdown("*The sample being evaluated is always appended as:*")
403
- gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")
404
-
405
- # Show the compatible mode editor
406
- with gr.Column(visible=True) as compatible_prompt_editor:
407
- with gr.Row():
408
- # Left column - Evaluation Criteria
409
- with gr.Column(scale=1):
410
- eval_criteria_text = gr.TextArea(
411
- label="Evaluation Criteria",
412
- lines=12,
413
- value=DEFAULT_EVAL_CRITERIA,
414
- placeholder="Enter the evaluation criteria..."
415
- )
416
- prometheus_reference = gr.Markdown(
417
- "<br> *By default, we use the Prometheus absolute grading prompt template - see [here](https://huggingface.co/prometheus-eval/prometheus-7b-v2.0).*",
418
- visible=True
419
- )
420
-
421
- # Right column - Score Descriptions
422
- with gr.Column(scale=1):
423
- score1_description = gr.TextArea(
424
- label="Score 1",
425
- value=DEFAULT_SCORE_1,
426
- placeholder="Description for score 1",
427
- lines=2
428
- )
429
- score2_description = gr.TextArea(
430
- label="Score 2",
431
- value=DEFAULT_SCORE_2,
432
- placeholder="Description for score 2",
433
- lines=2
434
- )
435
- score3_description = gr.TextArea(
436
- label="Score 3",
437
- value=DEFAULT_SCORE_3,
438
- placeholder="Description for score 3",
439
- lines=2
440
- )
441
- score4_description = gr.TextArea(
442
- label="Score 4",
443
- value=DEFAULT_SCORE_4,
444
- placeholder="Description for score 4",
445
- lines=2
446
- )
447
- score5_description = gr.TextArea(
448
- label="Score 5",
449
- value=DEFAULT_SCORE_5,
450
- placeholder="Description for score 5",
451
- lines=2
452
- )
453
-
454
- # Add save/cancel buttons for compatible mode
455
- with gr.Row(visible=False) as compatible_edit_buttons_row:
456
- compatible_cancel_btn = gr.Button("Cancel")
457
- compatible_save_btn = gr.Button("Save", variant="primary")
458
 
459
  with gr.TabItem("Leaderboard"):
460
  with gr.Row():
@@ -462,7 +501,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
462
  show_preliminary = gr.Checkbox(
463
  label="Reveal preliminary results",
464
  value=True, # Checked by default
465
- info="Show all models, including models with less human ratings (< 300 votes)",
466
  interactive=True
467
  )
468
  stats_display = gr.Markdown()
@@ -470,13 +509,24 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
470
  headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
471
  datatype=["str", "number", "str", "number", "str", "str", "str"],
472
  )
473
-
474
- gr.Markdown("""<br>
475
- <br>
476
- Judge Arena uses Together AI for inference of open-source models. FP8 models are named as -- "Turbo" where the performance of the FP16 reference models is closely matched:
477
 
478
- [*"Together Turbo achieves this performance while maintaining full accuracy compared to Meta's reference implementation across all models. Llama-3.1-405B-Instruct-Turbo matches the accuracy of Meta reference models."*](https://www.together.ai/blog/together-inference-engine-2)
479
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
 
481
  # Add change handler for checkbox
482
  show_preliminary.change(
@@ -494,15 +544,67 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
494
 
495
  with gr.TabItem("Policy"):
496
  gr.Markdown(POLICY_CONTENT)
497
- gr.Markdown(ACKNOWLEDGEMENTS)
498
 
499
  # Define state variables for model tracking
500
  model_a_state = gr.State()
501
  model_b_state = gr.State()
502
  final_prompt_state = gr.State()
503
- eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE) # Initialize with default value
504
- is_editing = gr.State(False) # Track editing state
505
- compatible_mode_state = gr.State(False) # Track compatible mode state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
506
 
507
  # Update model names after responses are generated
508
  def update_model_names(model_a, model_b):
@@ -517,7 +619,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
517
  vote_a.click(
518
  fn=vote,
519
  inputs=[
520
- gr.State("A"),
521
  model_a_state,
522
  model_b_state,
523
  final_prompt_state,
@@ -529,19 +631,18 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
529
  outputs=[
530
  vote_a,
531
  vote_b,
532
- vote_tie,
533
  model_name_a,
534
  model_name_b,
535
  send_btn,
536
- random_btn,
537
- gr.State(), # placeholder for success message
538
  ],
539
  )
540
 
541
  vote_b.click(
542
  fn=vote,
543
  inputs=[
544
- gr.State("B"),
545
  model_a_state,
546
  model_b_state,
547
  final_prompt_state,
@@ -553,19 +654,18 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
553
  outputs=[
554
  vote_a,
555
  vote_b,
556
- vote_tie,
557
  model_name_a,
558
  model_name_b,
559
  send_btn,
560
- random_btn,
561
- gr.State(), # placeholder for success message
562
  ],
563
  )
564
 
565
  vote_tie.click(
566
  fn=vote,
567
  inputs=[
568
- gr.State("Tie"),
569
  model_a_state,
570
  model_b_state,
571
  final_prompt_state,
@@ -577,248 +677,66 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
577
  outputs=[
578
  vote_a,
579
  vote_b,
580
- vote_tie,
581
  model_name_a,
582
  model_name_b,
583
  send_btn,
584
- random_btn,
585
- gr.State(), # placeholder for success message
586
  ],
587
  )
588
 
589
- # Add handlers for save/cancel buttons
590
- def save_prompt(new_prompt, previous_prompt):
591
- return [
592
- gr.update(value=new_prompt), # Update the prompt
593
- new_prompt, # Update the previous prompt state
594
- gr.update(visible=False) # Hide the buttons
595
- ]
596
-
597
- def cancel_prompt(previous_prompt):
598
- return [
599
- gr.update(value=previous_prompt), # Revert to previous prompt
600
- previous_prompt, # Keep the previous prompt state
601
- gr.update(visible=False) # Hide the buttons
602
- ]
 
603
 
604
- def show_edit_buttons(current_value, previous_value):
605
- # Show buttons only if the current value differs from the previous value
606
- return gr.update(visible=current_value != previous_value)
607
 
608
- # Add handlers for save/cancel buttons and prompt changes
609
- save_prompt_btn.click(
610
- fn=save_prompt,
611
- inputs=[eval_prompt_editable, eval_prompt_previous],
612
- outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
613
- )
614
 
615
- cancel_prompt_btn.click(
616
- fn=cancel_prompt,
617
- inputs=[eval_prompt_previous],
618
- outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
619
- )
620
-
621
- eval_prompt_editable.change(
622
- fn=show_edit_buttons,
623
- inputs=[eval_prompt_editable, eval_prompt_previous],
624
- outputs=edit_buttons_row
625
- )
626
 
627
- # Function to toggle visibility based on compatible mode
628
- def toggle_use_reference(checked):
629
- if checked:
630
- # Get new random samples with ground truth when enabling reference mode
631
- human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
632
- return {
633
- ground_truth: gr.update(visible=True, value=ground_truth_msg),
634
- human_input: gr.update(value=human_msg),
635
- ai_response: gr.update(value=ai_msg),
636
- # Reset other UI elements
637
- score_a: gr.update(value=""),
638
- critique_a: gr.update(value=""),
639
- score_b: gr.update(value=""),
640
- critique_b: gr.update(value=""),
641
- vote_a: gr.update(interactive=False, variant="primary"),
642
- vote_b: gr.update(interactive=False, variant="primary"),
643
- vote_tie: gr.update(interactive=False, variant="primary"),
644
- model_name_a: gr.update(value="*Model: Hidden*"),
645
- model_name_b: gr.update(value="*Model: Hidden*"),
646
- random_btn: gr.update(value="🎲", variant="secondary"),
647
- }
648
- else:
649
- # Just hide ground truth when disabling reference mode
650
- return {
651
- ground_truth: gr.update(visible=False)
652
- }
653
-
654
- # Update the change handler to include all necessary outputs
655
- use_reference_toggle.change(
656
- fn=toggle_use_reference,
657
- inputs=[use_reference_toggle],
658
- outputs=[
659
- ground_truth,
660
- human_input,
661
- ai_response,
662
  score_a,
663
  critique_a,
664
  score_b,
665
  critique_b,
666
- vote_a,
667
- vote_b,
668
- vote_tie,
669
- model_name_a,
670
- model_name_b,
671
- random_btn,
672
- ]
673
- )
674
-
675
- # Add a new state variable to track first game
676
- first_game_state = gr.State(True) # Initialize as True
677
-
678
- # Update the submit function to use the state variable
679
- def submit_and_store(
680
- use_reference,
681
- eval_criteria_text_input,
682
- human_input,
683
- ai_response,
684
- ground_truth_input,
685
- score1_description,
686
- score2_description,
687
- score3_description,
688
- score4_description,
689
- score5_description,
690
- ):
691
- # Build prompt data dictionary
692
- prompt_data = {
693
- 'human_input': human_input,
694
- 'ai_response': ai_response,
695
- 'ground_truth_input': ground_truth_input,
696
- 'eval_criteria': eval_criteria_text_input,
697
- 'score1_desc': score1_description,
698
- 'score2_desc': score2_description,
699
- 'score3_desc': score3_description,
700
- 'score4_desc': score4_description,
701
- 'score5_desc': score5_description,
702
- }
703
-
704
- # Get list of active models only for matches
705
- active_models = [name for name, info in model_data.items()
706
- if info.get("active", True) is True] # Explicitly check for True
707
-
708
- # Define new models list
709
- new_models = ["Atla Selene 1", "SFR-LLaMA-3.1-70B-Judge"]
710
-
711
- # New models appear 40% of the time
712
- if random.random() < 0.4:
713
- # Randomly choose between new models
714
- new_model = random.choice(new_models)
715
- other_models = [m for m in active_models if m not in new_models]
716
- other_model = random.choice(other_models)
717
-
718
- if random.random() < 0.5:
719
- model_a, model_b = new_model, other_model
720
- else:
721
- model_a, model_b = other_model, new_model
722
- else:
723
- # For other cases, exclude new models
724
- non_special_models = [m for m in active_models if m not in new_models]
725
- model1, model2 = random.sample(non_special_models, 2)
726
- model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
727
-
728
- # Get responses from models
729
- response_a = get_model_response(
730
  model_a,
731
- model_data.get(model_a),
732
- prompt_data,
733
- use_reference=use_reference
734
- )
735
- response_b = get_model_response(
736
  model_b,
737
- model_data.get(model_b),
738
- prompt_data,
739
- use_reference=use_reference
740
- )
741
-
742
-
743
- is_prometheus_a = model_data.get(model_a, {}).get('organization') == 'Prometheus'
744
- is_prometheus_b = model_data.get(model_b, {}).get('organization') == 'Prometheus'
745
- is_flow_judge_a = model_data.get(model_a, {}).get('organization') == 'Flow AI'
746
- is_flow_judge_b = model_data.get(model_b, {}).get('organization') == 'Flow AI'
747
- is_salesforce_a = model_data.get(model_a, {}).get('organization') == 'Salesforce'
748
- is_salesforce_b = model_data.get(model_b, {}).get('organization') == 'Salesforce'
749
-
750
- # Parse the responses based on model, using appropriate parsing for different models
751
- if is_prometheus_a:
752
- score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
753
- score_a_val = f"{score_a_val} / 5"
754
- elif is_salesforce_a: # Same parser for Atla and Salesforce
755
- score_a_val, critique_a_val = salesforce_parse_model_response(response_a)
756
- score_a_val = f"{score_a_val} / 5"
757
- elif is_flow_judge_a:
758
- score_a_val, critique_a_val = flow_judge_parse_model_response(response_a)
759
- score_a_val = f"{score_a_val} / 5"
760
- else:
761
- score_a_val, critique_a_val = parse_model_response(response_a)
762
- score_a_val = f"{score_a_val} / 5"
763
-
764
- if is_prometheus_b:
765
- score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
766
- score_b_val = f"{score_b_val} / 5"
767
- elif is_salesforce_b: # Same parser for Atla and Salesforce
768
- score_b_val, critique_b_val = salesforce_parse_model_response(response_b)
769
- score_b_val = f"{score_b_val} / 5"
770
- elif is_flow_judge_b:
771
- score_b_val, critique_b_val = flow_judge_parse_model_response(response_b)
772
- score_b_val = f"{score_b_val} / 5"
773
- else:
774
- score_b_val, critique_b_val = parse_model_response(response_b)
775
- score_b_val = f"{score_b_val} / 5"
776
-
777
- return (
778
- score_a_val,
779
- critique_a_val,
780
- score_b_val,
781
- critique_b_val,
782
- gr.update(interactive=True, variant="primary"), # vote_a
783
- gr.update(interactive=True, variant="primary"), # vote_b
784
- gr.update(interactive=True, variant="primary"), # vote_tie
785
- model_a,
786
- model_b,
787
- eval_prompt,
788
  gr.update(value="*Model: Hidden*"),
789
  gr.update(value="*Model: Hidden*"),
790
- gr.update(value="Regenerate judges", variant="secondary", interactive=True),
791
- gr.update(value="🎲"), # random_btn
792
- False, # Set first_game_state to False after first submission
 
 
 
 
793
  )
794
 
795
- # Update the click handler to use False for is_first_game after first submission
796
- def create_submit_handler():
797
- first_game = True
798
-
799
- def handler(*args):
800
- nonlocal first_game
801
- result = submit_and_store(*args)
802
- first_game = False # Set to False after first submission
803
- return result
804
-
805
- return handler
806
-
807
- # Update the send_btn click handler
808
  send_btn.click(
809
  fn=submit_and_store,
810
- inputs=[
811
- use_reference_toggle,
812
- eval_criteria_text,
813
- human_input,
814
- ai_response,
815
- ground_truth,
816
- score1_description,
817
- score2_description,
818
- score3_description,
819
- score4_description,
820
- score5_description,
821
- ],
822
  outputs=[
823
  score_a,
824
  critique_a,
@@ -826,225 +744,101 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
826
  critique_b,
827
  vote_a,
828
  vote_b,
829
- vote_tie,
830
  model_a_state,
831
  model_b_state,
832
  final_prompt_state,
833
  model_name_a,
834
  model_name_b,
835
  send_btn,
836
- random_btn,
837
  ],
838
  )
839
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
840
  # Add random button handler
841
  random_btn.click(
842
  fn=populate_random_example,
843
- inputs=[use_reference_toggle], # Use compatible mode toggle to decide behavior
844
- outputs=[
845
- human_input,
846
- ai_response,
847
- random_btn,
848
- score_a,
849
- critique_a,
850
- score_b,
851
- critique_b,
852
- vote_a,
853
- vote_b,
854
- vote_tie,
855
- model_name_a,
856
- model_name_b,
857
- ground_truth, # Set ground truth
858
- ]
859
  )
860
 
861
  # Add new input change handlers
862
  def handle_input_change():
863
- """Reset UI state when inputs are changed"""
864
- return [
865
- gr.update(interactive=False), # vote_a
866
- gr.update(interactive=False), # vote_b
867
- gr.update(interactive=False), # vote_tie
868
- gr.update(value="Run judges", variant="primary"), # send_btn
869
- gr.update(value="🎲", variant="secondary"), # random_btn
870
- ]
871
 
872
  # Update the change handlers for inputs
873
  human_input.change(
874
  fn=handle_input_change,
875
  inputs=[],
876
- outputs=[vote_a, vote_b, vote_tie, send_btn, random_btn]
877
  )
878
 
879
  ai_response.change(
880
  fn=handle_input_change,
881
  inputs=[],
882
- outputs=[vote_a, vote_b, vote_tie, send_btn, random_btn]
883
- )
884
-
885
- generate_btn.click(
886
- fn=lambda msg: (
887
- generate_ai_response(msg)[0], # Only take the response text
888
- gr.update(
889
- value="Generate AI Response", # Keep the label
890
- interactive=False # Disable the button
891
- )
892
- ),
893
- inputs=[human_input],
894
- outputs=[ai_response, generate_btn]
895
- )
896
-
897
- human_input.change(
898
- fn=lambda x: gr.update(interactive=bool(x.strip())),
899
- inputs=[human_input],
900
- outputs=[generate_btn]
901
  )
902
 
903
  # Update the demo.load to include the random example population
904
  demo.load(
905
- fn=lambda: populate_random_example(None, False), # Pass False for initial compatible_mode
906
  inputs=[],
907
- outputs=[
908
- human_input,
909
- ai_response,
910
- random_btn,
911
- score_a,
912
- critique_a,
913
- score_b,
914
- critique_b,
915
- vote_a,
916
- vote_b,
917
- vote_tie,
918
- model_name_a,
919
- model_name_b,
920
- ground_truth,
921
- ]
922
  )
923
 
924
- # Add new state variables for compatible mode
925
- eval_criteria_previous = gr.State(value=DEFAULT_EVAL_CRITERIA)
926
- score1_previous = gr.State(value=DEFAULT_SCORE_1)
927
- score2_previous = gr.State(value=DEFAULT_SCORE_2)
928
- score3_previous = gr.State(value=DEFAULT_SCORE_3)
929
- score4_previous = gr.State(value=DEFAULT_SCORE_4)
930
- score5_previous = gr.State(value=DEFAULT_SCORE_5)
931
-
932
- # Add new functions to handle compatible mode saves/cancels
933
- def save_compatible_prompt(criteria, score1, score2, score3, score4, score5):
934
- return [
935
- gr.update(value=criteria), # Update criteria
936
- criteria, # Update previous criteria state
937
- gr.update(value=score1),
938
- score1,
939
- gr.update(value=score2),
940
- score2,
941
- gr.update(value=score3),
942
- score3,
943
- gr.update(value=score4),
944
- score4,
945
- gr.update(value=score5),
946
- score5,
947
- gr.update(visible=False) # Hide buttons
948
- ]
949
-
950
- def cancel_compatible_prompt(prev_criteria, prev_score1, prev_score2, prev_score3, prev_score4, prev_score5):
951
- return [
952
- gr.update(value=prev_criteria),
953
- prev_criteria,
954
- gr.update(value=prev_score1),
955
- prev_score1,
956
- gr.update(value=prev_score2),
957
- prev_score2,
958
- gr.update(value=prev_score3),
959
- prev_score3,
960
- gr.update(value=prev_score4),
961
- prev_score4,
962
- gr.update(value=prev_score5),
963
- prev_score5,
964
- gr.update(visible=False)
965
- ]
966
-
967
- def show_compatible_edit_buttons(*current_values):
968
- previous_values = current_values[1::2] # Get previous values
969
- current_values = current_values[::2] # Get current values
970
- return gr.update(visible=any(curr != prev for curr, prev in zip(current_values, previous_values)))
971
-
972
- # Add click handlers for compatible mode buttons
973
- compatible_save_btn.click(
974
- fn=save_compatible_prompt,
975
- inputs=[
976
- eval_criteria_text,
977
- score1_description,
978
- score2_description,
979
- score3_description,
980
- score4_description,
981
- score5_description
982
- ],
983
- outputs=[
984
- eval_criteria_text,
985
- eval_criteria_previous,
986
- score1_description,
987
- score1_previous,
988
- score2_description,
989
- score2_previous,
990
- score3_description,
991
- score3_previous,
992
- score4_description,
993
- score4_previous,
994
- score5_description,
995
- score5_previous,
996
- compatible_edit_buttons_row
997
- ]
998
- )
999
-
1000
- compatible_cancel_btn.click(
1001
- fn=cancel_compatible_prompt,
1002
- inputs=[
1003
- eval_criteria_previous,
1004
- score1_previous,
1005
- score2_previous,
1006
- score3_previous,
1007
- score4_previous,
1008
- score5_previous
1009
- ],
1010
- outputs=[
1011
- eval_criteria_text,
1012
- eval_criteria_previous,
1013
- score1_description,
1014
- score1_previous,
1015
- score2_description,
1016
- score2_previous,
1017
- score3_description,
1018
- score3_previous,
1019
- score4_description,
1020
- score4_previous,
1021
- score5_description,
1022
- score5_previous,
1023
- compatible_edit_buttons_row
1024
- ]
1025
- )
1026
-
1027
- # Add change handlers for all compatible mode inputs
1028
- for component in [eval_criteria_text, score1_description, score2_description,
1029
- score3_description, score4_description, score5_description]:
1030
- component.change(
1031
- fn=show_compatible_edit_buttons,
1032
- inputs=[
1033
- eval_criteria_text,
1034
- eval_criteria_previous,
1035
- score1_description,
1036
- score1_previous,
1037
- score2_description,
1038
- score2_previous,
1039
- score3_description,
1040
- score3_previous,
1041
- score4_description,
1042
- score4_previous,
1043
- score5_description,
1044
- score5_previous
1045
- ],
1046
- outputs=compatible_edit_buttons_row
1047
- )
1048
-
1049
  if __name__ == "__main__":
1050
  demo.launch()
 
2
  import re
3
  import random
4
  from collections import defaultdict
5
+ from datetime import datetime, timezone
6
  import hashlib
 
7
 
8
  from dotenv import load_dotenv
 
9
 
10
+ load_dotenv()
 
 
 
 
 
 
11
 
12
+ import gradio as gr
13
+ from gen_api_answer import get_model_response, parse_model_response, get_random_human_ai_pair
 
 
 
14
  from db import add_vote, create_db_connection, get_votes
 
15
  from utils import Vote
 
16
  from common import (
17
  POLICY_CONTENT,
18
  ACKNOWLEDGEMENTS,
19
+ DEFAULT_EVAL_PROMPT,
20
+ DEFAULT_INPUT,
21
+ DEFAULT_RESPONSE,
22
  CSS_STYLES,
23
  MAIN_TITLE,
24
  HOW_IT_WORKS,
25
+ BATTLE_RULES,
26
+ EVAL_DESCRIPTION,
27
+ VOTING_HEADER,
28
  )
29
+ from example_metrics import EXAMPLE_METRICS
 
30
 
31
 
32
+ # Model and ELO score data
33
+ DEFAULT_ELO = 1200 # Starting ELO for new models
34
+ K_FACTOR = 32 # Standard chess K-factor, adjust as needed
35
  elo_scores = defaultdict(lambda: DEFAULT_ELO)
36
  vote_counts = defaultdict(int)
37
 
 
52
  "organization": model["organization"],
53
  "license": model["license"],
54
  "api_model": model["api_model"],
 
55
  }
56
  except FileNotFoundError:
57
  print("Warning: models.jsonl not found")
 
62
  model_data = load_model_data()
63
 
64
  def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
 
 
65
  vote = Vote(
66
  timestamp=datetime.now().isoformat(),
67
+ prompt=prompt,
68
  response_a=response_a,
69
  response_b=response_b,
70
  model_a=model_a,
 
93
  return eval_prompt
94
 
95
 
96
+ def submit_prompt(eval_prompt, *variable_values):
97
+ try:
98
+ variables = parse_variables(eval_prompt)
99
+ variable_values_dict = {var: val for var, val in zip(variables, variable_values)}
100
+ final_prompt = get_final_prompt(eval_prompt, variable_values_dict)
101
+
102
+ models = list(model_data.keys())
103
+ model1, model2 = random.sample(models, 2)
104
+ model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
105
+
106
+ response_a = get_model_response(model_a, model_data.get(model_a), final_prompt)
107
+ response_b = get_model_response(model_b, model_data.get(model_b), final_prompt)
108
+
109
+ return (
110
+ response_a,
111
+ response_b,
112
+ gr.update(visible=True),
113
+ gr.update(visible=True),
114
+ model_a,
115
+ model_b,
116
+ final_prompt,
117
+ )
118
+ except Exception as e:
119
+ print(f"Error in submit_prompt: {str(e)}")
120
+ return (
121
+ "Error generating response",
122
+ "Error generating response",
123
+ gr.update(visible=False),
124
+ gr.update(visible=False),
125
+ None,
126
+ None,
127
+ None,
128
+ )
129
+
130
 
131
  def get_ip(request: gr.Request) -> str:
132
  """Get and hash the IP address from the request."""
 
143
  return hashlib.sha256(ip.encode()).hexdigest()[:16]
144
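For reference, the judge identifier returned above is simply the first 16 hex characters of a SHA-256 digest of the request IP. A minimal sketch of the same truncation, using a hypothetical address rather than a real request:

```python
import hashlib

# Hypothetical address for illustration only; the app hashes the real request IP.
ip = "203.0.113.7"
judge_id = hashlib.sha256(ip.encode()).hexdigest()[:16]
print(judge_id, len(judge_id))  # 16 hex characters used as the anonymised judge_id
```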
 
145
 
146
  def vote(
147
  choice,
148
  model_a,
 
192
  store_vote_data(
193
  final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
194
  )
195
+
196
+ # Return updates for UI components
 
197
  return [
198
+ gr.update(visible=False), # vote_a
199
+ gr.update(visible=False), # vote_b
200
+ gr.update(visible=False), # tie_button_row
201
+ gr.update(value=f"*Model: {model_a}*"), # model_name_a
202
+ gr.update(value=f"*Model: {model_b}*"), # model_name_b
203
+ gr.update(interactive=True, value="Run the evaluators", variant="primary"), # send_btn
204
+ gr.update(visible=True), # spacing_div
 
205
  ]
206
 
207
 
 
210
  return get_votes(db)
211
 
212
 
213
+ def get_leaderboard(show_preliminary=True):
214
+ """Generate leaderboard data using fresh votes from MongoDB."""
215
+ # Get fresh voting data
216
  voting_data = get_current_votes()
217
+ print(f"Fetched {len(voting_data)} votes from database") # Debug log
218
+
219
+ # Initialize dictionaries for tracking
220
+ ratings = defaultdict(lambda: DEFAULT_ELO)
221
+ matches = defaultdict(int)
222
+
223
+ # Process each vote
224
+ for vote in voting_data:
225
+ try:
226
+ model_a = vote.get("model_a")
227
+ model_b = vote.get("model_b")
228
+ winner = vote.get("winner")
229
+
230
+ # Skip if models aren't in current model_data
231
+ if (
232
+ not all([model_a, model_b, winner])
233
+ or model_a not in model_data
234
+ or model_b not in model_data
235
+ ):
236
+ continue
237
+
238
+ # Update match counts
239
+ matches[model_a] += 1
240
+ matches[model_b] += 1
241
+
242
+ # Calculate ELO changes
243
+ elo_a = ratings[model_a]
244
+ elo_b = ratings[model_b]
245
+
246
+ # Expected scores
247
+ expected_a = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
248
+ expected_b = 1 - expected_a
249
+
250
+ # Actual scores
251
+ score_a = 1 if winner == "A" else 0 if winner == "B" else 0.5
252
+ score_b = 1 - score_a
253
+
254
+ # Update ratings
255
+ ratings[model_a] += K_FACTOR * (score_a - expected_a)
256
+ ratings[model_b] += K_FACTOR * (score_b - expected_b)
257
+
258
+ except Exception as e:
259
+ print(f"Error processing vote: {e}")
260
+ continue
261
+
262
+ # Generate leaderboard data
263
+ leaderboard = []
264
+ for model in model_data.keys():
265
+ votes = matches[model]
266
+ # Skip models with < 500 votes if show_preliminary is False
267
+ if not show_preliminary and votes < 500:
268
+ continue
269
+
270
+ elo = ratings[model]
271
+ ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
272
+ data = {
273
+ "Model": model,
274
+ "ELO Score": f"{int(elo)}",
275
+ "95% CI": f"±{int(ci)}",
276
+ "# Votes": votes,
277
+ "Organization": model_data[model]["organization"],
278
+ "License": model_data[model]["license"],
279
+ }
280
+ leaderboard.append(data)
281
+
282
+ # Sort leaderboard by ELO score in descending order
283
+ leaderboard.sort(key=lambda x: float(x["ELO Score"]), reverse=True)
284
+
285
+ return leaderboard
286
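As a quick sanity check on the ±CI column produced above, the interval width falls with the square root of the vote count. This sketch reuses the same formula as `get_leaderboard` with hypothetical vote counts:

```python
# Same 95% CI width formula as get_leaderboard above, for hypothetical vote counts.
for votes in (10, 100, 500):
    ci = 1.96 * (400 / (votes + 1) ** 0.5)
    print(votes, round(ci))  # 10 -> 236, 100 -> 78, 500 -> 35
```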
+
287
+
288
+ def calculate_elo_change(rating_a, rating_b, winner):
289
+ """Calculate ELO rating changes for both players."""
290
+ expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
291
+ expected_b = 1 - expected_a
292
+
293
+ if winner == "A":
294
+ score_a, score_b = 1, 0
295
+ elif winner == "B":
296
+ score_a, score_b = 0, 1
297
+ else: # Handle ties
298
+ score_a, score_b = 0.5, 0.5
299
+
300
+ change_a = K_FACTOR * (score_a - expected_a)
301
+ change_b = K_FACTOR * (score_b - expected_b)
302
+
303
+ return change_a, change_b
304
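For readers skimming the hunk, here is a worked example of the same Elo update. The ratings are hypothetical; the K-factor of 32 and the 400-point logistic scale match the constants defined earlier in this file:

```python
K_FACTOR = 32

def expected_score(rating_a: float, rating_b: float) -> float:
    # Probability that A beats B under the Elo model used in calculate_elo_change.
    return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

rating_a, rating_b = 1250.0, 1200.0          # hypothetical current ratings
exp_a = expected_score(rating_a, rating_b)   # ~0.571: A is slightly favoured
# A win for A shifts both ratings by less than K/2, because the result was expected:
change_a = K_FACTOR * (1 - exp_a)            # ~ +13.7
change_b = K_FACTOR * (0 - (1 - exp_a))      # ~ -13.7
print(round(rating_a + change_a, 1), round(rating_b + change_b, 1))  # 1263.7 1186.3
```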
+
305
+
306
+ def update_leaderboard():
307
+ """Generate leaderboard DataFrame using fresh votes from MongoDB."""
308
+ # Get fresh voting data
309
+ voting_data = get_current_votes()
310
+ print(f"Found {len(voting_data)} votes in database")
311
+ matches = defaultdict(int)
312
+
313
+ # Process each vote chronologically
314
+ for vote in voting_data:
315
+ # Extract model names from the vote document
316
+ try:
317
+ model_a = vote.get("model_a")
318
+ model_b = vote.get("model_b")
319
+ winner = vote.get("winner")
320
+
321
+ print(f"Processing vote: {model_a} vs {model_b}, winner: {winner}")
322
+
323
+ # Skip if any required field is missing or models aren't in current model_data
324
+ if not all([model_a, model_b, winner]):
325
+ print(f"Missing required fields in vote: {vote}")
326
+ continue
327
+
328
+ if model_a not in model_data:
329
+ print(f"Model A '{model_a}' not found in model_data")
330
+ continue
331
+
332
+ if model_b not in model_data:
333
+ print(f"Model B '{model_b}' not found in model_data")
334
+ continue
335
+
336
+ # Update match counts
337
+ matches[model_a] += 1
338
+ matches[model_b] += 1
339
+ print(
340
+ f"Updated matches - {model_a}: {matches[model_a]}, {model_b}: {matches[model_b]}"
341
+ )
342
+ except Exception as e:
343
+ print(f"Error processing vote: {e}")
344
+ print(f"Problematic vote data: {vote}")
345
+ continue
346
+
347
+
348
+ # Update the display_leaderboard function
349
+ def display_leaderboard():
350
+ df = update_leaderboard()
351
+ return gr.DataFrame(
352
+ value=df,
353
+ headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
354
+ datatype=["str", "number", "str", "number", "str", "str", "str"],
355
+ row_count=(len(df) + 1, "dynamic"),
356
+ )
357
 
358
 
359
  # Update the leaderboard table definition in the UI
 
363
  )
364
 
365
 
366
+ def get_leaderboard_stats():
367
+ """Get summary statistics for the leaderboard."""
368
+ now = datetime.now(timezone.utc)
369
+ total_votes = len(get_current_votes())
370
+ total_models = len(model_data)
371
+ last_updated = now.replace(minute=0, second=0, microsecond=0).strftime(
372
+ "%B %d, %Y at %H:00 UTC"
373
+ )
374
+
375
+ return f"""
376
+ ### Leaderboard Stats
377
+ - **Total Models**: {total_models}
378
+ - **Total Votes**: {total_votes}
379
+ - **Last Updated**: {last_updated}
380
+ """
381
+
382
+
383
+ #def set_example_metric(metric_name):
384
+ # if metric_name == "Custom":
385
+ # variables = parse_variables(DEFAULT_EVAL_PROMPT)
386
+ # variable_values = []
387
+ # for var in variables:
388
+ # if var == "input":
389
+ # variable_values.append(DEFAULT_INPUT)
390
+ # elif var == "response":
391
+ # variable_values.append(DEFAULT_RESPONSE)
392
+ # else:
393
+ # variable_values.append("") # Default empty value
394
+ # Pad variable_values to match the length of variable_rows
395
+ # while len(variable_values) < len(variable_rows):
396
+ # variable_values.append("")
397
+ # return [DEFAULT_EVAL_PROMPT] + variable_values
398
+
399
+ # metric_data = EXAMPLE_METRICS[metric_name]
400
+ # variables = parse_variables(metric_data["prompt"])
401
+ # variable_values = []
402
+ # for var in variables:
403
+ # value = metric_data.get(var, "") # Default to empty string if not found
404
+ # variable_values.append(value)
405
+ # Pad variable_values to match the length of variable_rows
406
+ # while len(variable_values) < len(variable_rows):
407
+ # variable_values.append("")
408
+ # return [metric_data["prompt"]] + variable_values
409
+
410
+
411
+ # Select random metric at startup
412
+ # def get_random_metric():
413
+ # metrics = list(EXAMPLE_METRICS.keys())
414
+ # return set_example_metric(random.choice(metrics))
415
+
416
+
417
+ def populate_random_example(request: gr.Request):
418
+ """Generate a random human-AI conversation example."""
419
+ human_msg, ai_msg = get_random_human_ai_pair()
420
  return [
421
  gr.update(value=human_msg),
422
+ gr.update(value=ai_msg)
 
423
  ]
424
 
425
 
 
435
 
436
  with gr.Tabs():
437
  with gr.TabItem("Judge Arena"):
438
+ random_btn = gr.Button("🎲", scale=0)
439
  with gr.Row():
440
  # Left side - Input section
441
  with gr.Column(scale=1):
442
  with gr.Group():
443
  human_input = gr.TextArea(
444
+ label="👩 Human Input",
445
+ lines=12,
446
  placeholder="Enter the human message here..."
447
  )
 
 
 
 
 
 
448
 
449
  ai_response = gr.TextArea(
450
  label="🤖 AI Response",
 
 
 
 
 
 
 
451
  lines=12,
452
+ placeholder="Enter the AI response here..."
 
453
  )
454
 
 
 
455
  send_btn = gr.Button(
456
+ value="Run the evaluators",
457
  variant="primary",
458
+ size="lg"
 
459
  )
460
 
461
  # Right side - Model outputs
 
465
  model_name_a = gr.Markdown("*Model: Hidden*")
466
  with gr.Row():
467
  with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
468
+ score_a = gr.Textbox(label="Score", lines=5, interactive=False)
469
+ vote_a = gr.Button("Vote A", variant="primary", visible=False)
470
  with gr.Column(scale=9, min_width=400): # Wider width for critique
471
+ critique_a = gr.TextArea(label="Critique", lines=7, interactive=False)
472
+
473
+ # Spacing div that's visible only when tie button is hidden
474
+ spacing_div = gr.HTML('<div style="height: 42px;"></div>', visible=True, elem_id="spacing-div")
475
 
476
  # Tie button row
477
+ with gr.Row(visible=False) as tie_button_row:
478
  with gr.Column():
479
+ vote_tie = gr.Button("Tie", variant="secondary")
480
 
481
 
482
  gr.Markdown("### 🧑‍⚖️ Judge B")
 
484
  model_name_b = gr.Markdown("*Model: Hidden*")
485
  with gr.Row():
486
  with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
487
+ score_b = gr.Textbox(label="Score", lines=5, interactive=False)
488
+ vote_b = gr.Button("Vote B", variant="primary", visible=False)
489
  with gr.Column(scale=9, min_width=400): # Wider width for critique
490
+ critique_b = gr.TextArea(label="Critique", lines=7, interactive=False)
491
+ # Place Vote B button directly under Judge B
492
 
493
  gr.Markdown("<br>")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
494
 
495
+ # Add spacing and acknowledgements at the bottom
496
+ gr.Markdown(ACKNOWLEDGEMENTS)
 
497
 
498
  with gr.TabItem("Leaderboard"):
499
  with gr.Row():
 
501
  show_preliminary = gr.Checkbox(
502
  label="Reveal preliminary results",
503
  value=True, # Checked by default
504
+ info="Show all models, including models with less few human ratings (< 500 votes)",
505
  interactive=True
506
  )
507
  stats_display = gr.Markdown()
 
509
  headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
510
  datatype=["str", "number", "str", "number", "str", "str", "str"],
511
  )
 
 
 
 
512
 
513
+ # Update refresh_leaderboard to use the checkbox value
514
+ def refresh_leaderboard(show_preliminary):
515
+ """Refresh the leaderboard data and stats."""
516
+ leaderboard = get_leaderboard(show_preliminary)
517
+ data = [
518
+ [
519
+ entry["Model"],
520
+ float(entry["ELO Score"]),
521
+ entry["95% CI"],
522
+ entry["# Votes"],
523
+ entry["Organization"],
524
+ entry["License"],
525
+ ]
526
+ for entry in leaderboard
527
+ ]
528
+ stats = get_leaderboard_stats()
529
+ return [gr.update(value=data), gr.update(value=stats)]
530
 
531
  # Add change handler for checkbox
532
  show_preliminary.change(
 
544
 
545
  with gr.TabItem("Policy"):
546
  gr.Markdown(POLICY_CONTENT)
 
547
 
548
  # Define state variables for model tracking
549
  model_a_state = gr.State()
550
  model_b_state = gr.State()
551
  final_prompt_state = gr.State()
552
+
553
+ # Update variable inputs based on the eval prompt
554
+ def update_variables(eval_prompt):
555
+ variables = parse_variables(eval_prompt)
556
+ updates = []
557
+
558
+ for i in range(len(variable_rows)):
559
+ var_row, var_input = variable_rows[i]
560
+ if i < len(variables):
561
+ var_name = variables[i]
562
+ # Set the number of lines based on the variable name
563
+ if var_name == "response":
564
+ lines = 4 # Adjust this number as needed
565
+ else:
566
+ lines = 1 # Default to single line for other variables
567
+ updates.extend(
568
+ [
569
+ gr.update(visible=True), # Show the variable row
570
+ gr.update(
571
+ label=var_name, visible=True, lines=lines
572
+ ), # Update label and lines
573
+ ]
574
+ )
575
+ else:
576
+ updates.extend(
577
+ [
578
+ gr.update(visible=False), # Hide the variable row
579
+ gr.update(value="", visible=False), # Clear value when hidden
580
+ ]
581
+ )
582
+ return updates
583
+
584
+ #eval_prompt.change(
585
+ # fn=update_variables,
586
+ # inputs=eval_prompt,
587
+ # outputs=[item for sublist in variable_rows for item in sublist],
588
+ #)
589
+
590
+ # Regenerate button functionality
591
+ #regenerate_button.click(
592
+ # fn=regenerate_prompt,
593
+ # inputs=[model_a_state, model_b_state, eval_prompt, human_input, ai_response],
594
+ # outputs=[
595
+ # score_a,
596
+ # critique_a,
597
+ # score_b,
598
+ # critique_b,
599
+ # vote_a,
600
+ # vote_b,
601
+ # tie_button_row,
602
+ # model_name_a,
603
+ # model_name_b,
604
+ # model_a_state,
605
+ # model_b_state,
606
+ # ],
607
+ #)
608
 
609
  # Update model names after responses are generated
610
  def update_model_names(model_a, model_b):
 
619
  vote_a.click(
620
  fn=vote,
621
  inputs=[
622
+ gr.State("A"), # Choice
623
  model_a_state,
624
  model_b_state,
625
  final_prompt_state,
 
631
  outputs=[
632
  vote_a,
633
  vote_b,
634
+ tie_button_row,
635
  model_name_a,
636
  model_name_b,
637
  send_btn,
638
+ spacing_div,
 
639
  ],
640
  )
641
 
642
  vote_b.click(
643
  fn=vote,
644
  inputs=[
645
+ gr.State("B"), # Choice
646
  model_a_state,
647
  model_b_state,
648
  final_prompt_state,
 
654
  outputs=[
655
  vote_a,
656
  vote_b,
657
+ tie_button_row,
658
  model_name_a,
659
  model_name_b,
660
  send_btn,
661
+ spacing_div,
 
662
  ],
663
  )
664
 
665
  vote_tie.click(
666
  fn=vote,
667
  inputs=[
668
+ gr.State("Tie"), # Choice
669
  model_a_state,
670
  model_b_state,
671
  final_prompt_state,
 
677
  outputs=[
678
  vote_a,
679
  vote_b,
680
+ tie_button_row,
681
  model_name_a,
682
  model_name_b,
683
  send_btn,
684
+ spacing_div,
 
685
  ],
686
  )
687
 
688
+ # Update the send button handler to store the submitted inputs
689
+ def submit_and_store(prompt, *variables):
690
+ # Create a copy of the current submission
691
+ current_submission = {"prompt": prompt, "variables": variables}
692
+
693
+ # Get the responses
694
+ (
695
+ response_a,
696
+ response_b,
697
+ buttons_visible,
698
+ regen_visible,
699
+ model_a,
700
+ model_b,
701
+ final_prompt,
702
+ ) = submit_prompt(prompt, *variables)
703
 
704
+ # Parse the responses
705
+ score_a, critique_a = parse_model_response(response_a)
706
+ score_b, critique_b = parse_model_response(response_b)
707
 
708
+ # Format scores with "/ 5"
709
+ score_a = f"{score_a} / 5"
710
+ score_b = f"{score_b} / 5"
 
 
 
711
 
712
+ # Update the last_submission state with the current values
713
+ last_submission.value = current_submission
 
 
 
 
 
 
 
 
 
714
 
715
+ return (
 
716
  score_a,
717
  critique_a,
718
  score_b,
719
  critique_b,
720
+ gr.update(visible=True), # vote_a
721
+ gr.update(visible=True), # vote_b
722
+ gr.update(visible=True), # tie_button_row
 
723
  model_a,
 
 
 
 
 
724
  model_b,
725
+ final_prompt, # Add final_prompt to state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
726
  gr.update(value="*Model: Hidden*"),
727
  gr.update(value="*Model: Hidden*"),
728
+ # Change the button to "Regenerate" mode after evaluation
729
+ gr.update(
730
+ value="Regenerate with different models",
731
+ variant="secondary",
732
+ interactive=True
733
+ ),
734
+ gr.update(visible=False), # spacing_div
735
  )
736
 
 
 
 
 
 
 
 
 
 
 
 
 
 
737
  send_btn.click(
738
  fn=submit_and_store,
739
+ inputs=[eval_prompt, human_input, ai_response],
 
740
  outputs=[
741
  score_a,
742
  critique_a,
 
744
  critique_b,
745
  vote_a,
746
  vote_b,
747
+ tie_button_row,
748
  model_a_state,
749
  model_b_state,
750
  final_prompt_state,
751
  model_name_a,
752
  model_name_b,
753
  send_btn,
754
+ spacing_div,
755
  ],
756
  )
757
 
758
+ # Update the input change handlers to also disable regenerate button
759
+ def handle_input_changes(prompt, *variables):
760
+ """Enable send button and manage regenerate button based on input changes"""
761
+ last_inputs = last_submission.value
762
+ current_inputs = {"prompt": prompt, "variables": variables}
763
+ inputs_changed = last_inputs != current_inputs
764
+ return [
765
+ gr.update(interactive=True), # send button always enabled
766
+ gr.update(
767
+ interactive=not inputs_changed
768
+ ), # regenerate button disabled if inputs changed
769
+ ]
770
+
771
+ # Update the change handlers for prompt and variables
772
+ #eval_prompt.change(
773
+ # fn=handle_input_changes,
774
+ # inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
775
+ # outputs=[send_btn, regenerate_button],
776
+ #)
777
+
778
+ # for _, var_input in variable_rows:
779
+ # var_input.change(
780
+ # fn=handle_input_changes,
781
+ # inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
782
+ # outputs=[send_btn, regenerate_button],
783
+ # )
784
+
785
+ # Add click handlers for metric buttons
786
+ #outputs_list = [eval_prompt] + [var_input for _, var_input in variable_rows]
787
+
788
+ #custom_btn.click(fn=lambda: set_example_metric("Custom"), outputs=outputs_list)
789
+
790
+ #hallucination_btn.click(
791
+ # fn=lambda: set_example_metric("Hallucination"), outputs=outputs_list
792
+ #)
793
+
794
+ #precision_btn.click(fn=lambda: set_example_metric("Precision"), outputs=outputs_list)
795
+
796
+ #recall_btn.click(fn=lambda: set_example_metric("Recall"), outputs=outputs_list)
797
+
798
+ #coherence_btn.click(
799
+ # fn=lambda: set_example_metric("Logical_Coherence"), outputs=outputs_list
800
+ #)
801
+
802
+ #faithfulness_btn.click(
803
+ # fn=lambda: set_example_metric("Faithfulness"), outputs=outputs_list
804
+ #)
805
+
806
+ # Set default metric at startup
807
+ demo.load(
808
+ #fn=lambda: set_example_metric("Hallucination"),
809
+ #outputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
810
+ )
811
+
812
  # Add random button handler
813
  random_btn.click(
814
  fn=populate_random_example,
815
+ inputs=[],
816
+ outputs=[human_input, ai_response]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
817
  )
818
 
819
  # Add new input change handlers
820
  def handle_input_change():
821
+ return gr.update(value="Run the evaluators", variant="primary")
 
 
 
 
 
 
 
822
 
823
  # Update the change handlers for inputs
824
  human_input.change(
825
  fn=handle_input_change,
826
  inputs=[],
827
+ outputs=[send_btn]
828
  )
829
 
830
  ai_response.change(
831
  fn=handle_input_change,
832
  inputs=[],
833
+ outputs=[send_btn]
 
834
  )
835
 
836
  # Update the demo.load to include the random example population
837
  demo.load(
838
+ fn=populate_random_example,
839
  inputs=[],
840
+ outputs=[human_input, ai_response]
 
841
  )
842
 
843
  if __name__ == "__main__":
844
  demo.launch()
common.py CHANGED
@@ -37,7 +37,7 @@ CSS_STYLES = """
37
  gap: 8px;
38
  }
39
  """
40
-
41
  # Default Eval Prompt
42
  EVAL_DESCRIPTION = """
43
  ## 📝 Tips
@@ -47,6 +47,27 @@ EVAL_DESCRIPTION = """
47
  - Examples (Optional)
48
  """
49
 
50
  # Voting Section Header
51
  VOTING_HEADER = """
52
  # Start Voting Now
@@ -68,50 +89,55 @@ POLICY_CONTENT = """
68
 
69
  Atla is an applied research organization that trains models as evaluators to capture human preferences. We're a team of researchers, engineers, and operational leaders, with experience spanning a variety of disciplines, all working together to build reliable and understandable AI systems. Our research is informed by our experiences conducting AI safety research at the UK AI Task Force, OpenAI and the Stanford Existential Risks Initiative.
70
  <br><br>
71
- # [Our Mission](https://www.atla-ai.com/company)
72
 
73
- By creating advanced evaluation models, we enable AI developers to identify and fix risks, leading to safer, more reliable AI that can be trusted and widely used. Our aim is to surpass the current state-of-the-art evaluation methods by training models specifically for evaluation. AIs will probably become very powerful, and perform tasks that are difficult for us to verify. We want to enable humans to oversee AI systems that are solving tasks too difficult for humans to evaluate.
74
- Read more about [our approach to scalable oversight](https://www.atla-ai.com/post/scaling-alignment) on our blog.
75
  <br><br>
76
  # Judge Arena Policy
77
 
78
  ## Overview
79
 
80
- Judge Arena is an open-source platform dedicated to determining which models make the best judges. Users can run evals and assess anonymized responses from two competing model judges, choosing the better judgement or declaring a tie. This policy outlines our commitments to maintain a fair and open environment :)
81
 
82
  ## Transparency
83
 
84
  - **Open-Source**: Judge Arena's code is open-source and available on GitHub. We encourage contributions from the community and anyone can replicate or modify the platform to suit their needs. We use proprietary model provider APIs where provided and Together AI's API to serve leading open-source models.
85
- - **Methodology**: All processes related to model evaluation, rating calculations, and model selection are openly documented.
86
  - **Data Sharing**: Periodically, we'll share 20% of the collected evaluation data with the community. The data collected from Judge Arena is restricted to an anonymized user ID, the final prompt sent, the model responses, the user vote, and the timestamp.
87
 
88
  ## Model Inclusion Criteria
89
 
90
  Judge Arena is specifically designed to assess AI models that function as evaluators (a.k.a judges). This includes but is not limited to powerful general-purpose models and the latest language models designed for evaluation tasks. Models are eligible for inclusion if they meet the following criteria:
91
 
92
- - **Judge Capability**: The model should possess the ability to score AND critique other models' outputs effectively.
93
- - **Promptable:** The model must be promptable to be evaluate in different scoring formats, for different criteria.
94
  - **Accessibility**:
95
  - **Public API Access**: Models accessible through public APIs without restrictive barriers.
96
  - **Open-Source Models**: Models with publicly available weights that can be downloaded and run by the community.
97
 
98
  ## Leaderboard Management
99
 
100
- - **ELO Ranking System**: Models are ranked on a public leaderboard based on aggregated user evaluations. We use an ELO rating system to rank AI judges on the public leaderboard. Each model begins with an initial rating of 1200, and we use a K-factor of 32 to determine the maximum rating adjustment after each evaluation.
101
  - **Minimum Period**: Listed models remain accessible on Judge Arena for a minimum period of two weeks so they can be comprehensively evaluated.
102
  - **Deprecation Policy**: Models may be removed from the leaderboard if they become inaccessible or are no longer publicly available.
103
 
104
- *This policy might be updated to reflect changes in our practices or in response to community feedback.*
105
- <br><br>
106
  # FAQ
107
 
108
  **Isn't this the same as Chatbot Arena?**
109
 
110
  We are big fans of what the LMSYS team have done with Chatbot Arena and fully credit them for the inspiration to develop this. We were looking for a dynamic leaderboard that graded on AI judge capabilities and didn't manage to find one, so we created Judge Arena. This UI is designed especially for evals; to match the format of the model-based eval prompts that you would use in your LLM evaluation / monitoring tool.
111
 
 
 
 
 
 
 
112
  **Why should I trust this leaderboard?**
113
 
114
- We have listed out our efforts to be fully transparent in the policies above. All of the code for this leaderboard is open-source and can be found on our [Github](https://github.com/atla-ai/judge-arena). Check out our [blog](https://www.atla-ai.com/blog) to stay up to date as we analyse the results from the leaderboard.
115
 
116
  **Who funds this effort?**
117
 
@@ -122,5 +148,4 @@ Atla currently funds this out of our own pocket. We are looking for API credits
122
  We are training a general-purpose evaluator that you will soon be able to run in this Judge Arena. Our next step will be to open-source a powerful model that the community can use to run fast and accurate evaluations.
123
  <br><br>
124
  # Get in touch
125
- We’d love to hear your feedback! For general feature requests or to submit / suggest new models to add to the arena, please open up a discussion in the [community](https://huggingface.co/spaces/AtlaAI/judge-arena/discussions) tab. You can also contact us directly on [X](https://x.com/Atla_AI) or [Discord](https://discord.com/invite/qFCMgkGwUK).
126
- \nPlease file any issues on our [Github](https://github.com/atla-ai/judge-arena)."""
 
37
  gap: 8px;
38
  }
39
  """
40
+
41
  # Default Eval Prompt
42
  EVAL_DESCRIPTION = """
43
  ## 📝 Tips
 
47
  - Examples (Optional)
48
  """
49
 
50
+ DEFAULT_EVAL_PROMPT = """You are assessing a chat bot response to a user's input based on how well it follows the user's instructions. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Do not allow the length of the response to influence your evaluation. Be objective as possible and give a brief explanation for your score.
51
+
52
+ Score:
53
+ Score 1: The response ignores or misinterprets instructions, providing irrelevant or inaccurate content that fails to address the request.
54
+ Score 2: The response follows instructions partially but misses key elements, lacking depth or precision while containing minor inaccuracies.
55
+ Score 3: The response follows main instructions adequately, providing correct and relevant information with reasonable depth.
56
+ Score 4: The response follows instructions thoroughly with strong attention to detail, offering accurate, well-developed content that thoughtfully addresses needs.
57
+ Score 5: The response demonstrates exceptional instruction following with precise, comprehensive content that shows both insight and perfect alignment with the request.
58
+
59
+ [User Query]: {{input}}
60
+
61
+ [Response]: {{response}}"""
62
+
63
+ # Default Variable Values
64
+ DEFAULT_INPUT = """Which of these animals is least likely to be found in a rainforest?"
65
+ A) Jaguar
66
+ B) Toucan
67
+ C) Polar Bear
68
+ D) Sloth"""
69
+ DEFAULT_RESPONSE = "C) Polar Bear"
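For reference, here is a minimal sketch of how the `{{input}}` / `{{response}}` placeholders in `DEFAULT_EVAL_PROMPT` could be filled with the default sample values above before the prompt is sent to a judge model. The `fill_eval_prompt` helper is illustrative only and is not part of this changeset.

```python
# Illustrative sketch: substitute the {{...}} placeholders in DEFAULT_EVAL_PROMPT
# with the default sample values defined above. Assumes DEFAULT_EVAL_PROMPT,
# DEFAULT_INPUT and DEFAULT_RESPONSE are in scope (e.g. imported from common).
def fill_eval_prompt(template: str, values: dict) -> str:
    for key, value in values.items():
        template = template.replace("{{" + key + "}}", value)
    return template

final_prompt = fill_eval_prompt(
    DEFAULT_EVAL_PROMPT,
    {"input": DEFAULT_INPUT, "response": DEFAULT_RESPONSE},
)
print(final_prompt)  # rubric followed by the filled-in [User Query] and [Response]
```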
70
+
71
  # Voting Section Header
72
  VOTING_HEADER = """
73
  # Start Voting Now
 
89
 
90
  Atla is an applied research organization that trains models as evaluators to capture human preferences. We're a team of researchers, engineers, and operational leaders, with experience spanning a variety of disciplines, all working together to build reliable and understandable AI systems. Our research is informed by our experiences conducting AI safety research at the UK AI Task Force, OpenAI and the Stanford Existential Risks Initiative.
91
  <br><br>
92
+ # Our Mission
93
 
94
+ By creating advanced evaluation models, we enable AI developers to identify and fix risks, leading to safer, more reliable AI that can be trusted and widely used. Our aim is to surpass the current state-of-the-art evaluation methods by training models specifically for evaluation. AIs will probably become very powerful, and perform tasks that are difficult for us to verify. We want to enable humans to oversee AI systems that are solving tasks too difficult for humans to evaluate. We have written more about [our approach to scalable oversight](https://www.atla-ai.com/post/scaling-alignment) on our blog.
 
95
  <br><br>
96
  # Judge Arena Policy
97
 
98
  ## Overview
99
 
100
+ Judge Arena is an open-source platform dedicated to improving the standard of evaluation of generative AI models in their role as judges. Users can run evals and assess anonymized responses from two competing model judges, choosing the better judgement or declaring a tie. This policy outlines our commitments to maintain a fair, open, and collaborative environment :)
101
 
102
  ## Transparency
103
 
104
  - **Open-Source**: Judge Arena's code is open-source and available on GitHub. We encourage contributions from the community and anyone can replicate or modify the platform to suit their needs. We use proprietary model provider APIs where provided and Together AI's API to serve leading open-source models.
105
+ - **Methodology**: All processes related to model evaluation, rating calculations, and model selection are openly documented. We'd like to ensure that our ranking system is understandable and reproducible by others!
106
  - **Data Sharing**: Periodically, we'll share 20% of the collected evaluation data with the community. The data collected from Judge Arena is restricted to an anonymized user ID, the final prompt sent, the model responses, the user vote, and the timestamp.
107
 
108
  ## Model Inclusion Criteria
109
 
110
  Judge Arena is specifically designed to assess AI models that function as evaluators (a.k.a judges). This includes but is not limited to powerful general-purpose models and the latest language models designed for evaluation tasks. Models are eligible for inclusion if they meet the following criteria:
111
 
112
+ - **Judge Capability**: The model should possess the ability to score AND critique responses, content, or other models' outputs effectively.
113
+ - **Adaptable:** The model must be promptable to evaluate in different scoring formats, for different criteria.
114
  - **Accessibility**:
115
  - **Public API Access**: Models accessible through public APIs without restrictive barriers.
116
  - **Open-Source Models**: Models with publicly available weights that can be downloaded and run by the community.
117
 
118
  ## Leaderboard Management
119
 
120
+ - **ELO Ranking System**: Models are ranked on the public leaderboard using an ELO rating system based on aggregated user evaluations. Each model begins with an initial rating of 1500 (as is used by the International Chess Federation), and we use a K-factor of 32 to determine the maximum rating adjustment after each evaluation (see the sketch after this list).
121
  - **Minimum Period**: Listed models remain accessible on Judge Arena for a minimum period of two weeks so they can be comprehensively evaluated.
122
  - **Deprecation Policy**: Models may be removed from the leaderboard if they become inaccessible or are no longer publicly available.
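A minimal sketch of the Elo update described in the **ELO Ranking System** bullet above, assuming the stated initial rating of 1500 and K-factor of 32; the function names are illustrative and not part of this changeset.

```python
# Illustrative Elo update for a single vote, using the parameters stated in the
# policy above (initial rating 1500, K-factor 32). Names are illustrative only.
INITIAL_RATING = 1500
K_FACTOR = 32

def expected_score(rating_a: float, rating_b: float) -> float:
    # Probability that judge A beats judge B under the Elo model
    return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

def update_ratings(rating_a: float, rating_b: float, winner: str) -> tuple[float, float]:
    # winner is "A", "B", or "Tie"
    score_a = 1.0 if winner == "A" else 0.0 if winner == "B" else 0.5
    exp_a = expected_score(rating_a, rating_b)
    new_a = rating_a + K_FACTOR * (score_a - exp_a)
    new_b = rating_b + K_FACTOR * ((1.0 - score_a) - (1.0 - exp_a))
    return new_a, new_b

# Example: two new judges, A wins the vote -> (1516.0, 1484.0)
print(update_ratings(INITIAL_RATING, INITIAL_RATING, "A"))
```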
123
 
124
+ This policy might be updated to reflect changes in our practices or in response to community feedback.
125
+
126
  # FAQ
127
 
128
  **Isn't this the same as Chatbot Arena?**
129
 
130
  We are big fans of what the LMSYS team have done with Chatbot Arena and fully credit them for the inspiration to develop this. We were looking for a dynamic leaderboard that graded on AI judge capabilities and didn't manage to find one, so we created Judge Arena. This UI is designed especially for evals; to match the format of the model-based eval prompts that you would use in your LLM evaluation / monitoring tool.
131
 
132
+ **What are the Evaluator Prompt Templates based on?**
133
+
134
+ As a quick start, we've set up templates that cover the most popular evaluation metrics used in LLM evaluation / monitoring tools, often known as 'base metrics'. The data samples used in these were randomly picked from popular academic datasets - [ARC](https://huggingface.co/datasets/allenai/ai2_arc), [Preference Collection](https://huggingface.co/datasets/prometheus-eval/Preference-Collection), [RewardBench](https://huggingface.co/datasets/allenai/reward-bench), [RAGTruth](https://arxiv.org/abs/2401.00396).
135
+
136
+ These templates are designed as a starting point to showcase how to interact with the Judge Arena, especially for those less familiar with using LLM judges.
137
+
138
  **Why should I trust this leaderboard?**
139
 
140
+ We have listed out our efforts to be fully transparent in the policies above. All of the code for this leaderboard is open-source and can be found on our [Github](https://github.com/atla-ai/judge-arena).
141
 
142
  **Who funds this effort?**
143
 
 
148
  We are training a general-purpose evaluator that you will soon be able to run in this Judge Arena. Our next step will be to open-source a powerful model that the community can use to run fast and accurate evaluations.
149
  <br><br>
150
  # Get in touch
151
+ Feel free to email us at [support@atla-ai.com](mailto:support@atla-ai.com) or leave feedback on our [Github](https://github.com/atla-ai/judge-arena)!"""
 
data/models.jsonl CHANGED
@@ -1,28 +1,16 @@
1
- {"name": "Meta Llama 3.1 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", "active": false}
2
- {"name": "Meta Llama 3.1 405B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", "active": true}
3
- {"name": "Gemma 2 27B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-27b-it", "active": true}
4
- {"name": "Gemma 2 9B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-9b-it", "active": true}
5
- {"name": "Qwen 2 Instruct (72B)", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2-72B-Instruct", "active": true}
6
- {"name": "Mistral (7B) Instruct v0.3", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.3", "active": true}
7
- {"name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o", "active": true}
8
- {"name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo", "active": true}
9
- {"name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo", "active": true}
10
- {"name": "Claude 3 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-haiku-20240307", "active": true}
11
- {"name": "Claude 3 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-sonnet-20240229", "active": true}
12
- {"name": "Claude 3 Opus", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-opus-latest", "active": true}
13
- {"name": "Meta Llama 3.1 8B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", "active": true}
14
- {"name": "Qwen 2.5 72B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo", "active": true}
15
- {"name": "Qwen 2.5 7B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo", "active": true}
16
- {"name": "Mistral (7B) Instruct v0.1", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.1", "active": true}
17
- {"name": "Claude 3.5 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-sonnet-latest", "active": true}
18
- {"name": "Claude 3.5 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-haiku-latest", "active": true}
19
- {"name": "Prometheus-7b v2", "organization": "Prometheus", "license": "Open Source", "api_model": "prometheus/prometheus-7b-v2", "active": false}
20
- {"name": "Command-R", "organization": "Cohere", "license": "Proprietary", "api_model": "command-r", "active": true}
21
- {"name": "Command-R Plus", "organization": "Cohere", "license": "Proprietary", "api_model": "command-r-plus", "active": true}
22
- {"name": "Atla-8B-preview", "organization": "Atla", "license": "Open Source", "api_model": "Atla-8B-preview", "active": false}
23
- {"name": "Meta Llama 3.3 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", "active": true}
24
- {"name": "QwQ 32B Preview", "organization": "Qwen", "license": "Open Source", "api_model": "Qwen/QwQ-32B-Preview", "active": true}
25
- {"name": "Flow-Judge-v0.1", "organization": "Flow AI", "license": "Open Source", "api_model": "Flow-Judge-v0.1-4.65bpw-exl2", "active": false}
26
- {"name": "SFR-LLaMA-3.1-70B-Judge", "organization": "Salesforce", "license": "Proprietary", "api_model": "sfr-llama-3.1-70b-judge", "active": true}
27
- {"name": "Atla Selene 1 Mini", "organization": "Atla", "license": "Open Source", "api_model": "Atla-Selene-Mini", "active": true}
28
- {"name": "Atla Selene 1", "organization": "Atla", "license": "Proprietary", "api_model": "Atla-Selene", "active": true}
 
1
+ {"name": "Meta Llama 3.1 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"}
2
+ {"name": "Meta Llama 3.1 405B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"}
3
+ {"name": "Gemma 2 27B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-27b-it"}
4
+ {"name": "Gemma 2 9B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-9b-it"}
5
+ {"name": "Qwen 2 Instruct (72B)", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2-72B-Instruct"}
6
+ {"name": "Mistral (7B) Instruct v0.3", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.3"}
7
+ {"name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o"}
8
+ {"name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo"}
9
+ {"name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo"}
10
+ {"name": "Claude 3 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-haiku-20240307"}
11
+ {"name": "Claude 3 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-sonnet-20240229"}
12
+ {"name": "Claude 3 Opus", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-opus-20240229"}
13
+ {"name": "Meta Llama 3.1 8B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"}
14
+ {"name": "Qwen 2.5 72B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo"}
15
+ {"name": "Qwen 2.5 7B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo"}
16
+ {"name": "Mistral (7B) Instruct v0.1", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.1"}
 
 
 
 
 
 
 
 
 
 
 
 
gen_api_answer.py CHANGED
@@ -1,234 +1,95 @@
1
  from openai import OpenAI
2
  import anthropic
3
  from together import Together
4
- import cohere
5
  import json
6
  import re
7
- import os
8
- import requests
9
- from prompts import (
10
- JUDGE_SYSTEM_PROMPT,
11
- PROMETHEUS_PROMPT,
12
- PROMETHEUS_PROMPT_WITH_REFERENCE,
13
- ATLA_PROMPT,
14
- ATLA_PROMPT_WITH_REFERENCE,
15
- FLOW_JUDGE_PROMPT
16
- )
17
- from transformers import AutoTokenizer
18
- from atla import Atla
19
 
20
  # Initialize clients
21
  anthropic_client = anthropic.Anthropic()
22
  openai_client = OpenAI()
23
  together_client = Together()
24
- hf_api_key = os.getenv("HF_API_KEY")
25
- flow_judge_api_key = os.getenv("FLOW_JUDGE_API_KEY")
26
- cohere_client = cohere.ClientV2(os.getenv("CO_API_KEY"))
27
- salesforce_api_key = os.getenv("SALESFORCE_API_KEY")
28
 
29
- # Initialize Atla client
30
- atla_client = Atla()
 
31
 
32
- def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
 
33
  """Get response from OpenAI API"""
34
  try:
35
  response = openai_client.chat.completions.create(
36
  model=model_name,
37
  messages=[
38
- {"role": "system", "content": system_prompt},
39
  {"role": "user", "content": prompt},
40
  ],
41
- max_completion_tokens=max_tokens,
42
- temperature=temperature,
43
  )
44
  return response.choices[0].message.content
45
  except Exception as e:
46
  return f"Error with OpenAI model {model_name}: {str(e)}"
47
 
48
- def get_anthropic_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
 
49
  """Get response from Anthropic API"""
50
  try:
51
  response = anthropic_client.messages.create(
52
  model=model_name,
53
- max_tokens=max_tokens,
54
- temperature=temperature,
55
- system=system_prompt,
56
  messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
57
  )
58
  return response.content[0].text
59
  except Exception as e:
60
  return f"Error with Anthropic model {model_name}: {str(e)}"
61
 
62
- def get_together_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
 
63
  """Get response from Together API"""
64
  try:
65
  response = together_client.chat.completions.create(
66
  model=model_name,
67
  messages=[
68
- {"role": "system", "content": system_prompt},
69
  {"role": "user", "content": prompt},
70
  ],
71
- max_tokens=max_tokens,
72
- temperature=temperature,
73
  stream=False,
74
  )
75
  return response.choices[0].message.content
76
  except Exception as e:
77
  return f"Error with Together model {model_name}: {str(e)}"
78
 
79
- def get_prometheus_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
80
- """Get response from Hugging Face model"""
81
- try:
82
- headers = {
83
- "Accept": "application/json",
84
- "Authorization": f"Bearer {hf_api_key}",
85
- "Content-Type": "application/json"
86
- }
87
-
88
- # Create messages list for chat template
89
- messages = []
90
- if system_prompt:
91
- messages.append({"role": "system", "content": system_prompt})
92
- messages.append({"role": "user", "content": prompt})
93
-
94
- # Apply chat template
95
- model_id = "prometheus-eval/prometheus-7b-v2.0"
96
- tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_api_key)
97
- formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
98
-
99
- payload = {
100
- "inputs": formatted_prompt,
101
- "parameters": {
102
- "max_new_tokens": max_tokens,
103
- "return_full_text": False,
104
- "temperature": temperature
105
- }
106
- }
107
-
108
- response = requests.post(
109
- "https://otb7jglxy6r37af6.us-east-1.aws.endpoints.huggingface.cloud",
110
- headers=headers,
111
- json=payload
112
- )
113
- return response.json()[0]["generated_text"]
114
- except Exception as e:
115
- return f"Error with Hugging Face model {model_name}: {str(e)}"
116
-
117
- def get_atla_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
118
- """Get response from Atla API"""
119
- try:
120
- # Extract components from the prompt data
121
- model_input = prompt.get('human_input', '')
122
- model_output = prompt.get('ai_response', '')
123
- expected_output = prompt.get('ground_truth_input', '')
124
- evaluation_criteria = prompt.get('eval_criteria', '')
125
-
126
- # Set model_id based on the model name
127
- if "Mini" in model_name:
128
- model_id = "atla-selene-mini"
129
- else:
130
- model_id = "atla-selene"
131
 
132
- response = atla_client.evaluation.create(
133
- model_id=model_id,
134
- model_input=model_input,
135
- model_output=model_output,
136
- expected_model_output=expected_output if expected_output else None,
137
- evaluation_criteria=evaluation_criteria,
138
- )
139
-
140
- # Return the score and critique directly
141
- return {
142
- "score": response.result.evaluation.score,
143
- "critique": response.result.evaluation.critique
144
- }
145
- except Exception as e:
146
- return f"Error with Atla model {model_name}: {str(e)}"
147
-
148
- def get_flow_judge_response(model_name, prompt, max_tokens=2048, temperature=0.1, top_p=0.95) -> str:
149
- """Get response from Flow Judge"""
150
- try:
151
- response = requests.post(
152
- "https://arena.flow-ai.io/v1/chat/completions",
153
- headers={
154
- "Content-Type": "application/json",
155
- "Authorization": f"Bearer {flow_judge_api_key}"
156
- },
157
- json={
158
- "model": model_name,
159
- "messages": [
160
- {"role": "user", "content": prompt}
161
- ],
162
- "max_tokens": max_tokens,
163
- "temperature": temperature,
164
- "top_p": top_p,
165
- "stop": None
166
- }
167
- )
168
- response.raise_for_status()
169
- return response.json()["choices"][0]['message']['content']
170
- except Exception as e:
171
- return f"Error with Flow Judge completions model {model_name}: {str(e)}"
172
-
173
- def get_cohere_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
174
- """Get response from Cohere API"""
175
- try:
176
- response = cohere_client.chat(
177
- model=model_name,
178
- messages=[
179
- {"role": "system", "content": system_prompt},
180
- {"role": "user", "content": prompt}
181
- ],
182
- max_tokens=max_tokens,
183
- temperature=temperature
184
- )
185
- # Extract the text from the content items
186
- content_items = response.message.content
187
- if isinstance(content_items, list):
188
- # Get the text from the first content item
189
- return content_items[0].text
190
- return str(content_items) # Fallback if it's not a list
191
- except Exception as e:
192
- return f"Error with Cohere model {model_name}: {str(e)}"
193
-
194
- def get_salesforce_response(model_name, prompt, system_prompt=None, max_tokens=2048, temperature=0):
195
- """Get response from Salesforce Research API"""
196
- try:
197
- headers = {
198
- 'accept': 'application/json',
199
- "content-type": "application/json",
200
- "X-Api-Key": salesforce_api_key,
201
- }
202
-
203
- # Create messages list
204
- messages = []
205
- messages.append({"role": "user", "content": prompt})
206
-
207
- json_data = {
208
- "prompts": messages,
209
- "temperature": temperature,
210
- "top_p": 1,
211
- "max_tokens": max_tokens,
212
- }
213
-
214
- response = requests.post(
215
- 'https://gateway.salesforceresearch.ai/sfr-judge/process',
216
- headers=headers,
217
- json=json_data
218
- )
219
- response.raise_for_status()
220
- return response.json()['result'][0]
221
- except Exception as e:
222
- return f"Error with Salesforce model {model_name}: {str(e)}"
223
-
224
- def get_model_response(
225
- model_name,
226
- model_info,
227
- prompt_data,
228
- use_reference=False,
229
- max_tokens=500,
230
- temperature=0
231
- ):
232
  """Get response from appropriate API based on model organization"""
233
  if not model_info:
234
  return "Model not found or unsupported."
@@ -236,261 +97,37 @@ def get_model_response(
236
  api_model = model_info["api_model"]
237
  organization = model_info["organization"]
238
 
239
- # Determine if model is Prometheus, Atla, Flow Judge, or Salesforce
240
- is_prometheus = (organization == "Prometheus")
241
- is_atla = (organization == "Atla")
242
- is_flow_judge = (organization == "Flow AI")
243
- is_salesforce = (organization == "Salesforce")
244
-
245
- # For non-Prometheus/Atla/Flow Judge/Salesforce models, use the Judge system prompt
246
- system_prompt = None if (is_prometheus or is_atla or is_flow_judge or is_salesforce) else JUDGE_SYSTEM_PROMPT
247
-
248
- # Select the appropriate base prompt
249
- if is_atla or is_salesforce: # Use same prompt for Atla and Salesforce
250
- base_prompt = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT
251
- elif is_flow_judge:
252
- base_prompt = FLOW_JUDGE_PROMPT
253
- else:
254
- base_prompt = PROMETHEUS_PROMPT_WITH_REFERENCE if use_reference else PROMETHEUS_PROMPT
255
-
256
- # For non-Prometheus/non-Atla/non-Salesforce models, use Prometheus but replace the output format with JSON
257
- if not (is_prometheus or is_atla or is_flow_judge or is_salesforce):
258
- base_prompt = base_prompt.replace(
259
- '3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"',
260
- '3. Your output format should strictly adhere to JSON as follows: {{"feedback": "<write feedback>", "result": <numerical score>}}. Ensure the output is valid JSON, without additional formatting or explanations.'
261
- )
262
-
263
- try:
264
- if not is_flow_judge:
265
- # Format the prompt with the provided data, only using available keys
266
- final_prompt = base_prompt.format(
267
- human_input=prompt_data['human_input'],
268
- ai_response=prompt_data['ai_response'],
269
- ground_truth_input=prompt_data.get('ground_truth_input', ''),
270
- eval_criteria=prompt_data['eval_criteria'],
271
- score1_desc=prompt_data['score1_desc'],
272
- score2_desc=prompt_data['score2_desc'],
273
- score3_desc=prompt_data['score3_desc'],
274
- score4_desc=prompt_data['score4_desc'],
275
- score5_desc=prompt_data['score5_desc']
276
- )
277
- else:
278
- human_input = f"<user_input>\n{prompt_data['human_input']}\n</user_input>"
279
- ai_response = f"<response>\n{prompt_data['ai_response']}\n</response>"
280
- ground_truth=prompt_data.get('ground_truth_input', '')
281
- if ground_truth:
282
- response_reference = f"<response_reference>\n{ground_truth}\n</response_reference>"
283
- else:
284
- response_reference = ""
285
- eval_criteria = prompt_data['eval_criteria']
286
- score1_desc = f"- Score 1: {prompt_data['score1_desc']}\n"
287
- score2_desc = f"- Score 2: {prompt_data['score2_desc']}\n"
288
- score3_desc = f"- Score 3: {prompt_data['score3_desc']}\n"
289
- score4_desc = f"- Score 4: {prompt_data['score4_desc']}\n"
290
- score5_desc = f"- Score 5: {prompt_data['score5_desc']}"
291
- rubric = score1_desc + score2_desc + score3_desc + score4_desc + score5_desc
292
- if response_reference:
293
- inputs = human_input + "\n"+ response_reference
294
- else:
295
- inputs = human_input
296
- final_prompt = base_prompt.format(
297
- INPUTS=inputs,
298
- OUTPUT=ai_response,
299
- EVALUATION_CRITERIA=eval_criteria,
300
- RUBRIC=rubric
301
- )
302
-
303
- except KeyError as e:
304
- return f"Error formatting prompt: Missing required field {str(e)}"
305
-
306
  try:
307
  if organization == "OpenAI":
308
- return get_openai_response(
309
- api_model, final_prompt, system_prompt, max_tokens, temperature
310
- )
311
  elif organization == "Anthropic":
312
- return get_anthropic_response(
313
- api_model, final_prompt, system_prompt, max_tokens, temperature
314
- )
315
- elif organization == "Prometheus":
316
- return get_prometheus_response(
317
- api_model, final_prompt, system_prompt, max_tokens, temperature = 0.01
318
- )
319
- elif organization == "Atla":
320
- response = get_atla_response(
321
- api_model, prompt_data, system_prompt, max_tokens, temperature
322
- )
323
- # Response now contains score and critique directly
324
- if isinstance(response, dict) and 'score' in response and 'critique' in response:
325
- score = str(response['score'])
326
- critique = response['critique']
327
- return score, critique
328
- else:
329
- return "Error", str(response)
330
- elif organization == "Cohere":
331
- return get_cohere_response(
332
- api_model, final_prompt, system_prompt, max_tokens, temperature
333
- )
334
- elif organization == "Flow AI":
335
- return get_flow_judge_response(
336
- api_model, final_prompt
337
- )
338
- elif organization == "Salesforce":
339
- response = get_salesforce_response(
340
- api_model, final_prompt, system_prompt, max_tokens, temperature
341
- )
342
- return response
343
  else:
344
  # All other organizations use Together API
345
- return get_together_response(
346
- api_model, final_prompt, system_prompt, max_tokens, temperature
347
- )
348
  except Exception as e:
349
  return f"Error with {organization} model {model_name}: {str(e)}"
350
 
 
351
  def parse_model_response(response):
352
  try:
353
  # Debug print
354
  print(f"Raw model response: {response}")
355
 
356
- # If response is already a tuple (from Atla/Salesforce), use it directly
357
- if isinstance(response, tuple):
358
- return response
359
-
360
- # If response is already a dictionary, use it directly
361
- if isinstance(response, dict):
362
- return str(response.get("result", "N/A")), response.get("feedback", "N/A")
363
-
364
  # First try to parse the entire response as JSON
365
  try:
366
  data = json.loads(response)
367
  return str(data.get("result", "N/A")), data.get("feedback", "N/A")
368
  except json.JSONDecodeError:
369
- # If that fails, check if this is a Salesforce response
370
- if "**Reasoning:**" in response or "**Result:**" in response:
371
- # Use ATLA parser for Salesforce responses only
372
- return salesforce_parse_model_response(response)
373
-
374
- # Otherwise try to find JSON within the response
375
- json_match = re.search(r"{.*}", response, re.DOTALL)
376
  if json_match:
377
  data = json.loads(json_match.group(0))
378
  return str(data.get("result", "N/A")), data.get("feedback", "N/A")
379
  else:
380
- return "Error", f"Invalid response format returned - here is the raw model response: {response}"
381
 
382
  except Exception as e:
383
  # Debug print for error case
384
  print(f"Failed to parse response: {str(e)}")
385
-
386
- # If the error message itself contains valid JSON, try to parse that
387
- try:
388
- error_json_match = re.search(r"{.*}", str(e), re.DOTALL)
389
- if error_json_match:
390
- data = json.loads(error_json_match.group(0))
391
- return str(data.get("result", "N/A")), data.get("feedback", "N/A")
392
- except:
393
- pass
394
-
395
  return "Error", f"Failed to parse response: {response}"
396
-
397
- def prometheus_parse_model_response(output):
398
- try:
399
- print(f"Raw model response: {output}")
400
- output = output.strip()
401
-
402
- # Remove "Feedback:" prefix if present (case insensitive)
403
- output = re.sub(r'^feedback:\s*', '', output, flags=re.IGNORECASE)
404
-
405
- # New pattern to match [RESULT] X at the beginning
406
- begin_result_pattern = r'^\[RESULT\]\s*(\d+)\s*\n*(.*?)$'
407
- begin_match = re.search(begin_result_pattern, output, re.DOTALL | re.IGNORECASE)
408
- if begin_match:
409
- score = int(begin_match.group(1))
410
- feedback = begin_match.group(2).strip()
411
- return str(score), feedback
412
-
413
- # Existing patterns for end-of-string results...
414
- pattern = r"(.*?)\s*\[RESULT\]\s*[\(\[]?(\d+)[\)\]]?"
415
- match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
416
- if match:
417
- feedback = match.group(1).strip()
418
- score = int(match.group(2))
419
- return str(score), feedback
420
-
421
- # If no match, try to match "... Score: X"
422
- pattern = r"(.*?)\s*(?:Score|Result)\s*:\s*[\(\[]?(\d+)[\)\]]?"
423
- match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
424
- if match:
425
- feedback = match.group(1).strip()
426
- score = int(match.group(2))
427
- return str(score), feedback
428
-
429
- # Pattern to handle [Score X] at the end
430
- pattern = r"(.*?)\s*\[(?:Score|Result)\s*[\(\[]?(\d+)[\)\]]?\]$"
431
- match = re.search(pattern, output, re.DOTALL)
432
- if match:
433
- feedback = match.group(1).strip()
434
- score = int(match.group(2))
435
- return str(score), feedback
436
-
437
- # Final fallback attempt
438
- pattern = r"[\(\[]?(\d+)[\)\]]?\s*\]?$"
439
- match = re.search(pattern, output)
440
- if match:
441
- score = int(match.group(1))
442
- feedback = output[:match.start()].rstrip()
443
- # Remove any trailing brackets from feedback
444
- feedback = re.sub(r'\s*\[[^\]]*$', '', feedback).strip()
445
- return str(score), feedback
446
-
447
- return "Error", f"Failed to parse response: {output}"
448
-
449
- except Exception as e:
450
- print(f"Failed to parse response: {str(e)}")
451
- return "Error", f"Exception during parsing: {str(e)}"
452
-
453
- def salesforce_parse_model_response(output):
454
- """Parse response from Salesforce model"""
455
- try:
456
- print(f"Raw Salesforce model response: {output}")
457
- output = output.strip()
458
-
459
- # Look for the Reasoning and Result sections
460
- reasoning_match = re.search(r'\*\*Reasoning:\*\*(.*?)(?=\*\*Result:|$)', output, re.DOTALL)
461
- result_match = re.search(r'\*\*Result:\*\*\s*(\d+)', output)
462
-
463
- if reasoning_match and result_match:
464
- feedback = reasoning_match.group(1).strip()
465
- score = result_match.group(1)
466
- return str(score), feedback
467
-
468
- return "Error", f"Failed to parse Salesforce response format: {output}"
469
-
470
- except Exception as e:
471
- print(f"Failed to parse Salesforce response: {str(e)}")
472
- return "Error", f"Exception during parsing: {str(e)}"
473
-
474
- def flow_judge_parse_model_response(output):
475
- try:
476
- print(f"Raw model response: {output}")
477
- # Convert multiple line breaks to single ones and strip whitespace
478
- output = re.sub(r'\n{2,}', '\n', output.strip())
479
-
480
- # Compile regex patterns
481
- feedback_pattern = re.compile(r"<feedback>\s*(.*?)\s*</feedback>", re.DOTALL)
482
- score_pattern = re.compile(r"<score>\s*(\d+)\s*</score>", re.DOTALL)
483
-
484
- feedback_match = feedback_pattern.search(output)
485
- score_match = score_pattern.search(output)
486
-
487
- if feedback_match or not score_match:
488
- feedback = feedback_match.group(1).strip()
489
- score = int(score_match.group(1).strip())
490
- return str(score), feedback
491
-
492
- return "Error", f"Failed to parse response: {output}"
493
-
494
- except Exception as e:
495
- print(f"Failed to parse response: {str(e)}")
496
- return "Error", f"Exception during parsing: {str(e)}"
 
1
  from openai import OpenAI
2
  import anthropic
3
  from together import Together
 
4
  import json
5
  import re
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  # Initialize clients
8
  anthropic_client = anthropic.Anthropic()
9
  openai_client = OpenAI()
10
  together_client = Together()
 
 
 
 
11
 
12
+ # Prompts used to generate random example conversations
13
+
14
+ EXAMPLE_GENERATION_PROMPT_SYSTEM = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes."""
15
+ EXAMPLE_GENERATION_PROMPT_USER = """Please provide a random human message and an appropriate AI response in the format of an academic benchmark dataset e.g.,. User: "Hi, I'm trying to solve a crossword puzzle, but I've never done one of these before. Can you help me out?" / AI Response: "Absolutely! I'd be delighted to help you with your crossword puzzle. Just tell me the clues and the number of letters needed for each answer (and any letters you may have already filled in), and I'll do my best to help you find the solutions. If you have any specific questions about how to approach solving crossword puzzles in general, feel free to ask those as well!". Format the output as JSON:\n\n{\"human\": \"<human message>\", \"ai\": \"<AI assistant response>\"}"""
16
+
17
+ def get_random_human_ai_pair():
18
+ # Use GPT-3.5 to generate a random conversation
19
+ completion = openai_client.chat.completions.create(
20
+ model="gpt-3.5-turbo",
21
+ messages=[
22
+ {"role": "system", "content": EXAMPLE_GENERATION_PROMPT_SYSTEM},
23
+ {"role": "user", "content": EXAMPLE_GENERATION_PROMPT_USER},
24
+ ],
25
+ max_completion_tokens=300,
26
+ temperature=1,
27
+ )
28
+
29
+ # Parse the response to get the human input and AI response
30
+ raw_response = completion.choices[0].message.content.strip()
31
+
32
+ try:
33
+ data = json.loads(raw_response)
34
+ human_message = data.get("human", "Hello, how are you?")
35
+ ai_message = data.get("ai", "I'm doing well, thank you!")
36
+ except json.JSONDecodeError:
37
+ # If parsing fails, set default messages
38
+ human_message = "Hello, how are you?"
39
+ ai_message = "I'm doing well, thank you!"
40
+
41
+ return human_message, ai_message
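As a quick illustration, this is how the helper above might be used to pre-fill the sample conversation fields; the variable names are illustrative, and the defaults come from the JSON-parsing fallback shown in the function.

```python
# Illustrative usage of get_random_human_ai_pair (defined above).
# Falls back to the hard-coded defaults if GPT-3.5 returns invalid JSON.
human_message, ai_message = get_random_human_ai_pair()
print("Human:", human_message)
print("AI:", ai_message)
```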
42
+
43
+ SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
44
 
45
+
46
+ def get_openai_response(model_name, prompt):
47
  """Get response from OpenAI API"""
48
  try:
49
  response = openai_client.chat.completions.create(
50
  model=model_name,
51
  messages=[
52
+ {"role": "system", "content": SYSTEM_PROMPT},
53
  {"role": "user", "content": prompt},
54
  ],
 
 
55
  )
56
  return response.choices[0].message.content
57
  except Exception as e:
58
  return f"Error with OpenAI model {model_name}: {str(e)}"
59
 
60
+
61
+ def get_anthropic_response(model_name, prompt):
62
  """Get response from Anthropic API"""
63
  try:
64
  response = anthropic_client.messages.create(
65
  model=model_name,
66
+ max_tokens=1000,
67
+ temperature=0,
68
+ system=SYSTEM_PROMPT,
69
  messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
70
  )
71
  return response.content[0].text
72
  except Exception as e:
73
  return f"Error with Anthropic model {model_name}: {str(e)}"
74
 
75
+
76
+ def get_together_response(model_name, prompt):
77
  """Get response from Together API"""
78
  try:
79
  response = together_client.chat.completions.create(
80
  model=model_name,
81
  messages=[
82
+ {"role": "system", "content": SYSTEM_PROMPT},
83
  {"role": "user", "content": prompt},
84
  ],
 
 
85
  stream=False,
86
  )
87
  return response.choices[0].message.content
88
  except Exception as e:
89
  return f"Error with Together model {model_name}: {str(e)}"
90
 
91
 
92
+ def get_model_response(model_name, model_info, prompt):
 
93
  """Get response from appropriate API based on model organization"""
94
  if not model_info:
95
  return "Model not found or unsupported."
 
97
  api_model = model_info["api_model"]
98
  organization = model_info["organization"]
99
 
100
  try:
101
  if organization == "OpenAI":
102
+ return get_openai_response(api_model, prompt)
 
 
103
  elif organization == "Anthropic":
104
+ return get_anthropic_response(api_model, prompt)
 
105
  else:
106
  # All other organizations use Together API
107
+ return get_together_response(api_model, prompt)
 
 
108
  except Exception as e:
109
  return f"Error with {organization} model {model_name}: {str(e)}"
110
 
111
+
112
  def parse_model_response(response):
113
  try:
114
  # Debug print
115
  print(f"Raw model response: {response}")
116
 
 
 
 
 
 
 
 
 
117
  # First try to parse the entire response as JSON
118
  try:
119
  data = json.loads(response)
120
  return str(data.get("result", "N/A")), data.get("feedback", "N/A")
121
  except json.JSONDecodeError:
122
+ # If that fails (typically for smaller models), try to find JSON within the response
123
+ json_match = re.search(r"{.*}", response)
 
 
 
 
 
124
  if json_match:
125
  data = json.loads(json_match.group(0))
126
  return str(data.get("result", "N/A")), data.get("feedback", "N/A")
127
  else:
128
+ return "Error", f"Failed to parse response: {response}"
129
 
130
  except Exception as e:
131
  # Debug print for error case
132
  print(f"Failed to parse response: {str(e)}")
 
 
 
 
 
 
 
 
 
 
133
  return "Error", f"Failed to parse response: {response}"
 
leaderboard.py DELETED
@@ -1,116 +0,0 @@
1
- from collections import defaultdict
2
- from datetime import datetime, timezone
3
- from typing import Dict, List
4
-
5
- # Constants
6
- DEFAULT_ELO = 1200 # Starting ELO for new models
7
- K_FACTOR = 32 # Standard chess K-factor
8
-
9
- def get_leaderboard(model_data: Dict, voting_data: List, show_preliminary=True):
10
- """Generate leaderboard data using votes from MongoDB."""
11
- # Initialize dictionaries for tracking
12
- ratings = defaultdict(lambda: DEFAULT_ELO)
13
- matches = defaultdict(int)
14
-
15
- # Process each vote
16
- for vote in voting_data:
17
- try:
18
- model_a = vote.get("model_a")
19
- model_b = vote.get("model_b")
20
- winner = vote.get("winner")
21
-
22
- # Skip if models aren't in current model_data
23
- if (
24
- not all([model_a, model_b, winner])
25
- or model_a not in model_data
26
- or model_b not in model_data
27
- ):
28
- continue
29
-
30
- # Update match counts
31
- matches[model_a] += 1
32
- matches[model_b] += 1
33
-
34
- # Calculate ELO changes
35
- elo_a = ratings[model_a]
36
- elo_b = ratings[model_b]
37
-
38
- # Expected scores
39
- expected_a = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
40
- expected_b = 1 - expected_a
41
-
42
- # Actual scores
43
- score_a = 1 if winner == "A" else 0 if winner == "B" else 0.5
44
- score_b = 1 - score_a
45
-
46
- # Update ratings
47
- ratings[model_a] += K_FACTOR * (score_a - expected_a)
48
- ratings[model_b] += K_FACTOR * (score_b - expected_b)
49
-
50
- except Exception as e:
51
- print(f"Error processing vote: {e}")
52
- continue
53
-
54
- # Generate leaderboard data
55
- leaderboard = []
56
- for model in model_data.keys():
57
- votes = matches[model]
58
- # Skip models with < 300 votes if show_preliminary is False
59
- if not show_preliminary and votes < 300:
60
- continue
61
-
62
- elo = ratings[model]
63
- ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
64
- data = {
65
- "Model": model,
66
- "ELO Score": f"{int(elo)}",
67
- "95% CI": f"±{int(ci)}",
68
- "# Votes": votes,
69
- "Organization": model_data[model]["organization"],
70
- "License": model_data[model]["license"],
71
- }
72
- leaderboard.append(data)
73
-
74
- # Sort leaderboard by ELO score in descending order
75
- leaderboard.sort(key=lambda x: float(x["ELO Score"]), reverse=True)
76
-
77
- return leaderboard
78
-
79
- def get_leaderboard_stats(model_data: Dict, voting_data: List) -> str:
80
- """Get summary statistics for the leaderboard."""
81
- now = datetime.now(timezone.utc)
82
- total_votes = len(voting_data)
83
- total_models = len(model_data)
84
- # last_updated = now.strftime("%B %d, %Y at %H:%M:%S UTC")
85
-
86
- last_updated = now.replace(minute=0, second=0, microsecond=0).strftime(
87
- "%B %d, %Y at %H:00 UTC"
88
- )
89
-
90
- return f"""
91
- ### Leaderboard Stats
92
- - **Total Models**: {total_models}
93
- - **Total Votes**: {total_votes}
94
- - **Last Updated**: {last_updated}
95
- """
96
-
97
- def calculate_elo_change(rating_a: float, rating_b: float, winner: str) -> tuple[float, float]:
98
- """Calculate ELO rating changes for both players."""
99
- expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
100
- expected_b = 1 - expected_a
101
-
102
- if winner == "A":
103
- score_a, score_b = 1, 0
104
- elif winner == "B":
105
- score_a, score_b = 0, 1
106
- else: # Handle ties
107
- score_a, score_b = 0.5, 0.5
108
-
109
- change_a = K_FACTOR * (score_a - expected_a)
110
- change_b = K_FACTOR * (score_b - expected_b)
111
-
112
- return change_a, change_b
113
-
114
- def get_model_rankings(leaderboard: List[Dict]) -> Dict[str, int]:
115
- """Get current rankings of all models from leaderboard data."""
116
- return {entry["Model"]: idx + 1 for idx, entry in enumerate(leaderboard)}
 
prompts.py DELETED
@@ -1,210 +0,0 @@
1
- # Default values for compatible mode
2
- DEFAULT_EVAL_CRITERIA = """Does the model provide relevant and useful responses to the user's needs or questions?"""
3
-
4
- DEFAULT_SCORE_1 = "The model's responses are irrelevant or unhelpful to the user's needs or queries."
5
- DEFAULT_SCORE_2 = "The model sometimes provides helpful information, but often fails to address the user's actual needs or questions."
6
- DEFAULT_SCORE_3 = "The model generally provides helpful responses that address the user's needs, though it may occasionally miss the mark."
7
- DEFAULT_SCORE_4 = "The model regularly provides helpful responses that are well-aligned with the user's inquiries, with only rare inaccuracies."
8
- DEFAULT_SCORE_5 = "The model consistently offers highly relevant and useful responses that perfectly cater to the user's needs and inquiries."
9
-
10
- # Default Eval Prompt
11
- DEFAULT_EVAL_PROMPT = """Does the model provide relevant and useful responses to the user's needs or questions?
12
-
13
- Scoring Rubric:
14
- Score 1: The model's responses are irrelevant or unhelpful to the user's needs or queries.
15
- Score 2: The model sometimes provides helpful information, but often fails to address the user's actual needs or questions.
16
- Score 3: The model generally provides helpful responses that address the user's needs, though it may occasionally miss the mark.
17
- Score 4: The model regularly provides helpful responses that are well-aligned with the user's inquiries, with only rare inaccuracies.
18
- Score 5: The model consistently offers highly relevant and useful responses that perfectly cater to the user's needs and inquiries.
19
-
20
- [User Query]: {{input}}
21
-
22
- [AI Response]: {{response}}"""
23
-
24
- # Split the eval prompt into editable and fixed parts
25
- DEFAULT_EVAL_PROMPT_EDITABLE = """Does the model provide relevant and useful responses to the user's needs or questions?
26
-
27
- Scoring Rubric:
28
- Score 1: The model's responses are irrelevant or unhelpful to the user's needs or queries.
29
- Score 2: The model sometimes provides helpful information, but often fails to address the user's actual needs or questions.
30
- Score 3: The model generally provides helpful responses that address the user's needs, though it may occasionally miss the mark.
31
- Score 4: The model regularly provides helpful responses that are well-aligned with the user's inquiries, with only rare inaccuracies.
32
- Score 5: The model consistently offers highly relevant and useful responses that perfectly cater to the user's needs and inquiries."""
33
-
34
- # Fixed suffix that will always be appended
35
- FIXED_EVAL_SUFFIX = """
36
- [User Query]: {{human_input}}
37
-
38
- [AI Response]: {{ai_response}}"""
39
-
40
- # Define the Prometheus prompt used by default (without reference)
41
- PROMETHEUS_PROMPT = """###Task Description:
42
- An instruction (might include an Input inside it) and a response to evaluate are given.
43
- 1. Write a detailed feedback that assesses the quality of the response strictly based on the given score rubric, not evaluating in general.
44
- 2. After writing the feedback, write a score that is an integer between 1 and 5.
45
- 3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
46
- 4. Please do not generate any other openings, closings, or explanations.
47
-
48
- ###The instruction to evaluate:
49
- {human_input}
50
-
51
- ###Response to evaluate:
52
- {ai_response}
53
-
54
- ###Score Rubrics:
55
- [{eval_criteria}]
56
- Score 1: {score1_desc}
57
- Score 2: {score2_desc}
58
- Score 3: {score3_desc}
59
- Score 4: {score4_desc}
60
- Score 5: {score5_desc}
61
-
62
- ###Feedback:
63
- """
64
-
65
- # Define the Prometheus prompt with reference response
66
- PROMETHEUS_PROMPT_WITH_REFERENCE = """###Task Description:
67
- An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing an evaluation criteria are given.
68
- 1. Write a detailed feedback that assesses the quality of the response strictly based on the given score rubric, not evaluating in general.
69
- 2. After writing the feedback, write a score that is an integer between 1 and 5.
70
- 3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
71
- 4. Please do not generate any other openings, closings, or explanations.
72
-
73
- ###The instruction to evaluate:
74
- {human_input}
75
-
76
- ###Response to evaluate:
77
- {ai_response}
78
-
79
- ###Reference Answer (Score 5):
80
- {ground_truth_input}
81
-
82
- ###Score Rubrics:
83
- [{eval_criteria}]
84
- Score 1: {score1_desc}
85
- Score 2: {score2_desc}
86
- Score 3: {score3_desc}
87
- Score 4: {score4_desc}
88
- Score 5: {score5_desc}
89
-
90
- ###Feedback:
91
- """
92
-
93
- # Judge system prompt for non-Prometheus models
94
- JUDGE_SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
95
-
96
- ATLA_PROMPT = """You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric that serve as the evaluation standard. Provide a comprehensive feedback on the response quality strictly adhering to the scoring rubric, without any general evaluation. Follow this with a score between 1 and 5, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.
97
- Here are some rules of the evaluation:
98
- (1) You should prioritize evaluating whether the response satisfies the provided rubric. The basis of your score should depend exactly on the rubric. However, the response does not need to explicitly address points raised in the rubric. Rather, evaluate the response based on the criteria outlined in the rubric.
99
-
100
- Your reply should strictly follow this format:
101
- **Reasoning:** <Your feedback>
102
-
103
- **Result:** <an integer between 1 and 5>
104
-
105
- Here is the data:
106
-
107
- Instruction:
108
- ```
109
- {human_input}
110
- ```
111
-
112
- Response:
113
- ```
114
- {ai_response}
115
- ```
116
-
117
- Score Rubrics:
118
- [{eval_criteria}]
119
- Score 1: {score1_desc}
120
- Score 2: {score2_desc}
121
- Score 3: {score3_desc}
122
- Score 4: {score4_desc}
123
- Score 5: {score5_desc}"""
124
-
125
- ATLA_PROMPT_WITH_REFERENCE = """You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric and reference answer that serve as the evaluation standard. Provide a comprehensive feedback on the response quality strictly adhering to the scoring rubric, without any general evaluation. Follow this with a score between 1 and 5, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.
126
-
127
- Here are some rules of the evaluation:
128
- (1) You should prioritize evaluating whether the response satisfies the provided rubric. The basis of your score should depend exactly on the rubric. However, the response does not need to explicitly address points raised in the rubric. Rather, evaluate the response based on the criteria outlined in the rubric.
129
-
130
- Your reply should strictly follow this format:
131
- **Reasoning:** <Your feedback>
132
-
133
- **Result:** <an integer between 1 and 5>
134
-
135
- Here is the data:
136
-
137
- Instruction:
138
- ```
139
- {human_input}
140
- ```
141
-
142
- Response:
143
- ```
144
- {ai_response}
145
- ```
146
-
147
- Score Rubrics:
148
- [{eval_criteria}]
149
- Score 1: {score1_desc}
150
- Score 2: {score2_desc}
151
- Score 3: {score3_desc}
152
- Score 4: {score4_desc}
153
- Score 5: {score5_desc}
154
-
155
- Reference answer:
156
- {ground_truth_input}"""
157
-
158
- # Define the Flow Judge prompt
159
- FLOW_JUDGE_PROMPT = """# GOAL
160
- Your job is to evaluate a task carried out by an AI system powered by a large \
161
- language model.
162
-
163
- You will be provided with the inputs and output of the task, as well as the evaluation criteria \
164
- and scoring rubric. Your task is to evaluate the output of the AI system based on the evaluation \
165
- criteria and scoring rubric provided.
166
-
167
- # INPUT
168
- Below are the inputs required for performing the task:
169
- <inputs>
170
- {INPUTS}
171
- </inputs>
172
-
173
- # OUTPUT
174
- Below is the output of the task:
175
- <output>
176
- {OUTPUT}
177
- </output>
178
-
179
- # EVALUATION CRITERIA AND SCORING RUBRIC
180
- Here are the evaluation criteria and the rubric that you need to use for evaluating the task:
181
- <evaluation_criteria>
182
- {EVALUATION_CRITERIA}
183
- </evaluation_criteria>
184
-
185
- <scoring_rubric>
186
- {RUBRIC}
187
- </scoring_rubric>
188
-
189
- # INSTRUCTIONS FOR THE EVALUATION
190
- 1. Understand the task and criteria: Familiarize yourself with the task to be evaluated. \
191
- Review the evaluation criteria and scoring rubric to understand the different levels of \
192
- performance and the descriptions for each score.
193
- 2. Review the inputs and output: Look at the inputs provided for the task. Examine the output \
194
- generated from completing the task.
195
- 3. Compare output to score descriptions: Compare the output against the criteria and score \
196
- descriptions in the scoring rubric. For each criterion,decide which description best matches the \
197
- output.
198
- 4. After comparing the output to the score descriptions, pay attention to the small details that \
199
- might impact the final score that you assign. Sometimes a small difference can dictate the final \
200
- score.
201
- 5. Write verbal feedback justifying your evaluation that includes a detailed rationale, referring \
202
- to specific aspects of the output and comparing them to the rubric.
203
- 6. Assign a final score based on the scoring rubric.
204
-
205
- ## FORMAT FOR THE EVALUATION
206
- - Write the verbal feedback inside <feedback> tags without any additional surrounding text.
207
- - Write the numeric score inside <score> tags, without any additional surrounding text and always \
208
- after the feedback.
209
-
210
- Please accurately evaluate the task. Strictly adhere to the evaluation criteria and rubric."""
 
random_sample_generation.py DELETED
@@ -1,183 +0,0 @@
- from openai import OpenAI
- import anthropic
- import json
- import re
- import random
- import os
- from gen_api_answer import get_openai_response, get_anthropic_response
-
- # Initialize clients
- anthropic_client = anthropic.Anthropic()
- openai_client = OpenAI()
-
- GOOD_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The AI response generated should be a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
- BAD_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The response should contain incorrect information, logical fallacies, or misleading explanations. It should sound plausible but be fundamentally wrong. The AI response generated should be a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
- AMBIGUOUS_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The response should mix correct and incorrect information - it should contain some accurate points but also include nuanced, questionable claims or exaggerations. The AI response generated should be a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
-
- GOOD_SYSTEM_PROMPT_WITH_GROUND_TRUTH = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes, along with an ideal reference answer. The AI response generated should be a few sentences long and contain accurate information. The ground truth response should be a perfect, comprehensive answer that would score 5/5. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>", "ground_truth": "<perfect reference answer>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
- BAD_SYSTEM_PROMPT_WITH_GROUND_TRUTH = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes, along with an ideal reference answer. The AI response should be a few sentences long and contain incorrect information, logical fallacies, or misleading explanations. It should sound plausible but be fundamentally wrong. The ground truth response should be a perfect, comprehensive answer that would score 5/5. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>", "ground_truth": "<perfect reference answer>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
- AMBIGUOUS_SYSTEM_PROMPT_WITH_GROUND_TRUTH = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes, along with an ideal reference answer. The AI response should be a few sentences long and mix correct and incorrect information - it should contain some accurate points but also include nuanced, questionable claims or exaggerations. The ground truth response should be a perfect, comprehensive answer that would score 5/5. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>", "ground_truth": "<perfect reference answer>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
-
- GENERATION_PROMPT = """Please generate a random human message and an AI response in the format of a QA dataset. The human input should not be a one-word answer question like "What is the capital of France?". The AI response generated should be a few sentences long."""
- GENERATION_PROMPT_WITH_GROUND_TRUTH = """Please generate:
- 1. A random human message (not a simple one-word answer question)
- 2. An AI response (a few sentences long)
- 3. A perfect reference answer that would score 5/5 on all criteria (e.g., concise, helpful, and accurate)
-
- Format as JSON with "human", "ai", and "ground_truth" fields."""
-
- RESPONSE_GENERATION_SYSTEM_PROMPT = "You are an assistant that generates random responses to human messages for testing purposes. Generate bad responses (with a mix of correct and incorrect information) 60% of the time and good responses 40% of the time. Do not say which type of response you are generating, just generate the response."
-
- def get_random_human_ai_pair():
-     # Select system prompt with specified probabilities
-     system_prompt = random.choices(
-         [GOOD_SYSTEM_PROMPT, BAD_SYSTEM_PROMPT, AMBIGUOUS_SYSTEM_PROMPT],
-         weights=[0.2, 0.2, 0.6]  # 20% good, 20% bad, 60% ambiguous
-     )[0]
-
-     # Log which type of response is being generated
-     prompt_type = {
-         GOOD_SYSTEM_PROMPT: "good",
-         BAD_SYSTEM_PROMPT: "bad",
-         AMBIGUOUS_SYSTEM_PROMPT: "ambiguous"
-     }[system_prompt]
-     print(f"Generating {prompt_type} response")
-
-     # Randomly choose between GPT-3.5 and Claude with 50/50 weights
-     model_choice = random.choices([
-         ("gpt-3.5-turbo", get_openai_response),
-         ("claude-3-5-haiku-latest", get_anthropic_response)
-     ], weights=[0.5, 0.5])[0]
-     model_name, api_func = model_choice
-
-     # Generate response using selected model
-     response = api_func(
-         model_name=model_name,
-         prompt=GENERATION_PROMPT,
-         system_prompt=system_prompt,
-         max_tokens=500,
-         temperature=1
-     )
-
-     # Define default messages
-     default_human = "How do muscles grow?"
-     default_ai = """Muscles grow through a process called skeletal muscle hypertrophy, which adds more myosin filaments to each muscle fiber, making the engine of the cell bigger and stronger over time. This is achieved through increased muscle tension and physical stress, breaking down muscle fiber. Muscle growth is also a direct consequence of resistance training and nutrition. People build muscle at different rates depending on their age, sex, and genetics, but muscle development significantly increases if exercise is done correctly and the body stores more protein through a process called protein synthesis."""
-
-     try:
-         # Clean the response by replacing newlines with spaces
-         cleaned_response = response.replace('\n', ' ').replace('\r', '')
-         data = json.loads(cleaned_response)
-
-         # Extract messages with fallbacks
-         human_message = data.get("human", default_human)
-         ai_message = data.get("ai", default_ai)
-
-         # Debug logging
-         print(f"Parsed response: human='{human_message}', ai='{ai_message[:50]}...'")
-
-     except Exception as e:
-         print(f"Failed to parse response: {str(e)}\n {response}")
-         human_message = default_human
-         ai_message = default_ai
-
-     return human_message, ai_message
-
- def get_random_human_ai_ground_truth_pair():
-     # Select system prompt with specified probabilities
-     system_prompts = {
-         "good": GOOD_SYSTEM_PROMPT_WITH_GROUND_TRUTH,
-         "bad": BAD_SYSTEM_PROMPT_WITH_GROUND_TRUTH,
-         "ambiguous": AMBIGUOUS_SYSTEM_PROMPT_WITH_GROUND_TRUTH
-     }
-
-     prompt_type = random.choices(
-         ["good", "bad", "ambiguous"],
-         weights=[0.2, 0.2, 0.6]  # 20% good, 20% bad, 60% ambiguous
-     )[0]
-
-     system_prompt = system_prompts[prompt_type]
-     print(f"Generating {prompt_type} response with ground truth")
-
-     # Randomly choose between GPT-3.5 and Claude with 50/50 weights
-     model_choice = random.choices([
-         ("gpt-3.5-turbo", get_openai_response),
-         ("claude-3-5-haiku-latest", get_anthropic_response)
-     ], weights=[0.5, 0.5])[0]
-     model_name, api_func = model_choice
-
-     # Define default messages
-     defaults = {
-         "human": "How do muscles grow?",
-         "ai": """Muscles grow through a process called skeletal muscle hypertrophy, which adds more myosin filaments to each muscle fiber, making the engine of the cell bigger and stronger over time. This is achieved through increased muscle tension and physical stress, breaking down muscle fiber. Muscle growth is also a direct consequence of resistance training and nutrition. People build muscle at different rates depending on their age, sex, and genetics, but muscle development significantly increases if exercise is done correctly and the body stores more protein through a process called protein synthesis.""",
-         "ground_truth": """Muscle growth (hypertrophy) occurs through a complex biological process involving several key mechanisms:
-
- 1. Mechanical Tension: Resistance training creates mechanical tension in muscle fibers, triggering molecular and cellular responses that promote growth.
-
- 2. Metabolic Stress: The depletion of energy resources and accumulation of metabolic byproducts during exercise contributes to muscle growth signaling.
-
- 3. Muscle Damage: Exercise-induced micro-damage to muscle fibers activates satellite cells, which help repair and build new muscle tissue.
-
- 4. Protein Synthesis: After exercise, increased protein synthesis rates exceed protein breakdown, leading to net muscle protein accretion.
-
- 5. Hormonal Response: Exercise triggers the release of growth-promoting hormones like testosterone, growth hormone, and IGF-1.
-
- 6. Recovery: Adequate rest between training sessions allows for repair and growth, supported by proper nutrition, particularly protein intake (1.6-2.2g/kg/day).
-
- This process is influenced by factors including genetics, age, sex, nutrition, sleep quality, and training variables. Optimal muscle growth requires a structured resistance training program, adequate protein intake, sufficient calories, and proper recovery."""
-     }
-
-     # Generate response using selected model
-     response = api_func(
-         model_name=model_name,
-         prompt=GENERATION_PROMPT_WITH_GROUND_TRUTH,
-         system_prompt=system_prompt,
-         max_tokens=1000,  # Increased token limit to accommodate ground truth
-         temperature=1
-     )
-
-     # Parse the response to get all three components
-     try:
-         # Clean the response by replacing newlines with spaces
-         cleaned_response = response.replace('\n', ' ').replace('\r', '')
-         data = json.loads(cleaned_response)
-
-         # Extract messages with fallbacks
-         human_message = data.get("human", defaults["human"])
-         ai_message = data.get("ai", defaults["ai"])
-         ground_truth = data.get("ground_truth", defaults["ground_truth"])
-
-         # Debug logging
-         print(f"Parsed response: human='{human_message}', ai='{ai_message[:50]}...', ground_truth='{ground_truth[:50]}...'")
-
-     except Exception as e:
-         print(f"Failed to parse response: {str(e)}\n {response}")
-         human_message = defaults["human"]
-         ai_message = defaults["ai"]
-         ground_truth = defaults["ground_truth"]
-
-     return human_message, ai_message, ground_truth
-
- def generate_ai_response(human_msg):
-     """Generate AI response using GPT-3.5-turbo"""
-     if not human_msg.strip():
-         return "", False
-
-     try:
-         response = get_openai_response(
-             "gpt-3.5-turbo",
-             human_msg,
-             system_prompt=RESPONSE_GENERATION_SYSTEM_PROMPT,
-             max_tokens=1000,
-             temperature=1
-         )
-         # Extract just the response content since we don't need JSON format here
-         if isinstance(response, str):
-             # Clean up any JSON formatting if present
-             try:
-                 data = json.loads(response)
-                 response = data.get("content", response)
-             except json.JSONDecodeError:
-                 pass
-         return response, False  # Return response and button interactive state
-     except Exception as e:
-         return f"Error generating response: {str(e)}", False
 
requirements.txt CHANGED
@@ -1,9 +1,6 @@
- atla
  pymongo
  gradio
  python-dotenv
  openai
  anthropic
  together
- cohere
- transformers