from datasets import load_dataset, Dataset
from functools import lru_cache
from typing import Tuple
import ast
import gradio as gr
import json

from env import MODELS, TASK, ORG_NAME


def aggregate_results() -> list:
    """From the output path and model list, extracts the current scores and
    stores them in a list of dicts with model, score, and time as keys.
    """
    all_results = []
    for org_model in MODELS:
        try:
            path = f"{ORG_NAME}/details_{org_model.replace('/', '__')}_private"
            ds = load_dataset(path, "results", split="latest")
            config = json.loads(ds["config_general"][0])
            results = json.loads(ds["results"][0])

            # Model data
            org, model = org_model.split("/")

            cur_result = {
                "Org": org,
                "Model": model,
                "Duration (s)": config["end_time"] - config["start_time"],
            }

            # Extract the per-task metrics from the JSON data
            for k_metric, v_dict in results.items():
                if k_metric != "all":
                    for k, v in v_dict.items():
                        cur_result[f"{k}({k_metric})"] = v
            all_results.append(cur_result)
        except Exception as e:
            print(f"Error processing {org_model} {ORG_NAME}: {e}")
    return all_results


def extract_dataviz() -> Tuple[list, list, list]:
    """From the output path and model list, extracts from the details the
    easiest samples (found by every model), the hardest samples (failed by
    every model), and the full set of samples.
    """
    all_samples = {}
    for org_model in MODELS:
        try:
            path = f"{ORG_NAME}/details_{org_model.replace('/', '__')}_private"
            ds = load_dataset(path, f"custom_{TASK.replace('/', '_')}_0", split="latest")

            for ix, row in enumerate(ds):
                prompt = row["full_prompt"]
                gold = row["gold"]
                score = list(row["metrics"].values())[0]
                prediction = row["predictions"][0]

                # We store flattened samples in a dict
                # ix -> ix, prompt, gold, model_score for each model, model_prediction for each model
                # then 2 lists: model_scores and models, to aggregate more easily
                if ix not in all_samples:
                    all_samples[ix] = {
                        "ix": ix,
                        "prompt": prompt,
                        "gold": gold[0] if isinstance(gold, list) else gold,
                        # A bit redundant, but put in their own boxes for simplicity of access later
                        "model_scores": [],
                        "models": [],
                    }
                if org_model not in all_samples[ix]["models"]:
                    all_samples[ix][f"{org_model}_score"] = row["metrics"]
                    all_samples[ix][f"{org_model}_prediction"] = prediction
                    all_samples[ix]["model_scores"].append(score)
                    all_samples[ix]["models"].append(org_model)
        except Exception as e:
            print(f"Error processing {org_model}: {e}")

    full_samples = sorted(all_samples.values(), key=lambda r: r["ix"])
    hard_samples = sorted(
        [sample for sample in all_samples.values() if sum(sample["model_scores"]) == 0],
        key=lambda r: r["ix"],
    )
    easy_samples = sorted(
        [sample for sample in all_samples.values() if sum(sample["model_scores"]) == len(sample["model_scores"])],
        key=lambda r: r["ix"],
    )
    return easy_samples, hard_samples, full_samples


def samples_to_box_display(samples: list, example_index: int = 0):
    """Adapted from Nathan's code in https://huggingface.co/spaces/SaylorTwift/OpenEvalsModelDetails/"""
    if len(samples) == 0:
        return "No samples in this category!"

    outputs = []
    sample = samples[example_index]
    for model in sample["models"]:
        try:
            outputs.append({
                'Model': model,
                'Prediction': sample[f'{model}_prediction'],
                'Prompt': sample['prompt'],
                'Metrics': sample[f'{model}_score'],
                'Gold': sample['gold'],
            })
        except (KeyError, IndexError):
            continue

    if not outputs:
        return "No results found for the selected combination."

    # Create HTML output with all models
    # (inline styles throughout are kept deliberately simple; tweak them to match the Space theme)
    html_output = "<div style='max-width: 800px; margin: 0 auto;'>\n\n"
\n\n" # Show gold answer at the top with distinct styling if outputs: html_output += "
\n" html_output += "

Ground Truth

\n" html_output += "
\n" html_output += f"
{outputs[0]['Gold']}
\n" html_output += "
\n" html_output += "
\n" for output in outputs: html_output += "
\n" html_output += f"

{output['Model']}

\n" # Format metrics as a clean table html_output += "
\n" html_output += "

Metrics

\n" metrics = output['Metrics'] if isinstance(metrics, str): metrics = eval(metrics) html_output += "
\n" html_output += "\n" for key, value in metrics.items(): if isinstance(value, float): value = f"{value:.3f}" html_output += f"\n" html_output += "
{key}{value}
\n" html_output += "
\n" html_output += "
\n\n" # Handle prompt formatting with better styling html_output += "
\n" html_output += "

Prompt

\n" html_output += "
\n" prompt_text = output['Prompt'] if isinstance(prompt_text, list): for i, msg in enumerate(prompt_text): if isinstance(msg, dict) and 'content' in msg: role = msg.get('role', 'message').title() html_output += "
\n" html_output += f"{role}:\n" html_output += "
\n" html_output += f"
{msg['content']}
\n" html_output += "
\n" html_output += "
\n" else: html_output += "
\n" html_output += "
\n" html_output += f"
{json.dumps(msg, indent=2)}
\n" html_output += "
\n" html_output += "
\n" else: html_output += "
\n" if isinstance(prompt_text, dict) and 'content' in prompt_text: html_output += f"
{prompt_text['content']}
\n" else: html_output += f"
{prompt_text}
\n" html_output += "
\n" html_output += "
\n" html_output += "
\n\n" # Style prediction output - now in a collapsible section html_output += "
\n" html_output += "

Prediction

" # Add word count in a muted style word_count = len(output['Prediction'].split()) html_output += f"({word_count} words)" html_output += "
\n" html_output += "
\n" html_output += "
\n" html_output += f"
{output['Prediction']}
\n" html_output += "
\n" html_output += "
\n" html_output += "
\n" html_output += "
\n\n" html_output += "
" return html_output def run_pipeline(samples_ix: int = 0): results = aggregate_results() best_samples, worst_samples, all_samples = extract_dataviz() return gr.Dataframe(Dataset.from_list(results).to_pandas(), visible=True), \ gr.HTML(samples_to_box_display(best_samples, samples_ix), label="Easiest samples (always found)", visible=True), \ gr.HTML(samples_to_box_display(worst_samples, samples_ix), label="Hardest samples (always failed)", visible=True), \ gr.HTML(samples_to_box_display(all_samples, samples_ix), label="All samples", visible=True) def update_examples(samples_ix: int = 0): best_samples, worst_samples, all_samples = extract_dataviz() return samples_to_box_display(best_samples, samples_ix), \ samples_to_box_display(worst_samples, samples_ix), \ samples_to_box_display(all_samples, samples_ix)