from datasets import load_dataset, Dataset
from functools import lru_cache
from typing import Tuple

import gradio as gr
import json

from env import MODELS, TASK, ORG_NAME


def aggregate_results() -> list:
    """From the output path and model list, extracts the current scores and stores
    them in a list of dicts with org, model, duration and one key per metric.
    """
    all_results = []
    for org_model in MODELS:
        try:
            path = f"{ORG_NAME}/details_{org_model.replace('/', '__')}_private"
            ds = load_dataset(path, "results", split="latest")
            config = json.loads(ds["config_general"][0])
            results = json.loads(ds["results"][0])

            # Model data
            org, model = org_model.split("/")
            cur_result = {
                "Org": org,
                "Model": model,
                "Duration (s)": config["end_time"] - config["start_time"],
            }

            # Extract the per-task metrics from the JSON data
            for k_metric, v_dict in results.items():
                if k_metric != "all":
                    for k, v in v_dict.items():
                        cur_result[f"{k}({k_metric})"] = v
            all_results.append(cur_result)
        except Exception as e:
            print(f"Error processing {org_model} {ORG_NAME}: {e}")
    return all_results


def extract_dataviz() -> Tuple[list, list, list]:
    """From the output path and model list, extracts from the details datasets the
    easiest samples (solved by every model), the hardest samples (solved by none),
    and the full sample list.
    """
    all_samples = {}
    for org_model in MODELS:
        try:
            path = f"{ORG_NAME}/details_{org_model.replace('/', '__')}_private"
            ds = load_dataset(path, f"custom_{TASK.replace('/', '_')}_0", split="latest")

            for ix, row in enumerate(ds):
                prompt = row["full_prompt"]
                gold = row["gold"]
                score = list(row["metrics"].values())[0]
                prediction = row["predictions"][0]

                # We store flattened samples in a dict:
                # ix -> ix, prompt, gold, model_score for each model, model_prediction for each model,
                # then 2 lists, model_scores and models, to aggregate more easily
                if ix not in all_samples:
                    all_samples[ix] = {
                        "ix": ix,
                        "prompt": prompt,
                        "gold": gold[0] if isinstance(gold, list) else gold,
                        # A bit redundant, but put in their own boxes for simplicity of access later
                        "model_scores": [],
                        "models": [],
                    }
                if org_model not in all_samples[ix]["models"]:
                    all_samples[ix][f"{org_model}_score"] = row["metrics"]
                    all_samples[ix][f"{org_model}_prediction"] = prediction
                    all_samples[ix]["model_scores"].append(score)
                    all_samples[ix]["models"].append(org_model)
        except Exception as e:
            print(f"Error processing {org_model}: {e}")

    full_samples = sorted(all_samples.values(), key=lambda r: r["ix"])
    # Hard samples: no model scored a point; easy samples: every model scored a point
    hard_samples = sorted(
        [sample for sample in all_samples.values() if sum(sample["model_scores"]) == 0],
        key=lambda r: r["ix"],
    )
    easy_samples = sorted(
        [sample for sample in all_samples.values() if sum(sample["model_scores"]) == len(sample["model_scores"])],
        key=lambda r: r["ix"],
    )
    return easy_samples, hard_samples, full_samples


def samples_to_box_display(samples: list, example_index: int = 0):
    """Adapted from Nathan's code in https://huggingface.co/spaces/SaylorTwift/OpenEvalsModelDetails/
    Displays the prompt, gold answer, and each model's prediction and metrics for one sample.
    """
    if len(samples) == 0:
        return "No samples in this category!"
    outputs = []
    sample = samples[example_index]
    for model in sample["models"]:
        try:
            outputs.append({
                'Model': model,
                'Prediction': sample[f'{model}_prediction'],
                'Prompt': sample['prompt'],
                'Metrics': sample[f'{model}_score'],
                'Gold': sample['gold'],
            })
        except (KeyError, IndexError):
            continue

    if not outputs:
        return "No results found for the selected combination."

    # Create HTML output with all models
    html_output = "<div style='max-width: 800px; margin: 0 auto;'>\n"

    # Gold answer shown once at the top
    if outputs[0]['Gold']:
        html_output += "<h3>Gold answer</h3>\n"
        html_output += f"<pre style='white-space: pre-wrap;'><code>{outputs[0]['Gold']}</code></pre>\n"

    for output in outputs:
        html_output += f"<h2>{output['Model']}</h2>\n"

        # Metrics table, one row per metric
        html_output += "<h3>Metrics</h3>\n"
        html_output += "<table>\n"
        for key, value in output['Metrics'].items():
            html_output += f"<tr><td>{key}</td><td>{value}</td></tr>\n"
        html_output += "</table>\n"

        # Prompt: chat prompts are lists of {role, content} messages
        html_output += "<h3>Prompt</h3>\n"
        prompt_text = output['Prompt']
        if isinstance(prompt_text, list):
            for msg in prompt_text:
                if isinstance(msg, dict) and 'content' in msg:
                    html_output += f"<pre style='white-space: pre-wrap;'><code>{msg['content']}</code></pre>\n"
                else:
                    html_output += f"<pre style='white-space: pre-wrap;'><code>{json.dumps(msg, indent=2)}</code></pre>\n"
        elif isinstance(prompt_text, dict) and 'content' in prompt_text:
            html_output += f"<pre style='white-space: pre-wrap;'><code>{prompt_text['content']}</code></pre>\n"
        else:
            html_output += f"<pre style='white-space: pre-wrap;'><code>{prompt_text}</code></pre>\n"

        # Model prediction
        html_output += "<h3>Prediction</h3>\n"
        html_output += f"<pre style='white-space: pre-wrap;'><code>{output['Prediction']}</code></pre>\n"

    html_output += "</div>"
    return html_output