<!--
computer-agent / templates /viewer.html
lvwerra's picture
lvwerra HF Staff
add vis
e781603
-->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Computer Agent Evaluation Viewer</title>
<style>
/* ---------- Page shell ---------- */
body {
  font-family: Arial, sans-serif;
  margin: 0;
  padding: 20px;
  background-color: #f5f5f5;
}
.container {
  max-width: 1200px;
  margin: 0 auto;
  background-color: #fff;
  padding: 20px;
  border-radius: 8px;
  box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
h1, h2, h3 {
  color: #333;
}

/* ---------- Form controls ---------- */
select, input, button {
  padding: 8px 12px;
  margin: 5px 0;
  border: 1px solid #ddd;
  border-radius: 4px;
}
button {
  background-color: #4a6cf7;
  color: white;
  cursor: pointer;
  border: none;
}
button:hover {
  background-color: #3a5ce5;
}
/* Declared after :hover (same specificity) so a disabled button stays grey when hovered. */
button:disabled {
  background-color: #cccccc;
  cursor: not-allowed;
}

/* ---------- Two-column selector row ---------- */
.row {
  display: flex;
  margin-bottom: 20px;
}
.col {
  flex: 1;
  padding: 0 10px;
}

/* ---------- Screenshot viewer ---------- */
.image-viewer {
  width: 100%;
  max-height: 500px;
  border: 1px solid #ddd;
  border-radius: 4px;
  overflow: hidden;
  margin-bottom: 10px;
  position: relative;
}
.image-viewer img {
  max-width: 100%;
  max-height: 450px;
  display: block;
  margin: 0 auto;
}
/* Used by the screenshot caption (#image-caption); the class was referenced
   in the markup but had no rule, so the caption was left-aligned. */
.text-center {
  text-align: center;
}
.image-controls {
  display: flex;
  justify-content: space-between;
  align-items: center;
  margin-top: 10px;
}
.nav-buttons {
  display: flex;
  gap: 10px;
}

/* ---------- Agent trace steps ---------- */
.step {
  border: 1px solid #ddd;
  border-radius: 4px;
  margin-bottom: 10px;
  overflow: hidden;
}
.step-header {
  background-color: #f0f0f0;
  padding: 10px;
  font-weight: bold;
  cursor: pointer;
  display: flex;
  justify-content: space-between;
}
/* pre-wrap keeps the newlines the script inserts between trace sections. */
.step-content {
  padding: 15px;
  white-space: pre-wrap;
  font-family: monospace;
  background-color: #f9f9f9;
  max-height: 300px;
  overflow-y: auto;
}

/* ---------- Utilities and status colours ---------- */
/* Must stay after the display:flex rules above so it wins at equal specificity. */
.hidden {
  display: none;
}
.status-success {
  color: #22c55e;
  font-weight: bold;
}
.status-failure {
  color: #ef4444;
  font-weight: bold;
}

/* ---------- Tabs ---------- */
.tabs {
  display: flex;
  border-bottom: 1px solid #ddd;
  margin-bottom: 20px;
}
.tab {
  padding: 10px 20px;
  cursor: pointer;
  border-bottom: 2px solid transparent;
}
.tab.active {
  border-bottom-color: #4a6cf7;
  font-weight: bold;
}
/* Only the panel carrying .active is shown. */
.tab-content {
  display: none;
}
.tab-content.active {
  display: block;
}

/* ---------- Raw JSON, errors, spinner ---------- */
pre {
  background-color: #f0f0f0;
  padding: 10px;
  border-radius: 4px;
  overflow-x: auto;
  white-space: pre-wrap;
}
.error-message {
  background-color: #fee2e2;
  color: #b91c1c;
  padding: 10px;
  border-radius: 4px;
  margin: 10px 0;
}
.loading {
  display: inline-block;
  width: 20px;
  height: 20px;
  border: 2px solid #f3f3f3;
  border-top: 2px solid #3498db;
  border-radius: 50%;
  animation: spin 1s linear infinite;
  margin-left: 10px;
}
@keyframes spin {
  0% { transform: rotate(0deg); }
  100% { transform: rotate(360deg); }
}
</style>
</head>
<body>
<div class="container">
  <h1>Computer Agent Evaluation Viewer</h1>
  <!-- Path and evaluation selection. The script reads #base-path, reloads the
       list when #refresh-evals-btn is clicked, and writes progress and error
       text into #load-status. -->
  <div style="margin-bottom: 20px; padding: 15px; background-color: #f0f0f0; border-radius: 8px;">
    <h2>Load Evaluation Data</h2>
    <div style="display: flex; gap: 10px; margin-top: 10px;">
      <!-- The placeholder is not an accessible name, so aria-label supplies one. -->
      <input type="text" id="base-path" placeholder="Base directory path (leave empty for default)"
             aria-label="Base directory path"
             style="flex-grow: 1; padding: 8px; border: 1px solid #ddd; border-radius: 4px;">
      <button type="button" id="refresh-evals-btn">Refresh</button>
    </div>
    <div style="margin-top: 10px;">
      <label for="eval-select">Select Evaluation:</label>
      <select id="eval-select" style="min-width: 300px;"></select>
    </div>
    <!-- role="status" makes assistive technology announce the loading and error messages. -->
    <div id="load-status" role="status" style="margin-top: 10px; font-style: italic;"></div>
  </div>
  <!-- Example and run selectors. Both start disabled; the script enables each
       one once the level above it has loaded. -->
  <div class="row">
    <div class="col">
      <label for="example-select">Select Example:</label>
      <select id="example-select" disabled>
        <option value="">-- Select Example --</option>
      </select>
    </div>
    <div class="col">
      <label for="run-select">Select Run:</label>
      <select id="run-select" disabled>
        <option value="">-- Select Run --</option>
      </select>
    </div>
  </div>
  <!-- Task and status display; hidden until a run is selected. -->
  <div id="run-details" class="hidden">
    <div>
      <h2>Task</h2>
      <pre id="task-text"></pre>
    </div>
    <div>
      <h2>Run Status</h2>
      <div id="status-display"></div>
    </div>
    <!-- Tabs: data-tab="X" activates the panel whose id is "X-tab". -->
    <div class="tabs">
      <div class="tab active" data-tab="screenshots">Screenshots</div>
      <div class="tab" data-tab="agent-trace">Agent Trace</div>
      <div class="tab" data-tab="raw-json">Raw JSON</div>
    </div>
    <!-- Screenshots tab -->
    <div id="screenshots-tab" class="tab-content active">
      <div id="no-images" class="hidden">
        <p>No screenshots available for this run.</p>
      </div>
      <div id="image-container" class="image-viewer hidden">
        <img id="current-image" src="" alt="Screenshot">
        <p id="image-caption" class="text-center"></p>
      </div>
      <div class="image-controls hidden" id="image-controls">
        <div class="nav-buttons">
          <button type="button" id="prev-image">Previous</button>
          <span id="image-counter">0 / 0</span>
          <button type="button" id="next-image">Next</button>
        </div>
        <input type="range" id="image-slider" min="0" max="0" value="0"
               aria-label="Screenshot position" style="width: 100%">
      </div>
    </div>
    <!-- Agent trace tab: filled by the script, one .step element per trace entry. -->
    <div id="agent-trace-tab" class="tab-content">
      <div id="agent-steps"></div>
    </div>
    <!-- Raw JSON tab -->
    <div id="raw-json-tab" class="tab-content">
      <div id="json-loading-indicator" class="hidden">
        <p>Loading metadata... <span class="loading"></span></p>
      </div>
      <div id="json-error" class="error-message hidden"></div>
      <pre id="raw-json"></pre>
    </div>
  </div>
</div>
<script>
// Application state: one mutable object shared by every handler in this script.
const appState = {
// Trimmed value of the base-path input; sent as the `path` query parameter on each API request.
basePath: '',
// Value of the evaluation dropdown ('' when the placeholder option is selected).
evalId: null,
// Value of the example dropdown for the current selection.
currentExampleId: null,
// Value of the run dropdown for the current selection.
currentRunId: null,
// Screenshots of the displayed run; each entry is read as { path, name } by updateImageDisplay.
currentImages: [],
// Index into currentImages of the screenshot currently shown.
currentImageIndex: 0,
// Responses fetched so far, stored as they arrive from the API.
loadedData: {
// exampleId -> task text; replaced wholesale whenever an evaluation's examples are loaded.
examples: {},
// exampleId -> array of run objects (read as { id, status }).
runs: {},
// exampleId -> runId -> metadata object (null when the metadata request failed).
metadata: {},
// exampleId -> runId -> screenshots response.
screenshots: {}
}
};
// Cached element references, resolved once at script start.
// Evaluation loading controls
const basePathInput = document.querySelector('#base-path');
const refreshEvalsBtn = document.querySelector('#refresh-evals-btn');
const evalSelect = document.querySelector('#eval-select');
const loadStatusDisplay = document.querySelector('#load-status');
// Example / run selectors and the details panel they reveal
const exampleSelect = document.querySelector('#example-select');
const runSelect = document.querySelector('#run-select');
const runDetails = document.querySelector('#run-details');
const taskText = document.querySelector('#task-text');
const statusDisplay = document.querySelector('#status-display');
// Screenshot viewer
const imageContainer = document.querySelector('#image-container');
const noImages = document.querySelector('#no-images');
const imageControls = document.querySelector('#image-controls');
const currentImage = document.querySelector('#current-image');
const imageCaption = document.querySelector('#image-caption');
const imageCounter = document.querySelector('#image-counter');
const imageSlider = document.querySelector('#image-slider');
const prevImage = document.querySelector('#prev-image');
const nextImage = document.querySelector('#next-image');
// Agent trace and raw JSON panels
const agentSteps = document.querySelector('#agent-steps');
const rawJson = document.querySelector('#raw-json');
const jsonLoadingIndicator = document.querySelector('#json-loading-indicator');
const jsonError = document.querySelector('#json-error');
// The Refresh button re-fetches the evaluation list for the current base path.
refreshEvalsBtn.addEventListener('click', loadEvaluations);
// Fetch the evaluation ids available under the current base path and
// repopulate the evaluation dropdown.
//
// Reads:  basePathInput.value
// Writes: appState.basePath, the options and value of evalSelect, loadStatusDisplay
// After a successful fetch a `change` event is always dispatched on evalSelect:
// with results it loads the greatest id (default string sort order); with an
// empty list it clears the example and run selectors left over from the
// previous base path. Failures are reported in loadStatusDisplay, not thrown.
async function loadEvaluations() {
    appState.basePath = basePathInput.value.trim();
    loadStatusDisplay.textContent = 'Loading evaluations...';
    refreshEvalsBtn.disabled = true;
    try {
        const response = await fetch(`/api/evals?path=${encodeURIComponent(appState.basePath)}`);
        if (!response.ok) {
            // An error body is not guaranteed to be JSON (e.g. an HTML error
            // page); fall back to the generic message instead of surfacing a
            // JSON parse error.
            const errorData = await response.json().catch(() => null);
            throw new Error(errorData?.error || 'Failed to load evaluations');
        }
        const evals = await response.json();
        if (!Array.isArray(evals)) {
            throw new Error('Unexpected response while loading evaluations');
        }
        // Rebuild the dropdown in the order the server returned.
        evalSelect.innerHTML = '<option value="">-- Select Evaluation --</option>';
        evals.forEach(evalId => {
            const option = document.createElement('option');
            option.value = evalId;
            option.textContent = evalId;
            evalSelect.appendChild(option);
        });
        loadStatusDisplay.textContent = `Loaded ${evals.length} evaluations`;
        // Auto-select the latest evaluation. Sort a copy so the array that
        // produced the dropdown order is not reordered.
        if (evals.length > 0) {
            const sortedIds = [...evals].sort();
            evalSelect.value = sortedIds[sortedIds.length - 1];
        }
        // Dispatch even when the list is empty so dependent state is reset.
        evalSelect.dispatchEvent(new Event('change'));
    } catch (err) {
        console.error('Error loading evaluations:', err);
        loadStatusDisplay.textContent = `Error: ${err.message}`;
    } finally {
        refreshEvalsBtn.disabled = false;
    }
}
// Handle evaluation selection: load the examples of the chosen evaluation into
// the example dropdown and auto-select the first one.
//
// Writes: appState.evalId, appState.loadedData.examples, the example and run
// selectors, run-details visibility, and loadStatusDisplay.
evalSelect.addEventListener('change', async () => {
    appState.evalId = evalSelect.value;
    // Everything below the evaluation level belongs to the previous selection.
    // Clear it up front so a failed request cannot leave another evaluation's
    // examples selectable.
    exampleSelect.innerHTML = '<option value="">-- Select Example --</option>';
    exampleSelect.disabled = true;
    runSelect.innerHTML = '<option value="">-- Select Run --</option>';
    runSelect.disabled = true;
    runDetails.classList.add('hidden');
    if (!appState.evalId) {
        return;
    }
    try {
        loadStatusDisplay.textContent = 'Loading examples...';
        evalSelect.disabled = true;
        // The id is a URL path segment, so it is percent-encoded like the query value.
        const response = await fetch(`/api/eval/${encodeURIComponent(appState.evalId)}/examples?path=${encodeURIComponent(appState.basePath)}`);
        if (!response.ok) {
            // The error body may not be JSON; fall back to the generic message.
            const errorData = await response.json().catch(() => null);
            throw new Error(errorData?.error || 'Failed to load examples');
        }
        const examples = await response.json();
        appState.loadedData.examples = examples;
        const exampleIds = Object.keys(examples);
        // Fill the example dropdown
        for (const [exampleId, task] of Object.entries(examples)) {
            const option = document.createElement('option');
            option.value = exampleId;
            option.textContent = exampleId;
            option.title = task; // Show task as tooltip
            exampleSelect.appendChild(option);
        }
        exampleSelect.disabled = false;
        loadStatusDisplay.textContent = `Loaded ${exampleIds.length} examples`;
        // Auto-select the first example and let its change handler load the runs.
        if (exampleIds.length > 0) {
            exampleSelect.value = exampleIds[0];
            exampleSelect.dispatchEvent(new Event('change'));
        }
    } catch (err) {
        console.error('Error loading examples:', err);
        loadStatusDisplay.textContent = `Error: ${err.message}`;
    } finally {
        evalSelect.disabled = false;
    }
});
// Handle example selection: load the runs of the chosen example into the run
// dropdown (sorted by id, numeric-aware) and auto-select the first one.
//
// Writes: appState.currentExampleId, appState.loadedData.runs[exampleId],
// the run selector, run-details visibility, and loadStatusDisplay.
exampleSelect.addEventListener('change', async () => {
    appState.currentExampleId = exampleSelect.value;
    // The previous example's runs and details no longer apply. Clear them up
    // front so they are not left on screen if the request below fails.
    runSelect.innerHTML = '<option value="">-- Select Run --</option>';
    runSelect.disabled = true;
    runDetails.classList.add('hidden');
    if (!appState.currentExampleId) {
        return;
    }
    try {
        loadStatusDisplay.textContent = 'Loading runs...';
        exampleSelect.disabled = true;
        // Both ids are URL path segments, so each is percent-encoded.
        const response = await fetch(`/api/eval/${encodeURIComponent(appState.evalId)}/example/${encodeURIComponent(appState.currentExampleId)}/runs?path=${encodeURIComponent(appState.basePath)}`);
        if (!response.ok) {
            // The error body may not be JSON; fall back to the generic message.
            const errorData = await response.json().catch(() => null);
            throw new Error(errorData?.error || 'Failed to load runs');
        }
        const runs = await response.json();
        appState.loadedData.runs[appState.currentExampleId] = runs;
        // Sort in place by id. String() keeps the comparison working when the
        // server sends numeric ids (numbers have no localeCompare), and
        // numeric: true orders "run_2" before "run_10".
        runs.sort((a, b) => String(a.id).localeCompare(String(b.id), undefined, {numeric: true}));
        runs.forEach(run => {
            const option = document.createElement('option');
            option.value = run.id;
            option.textContent = `${run.id} (${run.status})`;
            option.dataset.status = run.status;
            runSelect.appendChild(option);
        });
        runSelect.disabled = false;
        loadStatusDisplay.textContent = `Loaded ${runs.length} runs`;
        // Auto-select the first run and let its change handler load the data.
        if (runs.length > 0) {
            runSelect.value = runs[0].id;
            runSelect.dispatchEvent(new Event('change'));
        }
    } catch (err) {
        console.error('Error loading runs:', err);
        loadStatusDisplay.textContent = `Error: ${err.message}`;
    } finally {
        exampleSelect.disabled = false;
    }
});
// React to the run dropdown: remember the chosen run and, when both an example
// and a run are selected, start loading its data and reveal the details panel.
runSelect.addEventListener('change', () => {
    const chosenRun = runSelect.value;
    appState.currentRunId = chosenRun;
    const chosenExample = appState.currentExampleId;
    if (!chosenRun || !chosenExample) {
        runDetails.classList.add('hidden');
        return;
    }
    // loadRunData is async and deliberately not awaited; the panel is shown
    // straight away and filled in as the responses arrive.
    loadRunData(chosenExample, chosenRun);
    runDetails.classList.remove('hidden');
});
// Fill the "Run Status" panel for one run.
// Text is assigned through textContent, so server-supplied strings such as
// metadata.error_message are shown literally and never parsed as HTML.
function renderRunStatus(metadata) {
    statusDisplay.textContent = '';
    const addLine = (text, className) => {
        const paragraph = document.createElement('p');
        if (className) {
            const badge = document.createElement('span');
            badge.className = className;
            badge.textContent = text;
            paragraph.appendChild(badge);
        } else {
            paragraph.textContent = text;
        }
        statusDisplay.appendChild(paragraph);
    };
    if (!metadata) {
        addLine('Status information not available');
        return;
    }
    if (metadata.status === 'completed') {
        addLine('✓ Completed successfully', 'status-success');
        return;
    }
    addLine('✗ Failed', 'status-failure');
    if (metadata.error_message) {
        addLine(`Error: ${metadata.error_message}`);
    }
}

// Load and display everything for one run: metadata (task, status, agent
// trace, raw JSON) and screenshots.
//
// exampleId, runId: identifiers taken from the example and run dropdowns.
// Writes: appState.loadedData.metadata / .screenshots for this run, plus the
// task, status, screenshot, trace and raw-JSON panels and loadStatusDisplay.
// A failed metadata request is reported in the Raw JSON tab and the remaining
// panels are still rendered. A failed or malformed screenshots response is
// treated as "no screenshots" so the trace and raw JSON are still rendered.
// Unexpected errors are reported, not thrown.
async function loadRunData(exampleId, runId) {
    loadStatusDisplay.textContent = 'Loading run data...';
    runSelect.disabled = true;
    jsonLoadingIndicator.classList.remove('hidden');
    jsonError.classList.add('hidden');
    // The ids are URL path segments, so each is percent-encoded.
    const runUrl = `/api/eval/${encodeURIComponent(appState.evalId)}/example/${encodeURIComponent(exampleId)}/run/${encodeURIComponent(runId)}`;
    const pathQuery = `?path=${encodeURIComponent(appState.basePath)}`;
    try {
        // Get metadata
        const metadataResponse = await fetch(`${runUrl}/metadata${pathQuery}`);
        let metadata = null;
        if (metadataResponse.ok) {
            metadata = await metadataResponse.json();
        } else {
            // The error body may not be JSON; fall back to "Unknown error".
            const errorData = await metadataResponse.json().catch(() => null);
            console.error('Error loading metadata:', errorData);
            jsonError.textContent = `Error loading metadata: ${errorData?.error || 'Unknown error'}`;
            jsonError.classList.remove('hidden');
        }
        appState.loadedData.metadata[exampleId] = appState.loadedData.metadata[exampleId] || {};
        appState.loadedData.metadata[exampleId][runId] = metadata;
        // Display task
        const task = appState.loadedData.examples[exampleId];
        taskText.textContent = task || "No task available";
        // Display status
        renderRunStatus(metadata);
        // Get screenshots. Only an array is accepted: an error payload stored
        // as-is would make the image viewer throw and abort the steps below.
        const screenshotsResponse = await fetch(`${runUrl}/screenshots${pathQuery}`);
        let screenshots = [];
        let screenshotsAvailable = false;
        if (screenshotsResponse.ok) {
            const payload = await screenshotsResponse.json();
            if (Array.isArray(payload)) {
                screenshots = payload;
                screenshotsAvailable = true;
            } else {
                console.error('Unexpected screenshots response:', payload);
            }
        } else {
            console.error('Error loading screenshots: HTTP', screenshotsResponse.status);
        }
        appState.loadedData.screenshots[exampleId] = appState.loadedData.screenshots[exampleId] || {};
        appState.loadedData.screenshots[exampleId][runId] = screenshots;
        // Load screenshots
        loadScreenshots(exampleId, runId);
        // Load agent trace
        renderAgentTrace(metadata);
        // Display raw JSON
        if (metadata) {
            rawJson.textContent = JSON.stringify(metadata, null, 2);
        } else {
            rawJson.textContent = "No metadata available";
        }
        // Show screenshots tab by default
        document.querySelector('.tab[data-tab="screenshots"]').click();
        loadStatusDisplay.textContent = screenshotsAvailable
            ? 'Run data loaded successfully'
            : 'Run data loaded (screenshots could not be loaded)';
    } catch (err) {
        console.error('Error loading run data:', err);
        loadStatusDisplay.textContent = `Error: ${err.message}`;
        jsonError.textContent = `Error loading data: ${err.message}`;
        jsonError.classList.remove('hidden');
    } finally {
        jsonLoadingIndicator.classList.add('hidden');
        runSelect.disabled = false;
    }
}
// Point the image viewer at the stored screenshots of the given run.
// With no screenshots the viewer and its controls are hidden and the
// "no screenshots" notice is shown; otherwise the slider range is set up and
// the first screenshot is displayed.
function loadScreenshots(exampleId, runId) {
    const shots = appState.loadedData.screenshots[exampleId]?.[runId] || [];
    appState.currentImages = shots;
    const hasShots = shots.length !== 0;
    // Viewer and controls are visible exactly when the notice is not.
    imageContainer.classList.toggle('hidden', !hasShots);
    imageControls.classList.toggle('hidden', !hasShots);
    noImages.classList.toggle('hidden', hasShots);
    if (!hasShots) {
        return;
    }
    // Slider covers indices 0 .. last screenshot
    imageSlider.min = 0;
    imageSlider.max = shots.length - 1;
    imageSlider.value = 0;
    // Always start from the first screenshot
    appState.currentImageIndex = 0;
    updateImageDisplay();
}
// Sync the viewer (image, caption, counter, slider, button states) with
// appState.currentImageIndex. Does nothing when there are no screenshots.
function updateImageDisplay() {
    const { currentImages: shots, currentImageIndex: position } = appState;
    const total = shots.length;
    if (total === 0) {
        return;
    }
    const shot = shots[position];
    currentImage.src = shot.path;
    imageCaption.textContent = shot.name;
    // The counter is 1-based for display
    imageCounter.textContent = `${position + 1} / ${total}`;
    imageSlider.value = position;
    // Previous/Next are disabled at the ends of the sequence
    prevImage.disabled = position === 0;
    nextImage.disabled = position === total - 1;
}
// Image navigation: Previous/Next buttons move one screenshot at a time within
// bounds; the slider jumps straight to the chosen index.
{
    // Build a click handler that shifts the index by `delta` only while
    // `canMove()` holds, then refreshes the viewer.
    const makeStepHandler = (canMove, delta) => () => {
        if (!canMove()) {
            return;
        }
        appState.currentImageIndex += delta;
        updateImageDisplay();
    };
    prevImage.addEventListener('click', makeStepHandler(
        () => appState.currentImageIndex > 0,
        -1
    ));
    nextImage.addEventListener('click', makeStepHandler(
        () => appState.currentImageIndex < appState.currentImages.length - 1,
        1
    ));
    imageSlider.addEventListener('input', () => {
        appState.currentImageIndex = Number.parseInt(imageSlider.value, 10);
        updateImageDisplay();
    });
}
// Tab handling: clicking a tab marks it active and shows the panel whose id is
// "<data-tab value>-tab", deactivating all other tabs and panels.
for (const tab of document.querySelectorAll('.tab')) {
    tab.addEventListener('click', () => {
        // Highlight only the clicked tab
        for (const otherTab of document.querySelectorAll('.tab')) {
            otherTab.classList.remove('active');
        }
        tab.classList.add('active');
        // Reveal only the matching panel
        const tabId = tab.getAttribute('data-tab');
        for (const panel of document.querySelectorAll('.tab-content')) {
            panel.classList.remove('active');
        }
        document.getElementById(`${tabId}-tab`).classList.add('active');
    });
}
// Render the "Agent Trace" tab from a run's metadata.
//
// metadata: parsed metadata object, or null/undefined. Only metadata.summary
//   is read; each entry ("step") may carry task, model_output_message.content,
//   plan, tool_calls[].function.{name, arguments}, model_output, observations,
//   action_output and error.{type, message}.
// Side effect: replaces the contents of #agent-steps with one collapsible
//   .step element per entry, all expanded by default.
//
// Section labels are real <strong> elements and every value is inserted as a
// text node, so the labels render bold and trace text is never parsed as HTML.
function renderAgentTrace(metadata) {
    agentSteps.innerHTML = '';
    if (!metadata || !metadata.summary || metadata.summary.length === 0) {
        agentSteps.innerHTML = '<p>No agent trace data available</p>';
        return;
    }
    // Strings are shown as-is; anything else (e.g. tool arguments delivered as
    // an object) is pretty-printed instead of collapsing to "[object Object]".
    const asText = (value) => (
        typeof value === 'string' ? value : String(JSON.stringify(value, null, 2))
    );
    // Process each step
    metadata.summary.forEach((step, index) => {
        const stepDiv = document.createElement('div');
        stepDiv.className = 'step';
        // Step header: a label and an arrow showing the expanded/collapsed state
        const headerDiv = document.createElement('div');
        headerDiv.className = 'step-header';
        let headerText = `Step ${index}`;
        if (index === 0 && step.task) {
            headerText = 'Task';
        } else if (step.model_output_message) {
            headerText = 'Planning';
        } else if (step.tool_calls) {
            headerText = `Action ${index}`;
        } else if (step.error) {
            headerText = 'Error';
        }
        const labelSpan = document.createElement('span');
        labelSpan.textContent = headerText;
        const arrowSpan = document.createElement('span');
        arrowSpan.textContent = '▲';
        headerDiv.append(labelSpan, arrowSpan);
        stepDiv.appendChild(headerDiv);
        // Step content, visible by default
        const contentDiv = document.createElement('div');
        contentDiv.className = 'step-content';
        contentDiv.style.display = 'block';
        // Append "<strong>label</strong>" followed by the value. Block
        // sections put the value on its own line and end with a blank line;
        // inline sections keep label and value on one line. The newlines are
        // preserved by the white-space: pre-wrap rule on .step-content.
        const addSection = (label, value, inline = false) => {
            const heading = document.createElement('strong');
            heading.textContent = label;
            const text = inline ? ` ${asText(value)}\n` : `\n${asText(value)}\n\n`;
            contentDiv.append(heading, text);
        };
        // Task text; the header already says "Task", so no label is repeated
        if (index === 0 && step.task) {
            contentDiv.append(`${asText(step.task)}\n\n`);
        }
        // Model output and planning
        if (step.model_output_message && step.model_output_message.content) {
            addSection('Model Output:', step.model_output_message.content);
            if (step.plan) {
                addSection('Plan:', step.plan);
            }
        }
        // Tool calls
        if (step.tool_calls && step.tool_calls.length > 0) {
            step.tool_calls.forEach(toolCall => {
                if (!toolCall.function) {
                    return;
                }
                addSection('Tool Call:', toolCall.function.name, true);
                if (toolCall.function.arguments) {
                    addSection('Arguments:', toolCall.function.arguments);
                }
            });
        }
        // Model reasoning
        if (step.model_output) {
            addSection('Model Reasoning:', step.model_output);
        }
        // Observations
        if (step.observations) {
            addSection('Observations:', step.observations);
        }
        // Action output
        if (step.action_output) {
            addSection('Action Output:', step.action_output);
        }
        // Errors
        if (step.error) {
            addSection('Error Type:', step.error.type || 'Unknown', true);
            if (step.error.message) {
                addSection('Error Message:', step.error.message, true);
            }
        }
        if (!contentDiv.hasChildNodes()) {
            contentDiv.textContent = "No content available for this step";
        }
        stepDiv.appendChild(contentDiv);
        // Clicking the header collapses or expands the content
        headerDiv.addEventListener('click', () => {
            const isHidden = contentDiv.style.display === 'none';
            contentDiv.style.display = isHidden ? 'block' : 'none';
            arrowSpan.textContent = isHidden ? '▲' : '▼';
        });
        agentSteps.appendChild(stepDiv);
    });
}
// Keyboard navigation for screenshots: Left/Right arrows move to the previous
// or next image while the Screenshots tab of a visible run is showing.
document.addEventListener('keydown', (e) => {
    if (!appState.currentImages || appState.currentImages.length === 0) return;
    // Leave browser and OS shortcuts such as Alt+Left (history back) alone.
    if (e.altKey || e.ctrlKey || e.metaKey) return;
    // Form controls handle arrow keys themselves (caret movement, option
    // change, slider step). The slider already reports its new value through
    // its `input` event, so reacting here as well would advance two images per
    // key press.
    const origin = e.target;
    if (origin instanceof Element &&
        (origin.isContentEditable || origin.matches('input, select, textarea'))) {
        return;
    }
    // Nothing to navigate while the run details panel is hidden.
    if (runDetails.classList.contains('hidden')) return;
    // Check if the screenshots tab is active
    const screenshotsTab = document.getElementById('screenshots-tab');
    if (!screenshotsTab.classList.contains('active')) return;
    if (e.key === 'ArrowLeft' && appState.currentImageIndex > 0) {
        appState.currentImageIndex--;
        updateImageDisplay();
    } else if (e.key === 'ArrowRight' && appState.currentImageIndex < appState.currentImages.length - 1) {
        appState.currentImageIndex++;
        updateImageDisplay();
    }
});
// Load evaluations once the document has been parsed
document.addEventListener('DOMContentLoaded', loadEvaluations);
</script>
</body>
</html>