Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
<html lang="en"> | |
<head> | |
<meta charset="UTF-8"> | |
<meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
<title>Computer Agent Evaluation Viewer</title> | |
<style> | |
/* CSS styles here */ | |
body { | |
font-family: Arial, sans-serif; | |
margin: 0; | |
padding: 20px; | |
background-color: #f5f5f5; | |
} | |
.container { | |
max-width: 1200px; | |
margin: 0 auto; | |
background-color: #fff; | |
padding: 20px; | |
border-radius: 8px; | |
box-shadow: 0 2px 10px rgba(0,0,0,0.1); | |
} | |
h1, h2, h3 { | |
color: #333; | |
} | |
select, input, button { | |
padding: 8px 12px; | |
margin: 5px 0; | |
border: 1px solid #ddd; | |
border-radius: 4px; | |
} | |
button { | |
background-color: #4a6cf7; | |
color: white; | |
cursor: pointer; | |
border: none; | |
} | |
button:hover { | |
background-color: #3a5ce5; | |
} | |
button:disabled { | |
background-color: #cccccc; | |
cursor: not-allowed; | |
} | |
.row { | |
display: flex; | |
margin-bottom: 20px; | |
} | |
.col { | |
flex: 1; | |
padding: 0 10px; | |
} | |
.image-viewer { | |
width: 100%; | |
max-height: 500px; | |
border: 1px solid #ddd; | |
border-radius: 4px; | |
overflow: hidden; | |
margin-bottom: 10px; | |
position: relative; | |
} | |
.image-viewer img { | |
max-width: 100%; | |
max-height: 450px; | |
display: block; | |
margin: 0 auto; | |
} | |
.image-controls { | |
display: flex; | |
justify-content: space-between; | |
align-items: center; | |
margin-top: 10px; | |
} | |
.nav-buttons { | |
display: flex; | |
gap: 10px; | |
} | |
.step { | |
border: 1px solid #ddd; | |
border-radius: 4px; | |
margin-bottom: 10px; | |
overflow: hidden; | |
} | |
.step-header { | |
background-color: #f0f0f0; | |
padding: 10px; | |
font-weight: bold; | |
cursor: pointer; | |
display: flex; | |
justify-content: space-between; | |
} | |
.step-content { | |
padding: 15px; | |
white-space: pre-wrap; | |
font-family: monospace; | |
background-color: #f9f9f9; | |
max-height: 300px; | |
overflow-y: auto; | |
} | |
.hidden { | |
display: none; | |
} | |
.status-success { | |
color: #22c55e; | |
font-weight: bold; | |
} | |
.status-failure { | |
color: #ef4444; | |
font-weight: bold; | |
} | |
.tabs { | |
display: flex; | |
border-bottom: 1px solid #ddd; | |
margin-bottom: 20px; | |
} | |
.tab { | |
padding: 10px 20px; | |
cursor: pointer; | |
border-bottom: 2px solid transparent; | |
} | |
.tab.active { | |
border-bottom-color: #4a6cf7; | |
font-weight: bold; | |
} | |
.tab-content { | |
display: none; | |
} | |
.tab-content.active { | |
display: block; | |
} | |
pre { | |
background-color: #f0f0f0; | |
padding: 10px; | |
border-radius: 4px; | |
overflow-x: auto; | |
white-space: pre-wrap; | |
} | |
.error-message { | |
background-color: #fee2e2; | |
color: #b91c1c; | |
padding: 10px; | |
border-radius: 4px; | |
margin: 10px 0; | |
} | |
.loading { | |
display: inline-block; | |
width: 20px; | |
height: 20px; | |
border: 2px solid #f3f3f3; | |
border-top: 2px solid #3498db; | |
border-radius: 50%; | |
animation: spin 1s linear infinite; | |
margin-left: 10px; | |
} | |
@keyframes spin { | |
0% { transform: rotate(0deg); } | |
100% { transform: rotate(360deg); } | |
} | |
</style> | |
</head> | |
<body> | |
<!--
  Main application container. Element ids are referenced by the inline script
  below and must stay stable. Accessibility notes:
  - every form control has a programmatic label (placeholder text is not a label);
  - #load-status is a polite status region, #json-error is an alert region, so
    asynchronous updates written by the script are announced;
  - buttons declare type="button" so they never act as submit buttons.
-->
<div class="container">
  <h1>Computer Agent Evaluation Viewer</h1>
  <!-- Path and Eval Selection -->
  <div style="margin-bottom: 20px; padding: 15px; background-color: #f0f0f0; border-radius: 8px;">
    <h2>Load Evaluation Data</h2>
    <label for="base-path">Base directory path</label>
    <div style="display: flex; gap: 10px; margin-top: 10px;">
      <input type="text" id="base-path" placeholder="Base directory path (leave empty for default)"
             style="flex-grow: 1; padding: 8px; border: 1px solid #ddd; border-radius: 4px;">
      <button type="button" id="refresh-evals-btn">Refresh</button>
    </div>
    <div style="margin-top: 10px;">
      <label for="eval-select">Select Evaluation:</label>
      <select id="eval-select" style="min-width: 300px;"></select>
    </div>
    <!-- Progress / error text written by the script; announced politely -->
    <div id="load-status" role="status" style="margin-top: 10px; font-style: italic;"></div>
  </div>
  <!-- Example and Run Selectors -->
  <div class="row">
    <div class="col">
      <label for="example-select">Select Example:</label>
      <select id="example-select">
        <option value="">-- Select Example --</option>
      </select>
    </div>
    <div class="col">
      <label for="run-select">Select Run:</label>
      <select id="run-select" disabled>
        <option value="">-- Select Run --</option>
      </select>
    </div>
  </div>
  <!-- Task & Status Display (hidden until a run is selected) -->
  <div id="run-details" class="hidden">
    <div>
      <h2>Task</h2>
      <pre id="task-text"></pre>
    </div>
    <div>
      <h2>Run Status</h2>
      <div id="status-display"></div>
    </div>
    <!-- Tabs: data-tab="X" activates the panel with id "X-tab" -->
    <div class="tabs">
      <div class="tab active" data-tab="screenshots">Screenshots</div>
      <div class="tab" data-tab="agent-trace">Agent Trace</div>
      <div class="tab" data-tab="raw-json">Raw JSON</div>
    </div>
    <!-- Screenshots Tab -->
    <div id="screenshots-tab" class="tab-content active">
      <div id="no-images" class="hidden">
        <p>No screenshots available for this run.</p>
      </div>
      <div id="image-container" class="image-viewer hidden">
        <img id="current-image" src="" alt="Screenshot">
        <p id="image-caption" class="text-center"></p>
      </div>
      <div class="image-controls hidden" id="image-controls">
        <div class="nav-buttons">
          <button type="button" id="prev-image">Previous</button>
          <span id="image-counter">0 / 0</span>
          <button type="button" id="next-image">Next</button>
        </div>
        <input type="range" id="image-slider" min="0" max="0" value="0" style="width: 100%"
               aria-label="Screenshot position">
      </div>
    </div>
    <!-- Agent Trace Tab -->
    <div id="agent-trace-tab" class="tab-content">
      <div id="agent-steps"></div>
    </div>
    <!-- Raw JSON Tab -->
    <div id="raw-json-tab" class="tab-content">
      <div id="json-loading-indicator" class="hidden">
        <p>Loading metadata... <span class="loading"></span></p>
      </div>
      <div id="json-error" class="error-message hidden" role="alert"></div>
      <pre id="raw-json"></pre>
    </div>
  </div>
</div>
<script> | |
// Application state: one mutable object shared by every handler in this script.
const appState = {
    // Directory forwarded to the API as the `path` query parameter
    basePath: '',
    // Identifiers of the current evaluation / example / run selection
    evalId: null,
    currentExampleId: null,
    currentRunId: null,
    // Screenshot list of the selected run and the index of the one on display
    currentImages: [],
    currentImageIndex: 0,
    // Server responses kept after loading
    loadedData: {
        examples: {},
        runs: {},
        metadata: {},
        screenshots: {}
    }
};

// DOM elements, resolved once by id.
// -- evaluation loading panel
const basePathInput = document.querySelector('#base-path');
const refreshEvalsBtn = document.querySelector('#refresh-evals-btn');
const evalSelect = document.querySelector('#eval-select');
const loadStatusDisplay = document.querySelector('#load-status');
// -- example / run selectors and the run details panel
const exampleSelect = document.querySelector('#example-select');
const runSelect = document.querySelector('#run-select');
const runDetails = document.querySelector('#run-details');
const taskText = document.querySelector('#task-text');
const statusDisplay = document.querySelector('#status-display');
// -- screenshot viewer
const imageContainer = document.querySelector('#image-container');
const noImages = document.querySelector('#no-images');
const imageControls = document.querySelector('#image-controls');
const currentImage = document.querySelector('#current-image');
const imageCaption = document.querySelector('#image-caption');
const imageCounter = document.querySelector('#image-counter');
const imageSlider = document.querySelector('#image-slider');
const prevImage = document.querySelector('#prev-image');
const nextImage = document.querySelector('#next-image');
// -- agent trace and raw JSON tabs
const agentSteps = document.querySelector('#agent-steps');
const rawJson = document.querySelector('#raw-json');
const jsonLoadingIndicator = document.querySelector('#json-loading-indicator');
const jsonError = document.querySelector('#json-error');

// The Refresh button re-reads the list of available evaluations.
refreshEvalsBtn.addEventListener('click', loadEvaluations);
// Load evaluations from server | |
/**
 * Fetch the list of evaluation ids for the base path typed by the user,
 * rebuild the evaluation dropdown and select the latest evaluation.
 *
 * "Latest" means the id that sorts last lexicographically. After the dropdown
 * is rebuilt a `change` event is always dispatched on it, including when the
 * list is empty, so the example/run selectors and the run details never keep
 * showing data from a previously loaded base path.
 *
 * Failures are reported in the load-status element and logged; nothing is
 * rethrown to the caller.
 */
async function loadEvaluations() {
    appState.basePath = basePathInput.value.trim();
    loadStatusDisplay.textContent = 'Loading evaluations...';
    refreshEvalsBtn.disabled = true;
    try {
        const response = await fetch(`/api/evals?path=${encodeURIComponent(appState.basePath)}`);
        if (!response.ok) {
            let message = 'Failed to load evaluations';
            try {
                const errorData = await response.json();
                if (errorData && errorData.error) {
                    message = errorData.error;
                }
            } catch (parseErr) {
                // Error body was not JSON (e.g. an HTML error page): report the HTTP status instead
                message = `${message} (HTTP ${response.status})`;
            }
            throw new Error(message);
        }
        const evals = await response.json();
        if (!Array.isArray(evals)) {
            throw new Error('Unexpected response while loading evaluations');
        }
        // Clear existing options
        evalSelect.innerHTML = '<option value="">-- Select Evaluation --</option>';
        // Add new options
        evals.forEach(evalId => {
            const option = document.createElement('option');
            option.value = evalId;
            option.textContent = evalId;
            evalSelect.appendChild(option);
        });
        loadStatusDisplay.textContent = `Loaded ${evals.length} evaluations`;
        // AUTO-SELECT LATEST EVALUATION
        if (evals.length > 0) {
            // Sort evaluations to get the latest one
            evals.sort().reverse();
            evalSelect.value = evals[0];
        }
        // Always notify the change handler: with a selection it loads the examples,
        // with an empty list it resets the dependent selectors and hides stale details.
        evalSelect.dispatchEvent(new Event('change'));
    } catch (err) {
        console.error('Error loading evaluations:', err);
        loadStatusDisplay.textContent = `Error: ${err.message}`;
    } finally {
        refreshEvalsBtn.disabled = false;
    }
}
// Handle evaluation selection | |
// Evaluation selection: load the examples of the chosen evaluation, fill the
// example dropdown and auto-select the first example. An empty selection
// resets and disables the dependent selectors and hides the run details.
evalSelect.addEventListener('change', async () => {
    appState.evalId = evalSelect.value;
    if (!appState.evalId) {
        exampleSelect.innerHTML = '<option value="">-- Select Example --</option>';
        exampleSelect.disabled = true;
        runSelect.innerHTML = '<option value="">-- Select Run --</option>';
        runSelect.disabled = true;
        runDetails.classList.add('hidden');
        return;
    }
    try {
        loadStatusDisplay.textContent = 'Loading examples...';
        evalSelect.disabled = true;
        // The id is a URL path segment: encode it so characters such as '?', '#'
        // or '%' cannot change which endpoint is requested.
        const evalSegment = encodeURIComponent(appState.evalId);
        const response = await fetch(`/api/eval/${evalSegment}/examples?path=${encodeURIComponent(appState.basePath)}`);
        if (!response.ok) {
            let message = 'Failed to load examples';
            try {
                const errorData = await response.json();
                if (errorData && errorData.error) {
                    message = errorData.error;
                }
            } catch (parseErr) {
                // Non-JSON error body: fall back to the HTTP status
                message = `${message} (HTTP ${response.status})`;
            }
            throw new Error(message);
        }
        const examples = await response.json();
        appState.loadedData.examples = examples;
        const exampleIds = Object.keys(examples);
        // Update example dropdown
        exampleSelect.innerHTML = '<option value="">-- Select Example --</option>';
        for (const [exampleId, task] of Object.entries(examples)) {
            const option = document.createElement('option');
            option.value = exampleId;
            option.textContent = exampleId;
            option.title = task; // Show task as tooltip
            exampleSelect.appendChild(option);
        }
        exampleSelect.disabled = false;
        runSelect.innerHTML = '<option value="">-- Select Run --</option>';
        runSelect.disabled = true;
        runDetails.classList.add('hidden');
        loadStatusDisplay.textContent = `Loaded ${exampleIds.length} examples`;
        // AUTO-SELECT FIRST EXAMPLE
        if (exampleIds.length > 0) {
            exampleSelect.value = exampleIds[0];
            // Trigger change event to load runs
            exampleSelect.dispatchEvent(new Event('change'));
        }
    } catch (err) {
        console.error('Error loading examples:', err);
        loadStatusDisplay.textContent = `Error: ${err.message}`;
    } finally {
        evalSelect.disabled = false;
    }
});
// Example selection | |
// Example selection: load the runs of the chosen example, fill the run
// dropdown in natural (numeric-aware) id order and auto-select the first run.
exampleSelect.addEventListener('change', async () => {
    appState.currentExampleId = exampleSelect.value;
    // Reset run selection. The previous example's run must not stay selectable
    // or visible while (or if) the new list fails to load.
    runSelect.innerHTML = '<option value="">-- Select Run --</option>';
    runSelect.disabled = true;
    runDetails.classList.add('hidden');
    if (!appState.currentExampleId) {
        return;
    }
    try {
        loadStatusDisplay.textContent = 'Loading runs...';
        exampleSelect.disabled = true;
        // Ids are URL path segments: encode them so reserved characters cannot
        // alter the requested endpoint.
        const evalSegment = encodeURIComponent(appState.evalId);
        const exampleSegment = encodeURIComponent(appState.currentExampleId);
        const response = await fetch(`/api/eval/${evalSegment}/example/${exampleSegment}/runs?path=${encodeURIComponent(appState.basePath)}`);
        if (!response.ok) {
            let message = 'Failed to load runs';
            try {
                const errorData = await response.json();
                if (errorData && errorData.error) {
                    message = errorData.error;
                }
            } catch (parseErr) {
                // Non-JSON error body: fall back to the HTTP status
                message = `${message} (HTTP ${response.status})`;
            }
            throw new Error(message);
        }
        const runs = await response.json();
        appState.loadedData.runs[appState.currentExampleId] = runs;
        // SORT RUNS by ID (assuming run IDs have timestamps or sequence numbers).
        // String() keeps the comparison working when the server sends numeric ids.
        runs.sort((a, b) => String(a.id).localeCompare(String(b.id), undefined, {numeric: true}));
        // Update run dropdown with sorted runs
        runSelect.innerHTML = '<option value="">-- Select Run --</option>';
        runs.forEach(run => {
            const option = document.createElement('option');
            option.value = run.id;
            option.textContent = `${run.id} (${run.status})`;
            option.dataset.status = run.status;
            runSelect.appendChild(option);
        });
        runSelect.disabled = false;
        loadStatusDisplay.textContent = `Loaded ${runs.length} runs`;
        // AUTO-SELECT FIRST RUN
        if (runs.length > 0) {
            runSelect.value = runs[0].id;
            // Trigger change event to load run data
            runSelect.dispatchEvent(new Event('change'));
        }
    } catch (err) {
        console.error('Error loading runs:', err);
        loadStatusDisplay.textContent = `Error: ${err.message}`;
    } finally {
        exampleSelect.disabled = false;
    }
});
// Run selection | |
// Run selection: remember the chosen run; when both an example and a run are
// selected, start loading the run and reveal the details panel, otherwise
// keep the panel hidden.
runSelect.addEventListener('change', () => {
    const chosenRun = runSelect.value;
    appState.currentRunId = chosenRun;
    const chosenExample = appState.currentExampleId;
    if (!chosenRun || !chosenExample) {
        runDetails.classList.add('hidden');
        return;
    }
    loadRunData(chosenExample, chosenRun);
    runDetails.classList.remove('hidden');
});
// Load run data | |
/**
 * Load and display everything known about one run: task text, run status,
 * screenshots, agent trace and raw metadata JSON.
 *
 * A metadata failure is shown in the JSON error box and the rest of the run
 * is still rendered. A screenshot failure is logged and rendered as "no
 * screenshots". Any other error is reported in the load-status element and
 * the JSON error box; nothing is rethrown.
 *
 * @param {string} exampleId - id of the example the run belongs to
 * @param {string} runId - id of the run to load
 */
async function loadRunData(exampleId, runId) {
    loadStatusDisplay.textContent = 'Loading run data...';
    runSelect.disabled = true;
    jsonLoadingIndicator.classList.remove('hidden');
    jsonError.classList.add('hidden');
    // Ids are URL path segments: encode them so reserved characters cannot
    // alter the requested endpoint.
    const runUrl = `/api/eval/${encodeURIComponent(appState.evalId)}` +
        `/example/${encodeURIComponent(exampleId)}/run/${encodeURIComponent(runId)}`;
    const pathQuery = `path=${encodeURIComponent(appState.basePath)}`;
    try {
        // Get metadata
        const metadataResponse = await fetch(`${runUrl}/metadata?${pathQuery}`);
        let metadata;
        if (metadataResponse.ok) {
            metadata = await metadataResponse.json();
        } else {
            let errorData = null;
            try {
                errorData = await metadataResponse.json();
            } catch (parseErr) {
                // Non-JSON error body: keep going with the generic message below
            }
            console.error('Error loading metadata:', errorData || metadataResponse.status);
            jsonError.textContent = `Error loading metadata: ${(errorData && errorData.error) || 'Unknown error'}`;
            jsonError.classList.remove('hidden');
            metadata = null;
        }
        appState.loadedData.metadata[exampleId] = appState.loadedData.metadata[exampleId] || {};
        appState.loadedData.metadata[exampleId][runId] = metadata;
        // Display task
        const task = appState.loadedData.examples[exampleId];
        taskText.textContent = task || "No task available";
        // Display status. Built from DOM nodes with textContent because the
        // error message comes from run data and must never be parsed as HTML.
        statusDisplay.textContent = '';
        const statusLine = document.createElement('p');
        if (metadata) {
            const badge = document.createElement('span');
            if (metadata.status === 'completed') {
                badge.className = 'status-success';
                badge.textContent = '✓ Completed successfully';
            } else {
                badge.className = 'status-failure';
                badge.textContent = '✗ Failed';
            }
            statusLine.appendChild(badge);
            statusDisplay.appendChild(statusLine);
            if (metadata.status !== 'completed' && metadata.error_message) {
                const errorLine = document.createElement('p');
                errorLine.textContent = `Error: ${metadata.error_message}`;
                statusDisplay.appendChild(errorLine);
            }
        } else {
            statusLine.textContent = 'Status information not available';
            statusDisplay.appendChild(statusLine);
        }
        // Get screenshots. A failed or malformed response is treated as an empty
        // list so an error payload is never handed to the image viewer.
        let screenshots = [];
        let screenshotsFailed = false;
        const screenshotsResponse = await fetch(`${runUrl}/screenshots?${pathQuery}`);
        if (screenshotsResponse.ok) {
            const payload = await screenshotsResponse.json();
            if (Array.isArray(payload)) {
                screenshots = payload;
            } else {
                screenshotsFailed = true;
                console.error('Unexpected screenshots response:', payload);
            }
        } else {
            screenshotsFailed = true;
            console.error('Error loading screenshots: HTTP', screenshotsResponse.status);
        }
        appState.loadedData.screenshots[exampleId] = appState.loadedData.screenshots[exampleId] || {};
        appState.loadedData.screenshots[exampleId][runId] = screenshots;
        // Load screenshots
        loadScreenshots(exampleId, runId);
        // Load agent trace
        renderAgentTrace(metadata);
        // Display raw JSON
        if (metadata) {
            rawJson.textContent = JSON.stringify(metadata, null, 2);
        } else {
            rawJson.textContent = "No metadata available";
        }
        // Show screenshots tab by default
        document.querySelector('.tab[data-tab="screenshots"]').click();
        loadStatusDisplay.textContent = screenshotsFailed
            ? 'Run data loaded (screenshots unavailable)'
            : 'Run data loaded successfully';
    } catch (err) {
        console.error('Error loading run data:', err);
        loadStatusDisplay.textContent = `Error: ${err.message}`;
        jsonError.textContent = `Error loading data: ${err.message}`;
        jsonError.classList.remove('hidden');
    } finally {
        jsonLoadingIndicator.classList.add('hidden');
        runSelect.disabled = false;
    }
}
// Load screenshots | |
/**
 * Point the image viewer at the stored screenshots of one run.
 * With no screenshots the viewer and its controls are hidden and the
 * "no screenshots" notice is shown; otherwise the slider range is set to the
 * list and the first screenshot is displayed.
 *
 * @param {string} exampleId - example whose run is being shown
 * @param {string} runId - run whose screenshots are being shown
 */
function loadScreenshots(exampleId, runId) {
    const shots = appState.loadedData.screenshots[exampleId]?.[runId] || [];
    appState.currentImages = shots;

    const isEmpty = shots.length === 0;
    // Viewer and controls are visible only when there is something to show;
    // the notice is visible only when there is not.
    noImages.classList.toggle('hidden', !isEmpty);
    imageContainer.classList.toggle('hidden', isEmpty);
    imageControls.classList.toggle('hidden', isEmpty);
    if (isEmpty) {
        return;
    }

    // Slider spans every screenshot index and starts at the first one
    imageSlider.min = 0;
    imageSlider.max = shots.length - 1;
    imageSlider.value = 0;

    appState.currentImageIndex = 0;
    updateImageDisplay();
}
// Update image display | |
/**
 * Refresh the image viewer from appState: the picture, its caption, the
 * "n / total" counter, the slider position and the enabled state of the
 * Previous/Next buttons. Does nothing when there are no images.
 */
function updateImageDisplay() {
    const { currentImages: shots, currentImageIndex: position } = appState;
    const total = shots.length;
    if (total === 0) {
        return;
    }

    const shot = shots[position];
    currentImage.src = shot.path;
    imageCaption.textContent = shot.name;
    imageCounter.textContent = `${position + 1} / ${total}`;
    imageSlider.value = position;

    // Previous is unavailable on the first image, Next on the last one
    prevImage.disabled = position === 0;
    nextImage.disabled = position === total - 1;
}
// Image navigation | |
// Previous button: step back one screenshot unless already on the first
prevImage.addEventListener('click', () => {
    if (appState.currentImageIndex <= 0) {
        return;
    }
    appState.currentImageIndex -= 1;
    updateImageDisplay();
});

// Next button: step forward one screenshot unless already on the last
nextImage.addEventListener('click', () => {
    const lastIndex = appState.currentImages.length - 1;
    if (appState.currentImageIndex >= lastIndex) {
        return;
    }
    appState.currentImageIndex += 1;
    updateImageDisplay();
});

// Slider: jump straight to the index the user dragged to
imageSlider.addEventListener('input', () => {
    const chosen = parseInt(imageSlider.value);
    appState.currentImageIndex = chosen;
    updateImageDisplay();
});
// Tab handling | |
// Tab handling: clicking a tab marks it as the only active tab and shows the
// panel whose id is "<data-tab>-tab", hiding every other panel.
for (const tab of document.querySelectorAll('.tab')) {
    tab.addEventListener('click', () => {
        // Exactly one tab carries the "active" class: the one that was clicked
        for (const other of document.querySelectorAll('.tab')) {
            other.classList.toggle('active', other === tab);
        }
        // Deactivate every panel, then activate the one matching this tab
        const panelId = `${tab.getAttribute('data-tab')}-tab`;
        for (const panel of document.querySelectorAll('.tab-content')) {
            panel.classList.remove('active');
        }
        document.getElementById(panelId).classList.add('active');
    });
}
// Render agent trace - UPDATED to show all sections expanded and remove duplicated task title | |
/**
 * Render the agent trace (metadata.summary) as a list of collapsible steps,
 * all expanded initially. Clicking a step header toggles its content.
 *
 * Step content is assembled from DOM nodes: labels are real <strong>
 * elements and every value is inserted as a text node, so labels render
 * bold and trace values are never parsed as HTML. Values that are not
 * strings (for example tool-call arguments delivered as objects) are
 * shown as indented JSON.
 *
 * @param {object|null} metadata - run metadata; `summary` is expected to be an
 *   array of step objects. Anything else renders the "no data" notice.
 */
function renderAgentTrace(metadata) {
    agentSteps.innerHTML = '';
    const steps = metadata ? metadata.summary : null;
    if (!Array.isArray(steps) || steps.length === 0) {
        agentSteps.innerHTML = '<p>No agent trace data available</p>';
        return;
    }

    // Human-readable text for any value found in the trace
    const asText = value => (typeof value === 'string' ? value : JSON.stringify(value, null, 2));

    // Append "<strong>label</strong>" followed by separator + value + terminator as
    // plain text. The container uses white-space: pre-wrap, so newlines are layout.
    const appendField = (parent, label, value, separator, terminator) => {
        const labelEl = document.createElement('strong');
        labelEl.textContent = label;
        parent.appendChild(labelEl);
        parent.appendChild(document.createTextNode(`${separator}${asText(value)}${terminator}`));
    };

    // Process each step
    steps.forEach((step, index) => {
        const stepDiv = document.createElement('div');
        stepDiv.className = 'step';

        // Create step header
        const headerDiv = document.createElement('div');
        headerDiv.className = 'step-header';
        let headerText = `Step ${index}`;
        if (index === 0 && step.task) {
            headerText = 'Task';
        } else if (step.model_output_message) {
            headerText = 'Planning';
        } else if (step.tool_calls) {
            headerText = `Action ${index}`;
        } else if (step.error) {
            headerText = 'Error';
        }
        const titleSpan = document.createElement('span');
        titleSpan.textContent = headerText;
        const indicatorSpan = document.createElement('span');
        indicatorSpan.textContent = '▲';
        headerDiv.appendChild(titleSpan);
        headerDiv.appendChild(indicatorSpan);
        stepDiv.appendChild(headerDiv);

        // Create step content, visible by default
        const contentDiv = document.createElement('div');
        contentDiv.className = 'step-content';
        contentDiv.style.display = 'block';

        // Task information: the header already says "Task", so show only the text
        if (index === 0 && step.task) {
            contentDiv.appendChild(document.createTextNode(`${asText(step.task)}\n\n`));
        }
        // Model output and planning
        if (step.model_output_message && step.model_output_message.content) {
            appendField(contentDiv, 'Model Output:', step.model_output_message.content, '\n', '\n\n');
            if (step.plan) {
                appendField(contentDiv, 'Plan:', step.plan, '\n', '\n\n');
            }
        }
        // Tool calls
        if (step.tool_calls && step.tool_calls.length > 0) {
            step.tool_calls.forEach(toolCall => {
                if (toolCall.function) {
                    appendField(contentDiv, 'Tool Call:', toolCall.function.name, ' ', '\n');
                    if (toolCall.function.arguments) {
                        appendField(contentDiv, 'Arguments:', toolCall.function.arguments, '\n', '\n\n');
                    }
                }
            });
        }
        // Model reasoning
        if (step.model_output) {
            appendField(contentDiv, 'Model Reasoning:', step.model_output, '\n', '\n\n');
        }
        // Observations
        if (step.observations) {
            appendField(contentDiv, 'Observations:', step.observations, '\n', '\n\n');
        }
        // Action output
        if (step.action_output) {
            appendField(contentDiv, 'Action Output:', step.action_output, '\n', '\n\n');
        }
        // Errors
        if (step.error) {
            appendField(contentDiv, 'Error Type:', step.error.type || 'Unknown', ' ', '\n');
            if (step.error.message) {
                appendField(contentDiv, 'Error Message:', step.error.message, ' ', '\n');
            }
        }
        if (!contentDiv.hasChildNodes()) {
            contentDiv.textContent = "No content available for this step";
        }
        stepDiv.appendChild(contentDiv);

        // Add click handler to toggle content
        headerDiv.addEventListener('click', () => {
            const isHidden = contentDiv.style.display === 'none';
            contentDiv.style.display = isHidden ? 'block' : 'none';
            indicatorSpan.textContent = isHidden ? '▲' : '▼';
        });
        agentSteps.appendChild(stepDiv);
    });
}
// Handle keyboard navigation for images | |
// Keyboard navigation for screenshots: Left/Right arrows step through the
// images while the Screenshots tab is active.
document.addEventListener('keydown', (e) => {
    if (!appState.currentImages || appState.currentImages.length === 0) return;
    // Leave arrow keys alone when a form control has focus: the range slider
    // already moves itself on arrow keys (handling them here as well would
    // advance two images per key press), and text inputs and selects need the
    // arrows for caret movement and option selection.
    const target = e.target;
    if (target instanceof Element && target.closest('input, select, textarea, [contenteditable]')) return;
    // Modified arrows (e.g. Alt+Left for browser history) are not ours either
    if (e.altKey || e.ctrlKey || e.metaKey) return;
    // Check if the screenshots tab is active
    const screenshotsTab = document.getElementById('screenshots-tab');
    if (!screenshotsTab.classList.contains('active')) return;
    if (e.key === 'ArrowLeft' && appState.currentImageIndex > 0) {
        appState.currentImageIndex--;
        updateImageDisplay();
    } else if (e.key === 'ArrowRight' && appState.currentImageIndex < appState.currentImages.length - 1) {
        appState.currentImageIndex++;
        updateImageDisplay();
    }
});
// Load evaluations on page load
document.addEventListener('DOMContentLoaded', loadEvaluations);
</script> | |
</body> | |
</html> |