lvwerra HF Staff committed on
Commit
e781603
·
1 Parent(s): c3fdac2
Files changed (2) hide show
  1. show_eval.py +199 -0
  2. templates/viewer.html +753 -0
show_eval.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import glob
4
+ import traceback
5
+ from flask import Flask, render_template, jsonify, send_file, request
6
+ from flask_cors import CORS
7
+
8
+ app = Flask(__name__)
9
+ CORS(app) # Enable CORS for all routes
10
+
11
+ # Serve the HTML viewer
12
@app.route('/')
def index():
    """Render the single-page evaluation viewer from ``viewer.html``."""
    viewer_page = render_template('viewer.html')
    return viewer_page
15
+
16
+ # Get list of available evaluations
17
@app.route('/api/evals')
def list_evals():
    """List the evaluation directories found under the base directory.

    Query parameters:
        path: Base directory to scan. A missing *or empty* value falls back
            to ``./eval_results``.

    Returns:
        A JSON array with the names (sorted) of the sub-directories whose
        name starts with ``eval_``, or a JSON error object with status 404
        when the base directory is not an existing directory.
    """
    # The viewer always sends ``?path=`` (empty by default). ``args.get`` only
    # applies its default when the key is absent, so an empty string has to be
    # mapped to the default directory explicitly.
    base_dir = request.args.get('path') or './eval_results'

    # isdir (not exists): a path pointing at a regular file would otherwise
    # make os.listdir raise and surface as a 500.
    if not os.path.isdir(base_dir):
        return jsonify({"error": f"Path {base_dir} does not exist"}), 404

    # Sorted so the response does not depend on the file system's listing order.
    eval_dirs = sorted(
        item for item in os.listdir(base_dir)
        if item.startswith('eval_') and os.path.isdir(os.path.join(base_dir, item))
    )

    return jsonify(eval_dirs)
30
+
31
+ # Get examples for an evaluation
32
@app.route('/api/eval/<eval_id>/examples')
def get_examples(eval_id):
    """Return a mapping of example id -> task description for an evaluation.

    The mapping is read from ``examples.json`` inside the evaluation
    directory when that file exists and holds a non-empty JSON object.
    Otherwise the ``example_*`` sub-directories are scanned and the task is
    taken from the first run of each example.

    Query parameters:
        path: Base directory. A missing or empty value falls back to
            ``./eval_results``.

    Returns:
        A JSON object ``{example_id: task}``, or a JSON error object with
        status 404 when the evaluation directory does not exist.
    """
    # An empty ``?path=`` (what the viewer sends by default) must select the
    # default directory; ``args.get(key, default)`` would return ''.
    base_dir = request.args.get('path') or './eval_results'
    eval_path = os.path.join(base_dir, eval_id)

    # Without this guard os.listdir below raises and the client gets a 500.
    if not os.path.isdir(eval_path):
        return jsonify({"error": f"Evaluation directory not found: {eval_path}"}), 404

    # Try to read examples.json
    examples_json_path = os.path.join(eval_path, 'examples.json')
    examples = {}

    if os.path.exists(examples_json_path):
        try:
            with open(examples_json_path, 'r') as f:
                examples = json.load(f)
        except (OSError, ValueError):
            # json.JSONDecodeError is a ValueError; unreadable files are
            # treated the same way and trigger the directory scan below.
            app.logger.error(f"Error parsing examples.json at {examples_json_path}")
        if not isinstance(examples, dict):
            # The scan below assigns by key, so only a JSON object is usable.
            examples = {}

    # If examples.json doesn't exist or is empty, scan for example directories
    if not examples:
        prefix = 'example_'
        for item in sorted(os.listdir(eval_path)):
            example_dir = os.path.join(eval_path, item)
            if not (item.startswith(prefix) and os.path.isdir(example_dir)):
                continue

            # Strip only the leading prefix: the runs endpoint rebuilds the
            # directory name as f'example_{example_id}', so removing later
            # occurrences of 'example_' would produce an id that cannot be
            # resolved again.
            example_id = item[len(prefix):]

            run_dir = _first_run_dir(example_dir)
            if run_dir is None:
                # No runs recorded for this example: nothing to show.
                continue

            examples[example_id] = _task_from_run(run_dir, f"Task for {example_id}")

    return jsonify(examples)


def _first_run_dir(example_dir):
    """Return the path of the first ``run_*`` directory (by name), or None."""
    run_names = sorted(
        name for name in os.listdir(example_dir)
        if name.startswith('run_') and os.path.isdir(os.path.join(example_dir, name))
    )
    if not run_names:
        return None
    return os.path.join(example_dir, run_names[0])


def _task_from_run(run_dir, fallback):
    """Return the task text recorded in a run directory, or *fallback*."""
    task_path = os.path.join(run_dir, 'task.txt')
    if os.path.exists(task_path):
        with open(task_path, 'r') as f:
            return f.read().strip()

    # If no task.txt, try reading the task from metadata.json (summary[0].task).
    metadata_path = os.path.join(run_dir, 'metadata.json')
    try:
        with open(metadata_path, 'r') as f:
            metadata = json.load(f)
        return metadata['summary'][0]['task']
    except (OSError, ValueError, LookupError, TypeError):
        # Missing/unreadable file, invalid JSON, or no task entry: use the
        # placeholder so the example still appears in the list.
        return fallback
84
+
85
+ # Get runs for an example
86
@app.route('/api/eval/<eval_id>/example/<example_id>/runs')
def get_runs(eval_id, example_id):
    """List the runs of one example together with their recorded status.

    Query parameters:
        path: Base directory. A missing or empty value falls back to
            ``./eval_results``.

    Returns:
        A JSON array of ``{"id": <run directory name>, "status": <status>}``
        objects, one per ``run_*`` sub-directory. ``status`` is read from the
        run's ``metadata.json`` and is ``'unknown'`` when it cannot be read.
        A JSON error object with status 404 is returned when the example
        directory does not exist.
    """
    # Empty ``?path=`` must behave like an absent parameter.
    base_dir = request.args.get('path') or './eval_results'
    example_dir = os.path.join(base_dir, eval_id, f'example_{example_id}')

    # isdir rather than exists, so a stray file does not crash os.listdir.
    if not os.path.isdir(example_dir):
        return jsonify({"error": f"Example directory not found: {example_dir}"}), 404

    runs = []
    for item in os.listdir(example_dir):
        item_path = os.path.join(example_dir, item)
        if not (os.path.isdir(item_path) and item.startswith('run_')):
            continue

        run_id = item

        # Try to get status from metadata.json
        metadata_path = os.path.join(item_path, 'metadata.json')
        status = 'unknown'

        if os.path.exists(metadata_path):
            try:
                with open(metadata_path, 'r') as f:
                    metadata = json.load(f)
                status = metadata.get('status', 'unknown')
            except Exception as e:
                # Best effort: a broken metadata file must not hide the run.
                app.logger.error(f"Error reading metadata.json for {run_id}: {str(e)}")

        runs.append({'id': run_id, 'status': status})

    # Log the final list once instead of once per discovered run.
    app.logger.info(f"runs: {runs}")

    return jsonify(runs)
116
+
117
+ # Get metadata for a run
118
@app.route('/api/eval/<eval_id>/example/<example_id>/run/<run_id>/metadata')
def get_metadata(eval_id, example_id, run_id):
    """Return the parsed ``metadata.json`` of a single run.

    Query parameters:
        path: Base directory. A missing or empty value falls back to
            ``./eval_results``.

    Returns:
        The metadata as JSON on success. Otherwise a JSON error object with
        status 404 (file missing or empty), 400 (invalid JSON) or 500 (any
        other failure while reading the file).
    """
    # Empty ``?path=`` must behave like an absent parameter.
    base_dir = request.args.get('path') or './eval_results'
    run_dir = os.path.join(base_dir, eval_id, f'example_{example_id}', run_id)
    metadata_path = os.path.join(run_dir, 'metadata.json')
    app.logger.info(f"metadata: {metadata_path}")

    if not os.path.exists(metadata_path):
        return jsonify({"error": "Metadata not found", "path": metadata_path}), 404

    try:
        with open(metadata_path, 'r') as f:
            metadata_content = f.read()

        # An empty file is reported separately from malformed JSON.
        if not metadata_content.strip():
            return jsonify({"error": "Metadata file is empty"}), 404

        metadata = json.loads(metadata_content)
    except json.JSONDecodeError as e:
        error_info = {
            "error": "Invalid JSON in metadata file",
            "details": str(e),
            "path": metadata_path
        }
        app.logger.error(f"JSON error in {metadata_path}: {str(e)}")
        return jsonify(error_info), 400
    except Exception as e:
        error_info = {
            "error": "Error reading metadata file",
            "details": str(e),
            "traceback": traceback.format_exc(),
            "path": metadata_path
        }
        # logger.exception keeps the traceback in the server log as well.
        app.logger.exception(f"Error reading {metadata_path}: {str(e)}")
        return jsonify(error_info), 500

    return jsonify(metadata)
153
+
154
+ # Get screenshots for a run
155
@app.route('/api/eval/<eval_id>/example/<example_id>/run/<run_id>/screenshots')
def get_screenshots(eval_id, example_id, run_id):
    """List the screenshot images stored in a run directory.

    Query parameters:
        path: Base directory. A missing or empty value falls back to
            ``./eval_results``.

    Returns:
        A JSON array, sorted by file name, of ``{"name": <file name>,
        "path": <URL of the image endpoint for that file>}`` objects covering
        the ``*.png``, ``*.jpg`` and ``*.jpeg`` files in the run directory.
        A JSON error object with status 404 is returned when the run
        directory does not exist.
    """
    # Imported locally so this handler does not depend on a module-level
    # import being added elsewhere.
    from urllib.parse import quote

    # Empty ``?path=`` must behave like an absent parameter.
    base_dir = request.args.get('path') or './eval_results'
    run_dir = os.path.join(base_dir, eval_id, f'example_{example_id}', run_id)

    if not os.path.isdir(run_dir):
        return jsonify({"error": f"Run directory not found: {run_dir}"}), 404

    screenshots = []
    for ext in ['png', 'jpg', 'jpeg']:
        pattern = os.path.join(run_dir, f'*.{ext}')
        for file_path in glob.glob(pattern):
            screenshots.append({
                'name': os.path.basename(file_path),
                # Percent-encode the path: characters such as '&', '#', '+'
                # or spaces would otherwise truncate or alter the query
                # value seen by the image endpoint.
                'path': f"/api/image?path={quote(file_path, safe='')}"
            })

    # Sort by filename
    screenshots.sort(key=lambda shot: shot['name'])

    app.logger.info(f"screenshots: {screenshots}")

    return jsonify(screenshots)
179
+
180
+ # Serve an image file
181
@app.route('/api/image')
def get_image():
    """Serve one screenshot file identified by the ``path`` query parameter.

    Only regular files with a ``.png``, ``.jpg`` or ``.jpeg`` extension are
    served; these are the extensions the screenshots listing produces.

    Returns:
        The file contents on success. Otherwise a JSON error object with
        status 400 (no path given), 403 (not an image file), 404 (file not
        found) or 500 (failure while sending the file).
    """
    path = request.args.get('path')
    if not path:
        return jsonify({"error": "No path provided"}), 400

    # SECURITY: the path comes straight from the query string. Refuse
    # anything that is not an image so this endpoint cannot be used to read
    # arbitrary files (source code, keys, ...) from the host.
    allowed_extensions = ('.png', '.jpg', '.jpeg')
    if os.path.splitext(path)[1].lower() not in allowed_extensions:
        return jsonify({"error": "Only image files can be served"}), 403

    # Resolve against the current working directory once, so the file that
    # is checked here is the same file that send_file opens.
    # NOTE(review): send_file may resolve relative paths against the
    # application root instead of the cwd - confirm for the Flask version in use.
    absolute_path = os.path.abspath(path)

    if not os.path.isfile(absolute_path):
        return jsonify({"error": f"Image not found at path: {path}"}), 404

    try:
        return send_file(absolute_path)
    except Exception as e:
        return jsonify({"error": f"Error serving image: {str(e)}"}), 500
194
+
195
if __name__ == '__main__':
    # Startup banner, printed before the development server takes over.
    for banner_line in (
        "Evaluation Server is running at http://localhost:8000",
        "Press Ctrl+C to stop the server",
    ):
        print(banner_line)

    app.run(debug=True, port=8000)
templates/viewer.html ADDED
@@ -0,0 +1,753 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Computer Agent Evaluation Viewer</title>
7
+ <style>
8
+ /* CSS styles here */
9
+ body {
10
+ font-family: Arial, sans-serif;
11
+ margin: 0;
12
+ padding: 20px;
13
+ background-color: #f5f5f5;
14
+ }
15
+ .container {
16
+ max-width: 1200px;
17
+ margin: 0 auto;
18
+ background-color: #fff;
19
+ padding: 20px;
20
+ border-radius: 8px;
21
+ box-shadow: 0 2px 10px rgba(0,0,0,0.1);
22
+ }
23
+ h1, h2, h3 {
24
+ color: #333;
25
+ }
26
+ select, input, button {
27
+ padding: 8px 12px;
28
+ margin: 5px 0;
29
+ border: 1px solid #ddd;
30
+ border-radius: 4px;
31
+ }
32
+ button {
33
+ background-color: #4a6cf7;
34
+ color: white;
35
+ cursor: pointer;
36
+ border: none;
37
+ }
38
+ button:hover {
39
+ background-color: #3a5ce5;
40
+ }
41
+ button:disabled {
42
+ background-color: #cccccc;
43
+ cursor: not-allowed;
44
+ }
45
+ .row {
46
+ display: flex;
47
+ margin-bottom: 20px;
48
+ }
49
+ .col {
50
+ flex: 1;
51
+ padding: 0 10px;
52
+ }
53
+ .image-viewer {
54
+ width: 100%;
55
+ max-height: 500px;
56
+ border: 1px solid #ddd;
57
+ border-radius: 4px;
58
+ overflow: hidden;
59
+ margin-bottom: 10px;
60
+ position: relative;
61
+ }
62
+ .image-viewer img {
63
+ max-width: 100%;
64
+ max-height: 450px;
65
+ display: block;
66
+ margin: 0 auto;
67
+ }
68
+ .image-controls {
69
+ display: flex;
70
+ justify-content: space-between;
71
+ align-items: center;
72
+ margin-top: 10px;
73
+ }
74
+ .nav-buttons {
75
+ display: flex;
76
+ gap: 10px;
77
+ }
78
+ .step {
79
+ border: 1px solid #ddd;
80
+ border-radius: 4px;
81
+ margin-bottom: 10px;
82
+ overflow: hidden;
83
+ }
84
+ .step-header {
85
+ background-color: #f0f0f0;
86
+ padding: 10px;
87
+ font-weight: bold;
88
+ cursor: pointer;
89
+ display: flex;
90
+ justify-content: space-between;
91
+ }
92
+ .step-content {
93
+ padding: 15px;
94
+ white-space: pre-wrap;
95
+ font-family: monospace;
96
+ background-color: #f9f9f9;
97
+ max-height: 300px;
98
+ overflow-y: auto;
99
+ }
100
+ .hidden {
101
+ display: none;
102
+ }
103
+ .status-success {
104
+ color: #22c55e;
105
+ font-weight: bold;
106
+ }
107
+ .status-failure {
108
+ color: #ef4444;
109
+ font-weight: bold;
110
+ }
111
+ .tabs {
112
+ display: flex;
113
+ border-bottom: 1px solid #ddd;
114
+ margin-bottom: 20px;
115
+ }
116
+ .tab {
117
+ padding: 10px 20px;
118
+ cursor: pointer;
119
+ border-bottom: 2px solid transparent;
120
+ }
121
+ .tab.active {
122
+ border-bottom-color: #4a6cf7;
123
+ font-weight: bold;
124
+ }
125
+ .tab-content {
126
+ display: none;
127
+ }
128
+ .tab-content.active {
129
+ display: block;
130
+ }
131
+ pre {
132
+ background-color: #f0f0f0;
133
+ padding: 10px;
134
+ border-radius: 4px;
135
+ overflow-x: auto;
136
+ white-space: pre-wrap;
137
+ }
138
+ .error-message {
139
+ background-color: #fee2e2;
140
+ color: #b91c1c;
141
+ padding: 10px;
142
+ border-radius: 4px;
143
+ margin: 10px 0;
144
+ }
145
+ .loading {
146
+ display: inline-block;
147
+ width: 20px;
148
+ height: 20px;
149
+ border: 2px solid #f3f3f3;
150
+ border-top: 2px solid #3498db;
151
+ border-radius: 50%;
152
+ animation: spin 1s linear infinite;
153
+ margin-left: 10px;
154
+ }
155
+ @keyframes spin {
156
+ 0% { transform: rotate(0deg); }
157
+ 100% { transform: rotate(360deg); }
158
+ }
159
+ </style>
160
+ </head>
161
+ <body>
162
+ <div class="container">
163
+ <h1>Computer Agent Evaluation Viewer</h1>
164
+
165
+ <!-- Path and Eval Selection -->
166
+ <div style="margin-bottom: 20px; padding: 15px; background-color: #f0f0f0; border-radius: 8px;">
167
+ <h2>Load Evaluation Data</h2>
168
+ <div style="display: flex; gap: 10px; margin-top: 10px;">
169
+ <input type="text" id="base-path" placeholder="Base directory path (leave empty for default)"
170
+ style="flex-grow: 1; padding: 8px; border: 1px solid #ddd; border-radius: 4px;">
171
+ <button id="refresh-evals-btn">Refresh</button>
172
+ </div>
173
+ <div style="margin-top: 10px;">
174
+ <label for="eval-select">Select Evaluation:</label>
175
+ <select id="eval-select" style="min-width: 300px;"></select>
176
+ </div>
177
+ <div id="load-status" style="margin-top: 10px; font-style: italic;"></div>
178
+ </div>
179
+
180
+ <!-- Example and Run Selectors -->
181
+ <div class="row">
182
+ <div class="col">
183
+ <label for="example-select">Select Example:</label>
184
+ <select id="example-select">
185
+ <option value="">-- Select Example --</option>
186
+ </select>
187
+ </div>
188
+ <div class="col">
189
+ <label for="run-select">Select Run:</label>
190
+ <select id="run-select" disabled>
191
+ <option value="">-- Select Run --</option>
192
+ </select>
193
+ </div>
194
+ </div>
195
+
196
+ <!-- Task & Status Display -->
197
+ <div id="run-details" class="hidden">
198
+ <div>
199
+ <h2>Task</h2>
200
+ <pre id="task-text"></pre>
201
+ </div>
202
+
203
+ <div>
204
+ <h2>Run Status</h2>
205
+ <div id="status-display"></div>
206
+ </div>
207
+
208
+ <!-- Tabs -->
209
+ <div class="tabs">
210
+ <div class="tab active" data-tab="screenshots">Screenshots</div>
211
+ <div class="tab" data-tab="agent-trace">Agent Trace</div>
212
+ <div class="tab" data-tab="raw-json">Raw JSON</div>
213
+ </div>
214
+
215
+ <!-- Screenshots Tab -->
216
+ <div id="screenshots-tab" class="tab-content active">
217
+ <div id="no-images" class="hidden">
218
+ <p>No screenshots available for this run.</p>
219
+ </div>
220
+ <div id="image-container" class="image-viewer hidden">
221
+ <img id="current-image" src="" alt="Screenshot">
222
+ <p id="image-caption" class="text-center"></p>
223
+ </div>
224
+ <div class="image-controls hidden" id="image-controls">
225
+ <div class="nav-buttons">
226
+ <button id="prev-image">Previous</button>
227
+ <span id="image-counter">0 / 0</span>
228
+ <button id="next-image">Next</button>
229
+ </div>
230
+ <input type="range" id="image-slider" min="0" max="0" value="0" style="width: 100%">
231
+ </div>
232
+ </div>
233
+
234
+ <!-- Agent Trace Tab -->
235
+ <div id="agent-trace-tab" class="tab-content">
236
+ <div id="agent-steps"></div>
237
+ </div>
238
+
239
+ <!-- Raw JSON Tab -->
240
+ <div id="raw-json-tab" class="tab-content">
241
+ <div id="json-loading-indicator" class="hidden">
242
+ <p>Loading metadata... <span class="loading"></span></p>
243
+ </div>
244
+ <div id="json-error" class="error-message hidden"></div>
245
+ <pre id="raw-json"></pre>
246
+ </div>
247
+ </div>
248
+ </div>
249
+
250
+ <script>
251
+ // Application state
252
+ const appState = {
253
+ basePath: '',
254
+ evalId: null,
255
+ currentExampleId: null,
256
+ currentRunId: null,
257
+ currentImages: [],
258
+ currentImageIndex: 0,
259
+ loadedData: {
260
+ examples: {},
261
+ runs: {},
262
+ metadata: {},
263
+ screenshots: {}
264
+ }
265
+ };
266
+
267
+ // DOM elements
268
+ const basePathInput = document.getElementById('base-path');
269
+ const refreshEvalsBtn = document.getElementById('refresh-evals-btn');
270
+ const evalSelect = document.getElementById('eval-select');
271
+ const loadStatusDisplay = document.getElementById('load-status');
272
+ const exampleSelect = document.getElementById('example-select');
273
+ const runSelect = document.getElementById('run-select');
274
+ const runDetails = document.getElementById('run-details');
275
+ const taskText = document.getElementById('task-text');
276
+ const statusDisplay = document.getElementById('status-display');
277
+ const imageContainer = document.getElementById('image-container');
278
+ const noImages = document.getElementById('no-images');
279
+ const imageControls = document.getElementById('image-controls');
280
+ const currentImage = document.getElementById('current-image');
281
+ const imageCaption = document.getElementById('image-caption');
282
+ const imageCounter = document.getElementById('image-counter');
283
+ const imageSlider = document.getElementById('image-slider');
284
+ const prevImage = document.getElementById('prev-image');
285
+ const nextImage = document.getElementById('next-image');
286
+ const agentSteps = document.getElementById('agent-steps');
287
+ const rawJson = document.getElementById('raw-json');
288
+ const jsonLoadingIndicator = document.getElementById('json-loading-indicator');
289
+ const jsonError = document.getElementById('json-error');
290
+
291
+ // Initialize by loading available evaluations
292
+ refreshEvalsBtn.addEventListener('click', loadEvaluations);
293
+
294
+ // Load evaluations from server
295
// Fetch the list of evaluations for the entered base path, fill the
// evaluation dropdown (newest name first) and auto-select the first entry.
async function loadEvaluations() {
    appState.basePath = basePathInput.value.trim();
    loadStatusDisplay.textContent = 'Loading evaluations...';
    refreshEvalsBtn.disabled = true;

    try {
        // Leave the parameter out entirely when no path was entered: an
        // empty "path=" value would be used as-is by the server instead of
        // its default directory.
        const pathQuery = appState.basePath
            ? `?path=${encodeURIComponent(appState.basePath)}`
            : '';
        const response = await fetch(`/api/evals${pathQuery}`);
        if (!response.ok) {
            // The error body may not be JSON (e.g. an HTML error page).
            const errorData = await response.json().catch(() => ({}));
            throw new Error(errorData.error || 'Failed to load evaluations');
        }

        const evals = await response.json();

        // Sort before filling the dropdown so the displayed order matches
        // the order used for the auto-selection below (latest name first).
        evals.sort().reverse();

        // Clear existing options
        evalSelect.innerHTML = '<option value="">-- Select Evaluation --</option>';

        // Add new options
        evals.forEach(evalId => {
            const option = document.createElement('option');
            option.value = evalId;
            option.textContent = evalId;
            evalSelect.appendChild(option);
        });

        loadStatusDisplay.textContent = `Loaded ${evals.length} evaluations`;

        // Auto-select the latest evaluation and load its examples.
        if (evals.length > 0) {
            evalSelect.value = evals[0];
            evalSelect.dispatchEvent(new Event('change'));
        }
    } catch (err) {
        console.error('Error loading evaluations:', err);
        loadStatusDisplay.textContent = `Error: ${err.message}`;
    } finally {
        refreshEvalsBtn.disabled = false;
    }
}
337
+
338
+ // Handle evaluation selection
339
// When an evaluation is chosen, load its examples into the example
// dropdown and auto-select the first one.
evalSelect.addEventListener('change', async () => {
    appState.evalId = evalSelect.value;

    if (!appState.evalId) {
        exampleSelect.innerHTML = '<option value="">-- Select Example --</option>';
        exampleSelect.disabled = true;
        runSelect.innerHTML = '<option value="">-- Select Run --</option>';
        runSelect.disabled = true;
        runDetails.classList.add('hidden');
        return;
    }

    try {
        loadStatusDisplay.textContent = 'Loading examples...';
        evalSelect.disabled = true;

        // Omit an empty path so the server uses its default directory, and
        // encode the id because it is placed inside the URL path.
        const pathQuery = appState.basePath
            ? `?path=${encodeURIComponent(appState.basePath)}`
            : '';
        const response = await fetch(`/api/eval/${encodeURIComponent(appState.evalId)}/examples${pathQuery}`);
        if (!response.ok) {
            // The error body may not be JSON (e.g. an HTML error page).
            const errorData = await response.json().catch(() => ({}));
            throw new Error(errorData.error || 'Failed to load examples');
        }

        const examples = await response.json();
        appState.loadedData.examples = examples;
        const exampleIds = Object.keys(examples);

        // Update example dropdown
        exampleSelect.innerHTML = '<option value="">-- Select Example --</option>';

        for (const [exampleId, task] of Object.entries(examples)) {
            const option = document.createElement('option');
            option.value = exampleId;
            option.textContent = exampleId;
            option.title = task; // Show task as tooltip
            exampleSelect.appendChild(option);
        }

        exampleSelect.disabled = false;
        runSelect.innerHTML = '<option value="">-- Select Run --</option>';
        runSelect.disabled = true;
        runDetails.classList.add('hidden');

        loadStatusDisplay.textContent = `Loaded ${exampleIds.length} examples`;

        // Auto-select the first example and load its runs.
        if (exampleIds.length > 0) {
            exampleSelect.value = exampleIds[0];
            exampleSelect.dispatchEvent(new Event('change'));
        }
    } catch (err) {
        console.error('Error loading examples:', err);
        loadStatusDisplay.textContent = `Error: ${err.message}`;
    } finally {
        evalSelect.disabled = false;
    }
});
396
+
397
+ // Example selection
398
// When an example is chosen, load its runs into the run dropdown (sorted by
// id) and auto-select the first one.
exampleSelect.addEventListener('change', async () => {
    appState.currentExampleId = exampleSelect.value;

    // Reset run selection
    runSelect.innerHTML = '<option value="">-- Select Run --</option>';

    if (!appState.currentExampleId) {
        runSelect.disabled = true;
        runDetails.classList.add('hidden');
        return;
    }

    try {
        loadStatusDisplay.textContent = 'Loading runs...';
        exampleSelect.disabled = true;

        // Omit an empty path so the server uses its default directory, and
        // encode the ids because they are placed inside the URL path.
        const pathQuery = appState.basePath
            ? `?path=${encodeURIComponent(appState.basePath)}`
            : '';
        const runsUrl = `/api/eval/${encodeURIComponent(appState.evalId)}`
            + `/example/${encodeURIComponent(appState.currentExampleId)}/runs${pathQuery}`;
        const response = await fetch(runsUrl);
        if (!response.ok) {
            // The error body may not be JSON (e.g. an HTML error page).
            const errorData = await response.json().catch(() => ({}));
            throw new Error(errorData.error || 'Failed to load runs');
        }

        const runs = await response.json();
        appState.loadedData.runs[appState.currentExampleId] = runs;

        // Sort runs by id; numeric-aware so "run_10" comes after "run_2".
        runs.sort((a, b) => a.id.localeCompare(b.id, undefined, {numeric: true}));

        // Update run dropdown with sorted runs
        runSelect.innerHTML = '<option value="">-- Select Run --</option>';
        runs.forEach(run => {
            const option = document.createElement('option');
            option.value = run.id;
            option.textContent = `${run.id} (${run.status})`;
            option.dataset.status = run.status;
            runSelect.appendChild(option);
        });

        runSelect.disabled = false;
        runDetails.classList.add('hidden');

        loadStatusDisplay.textContent = `Loaded ${runs.length} runs`;

        // Auto-select the first run and load its data.
        if (runs.length > 0) {
            runSelect.value = runs[0].id;
            runSelect.dispatchEvent(new Event('change'));
        }
    } catch (err) {
        console.error('Error loading runs:', err);
        loadStatusDisplay.textContent = `Error: ${err.message}`;
    } finally {
        exampleSelect.disabled = false;
    }
});
454
+
455
+ // Run selection
456
// Show the details of the chosen run, or hide the details panel when no
// run (or no example) is selected.
runSelect.addEventListener('change', () => {
    const selectedRunId = runSelect.value;
    appState.currentRunId = selectedRunId;

    const nothingToShow = !(selectedRunId && appState.currentExampleId);
    if (nothingToShow) {
        runDetails.classList.add('hidden');
        return;
    }

    loadRunData(appState.currentExampleId, selectedRunId);
    runDetails.classList.remove('hidden');
});
466
+
467
+ // Load run data
468
// Load metadata and screenshots for one run, cache them in appState and
// refresh the task, status, screenshots, agent trace and raw JSON views.
async function loadRunData(exampleId, runId) {
    loadStatusDisplay.textContent = 'Loading run data...';
    runSelect.disabled = true;
    jsonLoadingIndicator.classList.remove('hidden');
    jsonError.classList.add('hidden');

    // Omit an empty path so the server uses its default directory, and
    // encode the ids because they are placed inside the URL path.
    const pathQuery = appState.basePath
        ? `?path=${encodeURIComponent(appState.basePath)}`
        : '';
    const runUrl = `/api/eval/${encodeURIComponent(appState.evalId)}`
        + `/example/${encodeURIComponent(exampleId)}`
        + `/run/${encodeURIComponent(runId)}`;

    try {
        // Get metadata
        const metadataResponse = await fetch(`${runUrl}/metadata${pathQuery}`);
        let metadata = null;

        if (metadataResponse.ok) {
            metadata = await metadataResponse.json();
        } else {
            // The error body may not be JSON (e.g. an HTML error page).
            const errorData = await metadataResponse.json().catch(() => ({}));
            console.error('Error loading metadata:', errorData);
            jsonError.textContent = `Error loading metadata: ${errorData.error || 'Unknown error'}`;
            jsonError.classList.remove('hidden');
        }

        appState.loadedData.metadata[exampleId] = appState.loadedData.metadata[exampleId] || {};
        appState.loadedData.metadata[exampleId][runId] = metadata;

        // Display task
        const task = appState.loadedData.examples[exampleId];
        taskText.textContent = task || "No task available";

        // Display status. The lines are built as DOM nodes with textContent
        // so that text coming from metadata (error_message) can never be
        // interpreted as markup.
        const addStatusLine = (text, badgeClass) => {
            const line = document.createElement('p');
            if (badgeClass) {
                const badge = document.createElement('span');
                badge.className = badgeClass;
                badge.textContent = text;
                line.appendChild(badge);
            } else {
                line.textContent = text;
            }
            statusDisplay.appendChild(line);
        };

        statusDisplay.textContent = '';
        if (!metadata) {
            addStatusLine('Status information not available');
        } else if (metadata.status === 'completed') {
            addStatusLine('✓ Completed successfully', 'status-success');
        } else {
            addStatusLine('✗ Failed', 'status-failure');
            if (metadata.error_message) {
                addStatusLine(`Error: ${metadata.error_message}`);
            }
        }

        // Get screenshots. A failed request yields an error object, not a
        // list, so anything that is not an array is treated as "no
        // screenshots".
        const screenshotsResponse = await fetch(`${runUrl}/screenshots${pathQuery}`);
        const screenshotsData = screenshotsResponse.ok ? await screenshotsResponse.json() : [];
        const screenshots = Array.isArray(screenshotsData) ? screenshotsData : [];

        appState.loadedData.screenshots[exampleId] = appState.loadedData.screenshots[exampleId] || {};
        appState.loadedData.screenshots[exampleId][runId] = screenshots;

        // Load screenshots
        loadScreenshots(exampleId, runId);

        // Load agent trace
        renderAgentTrace(metadata);

        // Display raw JSON
        rawJson.textContent = metadata
            ? JSON.stringify(metadata, null, 2)
            : "No metadata available";

        // Show screenshots tab by default
        document.querySelector('.tab[data-tab="screenshots"]').click();

        loadStatusDisplay.textContent = 'Run data loaded successfully';
    } catch (err) {
        console.error('Error loading run data:', err);
        loadStatusDisplay.textContent = `Error: ${err.message}`;
        jsonError.textContent = `Error loading data: ${err.message}`;
        jsonError.classList.remove('hidden');
    } finally {
        jsonLoadingIndicator.classList.add('hidden');
        runSelect.disabled = false;
    }
}
548
+
549
+ // Load screenshots
550
// Show the cached screenshots of a run in the image viewer, starting at the
// first image, or show the "no screenshots" notice when there are none.
function loadScreenshots(exampleId, runId) {
    const cached = appState.loadedData.screenshots[exampleId]?.[runId];
    // The cache can hold a server error object instead of a list when the
    // screenshots request failed; treat anything but an array as empty so
    // the viewer does not try to index into it.
    appState.currentImages = Array.isArray(cached) ? cached : [];

    if (appState.currentImages.length === 0) {
        imageContainer.classList.add('hidden');
        imageControls.classList.add('hidden');
        noImages.classList.remove('hidden');
        return;
    }

    // Setup image viewer
    noImages.classList.add('hidden');
    imageContainer.classList.remove('hidden');
    imageControls.classList.remove('hidden');

    // Configure slider
    imageSlider.min = 0;
    imageSlider.max = appState.currentImages.length - 1;
    imageSlider.value = 0;

    // Reset to first image
    appState.currentImageIndex = 0;
    updateImageDisplay();
}
574
+
575
+ // Update image display
576
// Sync the image element, caption, counter, slider and navigation buttons
// with appState.currentImageIndex.
function updateImageDisplay() {
    const images = appState.currentImages;
    if (images.length === 0) return;

    const position = appState.currentImageIndex;
    const lastPosition = images.length - 1;
    const shown = images[position];

    currentImage.src = shown.path;
    imageCaption.textContent = shown.name;
    imageCounter.textContent = `${position + 1} / ${images.length}`;
    imageSlider.value = position;

    // Disable the button that would step past either end.
    prevImage.disabled = position === 0;
    nextImage.disabled = position === lastPosition;
}
589
+
590
+ // Image navigation
591
// Previous button: step back unless already at the first image.
prevImage.addEventListener('click', () => {
    if (appState.currentImageIndex <= 0) return;
    appState.currentImageIndex -= 1;
    updateImageDisplay();
});

// Next button: step forward unless already at the last image.
nextImage.addEventListener('click', () => {
    const lastIndex = appState.currentImages.length - 1;
    if (appState.currentImageIndex >= lastIndex) return;
    appState.currentImageIndex += 1;
    updateImageDisplay();
});

// Slider: jump directly to the chosen position.
imageSlider.addEventListener('input', () => {
    const chosenIndex = parseInt(imageSlider.value);
    appState.currentImageIndex = chosenIndex;
    updateImageDisplay();
});
609
+
610
+ // Tab handling
611
// Tab switching: clicking a tab marks it active and shows the content panel
// whose id is "<data-tab>-tab", hiding all other panels.
for (const tab of document.querySelectorAll('.tab')) {
    tab.addEventListener('click', () => {
        // Move the active marker to the clicked tab.
        for (const otherTab of document.querySelectorAll('.tab')) {
            otherTab.classList.remove('active');
        }
        tab.classList.add('active');

        // Swap the visible content panel.
        const tabId = tab.getAttribute('data-tab');
        for (const panel of document.querySelectorAll('.tab-content')) {
            panel.classList.remove('active');
        }
        const activePanel = document.getElementById(`${tabId}-tab`);
        activePanel.classList.add('active');
    });
}
625
+
626
+ // Render agent trace - UPDATED to show all sections expanded and remove duplicated task title
627
// Render the agent trace (metadata.summary) as a list of collapsible steps.
// All steps start expanded; clicking a step header toggles its content.
//
// The step content is assembled from DOM nodes: labels are real <strong>
// elements and every value taken from the metadata is inserted as a text
// node. Building an HTML string and assigning it through textContent would
// show the tags literally, while assigning it through innerHTML would let
// trace content be interpreted as markup.
function renderAgentTrace(metadata) {
    agentSteps.innerHTML = '';

    if (!metadata || !Array.isArray(metadata.summary) || metadata.summary.length === 0) {
        agentSteps.innerHTML = '<p>No agent trace data available</p>';
        return;
    }

    // Strings are shown as-is; structured values are pretty-printed instead
    // of collapsing to "[object Object]".
    const toText = (value) => {
        if (typeof value === 'string') return value;
        if (value !== null && typeof value === 'object') return JSON.stringify(value, null, 2);
        return String(value);
    };

    // Append "<strong>label</strong>" followed by the value. Block fields put
    // the value on its own lines followed by a blank line; inline fields
    // keep it on the label's line.
    const appendField = (parent, label, value, inline = false) => {
        const labelNode = document.createElement('strong');
        labelNode.textContent = label;
        parent.appendChild(labelNode);
        const text = inline ? ` ${toText(value)}\n` : `\n${toText(value)}\n\n`;
        parent.appendChild(document.createTextNode(text));
    };

    // Process each step
    metadata.summary.forEach((step, index) => {
        const stepDiv = document.createElement('div');
        stepDiv.className = 'step';

        // Create step header
        const headerDiv = document.createElement('div');
        headerDiv.className = 'step-header';

        let headerText = `Step ${index}`;
        if (index === 0 && step.task) {
            headerText = 'Task';
        } else if (step.model_output_message) {
            headerText = 'Planning';
        } else if (step.tool_calls) {
            headerText = `Action ${index}`;
        } else if (step.error) {
            headerText = 'Error';
        }

        const titleSpan = document.createElement('span');
        titleSpan.textContent = headerText;
        const arrowSpan = document.createElement('span');
        arrowSpan.textContent = '▲';
        headerDiv.appendChild(titleSpan);
        headerDiv.appendChild(arrowSpan);
        stepDiv.appendChild(headerDiv);

        // Create step content
        const contentDiv = document.createElement('div');
        contentDiv.className = 'step-content';
        // Make all sections visible by default
        contentDiv.style.display = 'block';

        // Task information - the header already says "Task", so only the
        // task text itself is shown.
        if (index === 0 && step.task) {
            contentDiv.appendChild(document.createTextNode(`${toText(step.task)}\n\n`));
        }

        // Model output and planning
        if (step.model_output_message && step.model_output_message.content) {
            appendField(contentDiv, 'Model Output:', step.model_output_message.content);

            if (step.plan) {
                appendField(contentDiv, 'Plan:', step.plan);
            }
        }

        // Tool calls
        if (Array.isArray(step.tool_calls)) {
            step.tool_calls.forEach(toolCall => {
                if (!toolCall.function) return;
                appendField(contentDiv, 'Tool Call:', toolCall.function.name, true);
                if (toolCall.function.arguments) {
                    appendField(contentDiv, 'Arguments:', toolCall.function.arguments);
                }
            });
        }

        // Model reasoning
        if (step.model_output) {
            appendField(contentDiv, 'Model Reasoning:', step.model_output);
        }

        // Observations
        if (step.observations) {
            appendField(contentDiv, 'Observations:', step.observations);
        }

        // Action output
        if (step.action_output) {
            appendField(contentDiv, 'Action Output:', step.action_output);
        }

        // Errors
        if (step.error) {
            appendField(contentDiv, 'Error Type:', step.error.type || 'Unknown', true);
            if (step.error.message) {
                appendField(contentDiv, 'Error Message:', step.error.message, true);
            }
        }

        if (!contentDiv.hasChildNodes()) {
            contentDiv.textContent = "No content available for this step";
        }
        stepDiv.appendChild(contentDiv);

        // Add click handler to toggle content
        headerDiv.addEventListener('click', () => {
            const isHidden = contentDiv.style.display === 'none';
            contentDiv.style.display = isHidden ? 'block' : 'none';
            arrowSpan.textContent = isHidden ? '▲' : '▼';
        });

        agentSteps.appendChild(stepDiv);
    });
}
731
+
732
+ // Handle keyboard navigation for images
733
// Keyboard navigation: Left/Right arrows step through the screenshots while
// the screenshots tab is active.
document.addEventListener('keydown', (e) => {
    if (!appState.currentImages || appState.currentImages.length === 0) return;

    // Leave arrow keys alone while a form control has focus. Otherwise
    // moving the caret in the path box or changing a dropdown would also
    // flip the image, and the slider would move twice (once here, once
    // through its own 'input' handler).
    const target = e.target;
    if (target instanceof HTMLElement
            && (target.isContentEditable
                || ['INPUT', 'SELECT', 'TEXTAREA'].includes(target.tagName))) {
        return;
    }

    // Check if the screenshots tab is active
    const screenshotsTab = document.getElementById('screenshots-tab');
    if (!screenshotsTab.classList.contains('active')) return;

    const lastIndex = appState.currentImages.length - 1;
    if (e.key === 'ArrowLeft' && appState.currentImageIndex > 0) {
        appState.currentImageIndex--;
        updateImageDisplay();
    } else if (e.key === 'ArrowRight' && appState.currentImageIndex < lastIndex) {
        appState.currentImageIndex++;
        updateImageDisplay();
    }
});
748
+
749
+ // Load evaluations on page load
750
+ document.addEventListener('DOMContentLoaded', loadEvaluations);
751
+ </script>
752
+ </body>
753
+ </html>