Commit 8cc5c8e · update32
Author: 朱东升 (Zhu Dongsheng) · 1 Parent: fd2c041

Files changed:
- src/containerized_eval.py  +30 −20
- src/evaluation/evaluator.py  +21 −9
- src/queue/queue_processor.py  +13 −2
- src/ui/dashboard.py  +27 −2
src/containerized_eval.py
CHANGED
@@ -65,24 +65,34 @@ EVALUATORS = {
     "go_test.go": (eval_go.eval_script, "_test.go"),
 }
 
-def eval_string_script(language, …
-    …
-    # …
-    # …
+def eval_string_script(language, program):
+    if language in EVALUATORS:
+        (eval_script, file_ext) = EVALUATORS[language]
+    else:
+        eval_module = __import__(f"eval_{language}" if language != "go_test.go" else "eval_go")
+        eval_script = eval_module.eval_script
+        file_ext = f".{language}" if language != "go_test.go" else "_test.go"
+    with tempfile.NamedTemporaryFile(suffix=file_ext, delete=True) as f:
+        f.write(program.encode("utf-8"))
+        f.flush()
+        result = eval_script(Path(f.name))
+        # Only save the first 2K of output from the running program. Any further
+        # output is very likely an exceptionally long stack trace or a long
+        # series of prints.
+        if type(result["stdout"]) == bytes:
+            result["stdout"] = result["stdout"].decode("utf-8", errors="ignore")
+        if result["stdout"] is None:
+            result["stdout"] = ""
+        if result["stderr"] is None:
+            result["stderr"] = ""
+        if type(result["stderr"]) == bytes:
+            result["stderr"] = result["stderr"].decode("utf-8", errors="ignore")
+        assert type(result["stdout"]) == str
+        assert type(result["stderr"]) == str
         return {
-            "…
-            …
-    return {"status": "Exception", "error": str(e)}
+            "program": program,
+            "stdout": result['stdout'].replace("!!int", "")[:2048],
+            "stderr": result['stderr'][:2048],
+            "exit_code": result['exit_code'],
+            "status": result['status']
+        }
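The rewritten eval_string_script resolves an evaluator either from the EVALUATORS table or by importing eval_<language> dynamically, then runs the program from a temporary file and normalizes stdout/stderr into capped UTF-8 strings. A minimal self-contained sketch of that temp-file pattern, with subprocess standing in for a per-language eval_script (run_python_snippet and the subprocess call are illustrative assumptions, not code from this Space):

import subprocess
import tempfile

def run_python_snippet(program: str) -> dict:
    # Write the program to a temp file, run it, and normalize output the same
    # way the commit does: decode bytes, substitute "" for None, cap at 2048 chars.
    with tempfile.NamedTemporaryFile(suffix=".py", delete=True) as f:
        f.write(program.encode("utf-8"))
        f.flush()
        proc = subprocess.run(["python3", f.name], capture_output=True, timeout=15)
        stdout = (proc.stdout or b"").decode("utf-8", errors="ignore")
        stderr = (proc.stderr or b"").decode("utf-8", errors="ignore")
        return {
            "program": program,
            "stdout": stdout[:2048],
            "stderr": stderr[:2048],
            "exit_code": proc.returncode,
            "status": "OK" if proc.returncode == 0 else "Exception",
        }

print(run_python_snippet("print(1 + 1)")["stdout"])  # prints: 2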
src/evaluation/evaluator.py
CHANGED
@@ -25,11 +25,15 @@ def evaluate(input_data):
             item = future_to_item[future]
             try:
                 result = future.result()
-                item…
-                …
+                # Preserve original item data but add result fields
+                updated_item = item.copy()
+                if isinstance(result, dict):
+                    updated_item.update(result)
+                results.append(updated_item)
             except Exception as e:
-                item.…
-                …
+                updated_item = item.copy()
+                updated_item.update({"status": "Exception", "error": str(e)})
+                results.append(updated_item)
         return results
 
     except Exception as e:
@@ -52,23 +56,31 @@ def evaluate_single_case(input_data):
 
         results = []
         for comp in completions:
-            code = input_data.get('prompt') + comp + '\n' + input_data.get('tests')
+            code = input_data.get('prompt', '') + comp + '\n' + input_data.get('tests', '')
 
             # Try up to max_retries + 1 times for all test cases
            for attempt in range(max_retries + 1):
                 result = evaluate_code(code, language)
 
-                # If success or last attempt, …
+                # If success or last attempt, add to results
                 if result["status"] == "OK" or attempt == max_retries:
-                    if result["status"] == "OK":
-                        return result
                     results.append(result)
                     break
 
                 # For retries, briefly wait to allow resources to stabilize
                 time.sleep(0.3)
 
-        return …
+        # If we have at least one successful result, return that
+        successful_results = [r for r in results if r["status"] == "OK"]
+        if successful_results:
+            return successful_results[0]
+
+        # Otherwise return the first result
+        if results:
+            return results[0]
+
+        # Fallback error in case no results were collected
+        return {"status": "Exception", "error": "Failed to evaluate code"}
 
     except Exception as e:
         return {"status": "Exception", "error": str(e)}
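The evaluate() change swaps an in-place mutation for a copy-and-merge: each submitted item keeps its own fields and gains the result fields on top, and a failed future is recorded the same way instead of being lost. A runnable sketch of the same pattern (fake_eval is a stand-in for the real evaluation call):

from concurrent.futures import ThreadPoolExecutor, as_completed

def fake_eval(item: dict) -> dict:  # stand-in for the real evaluation call
    return {"status": "OK", "exit_code": 0}

items = [{"task_id": "t1", "language": "py"}, {"task_id": "t2", "language": "go"}]
results = []
with ThreadPoolExecutor(max_workers=2) as pool:
    future_to_item = {pool.submit(fake_eval, it): it for it in items}
    for future in as_completed(future_to_item):
        item = future_to_item[future]
        try:
            result = future.result()
            updated_item = item.copy()          # preserve original item data
            if isinstance(result, dict):
                updated_item.update(result)     # overlay result fields on top
            results.append(updated_item)
        except Exception as e:
            updated_item = item.copy()
            updated_item.update({"status": "Exception", "error": str(e)})
            results.append(updated_item)

print(results)  # each entry keeps task_id/language and gains status/exit_code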
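The evaluate_single_case() change stops returning early on the first OK inside the retry loop; it now records one outcome per completion, then prefers the first success, falls back to the first recorded result, and finally to a synthesized error. The retry policy in isolation, as a hedged sketch (evaluate_with_retries and the check callable are illustrative names, not the repo's API):

import time

def evaluate_with_retries(check, max_retries: int = 2, backoff: float = 0.3) -> dict:
    results = []
    for attempt in range(max_retries + 1):
        result = check()
        if result["status"] == "OK" or attempt == max_retries:
            results.append(result)
            break
        time.sleep(backoff)  # let resources stabilize before retrying
    successful = [r for r in results if r["status"] == "OK"]
    if successful:
        return successful[0]
    if results:
        return results[0]
    return {"status": "Exception", "error": "Failed to evaluate code"}

# Example: a check that fails once, then succeeds on the retry.
attempts = iter([{"status": "Timeout"}, {"status": "OK"}])
print(evaluate_with_retries(lambda: next(attempts)))  # {'status': 'OK'}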
src/queue/queue_processor.py
CHANGED
@@ -42,7 +42,18 @@ class QueueProcessor:
             process_time = end_time - self.task_queue.task_status[task_id]['start_time']
 
             with self.task_queue.lock:
-                …
+                # Set status based on evaluation result
+                if isinstance(result, dict) and result.get('status') == 'Exception':
+                    self.task_queue.task_status[task_id]['status'] = 'error'
+                    self.task_queue.task_status[task_id]['error'] = result.get('error', 'Unknown error')
+                elif isinstance(result, list) and any(item.get('status') == 'Exception' for item in result if isinstance(item, dict)):
+                    self.task_queue.task_status[task_id]['status'] = 'error'
+                    error_items = [item for item in result if isinstance(item, dict) and item.get('status') == 'Exception']
+                    if error_items:
+                        self.task_queue.task_status[task_id]['error'] = error_items[0].get('error', 'Unknown error')
+                else:
+                    self.task_queue.task_status[task_id]['status'] = 'completed'
+
                 self.task_queue.task_status[task_id]['result'] = result
                 self.task_queue.task_status[task_id]['end_time'] = end_time
                 self.task_queue.task_status[task_id]['process_time'] = process_time
@@ -53,7 +64,7 @@ class QueueProcessor:
                     'task_id': task_id,
                     'request_time': request_time,
                     'process_time': process_time,
-                    'status': '…
+                    'status': self.task_queue.task_status[task_id]['status'],
                     'factors': self.task_queue.task_status[task_id].get('estimated_factors', {})
                 })
                 while len(self.task_queue.task_history) > 200:
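The status rule added here treats a single Exception dict, or any Exception dict inside a list of per-completion results, as a task-level 'error', and everything else as 'completed'. Pulled out as a pure function it can be unit-tested without a queue (derive_status is an illustrative name, not part of the commit):

def derive_status(result):
    # A failing dict, or any failing dict inside a list of per-completion
    # results, marks the whole task as 'error'; anything else is 'completed'.
    if isinstance(result, dict) and result.get('status') == 'Exception':
        return 'error', result.get('error', 'Unknown error')
    if isinstance(result, list):
        failures = [r for r in result
                    if isinstance(r, dict) and r.get('status') == 'Exception']
        if failures:
            return 'error', failures[0].get('error', 'Unknown error')
    return 'completed', None

assert derive_status({'status': 'Exception', 'error': 'boom'}) == ('error', 'boom')
assert derive_status([{'status': 'OK'}, {'status': 'Exception', 'error': 'x'}]) == ('error', 'x')
assert derive_status([{'status': 'OK'}]) == ('completed', None)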
src/ui/dashboard.py
CHANGED
@@ -13,18 +13,22 @@ class Dashboard:
 
         tasks_html = ""
         for task in reversed(queue_info['recent_tasks']):
+            status_class = "success" if task['status'] == 'completed' else "error" if task['status'] == 'error' else ""
+            status_icon = "✓" if task['status'] == 'completed' else "✗" if task['status'] == 'error' else "⚙"
+
             tasks_html += f"""
-            <tr>
+            <tr class="{status_class}">
                 <td>{task['task_id'][:8]}...</td>
                 <td>{datetime.fromtimestamp(task['request_time']).strftime('%H:%M:%S')}</td>
                 <td>{self.time_estimator.format_time(task['process_time'])}</td>
+                <td class="status-cell {status_class}">{status_icon} {task['status'].capitalize()}</td>
             </tr>
             """
 
         if not tasks_html:
             tasks_html = """
             <tr>
-                <td colspan="…
+                <td colspan="4" style="text-align: center; padding: 20px;">No historical tasks</td>
             </tr>
             """
 
@@ -61,6 +65,7 @@ class Dashboard:
                     <th>Task ID</th>
                     <th>Request Time</th>
                     <th>Processing Time</th>
+                    <th>Status</th>
                 </tr>
             </thead>
             <tbody>
@@ -204,6 +209,26 @@ class Dashboard:
             .recent-tasks tbody tr:hover {
                 background-color: #f8f9fa;
             }
+
+            .recent-tasks tr.success {
+                background-color: #e7f5e7;
+            }
+
+            .recent-tasks tr.error {
+                background-color: #f8d7da;
+            }
+
+            .status-cell {
+                font-weight: bold;
+            }
+
+            .status-cell.success {
+                color: #28a745;
+            }
+
+            .status-cell.error {
+                color: #dc3545;
+            }
 
             .tabs {
                 margin-top: 20px;