朱东升 committed
Commit 8cc5c8e · 1 Parent(s): fd2c041
src/containerized_eval.py CHANGED
@@ -65,24 +65,34 @@ EVALUATORS = {
     "go_test.go": (eval_go.eval_script, "_test.go"),
 }
 
-def eval_string_script(language, code):
-    """
-    Evaluate code in a specific language
-
-    This is a placeholder for the actual implementation. In a real scenario,
-    this would contain the actual code evaluation logic.
-    """
-    try:
-        if not language or not code:
-            return {"status": "Exception", "error": "Language or code is missing"}
-
-        # This is where the actual logic would be implemented
-        # For now, we'll just return a simulated success
-        return {
-            "status": "OK",
-            "result": "Evaluation completed successfully",
-            "language": language,
-            "code_length": len(code)
-        }
-    except Exception as e:
-        return {"status": "Exception", "error": str(e)}
+def eval_string_script(language, program):
+    if language in EVALUATORS:
+        (eval_script, file_ext) = EVALUATORS[language]
+    else:
+        eval_module = __import__(f"eval_{language}" if language != "go_test.go" else "eval_go")
+        eval_script = eval_module.eval_script
+        file_ext = f".{language}" if language != "go_test.go" else "_test.go"
+    with tempfile.NamedTemporaryFile(suffix=file_ext, delete=True) as f:
+        f.write(program.encode("utf-8"))
+        f.flush()
+        result = eval_script(Path(f.name))
+    # Only save the first 2K of output from the running program. Any further
+    # output is very likely an exceptionally long stack trace or a long
+    # series of prints.
+    if type(result["stdout"]) == bytes:
+        result["stdout"] = result["stdout"].decode("utf-8", errors="ignore")
+    if result["stdout"] is None:
+        result["stdout"] = ""
+    if result["stderr"] is None:
+        result["stderr"] = ""
+    if type(result["stderr"]) == bytes:
+        result["stderr"] = result["stderr"].decode("utf-8", errors="ignore")
+    assert type(result["stdout"]) == str
+    assert type(result["stderr"]) == str
+    return {
+        "program": program,
+        "stdout": result['stdout'].replace("!!int", "")[:2048],
+        "stderr": result['stderr'][:2048],
+        "exit_code": result['exit_code'],
+        "status": result['status']
+    }
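
Note: the rewritten eval_string_script can be exercised directly. A minimal sketch, assuming "python" is a registered key in EVALUATORS and its evaluator runs the temp file and reports stdout and exit code (the key and sample program are illustrative, not from this commit):

    # Minimal sketch; the "python" key and sample program are illustrative.
    result = eval_string_script("python", "print(1 + 1)\n")
    assert result["status"] == "OK"           # status comes from the per-language evaluator
    assert result["stdout"].strip() == "2"    # stdout/stderr are capped at 2048 characters
    assert result["exit_code"] == 0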
 
src/evaluation/evaluator.py CHANGED
@@ -25,11 +25,15 @@ def evaluate(input_data):
             item = future_to_item[future]
             try:
                 result = future.result()
-                item.update(result)
-                results.append(item)
+                # Preserve original item data but add result fields
+                updated_item = item.copy()
+                if isinstance(result, dict):
+                    updated_item.update(result)
+                results.append(updated_item)
             except Exception as e:
-                item.update({"status": "Exception", "error": str(e)})
-                results.append(item)
+                updated_item = item.copy()
+                updated_item.update({"status": "Exception", "error": str(e)})
+                results.append(updated_item)
         return results
 
     except Exception as e:
@@ -52,23 +56,31 @@ def evaluate_single_case(input_data):
 
         results = []
         for comp in completions:
-            code = input_data.get('prompt') + comp + '\n' + input_data.get('tests')
+            code = input_data.get('prompt', '') + comp + '\n' + input_data.get('tests', '')
 
             # Try up to max_retries + 1 times for all test cases
            for attempt in range(max_retries + 1):
                 result = evaluate_code(code, language)
 
-                # If success or last attempt, return/record the result
+                # If success or last attempt, add to results
                 if result["status"] == "OK" or attempt == max_retries:
-                    if result["status"] == "OK":
-                        return result
                     results.append(result)
                     break
 
                 # For retries, briefly wait to allow resources to stabilize
                 time.sleep(0.3)
+
+        # If we have at least one successful result, return that
+        successful_results = [r for r in results if r["status"] == "OK"]
+        if successful_results:
+            return successful_results[0]
 
-        return results[0]
+        # Otherwise return the first result
+        if results:
+            return results[0]
+
+        # Fallback error in case no results were collected
+        return {"status": "Exception", "error": "Failed to evaluate code"}
 
     except Exception as e:
         return {"status": "Exception", "error": str(e)}
src/queue/queue_processor.py CHANGED
@@ -42,7 +42,18 @@ class QueueProcessor:
             process_time = end_time - self.task_queue.task_status[task_id]['start_time']
 
             with self.task_queue.lock:
-                self.task_queue.task_status[task_id]['status'] = 'completed'
+                # Set status based on evaluation result
+                if isinstance(result, dict) and result.get('status') == 'Exception':
+                    self.task_queue.task_status[task_id]['status'] = 'error'
+                    self.task_queue.task_status[task_id]['error'] = result.get('error', 'Unknown error')
+                elif isinstance(result, list) and any(item.get('status') == 'Exception' for item in result if isinstance(item, dict)):
+                    self.task_queue.task_status[task_id]['status'] = 'error'
+                    error_items = [item for item in result if isinstance(item, dict) and item.get('status') == 'Exception']
+                    if error_items:
+                        self.task_queue.task_status[task_id]['error'] = error_items[0].get('error', 'Unknown error')
+                else:
+                    self.task_queue.task_status[task_id]['status'] = 'completed'
+
                 self.task_queue.task_status[task_id]['result'] = result
                 self.task_queue.task_status[task_id]['end_time'] = end_time
                 self.task_queue.task_status[task_id]['process_time'] = process_time
@@ -53,7 +64,7 @@ class QueueProcessor:
                     'task_id': task_id,
                     'request_time': request_time,
                     'process_time': process_time,
-                    'status': 'completed',
+                    'status': self.task_queue.task_status[task_id]['status'],
                     'factors': self.task_queue.task_status[task_id].get('estimated_factors', {})
                 })
                 while len(self.task_queue.task_history) > 200:
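
Note: the branching added under self.task_queue.lock reduces to a small pure function, which makes the dict-vs-list handling easier to see and test. A sketch (the derive_status helper is invented, not part of the repo):

    def derive_status(result):
        # Returns (status, error) the same way the new QueueProcessor branch does.
        if isinstance(result, dict) and result.get('status') == 'Exception':
            return 'error', result.get('error', 'Unknown error')
        if isinstance(result, list):
            errors = [r for r in result
                      if isinstance(r, dict) and r.get('status') == 'Exception']
            if errors:
                return 'error', errors[0].get('error', 'Unknown error')
        return 'completed', None

    # One Exception entry in a list result marks the whole task as 'error'.
    derive_status([{"status": "OK"}, {"status": "Exception", "error": "boom"}])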
src/ui/dashboard.py CHANGED
@@ -13,18 +13,22 @@ class Dashboard:
 
         tasks_html = ""
         for task in reversed(queue_info['recent_tasks']):
+            status_class = "success" if task['status'] == 'completed' else "error" if task['status'] == 'error' else ""
+            status_icon = "✓" if task['status'] == 'completed' else "✗" if task['status'] == 'error' else "⚙"
+
             tasks_html += f"""
-            <tr>
+            <tr class="{status_class}">
                 <td>{task['task_id'][:8]}...</td>
                 <td>{datetime.fromtimestamp(task['request_time']).strftime('%H:%M:%S')}</td>
                 <td>{self.time_estimator.format_time(task['process_time'])}</td>
+                <td class="status-cell {status_class}">{status_icon} {task['status'].capitalize()}</td>
             </tr>
             """
 
         if not tasks_html:
             tasks_html = """
             <tr>
-                <td colspan="3" style="text-align: center; padding: 20px;">No historical tasks</td>
+                <td colspan="4" style="text-align: center; padding: 20px;">No historical tasks</td>
             </tr>
             """
 
@@ -61,6 +65,7 @@ class Dashboard:
                 <th>Task ID</th>
                 <th>Request Time</th>
                 <th>Processing Time</th>
+                <th>Status</th>
             </tr>
         </thead>
         <tbody>
@@ -204,6 +209,26 @@ class Dashboard:
         .recent-tasks tbody tr:hover {
             background-color: #f8f9fa;
         }
+
+        .recent-tasks tr.success {
+            background-color: #e7f5e7;
+        }
+
+        .recent-tasks tr.error {
+            background-color: #f8d7da;
+        }
+
+        .status-cell {
+            font-weight: bold;
+        }
+
+        .status-cell.success {
+            color: #28a745;
+        }
+
+        .status-cell.error {
+            color: #dc3545;
+        }
 
         .tabs {
             margin-top: 20px;
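
Note: with the status column in place, the two ternary chains pick the CSS class and icon that the new rules target. A sketch with a hypothetical task entry, showing the markup a completed task ends up with:

    # Hypothetical task entry, to show which CSS classes the new rules hit.
    task = {'task_id': 'a1b2c3d4e5f6', 'status': 'completed'}
    status_class = "success" if task['status'] == 'completed' else "error" if task['status'] == 'error' else ""
    status_icon = "✓" if task['status'] == 'completed' else "✗" if task['status'] == 'error' else "⚙"
    row = f'<tr class="{status_class}"><td class="status-cell {status_class}">{status_icon} {task["status"].capitalize()}</td></tr>'
    # -> <tr class="success"><td class="status-cell success">✓ Completed</td></tr>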