朱东升 committed on
Commit
6ae8d58
·
1 Parent(s): 8cc5c8e
Files changed (3) hide show
  1. app.py.bk +677 -0
  2. src/evaluation/evaluator.py +8 -20
  3. src/ui/dashboard.py +3 -25
app.py.bk ADDED
@@ -0,0 +1,677 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import importlib
4
+ import os
5
+ import sys
6
+ from pathlib import Path
7
+ import concurrent.futures
8
+ import multiprocessing
9
+ import time
10
+ import threading
11
+ import queue
12
+ import uuid
13
+ import numpy as np
14
+ from datetime import datetime
15
+ from tqdm.auto import tqdm
16
+ from src.containerized_eval import eval_string_script
17
+
18
+ # Add current directory and src directory to module search path
19
+ current_dir = os.path.dirname(os.path.abspath(__file__))
20
+ src_dir = os.path.join(current_dir, "src")
21
+ if current_dir not in sys.path:
22
+ sys.path.append(current_dir)
23
+ if src_dir not in sys.path:
24
+ sys.path.append(src_dir)
25
+
26
+ # Create message queue
27
+ task_queue = queue.Queue()
28
+ # Dictionary to store task status
29
+ task_status = {}
30
+ # List to store task history, max 200 tasks
31
+ task_history = []
32
+ # Lock for shared resources
33
+ lock = threading.Lock()
34
+ # Number of worker threads
35
+ worker_threads = max(1, multiprocessing.cpu_count() // 2) # Using half the available cores for better stability
36
+ # Flag for running background threads
37
+ running = True
38
+ # Mapping from task type to processing time
39
+ task_type_times = {}
40
+
41
def queue_processor():
    """Worker loop that pulls tasks off ``task_queue`` and evaluates them.

    Runs until the module-level ``running`` flag becomes false. Each queue
    item is a ``(task_id, input_data, request_time)`` tuple. For every item
    the matching ``task_status`` entry is moved through
    ``processing`` -> ``completed`` (or ``error``), per-item timing samples
    are recorded in ``task_type_times`` (last 10 per language/complexity
    key), and a summary is appended to ``task_history`` (last 200 entries).
    """
    while running:
        try:
            item = task_queue.get(timeout=0.1)
        except queue.Empty:
            continue

        # Reset per iteration so the error handler can never touch the task
        # that was processed in a previous iteration.
        task_id = None
        try:
            task_id, input_data, request_time = item

            start_time = time.time()
            with lock:
                task_status[task_id]['status'] = 'processing'
                task_status[task_id]['start_time'] = start_time

            if isinstance(input_data, list) and len(input_data) > 0:
                sample_task = input_data[0]
                language = sample_task.get('language', 'unknown') if isinstance(sample_task, dict) else 'unknown'
                task_size = len(input_data)
                task_complexity = _estimate_task_complexity(input_data)

                with lock:
                    task_status[task_id]['estimated_factors'] = {
                        'language': language,
                        'size': task_size,
                        'complexity': task_complexity
                    }

            result = evaluate(input_data)

            end_time = time.time()
            process_time = end_time - start_time

            with lock:
                entry = task_status[task_id]
                entry['status'] = 'completed'
                entry['result'] = result
                entry['end_time'] = end_time
                entry['process_time'] = process_time

                factors = entry.get('estimated_factors')
                if factors:
                    key = f"{factors['language']}_{factors['complexity']}"
                    samples = task_type_times.setdefault(key, [])
                    samples.append(process_time / factors['size'])
                    # Keep only the 10 most recent samples per task type.
                    del samples[:-10]

                history_entry = {
                    'task_id': task_id,
                    'request_time': request_time,
                    'process_time': process_time,
                    'status': 'completed',
                    'factors': entry.get('estimated_factors', {})
                }
                # get_queue_status() derives a correction factor from
                # process_time / estimated_time of history entries; record the
                # estimate (only when non-zero, so that ratio is well-defined).
                estimated_time = entry.get('estimated_time')
                if estimated_time:
                    history_entry['estimated_time'] = estimated_time
                task_history.append(history_entry)
                # Cap the history at 200 entries.
                del task_history[:-200]

        except Exception as e:
            if task_id is not None:
                with lock:
                    entry = task_status.get(task_id)
                    # The entry may already have been removed; never let the
                    # error path itself raise and kill the worker thread.
                    if entry is not None:
                        entry['status'] = 'error'
                        entry['error'] = str(e)
                        entry['end_time'] = time.time()
        finally:
            # Exactly one task_done() per successfully fetched item.
            task_queue.task_done()
106
+
107
+ def _estimate_task_complexity(tasks):
108
+ """Estimate task complexity
109
+
110
+ Returns: 'simple', 'medium', or 'complex'
111
+ """
112
+ total_code_length = 0
113
+ count = 0
114
+
115
+ for task in tasks:
116
+ if isinstance(task, dict):
117
+ prompt = task.get('prompt', '')
118
+ tests = task.get('tests', '')
119
+ completions = task.get('processed_completions', [])
120
+
121
+ code_length = len(prompt) + len(tests)
122
+ if completions:
123
+ code_length += sum(len(comp) for comp in completions)
124
+
125
+ total_code_length += code_length
126
+ count += 1
127
+
128
+ if count == 0:
129
+ return 'medium'
130
+
131
+ avg_length = total_code_length / count
132
+
133
+ if avg_length < 1000:
134
+ return 'simple'
135
+ elif avg_length < 5000:
136
+ return 'medium'
137
+ else:
138
+ return 'complex'
139
+
140
def evaluate(input_data):
    """Evaluate a batch of code cases concurrently.

    Args:
        input_data: list of task items (normally dicts) that are each passed
            to ``evaluate_single_case``.

    Returns:
        A list with one entry per input item, in input order. Dict items are
        updated in place with the fields of their evaluation result and the
        same dict objects are returned. For an item that is not a dict, the
        error result itself is returned in its position. If ``input_data`` is
        not a list, or an unexpected failure occurs, a single
        ``{"status": "Exception", "error": ...}`` dict is returned instead.
    """
    try:
        if not isinstance(input_data, list):
            return {"status": "Exception", "error": "Input must be a list"}
        if not input_data:
            return []

        # Use a moderate number of workers for all language tests to ensure
        # stability and avoid resource contention regardless of language.
        max_workers = max(1, min(multiprocessing.cpu_count() // 2, 4))

        results = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(evaluate_single_case, item) for item in input_data]
            # Collect in submission order so results line up with the inputs.
            for item, future in zip(input_data, futures):
                try:
                    outcome = future.result()
                except Exception as e:
                    outcome = {"status": "Exception", "error": str(e)}

                if not isinstance(outcome, dict):
                    outcome = {
                        "status": "Exception",
                        "error": f"Unexpected evaluation result: {outcome!r}",
                    }

                if isinstance(item, dict):
                    item.update(outcome)
                    results.append(item)
                else:
                    # Nothing to merge into; report the error for this slot
                    # instead of failing the whole batch.
                    results.append(outcome)
        return results

    except Exception as e:
        return {"status": "Exception", "error": str(e)}
167
+
168
def evaluate_single_case(input_data):
    """Evaluate one task: try each completion until one passes.

    The code handed to ``evaluate_code`` is ``prompt + completion + '\\n' +
    tests``. Every completion is attempted up to ``max_retries + 1`` times
    with a short pause between attempts.

    Args:
        input_data: dict with ``language``, ``prompt``, ``tests`` and a
            non-empty ``processed_completions`` list.

    Returns:
        The first result whose ``status`` is ``"OK"``; if no attempt
        succeeds, the final failed result of the first completion. Invalid
        input and unexpected errors yield
        ``{"status": "Exception", "error": ...}``.
    """
    try:
        if not isinstance(input_data, dict):
            return {"status": "Exception", "error": "Input item must be a dictionary"}

        language = input_data.get('language')
        completions = input_data.get('processed_completions', [])

        if not completions:
            return {"status": "Exception", "error": "No code provided"}

        prompt = input_data.get('prompt')
        tests = input_data.get('tests')
        if not isinstance(prompt, str) or not isinstance(tests, str):
            return {"status": "Exception", "error": "Both 'prompt' and 'tests' must be strings"}

        # Two retries, i.e. up to three attempts per completion.
        max_retries = 2

        first_failure = None
        for comp in completions:
            code = prompt + comp + '\n' + tests

            for attempt in range(max_retries + 1):
                result = evaluate_code(code, language)
                if result["status"] == "OK":
                    return result
                if attempt < max_retries:
                    # Briefly wait before retrying to let resources stabilize.
                    time.sleep(0.3)

            # Remember only the first completion's final failure; that is
            # what gets reported when nothing succeeds.
            if first_failure is None:
                first_failure = result

        return first_failure

    except Exception as e:
        return {"status": "Exception", "error": str(e)}
205
+
206
def evaluate_code(code, language):
    """Run ``code`` through ``eval_string_script`` for the given language.

    Returns whatever ``eval_string_script(language, code)`` returns; if that
    call raises, the exception is converted into
    ``{"status": "Exception", "error": <message>}``.
    """
    try:
        return eval_string_script(language, code)
    except Exception as exc:
        return {"status": "Exception", "error": str(exc)}
214
+
215
def synchronous_evaluate(input_data):
    """Queue an evaluation and block until a worker has finished it.

    The task is registered in ``task_status`` (flagged ``synchronous``), put
    on ``task_queue`` and then polled every 0.1 s. Once finished, its status
    entry is removed.

    Returns:
        The stored evaluation result on success, or
        ``{"status": "Exception", "error": ...}`` if the worker reported an
        error or the status entry disappeared while waiting.
    """
    if isinstance(input_data, list) and len(input_data) > 0:
        sample_task = input_data[0]
        language = sample_task.get('language', 'unknown') if isinstance(sample_task, dict) else 'unknown'
        task_size = len(input_data)
        task_complexity = _estimate_task_complexity(input_data)
    else:
        language = 'unknown'
        task_size = 1
        task_complexity = 'medium'

    estimated_time_per_task = _get_estimated_time_for_task(language, task_complexity)
    estimated_total_time = estimated_time_per_task * task_size

    task_id = str(uuid.uuid4())
    request_time = time.time()

    with lock:
        task_status[task_id] = {
            'status': 'queued',
            'queued_time': request_time,
            'queue_position': task_queue.qsize() + 1,
            'synchronous': True,
            'estimated_factors': {
                'language': language,
                'size': task_size,
                'complexity': task_complexity
            },
            'estimated_time': estimated_total_time
        }

    task_queue.put((task_id, input_data, request_time))

    while True:
        with lock:
            entry = task_status.get(task_id)
            if entry is None:
                # Without an entry the task can never be observed as
                # finished; fail instead of polling forever.
                return {"status": "Exception", "error": "Task status was lost before completion"}

            status = entry['status']
            if status == 'completed':
                result = entry['result']
                task_status.pop(task_id, None)
                return result
            if status == 'error':
                error = entry.get('error', 'Unknown error')
                task_status.pop(task_id, None)
                return {"status": "Exception", "error": error}

        # Sleep outside the lock so workers can update the status.
        time.sleep(0.1)
266
+
267
def _get_estimated_time_for_task(language, complexity):
    """Return the estimated per-item processing time for a task type.

    Uses the median of the samples recorded in ``task_type_times`` under the
    ``"<language>_<complexity>"`` key when any exist; otherwise falls back to
    1.0 for ``'simple'``, 3.0 for ``'medium'`` and 8.0 for anything else.
    """
    samples = task_type_times.get(f"{language}_{complexity}")
    if samples:
        return np.median(samples)

    # No measurements yet: use fixed defaults per complexity class.
    if complexity == 'simple':
        return 1.0
    if complexity == 'medium':
        return 3.0
    return 8.0
280
+
281
def enqueue_task(input_data):
    """Register a task, put it on the queue and return immediately.

    Args:
        input_data: payload later passed to ``evaluate`` by a worker. When it
            is a non-empty list, its first item's language, its length and
            its estimated complexity are used for the time estimate.

    Returns:
        Dict with ``task_id``, ``status`` (``'queued'``), ``queue_position``,
        ``estimated_wait`` (from ``get_queue_status``) and
        ``estimated_processing`` (estimated time for this task).
    """
    if isinstance(input_data, list) and len(input_data) > 0:
        sample_task = input_data[0]
        language = sample_task.get('language', 'unknown') if isinstance(sample_task, dict) else 'unknown'
        task_size = len(input_data)
        task_complexity = _estimate_task_complexity(input_data)
    else:
        language = 'unknown'
        task_size = 1
        task_complexity = 'medium'

    estimated_time_per_task = _get_estimated_time_for_task(language, task_complexity)
    estimated_total_time = estimated_time_per_task * task_size

    task_id = str(uuid.uuid4())
    request_time = time.time()

    with lock:
        # Keep a local copy so the return value does not need to re-read the
        # shared dict after the task has been handed to the workers.
        queue_position = task_queue.qsize() + 1
        task_status[task_id] = {
            'status': 'queued',
            'queued_time': request_time,
            'queue_position': queue_position,
            'estimated_factors': {
                'language': language,
                'size': task_size,
                'complexity': task_complexity
            },
            'estimated_time': estimated_total_time
        }

    # get_queue_status() acquires ``lock`` itself, so it must be called after
    # the lock above has been released.
    est_wait = get_queue_status()['estimated_wait']

    task_queue.put((task_id, input_data, request_time))

    return {
        'task_id': task_id,
        'status': 'queued',
        'queue_position': queue_position,
        'estimated_wait': est_wait,
        'estimated_processing': estimated_total_time
    }
324
+
325
def check_status(task_id):
    """Return a snapshot of the status entry for ``task_id``.

    Unknown ids yield ``{'status': 'not_found'}``. Entries that are
    ``completed`` or ``error`` and whose ``end_time`` lies more than 3600
    seconds in the past are dropped from ``task_status`` after the snapshot
    has been taken (the snapshot is still returned).
    """
    with lock:
        entry = task_status.get(task_id)
        if entry is None:
            return {'status': 'not_found'}

        snapshot = dict(entry)

        if snapshot['status'] in ('completed', 'error'):
            age = time.time() - snapshot.get('end_time', 0)
            if age > 3600:
                del task_status[task_id]

        return snapshot
337
+
338
def get_queue_status():
    """Summarize the queue and estimate the current wait time.

    The estimate is the remaining time of tasks being processed (divided by
    the number of busy workers) plus the estimated time of queued tasks
    (divided by ``worker_threads``). When history entries carry an
    ``estimated_time``, the estimate is scaled by the median
    actual/estimated ratio, clamped to [0.5, 2.0].

    Returns:
        Dict with ``queue_size``, ``active_tasks``, ``waiting_tasks``,
        ``worker_threads``, ``estimated_wait`` (seconds; 0 when idle) and
        ``recent_tasks`` (the last five history entries).
    """
    with lock:
        queued_tasks = [t for t in task_status.values() if t['status'] == 'queued']
        processing_tasks = [t for t in task_status.values() if t['status'] == 'processing']

        queue_size = task_queue.qsize()
        active_tasks = len(processing_tasks)
        waiting_tasks = len(queued_tasks)

        now = time.time()
        remaining_processing_time = 0
        for task in processing_tasks:
            if 'start_time' in task and 'estimated_time' in task:
                elapsed = now - task['start_time']
                remaining_processing_time += max(0, task['estimated_time'] - elapsed)
            else:
                # No estimate available: assume 2 seconds remain.
                remaining_processing_time += 2

        if active_tasks > 0:
            remaining_processing_time = remaining_processing_time / min(active_tasks, worker_threads)

        # Tasks without an estimate are assumed to take 5 seconds.
        queued_processing_time = sum(task.get('estimated_time', 5) for task in queued_tasks)

        if worker_threads > 0 and queued_processing_time > 0:
            queued_processing_time = queued_processing_time / worker_threads

        estimated_wait = remaining_processing_time + queued_processing_time

        prediction_ratios = []
        for task in task_history:
            estimated = task.get('estimated_time')
            # Skip missing or zero estimates; a zero would divide by zero.
            if 'factors' in task and estimated:
                prediction_ratios.append(task['process_time'] / estimated)

        if prediction_ratios:
            correction_factor = np.median(prediction_ratios)
            correction_factor = max(0.5, min(2.0, correction_factor))
            estimated_wait *= correction_factor

        estimated_wait = max(0.1, estimated_wait)
        if waiting_tasks == 0 and active_tasks == 0:
            estimated_wait = 0

        recent_tasks = task_history[-5:]

        return {
            'queue_size': queue_size,
            'active_tasks': active_tasks,
            'waiting_tasks': waiting_tasks,
            'worker_threads': worker_threads,
            'estimated_wait': estimated_wait,
            'recent_tasks': recent_tasks
        }
397
+
398
def format_time(seconds):
    """Format a duration in seconds as a short human-readable string.

    Returns ``"<s> seconds"`` below one minute, ``"<m>m <s>s"`` below one
    hour and ``"<h>h <m>m"`` otherwise. Seconds are shown with one decimal.
    """
    # Round to the displayed precision first so values just below a boundary
    # roll over cleanly ("1m 0.0s") instead of printing "60.0 seconds" or
    # "1m 60.0s".
    seconds = round(seconds, 1)
    if seconds < 60:
        return f"{seconds:.1f} seconds"
    if seconds < 3600:
        minutes, remainder = divmod(seconds, 60)
        return f"{int(minutes)}m {remainder:.1f}s"
    hours, remainder = divmod(int(seconds), 3600)
    return f"{hours}h {remainder // 60}m"
410
+
411
def ui_get_queue_info():
    """Render the queue dashboard as an HTML fragment.

    Reads the current numbers from ``get_queue_status()`` and returns markup
    with the waiting/processing/worker counters, the estimated wait time and
    a table of the most recent tasks (newest first). A placeholder row is
    shown when there is no history.
    """
    queue_info = get_queue_status()

    # Build all rows first and join once instead of growing a string in a loop.
    tasks_html = "".join(
        f"""
        <tr>
            <td>{task['task_id'][:8]}...</td>
            <td>{datetime.fromtimestamp(task['request_time']).strftime('%H:%M:%S')}</td>
            <td>{format_time(task['process_time'])}</td>
        </tr>
        """
        for task in reversed(queue_info['recent_tasks'])
    )

    if not tasks_html:
        tasks_html = """
        <tr>
            <td colspan="3" style="text-align: center; padding: 20px;">No historical tasks</td>
        </tr>
        """

    return f"""
    <div class="dashboard">
        <div class="queue-info-card main-card">
            <h3 class="card-title">Queue Status Monitor</h3>
            <div class="queue-stats">
                <div class="stat-item">
                    <div class="stat-value">{queue_info['waiting_tasks']}</div>
                    <div class="stat-label">Waiting</div>
                </div>
                <div class="stat-item">
                    <div class="stat-value">{queue_info['active_tasks']}</div>
                    <div class="stat-label">Processing</div>
                </div>
                <div class="stat-item">
                    <div class="stat-value">{queue_info['worker_threads']}</div>
                    <div class="stat-label">Worker Threads</div>
                </div>
            </div>

            <div class="wait-time">
                <p><b>Current Estimated Wait Time:</b> {format_time(queue_info['estimated_wait'])}</p>
                <p class="last-update"><small>Last update: {datetime.now().strftime('%H:%M:%S')}</small></p>
            </div>
        </div>

        <div class="queue-info-card history-card">
            <h3 class="card-title">Recently Processed Tasks</h3>
            <table class="recent-tasks">
                <thead>
                    <tr>
                        <th>Task ID</th>
                        <th>Request Time</th>
                        <th>Processing Time</th>
                    </tr>
                </thead>
                <tbody>
                    {tasks_html}
                </tbody>
            </table>
        </div>
    </div>
    """
474
+
475
def launch_workers():
    """Set the ``running`` flag and start the background worker threads.

    Starts ``worker_threads`` daemon threads, each executing
    ``queue_processor``.
    """
    global running
    running = True

    for _ in range(worker_threads):
        threading.Thread(target=queue_processor, daemon=True).start()
484
+
485
+ # Custom CSS
486
+ custom_css = """
487
+ .container {
488
+ max-width: 1200px;
489
+ margin: 0 auto;
490
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
491
+ }
492
+
493
+ .dashboard {
494
+ display: flex;
495
+ flex-direction: column;
496
+ gap: 20px;
497
+ }
498
+
499
+ .card-title {
500
+ color: #333;
501
+ border-bottom: 2px solid #ddd;
502
+ padding-bottom: 10px;
503
+ margin-top: 0;
504
+ }
505
+
506
+ .status-card, .queue-info-card {
507
+ background: #fff;
508
+ border-radius: 12px;
509
+ padding: 20px;
510
+ margin: 10px 0;
511
+ box-shadow: 0 4px 15px rgba(0,0,0,0.08);
512
+ }
513
+
514
+ .main-card {
515
+ border-top: 5px solid #4285f4;
516
+ }
517
+
518
+ .history-card {
519
+ border-top: 5px solid #34a853;
520
+ }
521
+
522
+ .status-card.success {
523
+ background: #e7f5e7;
524
+ border-left: 5px solid #28a745;
525
+ }
526
+
527
+ .status-card.error {
528
+ background: #f8d7da;
529
+ border-left: 5px solid #dc3545;
530
+ }
531
+
532
+ .error-message {
533
+ color: #dc3545;
534
+ font-weight: bold;
535
+ padding: 10px;
536
+ background: #f8d7da;
537
+ border-radius: 5px;
538
+ }
539
+
540
+ .notice {
541
+ color: #0c5460;
542
+ background-color: #d1ecf1;
543
+ padding: 10px;
544
+ border-radius: 5px;
545
+ }
546
+
547
+ .queue-stats {
548
+ display: flex;
549
+ justify-content: space-around;
550
+ margin: 20px 0;
551
+ }
552
+
553
+ .stat-item {
554
+ text-align: center;
555
+ padding: 15px;
556
+ background: #f8f9fa;
557
+ border-radius: 10px;
558
+ min-width: 120px;
559
+ transition: transform 0.3s ease;
560
+ }
561
+
562
+ .stat-item:hover {
563
+ transform: translateY(-5px);
564
+ box-shadow: 0 5px 15px rgba(0,0,0,0.1);
565
+ }
566
+
567
+ .stat-value {
568
+ font-size: 32px;
569
+ font-weight: bold;
570
+ color: #4285f4;
571
+ margin-bottom: 5px;
572
+ }
573
+
574
+ .stat-label {
575
+ color: #5f6368;
576
+ font-size: 16px;
577
+ }
578
+
579
+ .wait-time {
580
+ text-align: center;
581
+ margin: 20px 0;
582
+ padding: 15px;
583
+ background: #f1f3f4;
584
+ border-radius: 8px;
585
+ font-size: 18px;
586
+ }
587
+
588
+ .last-update {
589
+ color: #80868b;
590
+ margin-top: 10px;
591
+ margin-bottom: 0;
592
+ }
593
+
594
+ .recent-tasks {
595
+ width: 100%;
596
+ border-collapse: collapse;
597
+ margin-top: 15px;
598
+ background: white;
599
+ box-shadow: 0 1px 3px rgba(0,0,0,0.05);
600
+ }
601
+
602
+ .recent-tasks th, .recent-tasks td {
603
+ border: 1px solid #e0e0e0;
604
+ padding: 12px 15px;
605
+ text-align: center;
606
+ }
607
+
608
+ .recent-tasks th {
609
+ background-color: #f1f3f4;
610
+ color: #202124;
611
+ font-weight: 500;
612
+ }
613
+
614
+ .recent-tasks tbody tr:hover {
615
+ background-color: #f8f9fa;
616
+ }
617
+
618
+ .tabs {
619
+ margin-top: 20px;
620
+ }
621
+
622
+ button.primary {
623
+ background-color: #4285f4;
624
+ color: white;
625
+ padding: 10px 20px;
626
+ border: none;
627
+ border-radius: 4px;
628
+ cursor: pointer;
629
+ font-size: 16px;
630
+ font-weight: 500;
631
+ transition: background-color 0.3s;
632
+ }
633
+
634
+ button.primary:hover {
635
+ background-color: #3367d6;
636
+ }
637
+ """
638
+
639
+ # Initialize and launch worker threads
640
+ launch_workers()
641
+
642
+ # Create Gradio interface
643
+ with gr.Blocks(css=custom_css) as demo:
644
+ gr.Markdown("# Code Evaluation Service")
645
+ gr.Markdown("Code evaluation service supporting multiple programming languages, using queue mechanism to process requests")
646
+
647
+ with gr.Row():
648
+ with gr.Column(scale=3):
649
+ # Queue status info card
650
+ queue_info_html = gr.HTML()
651
+ refresh_queue_btn = gr.Button("Refresh Queue Status", variant="primary")
652
+
653
+ # Hidden API interface components
654
+ with gr.Row(visible=False):
655
+ api_input = gr.JSON()
656
+ api_output = gr.JSON()
657
+
658
+ # Define update function
659
+ def update_queue_info():
660
+ return ui_get_queue_info()
661
+
662
+ # Update queue info periodically
663
+ demo.load(update_queue_info, None, queue_info_html, every=3)
664
+
665
+ # Refresh button event
666
+ refresh_queue_btn.click(update_queue_info, None, queue_info_html)
667
+
668
+ # Add evaluation endpoint compatible with original interface
669
+ demo.queue()
670
+ evaluate_endpoint = demo.load(fn=synchronous_evaluate, inputs=api_input, outputs=api_output, api_name="evaluate")
671
+
672
+ if __name__ == "__main__":
673
+ try:
674
+ demo.launch()
675
+ finally:
676
+ # Stop worker threads
677
+ running = False
src/evaluation/evaluator.py CHANGED
@@ -25,15 +25,11 @@ def evaluate(input_data):
25
  item = future_to_item[future]
26
  try:
27
  result = future.result()
28
- # Preserve original item data but add result fields
29
- updated_item = item.copy()
30
- if isinstance(result, dict):
31
- updated_item.update(result)
32
- results.append(updated_item)
33
  except Exception as e:
34
- updated_item = item.copy()
35
- updated_item.update({"status": "Exception", "error": str(e)})
36
- results.append(updated_item)
37
  return results
38
 
39
  except Exception as e:
@@ -62,25 +58,17 @@ def evaluate_single_case(input_data):
62
  for attempt in range(max_retries + 1):
63
  result = evaluate_code(code, language)
64
 
65
- # If success or last attempt, add to results
66
  if result["status"] == "OK" or attempt == max_retries:
 
 
67
  results.append(result)
68
  break
69
 
70
  # For retries, briefly wait to allow resources to stabilize
71
  time.sleep(0.3)
72
-
73
- # If we have at least one successful result, return that
74
- successful_results = [r for r in results if r["status"] == "OK"]
75
- if successful_results:
76
- return successful_results[0]
77
 
78
- # Otherwise return the first result
79
- if results:
80
- return results[0]
81
-
82
- # Fallback error in case no results were collected
83
- return {"status": "Exception", "error": "Failed to evaluate code"}
84
 
85
  except Exception as e:
86
  return {"status": "Exception", "error": str(e)}
 
25
  item = future_to_item[future]
26
  try:
27
  result = future.result()
28
+ item.update(result)
29
+ results.append(item)
 
 
 
30
  except Exception as e:
31
+ item.update({"status": "Exception", "error": str(e)})
32
+ results.append(item)
 
33
  return results
34
 
35
  except Exception as e:
 
58
  for attempt in range(max_retries + 1):
59
  result = evaluate_code(code, language)
60
 
61
+ # If success or last attempt, return/record the result
62
  if result["status"] == "OK" or attempt == max_retries:
63
+ if result["status"] == "OK":
64
+ return result
65
  results.append(result)
66
  break
67
 
68
  # For retries, briefly wait to allow resources to stabilize
69
  time.sleep(0.3)
 
 
 
 
 
70
 
71
+ return results[0]
 
 
 
 
 
72
 
73
  except Exception as e:
74
  return {"status": "Exception", "error": str(e)}
src/ui/dashboard.py CHANGED
@@ -13,15 +13,13 @@ class Dashboard:
13
 
14
  tasks_html = ""
15
  for task in reversed(queue_info['recent_tasks']):
16
- status_class = "success" if task['status'] == 'completed' else "error" if task['status'] == 'error' else ""
17
- status_icon = "✓" if task['status'] == 'completed' else "✗" if task['status'] == 'error' else "⚙"
18
-
19
  tasks_html += f"""
20
- <tr class="{status_class}">
21
  <td>{task['task_id'][:8]}...</td>
22
  <td>{datetime.fromtimestamp(task['request_time']).strftime('%H:%M:%S')}</td>
23
  <td>{self.time_estimator.format_time(task['process_time'])}</td>
24
- <td class="status-cell {status_class}">{status_icon} {task['status'].capitalize()}</td>
25
  </tr>
26
  """
27
 
@@ -209,26 +207,6 @@ class Dashboard:
209
  .recent-tasks tbody tr:hover {
210
  background-color: #f8f9fa;
211
  }
212
-
213
- .recent-tasks tr.success {
214
- background-color: #e7f5e7;
215
- }
216
-
217
- .recent-tasks tr.error {
218
- background-color: #f8d7da;
219
- }
220
-
221
- .status-cell {
222
- font-weight: bold;
223
- }
224
-
225
- .status-cell.success {
226
- color: #28a745;
227
- }
228
-
229
- .status-cell.error {
230
- color: #dc3545;
231
- }
232
 
233
  .tabs {
234
  margin-top: 20px;
 
13
 
14
  tasks_html = ""
15
  for task in reversed(queue_info['recent_tasks']):
16
+ status_display = task['status'].capitalize()
 
 
17
  tasks_html += f"""
18
+ <tr>
19
  <td>{task['task_id'][:8]}...</td>
20
  <td>{datetime.fromtimestamp(task['request_time']).strftime('%H:%M:%S')}</td>
21
  <td>{self.time_estimator.format_time(task['process_time'])}</td>
22
+ <td>{status_display}</td>
23
  </tr>
24
  """
25
 
 
207
  .recent-tasks tbody tr:hover {
208
  background-color: #f8f9fa;
209
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
 
211
  .tabs {
212
  margin-top: 20px;