Quazim0t0 committed on
Commit
ae8cd7f
·
verified ·
1 Parent(s): 6e7f801

Delete evaluation_queue.py

Browse files
Files changed (1) hide show
  1. evaluation_queue.py +0 -1122
evaluation_queue.py DELETED
@@ -1,1122 +0,0 @@
1
- """
2
- Model evaluation queue system for Dynamic Highscores.
3
-
4
- This module handles the evaluation queue, CPU-only processing,
5
- and enforces daily submission limits for users.
6
- """
7
-
8
- import os
9
- import json
10
- import time
11
- import threading
12
- import queue as queue_module
13
- from datetime import datetime, timedelta
14
- import gradio as gr
15
- from huggingface_hub import HfApi, hf_hub_download, snapshot_download
16
- from datasets import load_dataset
17
- import torch
18
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
19
- import sqlite3
20
-
21
class EvaluationQueue:
    """Manages the evaluation queue for model benchmarking.

    Pending evaluations are read from the database manager and processed one
    at a time on a background daemon thread. Models are loaded on CPU.
    """

    # File suffixes counted as model weights when summing repository files.
    _WEIGHT_SUFFIXES = ('.bin', '.safetensors', '.pt')
    _BYTES_PER_GB = 1024 * 1024 * 1024
    # Metrics averaged into the overall score, in summation order.
    _SCORED_METRICS = ("accuracy", "exact_match", "f1", "functional_correctness")

    def __init__(self, db_manager, auth_manager):
        """Initialize the evaluation queue manager.

        Args:
            db_manager: Database manager instance
            auth_manager: Authentication manager instance
        """
        self.db_manager = db_manager
        self.auth_manager = auth_manager
        self.hf_api = HfApi()
        self.queue = queue_module.Queue()
        self.is_processing = False
        self.worker_thread = None
        self.model_tags = ["Merge", "Agent", "Reasoning", "Coding", "General", "Specialized", "Instruction", "Chat"]
        self.current_evaluation = None
        self.progress = 0
        # Guards current_evaluation and progress, which the worker thread
        # writes and get_current_progress() reads.
        self.progress_lock = threading.Lock()
        # Memory limit for models in GB (leave 2GB for system)
        self.memory_limit_gb = 14.0

    # ------------------------------------------------------------------
    # Worker lifecycle
    # ------------------------------------------------------------------

    def start_worker(self):
        """Start the worker thread for processing the evaluation queue.

        Does nothing when a worker thread is already alive.
        """
        if self.worker_thread is not None and self.worker_thread.is_alive():
            return
        self.is_processing = True
        self.worker_thread = threading.Thread(target=self._process_queue)
        self.worker_thread.daemon = True
        self.worker_thread.start()

    def stop_worker(self):
        """Ask the worker loop to stop and wait up to one second for it."""
        self.is_processing = False
        if self.worker_thread and self.worker_thread.is_alive():
            self.worker_thread.join(timeout=1.0)

    # ------------------------------------------------------------------
    # Model size estimation
    # ------------------------------------------------------------------

    def check_model_size(self, model_id):
        """Check if a model will fit within RAM limitations.

        The size is taken from the first source that yields a non-zero value:
        a mapping-style ``safetensors`` payload, the weight files listed in
        ``siblings``, a parameter count parsed from the model name, and
        finally a fixed 5GB default. A 30% overhead is then added.

        Args:
            model_id: HuggingFace model ID

        Returns:
            tuple: (will_fit, message). Any unexpected error is reported as
            ``(True, warning)`` so a Hub outage does not block submissions.
        """
        try:
            # files_metadata=True asks the Hub to populate per-file sizes on
            # the siblings; without it the sizes may be missing.
            model_info_obj = self.hf_api.model_info(model_id, files_metadata=True)

            total_size_gb = self._size_from_safetensors(model_info_obj)
            if total_size_gb == 0:
                total_size_gb = self._size_from_siblings(model_info_obj)
            if total_size_gb == 0:
                total_size_gb = self._size_from_name(model_id)
            if total_size_gb == 0:
                print(f"Checking model size with direct method for {model_id}")
                # Print out the entire structure for debugging
                print(f"Model info: {getattr(model_info_obj, '__dict__', model_info_obj)}")
                # Default to a conservative estimate
                total_size_gb = 5

            # Account for memory overhead
            estimated_ram_needed = total_size_gb * 1.3  # 30% overhead

            if estimated_ram_needed > self.memory_limit_gb:
                return False, f"Model is too large (approximately {total_size_gb:.1f}GB, needs {estimated_ram_needed:.1f}GB RAM). Maximum allowed is {self.memory_limit_gb}GB."

            return True, f"Model size check passed ({total_size_gb:.1f}GB, estimated {estimated_ram_needed:.1f}GB RAM usage)"

        except Exception as e:
            print(f"Model size check error: {e}")
            import traceback
            traceback.print_exc()

            # Allow submission with warning
            return True, f"Warning: Could not verify model size ({str(e)}). Please ensure your model is under {self.memory_limit_gb}GB."

    def _size_from_safetensors(self, model_info_obj):
        """Return the summed ``size`` of a mapping-style safetensors payload in GB.

        Returns 0 when the attribute is missing, empty, or not a mapping, so
        the caller can fall back to another source.
        """
        safetensors = getattr(model_info_obj, 'safetensors', None)
        if not safetensors or not hasattr(safetensors, 'values'):
            return 0

        total_bytes = 0
        for file_info in safetensors.values():
            if isinstance(file_info, dict):
                size = file_info.get('size')
            else:
                size = getattr(file_info, 'size', None)
            if isinstance(size, (int, float)):
                total_bytes += size
        return total_bytes / self._BYTES_PER_GB

    def _size_from_siblings(self, model_info_obj):
        """Return the summed size in GB of weight files listed in ``siblings``.

        Entries may be objects or dicts. Entries with a missing or
        non-numeric size are ignored instead of raising.
        """
        total_bytes = 0
        for sibling in getattr(model_info_obj, 'siblings', None) or []:
            if isinstance(sibling, dict):
                filename = sibling.get('rfilename', '')
                size = sibling.get('size')
            else:
                filename = getattr(sibling, 'rfilename', '')
                size = getattr(sibling, 'size', None)
            if not filename or not filename.endswith(self._WEIGHT_SUFFIXES):
                continue
            if isinstance(size, (int, float)):
                total_bytes += size
        return total_bytes / self._BYTES_PER_GB

    def _size_from_name(self, model_id):
        """Guess a size in GB from a parameter count such as "7b" in the name.

        The number must not be preceded by another digit or a dot, so "13b"
        is read as 13 rather than 3. Returns 0 when no count is found.
        """
        import re

        match = re.search(r'(?<![\d.])(\d+(?:\.\d+)?)b(?![a-z])', model_id.lower())
        if match is None:
            return 0
        # Rough estimate: param count (billions) x 2 for size in GB
        return float(match.group(1)) * 2

    # ------------------------------------------------------------------
    # Queue processing
    # ------------------------------------------------------------------

    def _process_queue(self):
        """Process the evaluation queue in a separate thread.

        Loops while ``is_processing`` is true, sleeping five seconds when the
        queue is empty or a queue-level error occurs.
        """
        while self.is_processing:
            try:
                pending_evals = self.db_manager.get_evaluation_results(status="pending")
                if not pending_evals:
                    # No evaluations in queue, sleep for a bit
                    time.sleep(5)
                    continue
                # NOTE(review): takes the first row as returned; presumably
                # db_manager orders by priority and added_at - confirm.
                self._process_evaluation(pending_evals[0])
            except Exception as e:
                print(f"Queue processing error: {e}")
                time.sleep(5)

    def _process_evaluation(self, next_eval):
        """Run one pending evaluation and record its outcome in the database.

        The evaluation is marked ``failed`` when anything raises or when
        ``_run_evaluation`` reports an ``"error"`` entry. Otherwise it is
        marked ``completed`` with its results and overall score.

        Args:
            next_eval: Evaluation row with 'id', 'model_id' and 'benchmark_id'
        """
        self.db_manager.update_evaluation_status(next_eval['id'], 'running')

        with self.progress_lock:
            self.current_evaluation = next_eval
            self.progress = 0

        try:
            try:
                model_info = self.db_manager.get_model(next_eval['model_id'])
                benchmark_info = self.db_manager.get_benchmark(next_eval['benchmark_id'])
                if not (model_info and benchmark_info):
                    raise RuntimeError("Model or benchmark not found")

                will_fit, message = self.check_model_size(model_info['hf_model_id'])
                if not will_fit:
                    raise RuntimeError(f"Model too large for evaluation: {message}")

                results = self._run_evaluation(
                    model_info['hf_model_id'],
                    benchmark_info['dataset_id']
                )
                # _run_evaluation reports failures as an error dict; those
                # must not be stored as completed runs.
                if "error" in results:
                    raise RuntimeError(results["error"])

                score = self._calculate_overall_score(results)
                self.db_manager.update_evaluation_status(
                    next_eval['id'],
                    'completed',
                    results=results,
                    score=score
                )
            except Exception as e:
                print(f"Evaluation error: {e}")
                self.db_manager.update_evaluation_status(
                    next_eval['id'],
                    'failed',
                    results={"error": str(e)}
                )
        finally:
            # Always clear the progress display, even if a status update failed.
            with self.progress_lock:
                self.current_evaluation = None
                self.progress = 0

    def _set_progress(self, value):
        """Store a new progress percentage under the progress lock."""
        with self.progress_lock:
            self.progress = value

    def _report_item_progress(self, index, count):
        """Map item ``index`` of ``count`` onto the 40-90% progress band."""
        self._set_progress(40 + int((index / count) * 50))

    def _run_evaluation(self, model_id, dataset_id):
        """Run an evaluation for a model on a benchmark.

        Args:
            model_id: HuggingFace model ID
            dataset_id: HuggingFace dataset ID (with optional ":config" suffix)

        Returns:
            dict: Evaluation results, or ``{"error": message}`` when the
            dataset, the model, or the evaluation task fails.
        """
        self._set_progress(5)  # Starting evaluation

        # Split an optional ":config" suffix off the dataset ID.
        dataset_id, _, config = dataset_id.partition(":")

        self._set_progress(10)  # Loading dataset
        try:
            if config:
                dataset = load_dataset(dataset_id, config, split="test")
            else:
                dataset = load_dataset(dataset_id, split="test")
        except Exception as e:
            return {"error": f"Failed to load dataset: {str(e)}"}

        self._set_progress(20)  # Loading model
        model = None
        tokenizer = None
        try:
            try:
                device = "cpu"
                # NOTE(review): max_memory is keyed by device 0 although the
                # model is placed on "cpu" - confirm this limit takes effect.
                model = AutoModelForCausalLM.from_pretrained(
                    model_id,
                    device_map=device,
                    torch_dtype=torch.float32,  # Use float32 for CPU
                    low_cpu_mem_usage=True,  # Enable memory optimization
                    offload_folder="offload",  # Enable offloading if needed
                    offload_state_dict=True,  # Offload state dict for memory saving
                    max_memory={0: f"{self.memory_limit_gb}GB"}  # Limit memory usage
                )
                tokenizer = AutoTokenizer.from_pretrained(model_id)
            except Exception as e:
                print(f"Model loading error: {e}")
                return {"error": f"Failed to load model: {str(e)}"}

            self._set_progress(30)  # Determining task type
            task_type = self._determine_task_type(dataset)

            self._set_progress(40)  # Starting evaluation
            evaluators = {
                "text-generation": self._evaluate_text_generation,
                "question-answering": self._evaluate_question_answering,
                "classification": self._evaluate_classification,
                "code-generation": self._evaluate_code_generation,
            }
            # Unknown task types fall back to the general evaluation.
            evaluate = evaluators.get(task_type, self._evaluate_general)
            try:
                results = evaluate(model, tokenizer, dataset)
            except Exception as e:
                print(f"Evaluation task error: {e}")
                return {"error": f"Evaluation failed: {str(e)}"}

            self._set_progress(95)  # Cleaning up
        finally:
            # Release the model on every path, including the error returns.
            import gc

            del model
            del tokenizer
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        self._set_progress(100)  # Completed
        return results

    def get_current_progress(self):
        """Get the current evaluation progress.

        Returns:
            tuple: (current_evaluation, progress_percentage)
        """
        with self.progress_lock:
            return self.current_evaluation, self.progress

    def _determine_task_type(self, dataset):
        """Determine the task type based on dataset features.

        Args:
            dataset: HuggingFace dataset

        Returns:
            str: One of "question-answering", "code-generation",
            "classification", "text-generation" or "general"
        """
        features = dataset.features

        if "question" in features and "answer" in features:
            return "question-answering"
        if "code" in features or "solution" in features:
            return "code-generation"
        if "label" in features or "class" in features:
            return "classification"
        if "input" in features and "output" in features:
            return "text-generation"
        return "general"

    # ------------------------------------------------------------------
    # Task evaluators
    # ------------------------------------------------------------------

    @staticmethod
    def _build_pipeline(task, model, tokenizer):
        """Create a CPU transformers pipeline for ``task``."""
        return pipeline(task, model=model, tokenizer=tokenizer, device="cpu")

    @staticmethod
    def _subsample(dataset, limit):
        """Return at most the first ``limit`` examples to keep runtime reasonable."""
        if len(dataset) > limit:
            return dataset.select(range(limit))
        return dataset

    @staticmethod
    def _strip_prompt(prompt, generated_text):
        """Drop ``prompt`` from the front of ``generated_text`` when it is echoed there."""
        if generated_text.startswith(prompt):
            return generated_text[len(prompt):]
        return generated_text

    def _evaluate_text_generation(self, model, tokenizer, dataset):
        """Evaluate a model on text generation tasks.

        An example counts as correct when the stripped expected output
        appears in the generated continuation. The echoed prompt is removed
        first so it cannot satisfy the match by itself.

        Args:
            model: HuggingFace model
            tokenizer: HuggingFace tokenizer
            dataset: HuggingFace dataset

        Returns:
            dict: accuracy, samples_evaluated and up to five generated samples
        """
        generator = self._build_pipeline("text-generation", model, tokenizer)
        dataset = self._subsample(dataset, 100)

        correct = 0
        total = 0
        generated_texts = []
        example_count = len(dataset)

        for i, example in enumerate(dataset):
            self._report_item_progress(i, example_count)

            input_text = example.get("input", example.get("prompt", ""))
            expected_output = example.get("output", example.get("target", ""))
            if not input_text or not expected_output:
                continue

            generated = generator(
                input_text,
                max_length=100,
                num_return_sequences=1
            )
            generated_text = generated[0]["generated_text"]
            generated_texts.append(generated_text)

            completion = self._strip_prompt(input_text, generated_text)
            if expected_output.strip() in completion:
                correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0

        return {
            "accuracy": accuracy,
            "samples_evaluated": total,
            "generated_samples": generated_texts[:5]  # Include a few samples
        }

    def _evaluate_question_answering(self, model, tokenizer, dataset):
        """Evaluate a model on question answering tasks.

        Args:
            model: HuggingFace model
            tokenizer: HuggingFace tokenizer
            dataset: HuggingFace dataset

        Returns:
            dict: exact_match, f1 and samples_evaluated
        """
        # NOTE(review): _run_evaluation loads the model with
        # AutoModelForCausalLM; confirm the question-answering pipeline
        # accepts such a model.
        qa_pipeline = self._build_pipeline("question-answering", model, tokenizer)
        dataset = self._subsample(dataset, 100)

        exact_matches = 0
        f1_scores = []
        total = 0
        example_count = len(dataset)

        for i, example in enumerate(dataset):
            self._report_item_progress(i, example_count)

            question = example.get("question", "")
            context = example.get("context", "")
            answer = example.get("answer", "")
            if not question or not answer:
                continue

            # If no context provided, use the question as context
            result = qa_pipeline(question=question, context=context or question)
            predicted_answer = result["answer"]

            if predicted_answer.strip() == answer.strip():
                exact_matches += 1
            f1_scores.append(self._calculate_f1(answer, predicted_answer))
            total += 1

        exact_match_accuracy = exact_matches / total if total > 0 else 0
        avg_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else 0

        return {
            "exact_match": exact_match_accuracy,
            "f1": avg_f1,
            "samples_evaluated": total
        }

    def _evaluate_classification(self, model, tokenizer, dataset):
        """Evaluate a model on classification tasks.

        Args:
            model: HuggingFace model
            tokenizer: HuggingFace tokenizer
            dataset: HuggingFace dataset

        Returns:
            dict: accuracy and samples_evaluated
        """
        # NOTE(review): _run_evaluation loads the model with
        # AutoModelForCausalLM; confirm the text-classification pipeline
        # accepts it and that its label strings match the dataset labels.
        classifier = self._build_pipeline("text-classification", model, tokenizer)
        dataset = self._subsample(dataset, 100)

        correct = 0
        total = 0
        example_count = len(dataset)

        for i, example in enumerate(dataset):
            self._report_item_progress(i, example_count)

            text = example.get("text", example.get("sentence", ""))
            label = str(example.get("label", example.get("class", "")))
            if not text or not label:
                continue

            result = classifier(text)
            predicted_label = result[0]["label"]

            if str(predicted_label) == label:
                correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0

        return {
            "accuracy": accuracy,
            "samples_evaluated": total
        }

    def _evaluate_code_generation(self, model, tokenizer, dataset):
        """Evaluate a model on code generation tasks.

        Generated code is never executed. An exact match earns full credit,
        and output containing a code keyword earns half credit towards
        ``functional_correctness``.

        Args:
            model: HuggingFace model
            tokenizer: HuggingFace tokenizer
            dataset: HuggingFace dataset

        Returns:
            dict: exact_match, functional_correctness and samples_evaluated
        """
        generator = self._build_pipeline("text-generation", model, tokenizer)
        dataset = self._subsample(dataset, 50)  # Smaller sample for code tasks

        code_keywords = ("def ", "function", "return", "class")
        exact_matches = 0
        functional_matches = 0
        total = 0
        example_count = len(dataset)

        for i, example in enumerate(dataset):
            self._report_item_progress(i, example_count)

            prompt = example.get("prompt", example.get("input", ""))
            solution = example.get("solution", example.get("output", ""))
            if not prompt or not solution:
                continue

            generated = generator(
                prompt,
                max_length=200,
                num_return_sequences=1
            )
            # Only strip the prompt when it really is a prefix; slicing by
            # its length otherwise would cut into the generated code.
            generated_code = self._strip_prompt(prompt, generated[0]["generated_text"]).strip()

            if generated_code == solution.strip():
                exact_matches += 1
                functional_matches += 1
            elif generated_code and any(keyword in generated_code for keyword in code_keywords):
                functional_matches += 0.5  # Partial credit

            total += 1

        exact_match_rate = exact_matches / total if total > 0 else 0
        functional_correctness = functional_matches / total if total > 0 else 0

        return {
            "exact_match": exact_match_rate,
            "functional_correctness": functional_correctness,
            "samples_evaluated": total
        }

    def _evaluate_general(self, model, tokenizer, dataset):
        """General evaluation for any dataset type.

        Generates text for each example and returns samples. No metric is
        computed.

        Args:
            model: HuggingFace model
            tokenizer: HuggingFace tokenizer
            dataset: HuggingFace dataset

        Returns:
            dict: samples_evaluated and up to five generated samples
        """
        generator = self._build_pipeline("text-generation", model, tokenizer)
        dataset = self._subsample(dataset, 50)

        features = dataset.features
        input_field = None
        output_field = None

        for field in features:
            lowered = field.lower()
            if lowered in ("input", "prompt", "question", "text"):
                input_field = field
            elif lowered in ("output", "target", "answer", "response"):
                output_field = field

        if not input_field:
            # Just use the first string field as input. Feature objects
            # describe their type via a dtype attribute rather than being
            # str instances, so check both.
            for field in features:
                feature = features[field]
                if getattr(feature, "dtype", None) == "string" or isinstance(feature, (str, list)):
                    input_field = field
                    break

        total = 0
        generated_texts = []
        example_count = len(dataset)

        for i, example in enumerate(dataset):
            self._report_item_progress(i, example_count)

            if not input_field or input_field not in example:
                continue

            input_text = str(example[input_field])
            generated = generator(
                input_text,
                max_length=100,
                num_return_sequences=1
            )

            has_expected = output_field and output_field in example
            generated_texts.append({
                "input": input_text,
                "output": generated[0]["generated_text"],
                "expected": str(example[output_field]) if has_expected else "N/A"
            })
            total += 1

        return {
            "samples_evaluated": total,
            "generated_samples": generated_texts[:5]  # Include a few samples
        }

    # ------------------------------------------------------------------
    # Scoring
    # ------------------------------------------------------------------

    def _calculate_f1(self, answer, prediction):
        """Calculate the token-level F1 score between answer and prediction.

        Tokens are lower-cased and split on whitespace. The overlap is
        counted as a multiset, so repeated tokens are matched one-for-one
        and identical strings always score 1.0.

        Args:
            answer: Ground truth answer
            prediction: Model prediction

        Returns:
            float: F1 score
        """
        from collections import Counter

        answer_tokens = answer.lower().split()
        prediction_tokens = prediction.lower().split()

        overlap = sum((Counter(answer_tokens) & Counter(prediction_tokens)).values())
        if overlap == 0:
            return 0.0

        precision = overlap / len(prediction_tokens)
        recall = overlap / len(answer_tokens)
        return 2 * precision * recall / (precision + recall)

    def _calculate_overall_score(self, results):
        """Calculate an overall score from evaluation results.

        Args:
            results: Evaluation results dictionary

        Returns:
            float: 0.0 when ``results`` has an "error" entry; otherwise the
            mean of the known metrics scaled to 0-100, or 50.0 when none of
            them is present.
        """
        if "error" in results:
            return 0.0

        found = [results[metric] * 100 for metric in self._SCORED_METRICS if metric in results]
        if not found:
            # Default score if no metrics available
            return 50.0
        return sum(found) / len(found)

    # ------------------------------------------------------------------
    # Public submission / status API
    # ------------------------------------------------------------------

    def submit_evaluation(self, model_id, benchmark_id, user_id, priority=0):
        """Submit a model for evaluation on a benchmark.

        Args:
            model_id: Model ID in the database
            benchmark_id: Benchmark ID in the database
            user_id: User ID submitting the evaluation
            priority: Queue priority (higher = higher priority)

        Returns:
            tuple: (evaluation_id, message). ``evaluation_id`` is None when
            the submission is rejected or fails.
        """
        if not self.auth_manager.can_submit_benchmark(user_id):
            return None, "Daily submission limit reached. Try again tomorrow."

        try:
            model_info = self.db_manager.get_model(model_id)
            if not model_info:
                return None, "Model not found in database."

            will_fit, message = self.check_model_size(model_info['hf_model_id'])
            if not will_fit:
                return None, message

            evaluation_id = self.db_manager.add_evaluation(
                model_id=model_id,
                benchmark_id=benchmark_id,
                priority=priority
            )

            # Record the submission so the daily limit applies.
            self.auth_manager.update_submission_date(user_id)

            # Make sure worker is running
            self.start_worker()

            return evaluation_id, f"Evaluation submitted successfully. {message}"
        except Exception as e:
            print(f"Submit evaluation error: {e}")
            return None, f"Failed to submit evaluation: {str(e)}"

    def get_queue_status(self):
        """Get the current status of the evaluation queue.

        Returns:
            dict: Counts per status, worker state, the current evaluation
            and its progress, and the memory limit. When the database query
            fails, the counts are zero and an "error" entry is added.
        """
        try:
            counts = {
                status: len(self.db_manager.get_evaluation_results(status=status))
                for status in ("pending", "running", "completed", "failed")
            }
            current_eval, progress = self.get_current_progress()

            return {
                "pending": counts["pending"],
                "running": counts["running"],
                "completed": counts["completed"],
                "failed": counts["failed"],
                "is_processing": self.is_processing,
                "current_evaluation": current_eval,
                "progress": progress,
                "memory_limit_gb": self.memory_limit_gb
            }
        except Exception as e:
            print(f"Queue status error: {e}")
            return {
                "pending": 0,
                "running": 0,
                "completed": 0,
                "failed": 0,
                "is_processing": self.is_processing,
                "current_evaluation": None,
                "progress": 0,
                "memory_limit_gb": self.memory_limit_gb,
                "error": str(e)
            }
836
-
837
- # Model submission UI components
838
def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
    """Create the model submission UI components.

    Builds two tabs: "Submit Model" (size check, model metadata, benchmark
    selection, submission) and "Evaluation Queue" (queue statistics, recent
    evaluations and the progress of the running evaluation).

    Args:
        evaluation_queue: Evaluation queue instance
        auth_manager: Authentication manager instance
        db_manager: Database manager instance

    Returns:
        gr.Blocks: Gradio Blocks component with model submission UI
    """
    # Dropdown value used for placeholder entries that are not real benchmarks.
    no_benchmark_value = "none"

    with gr.Blocks() as submission_ui:
        # Store user authentication state
        user_state = gr.State(None)

        def check_auth_on_load(request: gr.Request):
            """Resolve the current user from the incoming request, or None."""
            if not request:
                return None

            if 'SPACE_ID' in os.environ:
                # Special handling for HF Spaces OAuth
                # NOTE(review): this trusts the "HF-User" request header;
                # confirm that clients cannot set it themselves.
                username = request.headers.get("HF-User")
                if username:
                    user = db_manager.get_user_by_username(username)
                    if user:
                        print(f"User authenticated via HF Spaces OAuth: {username}")
                        return user
                return None

            # Standard token-based auth
            user = auth_manager.check_login(request)
            return user if user else None

        with gr.Tab("Submit Model"):
            gr.Markdown(f"""
            ### Model Size Restrictions

            Models must fit within {evaluation_queue.memory_limit_gb}GB of RAM for evaluation.
            Large models will be rejected to ensure all evaluations can complete successfully.
            """, elem_classes=["info-text"])

            with gr.Row():
                with gr.Column(scale=2):
                    model_id_input = gr.Textbox(
                        placeholder="HuggingFace model ID (e.g., 'gpt2', 'facebook/opt-350m')",
                        label="Model ID"
                    )

                    check_size_button = gr.Button("Check Model Size")
                    size_check_result = gr.Markdown("")
                    model_name_input = gr.Textbox(
                        placeholder="Display name for your model",
                        label="Model Name"
                    )

                    model_description_input = gr.Textbox(
                        placeholder="Brief description of your model",
                        label="Description",
                        lines=3
                    )

                    model_parameters_input = gr.Number(
                        label="Number of Parameters (billions)",
                        precision=2
                    )

                with gr.Column(scale=1):
                    model_tag_input = gr.Dropdown(
                        choices=evaluation_queue.model_tags,
                        label="Model Tag",
                        info="Select one category that best describes your model"
                    )

                    # Choices are (display name, value) tuples, so the
                    # handler receives the value and the user sees the name.
                    benchmark_dropdown = gr.Dropdown(
                        label="Benchmark",
                        info="Select a benchmark to evaluate your model on",
                        choices=[("Loading benchmarks...", no_benchmark_value)],
                        value=None
                    )

                    refresh_benchmarks_button = gr.Button("Refresh Benchmarks")

            submit_model_button = gr.Button("Submit for Evaluation")
            submission_status = gr.Markdown("")
            auth_message = gr.Markdown("")

        with gr.Tab("Evaluation Queue"):
            refresh_queue_button = gr.Button("Refresh Queue")

            with gr.Row():
                with gr.Column(scale=1):
                    queue_stats = gr.JSON(
                        label="Queue Statistics"
                    )

                with gr.Column(scale=2):
                    queue_status = gr.Dataframe(
                        headers=["ID", "Model", "Benchmark", "Status", "Submitted"],
                        label="Recent Evaluations"
                    )

            with gr.Row(visible=True) as progress_container:
                with gr.Column():
                    current_eval_info = gr.Markdown("No evaluation currently running")
                    # Use a simple text display for progress instead of Progress component
                    progress_display = gr.Markdown("Progress: 0%")

        # Event handlers
        def check_model_size_handler(model_id):
            """Return a formatted size-check verdict for the entered model ID."""
            if not model_id:
                return "Please enter a HuggingFace model ID."

            try:
                will_fit, message = evaluation_queue.check_model_size(model_id)
                return f"✅ {message}" if will_fit else f"❌ {message}"
            except Exception as e:
                print(f"Model size check error: {e}")
                import traceback
                traceback.print_exc()
                return f"Error checking model size: {str(e)}"

        def refresh_benchmarks_handler():
            """Reload the benchmark dropdown choices from the database."""
            benchmarks = db_manager.get_benchmarks()

            # (display name, value): show the benchmark name, pass its ID.
            choices = [(b["name"], str(b["id"])) for b in benchmarks]
            if not choices:
                choices = [("No benchmarks available - add some first", no_benchmark_value)]

            return gr.update(choices=choices)

        def submit_model_handler(model_id, model_name, model_description, model_parameters, model_tag, benchmark_id, user):
            """Validate the form, register the model and queue its evaluation."""
            if not user:
                return "Please log in to submit a model."

            if not model_id or not model_name or not model_tag or not benchmark_id:
                return "Please fill in all required fields."

            if benchmark_id == no_benchmark_value:
                return "Please select a valid benchmark."

            try:
                # Reject oversized models before anything is written.
                will_fit, size_message = evaluation_queue.check_model_size(model_id)
                if not will_fit:
                    return f"❌ {size_message}"

                model_db_id = db_manager.add_model(
                    name=model_name,
                    hf_model_id=model_id,
                    user_id=user["id"],
                    tag=model_tag,
                    parameters=str(model_parameters) if model_parameters else None,
                    description=model_description
                )
                if not model_db_id:
                    return "Failed to add model to database."

                eval_id, message = evaluation_queue.submit_evaluation(
                    model_id=model_db_id,
                    benchmark_id=benchmark_id,
                    user_id=user["id"]
                )
                if eval_id:
                    return f"✅ Model submitted successfully. {size_message}\nEvaluation ID: {eval_id}"
                return message
            except Exception as e:
                print(f"Error submitting model: {str(e)}")
                import traceback
                traceback.print_exc()
                return f"Error submitting model: {str(e)}"

        def refresh_queue_handler():
            """Return queue stats, recent evaluations and the progress display texts."""
            stats = evaluation_queue.get_queue_status()

            # Get recent evaluations (all statuses, limited to 20)
            recent_evals = db_manager.get_evaluation_results(limit=20)
            eval_data = [
                [
                    row["id"],
                    row["model_name"],
                    row["benchmark_name"],
                    row["status"],
                    row["submitted_at"],
                ]
                for row in recent_evals
            ]

            current_eval, progress = evaluation_queue.get_current_progress()
            if current_eval:
                model_info = db_manager.get_model(current_eval['model_id'])
                benchmark_info = db_manager.get_benchmark(current_eval['benchmark_id'])

                if model_info and benchmark_info:
                    eval_info = f"**Currently Evaluating:** {model_info['name']} on {benchmark_info['name']}"
                    progress_text = f"Progress: {progress}%"
                    return stats, eval_data, eval_info, progress_text

            return stats, eval_data, "No evaluation currently running", "Progress: 0%"

        def update_auth_message(user):
            """Describe the login state shown under the submit button."""
            if user:
                return f"Logged in as {user['username']}"
            return "Please log in to submit a model."

        # Connect event handlers
        check_size_button.click(
            fn=check_model_size_handler,
            inputs=[model_id_input],
            outputs=[size_check_result]
        )

        refresh_benchmarks_button.click(
            fn=refresh_benchmarks_handler,
            inputs=[],
            outputs=[benchmark_dropdown]
        )

        submit_model_button.click(
            fn=submit_model_handler,
            inputs=[
                model_id_input,
                model_name_input,
                model_description_input,
                model_parameters_input,
                model_tag_input,
                benchmark_dropdown,
                user_state
            ],
            outputs=[submission_status]
        )

        refresh_queue_button.click(
            fn=refresh_queue_handler,
            inputs=[],
            outputs=[queue_stats, queue_status, current_eval_info, progress_display]
        )

        # Initialize on load. The auth message is chained after the auth
        # check so that it reads the user_state the check just produced.
        submission_ui.load(
            fn=check_auth_on_load,
            inputs=[],
            outputs=[user_state]
        ).then(
            fn=update_auth_message,
            inputs=[user_state],
            outputs=[auth_message]
        )

        submission_ui.load(
            fn=refresh_benchmarks_handler,
            inputs=[],
            outputs=[benchmark_dropdown]
        )

        submission_ui.load(
            fn=refresh_queue_handler,
            inputs=[],
            outputs=[queue_stats, queue_status, current_eval_info, progress_display]
        )

    return submission_ui