Quazim0t0 committed on
Commit 7b72d2c · verified · 1 Parent(s): 5910edd

Delete sample_benchmarks.py

Files changed (1)
  1. sample_benchmarks.py +0 -72
sample_benchmarks.py DELETED
@@ -1,72 +0,0 @@
- """
- Sample benchmarks initialization for Dynamic Highscores system.
-
- This script adds sample benchmarks to the database to provide initial options for users.
- """
-
- from database_schema import DynamicHighscoresDB
-
- def add_sample_benchmarks():
-     """Add sample benchmarks to the database."""
-     # Initialize database
-     db = DynamicHighscoresDB()
-
-     # Sample benchmarks to add
-     sample_benchmarks = [
-         {
-             "name": "MMLU (Massive Multitask Language Understanding)",
-             "dataset_id": "cais/mmlu",
-             "description": "A benchmark for measuring massive multitask language understanding across 57 tasks including elementary mathematics, US history, computer science, law, and more.",
-             "metrics": {"accuracy": 1.0, "consistency": 1.0}
-         },
-         {
-             "name": "HumanEval (Code Generation)",
-             "dataset_id": "openai/humaneval",
-             "description": "A benchmark for evaluating language models on code generation tasks. It consists of 164 programming problems with unit tests.",
-             "metrics": {"pass@1": 1.0, "functional_correctness": 1.0}
-         },
-         {
-             "name": "HellaSwag (Commonsense Reasoning)",
-             "dataset_id": "hellaswag",
-             "description": "A challenge dataset for evaluating commonsense natural language inference. It consists of multiple-choice questions about grounded situations.",
-             "metrics": {"accuracy": 1.0}
-         },
-         {
-             "name": "GSM8K (Grade School Math)",
-             "dataset_id": "gsm8k",
-             "description": "A dataset of 8.5K high quality grade school math word problems. These problems take between 2 and 8 steps to solve, and solutions primarily involve performing a sequence of elementary calculations using basic arithmetic operations.",
-             "metrics": {"accuracy": 1.0, "correct_steps": 1.0}
-         },
-         {
-             "name": "TruthfulQA",
-             "dataset_id": "truthful_qa",
-             "description": "A benchmark to measure whether a language model is truthful in generating answers to questions. The benchmark comprises 817 questions that span 38 categories, including health, law, finance and politics.",
-             "metrics": {"accuracy": 1.0, "truthfulness": 1.0}
-         }
-     ]
-
-     # Add each benchmark to the database
-     added_count = 0
-     for benchmark in sample_benchmarks:
-         try:
-             benchmark_id = db.add_benchmark(
-                 name=benchmark["name"],
-                 dataset_id=benchmark["dataset_id"],
-                 description=benchmark["description"],
-                 metrics=benchmark["metrics"]
-             )
-
-             if benchmark_id:
-                 print(f"Added benchmark '{benchmark['name']}' with ID: {benchmark_id}")
-                 added_count += 1
-         except Exception as e:
-             print(f"Error adding benchmark '{benchmark['name']}': {e}")
-
-     # Close database connection
-     db.close()
-
-     return added_count
-
- if __name__ == "__main__":
-     num_added = add_sample_benchmarks()
-     print(f"Added {num_added} sample benchmarks to the database.")
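For reference, the deleted script only relied on database_schema.DynamicHighscoresDB exposing add_benchmark(name, dataset_id, description, metrics), which returns an ID, plus close(). Below is a minimal illustrative sketch of such an interface, assuming an SQLite backing store; the class body, table schema, and file name are hypothetical and not the repository's actual database_schema implementation.

# Hypothetical sketch of the interface the deleted script assumed;
# not the repository's actual DynamicHighscoresDB implementation.
import json
import sqlite3

class DynamicHighscoresDB:
    def __init__(self, path="dynamic_highscores.db"):  # assumed default path
        self.conn = sqlite3.connect(path)
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS benchmarks ("
            "id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT UNIQUE, "
            "dataset_id TEXT, description TEXT, metrics TEXT)"
        )

    def add_benchmark(self, name, dataset_id, description, metrics):
        # Store metrics as JSON; return the new row id, or None for duplicates.
        try:
            cur = self.conn.execute(
                "INSERT INTO benchmarks (name, dataset_id, description, metrics) "
                "VALUES (?, ?, ?, ?)",
                (name, dataset_id, description, json.dumps(metrics)),
            )
            self.conn.commit()
            return cur.lastrowid
        except sqlite3.IntegrityError:
            return None

    def close(self):
        self.conn.close()

With a class along these lines in place, running sample_benchmarks.py would print one "Added benchmark ..." line per entry and then the total count.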