Quazim0t0 committed on
Commit 366c4b0 · verified · 1 Parent(s): e2b775d

Delete benchmark_selection.py

Files changed (1)
  1. benchmark_selection.py +0 -573
benchmark_selection.py DELETED
@@ -1,573 +0,0 @@
- """
- Benchmark selection module for Dynamic Highscores system.
-
- This module handles browsing, selection, and loading of HuggingFace datasets
- to be used as benchmarks for model evaluation.
- """
-
- import os
- import json
- import gradio as gr
- from huggingface_hub import HfApi, list_datasets
- from datasets import load_dataset, get_dataset_config_names
- from functools import partial
-
- class BenchmarkSelector:
-     """Benchmark selection manager for HuggingFace datasets."""
-
-     def __init__(self, db_manager, auth_manager):
-         """Initialize the benchmark selector.
-
-         Args:
-             db_manager: Database manager instance for benchmark storage
-             auth_manager: Authentication manager instance for access control
-         """
-         self.db_manager = db_manager
-         self.auth_manager = auth_manager
-         self.hf_api = HfApi()
-
-         # Common benchmark categories for filtering
-         self.categories = [
-             "All",
-             "Text Generation",
-             "Question Answering",
-             "Summarization",
-             "Translation",
-             "Classification",
-             "Code Generation",
-             "Reasoning",
-             "Math"
-         ]
-
-         # Common metrics for different benchmark types
-         self.metric_templates = {
-             "Text Generation": ["bleu", "rouge", "meteor"],
-             "Question Answering": ["exact_match", "f1"],
-             "Summarization": ["rouge1", "rouge2", "rougeL"],
-             "Translation": ["bleu", "ter"],
-             "Classification": ["accuracy", "f1", "precision", "recall"],
-             "Code Generation": ["exact_match", "pass@k", "functional_correctness"],
-             "Reasoning": ["accuracy", "consistency"],
-             "Math": ["accuracy", "correct_steps"]
-         }
-
-     def search_datasets(self, query, category="All", limit=50):
-         """Search for datasets on HuggingFace.
-
-         Args:
-             query: Search query string
-             category: Dataset category to filter by
-             limit: Maximum number of results to return
-
-         Returns:
-             list: List of dataset information dictionaries
-         """
-         try:
-             # Apply category filter if not "All"
-             filter_str = None
-             if category != "All":
-                 filter_str = f"task_categories:{category}"
-
-             # Search for datasets
-             datasets = list_datasets(
-                 search=query,
-                 filter=filter_str,
-                 limit=limit
-             )
-
-             # Format results
-             results = []
-             for dataset in datasets:
-                 # Handle cases where description might be missing
-                 dataset_description = ""
-                 if hasattr(dataset, 'description') and dataset.description:
-                     dataset_description = dataset.description[:200] + "..." if len(dataset.description) > 200 else dataset.description
-
-                 # Handle cases where tags might be missing
-                 dataset_tags = []
-                 if hasattr(dataset, 'tags'):
-                     dataset_tags = dataset.tags
-
-                 # Handle cases where downloads might be missing
-                 dataset_downloads = 0
-                 if hasattr(dataset, 'downloads'):
-                     dataset_downloads = dataset.downloads
-
-                 # Handle cases where author might be missing
-                 dataset_author = ""
-                 if hasattr(dataset, 'author'):
-                     dataset_author = dataset.author
-
-                 results.append({
-                     "id": dataset.id,
-                     "name": dataset.id.split("/")[-1],
-                     "author": dataset_author,
-                     "description": dataset_description,
-                     "tags": dataset_tags,
-                     "downloads": dataset_downloads
-                 })
-
-             return results
-         except Exception as e:
-             print(f"Dataset search error: {e}")
-             return []
-
-     def get_dataset_info(self, dataset_id):
-         """Get detailed information about a dataset.
-
-         Args:
-             dataset_id: HuggingFace dataset ID
-
-         Returns:
-             dict: Dataset information
-         """
-         try:
-             # Get dataset info from HuggingFace
-             dataset_info = self.hf_api.dataset_info(dataset_id)
-
-             # Get available configurations
-             configs = []
-             try:
-                 configs = get_dataset_config_names(dataset_id)
-             except Exception as e:
-                 print(f"Error getting dataset configs: {e}")
-
-             # Handle missing attributes safely
-             dataset_description = ""
-             if hasattr(dataset_info, 'description'):
-                 dataset_description = dataset_info.description
-
-             dataset_citation = ""
-             if hasattr(dataset_info, 'citation'):
-                 dataset_citation = dataset_info.citation
-
-             dataset_tags = []
-             if hasattr(dataset_info, 'tags'):
-                 dataset_tags = dataset_info.tags
-
-             dataset_downloads = 0
-             if hasattr(dataset_info, 'downloads'):
-                 dataset_downloads = dataset_info.downloads
-
-             dataset_author = ""
-             if hasattr(dataset_info, 'author'):
-                 dataset_author = dataset_info.author
-
-             # Format result
-             result = {
-                 "id": dataset_info.id,
-                 "name": dataset_info.id.split("/")[-1],
-                 "author": dataset_author,
-                 "description": dataset_description,
-                 "citation": dataset_citation,
-                 "configs": configs,
-                 "tags": dataset_tags,
-                 "downloads": dataset_downloads
-             }
-
-             return result
-         except Exception as e:
-             print(f"Dataset info error: {e}")
-             return None
-
-     def load_dataset_sample(self, dataset_id, config=None, split="train", sample_size=5):
-         """Load a sample from a dataset.
-
-         Args:
-             dataset_id: HuggingFace dataset ID
-             config: Dataset configuration name
-             split: Dataset split to sample from
-             sample_size: Number of samples to load
-
-         Returns:
-             dict: Dataset sample information
-         """
-         try:
-             # Load dataset
-             if config:
-                 dataset = load_dataset(dataset_id, config, split=split)
-             else:
-                 dataset = load_dataset(dataset_id, split=split)
-
-             # Get sample
-             if len(dataset) > sample_size:
-                 sample = dataset.select(range(sample_size))
-             else:
-                 sample = dataset
-
-             # Get features
-             features = list(sample.features.keys())
-
-             # Convert sample to list of dictionaries
-             sample_data = []
-             for item in sample:
-                 sample_item = {}
-                 for key in features:
-                     # Convert non-serializable values to strings
-                     if isinstance(item[key], (list, dict)):
-                         sample_item[key] = str(item[key])
-                     else:
-                         sample_item[key] = item[key]
-                 sample_data.append(sample_item)
-
-             # Format result
-             result = {
-                 "id": dataset_id,
-                 "config": config,
-                 "split": split,
-                 "features": features,
-                 "sample": sample_data,
-                 "total_size": len(dataset)
-             }
-
-             return result
-         except Exception as e:
-             print(f"Dataset sample error: {e}")
-             return None
-
-     def add_benchmark(self, dataset_id, name=None, description=None, metrics=None, config=None):
-         """Add a dataset as a benchmark.
-
-         Args:
-             dataset_id: HuggingFace dataset ID
-             name: Benchmark name (defaults to dataset name)
-             description: Benchmark description (defaults to dataset description)
-             metrics: Metrics to use for evaluation
-             config: Dataset configuration to use
-
-         Returns:
-             int: Benchmark ID if successful, None otherwise
-         """
-         try:
-             # Get dataset info if name or description not provided
-             if not name or not description:
-                 dataset_info = self.get_dataset_info(dataset_id)
-                 if not dataset_info:
-                     return None
-
-                 if not name:
-                     name = dataset_info["name"]
-
-                 if not description:
-                     description = dataset_info["description"]
-
-             # Format dataset ID with config if provided
-             full_dataset_id = dataset_id
-             if config:
-                 full_dataset_id = f"{dataset_id}:{config}"
-
-             # Add benchmark to database
-             benchmark_id = self.db_manager.add_benchmark(
-                 name=name,
-                 dataset_id=full_dataset_id,
-                 description=description,
-                 metrics=metrics
-             )
-
-             return benchmark_id
-         except Exception as e:
-             print(f"Add benchmark error: {e}")
-             return None
-
-     def get_benchmarks(self):
-         """Get all available benchmarks.
-
-         Returns:
-             list: List of benchmark information dictionaries
-         """
-         return self.db_manager.get_benchmarks()
-
- # Benchmark selection UI components
- def create_benchmark_selection_ui(benchmark_selector, auth_manager):
-     """Create the benchmark selection UI components.
-
-     Args:
-         benchmark_selector: Benchmark selector instance
-         auth_manager: Authentication manager instance
-
-     Returns:
-         gr.Blocks: Gradio Blocks component with benchmark selection UI
-     """
-     with gr.Blocks() as benchmark_ui:
-         gr.Markdown("## 📊 Dynamic Highscores Benchmark Selection")
-         gr.Markdown("""
-         ### Add your own datasets from HuggingFace as benchmarks!
-
-         You can add any dataset from HuggingFace to use as a benchmark for evaluating models.
-         Simply enter the dataset ID (e.g., 'squad', 'glue', 'hellaswag') and add it as a benchmark.
-
-         Other users will be able to select your added benchmarks for their model evaluations.
-         """, elem_classes=["info-text"])
-
-         with gr.Tabs() as tabs:
-             with gr.TabItem("➕ Add New Benchmark", id=0):
-                 with gr.Row():
-                     with gr.Column(scale=3):
-                         search_input = gr.Textbox(
-                             placeholder="Search for datasets on HuggingFace...",
-                             label="Search",
-                             show_label=False
-                         )
-
-                     with gr.Column(scale=1):
-                         category_dropdown = gr.Dropdown(
-                             choices=benchmark_selector.categories,
-                             value="All",
-                             label="Category"
-                         )
-
-                     with gr.Column(scale=1):
-                         search_button = gr.Button("Search")
-
-                 dataset_results = gr.Dataframe(
-                     headers=["Name", "Author", "Description", "Downloads"],
-                     datatype=["str", "str", "str", "number"],
-                     label="Search Results",
-                     interactive=True
-                 )
-
-                 with gr.Row():
-                     with gr.Column(scale=2):
-                         dataset_id_input = gr.Textbox(
-                             placeholder="Enter HuggingFace dataset ID (e.g., 'squad', 'glue', 'hellaswag')",
-                             label="Dataset ID",
-                             info="You can enter any dataset ID from HuggingFace"
-                         )
-
-                     with gr.Column(scale=1):
-                         view_button = gr.Button("View Dataset Details")
-
-                 with gr.Accordion("Dataset Details", open=False):
-                     dataset_info = gr.JSON(label="Dataset Information")
-
-                     with gr.Row():
-                         config_dropdown = gr.Dropdown(
-                             label="Configuration",
-                             choices=[],
-                             interactive=True
-                         )
-
-                         split_dropdown = gr.Dropdown(
-                             label="Split",
-                             choices=["train", "validation", "test"],
-                             value="train",
-                             interactive=True
-                         )
-
-                     sample_button = gr.Button("Load Sample")
-
-                     sample_data = gr.Dataframe(
-                         label="Sample Data",
-                         interactive=False
-                     )
-
-                 gr.Markdown("### Add this dataset as a benchmark")
-                 with gr.Row():
-                     with gr.Column(scale=2):
-                         benchmark_name = gr.Textbox(
-                             placeholder="Enter a name for this benchmark",
-                             label="Benchmark Name",
-                             info="A descriptive name for this benchmark"
-                         )
-
-                         benchmark_description = gr.Textbox(
-                             placeholder="Enter a description for this benchmark",
-                             label="Description",
-                             info="Explain what this benchmark evaluates",
-                             lines=3
-                         )
-
-                     with gr.Column(scale=1):
-                         metrics_input = gr.CheckboxGroup(
-                             label="Evaluation Metrics",
-                             choices=[],
-                             interactive=True,
-                             info="Select metrics to use for evaluation"
-                         )
-
-                 with gr.Row():
-                     add_benchmark_button = gr.Button("Add as Benchmark", size="lg", variant="primary")
-
-                 benchmark_status = gr.Markdown("")
-
-             with gr.TabItem("📋 Available Benchmarks", id=1):
-                 gr.Markdown("### Benchmarks available for model evaluation")
-                 gr.Markdown("These benchmarks can be selected when submitting models for evaluation.")
-
-                 with gr.Row():
-                     refresh_benchmarks_button = gr.Button("Refresh Benchmarks")
-                     reload_sample_benchmarks_button = gr.Button("Reload Sample Benchmarks", variant="secondary")
-
-                 reload_status = gr.Markdown("")
-
-                 benchmarks_container = gr.Column()
-                 with benchmarks_container:
-                     no_benchmarks_message = gr.Markdown(
-                         "### No Datasets Added Yet\n\nBe the first to add a benchmark dataset! Go to the 'Add New Benchmark' tab to add a dataset from HuggingFace.",
-                         visible=True
-                     )
-
-                     my_benchmarks = gr.Dataframe(
-                         headers=["ID", "Name", "Dataset", "Description"],
-                         label="Available Benchmarks",
-                         interactive=True,
-                         visible=False
-                     )
-
-         # Event handlers
-         def search_datasets_handler(query, category):
-             if not query:
-                 return None
-
-             results = benchmark_selector.search_datasets(query, category)
-
-             # Format for dataframe
-             formatted_results = []
-             for result in results:
-                 formatted_results.append([
-                     result["name"],
-                     result["author"],
-                     result["description"],
-                     result["downloads"]
-                 ])
-
-             return formatted_results
-
-         def view_dataset_handler(dataset_id):
-             if not dataset_id:
-                 return None, [], None
-
-             dataset_info = benchmark_selector.get_dataset_info(dataset_id)
-
-             if not dataset_info:
-                 return None, [], None
-
-             # Update metrics based on dataset tags
-             metrics = []
-             for category, category_metrics in benchmark_selector.metric_templates.items():
-                 if any(tag.lower() in [t.lower() for t in dataset_info["tags"]] for tag in category.lower().split()):
-                     metrics.extend(category_metrics)
-
-             # Remove duplicates
-             metrics = list(set(metrics))
-
-             return dataset_info, dataset_info["configs"], gr.update(choices=metrics)
-
-         def load_sample_handler(dataset_id, config, split):
-             if not dataset_id:
-                 return None
-
-             sample_info = benchmark_selector.load_dataset_sample(
-                 dataset_id,
-                 config=config if config else None,
-                 split=split
-             )
-
-             if not sample_info:
-                 return None
-
-             return sample_info["sample"]
-
-         def add_benchmark_handler(dataset_id, config, name, description, metrics, request: gr.Request):
-             if not dataset_id:
-                 return "Please enter a dataset ID from HuggingFace."
-
-             # Check if user is logged in
-             user = auth_manager.check_login(request)
-
-             if not user:
-                 return "Please log in to add benchmarks."
-
-             # Add benchmark
-             benchmark_id = benchmark_selector.add_benchmark(
-                 dataset_id=dataset_id,
-                 name=name if name else None,
-                 description=description if description else None,
-                 metrics=metrics if metrics else None,
-                 config=config if config else None
-             )
-
-             if benchmark_id:
-                 return f"✅ Benchmark added successfully with ID: {benchmark_id}\n\nThis dataset is now available for model evaluation. You can view it in the 'Available Benchmarks' tab."
-             else:
-                 return "❌ Failed to add benchmark. Please check the dataset ID and try again."
-
-         def get_benchmarks_handler(request: gr.Request):
-             # Check if user is logged in
-             user = auth_manager.check_login(request)
-
-             if not user:
-                 return gr.update(visible=True), gr.update(visible=False), None
-
-             # Get benchmarks
-             benchmarks = benchmark_selector.get_benchmarks()
-
-             # If no benchmarks, show message
-             if not benchmarks or len(benchmarks) == 0:
-                 return gr.update(visible=True), gr.update(visible=False), None
-
-             # Format for dataframe
-             formatted_benchmarks = []
-             for benchmark in benchmarks:
-                 formatted_benchmarks.append([
-                     benchmark["id"],
-                     benchmark["name"],
-                     benchmark["dataset_id"],
-                     benchmark["description"]
-                 ])
-
-             return gr.update(visible=False), gr.update(visible=True), formatted_benchmarks
-
-         def reload_sample_benchmarks_handler():
-             try:
-                 from sample_benchmarks import add_sample_benchmarks
-                 num_added = add_sample_benchmarks()
-                 return f"✅ Successfully reloaded {num_added} sample benchmarks."
-             except Exception as e:
-                 return f"❌ Error reloading benchmarks: {str(e)}"
-
-         # Connect event handlers
-         search_button.click(
-             fn=search_datasets_handler,
-             inputs=[search_input, category_dropdown],
-             outputs=[dataset_results]
-         )
-
-         view_button.click(
-             fn=view_dataset_handler,
-             inputs=[dataset_id_input],
-             outputs=[dataset_info, config_dropdown, metrics_input]
-         )
-
-         sample_button.click(
-             fn=load_sample_handler,
-             inputs=[dataset_id_input, config_dropdown, split_dropdown],
-             outputs=[sample_data]
-         )
-
-         add_benchmark_button.click(
-             fn=add_benchmark_handler,
-             inputs=[dataset_id_input, config_dropdown, benchmark_name, benchmark_description, metrics_input],
-             outputs=[benchmark_status]
-         )
-
-         refresh_benchmarks_button.click(
-             fn=get_benchmarks_handler,
-             inputs=[],
-             outputs=[no_benchmarks_message, my_benchmarks, my_benchmarks]
-         )
-
-         reload_sample_benchmarks_button.click(
-             fn=reload_sample_benchmarks_handler,
-             inputs=[],
-             outputs=[reload_status]
-         )
-
-         # Initialize benchmarks on load
-         benchmark_ui.load(
-             fn=get_benchmarks_handler,
-             inputs=[],
-             outputs=[no_benchmarks_message, my_benchmarks, my_benchmarks]
-         )
-
-     return benchmark_ui