Quazim0t0 committed on
Commit 366c4b0 · verified · 1 Parent(s): e2b775d

Delete benchmark_selection.py

Files changed (1)
  1. benchmark_selection.py +0 -573
benchmark_selection.py DELETED
@@ -1,573 +0,0 @@
- """
- Benchmark selection module for Dynamic Highscores system.
-
- This module handles browsing, selection, and loading of HuggingFace datasets
- to be used as benchmarks for model evaluation.
- """
-
- import os
- import json
- import gradio as gr
- from huggingface_hub import HfApi, list_datasets
- from datasets import load_dataset, get_dataset_config_names
- from functools import partial
-
- class BenchmarkSelector:
-     """Benchmark selection manager for HuggingFace datasets."""
-
-     def __init__(self, db_manager, auth_manager):
-         """Initialize the benchmark selector.
-
-         Args:
-             db_manager: Database manager instance for benchmark storage
-             auth_manager: Authentication manager instance for access control
-         """
-         self.db_manager = db_manager
-         self.auth_manager = auth_manager
-         self.hf_api = HfApi()
-
-         # Common benchmark categories for filtering
-         self.categories = [
-             "All",
-             "Text Generation",
-             "Question Answering",
-             "Summarization",
-             "Translation",
-             "Classification",
-             "Code Generation",
-             "Reasoning",
-             "Math"
-         ]
-
-         # Common metrics for different benchmark types
-         self.metric_templates = {
-             "Text Generation": ["bleu", "rouge", "meteor"],
-             "Question Answering": ["exact_match", "f1"],
-             "Summarization": ["rouge1", "rouge2", "rougeL"],
-             "Translation": ["bleu", "ter"],
-             "Classification": ["accuracy", "f1", "precision", "recall"],
-             "Code Generation": ["exact_match", "pass@k", "functional_correctness"],
-             "Reasoning": ["accuracy", "consistency"],
-             "Math": ["accuracy", "correct_steps"]
-         }
-
-     def search_datasets(self, query, category="All", limit=50):
-         """Search for datasets on HuggingFace.
-
-         Args:
-             query: Search query string
-             category: Dataset category to filter by
-             limit: Maximum number of results to return
-
-         Returns:
-             list: List of dataset information dictionaries
-         """
-         try:
-             # Apply category filter if not "All"
-             filter_str = None
-             if category != "All":
-                 filter_str = f"task_categories:{category}"
-
-             # Search for datasets
-             datasets = list_datasets(
-                 search=query,
-                 filter=filter_str,
-                 limit=limit
-             )
-
-             # Format results
-             results = []
-             for dataset in datasets:
-                 # Handle cases where description might be missing
-                 dataset_description = ""
-                 if hasattr(dataset, 'description') and dataset.description:
-                     dataset_description = dataset.description[:200] + "..." if len(dataset.description) > 200 else dataset.description
-
-                 # Handle cases where tags might be missing
-                 dataset_tags = []
-                 if hasattr(dataset, 'tags'):
-                     dataset_tags = dataset.tags
-
-                 # Handle cases where downloads might be missing
-                 dataset_downloads = 0
-                 if hasattr(dataset, 'downloads'):
-                     dataset_downloads = dataset.downloads
-
-                 # Handle cases where author might be missing
-                 dataset_author = ""
-                 if hasattr(dataset, 'author'):
-                     dataset_author = dataset.author
-
-                 results.append({
-                     "id": dataset.id,
-                     "name": dataset.id.split("/")[-1],
-                     "author": dataset_author,
-                     "description": dataset_description,
-                     "tags": dataset_tags,
-                     "downloads": dataset_downloads
-                 })
-
-             return results
-         except Exception as e:
-             print(f"Dataset search error: {e}")
-             return []
-
-     def get_dataset_info(self, dataset_id):
-         """Get detailed information about a dataset.
-
-         Args:
-             dataset_id: HuggingFace dataset ID
-
-         Returns:
-             dict: Dataset information
-         """
-         try:
-             # Get dataset info from HuggingFace
-             dataset_info = self.hf_api.dataset_info(dataset_id)
-
-             # Get available configurations
-             configs = []
-             try:
-                 configs = get_dataset_config_names(dataset_id)
-             except Exception as e:
-                 print(f"Error getting dataset configs: {e}")
-
-             # Handle missing attributes safely
-             dataset_description = ""
-             if hasattr(dataset_info, 'description'):
-                 dataset_description = dataset_info.description
-
-             dataset_citation = ""
-             if hasattr(dataset_info, 'citation'):
-                 dataset_citation = dataset_info.citation
-
-             dataset_tags = []
-             if hasattr(dataset_info, 'tags'):
-                 dataset_tags = dataset_info.tags
-
-             dataset_downloads = 0
-             if hasattr(dataset_info, 'downloads'):
-                 dataset_downloads = dataset_info.downloads
-
-             dataset_author = ""
-             if hasattr(dataset_info, 'author'):
-                 dataset_author = dataset_info.author
-
-             # Format result
-             result = {
-                 "id": dataset_info.id,
-                 "name": dataset_info.id.split("/")[-1],
-                 "author": dataset_author,
-                 "description": dataset_description,
-                 "citation": dataset_citation,
-                 "configs": configs,
-                 "tags": dataset_tags,
-                 "downloads": dataset_downloads
-             }
-
-             return result
-         except Exception as e:
-             print(f"Dataset info error: {e}")
-             return None
-
-     def load_dataset_sample(self, dataset_id, config=None, split="train", sample_size=5):
-         """Load a sample from a dataset.
-
-         Args:
-             dataset_id: HuggingFace dataset ID
-             config: Dataset configuration name
-             split: Dataset split to sample from
-             sample_size: Number of samples to load
-
-         Returns:
-             dict: Dataset sample information
-         """
-         try:
-             # Load dataset
-             if config:
-                 dataset = load_dataset(dataset_id, config, split=split)
-             else:
-                 dataset = load_dataset(dataset_id, split=split)
-
-             # Get sample
-             if len(dataset) > sample_size:
-                 sample = dataset.select(range(sample_size))
-             else:
-                 sample = dataset
-
-             # Get features
-             features = list(sample.features.keys())
-
-             # Convert sample to list of dictionaries
-             sample_data = []
-             for item in sample:
-                 sample_item = {}
-                 for key in features:
-                     # Convert non-serializable values to strings
-                     if isinstance(item[key], (list, dict)):
-                         sample_item[key] = str(item[key])
-                     else:
-                         sample_item[key] = item[key]
-                 sample_data.append(sample_item)
-
-             # Format result
-             result = {
-                 "id": dataset_id,
-                 "config": config,
-                 "split": split,
-                 "features": features,
-                 "sample": sample_data,
-                 "total_size": len(dataset)
-             }
-
-             return result
-         except Exception as e:
-             print(f"Dataset sample error: {e}")
-             return None
-
-     def add_benchmark(self, dataset_id, name=None, description=None, metrics=None, config=None):
-         """Add a dataset as a benchmark.
-
-         Args:
-             dataset_id: HuggingFace dataset ID
-             name: Benchmark name (defaults to dataset name)
-             description: Benchmark description (defaults to dataset description)
-             metrics: Metrics to use for evaluation
-             config: Dataset configuration to use
-
-         Returns:
-             int: Benchmark ID if successful, None otherwise
-         """
-         try:
-             # Get dataset info if name or description not provided
-             if not name or not description:
-                 dataset_info = self.get_dataset_info(dataset_id)
-                 if not dataset_info:
-                     return None
-
-                 if not name:
-                     name = dataset_info["name"]
-
-                 if not description:
-                     description = dataset_info["description"]
-
-             # Format dataset ID with config if provided
-             full_dataset_id = dataset_id
-             if config:
-                 full_dataset_id = f"{dataset_id}:{config}"
-
-             # Add benchmark to database
-             benchmark_id = self.db_manager.add_benchmark(
-                 name=name,
-                 dataset_id=full_dataset_id,
-                 description=description,
-                 metrics=metrics
-             )
-
-             return benchmark_id
-         except Exception as e:
-             print(f"Add benchmark error: {e}")
-             return None
-
-     def get_benchmarks(self):
-         """Get all available benchmarks.
-
-         Returns:
-             list: List of benchmark information dictionaries
-         """
-         return self.db_manager.get_benchmarks()
-
- # Benchmark selection UI components
- def create_benchmark_selection_ui(benchmark_selector, auth_manager):
-     """Create the benchmark selection UI components.
-
-     Args:
-         benchmark_selector: Benchmark selector instance
-         auth_manager: Authentication manager instance
-
-     Returns:
-         gr.Blocks: Gradio Blocks component with benchmark selection UI
-     """
-     with gr.Blocks() as benchmark_ui:
-         gr.Markdown("## 📊 Dynamic Highscores Benchmark Selection")
-         gr.Markdown("""
-         ### Add your own datasets from HuggingFace as benchmarks!
-
-         You can add any dataset from HuggingFace to use as a benchmark for evaluating models.
-         Simply enter the dataset ID (e.g., 'squad', 'glue', 'hellaswag') and add it as a benchmark.
-
-         Other users will be able to select your added benchmarks for their model evaluations.
-         """, elem_classes=["info-text"])
-
-         with gr.Tabs() as tabs:
-             with gr.TabItem("➕ Add New Benchmark", id=0):
-                 with gr.Row():
-                     with gr.Column(scale=3):
-                         search_input = gr.Textbox(
-                             placeholder="Search for datasets on HuggingFace...",
-                             label="Search",
-                             show_label=False
-                         )
-
-                     with gr.Column(scale=1):
-                         category_dropdown = gr.Dropdown(
-                             choices=benchmark_selector.categories,
-                             value="All",
-                             label="Category"
-                         )
-
-                     with gr.Column(scale=1):
-                         search_button = gr.Button("Search")
-
-                 dataset_results = gr.Dataframe(
-                     headers=["Name", "Author", "Description", "Downloads"],
-                     datatype=["str", "str", "str", "number"],
-                     label="Search Results",
-                     interactive=True
-                 )
-
-                 with gr.Row():
-                     with gr.Column(scale=2):
-                         dataset_id_input = gr.Textbox(
-                             placeholder="Enter HuggingFace dataset ID (e.g., 'squad', 'glue', 'hellaswag')",
-                             label="Dataset ID",
-                             info="You can enter any dataset ID from HuggingFace"
-                         )
-
-                     with gr.Column(scale=1):
-                         view_button = gr.Button("View Dataset Details")
-
-                 with gr.Accordion("Dataset Details", open=False):
-                     dataset_info = gr.JSON(label="Dataset Information")
-
-                     with gr.Row():
-                         config_dropdown = gr.Dropdown(
-                             label="Configuration",
-                             choices=[],
-                             interactive=True
-                         )
-
-                         split_dropdown = gr.Dropdown(
-                             label="Split",
-                             choices=["train", "validation", "test"],
-                             value="train",
-                             interactive=True
-                         )
-
-                     sample_button = gr.Button("Load Sample")
-
-                     sample_data = gr.Dataframe(
-                         label="Sample Data",
-                         interactive=False
-                     )
-
-                 gr.Markdown("### Add this dataset as a benchmark")
-                 with gr.Row():
-                     with gr.Column(scale=2):
-                         benchmark_name = gr.Textbox(
-                             placeholder="Enter a name for this benchmark",
-                             label="Benchmark Name",
-                             info="A descriptive name for this benchmark"
-                         )
-
-                         benchmark_description = gr.Textbox(
-                             placeholder="Enter a description for this benchmark",
-                             label="Description",
-                             info="Explain what this benchmark evaluates",
-                             lines=3
-                         )
-
-                     with gr.Column(scale=1):
-                         metrics_input = gr.CheckboxGroup(
-                             label="Evaluation Metrics",
-                             choices=[],
-                             interactive=True,
-                             info="Select metrics to use for evaluation"
-                         )
-
-                 with gr.Row():
-                     add_benchmark_button = gr.Button("Add as Benchmark", size="lg", variant="primary")
-
-                 benchmark_status = gr.Markdown("")
-
-             with gr.TabItem("📋 Available Benchmarks", id=1):
-                 gr.Markdown("### Benchmarks available for model evaluation")
-                 gr.Markdown("These benchmarks can be selected when submitting models for evaluation.")
-
-                 with gr.Row():
-                     refresh_benchmarks_button = gr.Button("Refresh Benchmarks")
-                     reload_sample_benchmarks_button = gr.Button("Reload Sample Benchmarks", variant="secondary")
-
-                 reload_status = gr.Markdown("")
-
-                 benchmarks_container = gr.Column()
-                 with benchmarks_container:
-                     no_benchmarks_message = gr.Markdown(
-                         "### No Datasets Added Yet\n\nBe the first to add a benchmark dataset! Go to the 'Add New Benchmark' tab to add a dataset from HuggingFace.",
-                         visible=True
-                     )
-
-                     my_benchmarks = gr.Dataframe(
-                         headers=["ID", "Name", "Dataset", "Description"],
-                         label="Available Benchmarks",
-                         interactive=True,
-                         visible=False
-                     )
-
-         # Event handlers
-         def search_datasets_handler(query, category):
-             if not query:
-                 return None
-
-             results = benchmark_selector.search_datasets(query, category)
-
-             # Format for dataframe
-             formatted_results = []
-             for result in results:
-                 formatted_results.append([
-                     result["name"],
-                     result["author"],
-                     result["description"],
-                     result["downloads"]
-                 ])
-
-             return formatted_results
-
-         def view_dataset_handler(dataset_id):
-             if not dataset_id:
-                 return None, [], None
-
-             dataset_info = benchmark_selector.get_dataset_info(dataset_id)
-
-             if not dataset_info:
-                 return None, [], None
-
-             # Update metrics based on dataset tags
-             metrics = []
-             for category, category_metrics in benchmark_selector.metric_templates.items():
-                 if any(tag.lower() in [t.lower() for t in dataset_info["tags"]] for tag in category.lower().split()):
-                     metrics.extend(category_metrics)
-
-             # Remove duplicates
-             metrics = list(set(metrics))
-
-             return dataset_info, dataset_info["configs"], gr.update(choices=metrics)
-
-         def load_sample_handler(dataset_id, config, split):
-             if not dataset_id:
-                 return None
-
-             sample_info = benchmark_selector.load_dataset_sample(
-                 dataset_id,
-                 config=config if config else None,
-                 split=split
-             )
-
-             if not sample_info:
-                 return None
-
-             return sample_info["sample"]
-
-         def add_benchmark_handler(dataset_id, config, name, description, metrics, request: gr.Request):
-             if not dataset_id:
-                 return "Please enter a dataset ID from HuggingFace."
-
-             # Check if user is logged in
-             user = auth_manager.check_login(request)
-
-             if not user:
-                 return "Please log in to add benchmarks."
-
-             # Add benchmark
-             benchmark_id = benchmark_selector.add_benchmark(
-                 dataset_id=dataset_id,
-                 name=name if name else None,
-                 description=description if description else None,
-                 metrics=metrics if metrics else None,
-                 config=config if config else None
-             )
-
-             if benchmark_id:
-                 return f"✅ Benchmark added successfully with ID: {benchmark_id}\n\nThis dataset is now available for model evaluation. You can view it in the 'Available Benchmarks' tab."
-             else:
-                 return "❌ Failed to add benchmark. Please check the dataset ID and try again."
-
-         def get_benchmarks_handler(request: gr.Request):
-             # Check if user is logged in
-             user = auth_manager.check_login(request)
-
-             if not user:
-                 return gr.update(visible=True), gr.update(visible=False), None
-
-             # Get benchmarks
-             benchmarks = benchmark_selector.get_benchmarks()
-
-             # If no benchmarks, show message
-             if not benchmarks or len(benchmarks) == 0:
-                 return gr.update(visible=True), gr.update(visible=False), None
-
-             # Format for dataframe
-             formatted_benchmarks = []
-             for benchmark in benchmarks:
-                 formatted_benchmarks.append([
-                     benchmark["id"],
-                     benchmark["name"],
-                     benchmark["dataset_id"],
-                     benchmark["description"]
-                 ])
-
-             return gr.update(visible=False), gr.update(visible=True), formatted_benchmarks
-
-         def reload_sample_benchmarks_handler():
-             try:
-                 from sample_benchmarks import add_sample_benchmarks
-                 num_added = add_sample_benchmarks()
-                 return f"✅ Successfully reloaded {num_added} sample benchmarks."
-             except Exception as e:
-                 return f"❌ Error reloading benchmarks: {str(e)}"
-
-         # Connect event handlers
-         search_button.click(
-             fn=search_datasets_handler,
-             inputs=[search_input, category_dropdown],
-             outputs=[dataset_results]
-         )
-
-         view_button.click(
-             fn=view_dataset_handler,
-             inputs=[dataset_id_input],
-             outputs=[dataset_info, config_dropdown, metrics_input]
-         )
-
-         sample_button.click(
-             fn=load_sample_handler,
-             inputs=[dataset_id_input, config_dropdown, split_dropdown],
-             outputs=[sample_data]
-         )
-
-         add_benchmark_button.click(
-             fn=add_benchmark_handler,
-             inputs=[dataset_id_input, config_dropdown, benchmark_name, benchmark_description, metrics_input],
-             outputs=[benchmark_status]
-         )
-
-         refresh_benchmarks_button.click(
-             fn=get_benchmarks_handler,
-             inputs=[],
-             outputs=[no_benchmarks_message, my_benchmarks, my_benchmarks]
-         )
-
-         reload_sample_benchmarks_button.click(
-             fn=reload_sample_benchmarks_handler,
-             inputs=[],
-             outputs=[reload_status]
-         )
-
-         # Initialize benchmarks on load
-         benchmark_ui.load(
-             fn=get_benchmarks_handler,
-             inputs=[],
-             outputs=[no_benchmarks_message, my_benchmarks, my_benchmarks]
-         )
-
-     return benchmark_ui