File size: 3,418 Bytes
6efebdc
b85d9b0
 
 
 
 
 
6efebdc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b85d9b0
6efebdc
b85d9b0
 
 
 
 
 
 
6efebdc
b85d9b0
6efebdc
 
b85d9b0
6efebdc
 
 
 
b85d9b0
6efebdc
 
 
 
b85d9b0
6efebdc
 
 
b85d9b0
6efebdc
b85d9b0
6efebdc
b85d9b0
6efebdc
b85d9b0
 
 
 
 
 
 
 
 
 
 
6efebdc
 
 
 
 
 
 
b85d9b0
6efebdc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from pathlib import Path

from leaderboard_tab import (
    create_leaderboard_tab,
    search_leaderboard,
    update_columns_to_show,
)
from utils import load_json_results

# Constants
RERANKER_ABOUT_SECTION = """
## About Reranking Evaluation

The reranking evaluation assesses a model's ability to improve search quality by reordering initially retrieved results. Models are evaluated across multiple unseen Arabic datasets to ensure robust performance.

### Evaluation Metrics
- **MRR@10 (Mean Reciprocal Rank at 10)**: Measures the ranking quality focusing on the first relevant result in top-10
- **NDCG@10 (Normalized DCG at 10)**: Evaluates the ranking quality of all relevant results in top-10
- **MAP (Mean Average Precision)**: Measures the overall precision across all relevant documents

All metrics are averaged across multiple evaluation datasets to provide a comprehensive assessment of model performance.

### Model Requirements
- Must accept query-document pairs as input
- Should output relevance scores for reranking (has cross-attention or similar mechanism for query-document matching)
- Support for Arabic text processing

### Evaluation Process
1. Models are tested on multiple unseen Arabic datasets
2. For each dataset:
   - Initial candidate documents are provided
   - Model reranks the candidates
   - MRR@10, NDCG@10, and MAP are calculated
3. Final scores are averaged across all datasets
4. Models are ranked based on overall performance

### How to Prepare Your Model
- Model should be public on HuggingFace Hub (private models are not supported yet)
- Make sure it works coherently with `sentence-transformers` library
"""

# Global variables
reranking_df = None


def load_reranking_leaderboard():
    """Load and prepare the reranking leaderboard data"""
    global reranking_df

    # Prepare reranking dataframe
    dataframe_path = Path(__file__).parent / "results" / "reranking_results.json"
    reranking_df = load_json_results(
        dataframe_path,
        prepare_for_display=True,
        sort_col="Average Score",
        drop_cols=["Revision", "Task"],
    )
    reranking_df.insert(0, "Rank", range(1, 1 + len(reranking_df)))

    return reranking_df


def reranking_search_leaderboard(model_name, columns_to_show):
    """Search function for reranking leaderboard"""
    return search_leaderboard(reranking_df, model_name, columns_to_show)


def update_reranker_columns_to_show(columns_to_show):
    """Update displayed columns for reranking leaderboard"""
    return update_columns_to_show(reranking_df, columns_to_show)


def create_reranking_tab():
    """Create the complete reranking leaderboard tab"""
    global reranking_df

    # Load data if not already loaded
    if reranking_df is None:
        reranking_df = load_reranking_leaderboard()

    # Define default columns to show
    default_columns = [
        "Rank",
        "Model",
        "Average Score",
        "Model Size (MB)",
        "Context Length",
        "Embedding Dimension",
        "Namaa Global Knowledge",
        "Navid General Knowledge",
    ]

    # Create and return the tab
    return create_leaderboard_tab(
        df=reranking_df,
        initial_columns_to_show=default_columns,
        search_function=reranking_search_leaderboard,
        update_function=update_reranker_columns_to_show,
        about_section=RERANKER_ABOUT_SECTION,
        task_type="Reranker",
    )