"""
Evaluation metrics for document ranking.
This file contains implementation of various evaluation metrics
for assessing the quality of document rankings.
"""
import numpy as np


def recall_at_k(true_items, predicted_items, k=10):
    """
    Calculate recall at k for a single query.

    Parameters:
        true_items (list): List of true relevant items
        predicted_items (list): List of predicted items (ranked)
        k (int): Number of top items to consider

    Returns:
        float: Recall@k value between 0 and 1
    """
    if not true_items:
        return 0.0  # No relevant items to recall

    # Get the top k predicted items
    top_k_items = predicted_items[:k]

    # Count the number of true items in the top k predictions
    relevant_in_top_k = sum(1 for item in top_k_items if item in true_items)

    # Calculate recall: (relevant items in top k) / (total relevant items)
    return relevant_in_top_k / len(true_items)
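
# Illustrative usage (hypothetical document IDs):
# recall_at_k(["d1", "d2", "d3"], ["d4", "d1", "d5", "d2"], k=2) returns 1/3,
# since only "d1" of the three relevant items appears in the top-2 predictions.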


def mean_recall_at_k(true_items_list, predicted_items_list, k=10):
    """
    Calculate mean recall at k across multiple queries.

    Parameters:
        true_items_list (list of lists): List of true relevant items for each query
        predicted_items_list (list of lists): List of predicted items for each query
        k (int): Number of top items to consider

    Returns:
        float: Mean Recall@k value between 0 and 1
    """
    if len(true_items_list) != len(predicted_items_list):
        raise ValueError("Number of true item lists must match number of predicted item lists")

    if not true_items_list:
        return 0.0  # No data provided

    # Calculate recall@k for each query
    recalls = [recall_at_k(true_items, predicted_items, k)
               for true_items, predicted_items in zip(true_items_list, predicted_items_list)]

    # Return mean recall@k
    return sum(recalls) / len(recalls)


def average_precision(true_items, predicted_items):
    """
    Calculate average precision for a single query.

    Parameters:
        true_items (list): List of true relevant items
        predicted_items (list): List of predicted items (ranked)

    Returns:
        float: Average precision value between 0 and 1
    """
    if not true_items or not predicted_items:
        return 0.0

    # Track number of relevant items seen and running sum of precision values
    relevant_count = 0
    precision_sum = 0.0

    # Calculate precision at each position where a relevant item is found
    for i, item in enumerate(predicted_items):
        position = i + 1  # 1-indexed position
        if item in true_items:
            relevant_count += 1
            # Precision at this position = relevant items seen / position
            precision_at_position = relevant_count / position
            precision_sum += precision_at_position

    # Average precision = sum of precision values / total relevant items
    total_relevant = len(true_items)
    return precision_sum / total_relevant if total_relevant > 0 else 0.0
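
# Illustrative usage (hypothetical document IDs): with true_items = ["d1", "d2"] and
# predicted_items = ["d3", "d1", "d2"], relevant hits occur at ranks 2 and 3, giving
# precisions 1/2 and 2/3, so average_precision returns (1/2 + 2/3) / 2 = 7/12.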


def mean_average_precision(true_items_list, predicted_items_list):
    """
    Calculate mean average precision (MAP) across multiple queries.

    Parameters:
        true_items_list (list of lists): List of true relevant items for each query
        predicted_items_list (list of lists): List of predicted items for each query

    Returns:
        float: MAP value between 0 and 1
    """
    if len(true_items_list) != len(predicted_items_list):
        raise ValueError("Number of true item lists must match number of predicted item lists")

    if not true_items_list:
        return 0.0  # No data provided

    # Calculate average precision for each query
    aps = [average_precision(true_items, predicted_items)
           for true_items, predicted_items in zip(true_items_list, predicted_items_list)]

    # Return mean average precision
    return sum(aps) / len(aps)


def inverse_ranking(true_items, predicted_items):
    """
    Calculate inverse ranking for the first relevant item.

    Parameters:
        true_items (list): List of true relevant items
        predicted_items (list): List of predicted items (ranked)

    Returns:
        float: Inverse ranking value between 0 and 1
    """
    if not true_items or not predicted_items:
        return 0.0

    # Find position of first relevant item (1-indexed)
    for i, item in enumerate(predicted_items):
        if item in true_items:
            rank = i + 1
            return 1.0 / rank  # Inverse ranking

    # No relevant items found in predictions
    return 0.0
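
# Illustrative usage (hypothetical document IDs): if the first relevant item appears
# at rank 4, inverse_ranking returns 0.25; if no relevant item is retrieved, it
# returns 0.0. Averaging this value over queries (mean_inv_ranking below) gives the
# familiar mean reciprocal rank (MRR).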


def mean_inv_ranking(true_items_list, predicted_items_list):
    """
    Calculate mean inverse ranking (MIR) across multiple queries.

    Parameters:
        true_items_list (list of lists): List of true relevant items for each query
        predicted_items_list (list of lists): List of predicted items for each query

    Returns:
        float: MIR value between 0 and 1
    """
    if len(true_items_list) != len(predicted_items_list):
        raise ValueError("Number of true item lists must match number of predicted item lists")

    if not true_items_list:
        return 0.0  # No data provided

    # Calculate inverse ranking for each query
    inv_ranks = [inverse_ranking(true_items, predicted_items)
                 for true_items, predicted_items in zip(true_items_list, predicted_items_list)]

    # Return mean inverse ranking
    return sum(inv_ranks) / len(inv_ranks)


def ranking(true_items, predicted_items):
    """
    Calculate the rank of the first relevant item.

    Parameters:
        true_items (list): List of true relevant items
        predicted_items (list): List of predicted items (ranked)

    Returns:
        float: Rank of the first relevant item (1-indexed), or float('inf') if no
            relevant item appears in the predictions
    """
    if not true_items or not predicted_items:
        return float('inf')  # No relevant items to find

    # Find position of first relevant item (1-indexed)
    for i, item in enumerate(predicted_items):
        if item in true_items:
            return i + 1  # Return rank (1-indexed)

    # No relevant items found in predictions
    return float('inf')
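
# Illustrative usage (hypothetical document IDs): ranking(["d2"], ["d1", "d2", "d3"])
# returns 2, while ranking(["d9"], ["d1", "d2", "d3"]) returns float('inf') because
# "d9" is never retrieved; mean_ranking below averages only the finite ranks.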


def mean_ranking(true_items_list, predicted_items_list):
    """
    Calculate mean ranking across multiple queries.

    Parameters:
        true_items_list (list of lists): List of true relevant items for each query
        predicted_items_list (list of lists): List of predicted items for each query

    Returns:
        float: Mean ranking value (higher is worse)
    """
    if len(true_items_list) != len(predicted_items_list):
        raise ValueError("Number of true item lists must match number of predicted item lists")

    if not true_items_list:
        return float('inf')  # No data provided

    # Calculate ranking for each query
    ranks = [ranking(true_items, predicted_items)
             for true_items, predicted_items in zip(true_items_list, predicted_items_list)]

    # Filter out 'inf' values for mean calculation
    finite_ranks = [r for r in ranks if r != float('inf')]

    # Return mean ranking
    return sum(finite_ranks) / len(finite_ranks) if finite_ranks else float('inf')
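

# Minimal usage sketch (hypothetical queries and document IDs, for illustration only).
if __name__ == "__main__":
    true_items_list = [["d1", "d2"], ["d5"]]
    predicted_items_list = [["d3", "d1", "d2"], ["d5", "d6"]]

    print("Mean Recall@2:", mean_recall_at_k(true_items_list, predicted_items_list, k=2))
    print("MAP:", mean_average_precision(true_items_list, predicted_items_list))
    print("MRR (mean_inv_ranking):", mean_inv_ranking(true_items_list, predicted_items_list))
    print("Mean rank of first hit:", mean_ranking(true_items_list, predicted_items_list))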