""" | |
Evaluation metrics for document ranking. | |
This file contains implementation of various evaluation metrics | |
for assessing the quality of document rankings. | |
""" | |
import numpy as np | |


def recall_at_k(true_items, predicted_items, k=10):
    """
    Calculate recall at k for a single query.

    Parameters:
        true_items (list): List of true relevant items
        predicted_items (list): List of predicted items (ranked)
        k (int): Number of top items to consider

    Returns:
        float: Recall@k value between 0 and 1
    """
    if not true_items:
        return 0.0  # No relevant items to recall

    # Get the top k predicted items
    top_k_items = predicted_items[:k]

    # Count the number of true items in the top k predictions
    relevant_in_top_k = sum(1 for item in top_k_items if item in true_items)

    # Calculate recall: (relevant items in top k) / (total relevant items)
    return relevant_in_top_k / len(true_items)
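

# A worked example (hypothetical document IDs): with true_items = ["d1", "d2", "d3"]
# and predicted_items = ["d1", "x", "d3", "y", "d2"], the top 3 predictions are
# ["d1", "x", "d3"], of which two are relevant, so
# recall_at_k(true_items, predicted_items, k=3) == 2 / 3 ≈ 0.667.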


def mean_recall_at_k(true_items_list, predicted_items_list, k=10):
    """
    Calculate mean recall at k across multiple queries.

    Parameters:
        true_items_list (list of lists): List of true relevant items for each query
        predicted_items_list (list of lists): List of predicted items for each query
        k (int): Number of top items to consider

    Returns:
        float: Mean Recall@k value between 0 and 1
    """
    if len(true_items_list) != len(predicted_items_list):
        raise ValueError("Number of true item lists must match number of predicted item lists")

    if not true_items_list:
        return 0.0  # No data provided

    # Calculate recall@k for each query
    recalls = [recall_at_k(true_items, predicted_items, k)
               for true_items, predicted_items in zip(true_items_list, predicted_items_list)]

    # Return mean recall@k
    return sum(recalls) / len(recalls)
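

# For example (hypothetical data), with two queries
#   query 1: true ["a", "b"], predicted ["a", "x", "b"] -> recall@2 = 1/2
#   query 2: true ["c"],      predicted ["c", "y"]      -> recall@2 = 1/1
# mean_recall_at_k returns (0.5 + 1.0) / 2 = 0.75. The mean_* helpers below
# aggregate their per-query metrics in the same way.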


def average_precision(true_items, predicted_items):
    """
    Calculate average precision for a single query.

    Parameters:
        true_items (list): List of true relevant items
        predicted_items (list): List of predicted items (ranked)

    Returns:
        float: Average precision value between 0 and 1
    """
    if not true_items or not predicted_items:
        return 0.0

    # Track number of relevant items seen and running sum of precision values
    relevant_count = 0
    precision_sum = 0.0

    # Calculate precision at each position where a relevant item is found
    for i, item in enumerate(predicted_items):
        position = i + 1  # 1-indexed position
        if item in true_items:
            relevant_count += 1
            # Precision at this position = relevant items seen / position
            precision_sum += relevant_count / position

    # Average precision = sum of precision values / total relevant items
    # (the guard above ensures true_items is non-empty here)
    return precision_sum / len(true_items)
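

# A worked example (hypothetical IDs): with true_items = ["a", "b"] and
# predicted_items = ["a", "x", "b", "y"], relevant items appear at ranks 1 and 3,
# contributing precisions 1/1 and 2/3, so
# average_precision(true_items, predicted_items) == (1/1 + 2/3) / 2 ≈ 0.833.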


def mean_average_precision(true_items_list, predicted_items_list):
    """
    Calculate mean average precision (MAP) across multiple queries.

    Parameters:
        true_items_list (list of lists): List of true relevant items for each query
        predicted_items_list (list of lists): List of predicted items for each query

    Returns:
        float: MAP value between 0 and 1
    """
    if len(true_items_list) != len(predicted_items_list):
        raise ValueError("Number of true item lists must match number of predicted item lists")

    if not true_items_list:
        return 0.0  # No data provided

    # Calculate average precision for each query
    aps = [average_precision(true_items, predicted_items)
           for true_items, predicted_items in zip(true_items_list, predicted_items_list)]

    # Return mean average precision
    return sum(aps) / len(aps)


def inverse_ranking(true_items, predicted_items):
    """
    Calculate the inverse ranking (also known as the reciprocal rank)
    of the first relevant item.

    Parameters:
        true_items (list): List of true relevant items
        predicted_items (list): List of predicted items (ranked)

    Returns:
        float: Inverse ranking value between 0 and 1
    """
    if not true_items or not predicted_items:
        return 0.0

    # Find position of first relevant item (1-indexed)
    for i, item in enumerate(predicted_items):
        if item in true_items:
            rank = i + 1
            return 1.0 / rank  # Inverse ranking

    # No relevant items found in predictions
    return 0.0
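

# Example (hypothetical IDs): with true_items = ["b"] and
# predicted_items = ["x", "b", "y"], the first relevant item sits at rank 2,
# so inverse_ranking(true_items, predicted_items) == 1 / 2 == 0.5.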


def mean_inv_ranking(true_items_list, predicted_items_list):
    """
    Calculate mean inverse ranking (MIR) across multiple queries,
    equivalent to the mean reciprocal rank (MRR).

    Parameters:
        true_items_list (list of lists): List of true relevant items for each query
        predicted_items_list (list of lists): List of predicted items for each query

    Returns:
        float: MIR value between 0 and 1
    """
    if len(true_items_list) != len(predicted_items_list):
        raise ValueError("Number of true item lists must match number of predicted item lists")

    if not true_items_list:
        return 0.0  # No data provided

    # Calculate inverse ranking for each query
    inv_ranks = [inverse_ranking(true_items, predicted_items)
                 for true_items, predicted_items in zip(true_items_list, predicted_items_list)]

    # Return mean inverse ranking
    return sum(inv_ranks) / len(inv_ranks)


def ranking(true_items, predicted_items):
    """
    Calculate the rank of the first relevant item.

    Parameters:
        true_items (list): List of true relevant items
        predicted_items (list): List of predicted items (ranked)

    Returns:
        float: Rank of the first relevant item (1-indexed), or
            float('inf') if no relevant item is retrieved
    """
    if not true_items or not predicted_items:
        return float('inf')  # No relevant items to find

    # Find position of first relevant item (1-indexed)
    for i, item in enumerate(predicted_items):
        if item in true_items:
            return i + 1  # Return rank (1-indexed)

    # No relevant items found in predictions
    return float('inf')
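

# Example (hypothetical IDs): for true_items = ["b"] and
# predicted_items = ["x", "b", "y"], ranking(true_items, predicted_items) == 2,
# while ranking(["b"], ["x", "y"]) == float('inf').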


def mean_ranking(true_items_list, predicted_items_list):
    """
    Calculate mean ranking across multiple queries.

    Queries where no relevant item is retrieved (rank of float('inf'))
    are excluded from the mean.

    Parameters:
        true_items_list (list of lists): List of true relevant items for each query
        predicted_items_list (list of lists): List of predicted items for each query

    Returns:
        float: Mean ranking value (higher is worse)
    """
    if len(true_items_list) != len(predicted_items_list):
        raise ValueError("Number of true item lists must match number of predicted item lists")

    if not true_items_list:
        return float('inf')  # No data provided

    # Calculate ranking for each query
    ranks = [ranking(true_items, predicted_items)
             for true_items, predicted_items in zip(true_items_list, predicted_items_list)]

    # Filter out 'inf' values for mean calculation
    finite_ranks = [r for r in ranks if r != float('inf')]

    # Return mean ranking
    return sum(finite_ranks) / len(finite_ranks) if finite_ranks else float('inf')
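

# A minimal self-check of the aggregate metrics on two hypothetical queries;
# the item IDs and expected values are illustrative, not from a real dataset.
if __name__ == "__main__":
    true_lists = [["a", "b"], ["c"]]
    pred_lists = [["a", "x", "b", "y"], ["z", "c"]]

    print("Mean Recall@2:", mean_recall_at_k(true_lists, pred_lists, k=2))  # (0.5 + 1.0) / 2 = 0.75
    print("MAP:          ", mean_average_precision(true_lists, pred_lists))  # (0.833 + 0.5) / 2 ~= 0.667
    print("MIR (MRR):    ", mean_inv_ranking(true_lists, pred_lists))        # (1.0 + 0.5) / 2 = 0.75
    print("Mean rank:    ", mean_ranking(true_lists, pred_lists))            # (1 + 2) / 2 = 1.5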