""" Evaluation metrics for document ranking. This file contains implementation of various evaluation metrics for assessing the quality of document rankings. """ import numpy as np def recall_at_k(true_items, predicted_items, k=10): """ Calculate recall at k for a single query. Parameters: true_items (list): List of true relevant items predicted_items (list): List of predicted items (ranked) k (int): Number of top items to consider Returns: float: Recall@k value between 0 and 1 """ if not true_items: return 0.0 # No relevant items to recall # Get the top k predicted items top_k_items = predicted_items[:k] # Count the number of true items in the top k predictions relevant_in_top_k = sum(1 for item in top_k_items if item in true_items) # Calculate recall: (relevant items in top k) / (total relevant items) return relevant_in_top_k / len(true_items) def mean_recall_at_k(true_items_list, predicted_items_list, k=10): """ Calculate mean recall at k across multiple queries. Parameters: true_items_list (list of lists): List of true relevant items for each query predicted_items_list (list of lists): List of predicted items for each query k (int): Number of top items to consider Returns: float: Mean Recall@k value between 0 and 1 """ if len(true_items_list) != len(predicted_items_list): raise ValueError("Number of true item lists must match number of predicted item lists") if not true_items_list: return 0.0 # No data provided # Calculate recall@k for each query recalls = [recall_at_k(true_items, predicted_items, k) for true_items, predicted_items in zip(true_items_list, predicted_items_list)] # Return mean recall@k return sum(recalls) / len(recalls) def average_precision(true_items, predicted_items): """ Calculate average precision for a single query. Parameters: true_items (list): List of true relevant items predicted_items (list): List of predicted items (ranked) Returns: float: Average precision value between 0 and 1 """ if not true_items or not predicted_items: return 0.0 # Track number of relevant items seen and running sum of precision values relevant_count = 0 precision_sum = 0.0 # Calculate precision at each position where a relevant item is found for i, item in enumerate(predicted_items): position = i + 1 # 1-indexed position if item in true_items: relevant_count += 1 # Precision at this position = relevant items seen / position precision_at_position = relevant_count / position precision_sum += precision_at_position # Average precision = sum of precision values / total relevant items total_relevant = len(true_items) return precision_sum / total_relevant if total_relevant > 0 else 0.0 def mean_average_precision(true_items_list, predicted_items_list): """ Calculate mean average precision (MAP) across multiple queries. Parameters: true_items_list (list of lists): List of true relevant items for each query predicted_items_list (list of lists): List of predicted items for each query Returns: float: MAP value between 0 and 1 """ if len(true_items_list) != len(predicted_items_list): raise ValueError("Number of true item lists must match number of predicted item lists") if not true_items_list: return 0.0 # No data provided # Calculate average precision for each query aps = [average_precision(true_items, predicted_items) for true_items, predicted_items in zip(true_items_list, predicted_items_list)] # Return mean average precision return sum(aps) / len(aps) def inverse_ranking(true_items, predicted_items): """ Calculate inverse ranking for the first relevant item. 

def inverse_ranking(true_items, predicted_items):
    """
    Calculate inverse ranking for the first relevant item.

    Parameters:
        true_items (list): List of true relevant items
        predicted_items (list): List of predicted items (ranked)

    Returns:
        float: Inverse ranking value between 0 and 1
    """
    if not true_items or not predicted_items:
        return 0.0

    # Find position of first relevant item (1-indexed)
    for i, item in enumerate(predicted_items):
        if item in true_items:
            rank = i + 1
            return 1.0 / rank  # Inverse ranking

    # No relevant items found in predictions
    return 0.0


def mean_inv_ranking(true_items_list, predicted_items_list):
    """
    Calculate mean inverse ranking (MIR) across multiple queries.

    Parameters:
        true_items_list (list of lists): List of true relevant items for each query
        predicted_items_list (list of lists): List of predicted items for each query

    Returns:
        float: MIR value between 0 and 1
    """
    if len(true_items_list) != len(predicted_items_list):
        raise ValueError("Number of true item lists must match number of predicted item lists")

    if not true_items_list:
        return 0.0  # No data provided

    # Calculate inverse ranking for each query
    inv_ranks = [inverse_ranking(true_items, predicted_items)
                 for true_items, predicted_items in zip(true_items_list, predicted_items_list)]

    # Return mean inverse ranking
    return sum(inv_ranks) / len(inv_ranks)


def ranking(true_items, predicted_items):
    """
    Calculate the rank of the first relevant item.

    Parameters:
        true_items (list): List of true relevant items
        predicted_items (list): List of predicted items (ranked)

    Returns:
        float: Rank of the first relevant item (1-indexed); float('inf') if no
        relevant item appears in the predictions
    """
    if not true_items or not predicted_items:
        return float('inf')  # No relevant items to find

    # Find position of first relevant item (1-indexed)
    for i, item in enumerate(predicted_items):
        if item in true_items:
            return i + 1  # Return rank (1-indexed)

    # No relevant items found in predictions
    return float('inf')


def mean_ranking(true_items_list, predicted_items_list):
    """
    Calculate mean ranking across multiple queries.

    Parameters:
        true_items_list (list of lists): List of true relevant items for each query
        predicted_items_list (list of lists): List of predicted items for each query

    Returns:
        float: Mean ranking value (higher is worse); queries with no retrieved
        relevant item are excluded from the mean
    """
    if len(true_items_list) != len(predicted_items_list):
        raise ValueError("Number of true item lists must match number of predicted item lists")

    if not true_items_list:
        return float('inf')  # No data provided

    # Calculate ranking for each query
    ranks = [ranking(true_items, predicted_items)
             for true_items, predicted_items in zip(true_items_list, predicted_items_list)]

    # Filter out 'inf' values for mean calculation
    finite_ranks = [r for r in ranks if r != float('inf')]

    # Return mean ranking
    return sum(finite_ranks) / len(finite_ranks) if finite_ranks else float('inf')
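
# Minimal usage sketch (toy data, assumed purely for illustration): running the
# module directly sanity-checks the metrics on two small queries.
if __name__ == "__main__":
    true_items_list = [["a", "b", "c"], ["x"]]
    predicted_items_list = [["b", "d", "a", "c"], ["y", "x", "z"]]

    print("Mean Recall@3:", mean_recall_at_k(true_items_list, predicted_items_list, k=3))
    print("MAP:", mean_average_precision(true_items_list, predicted_items_list))
    print("MIR:", mean_inv_ranking(true_items_list, predicted_items_list))
    print("Mean rank:", mean_ranking(true_items_list, predicted_items_list))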