Update evaluation.py
evaluation.py  CHANGED  (+18 -2)
@@ -113,19 +113,28 @@ def retrieve_ground_truths(question, dataset):
     return None,None # Return None if no match is found
 
 
-
+
 def store_rmse(question, predicted_metrics, ground_truth_metrics):
+    """Calculate and store RMSE for each metric."""
     for metric_name in predicted_metrics:
-        # Ensure both predicted value and ground truth value are numeric before calculating RMSE
         predicted_value = predicted_metrics[metric_name]
+
+        # Get the corresponding ground truth value from ground_truth_metrics
         ground_truth_value = ground_truth_metrics.get(metric_name, None)
 
+        # Debugging: Check the values being compared
+        print(f"Comparing {metric_name}: Predicted = {predicted_value}, Ground Truth = {ground_truth_value}")
+
+        # Ensure both predicted value and ground truth value are numeric before calculating RMSE
         if isinstance(predicted_value, (int, float)) and isinstance(ground_truth_value, (int, float)):
             rmse_value = compute_rmse([predicted_value], [ground_truth_value])
             if rmse_value is not None:
+                print(f"RMSE for {metric_name}: {rmse_value}")
                 if question not in rmse_scores:
                     rmse_scores[question] = {}
                 rmse_scores[question][metric_name] = rmse_value
+        else:
+            print(f"Skipping RMSE for {metric_name}: One or both values are non-numeric")
 
 def calculate_metrics(question, q_dataset, response, docs, time_taken):
     data = load_query_dataset(q_dataset)
@@ -151,6 +160,13 @@ def calculate_metrics(question, q_dataset, response, docs, time_taken):
         "ground truth completeness": ground_truth_completeness
     }
 
+    # Predicted metrics
+    predicted_metrics_rmse = {
+        "context_relevance": context_relevance(question, docs),
+        "context_utilization": context_utilization(response, docs),
+        "completeness": compute_cosine_similarity(response, ground_truth_answer), #completeness(response, ground_truth_answer),
+        "adherence": adherence(response, docs)
+    }
 
     store_rmse(question, predicted_metrics, ground_truth_metrics)
     # Now, make sure the values passed to RMSE calculation are numeric
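
For context when reading this change: compute_rmse is called on single-element lists but is not defined in this diff. The sketch below is a hypothetical stand-in showing one way such a helper could be written for paired numeric lists; the Space's actual implementation may differ.

import math

# Hypothetical stand-in for the compute_rmse helper that store_rmse calls;
# the real definition lives elsewhere in evaluation.py and may differ.
def compute_rmse(predicted_values, ground_truth_values):
    """Root-mean-square error over paired numeric lists; None if the pairing is invalid."""
    if not predicted_values or len(predicted_values) != len(ground_truth_values):
        return None
    squared_errors = [(p - g) ** 2 for p, g in zip(predicted_values, ground_truth_values)]
    return math.sqrt(sum(squared_errors) / len(squared_errors))

# With the single-element lists used in store_rmse, RMSE reduces to |predicted - ground truth|.
assert math.isclose(compute_rmse([0.75], [0.5]), 0.25)

Because store_rmse wraps each value in a one-element list, the stored score is effectively the absolute error per metric.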
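
A short usage sketch of the updated store_rmse, with made-up metric values standing in for the outputs of context_relevance, context_utilization, compute_cosine_similarity and adherence; the import assumes evaluation.py is importable as a module, which may not match the Space's layout.

# Illustrative values only; real numbers come from the metric functions in evaluation.py.
from evaluation import store_rmse, rmse_scores  # assumption: evaluation.py is on the import path

predicted = {
    "context_relevance": 0.72,
    "context_utilization": 0.64,
    "completeness": 0.81,
    "adherence": 1.0,
}
ground_truth = {
    "context_relevance": 0.80,
    "context_utilization": 0.60,
    "completeness": 0.90,
    "adherence": 1.0,
}

store_rmse("example question", predicted, ground_truth)
# rmse_scores now maps the question to per-metric scores, e.g.
# rmse_scores["example question"]["context_relevance"] is roughly 0.08.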