Update evaluation.py
evaluation.py  CHANGED  (+18 -2)
@@ -113,19 +113,28 @@ def retrieve_ground_truths(question, dataset):
     return None,None # Return None if no match is found
 
 
-
+
 def store_rmse(question, predicted_metrics, ground_truth_metrics):
+    """Calculate and store RMSE for each metric."""
     for metric_name in predicted_metrics:
-        # Ensure both predicted value and ground truth value are numeric before calculating RMSE
         predicted_value = predicted_metrics[metric_name]
+
+        # Get the corresponding ground truth value from ground_truth_metrics
         ground_truth_value = ground_truth_metrics.get(metric_name, None)
 
+        # Debugging: Check the values being compared
+        print(f"Comparing {metric_name}: Predicted = {predicted_value}, Ground Truth = {ground_truth_value}")
+
+        # Ensure both predicted value and ground truth value are numeric before calculating RMSE
         if isinstance(predicted_value, (int, float)) and isinstance(ground_truth_value, (int, float)):
             rmse_value = compute_rmse([predicted_value], [ground_truth_value])
             if rmse_value is not None:
+                print(f"RMSE for {metric_name}: {rmse_value}")
                 if question not in rmse_scores:
                     rmse_scores[question] = {}
                 rmse_scores[question][metric_name] = rmse_value
+        else:
+            print(f"Skipping RMSE for {metric_name}: One or both values are non-numeric")
 
 def calculate_metrics(question, q_dataset, response, docs, time_taken):
     data = load_query_dataset(q_dataset)
@@ -151,6 +160,13 @@ def calculate_metrics(question, q_dataset, response, docs, time_taken):
         "ground truth completeness": ground_truth_completeness
     }
 
+    # Predicted metrics
+    predicted_metrics_rmse = {
+        "context_relevance": context_relevance(question, docs),
+        "context_utilization": context_utilization(response, docs),
+        "completeness": compute_cosine_similarity(response, ground_truth_answer), #completeness(response, ground_truth_answer),
+        "adherence": adherence(response, docs)
+    }
 
     store_rmse(question, predicted_metrics, ground_truth_metrics)
     # Now, make sure the values passed to RMSE calculation are numeric
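
For context when reading this change: compute_rmse is called on single-element lists but is not defined in this diff. The sketch below is a hypothetical stand-in showing one way such a helper could be written for paired numeric lists; the Space's actual implementation may differ.

import math

# Hypothetical stand-in for the compute_rmse helper that store_rmse calls;
# the real definition lives elsewhere in evaluation.py and may differ.
def compute_rmse(predicted_values, ground_truth_values):
    """Root-mean-square error over paired numeric lists; None if the pairing is invalid."""
    if not predicted_values or len(predicted_values) != len(ground_truth_values):
        return None
    squared_errors = [(p - g) ** 2 for p, g in zip(predicted_values, ground_truth_values)]
    return math.sqrt(sum(squared_errors) / len(squared_errors))

# With the single-element lists used in store_rmse, RMSE reduces to |predicted - ground truth|.
assert math.isclose(compute_rmse([0.75], [0.5]), 0.25)

Because store_rmse wraps each value in a one-element list, the stored score is effectively the absolute error per metric.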
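
A short usage sketch of the updated store_rmse, with made-up metric values standing in for the outputs of context_relevance, context_utilization, compute_cosine_similarity and adherence; the import assumes evaluation.py is importable as a module, which may not match the Space's layout.

# Illustrative values only; real numbers come from the metric functions in evaluation.py.
from evaluation import store_rmse, rmse_scores  # assumption: evaluation.py is on the import path

predicted = {
    "context_relevance": 0.72,
    "context_utilization": 0.64,
    "completeness": 0.81,
    "adherence": 1.0,
}
ground_truth = {
    "context_relevance": 0.80,
    "context_utilization": 0.60,
    "completeness": 0.90,
    "adherence": 1.0,
}

store_rmse("example question", predicted, ground_truth)
# rmse_scores now maps the question to per-metric scores, e.g.
# rmse_scores["example question"]["context_relevance"] is roughly 0.08.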