#!/usr/bin/env python3
"""
Performance Evaluation Script for AskVeracity.
This script evaluates the performance of the AskVeracity fact-checking system
using a predefined set of test claims with known ground truth labels.
It collects metrics on accuracy, safety rate, processing time, and confidence scores
without modifying the core codebase.
Usage:
python evaluate_performance.py [--limit N] [--output FILE]
Options:
--limit N Limit evaluation to first N claims (default: all)
--output FILE Save results to FILE (default: performance_results.json)
"""
import os
import sys
import json
import time
import argparse
from datetime import datetime
import matplotlib.pyplot as plt
from tabulate import tabulate
import numpy as np
# Add the parent directory to sys.path if this script is run directly
if __name__ == "__main__":
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Import the agent and performance tracker
import agent
from utils.performance import PerformanceTracker
from utils.models import initialize_models
# IMPORTANT NOTE FOR DEVELOPERS:
# The test claims below include many recent events that will become outdated.
# When using this script for testing or evaluation, please update these claims
# with relevant and up-to-date examples to ensure meaningful results.
# Performance metrics are heavily influenced by the recency and verifiability
# of these claims, so using outdated claims will likely lead to poor results.
# Define the test claims with ground truth labels
TEST_CLAIMS = [
# True claims
{"claim": "Dozens killed as gunmen massacre tourists in Kashmir beauty spot.", "expected": "True"},
{"claim": "Pope Francis dies at 88.", "expected": "True"},
{"claim": "OpenAI released new reasoning models called o3 and o4-mini.", "expected": "True"},
{"claim": "Trump And Zelensky Clash Again As US Says Crimea Now Russian Territory.", "expected": "True"},
{"claim": "Twelve states sue Donald Trump administration in trade court over chaotic and illegal tariff policy.", "expected": "True"},
{"claim": "Zomato has been renamed to Eternal Limited.", "expected": "True"},
{"claim": "The Taj Mahal is located in Agra.", "expected": "True"},
{"claim": "ISRO achieves second docking with SpaDeX satellites.", "expected": "True"},
{"claim": "The TV series Adolescence is streaming on Netflix.", "expected": "True"},
{"claim": "Vladimir Putin offers to halt Ukraine invasion.", "expected": "True"},
{"claim": "Meta released its Llama 4 language model.", "expected": "True"},
{"claim": "Google launched Gemini 2.5 Pro Experimental, the first model in the Gemini 2.5 family.", "expected": "True"},
{"claim": "Microsoft is rolling out improved Recall feature for Windows Insiders.", "expected": "True"},
{"claim": "Microsoft announced a 1-bit language model that can run on CPU.", "expected": "True"},
{"claim": "Royal Challengers Bengaluru beat Rajasthan Royals by 11 runs in yesterday's IPL match.", "expected": "True"},
{"claim": "Anthropic introduced Claude Research.", "expected": "True"},
{"claim": "The IMF has lowered India's growth projection for the fiscal year 2025-26 to 6.2 per cent.", "expected": "True"},
{"claim": "In Bundesliga, Bayern Munich beat Heidenheim 4-0 last week.", "expected": "True"},
{"claim": "Manchester United in Europa League semi-finals.", "expected": "True"},
# False claims
{"claim": "The Eiffel Tower is in Rome.", "expected": "False"},
{"claim": "The earth is flat.", "expected": "False"},
{"claim": "Rishi Sunak is the current Prime Minister of the UK.", "expected": "False"},
{"claim": "New Zealand won the ICC Champions Trophy in 2025.", "expected": "False"},
{"claim": "US President Donald trump to visit India next week.", "expected": "False"},
{"claim": "Quantum computers have definitively solved the protein folding problem.", "expected": "False"},
{"claim": "CRISPR gene editing has successfully cured type 1 diabetes in human clinical trials.", "expected": "False"},
{"claim": "Google's new quantum computer, Willow, has demonstrated remarkable capabilities by solving mathematical problems far beyond the reach of the fastest supercomputers.", "expected": "False"},
{"claim": "NASA confirmed that the James Webb Space Telescope has found definitive evidence of alien life on an exoplanet.", "expected": "False"},
{"claim": "Google launched Gemini 3.", "expected": "False"},
{"claim": "A solar eclipse was be seen in India on October 17, 2024.", "expected": "False"},
{"claim": "Tom Cruise and Shah Rukh Khan have starred in a Bollywood movie in the past.", "expected": "False"},
{"claim": "Germany has the highest GDP in the world.", "expected": "False"},
# Uncertain claims
{"claim": "Aliens have visited the Earth.", "expected": "Uncertain"},
{"claim": "Information that falls into a black hole is permanently lost or destroyed.", "expected": "Uncertain"},
{"claim": "Time travel into the past is possible.", "expected": "Uncertain"},
{"claim": "Bigfoot (or Yeti) exists in remote wilderness areas.", "expected": "Uncertain"},
{"claim": "Intelligent life exists elsewhere in the universe.", "expected": "Uncertain"},
{"claim": "Yogi Adityanath will be the next Prime Minister of India.", "expected": "Uncertain"},
{"claim": "Consciousness continues to exist after biological death.", "expected": "Uncertain"},
{"claim": "There are multiple parallel universes.", "expected": "Uncertain"}
]
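# Example of extending the test set (illustrative only; replace the placeholder
# text with a current, independently verifiable statement before running):
# TEST_CLAIMS.append({"claim": "<recent, verifiable statement>", "expected": "True"})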
def setup_argument_parser():
"""
Set up command line argument parsing.
Returns:
argparse.Namespace: Parsed command line arguments
"""
parser = argparse.ArgumentParser(description="Evaluate AskVeracity performance")
parser.add_argument("--limit", type=int, help="Limit evaluation to first N claims")
parser.add_argument("--output", type=str, default="performance_results.json",
help="Output file for results (default: performance_results.json)")
return parser.parse_args()
def initialize_system():
"""
Initialize the system for evaluation.
Returns:
object: Initialized LangGraph agent
"""
print("Initializing models and agent...")
initialize_models()
eval_agent = agent.setup_agent()
return eval_agent
def normalize_classification(classification):
"""
Normalize classification labels for consistent comparison.
Args:
classification (str): Classification label from the system
Returns:
str: Normalized classification label ("True", "False", or "Uncertain")
"""
if not classification:
return "Uncertain"
if "true" in classification.lower():
return "True"
elif "false" in classification.lower():
return "False"
else:
return "Uncertain"
def is_correct(actual, expected):
"""
Determine if the actual classification matches the expected classification.
Args:
actual (str): Actual classification from the system
expected (str): Expected (ground truth) classification
Returns:
bool: True if classifications match, False otherwise
"""
# Normalize both for comparison
normalized_actual = normalize_classification(actual)
normalized_expected = expected
return normalized_actual == normalized_expected
def is_safe(actual, expected):
"""
Determine if the classification is "safe" - either correct or abstained (Uncertain)
instead of making an incorrect assertion.
Args:
actual (str): Actual classification from the system
expected (str): Expected (ground truth) classification
Returns:
bool: True if the classification is safe, False otherwise
"""
# Normalize both for comparison
normalized_actual = normalize_classification(actual)
normalized_expected = expected
# If the classification is correct, it's definitely safe
if normalized_actual == normalized_expected:
return True
# If the system classified as "Uncertain", that's safe (abstaining rather than wrong assertion)
if normalized_actual == "Uncertain":
return True
# Otherwise, the system made an incorrect assertion (False as True or True as False)
return False
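# Examples of the safety semantics above:
#   expected "True",      actual "True"      -> correct and safe
#   expected "True",      actual "Uncertain" -> incorrect but safe (abstention)
#   expected "True",      actual "False"     -> incorrect and unsafe
#   expected "Uncertain", actual "False"     -> incorrect and unsafe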
def evaluate_claims(test_claims, eval_agent, limit=None):
"""
Evaluate a list of claims using the fact-checking system.
Args:
test_claims (list): List of test claims with expected classifications
eval_agent (object): Initialized LangGraph agent
limit (int, optional): Maximum number of claims to evaluate
Returns:
tuple: (results, metrics)
- results (list): Detailed results for each claim
- metrics (dict): Aggregated performance metrics
"""
# Initialize performance tracker
performance_tracker = PerformanceTracker()
# Limit the number of claims if requested
if limit and limit > 0:
claims_to_evaluate = test_claims[:limit]
else:
claims_to_evaluate = test_claims
results = []
total_count = len(claims_to_evaluate)
correct_count = 0
safe_count = 0
# Classification counts
classification_counts = {"True": 0, "False": 0, "Uncertain": 0}
# Track processing times by expected classification
processing_times = {"True": [], "False": [], "Uncertain": []}
# Confidence scores by expected classification
confidence_scores = {"True": [], "False": [], "Uncertain": []}
# Track correct classifications by expected classification
correct_by_class = {"True": 0, "False": 0, "Uncertain": 0}
safe_by_class = {"True": 0, "False": 0, "Uncertain": 0}
total_by_class = {"True": 0, "False": 0, "Uncertain": 0}
print(f"Evaluating {len(claims_to_evaluate)} claims...")
# Process each claim
for idx, test_case in enumerate(claims_to_evaluate):
claim = test_case["claim"]
expected = test_case["expected"]
print(f"\nProcessing claim {idx+1}/{len(claims_to_evaluate)}: {claim}")
try:
# Process the claim and measure time
start_time = time.time()
result = agent.process_claim(claim, eval_agent)
total_time = time.time() - start_time
# Extract classification and confidence
classification = result.get("classification", "Uncertain")
confidence = result.get("confidence", 0.0)
# Normalize classification for comparison
normalized_classification = normalize_classification(classification)
# Check if classification is correct
correct = is_correct(normalized_classification, expected)
if correct:
correct_count += 1
correct_by_class[expected] += 1
# Check if classification is safe
safe = is_safe(normalized_classification, expected)
if safe:
safe_count += 1
safe_by_class[expected] += 1
# Update classification count
classification_counts[normalized_classification] = classification_counts.get(normalized_classification, 0) + 1
# Update counts by expected class
total_by_class[expected] += 1
# Update processing times
processing_times[expected].append(total_time)
# Update confidence scores
confidence_scores[expected].append(confidence)
# Save detailed result
detail_result = {
"claim": claim,
"expected": expected,
"actual": normalized_classification,
"correct": correct,
"safe": safe,
"confidence": confidence,
"processing_time": total_time
}
results.append(detail_result)
# Print progress indicator
outcome = "✓" if correct else "✗"
safety = "(safe)" if safe and not correct else ""
print(f" Result: {normalized_classification} (Expected: {expected}) {outcome} {safety}")
print(f" Time: {total_time:.2f}s, Confidence: {confidence:.2f}")
except Exception as e:
print(f"Error processing claim: {str(e)}")
results.append({
"claim": claim,
"expected": expected,
"error": str(e)
})
# Calculate performance metrics
accuracy = correct_count / total_count if total_count > 0 else 0
safety_rate = safe_count / total_count if total_count > 0 else 0
# Calculate per-class metrics
class_metrics = {}
for cls in ["True", "False", "Uncertain"]:
class_accuracy = correct_by_class[cls] / total_by_class[cls] if total_by_class[cls] > 0 else 0
class_safety_rate = safe_by_class[cls] / total_by_class[cls] if total_by_class[cls] > 0 else 0
avg_time = sum(processing_times[cls]) / len(processing_times[cls]) if processing_times[cls] else 0
avg_confidence = sum(confidence_scores[cls]) / len(confidence_scores[cls]) if confidence_scores[cls] else 0
class_metrics[cls] = {
"accuracy": class_accuracy,
"safety_rate": class_safety_rate,
"count": total_by_class[cls],
"correct": correct_by_class[cls],
"safe": safe_by_class[cls],
"avg_processing_time": avg_time,
"avg_confidence": avg_confidence
}
# Calculate overall metrics
all_times = [r.get("processing_time", 0) for r in results if "processing_time" in r]
all_confidence = [r.get("confidence", 0) for r in results if "confidence" in r]
metrics = {
"total_claims": total_count,
"correct_claims": correct_count,
"safe_claims": safe_count,
"accuracy": accuracy,
"safety_rate": safety_rate,
"avg_processing_time": sum(all_times) / len(all_times) if all_times else 0,
"avg_confidence": sum(all_confidence) / len(all_confidence) if all_confidence else 0,
"classification_counts": classification_counts,
"per_class_metrics": class_metrics
}
return results, metrics
def save_results(results, metrics, output_file):
"""
Save evaluation results to a JSON file.
Args:
results (list): Detailed results for each claim
metrics (dict): Aggregated performance metrics
output_file (str): Path to output file
"""
output_data = {
"timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"metrics": metrics,
"detailed_results": results
}
with open(output_file, 'w') as f:
json.dump(output_data, f, indent=2)
print(f"\nResults saved to {output_file}")
def print_summary(metrics):
"""
Print a summary of performance metrics.
Args:
metrics (dict): Aggregated performance metrics
"""
print("\n" + "="*70)
print(f"PERFORMANCE SUMMARY")
print("="*70)
# Overall metrics
print(f"\nOverall Metrics:")
print(f"Total Claims: {metrics['total_claims']}")
print(f"Correctly Classified: {metrics['correct_claims']}")
print(f"Safely Classified: {metrics['safe_claims']}")
print(f"Accuracy: {metrics['accuracy']:.2%}")
print(f"Safety Rate: {metrics['safety_rate']:.2%}")
print(f"Average Processing Time: {metrics['avg_processing_time']:.2f} seconds")
print(f"Average Confidence Score: {metrics['avg_confidence']:.2f}")
# Per-class metrics as table
print("\nPer-Class Performance:")
table_data = []
headers = ["Class", "Count", "Correct", "Safe", "Accuracy", "Safety Rate", "Avg Time", "Avg Confidence"]
for cls, cls_metrics in metrics['per_class_metrics'].items():
table_data.append([
cls,
cls_metrics['count'],
cls_metrics['correct'],
cls_metrics['safe'],
f"{cls_metrics['accuracy']:.2%}",
f"{cls_metrics['safety_rate']:.2%}",
f"{cls_metrics['avg_processing_time']:.2f}s",
f"{cls_metrics['avg_confidence']:.2f}"
])
print(tabulate(table_data, headers=headers, tablefmt="grid"))
def create_charts(metrics, output_dir="."):
"""
Create visualizations of performance metrics.
Args:
metrics (dict): Aggregated performance metrics
output_dir (str): Directory to save charts
"""
try:
# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
# Plot 1: Accuracy by class
plt.figure(figsize=(10, 6))
classes = list(metrics['per_class_metrics'].keys())
accuracies = [metrics['per_class_metrics'][cls]['accuracy'] for cls in classes]
plt.bar(classes, accuracies, color=['green', 'red', 'gray'])
plt.title('Accuracy by Classification Type')
plt.xlabel('Classification')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
for i, v in enumerate(accuracies):
plt.text(i, v + 0.02, f"{v:.2%}", ha='center')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'accuracy_by_class.png'))
plt.close() # Close the figure to free memory
# Plot 2: Safety rate by class
plt.figure(figsize=(10, 6))
safety_rates = [metrics['per_class_metrics'][cls]['safety_rate'] for cls in classes]
plt.bar(classes, safety_rates, color=['green', 'red', 'gray'])
plt.title('Safety Rate by Classification Type')
plt.xlabel('Classification')
plt.ylabel('Safety Rate')
plt.ylim(0, 1)
for i, v in enumerate(safety_rates):
plt.text(i, v + 0.02, f"{v:.2%}", ha='center')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'safety_rate_by_class.png'))
plt.close() # Close the figure to free memory
# Plot 3: Processing time by class
plt.figure(figsize=(10, 6))
times = [metrics['per_class_metrics'][cls]['avg_processing_time'] for cls in classes]
plt.bar(classes, times, color=['green', 'red', 'gray'])
plt.title('Average Processing Time by Classification Type')
plt.xlabel('Classification')
plt.ylabel('Time (seconds)')
for i, v in enumerate(times):
plt.text(i, v + 0.5, f"{v:.2f}s", ha='center')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'processing_time_by_class.png'))
plt.close() # Close the figure to free memory
# Plot 4: Confidence scores by class
plt.figure(figsize=(10, 6))
confidence = [metrics['per_class_metrics'][cls]['avg_confidence'] for cls in classes]
plt.bar(classes, confidence, color=['green', 'red', 'gray'])
plt.title('Average Confidence Score by Classification Type')
plt.xlabel('Classification')
plt.ylabel('Confidence Score')
plt.ylim(0, 1)
for i, v in enumerate(confidence):
plt.text(i, v + 0.02, f"{v:.2f}", ha='center')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'confidence_by_class.png'))
plt.close() # Close the figure to free memory
print(f"\nCharts created in {output_dir}")
except Exception as e:
print(f"Error creating charts: {str(e)}")
print("Continuing without charts.")
def main():
"""Main evaluation function that runs the entire evaluation process."""
# Parse arguments
args = setup_argument_parser()
# Initialize the agent
eval_agent = initialize_system()
# Create results directory if it doesn't exist
results_dir = "results"
os.makedirs(results_dir, exist_ok=True)
# Set output file path
output_file = args.output
if not os.path.isabs(output_file):
output_file = os.path.join(results_dir, output_file)
# Evaluate claims
results, metrics = evaluate_claims(TEST_CLAIMS, eval_agent, args.limit)
# results, metrics = evaluate_claims(TEST_CLAIMS, eval_agent, 1)
# Print summary
print_summary(metrics)
# Save results
save_results(results, metrics, output_file)
# Create charts (matplotlib and tabulate are already imported at module level,
# so re-importing them here is unnecessary; create_charts handles its own errors.
# If either package is missing, install with: pip install matplotlib tabulate)
create_charts(metrics, results_dir)
if __name__ == "__main__":
main()