Spaces:
Running
Running
#!/usr/bin/env python3 | |
""" | |
Performance Evaluation Script for AskVeracity. | |
This script evaluates the performance of the AskVeracity fact-checking system | |
using a predefined set of test claims with known ground truth labels. | |
It collects metrics on accuracy, safety rate, processing time, and confidence scores | |
without modifying the core codebase. | |
Usage: | |
python evaluate_performance.py [--limit N] [--output FILE] | |
Options: | |
--limit N Limit evaluation to first N claims (default: all) | |
--output FILE Save results to FILE (default: performance_results.json) | |
""" | |
import os | |
import sys | |
import json | |
import time | |
import argparse | |
from datetime import datetime | |
import matplotlib.pyplot as plt | |
from tabulate import tabulate | |
import numpy as np | |
# Add the parent directory to sys.path if this script is run directly | |
if __name__ == "__main__": | |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
# Import the agent and performance tracker | |
import agent | |
from utils.performance import PerformanceTracker | |
from utils.models import initialize_models | |
# IMPORTANT NOTE FOR DEVELOPERS: | |
# The test claims below include many recent events that will become outdated. | |
# When using this script for testing or evaluation, please update these claims | |
# with relevant and up-to-date examples to ensure meaningful results. | |
# Performance metrics are heavily influenced by the recency and verifiability | |
# of these claims, so using outdated claims will likely lead to poor results. | |
# Define the test claims with ground truth labels | |
TEST_CLAIMS = [ | |
# True claims | |
{"claim": "Dozens killed as gunmen massacre tourists in Kashmir beauty spot.", "expected": "True"}, | |
{"claim": "Pope Francis dies at 88.", "expected": "True"}, | |
{"claim": "OpenAI released new reasoning models called o3 and o4-mini.", "expected": "True"}, | |
{"claim": "Trump And Zelensky Clash Again As US Says Crimea Now Russian Territory.", "expected": "True"}, | |
{"claim": "Twelve states sue Donald Trump administration in trade court over chaotic and illegal tariff policy.", "expected": "True"}, | |
{"claim": "Zomato has been renamed to Eternal Limited.", "expected": "True"}, | |
{"claim": "The Taj Mahal is located in Agra.", "expected": "True"}, | |
{"claim": "ISRO achieves second docking with SpaDeX satellites.", "expected": "True"}, | |
{"claim": "The TV series Adolescence is streaming on Netflix.", "expected": "True"}, | |
{"claim": "Vladimir Putin offers to halt Ukraine invasion.", "expected": "True"}, | |
{"claim": "Meta released its Llama 4 language model.", "expected": "True"}, | |
{"claim": "Google launched Gemini 2.5 Pro Experimental, the first model in the Gemini 2.5 family.", "expected": "True"}, | |
{"claim": "Microsoft is rolling out improved Recall feature for Windows Insiders.", "expected": "True"}, | |
{"claim": "Microsoft announced a 1-bit language model that can run on CPU.", "expected": "True"}, | |
{"claim": "Royal Challengers Bengaluru beat Rajasthan Royals by 11 runs in yesterday's IPL match.", "expected": "True"}, | |
{"claim": "Anthropic introduced Claude Research.", "expected": "True"}, | |
{"claim": "The IMF has lowered India's growth projection for the fiscal year 2025-26 to 6.2 per cent.", "expected": "True"}, | |
{"claim": "In Bundesliga, Bayern Munich beat Heidenheim 4-0 last week.", "expected": "True"}, | |
{"claim": "Manchester United in Europa League semi-finals.", "expected": "True"}, | |
# False claims | |
{"claim": "The Eiffel Tower is in Rome.", "expected": "False"}, | |
{"claim": "The earth is flat.", "expected": "False"}, | |
{"claim": "Rishi Sunak is the current Prime Minister of the UK.", "expected": "False"}, | |
{"claim": "New Zealand won the ICC Champions Trophy in 2025.", "expected": "False"}, | |
{"claim": "US President Donald trump to visit India next week.", "expected": "False"}, | |
{"claim": "Quantum computers have definitively solved the protein folding problem.", "expected": "False"}, | |
{"claim": "CRISPR gene editing has successfully cured type 1 diabetes in human clinical trials.", "expected": "False"}, | |
{"claim": "Google's new quantum computer, Willow, has demonstrated remarkable capabilities by solving mathematical problems far beyond the reach of the fastest supercomputers.", "expected": "False"}, | |
{"claim": "NASA confirmed that the James Webb Space Telescope has found definitive evidence of alien life on an exoplanet.", "expected": "False"}, | |
{"claim": "Google launched Gemini 3.", "expected": "False"}, | |
{"claim": "A solar eclipse was be seen in India on October 17, 2024.", "expected": "False"}, | |
{"claim": "Tom Cruise and Shah Rukh Khan have starred in a Bollywood movie in the past.", "expected": "False"}, | |
{"claim": "Germany has the highest GDP in the world.", "expected": "False"}, | |
# Uncertain claims | |
{"claim": "Aliens have visited the Earth.", "expected": "Uncertain"}, | |
{"claim": "Information that falls into a black hole is permanently lost or destroyed.", "expected": "Uncertain"}, | |
{"claim": "Time travel into the past is possible.", "expected": "Uncertain"}, | |
{"claim": "Bigfoot (or Yeti) exists in remote wilderness areas.", "expected": "Uncertain"}, | |
{"claim": "Intelligent life exists elsewhere in the universe.", "expected": "Uncertain"}, | |
{"claim": "Yogi Adityanath will be the next Prime Minister of India.", "expected": "Uncertain"}, | |
{"claim": "Consciousness continues to exist after biological death.", "expected": "Uncertain"}, | |
{"claim": "There are multiple parallel universes.", "expected": "Uncertain"} | |
] | |
def setup_argument_parser(): | |
""" | |
Set up command line argument parsing. | |
Returns: | |
argparse.Namespace: Parsed command line arguments | |
""" | |
parser = argparse.ArgumentParser(description="Evaluate AskVeracity performance") | |
parser.add_argument("--limit", type=int, help="Limit evaluation to first N claims") | |
parser.add_argument("--output", type=str, default="performance_results.json", | |
help="Output file for results (default: performance_results.json)") | |
return parser.parse_args() | |
def initialize_system(): | |
""" | |
Initialize the system for evaluation. | |
Returns: | |
object: Initialized LangGraph agent | |
""" | |
print("Initializing models and agent...") | |
initialize_models() | |
eval_agent = agent.setup_agent() | |
return eval_agent | |
def normalize_classification(classification): | |
""" | |
Normalize classification labels for consistent comparison. | |
Args: | |
classification (str): Classification label from the system | |
Returns: | |
str: Normalized classification label ("True", "False", or "Uncertain") | |
""" | |
if not classification: | |
return "Uncertain" | |
if "true" in classification.lower(): | |
return "True" | |
elif "false" in classification.lower(): | |
return "False" | |
else: | |
return "Uncertain" | |
def is_correct(actual, expected): | |
""" | |
Determine if the actual classification matches the expected classification. | |
Args: | |
actual (str): Actual classification from the system | |
expected (str): Expected (ground truth) classification | |
Returns: | |
bool: True if classifications match, False otherwise | |
""" | |
# Normalize both for comparison | |
normalized_actual = normalize_classification(actual) | |
normalized_expected = expected | |
return normalized_actual == normalized_expected | |
def is_safe(actual, expected): | |
""" | |
Determine if the classification is "safe" - either correct or abstained (Uncertain) | |
instead of making an incorrect assertion. | |
Args: | |
actual (str): Actual classification from the system | |
expected (str): Expected (ground truth) classification | |
Returns: | |
bool: True if the classification is safe, False otherwise | |
""" | |
# Normalize both for comparison | |
normalized_actual = normalize_classification(actual) | |
normalized_expected = expected | |
# If the classification is correct, it's definitely safe | |
if normalized_actual == normalized_expected: | |
return True | |
# If the system classified as "Uncertain", that's safe (abstaining rather than wrong assertion) | |
if normalized_actual == "Uncertain": | |
return True | |
# Otherwise, the system made an incorrect assertion (False as True or True as False) | |
return False | |
def evaluate_claims(test_claims, eval_agent, limit=None): | |
""" | |
Evaluate a list of claims using the fact-checking system. | |
Args: | |
test_claims (list): List of test claims with expected classifications | |
eval_agent (object): Initialized LangGraph agent | |
limit (int, optional): Maximum number of claims to evaluate | |
Returns: | |
tuple: (results, metrics) | |
- results (list): Detailed results for each claim | |
- metrics (dict): Aggregated performance metrics | |
""" | |
# Initialize performance tracker | |
performance_tracker = PerformanceTracker() | |
# Limit the number of claims if requested | |
if limit and limit > 0: | |
claims_to_evaluate = test_claims[:limit] | |
else: | |
claims_to_evaluate = test_claims | |
results = [] | |
total_count = len(claims_to_evaluate) | |
correct_count = 0 | |
safe_count = 0 | |
# Classification counts | |
classification_counts = {"True": 0, "False": 0, "Uncertain": 0} | |
# Track processing times by expected classification | |
processing_times = {"True": [], "False": [], "Uncertain": []} | |
# Confidence scores by expected classification | |
confidence_scores = {"True": [], "False": [], "Uncertain": []} | |
# Track correct classifications by expected classification | |
correct_by_class = {"True": 0, "False": 0, "Uncertain": 0} | |
safe_by_class = {"True": 0, "False": 0, "Uncertain": 0} | |
total_by_class = {"True": 0, "False": 0, "Uncertain": 0} | |
print(f"Evaluating {len(claims_to_evaluate)} claims...") | |
# Process each claim | |
for idx, test_case in enumerate(claims_to_evaluate): | |
claim = test_case["claim"] | |
expected = test_case["expected"] | |
print(f"\nProcessing claim {idx+1}/{len(claims_to_evaluate)}: {claim}") | |
try: | |
# Process the claim and measure time | |
start_time = time.time() | |
result = agent.process_claim(claim, eval_agent) | |
total_time = time.time() - start_time | |
# Extract classification and confidence | |
classification = result.get("classification", "Uncertain") | |
confidence = result.get("confidence", 0.0) | |
# Normalize classification for comparison | |
normalized_classification = normalize_classification(classification) | |
# Check if classification is correct | |
correct = is_correct(normalized_classification, expected) | |
if correct: | |
correct_count += 1 | |
correct_by_class[expected] += 1 | |
# Check if classification is safe | |
safe = is_safe(normalized_classification, expected) | |
if safe: | |
safe_count += 1 | |
safe_by_class[expected] += 1 | |
# Update classification count | |
classification_counts[normalized_classification] = classification_counts.get(normalized_classification, 0) + 1 | |
# Update counts by expected class | |
total_by_class[expected] += 1 | |
# Update processing times | |
processing_times[expected].append(total_time) | |
# Update confidence scores | |
confidence_scores[expected].append(confidence) | |
# Save detailed result | |
detail_result = { | |
"claim": claim, | |
"expected": expected, | |
"actual": normalized_classification, | |
"correct": correct, | |
"safe": safe, | |
"confidence": confidence, | |
"processing_time": total_time | |
} | |
results.append(detail_result) | |
# Print progress indicator | |
outcome = "✓" if correct else "✗" | |
safety = "(safe)" if safe and not correct else "" | |
print(f" Result: {normalized_classification} (Expected: {expected}) {outcome} {safety}") | |
print(f" Time: {total_time:.2f}s, Confidence: {confidence:.2f}") | |
except Exception as e: | |
print(f"Error processing claim: {str(e)}") | |
results.append({ | |
"claim": claim, | |
"expected": expected, | |
"error": str(e) | |
}) | |
# Calculate performance metrics | |
accuracy = correct_count / total_count if total_count > 0 else 0 | |
safety_rate = safe_count / total_count if total_count > 0 else 0 | |
# Calculate per-class metrics | |
class_metrics = {} | |
for cls in ["True", "False", "Uncertain"]: | |
class_accuracy = correct_by_class[cls] / total_by_class[cls] if total_by_class[cls] > 0 else 0 | |
class_safety_rate = safe_by_class[cls] / total_by_class[cls] if total_by_class[cls] > 0 else 0 | |
avg_time = sum(processing_times[cls]) / len(processing_times[cls]) if processing_times[cls] else 0 | |
avg_confidence = sum(confidence_scores[cls]) / len(confidence_scores[cls]) if confidence_scores[cls] else 0 | |
class_metrics[cls] = { | |
"accuracy": class_accuracy, | |
"safety_rate": class_safety_rate, | |
"count": total_by_class[cls], | |
"correct": correct_by_class[cls], | |
"safe": safe_by_class[cls], | |
"avg_processing_time": avg_time, | |
"avg_confidence": avg_confidence | |
} | |
# Calculate overall metrics | |
all_times = [r.get("processing_time", 0) for r in results if "processing_time" in r] | |
all_confidence = [r.get("confidence", 0) for r in results if "confidence" in r] | |
metrics = { | |
"total_claims": total_count, | |
"correct_claims": correct_count, | |
"safe_claims": safe_count, | |
"accuracy": accuracy, | |
"safety_rate": safety_rate, | |
"avg_processing_time": sum(all_times) / len(all_times) if all_times else 0, | |
"avg_confidence": sum(all_confidence) / len(all_confidence) if all_confidence else 0, | |
"classification_counts": classification_counts, | |
"per_class_metrics": class_metrics | |
} | |
return results, metrics | |
def save_results(results, metrics, output_file): | |
""" | |
Save evaluation results to a JSON file. | |
Args: | |
results (list): Detailed results for each claim | |
metrics (dict): Aggregated performance metrics | |
output_file (str): Path to output file | |
""" | |
output_data = { | |
"timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), | |
"metrics": metrics, | |
"detailed_results": results | |
} | |
with open(output_file, 'w') as f: | |
json.dump(output_data, f, indent=2) | |
print(f"\nResults saved to {output_file}") | |
def print_summary(metrics): | |
""" | |
Print a summary of performance metrics. | |
Args: | |
metrics (dict): Aggregated performance metrics | |
""" | |
print("\n" + "="*70) | |
print(f"PERFORMANCE SUMMARY") | |
print("="*70) | |
# Overall metrics | |
print(f"\nOverall Metrics:") | |
print(f"Total Claims: {metrics['total_claims']}") | |
print(f"Correctly Classified: {metrics['correct_claims']}") | |
print(f"Safely Classified: {metrics['safe_claims']}") | |
print(f"Accuracy: {metrics['accuracy']:.2%}") | |
print(f"Safety Rate: {metrics['safety_rate']:.2%}") | |
print(f"Average Processing Time: {metrics['avg_processing_time']:.2f} seconds") | |
print(f"Average Confidence Score: {metrics['avg_confidence']:.2f}") | |
# Per-class metrics as table | |
print("\nPer-Class Performance:") | |
table_data = [] | |
headers = ["Class", "Count", "Correct", "Safe", "Accuracy", "Safety Rate", "Avg Time", "Avg Confidence"] | |
for cls, cls_metrics in metrics['per_class_metrics'].items(): | |
table_data.append([ | |
cls, | |
cls_metrics['count'], | |
cls_metrics['correct'], | |
cls_metrics['safe'], | |
f"{cls_metrics['accuracy']:.2%}", | |
f"{cls_metrics['safety_rate']:.2%}", | |
f"{cls_metrics['avg_processing_time']:.2f}s", | |
f"{cls_metrics['avg_confidence']:.2f}" | |
]) | |
print(tabulate(table_data, headers=headers, tablefmt="grid")) | |
def create_charts(metrics, output_dir="."): | |
""" | |
Create visualizations of performance metrics. | |
Args: | |
metrics (dict): Aggregated performance metrics | |
output_dir (str): Directory to save charts | |
""" | |
try: | |
# Create output directory if it doesn't exist | |
os.makedirs(output_dir, exist_ok=True) | |
# Plot 1: Accuracy by class | |
plt.figure(figsize=(10, 6)) | |
classes = list(metrics['per_class_metrics'].keys()) | |
accuracies = [metrics['per_class_metrics'][cls]['accuracy'] for cls in classes] | |
plt.bar(classes, accuracies, color=['green', 'red', 'gray']) | |
plt.title('Accuracy by Classification Type') | |
plt.xlabel('Classification') | |
plt.ylabel('Accuracy') | |
plt.ylim(0, 1) | |
for i, v in enumerate(accuracies): | |
plt.text(i, v + 0.02, f"{v:.2%}", ha='center') | |
plt.tight_layout() | |
plt.savefig(os.path.join(output_dir, 'accuracy_by_class.png')) | |
plt.close() # Close the figure to free memory | |
# Plot 2: Safety rate by class | |
plt.figure(figsize=(10, 6)) | |
safety_rates = [metrics['per_class_metrics'][cls]['safety_rate'] for cls in classes] | |
plt.bar(classes, safety_rates, color=['green', 'red', 'gray']) | |
plt.title('Safety Rate by Classification Type') | |
plt.xlabel('Classification') | |
plt.ylabel('Safety Rate') | |
plt.ylim(0, 1) | |
for i, v in enumerate(safety_rates): | |
plt.text(i, v + 0.02, f"{v:.2%}", ha='center') | |
plt.tight_layout() | |
plt.savefig(os.path.join(output_dir, 'safety_rate_by_class.png')) | |
plt.close() # Close the figure to free memory | |
# Plot 3: Processing time by class | |
plt.figure(figsize=(10, 6)) | |
times = [metrics['per_class_metrics'][cls]['avg_processing_time'] for cls in classes] | |
plt.bar(classes, times, color=['green', 'red', 'gray']) | |
plt.title('Average Processing Time by Classification Type') | |
plt.xlabel('Classification') | |
plt.ylabel('Time (seconds)') | |
for i, v in enumerate(times): | |
plt.text(i, v + 0.5, f"{v:.2f}s", ha='center') | |
plt.tight_layout() | |
plt.savefig(os.path.join(output_dir, 'processing_time_by_class.png')) | |
plt.close() # Close the figure to free memory | |
# Plot 4: Confidence scores by class | |
plt.figure(figsize=(10, 6)) | |
confidence = [metrics['per_class_metrics'][cls]['avg_confidence'] for cls in classes] | |
plt.bar(classes, confidence, color=['green', 'red', 'gray']) | |
plt.title('Average Confidence Score by Classification Type') | |
plt.xlabel('Classification') | |
plt.ylabel('Confidence Score') | |
plt.ylim(0, 1) | |
for i, v in enumerate(confidence): | |
plt.text(i, v + 0.02, f"{v:.2f}", ha='center') | |
plt.tight_layout() | |
plt.savefig(os.path.join(output_dir, 'confidence_by_class.png')) | |
plt.close() # Close the figure to free memory | |
print(f"\nCharts created in {output_dir}") | |
except Exception as e: | |
print(f"Error creating charts: {str(e)}") | |
print("Continuing without charts.") | |
def main(): | |
"""Main evaluation function that runs the entire evaluation process.""" | |
# Parse arguments | |
args = setup_argument_parser() | |
# Initialize the agent | |
eval_agent = initialize_system() | |
# Create results directory if it doesn't exist | |
results_dir = "results" | |
os.makedirs(results_dir, exist_ok=True) | |
# Set output file path | |
output_file = args.output | |
if not os.path.isabs(output_file): | |
output_file = os.path.join(results_dir, output_file) | |
# Evaluate claims | |
results, metrics = evaluate_claims(TEST_CLAIMS, eval_agent, args.limit) | |
# results, metrics = evaluate_claims(TEST_CLAIMS, eval_agent, 1) | |
# Print summary | |
print_summary(metrics) | |
# Save results | |
save_results(results, metrics, output_file) | |
# Create charts | |
try: | |
from tabulate import tabulate | |
import matplotlib.pyplot as plt | |
create_charts(metrics, results_dir) | |
except ImportError: | |
print("\nCould not create charts. Please install matplotlib and tabulate packages:") | |
print("pip install matplotlib tabulate") | |
if __name__ == "__main__": | |
main() |