Spaces:

ankanghosh
/

askveracity

Running

File size: 21,087 Bytes

#!/usr/bin/env python3
"""
Performance Evaluation Script for AskVeracity.

This script evaluates the performance of the AskVeracity fact-checking system
using a predefined set of test claims with known ground truth labels.
It collects metrics on accuracy, safety rate, processing time, and confidence scores
without modifying the core codebase.

Usage:
    python evaluate_performance.py [--limit N] [--output FILE]

Options:
    --limit N        Limit evaluation to first N claims (default: all)
    --output FILE    Save results to FILE (default: performance_results.json)
"""

import os
import sys
import json
import time
import argparse
from datetime import datetime
import matplotlib.pyplot as plt
from tabulate import tabulate
import numpy as np

# When this file is executed as a script, put the directory one level above
# this file's own directory at the front of sys.path.
# NOTE(review): presumably this is what lets the `agent` and `utils` imports
# below resolve — confirm against the project layout.
if __name__ == "__main__":
    sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Import the agent and performance tracker
import agent
from utils.performance import PerformanceTracker
from utils.models import initialize_models

# IMPORTANT NOTE FOR DEVELOPERS:
# The test claims below include many recent events that will become outdated.
# When using this script for testing or evaluation, please update these claims 
# with relevant and up-to-date examples to ensure meaningful results.
# Performance metrics are heavily influenced by the recency and verifiability 
# of these claims, so using outdated claims will likely lead to poor results.

# Test claims with ground truth labels.
# Each entry is a dict with two keys:
#   "claim"    - the claim text passed to the fact-checking agent
#   "expected" - the ground truth label: "True", "False" or "Uncertain"
# Entries are grouped by expected label (True, then False, then Uncertain).
# Because --limit N evaluates only the first N entries, a small N covers
# only the "True" group.
TEST_CLAIMS = [
    # True claims
    {"claim": "Dozens killed as gunmen massacre tourists in Kashmir beauty spot.", "expected": "True"},
    {"claim": "Pope Francis dies at 88.", "expected": "True"},
    {"claim": "OpenAI released new reasoning models called o3 and o4-mini.", "expected": "True"},
    {"claim": "Trump And Zelensky Clash Again As US Says Crimea Now Russian Territory.", "expected": "True"},
    {"claim": "Twelve states sue Donald Trump administration in trade court over chaotic and illegal tariff policy.", "expected": "True"},
    {"claim": "Zomato has been renamed to Eternal Limited.", "expected": "True"},
    {"claim": "The Taj Mahal is located in Agra.", "expected": "True"},
    {"claim": "ISRO achieves second docking with SpaDeX satellites.", "expected": "True"},
    {"claim": "The TV series Adolescence is streaming on Netflix.", "expected": "True"},
    {"claim": "Vladimir Putin offers to halt Ukraine invasion.", "expected": "True"},
    {"claim": "Meta released its Llama 4 language model.", "expected": "True"},
    {"claim": "Google launched Gemini 2.5 Pro Experimental, the first model in the Gemini 2.5 family.", "expected": "True"},
    {"claim": "Microsoft is rolling out improved Recall feature for Windows Insiders.", "expected": "True"},
    {"claim": "Microsoft announced a 1-bit language model that can run on CPU.", "expected": "True"},
    {"claim": "Royal Challengers Bengaluru beat Rajasthan Royals by 11 runs in yesterday's IPL match.", "expected": "True"},
    {"claim": "Anthropic introduced Claude Research.", "expected": "True"},
    {"claim": "The IMF has lowered India's growth projection for the fiscal year 2025-26 to 6.2 per cent.", "expected": "True"},
    {"claim": "In Bundesliga, Bayern Munich beat Heidenheim 4-0 last week.", "expected": "True"},
    {"claim": "Manchester United in Europa League semi-finals.", "expected": "True"},

    # False claims
    {"claim": "The Eiffel Tower is in Rome.", "expected": "False"},
    {"claim": "The earth is flat.", "expected": "False"},
    {"claim": "Rishi Sunak is the current Prime Minister of the UK.", "expected": "False"},
    {"claim": "New Zealand won the ICC Champions Trophy in 2025.", "expected": "False"},
    {"claim": "US President Donald trump to visit India next week.", "expected": "False"},
    {"claim": "Quantum computers have definitively solved the protein folding problem.", "expected": "False"},
    {"claim": "CRISPR gene editing has successfully cured type 1 diabetes in human clinical trials.", "expected": "False"},
    {"claim": "Google's new quantum computer, Willow, has demonstrated remarkable capabilities by solving mathematical problems far beyond the reach of the fastest supercomputers.", "expected": "False"},
    {"claim": "NASA confirmed that the James Webb Space Telescope has found definitive evidence of alien life on an exoplanet.", "expected": "False"},
    {"claim": "Google launched Gemini 3.", "expected": "False"},
    # NOTE(review): "was be seen" in the next claim looks like a typo; the text
    # is sent to the agent verbatim, so it is left unchanged here.
    {"claim": "A solar eclipse was be seen in India on October 17, 2024.", "expected": "False"},
    {"claim": "Tom Cruise and Shah Rukh Khan have starred in a Bollywood movie in the past.", "expected": "False"},
    {"claim": "Germany has the highest GDP in the world.", "expected": "False"},

    # Uncertain claims
    {"claim": "Aliens have visited the Earth.", "expected": "Uncertain"},
    {"claim": "Information that falls into a black hole is permanently lost or destroyed.", "expected": "Uncertain"},
    {"claim": "Time travel into the past is possible.", "expected": "Uncertain"},
    {"claim": "Bigfoot (or Yeti) exists in remote wilderness areas.", "expected": "Uncertain"},
    {"claim": "Intelligent life exists elsewhere in the universe.", "expected": "Uncertain"},
    {"claim": "Yogi Adityanath will be the next Prime Minister of India.", "expected": "Uncertain"},
    {"claim": "Consciousness continues to exist after biological death.", "expected": "Uncertain"},
    {"claim": "There are multiple parallel universes.", "expected": "Uncertain"}
]

def setup_argument_parser():
    """
    Build the command line parser for this script and parse the arguments.

    Two options are registered: ``--limit`` (an integer with no default) and
    ``--output`` (a string defaulting to ``performance_results.json``).

    Returns:
        argparse.Namespace: Parsed command line arguments
    """
    cli = argparse.ArgumentParser(description="Evaluate AskVeracity performance")

    # Each entry is (flag, keyword arguments forwarded to add_argument).
    option_specs = (
        ("--limit", {
            "type": int,
            "help": "Limit evaluation to first N claims",
        }),
        ("--output", {
            "type": str,
            "default": "performance_results.json",
            "help": "Output file for results (default: performance_results.json)",
        }),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)

    return cli.parse_args()

def initialize_system():
    """
    Prepare the system for an evaluation run.

    Prints a status line, calls ``initialize_models()`` and then builds the
    agent with ``agent.setup_agent()``.

    Returns:
        object: Whatever ``agent.setup_agent()`` returns (described by the
            original documentation as an initialized LangGraph agent)
    """
    print("Initializing models and agent...")
    # Models are initialized before the agent is set up, in that order.
    initialize_models()
    return agent.setup_agent()

def normalize_classification(classification):
    """
    Map a raw classification label onto one of three canonical labels.

    Matching is a case-insensitive substring test. "true" is checked first,
    then "false"; a label containing both therefore normalizes to "True".
    Anything else, including an empty or missing label, becomes "Uncertain".

    Args:
        classification (str): Classification label from the system

    Returns:
        str: Normalized classification label ("True", "False", or "Uncertain")
    """
    # Falsy input (None, "") is treated like a label with no keyword in it.
    lowered = classification.lower() if classification else ""

    # Order matters: the first keyword found wins.
    for keyword, canonical in (("true", "True"), ("false", "False")):
        if keyword in lowered:
            return canonical

    return "Uncertain"

def is_correct(actual, expected):
    """
    Report whether the system's classification agrees with the ground truth.

    Args:
        actual (str): Actual classification from the system
        expected (str): Expected (ground truth) classification

    Returns:
        bool: True if classifications match, False otherwise
    """
    # Only the system's label is normalized; the ground truth label is
    # compared exactly as given.
    return normalize_classification(actual) == expected

def is_safe(actual, expected):
    """
    Report whether the system's classification is "safe".

    A classification is safe when it is either correct or an abstention
    ("Uncertain"). It is unsafe only when the system asserts "True" or
    "False" and that assertion does not match the ground truth.

    Args:
        actual (str): Actual classification from the system
        expected (str): Expected (ground truth) classification

    Returns:
        bool: True if the classification is safe, False otherwise
    """
    # Only the system's label is normalized; the ground truth is used as given.
    prediction = normalize_classification(actual)

    # Safe outcomes: a match with the ground truth, or an abstention.
    return prediction in (expected, "Uncertain")

def evaluate_claims(test_claims, eval_agent, limit=None):
    """
    Evaluate a list of claims using the fact-checking system.

    Each claim is passed to ``agent.process_claim``; the returned
    classification is normalized and compared with the expected label, and
    timing and confidence are recorded. A claim whose processing raises is
    recorded in ``results`` with an ``"error"`` key and leaves every counter
    untouched. Such claims still count toward ``total_claims`` (the
    denominator of the overall accuracy and safety rate) but not toward the
    per-class counts.

    Args:
        test_claims (list): List of test claims with expected classifications;
            each item is a dict with "claim" and "expected" keys
        eval_agent (object): Initialized agent passed through to
            ``agent.process_claim``
        limit (int, optional): Maximum number of claims to evaluate; ``None``,
            zero or a negative value evaluates all claims

    Returns:
        tuple: (results, metrics)
            - results (list): Detailed results for each claim
            - metrics (dict): Aggregated performance metrics
    """
    # Instantiated as in the original script; the instance is not used below.
    # NOTE(review): confirm whether constructing the tracker is still needed.
    performance_tracker = PerformanceTracker()

    labels = ("True", "False", "Uncertain")

    # Limit the number of claims if requested
    claims_to_evaluate = test_claims[:limit] if limit and limit > 0 else test_claims
    total_count = len(claims_to_evaluate)

    results = []
    correct_count = 0
    safe_count = 0

    # How often the system produced each (normalized) label
    classification_counts = {label: 0 for label in labels}

    # Everything below is keyed by the *expected* label
    processing_times = {label: [] for label in labels}
    confidence_scores = {label: [] for label in labels}
    correct_by_class = {label: 0 for label in labels}
    safe_by_class = {label: 0 for label in labels}
    total_by_class = {label: 0 for label in labels}

    def mean(values):
        """Return the arithmetic mean of values, or 0 for an empty sequence."""
        return sum(values) / len(values) if values else 0

    print(f"Evaluating {total_count} claims...")

    for idx, test_case in enumerate(claims_to_evaluate):
        claim = test_case["claim"]
        expected = test_case["expected"]

        print(f"\nProcessing claim {idx+1}/{total_count}: {claim}")

        # Do all work that can fail BEFORE touching any counter, so a failure
        # never leaves a claim half-counted or recorded twice.
        try:
            if expected not in total_by_class:
                raise ValueError(f"Unknown expected label: {expected!r}")

            # perf_counter is monotonic, so the measured duration is not
            # affected by system clock adjustments.
            start_time = time.perf_counter()
            result = agent.process_claim(claim, eval_agent)
            total_time = time.perf_counter() - start_time

            classification = result.get("classification", "Uncertain")
            normalized_classification = normalize_classification(classification)

            # A missing, None or non-numeric confidence would otherwise break
            # the formatting and averaging below; treat it as 0.0.
            try:
                confidence = float(result.get("confidence", 0.0))
            except (TypeError, ValueError):
                confidence = 0.0
        except Exception as e:
            print(f"Error processing claim: {str(e)}")
            results.append({
                "claim": claim,
                "expected": expected,
                "error": str(e)
            })
            continue

        correct = is_correct(normalized_classification, expected)
        safe = is_safe(normalized_classification, expected)

        if correct:
            correct_count += 1
            correct_by_class[expected] += 1
        if safe:
            safe_count += 1
            safe_by_class[expected] += 1

        classification_counts[normalized_classification] += 1
        total_by_class[expected] += 1
        processing_times[expected].append(total_time)
        confidence_scores[expected].append(confidence)

        results.append({
            "claim": claim,
            "expected": expected,
            "actual": normalized_classification,
            "correct": correct,
            "safe": safe,
            "confidence": confidence,
            "processing_time": total_time
        })

        # Print progress indicator
        outcome = "✓" if correct else "✗"
        safety = "(safe)" if safe and not correct else ""
        print(f"  Result: {normalized_classification} (Expected: {expected}) {outcome} {safety}")
        print(f"  Time: {total_time:.2f}s, Confidence: {confidence:.2f}")

    # Overall rates use the number of attempted claims as the denominator
    accuracy = correct_count / total_count if total_count > 0 else 0
    safety_rate = safe_count / total_count if total_count > 0 else 0

    # Per-class rates use the number of successfully processed claims
    class_metrics = {}
    for cls in labels:
        processed = total_by_class[cls]
        class_metrics[cls] = {
            "accuracy": correct_by_class[cls] / processed if processed > 0 else 0,
            "safety_rate": safe_by_class[cls] / processed if processed > 0 else 0,
            "count": processed,
            "correct": correct_by_class[cls],
            "safe": safe_by_class[cls],
            "avg_processing_time": mean(processing_times[cls]),
            "avg_confidence": mean(confidence_scores[cls])
        }

    # Overall averages cover successfully processed claims only (error
    # entries carry neither a processing time nor a confidence)
    all_times = [r["processing_time"] for r in results if "processing_time" in r]
    all_confidence = [r["confidence"] for r in results if "confidence" in r]

    metrics = {
        "total_claims": total_count,
        "correct_claims": correct_count,
        "safe_claims": safe_count,
        "accuracy": accuracy,
        "safety_rate": safety_rate,
        "avg_processing_time": mean(all_times),
        "avg_confidence": mean(all_confidence),
        "classification_counts": classification_counts,
        "per_class_metrics": class_metrics
    }

    return results, metrics

def save_results(results, metrics, output_file):
    """
    Save evaluation results to a JSON file.

    The file holds a "timestamp" (local time, "YYYY-MM-DD HH:MM:SS"), the
    aggregated "metrics" and the per-claim "detailed_results". Missing parent
    directories of ``output_file`` are created first, so a path such as
    ``results/run1/out.json`` does not fail after a completed evaluation.

    Args:
        results (list): Detailed results for each claim
        metrics (dict): Aggregated performance metrics
        output_file (str): Path to output file
    """
    output_data = {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "metrics": metrics,
        "detailed_results": results
    }

    # A bare file name has an empty dirname, which os.makedirs rejects.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding keeps the output independent of the platform default.
    with open(output_file, 'w', encoding="utf-8") as f:
        json.dump(output_data, f, indent=2)

    print(f"\nResults saved to {output_file}")

def print_summary(metrics):
    """
    Print a human-readable report of the aggregated metrics.

    The report has a banner, the overall figures, and a grid table with one
    row per entry of ``metrics['per_class_metrics']``.

    Args:
        metrics (dict): Aggregated performance metrics
    """
    banner = "=" * 70
    print("\n" + banner)
    print("PERFORMANCE SUMMARY")
    print(banner)

    # Overall figures
    print("\nOverall Metrics:")
    print(f"Total Claims: {metrics['total_claims']}")
    print(f"Correctly Classified: {metrics['correct_claims']}")
    print(f"Safely Classified: {metrics['safe_claims']}")
    print(f"Accuracy: {metrics['accuracy']:.2%}")
    print(f"Safety Rate: {metrics['safety_rate']:.2%}")
    print(f"Average Processing Time: {metrics['avg_processing_time']:.2f} seconds")
    print(f"Average Confidence Score: {metrics['avg_confidence']:.2f}")

    # Per-class figures, one table row per class
    print("\nPer-Class Performance:")
    column_titles = ["Class", "Count", "Correct", "Safe", "Accuracy", "Safety Rate", "Avg Time", "Avg Confidence"]
    rows = [
        [
            label,
            stats['count'],
            stats['correct'],
            stats['safe'],
            f"{stats['accuracy']:.2%}",
            f"{stats['safety_rate']:.2%}",
            f"{stats['avg_processing_time']:.2f}s",
            f"{stats['avg_confidence']:.2f}",
        ]
        for label, stats in metrics['per_class_metrics'].items()
    ]

    print(tabulate(rows, headers=column_titles, tablefmt="grid"))

def _save_bar_chart(classes, values, title, ylabel, filename, output_dir,
                    label_format, label_offset, unit_scale):
    """Draw one per-class bar chart and save it as output_dir/filename."""
    plt.figure(figsize=(10, 6))
    try:
        plt.bar(classes, values, color=['green', 'red', 'gray'])
        plt.title(title)
        plt.xlabel('Classification')
        plt.ylabel(ylabel)
        if unit_scale:
            # Rates and confidence scores are plotted on a fixed 0..1 axis.
            plt.ylim(0, 1)

        # Write each bar's value just above the bar.
        for position, value in enumerate(values):
            plt.text(position, value + label_offset, label_format.format(value), ha='center')

        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, filename))
    finally:
        # Close the figure to free memory, even if plotting or saving failed.
        plt.close()


def create_charts(metrics, output_dir="."):
    """
    Create visualizations of performance metrics.

    Writes four PNG bar charts, one bar per class in
    ``metrics['per_class_metrics']``, into ``output_dir``:
    ``accuracy_by_class.png``, ``safety_rate_by_class.png``,
    ``processing_time_by_class.png`` and ``confidence_by_class.png``.
    Chart creation is best-effort: any exception is reported on stdout and
    swallowed so the caller can continue without charts.

    Args:
        metrics (dict): Aggregated performance metrics
        output_dir (str): Directory to save charts
    """
    # (metric key, title, y-axis label, file name, value label format,
    #  vertical offset of the value label, whether to fix the y-axis to 0..1)
    chart_specs = (
        ('accuracy', 'Accuracy by Classification Type', 'Accuracy',
         'accuracy_by_class.png', "{:.2%}", 0.02, True),
        ('safety_rate', 'Safety Rate by Classification Type', 'Safety Rate',
         'safety_rate_by_class.png', "{:.2%}", 0.02, True),
        ('avg_processing_time', 'Average Processing Time by Classification Type', 'Time (seconds)',
         'processing_time_by_class.png', "{:.2f}s", 0.5, False),
        ('avg_confidence', 'Average Confidence Score by Classification Type', 'Confidence Score',
         'confidence_by_class.png', "{:.2f}", 0.02, True),
    )

    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        per_class = metrics['per_class_metrics']
        classes = list(per_class.keys())

        for metric_key, title, ylabel, filename, label_format, label_offset, unit_scale in chart_specs:
            values = [per_class[cls][metric_key] for cls in classes]
            _save_bar_chart(classes, values, title, ylabel, filename, output_dir,
                            label_format, label_offset, unit_scale)

        print(f"\nCharts created in {output_dir}")

    except Exception as e:
        print(f"Error creating charts: {str(e)}")
        print("Continuing without charts.")

def main():
    """
    Run the entire evaluation process.

    Steps: parse the command line, initialize the agent, evaluate the test
    claims, print the summary, save the results as JSON and create the
    charts. A relative ``--output`` path is placed under the ``results``
    directory; charts are always written to ``results``.
    """
    # Parse arguments
    args = setup_argument_parser()

    # Initialize the agent
    eval_agent = initialize_system()

    # Create results directory if it doesn't exist
    results_dir = "results"
    os.makedirs(results_dir, exist_ok=True)

    # Set output file path
    output_file = args.output
    if not os.path.isabs(output_file):
        output_file = os.path.join(results_dir, output_file)

    # Make sure the output file's directory exists BEFORE the long evaluation,
    # so a bad --output path cannot cost a completed run at save time.
    output_parent = os.path.dirname(output_file)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)

    # Evaluate claims
    results, metrics = evaluate_claims(TEST_CLAIMS, eval_agent, args.limit)

    # Print summary
    print_summary(metrics)

    # Save results
    save_results(results, metrics, output_file)

    # Create charts. matplotlib and tabulate are imported unconditionally at
    # module load, so no ImportError fallback is needed here; create_charts
    # reports and swallows its own plotting errors.
    create_charts(metrics, results_dir)

# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()