Spaces:

MrSimple01
/

RuSimulBench_arena

Sleeping

File size: 6,457 Bytes

5465a38

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from typing import Dict
import matplotlib.pyplot as plt
import seaborn as sns
import os

def evaluate_stability(df: pd.DataFrame, prompt_col: str, answer_col: str,
                       model_name: str = 'paraphrase-MiniLM-L6-v2',
                       progress=None) -> Dict:
    """Measure how semantically close each answer is to its own prompt.

    Every prompt and every answer is embedded with a sentence-transformer
    model, and the cosine similarity of each (prompt, answer) row pair is
    computed.

    Args:
        df: Frame holding the prompts and the answers, one pair per row.
        prompt_col: Name of the column with the prompts.
        answer_col: Name of the column with the answers.
        model_name: Sentence-transformer model identifier to load.
        progress: Optional callable invoked as ``progress(fraction, desc=...)``
            to report progress; skipped when falsy.

    Returns:
        A dict with:
            ``stability_score``: mean pairwise similarity multiplied by 100.
            ``stability_std``: standard deviation of the similarities
                multiplied by 100.
            ``individual_similarities``: array with one similarity per row.

    Raises:
        ValueError: If ``df`` has no rows.
        KeyError: If ``prompt_col`` or ``answer_col`` is not a column of ``df``.
    """
    prompts = df[prompt_col].tolist()
    outputs = df[answer_col].tolist()

    # Fail fast, before the (potentially slow) model load, with a clear
    # message instead of an opaque error from the similarity computation.
    if not prompts:
        raise ValueError("Cannot evaluate stability: the input DataFrame has no rows.")

    if progress:
        progress(0, desc="Loading sentence transformer model...")

    model = SentenceTransformer(model_name)

    if progress:
        progress(0.3, desc="Encoding prompts...")
    prompt_embeddings = np.asarray(model.encode(prompts))

    if progress:
        progress(0.6, desc="Encoding outputs...")
    output_embeddings = np.asarray(model.encode(outputs))

    if progress:
        progress(0.9, desc="Computing similarities...")
    # Only the similarity of row i with row i is needed, so compute it
    # row-wise instead of building the full N x N similarity matrix and
    # discarding everything but its diagonal.
    dot_products = np.einsum('ij,ij->i', prompt_embeddings, output_embeddings)
    norm_products = (np.linalg.norm(prompt_embeddings, axis=1)
                     * np.linalg.norm(output_embeddings, axis=1))
    # A zero-length embedding yields similarity 0 rather than a division by zero.
    stability_coefficients = np.divide(
        dot_products,
        norm_products,
        out=np.zeros_like(dot_products),
        where=norm_products > 0,
    )

    if progress:
        progress(1.0, desc="Done!")
    return {
        'stability_score': np.mean(stability_coefficients) * 100,
        'stability_std': np.std(stability_coefficients) * 100,
        'individual_similarities': stability_coefficients
    }

def evaluate_combined_score(creativity_df: pd.DataFrame, stability_results: Dict, 
                           model_name: str) -> Dict:
    """Merge creativity and stability results into one summary record.

    The combined score is the plain arithmetic mean of the average of the
    "Среднее" column and ``stability_results['stability_score']``.
    NOTE(review): the two inputs are averaged as-is; confirm with the caller
    that they are on comparable scales.

    Args:
        creativity_df: Frame with the columns "Среднее", "Креативность",
            "Разнообразие" and "Релевантность".
        stability_results: Dict containing at least ``'stability_score'``;
            it is embedded unchanged under ``'stability_details'``.
        model_name: Label stored under ``'model'``.

    Returns:
        A dict with the model label, the creativity, stability and combined
        scores, a local-time timestamp string, per-criterion creativity means
        and the original stability results.
    """
    def column_mean(column):
        # Average of a single creativity column.
        return creativity_df[column].mean()

    overall_creativity = column_mean("Среднее")
    stability = stability_results['stability_score']
    averaged = (overall_creativity + stability) / 2

    evaluated_at = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')

    per_criterion = {
        'creativity': column_mean("Креативность"),
        'diversity': column_mean("Разнообразие"),
        'relevance': column_mean("Релевантность"),
    }

    summary = {
        'model': model_name,
        'creativity_score': overall_creativity,
        'stability_score': stability,
        'combined_score': averaged,
        'evaluation_timestamp': evaluated_at,
        'creative_details': per_criterion,
        'stability_details': stability_results,
    }
    return summary

def create_radar_chart(all_results):
    """Draw one radar polygon per model and save the chart as a PNG.

    Args:
        all_results: Mapping of model label to a result dict that provides
            ``'creative_details'`` (with ``'creativity'``, ``'diversity'`` and
            ``'relevance'``) and ``'stability_score'``.

    Returns:
        The path of the written image, ``'results/radar_chart.png'``.
    """
    os.makedirs('results', exist_ok=True)

    categories = ['Креативность', 'Разнообразие', 'Релевантность', 'Стабильность']
    N = len(categories)

    # One evenly spaced angle per category; the first angle is repeated so
    # each polygon closes on itself.
    angles = [n / float(N) * 2 * np.pi for n in range(N)]
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(polar=True))
    try:
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(categories)

        for model, result in all_results.items():
            details = result['creative_details']
            values = [
                details['creativity'],
                details['diversity'],
                details['relevance'],
                result['stability_score'],
            ]
            # Repeat the first value to match the closed list of angles.
            values += values[:1]

            ax.plot(angles, values, linewidth=2, linestyle='solid', label=model)
            ax.fill(angles, values, alpha=0.1)

        ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
        ax.set_title('Model Performance Comparison', size=15, pad=20)

        radar_chart_path = 'results/radar_chart.png'
        fig.savefig(radar_chart_path, dpi=300, bbox_inches='tight')
    finally:
        # Close this specific figure even when plotting or saving fails, so
        # failed calls do not accumulate open figures.
        plt.close(fig)

    return radar_chart_path

def create_bar_chart(all_results):
    """Draw grouped bars of the three headline scores per model and save a PNG.

    Args:
        all_results: Mapping of model label to a result dict that provides
            ``'creativity_score'``, ``'stability_score'`` and
            ``'combined_score'``.

    Returns:
        The path of the written image, ``'results/bar_chart.png'``.
    """
    # The output directory may not exist yet (e.g. when this chart is the
    # first artifact produced), so create it before saving.
    os.makedirs('results', exist_ok=True)

    models = list(all_results.keys())
    creative_scores = [all_results[model]['creativity_score'] for model in models]
    stability_scores = [all_results[model]['stability_score'] for model in models]
    combined_scores = [all_results[model]['combined_score'] for model in models]

    bar_width = 0.25
    # Left-most bar position of each model's group; the other two bars are
    # shifted right by one and two bar widths.
    base_positions = np.arange(len(models))

    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        ax.bar(base_positions, creative_scores,
               width=bar_width, label='Креативность', color='skyblue')
        ax.bar(base_positions + bar_width, stability_scores,
               width=bar_width, label='Стабильность', color='orange')
        ax.bar(base_positions + 2 * bar_width, combined_scores,
               width=bar_width, label='Общий балл', color='green')

        ax.set_xlabel('Модели')
        ax.set_ylabel('Оценка')
        ax.set_title('Сравнение моделей по креативности и стабильности')
        # Tick under the middle bar of each group.
        ax.set_xticks(base_positions + bar_width)
        ax.set_xticklabels(models)

        ax.legend()

        bar_chart_path = 'results/bar_chart.png'
        fig.savefig(bar_chart_path, dpi=300, bbox_inches='tight')
    finally:
        # Close this specific figure even when plotting or saving fails, so
        # failed calls do not accumulate open figures.
        plt.close(fig)

    return bar_chart_path

def get_leaderboard_data():
    """Load the stored benchmark results as a display-ready leaderboard.

    Reads ``results/benchmark_results.csv`` and returns a frame with the
    columns "Model", "Креативность", "Стабильность" and "Общий балл"
    (scores rounded to two decimals), sorted by "Общий балл" descending.

    When the file does not exist, or reading/formatting it fails, an empty
    frame is returned instead. Note that the empty frame declares six columns
    (it additionally lists "Разнообразие" and "Релевантность"), whereas the
    populated frame has only the four above. Failures are reported with
    ``print`` and never raised.
    """
    csv_path = 'results/benchmark_results.csv'

    def empty_leaderboard():
        # Placeholder frame used whenever no usable data is available.
        return pd.DataFrame(columns=[
            "Model", "Креативность", "Разнообразие", "Релевантность", "Стабильность", "Общий балл"
        ])

    if os.path.exists(csv_path):
        try:
            raw = pd.read_csv(csv_path)
            display_columns = {"Model": raw['model']}
            # Display label -> source column, all rounded for presentation.
            for label, source in (
                ("Креативность", 'creativity_score'),
                ("Стабильность", 'stability_score'),
                ("Общий балл", 'combined_score'),
            ):
                display_columns[label] = raw[source].round(2)
            board = pd.DataFrame(display_columns)
            return board.sort_values(by="Общий балл", ascending=False)
        except Exception as e:
            print(f"Error loading leaderboard data: {str(e)}")

    return empty_leaderboard()