# RuSimulBench_arena / evaluate_stability.py
# MrSimple01's picture
# Create evaluate_stability.py
# 5465a38 verified
# raw
# history blame contribute delete
# 6.46 kB
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from typing import Dict
import matplotlib.pyplot as plt
import seaborn as sns
import os
def evaluate_stability(df: pd.DataFrame, prompt_col: str, answer_col: str,
                       model_name: str = 'paraphrase-MiniLM-L6-v2',
                       progress=None) -> Dict:
    """Score how close each answer stays to its own prompt in embedding space.

    Every prompt and every answer is embedded with a sentence-transformer
    model, and the cosine similarity of each (prompt, answer) row pair is
    taken as that row's stability coefficient.

    Args:
        df: Table holding one prompt/answer pair per row.
        prompt_col: Name of the column with the prompts.
        answer_col: Name of the column with the model answers.
        model_name: Identifier passed to ``SentenceTransformer``.
        progress: Optional callable invoked as ``progress(fraction, desc=...)``
            to report progress; skipped when falsy.

    Returns:
        A dict with ``stability_score`` (mean similarity times 100),
        ``stability_std`` (standard deviation times 100) and
        ``individual_similarities`` (one coefficient per row).

    Raises:
        ValueError: If ``df`` has no rows.
        KeyError: If ``prompt_col`` or ``answer_col`` is not a column of ``df``.
    """
    # Fail before the (slow) model load when there is nothing to score.
    if len(df) == 0:
        raise ValueError("Cannot evaluate stability on an empty DataFrame")

    def report(fraction, message):
        if progress:
            progress(fraction, desc=message)

    # Missing cells are encoded as empty strings so the encoder only ever
    # receives text.
    prompts = df[prompt_col].fillna("").astype(str).tolist()
    outputs = df[answer_col].fillna("").astype(str).tolist()

    report(0, "Loading sentence transformer model...")
    model = SentenceTransformer(model_name)

    report(0.3, "Encoding prompts...")
    prompt_embeddings = np.asarray(model.encode(prompts), dtype=float)

    report(0.6, "Encoding outputs...")
    output_embeddings = np.asarray(model.encode(outputs), dtype=float)

    report(0.9, "Computing similarities...")
    # Only row i of the prompts paired with row i of the answers is needed,
    # so compute those dot products directly instead of a full N x N matrix.
    dots = np.einsum('ij,ij->i', prompt_embeddings, output_embeddings)
    norms = (np.linalg.norm(prompt_embeddings, axis=1)
             * np.linalg.norm(output_embeddings, axis=1))
    # A zero-length embedding yields similarity 0 rather than a division error.
    stability_coefficients = np.divide(
        dots, norms, out=np.zeros_like(dots), where=norms > 0
    )

    report(1.0, "Done!")

    return {
        'stability_score': float(np.mean(stability_coefficients) * 100),
        'stability_std': float(np.std(stability_coefficients) * 100),
        'individual_similarities': stability_coefficients
    }
def evaluate_combined_score(creativity_df: pd.DataFrame, stability_results: Dict,
                            model_name: str) -> Dict:
    """Merge creativity and stability results into one summary record.

    Args:
        creativity_df: Table with the columns "Среднее", "Креативность",
            "Разнообразие" and "Релевантность"; each is reduced to its mean.
        stability_results: Dict that contains at least ``stability_score``;
            it is also embedded unchanged under ``stability_details``.
        model_name: Label stored under the ``model`` key.

    Returns:
        A dict with the model label, the creativity and stability scores,
        their arithmetic mean as ``combined_score``, a local-time timestamp
        string, the per-criterion creativity means and the raw stability
        results.
    """
    mean_creativity = creativity_df["Среднее"].mean()
    stability = stability_results['stability_score']
    evaluated_at = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')

    # Output key -> source column, in the order the details are reported.
    criterion_columns = (
        ('creativity', "Креативность"),
        ('diversity', "Разнообразие"),
        ('relevance', "Релевантность"),
    )

    summary = {
        'model': model_name,
        'creativity_score': mean_creativity,
        'stability_score': stability,
        'combined_score': (mean_creativity + stability) / 2,
        'evaluation_timestamp': evaluated_at,
    }
    summary['creative_details'] = {
        key: creativity_df[column].mean() for key, column in criterion_columns
    }
    summary['stability_details'] = stability_results
    return summary
def create_radar_chart(all_results):
    """Draw one radar polygon per model and save the figure as a PNG.

    Args:
        all_results: Mapping of model name to a result dict that provides
            ``creative_details`` (with ``creativity``, ``diversity`` and
            ``relevance``) and ``stability_score``.

    Returns:
        The path of the saved image, ``'results/radar_chart.png'``.
    """
    os.makedirs('results', exist_ok=True)

    axis_labels = ['Креативность', 'Разнообразие', 'Релевантность', 'Стабильность']
    axis_count = len(axis_labels)

    fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(polar=True))

    # One evenly spaced spoke per category; the first angle is repeated at
    # the end so each plotted outline closes on itself.
    spokes = [k / float(axis_count) * 2 * np.pi for k in range(axis_count)]
    closed_angles = spokes + spokes[:1]

    ax.set_xticks(spokes)
    ax.set_xticklabels(axis_labels)

    for model_name, result in all_results.items():
        details = result['creative_details']
        scores = [
            details['creativity'],
            details['diversity'],
            details['relevance'],
            result['stability_score'],
        ]
        # Repeat the first score to match the closed angle list.
        closed_scores = scores + scores[:1]
        ax.plot(closed_angles, closed_scores, linewidth=2, linestyle='solid',
                label=model_name)
        ax.fill(closed_angles, closed_scores, alpha=0.1)

    ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    ax.set_title('Model Performance Comparison', size=15, pad=20)

    radar_chart_path = 'results/radar_chart.png'
    fig.savefig(radar_chart_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    return radar_chart_path
def create_bar_chart(all_results):
    """Plot creativity, stability and combined scores as grouped bars.

    Args:
        all_results: Mapping of model name to a result dict that provides
            ``creativity_score``, ``stability_score`` and ``combined_score``.

    Returns:
        The path of the saved image, ``'results/bar_chart.png'``.
    """
    # The output directory may not exist yet; create it so saving the
    # figure does not fail on a fresh checkout.
    os.makedirs('results', exist_ok=True)

    models = list(all_results.keys())
    creative_scores = [all_results[model]['creativity_score'] for model in models]
    stability_scores = [all_results[model]['stability_score'] for model in models]
    combined_scores = [all_results[model]['combined_score'] for model in models]

    fig, ax = plt.subplots(figsize=(12, 8))

    # Three bars per model, placed side by side starting at the integer slot.
    bar_width = 0.25
    left = np.arange(len(models))
    middle = left + bar_width
    right = middle + bar_width

    ax.bar(left, creative_scores, width=bar_width, label='Креативность', color='skyblue')
    ax.bar(middle, stability_scores, width=bar_width, label='Стабильность', color='orange')
    ax.bar(right, combined_scores, width=bar_width, label='Общий балл', color='green')

    ax.set_xlabel('Модели')
    ax.set_ylabel('Оценка')
    ax.set_title('Сравнение моделей по креативности и стабильности')
    # Tick labels sit under the middle bar of each group.
    ax.set_xticks(middle)
    ax.set_xticklabels(models)
    ax.legend()

    bar_chart_path = 'results/bar_chart.png'
    fig.savefig(bar_chart_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    return bar_chart_path
def get_leaderboard_data():
    """Load the benchmark results CSV as a display-ready leaderboard table.

    Reads ``results/benchmark_results.csv`` and maps its ``model``,
    ``creativity_score``, ``stability_score`` and ``combined_score`` columns
    to the display columns, rounding the scores to two decimals and sorting
    by the combined score in descending order.

    Returns:
        A DataFrame that always has the same six display columns. It is
        empty when the file is missing or cannot be loaded.
    """
    # Single definition of the schema so every return path agrees on it.
    columns = [
        "Model", "Креативность", "Разнообразие", "Релевантность", "Стабильность", "Общий балл"
    ]
    benchmark_file = 'results/benchmark_results.csv'

    if not os.path.exists(benchmark_file):
        return pd.DataFrame(columns=columns)

    try:
        df = pd.read_csv(benchmark_file)
        formatted_df = pd.DataFrame({
            "Model": df['model'],
            "Креативность": df['creativity_score'].round(2),
            "Стабильность": df['stability_score'].round(2),
            "Общий балл": df['combined_score'].round(2)
        })
        # Align with the fallback schema. The two per-criterion columns have
        # no source column here, so they stay empty (NaN).
        # NOTE(review): fill them once the CSV is confirmed to carry
        # per-criterion scores.
        formatted_df = formatted_df.reindex(columns=columns)
        return formatted_df.sort_values(by="Общий балл", ascending=False)
    except Exception as e:
        # Best-effort boundary: a broken results file must not take down the
        # caller, so report the problem and fall back to an empty table.
        print(f"Error loading leaderboard data: {str(e)}")
        return pd.DataFrame(columns=columns)