Spaces:
Sleeping
Sleeping
import numpy as np | |
import pandas as pd | |
from sentence_transformers import SentenceTransformer | |
from sklearn.metrics.pairwise import cosine_similarity | |
from typing import Dict | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import os | |
def _paired_cosine_similarity(left, right) -> np.ndarray:
    """Return the cosine similarity of each row of ``left`` with the same row of ``right``."""
    left = np.asarray(left)
    right = np.asarray(right)
    left_norms = np.linalg.norm(left, axis=1)
    right_norms = np.linalg.norm(right, axis=1)
    # A zero vector has no direction; dividing by 1 instead of 0 makes its
    # similarity 0 rather than NaN.
    left_norms[left_norms == 0] = 1
    right_norms[right_norms == 0] = 1
    return np.sum(left * right, axis=1) / (left_norms * right_norms)


def evaluate_stability(df: pd.DataFrame, prompt_col: str, answer_col: str,
                       model_name: str = 'paraphrase-MiniLM-L6-v2',
                       progress=None) -> Dict:
    """Score how closely each answer stays semantically aligned with its prompt.

    Every prompt/answer pair from ``df`` is embedded with a sentence-transformer
    model, and the cosine similarity between the two embeddings of each pair is
    computed.

    Args:
        df: Table holding the prompts and answers, one pair per row.
        prompt_col: Name of the column containing the prompts.
        answer_col: Name of the column containing the answers.
        model_name: Model identifier passed to ``SentenceTransformer``.
        progress: Optional callable invoked as ``progress(fraction, desc=...)``
            to report progress; it is skipped when falsy.

    Returns:
        A dict with ``'stability_score'`` (mean pair similarity times 100),
        ``'stability_std'`` (standard deviation of the pair similarities times
        100) and ``'individual_similarities'`` (array with one similarity per
        row of ``df``).

    Raises:
        ValueError: If ``df`` contains no rows.
    """
    prompts = df[prompt_col].tolist()
    outputs = df[answer_col].tolist()
    if not prompts:
        # Fail before the (expensive) model load; there is nothing to score.
        raise ValueError(
            f"Cannot evaluate stability: columns '{prompt_col}' and "
            f"'{answer_col}' contain no rows."
        )

    if progress:
        progress(0, desc="Loading sentence transformer model...")
    model = SentenceTransformer(model_name)

    if progress:
        progress(0.3, desc="Encoding prompts...")
    prompt_embeddings = model.encode(prompts)

    if progress:
        progress(0.6, desc="Encoding outputs...")
    output_embeddings = model.encode(outputs)

    if progress:
        progress(0.9, desc="Computing similarities...")
    # Only prompt i vs. answer i matters, so compare the rows pairwise instead
    # of building the full N x N similarity matrix and reading its diagonal.
    stability_coefficients = _paired_cosine_similarity(
        prompt_embeddings, output_embeddings
    )

    if progress:
        progress(1.0, desc="Done!")
    return {
        'stability_score': np.mean(stability_coefficients) * 100,
        'stability_std': np.std(stability_coefficients) * 100,
        'individual_similarities': stability_coefficients
    }
def evaluate_combined_score(creativity_df: pd.DataFrame, stability_results: Dict,
                            model_name: str) -> Dict:
    """Merge creativity and stability results into one summary for a model.

    Args:
        creativity_df: Per-sample creativity table with the columns
            "Среднее", "Креативность", "Разнообразие" and "Релевантность".
        stability_results: Stability result dict; must contain
            ``'stability_score'``. It is embedded unchanged in the summary.
        model_name: Name stored under ``'model'`` in the summary.

    Returns:
        A dict with the model name, the mean of the "Среднее" column as the
        creativity score, the stability score, their arithmetic mean as the
        combined score, a ``'%Y-%m-%d %H:%M:%S'`` timestamp, the per-metric
        creativity means and the original stability results.
    """
    mean_creativity = creativity_df["Среднее"].mean()
    stability = stability_results['stability_score']
    overall = (mean_creativity + stability) / 2
    evaluated_at = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')

    # Summary key -> source column of the creativity table.
    detail_columns = (
        ('creativity', "Креативность"),
        ('diversity', "Разнообразие"),
        ('relevance', "Релевантность"),
    )
    creative_details = {
        key: creativity_df[column].mean() for key, column in detail_columns
    }

    summary = {}
    summary['model'] = model_name
    summary['creativity_score'] = mean_creativity
    summary['stability_score'] = stability
    summary['combined_score'] = overall
    summary['evaluation_timestamp'] = evaluated_at
    summary['creative_details'] = creative_details
    summary['stability_details'] = stability_results
    return summary
def create_radar_chart(all_results):
    """Draw a radar chart comparing all models and save it as a PNG.

    Args:
        all_results: Mapping of model name to its result dict. Each result
            must provide ``['creative_details']`` with the keys
            ``'creativity'``, ``'diversity'`` and ``'relevance'``, plus
            ``['stability_score']``.

    Returns:
        Path of the saved image, ``'results/radar_chart.png'``.
    """
    os.makedirs('results', exist_ok=True)
    radar_chart_path = 'results/radar_chart.png'

    categories = ['Креативность', 'Разнообразие', 'Релевантность', 'Стабильность']
    axis_count = len(categories)

    # One evenly spaced angle per category; the first angle is repeated so
    # every outline ends where it started.
    angles = [n / float(axis_count) * 2 * np.pi for n in range(axis_count)]
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(polar=True))
    try:
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(categories)

        # NOTE(review): all four metrics share one radial axis - confirm the
        # creativity metrics use the same scale as the stability score.
        for model, result in all_results.items():
            details = result['creative_details']
            values = [
                details['creativity'],
                details['diversity'],
                details['relevance'],
                result['stability_score'],
            ]
            # Repeat the first value to close the polygon.
            values += values[:1]
            ax.plot(angles, values, linewidth=2, linestyle='solid', label=model)
            ax.fill(angles, values, alpha=0.1)

        ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
        ax.set_title('Model Performance Comparison', size=15, pad=20)
        fig.savefig(radar_chart_path, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when drawing or saving fails, so
        # figures do not pile up in pyplot's registry.
        plt.close(fig)
    return radar_chart_path
def create_bar_chart(all_results):
    """Draw a grouped bar chart of the per-model scores and save it as a PNG.

    For every model three bars are drawn side by side: creativity score,
    stability score and combined score.

    Args:
        all_results: Mapping of model name to its result dict. Each result
            must provide ``'creativity_score'``, ``'stability_score'`` and
            ``'combined_score'``.

    Returns:
        Path of the saved image, ``'results/bar_chart.png'``.
    """
    # The output directory may not exist yet when this chart is the first
    # artifact written; create it here just like create_radar_chart does.
    os.makedirs('results', exist_ok=True)
    bar_chart_path = 'results/bar_chart.png'

    models = list(all_results.keys())
    creative_scores = [all_results[model]['creativity_score'] for model in models]
    stability_scores = [all_results[model]['stability_score'] for model in models]
    combined_scores = [all_results[model]['combined_score'] for model in models]

    bar_width = 0.25
    # Left, middle and right bar of each model's group.
    left = np.arange(len(models))
    middle = left + bar_width
    right = middle + bar_width

    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        ax.bar(left, creative_scores, width=bar_width, label='Креативность', color='skyblue')
        ax.bar(middle, stability_scores, width=bar_width, label='Стабильность', color='orange')
        ax.bar(right, combined_scores, width=bar_width, label='Общий балл', color='green')

        ax.set_xlabel('Модели')
        ax.set_ylabel('Оценка')
        ax.set_title('Сравнение моделей по креативности и стабильности')
        # Put each model label under the middle bar of its group.
        ax.set_xticks(middle)
        ax.set_xticklabels(models)
        ax.legend()

        fig.savefig(bar_chart_path, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when drawing or saving fails, so
        # figures do not pile up in pyplot's registry.
        plt.close(fig)
    return bar_chart_path
def get_leaderboard_data():
    """Load the saved benchmark results as a leaderboard table.

    Reads ``results/benchmark_results.csv`` and keeps the model name plus the
    creativity, stability and combined scores (each rounded to two decimals),
    sorted by combined score in descending order.

    Returns:
        The leaderboard ``DataFrame``. If the results file does not exist, or
        reading/formatting it fails (the error is printed), an empty frame
        with the placeholder columns is returned instead.
    """
    results_path = 'results/benchmark_results.csv'
    # NOTE(review): the placeholder frame lists "Разнообразие" and
    # "Релевантность", which the populated leaderboard below never contains -
    # confirm which schema the consumer expects.
    placeholder_columns = [
        "Model", "Креативность", "Разнообразие", "Релевантность", "Стабильность", "Общий балл"
    ]

    if not os.path.exists(results_path):
        return pd.DataFrame(columns=placeholder_columns)

    # Display label -> column of the CSV file holding that score.
    score_columns = (
        ("Креативность", 'creativity_score'),
        ("Стабильность", 'stability_score'),
        ("Общий балл", 'combined_score'),
    )
    try:
        raw = pd.read_csv(results_path)
        display = {"Model": raw['model']}
        for label, source in score_columns:
            display[label] = raw[source].round(2)
        leaderboard = pd.DataFrame(display)
        return leaderboard.sort_values(by="Общий балл", ascending=False)
    except Exception as e:
        # Best effort: the leaderboard falls back to an empty table rather
        # than propagating the failure to the caller.
        print(f"Error loading leaderboard data: {str(e)}")
        return pd.DataFrame(columns=placeholder_columns)