MrSimple01 committed on
Commit
5465a38
·
verified ·
1 Parent(s): 908acb6

Create evaluate_stability.py

Browse files
Files changed (1) hide show
  1. evaluate_stability.py +175 -0
evaluate_stability.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sentence_transformers import SentenceTransformer
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+ from typing import Dict
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ import os
9
+
10
def _rowwise_cosine(left, right):
    """Return the cosine similarity of each row of *left* with the same row of *right*."""
    left = np.asarray(left, dtype=np.float64)
    right = np.asarray(right, dtype=np.float64)
    dots = np.einsum('ij,ij->i', left, right)
    norms = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
    # A zero-length vector has a zero dot product with anything, so dividing
    # by 1 instead of 0 yields a similarity of 0 rather than NaN.
    norms[norms == 0] = 1.0
    return dots / norms


def evaluate_stability(df: pd.DataFrame, prompt_col: str, answer_col: str,
                       model_name: str = 'paraphrase-MiniLM-L6-v2',
                       progress=None) -> Dict:
    """Score how closely each answer tracks its prompt in embedding space.

    Each prompt and its answer are embedded with a sentence-transformers
    model, and the cosine similarity of every (prompt, answer) pair is taken
    as that row's stability coefficient.

    Args:
        df: Table holding one prompt/answer pair per row.
        prompt_col: Name of the column containing the prompts.
        answer_col: Name of the column containing the answers.
        model_name: Identifier passed to ``SentenceTransformer``.
        progress: Optional callable invoked as ``progress(fraction, desc=...)``
            to report progress; skipped when falsy.

    Returns:
        A dict with:
            ``stability_score``: mean pair similarity multiplied by 100.
            ``stability_std``: standard deviation of the similarities
                multiplied by 100.
            ``individual_similarities``: 1-D array with one similarity per row.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    if len(df.index) == 0:
        # Fail before the (expensive) model load; there is nothing to score.
        raise ValueError("Cannot evaluate stability on an empty DataFrame")

    def report(fraction, desc):
        if progress:
            progress(fraction, desc=desc)

    report(0, "Loading sentence transformer model...")
    model = SentenceTransformer(model_name)

    # Missing cells become empty strings and other values are stringified so
    # the encoder always receives text.
    prompts = df[prompt_col].fillna("").astype(str).tolist()
    outputs = df[answer_col].fillna("").astype(str).tolist()

    report(0.3, "Encoding prompts...")
    prompt_embeddings = model.encode(prompts)

    report(0.6, "Encoding outputs...")
    output_embeddings = model.encode(outputs)

    report(0.9, "Computing similarities...")
    # Only matching (prompt, answer) pairs are needed, so compute them row by
    # row instead of building the full N x N similarity matrix.
    stability_coefficients = _rowwise_cosine(prompt_embeddings, output_embeddings)

    report(1.0, "Done!")
    return {
        'stability_score': float(np.mean(stability_coefficients)) * 100,
        'stability_std': float(np.std(stability_coefficients)) * 100,
        'individual_similarities': stability_coefficients,
    }
41
+
42
def evaluate_combined_score(creativity_df: pd.DataFrame, stability_results: Dict,
                            model_name: str) -> Dict:
    """Merge creativity and stability results into one summary record.

    Args:
        creativity_df: Table with the columns "Среднее", "Креативность",
            "Разнообразие" and "Релевантность"; the mean of each is reported.
        stability_results: Dict containing at least ``'stability_score'``.
            It is embedded unchanged under ``'stability_details'``.
        model_name: Label stored under ``'model'``.

    Returns:
        A dict with the model name, the creativity score (mean of "Среднее"),
        the stability score, their arithmetic mean as ``'combined_score'``,
        a ``'%Y-%m-%d %H:%M:%S'`` timestamp of the call, the per-criterion
        creativity means, and the original stability results.

    Raises:
        KeyError: If a required column is missing from ``creativity_df`` or
            ``'stability_score'`` is missing from ``stability_results``.
    """
    required_columns = ("Среднее", "Креативность", "Разнообразие", "Релевантность")
    missing = [col for col in required_columns if col not in creativity_df.columns]
    if missing:
        # Report every missing column at once instead of failing on the first.
        raise KeyError(f"creativity_df is missing required columns: {missing}")

    creative_score = creativity_df["Среднее"].mean()
    stability_score = stability_results['stability_score']
    combined_score = (creative_score + stability_score) / 2

    timestamp = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')

    return {
        'model': model_name,
        'creativity_score': creative_score,
        'stability_score': stability_score,
        'combined_score': combined_score,
        'evaluation_timestamp': timestamp,
        'creative_details': {
            'creativity': creativity_df["Креативность"].mean(),
            'diversity': creativity_df["Разнообразие"].mean(),
            'relevance': creativity_df["Релевантность"].mean(),
        },
        'stability_details': stability_results,
    }
63
+
64
def create_radar_chart(all_results):
    """Plot one radar polygon per model and save the chart as a PNG.

    Args:
        all_results: Mapping of model name to a result dict that provides
            ``['creative_details']['creativity' | 'diversity' | 'relevance']``
            and ``['stability_score']``.

    Returns:
        The path of the saved image, ``'results/radar_chart.png'``.
    """
    os.makedirs('results', exist_ok=True)

    categories = ['Креативность', 'Разнообразие', 'Релевантность', 'Стабильность']
    axis_count = len(categories)

    # One evenly spaced angle per category; the first angle is repeated so
    # each plotted outline closes back on its starting point.
    angles = [idx / float(axis_count) * 2 * np.pi for idx in range(axis_count)]
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(polar=True))
    try:
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(categories)

        for model, result in all_results.items():
            details = result['creative_details']
            values = [
                details['creativity'],
                details['diversity'],
                details['relevance'],
                result['stability_score'],
            ]
            # Repeat the first value to match the closed list of angles.
            values += values[:1]

            ax.plot(angles, values, linewidth=2, linestyle='solid', label=model)
            ax.fill(angles, values, alpha=0.1)

        ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
        ax.set_title('Model Performance Comparison', size=15, pad=20)

        radar_chart_path = 'results/radar_chart.png'
        fig.savefig(radar_chart_path, dpi=300, bbox_inches='tight')
    finally:
        # Release this figure even if plotting or saving fails.
        plt.close(fig)

    return radar_chart_path
113
+
114
def create_bar_chart(all_results):
    """Plot grouped bars of creativity, stability and combined score per model.

    Args:
        all_results: Mapping of model name to a result dict that provides
            ``'creativity_score'``, ``'stability_score'`` and
            ``'combined_score'``.

    Returns:
        The path of the saved image, ``'results/bar_chart.png'``.
    """
    # Make sure the output directory exists before saving into it.
    os.makedirs('results', exist_ok=True)

    models = list(all_results.keys())
    creative_scores = [all_results[model]['creativity_score'] for model in models]
    stability_scores = [all_results[model]['stability_score'] for model in models]
    combined_scores = [all_results[model]['combined_score'] for model in models]

    bar_width = 0.25

    # Three bars per model, placed side by side; the middle bar's position
    # doubles as the tick location for the model label.
    left_positions = np.arange(len(models))
    middle_positions = left_positions + bar_width
    right_positions = middle_positions + bar_width

    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        ax.bar(left_positions, creative_scores, width=bar_width,
               label='Креативность', color='skyblue')
        ax.bar(middle_positions, stability_scores, width=bar_width,
               label='Стабильность', color='orange')
        ax.bar(right_positions, combined_scores, width=bar_width,
               label='Общий балл', color='green')

        ax.set_xlabel('Модели')
        ax.set_ylabel('Оценка')
        ax.set_title('Сравнение моделей по креативности и стабильности')
        ax.set_xticks(middle_positions)
        ax.set_xticklabels(models)

        ax.legend()

        bar_chart_path = 'results/bar_chart.png'
        fig.savefig(bar_chart_path, dpi=300, bbox_inches='tight')
    finally:
        # Release this figure even if plotting or saving fails.
        plt.close(fig)

    return bar_chart_path
153
+
154
def get_leaderboard_data():
    """Load the saved benchmark results as a leaderboard table.

    Reads ``results/benchmark_results.csv`` and returns the model name plus
    the creativity, stability and combined scores rounded to two decimals,
    sorted by combined score in descending order.

    Returns:
        The leaderboard DataFrame. When the file does not exist, or loading
        or formatting it fails, an empty DataFrame with the placeholder
        column set is returned instead (failures are printed, not raised).
    """
    placeholder_columns = [
        "Model", "Креативность", "Разнообразие", "Релевантность", "Стабильность", "Общий балл"
    ]

    benchmark_file = 'results/benchmark_results.csv'
    if not os.path.exists(benchmark_file):
        return pd.DataFrame(columns=placeholder_columns)

    try:
        df = pd.read_csv(benchmark_file)
        # NOTE(review): the populated table has no "Разнообразие" /
        # "Релевантность" columns although the empty placeholder declares
        # them — confirm which schema the caller expects.
        formatted_df = pd.DataFrame({
            "Model": df['model'],
            "Креативность": df['creativity_score'].round(2),
            "Стабильность": df['stability_score'].round(2),
            "Общий балл": df['combined_score'].round(2),
        })
        return formatted_df.sort_values(by="Общий балл", ascending=False)
    except Exception as e:
        # Best effort: a broken or incomplete results file falls back to an
        # empty leaderboard instead of propagating the error.
        print(f"Error loading leaderboard data: {e}")
        return pd.DataFrame(columns=placeholder_columns)