import os
import time
import pandas as pd
import numpy as np
import gradio as gr
from typing import Dict, List, Optional
import matplotlib.pyplot as plt
import seaborn as sns
# Import functions from our modules
from evaluate_creativity import evaluate_creativity
from evaluate_stability import (
    evaluate_stability,
    evaluate_combined_score,
    create_radar_chart,
    create_bar_chart,
    get_leaderboard_data
)
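
# Expected input CSV layout (a sketch inferred from the column handling below;
# actual column names may differ): one prompt column (default "rus_prompt")
# plus one "<model>_answers" column per evaluated model, e.g.
#   rus_prompt,Vikhr_answers,Llama3_answers
#   "<prompt text>","<Vikhr response>","<Llama3 response>"
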
def list_available_models(csv_file):
    try:
        df = pd.read_csv(csv_file)
        model_columns = [col for col in df.columns if col.endswith('_answers')]
        models = [col.replace('_answers', '') for col in model_columns]
        return models
    except Exception as e:
        print(f"Error listing models: {str(e)}")
        return []
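
# Usage sketch (hypothetical file name):
#   list_available_models("responses.csv")  # -> e.g. ["Vikhr", "Llama3"]
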
def evaluate_models(file_path, api_key, prompt_col, selected_models=None, progress=gr.Progress()):
    os.makedirs('results', exist_ok=True)
    progress(0, desc="Loading data...")
    df = pd.read_csv(file_path)
    # Determine which models to evaluate
    if selected_models:
        answer_cols = [f"{model}_answers" for model in selected_models]
        models = selected_models
    else:
        answer_cols = [col for col in df.columns if col.endswith('_answers')]
        models = [col.replace('_answers', '') for col in answer_cols]
    model_mapping = dict(zip(models, answer_cols))
    progress(0.1, desc=f"Found {len(model_mapping)} models to evaluate")
    all_results = {}
    all_creativity_dfs = {}
    benchmark_file = 'results/benchmark_results.csv'
    if os.path.exists(benchmark_file):
        try:
            benchmark_df = pd.read_csv(benchmark_file)
        except Exception:
            benchmark_df = pd.DataFrame(columns=[
                'model', 'creativity_score', 'stability_score',
                'combined_score', 'evaluation_timestamp'
            ])
    else:
        benchmark_df = pd.DataFrame(columns=[
            'model', 'creativity_score', 'stability_score',
            'combined_score', 'evaluation_timestamp'
        ])
    # Guard against CSVs with no matching answer columns (avoids division by zero)
    if not model_mapping:
        raise ValueError("No model answer columns found to evaluate")
    progress_increment = 0.9 / len(model_mapping)
    progress_current = 0.1
    for model, column in model_mapping.items():
        try:
            progress(progress_current, desc=f"Evaluating {model}...")
            # Evaluate creativity
            creativity_df = evaluate_creativity(api_key, df, prompt_col, column, batch_size=1, progress=progress)
            progress_current += progress_increment * 0.6
            progress(progress_current, desc=f"Evaluating stability for {model}...")
            # Evaluate stability
            stability_results = evaluate_stability(df, prompt_col, column, progress=progress)
            progress_current += progress_increment * 0.3
            progress(progress_current, desc=f"Calculating combined score for {model}...")
            # Calculate combined score
            combined_results = evaluate_combined_score(creativity_df, stability_results, model)
            # Save detailed results
            timestamp = pd.Timestamp.now().strftime('%Y-%m-%d_%H-%M-%S')
            output_file = f'results/evaluated_responses_{model}_{timestamp}.csv'
            creativity_df.to_csv(output_file, index=False)
            # Add to benchmark DataFrame
            result_row = {
                'model': model,
                'creativity_score': combined_results['creativity_score'],
                'stability_score': combined_results['stability_score'],
                'combined_score': combined_results['combined_score'],
                'evaluation_timestamp': combined_results['evaluation_timestamp']
            }
            benchmark_df = pd.concat([benchmark_df, pd.DataFrame([result_row])], ignore_index=True)
            all_results[model] = combined_results
            all_creativity_dfs[model] = creativity_df
            progress_current += progress_increment * 0.1
            progress(progress_current, desc=f"Finished evaluating {model}")
        except Exception as e:
            print(f"Error evaluating {model}: {str(e)}")
    # Save benchmark results
    benchmark_df.to_csv(benchmark_file, index=False)
    timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
    combined_benchmark_path = f'results/benchmark_results_{timestamp}.csv'
    benchmark_df.to_csv(combined_benchmark_path, index=False)
    progress(0.95, desc="Creating visualizations...")
    radar_chart_path = create_radar_chart(all_results)
    bar_chart_path = create_bar_chart(all_results)
    progress(1.0, desc="Evaluation complete!")
    sorted_results = benchmark_df.sort_values(by='combined_score', ascending=False)
    return sorted_results, radar_chart_path, bar_chart_path, combined_benchmark_path
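
# Note: this local definition shadows the get_leaderboard_data imported from
# evaluate_stability above and returns a static snapshot of benchmark scores
# as [model, creativity, stability, combined score] rows.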
def get_leaderboard_data():
    return [
        ["Vikhr", "7.75", "0.9364", "0.860"],
        ["Llama3", "7.30", "0.9410", "0.827"],
        ["Mistral", "6.95", "0.9459", "0.807"],
        ["Owen", "6.93", "0.9457", "0.800"],
        ["TinyLlama", "1.12", "0.9457", "0.573"]
    ]
def create_gradio_interface():
    with gr.Blocks(title="LLM Evaluation Tool") as app:
        gr.Markdown("# LLM Evaluation Tool")
        gr.Markdown("Evaluate models for creativity, diversity, relevance, and stability")
        with gr.Tab("Evaluate Models"):
            with gr.Row():
                with gr.Column():
                    file_input = gr.File(label="Upload CSV with prompts and responses")
                    api_key_input = gr.Textbox(label="Gemini API Key", type="password")
                    prompt_col_input = gr.Textbox(label="Prompt Column Name", value="rus_prompt")
                    model_selection = gr.CheckboxGroup(
                        label="Select Models to Evaluate (leave empty to evaluate all)",
                        choices=[],
                        interactive=True
                    )
                    refresh_button = gr.Button("Refresh Model List")

                    @refresh_button.click(inputs=[file_input], outputs=[model_selection])
                    def update_model_list(file):
                        if file:
                            models = list_available_models(file.name)
                            return gr.CheckboxGroup(choices=models)
                        return gr.CheckboxGroup(choices=[])

                    evaluate_button = gr.Button("Evaluate Models", variant="primary")
            with gr.Row():
                result_table = gr.Dataframe(label="Evaluation Results")
            with gr.Row():
                with gr.Column():
                    radar_chart = gr.Image(label="Radar Chart")
                with gr.Column():
                    bar_chart = gr.Image(label="Bar Chart")
            result_file = gr.File(label="Download Complete Results")
            evaluate_button.click(
                fn=evaluate_models,
                inputs=[file_input, api_key_input, prompt_col_input, model_selection],
                outputs=[result_table, radar_chart, bar_chart, result_file]
            )
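
            # evaluate_models returns (sorted benchmark DataFrame, radar chart path,
            # bar chart path, results CSV path); the four outputs above consume
            # that tuple in the same order.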
with gr.Tab("Leaderboard"):
with gr.Row():
leaderboard_table = gr.Dataframe(
label="Model Leaderboard",
headers=["Model", "Креативность", "Стабильность", "Общий балл"]
)
refresh_leaderboard = gr.Button("Refresh Leaderboard")
@refresh_leaderboard.click(outputs=[leaderboard_table])
def update_leaderboard():
return get_leaderboard_data()
with gr.Row():
gr.Markdown("### Leaderboard Details")
gr.Markdown("""
- **Креативность**: Оригинальность и инновационность ответов (шкала до 10)
- **Стабильность**: Коэффициент стабильности модели (0-1)
- **Общий балл**: Средний комбинированный показатель производительности (0-1)
""")
return app


if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch()