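"""Gradio app for benchmarking LLMs on creativity and stability.

Loads a CSV of prompts and per-model answers, scores each model with the
evaluate_creativity and evaluate_stability modules, and exposes the results
(benchmark table, radar/bar charts, leaderboard) through a web UI.
"""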
import os
import time
import pandas as pd
import numpy as np
import gradio as gr
from typing import Dict, List, Optional
import matplotlib.pyplot as plt
import seaborn as sns

# Import functions from our modules
from evaluate_creativity import evaluate_creativity
from evaluate_stability import (
    evaluate_stability,
    evaluate_combined_score,
    create_radar_chart,
    create_bar_chart
)
# NOTE: get_leaderboard_data is intentionally not imported here; the static
# version defined later in this file is what the leaderboard tab uses.

def list_available_models(csv_file):
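    """Return the model names for every '<model>_answers' column in the CSV."""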
    try:
        df = pd.read_csv(csv_file)
        model_columns = [col for col in df.columns if col.endswith('_answers')]
        models = [col.replace('_answers', '') for col in model_columns]
        return models
    except Exception as e:
        print(f"Error listing models: {str(e)}")
        return []

def evaluate_models(file_path, api_key, prompt_col, selected_models=None, progress=gr.Progress()):
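    """Evaluate the selected models on creativity and stability.

    Returns the sorted benchmark DataFrame, the radar and bar chart paths,
    and the path of the timestamped benchmark CSV.
    """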
    os.makedirs('results', exist_ok=True)

    progress(0, desc="Loading data...")
    # gr.File may pass a filepath string or a tempfile-like object depending on the Gradio version
    if hasattr(file_path, 'name'):
        file_path = file_path.name
    df = pd.read_csv(file_path)
    
    # Determine which models to evaluate
    if selected_models:
        answer_cols = [f"{model}_answers" for model in selected_models]
        models = selected_models
    else:
        answer_cols = [col for col in df.columns if col.endswith('_answers')]
        models = [col.replace('_answers', '') for col in answer_cols]
    
    model_mapping = dict(zip(models, answer_cols))
    if not model_mapping:
        # Surface a clear error in the UI instead of dividing by zero later
        raise gr.Error("No '*_answers' columns were found in the uploaded CSV.")

    progress(0.1, desc=f"Found {len(model_mapping)} models to evaluate")
    
    all_results = {}
    all_creativity_dfs = {}
    
    benchmark_file = 'results/benchmark_results.csv'
    benchmark_columns = [
        'model', 'creativity_score', 'stability_score',
        'combined_score', 'evaluation_timestamp'
    ]
    if os.path.exists(benchmark_file):
        try:
            benchmark_df = pd.read_csv(benchmark_file)
        except Exception:
            # Fall back to an empty benchmark table if the existing file is unreadable
            benchmark_df = pd.DataFrame(columns=benchmark_columns)
    else:
        benchmark_df = pd.DataFrame(columns=benchmark_columns)

    progress_increment = 0.9 / len(model_mapping)
    progress_current = 0.1
    
    for model, column in model_mapping.items():
        try:
            progress(progress_current, desc=f"Evaluating {model}...")
            
            # Evaluate creativity
            creativity_df = evaluate_creativity(api_key, df, prompt_col, column, batch_size=1, progress=progress)
            progress_current += progress_increment * 0.6
            progress(progress_current, desc=f"Evaluating stability for {model}...")
            
            # Evaluate stability
            stability_results = evaluate_stability(df, prompt_col, column, progress=progress)
            progress_current += progress_increment * 0.3
            progress(progress_current, desc=f"Calculating combined score for {model}...")
            
            # Calculate combined score
            combined_results = evaluate_combined_score(creativity_df, stability_results, model)
            
            # Save detailed results
            timestamp = pd.Timestamp.now().strftime('%Y-%m-%d_%H-%M-%S')
            output_file = f'results/evaluated_responses_{model}_{timestamp}.csv'
            creativity_df.to_csv(output_file, index=False)
            
            # Add to benchmark DataFrame
            result_row = {
                'model': model,
                'creativity_score': combined_results['creativity_score'],
                'stability_score': combined_results['stability_score'],
                'combined_score': combined_results['combined_score'],
                'evaluation_timestamp': combined_results['evaluation_timestamp']
            }
            benchmark_df = pd.concat([benchmark_df, pd.DataFrame([result_row])], ignore_index=True)
            
            all_results[model] = combined_results
            all_creativity_dfs[model] = creativity_df
            
            progress_current += progress_increment * 0.1
            progress(progress_current, desc=f"Finished evaluating {model}")
            
        except Exception as e:
            print(f"Error evaluating {model}: {str(e)}")
    
    # Save benchmark results
    benchmark_df.to_csv(benchmark_file, index=False)
    timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
    combined_benchmark_path = f'results/benchmark_results_{timestamp}.csv'
    benchmark_df.to_csv(combined_benchmark_path, index=False)
    
    progress(0.95, desc="Creating visualizations...")
    radar_chart_path = create_radar_chart(all_results)
    bar_chart_path = create_bar_chart(all_results)
    
    progress(1.0, desc="Evaluation complete!")
    
    sorted_results = benchmark_df.sort_values(by='combined_score', ascending=False)
    
    return sorted_results, radar_chart_path, bar_chart_path, combined_benchmark_path


def get_leaderboard_data():
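    """Return static leaderboard rows: [model, creativity, stability, combined score]."""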
    return [
        ["Vikhr", "7.75", "0.9363600260019302", "0.860"],
        ["Llama3", "7.30", "0.9410231244564057", "0.827"], 
        ["Mistral", "6.95", "0.9459488660097122", "0.807"],
        ["Owen", "6.93", "0.945682458281517", "0.800"],
        ["TinyLlama", "1.12", "0.945682458281517", "0.573"]
    ]


def create_gradio_interface():
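    """Build the Gradio Blocks UI: an evaluation tab and a static leaderboard tab."""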
    with gr.Blocks(title="LLM Evaluation Tool") as app:
        gr.Markdown("# LLM Evaluation Tool")
        gr.Markdown("Оцените модели на креативность, разнообразие, релевантность и стабильность")
        
        with gr.Tab("Evaluate Models"):
            with gr.Row():
                with gr.Column():
                    file_input = gr.File(label="Upload CSV with prompts and responses")
                    api_key_input = gr.Textbox(label="Gemini API Key", type="password")
                    prompt_col_input = gr.Textbox(label="Prompt Column Name", value="rus_prompt")
                    
                    model_selection = gr.CheckboxGroup(
                        label="Select Models to Evaluate (leave empty to evaluate all)",
                        choices=[],
                        interactive=True
                    )
                    
                    refresh_button = gr.Button("Refresh Model List")
                    
                    @refresh_button.click(inputs=[file_input], outputs=[model_selection])
                    def update_model_list(file):
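                        # Re-read the uploaded CSV and refresh the checkbox choices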
                        if file:
                            models = list_available_models(file.name)
                            return gr.CheckboxGroup(choices=models)
                        return gr.CheckboxGroup(choices=[])
                    
                    evaluate_button = gr.Button("Evaluate Models", variant="primary")
            
            with gr.Row():
                result_table = gr.Dataframe(label="Evaluation Results")
            
            with gr.Row():
                with gr.Column():
                    radar_chart = gr.Image(label="Radar Chart")
                
                with gr.Column():
                    bar_chart = gr.Image(label="Bar Chart")
            
            result_file = gr.File(label="Download Complete Results")
            
            evaluate_button.click(
                fn=evaluate_models,
                inputs=[file_input, api_key_input, prompt_col_input, model_selection],
                outputs=[result_table, radar_chart, bar_chart, result_file]
            )
        
        with gr.Tab("Leaderboard"):
            with gr.Row():
                leaderboard_table = gr.Dataframe(
                    label="Model Leaderboard",
                    headers=["Model", "Креативность", "Стабильность", "Общий балл"]
                )
                
                refresh_leaderboard = gr.Button("Refresh Leaderboard")
                
                @refresh_leaderboard.click(outputs=[leaderboard_table])
                def update_leaderboard():
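                    # Reload the static leaderboard rows into the table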
                    return get_leaderboard_data()
            
            with gr.Row():
                gr.Markdown("### Leaderboard Details")
                gr.Markdown("""
                - **Creativity**: Originality and inventiveness of the responses (scale up to 10)
                - **Stability**: Model stability coefficient (0-1)
                - **Overall Score**: Average combined performance score (0-1)
                """)
    
    return app

if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch()