Linker1907 committed on
Commit
22f9e0d
·
1 Parent(s): 6639773
Files changed (1) hide show
  1. app.py +222 -0
app.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ import json
3
+ import gradio as gr
4
+
5
# Benchmark subsets offered by the app: a hardcoded copy of the subset list
# from experiments.json.
BENCHMARKS = [
    "custom|gpqa:diamond|0",
    "custom|aime24|0",
    "custom|aime25|0",
    "extended|ifeval|0",
]
12
+ from datasets import get_dataset_split_names
13
+
14
# Predefined dataset repositories that can be picked by name.
# Extend this list with other commonly inspected repositories as needed.
REPO_OPTIONS = [
    "OpenEvals/details_meta-llama__Llama-4-Maverick-17B-128E-Instruct-FP8_private",
    "OpenEvals/details_meta-llama__Llama-4-Scout-17B-16E-Instruct_private",
]
20
+
21
def get_available_splits(repo, benchmark):
    """Return the split names available for *benchmark* in dataset *repo*.

    The dataset config name is the benchmark identifier with every ``|``
    and ``:`` replaced by ``_``.
    """
    config = benchmark.translate(str.maketrans("|:", "__"))
    return get_dataset_split_names(repo, config_name=config)
23
+
24
def load_details_and_results(repo, subset, split):
    """Load the per-example details and the aggregated results of one run.

    Args:
        repo: Dataset repository name passed to ``load_dataset``.
        subset: Benchmark identifier; ``|`` and ``:`` are replaced by ``_``
            to obtain the dataset config name.
        split: Split to load, used for both the details and the results.

    Returns:
        A ``(details, results)`` tuple. ``details`` keeps only the
        ``full_prompt``, ``gold``, ``metrics`` and ``predictions`` columns,
        with ``gold`` reduced to its first element. ``results`` is the
        object obtained by evaluating the ``results`` field of the first row
        of the ``"results"`` config.
    """

    def keep_first_gold(example):
        # Only "gold" needs reshaping; the other kept columns pass through
        # unchanged, so they are not reassigned here.
        example["gold"] = example["gold"][0]
        return example

    config = subset.replace("|", "_").replace(":", "_")
    details = load_dataset(repo, config, split=split)
    results = load_dataset(repo, "results", split=split)
    # NOTE(security): eval() executes whatever Python expression is stored in
    # the dataset row. If `repo` is not fully trusted this allows arbitrary
    # code execution; switch to a safe parser (json.loads / ast.literal_eval)
    # once the stored format has been confirmed.
    results = eval(results[0]["results"])

    columns_to_keep = ['full_prompt', 'gold', 'metrics', 'predictions']
    details = details.select_columns(columns_to_keep).map(keep_first_gold)

    return details, results
40
+
41
def update_splits(repo, benchmark):
    """Build the split dropdown for *benchmark* in *repo*.

    The first available split is preselected; when there are no splits the
    dropdown is returned without a value.
    """
    available = get_available_splits(repo, benchmark)
    selected = available[0] if available else None
    return gr.Dropdown(choices=available, value=selected)
44
+
45
def _escape_html(value):
    """Return ``str(value)`` with ``&``, ``<`` and ``>`` escaped for HTML text."""
    from html import escape  # local import keeps this block self-contained

    return escape(str(value), quote=False)


def _render_prompt_html(prompt):
    """Return the HTML for a prompt: one block per chat message, or one block."""
    if not isinstance(prompt, list):
        return (
            "<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0; "
            "background: #f8f8f8; padding: 10px; border-radius: 5px;'>"
            f"<code>{_escape_html(prompt)}</code></pre>\n"
        )

    message_pre = (
        "<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0; "
        "background: #f8f8f8; padding: 10px; border-radius: 5px;'>"
    )
    parts = []
    for msg in prompt:
        if isinstance(msg, dict) and 'role' in msg and 'content' in msg:
            role = _escape_html(str(msg['role']).title())
            content = _escape_html(msg['content'])
            parts.append("<div style='margin-bottom: 10px;'>\n")
            parts.append(f"<strong>{role}:</strong>\n")
            parts.append(f"{message_pre}<code>{content}</code></pre>\n")
            parts.append("</div>\n")
        else:
            # Anything that is not a role/content mapping is shown verbatim.
            parts.append(f"{message_pre}<code>{_escape_html(msg)}</code></pre>\n")
    return "".join(parts)


def _render_metrics_html(metrics):
    """Return the HTML table listing each metric name and value."""
    if isinstance(metrics, str):
        # NOTE(security): eval() executes whatever expression is stored in the
        # dataset. With an untrusted repository this is arbitrary code
        # execution; replace with a safe parser once the format is confirmed.
        metrics = eval(metrics)
    if not metrics:
        metrics = {}

    cell_style = "padding: 5px; border-bottom: 1px solid #ddd;"
    parts = [
        "<div style='overflow-x: auto;'>\n",
        "<table style='width: 100%; margin: 10px 0; border-collapse: collapse;'>\n",
    ]
    for key, value in metrics.items():
        if isinstance(value, float):
            value = f"{value:.3f}"
        parts.append(
            f"<tr><td style='{cell_style}'><strong>{_escape_html(key)}</strong></td>"
            f"<td style='{cell_style}'>{_escape_html(value)}</td></tr>\n"
        )
    parts.append("</table>\n")
    parts.append("</div>\n")
    return "".join(parts)


def display_model_details(repo_name, benchmark, split, example_index):
    """Render one evaluation example of a model as an HTML fragment.

    Args:
        repo_name: Dataset repository holding the evaluation details.
        benchmark: Benchmark identifier of the subset to load.
        split: Split to load.
        example_index: Position of the example to show; converted to ``int``
            before indexing.

    Returns:
        HTML with the ground truth, prompt, metrics and first prediction of
        the selected example. If loading or indexing fails, a plain
        ``"Error loading model details: ..."`` message is returned instead.
    """
    try:
        # Load details for the specific model, benchmark and split.
        details, _ = load_details_and_results(repo_name, benchmark, split)
        # A numeric UI input may deliver a float (e.g. 3.0); row lookup
        # needs an integer index.
        example = details[int(example_index)]
    except Exception as e:
        return f"Error loading model details: {_escape_html(e)}"

    predictions = example['predictions']
    first_prediction = predictions[0] if predictions else None
    prediction = "" if first_prediction is None else str(first_prediction)
    # Count words on the raw text, before escaping alters it.
    word_count = len(prediction.split())

    plain_pre = "<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'>"
    white_box = (
        "<div style='background: #ffffff; padding: 15px; border-radius: 5px; "
        "margin-top: 10px;'>\n"
    )

    parts = [
        "<div style='max-width: 800px; margin: 0 auto;'>\n\n",
        # Ground Truth section
        "<div style='background: #e6f3e6; padding: 20px; border-radius: 10px; "
        "margin-bottom: 20px;'>\n",
        "<h3 style='margin-top: 0;'>Ground Truth</h3>\n",
        "<div style='overflow-x: auto; max-width: 100%;'>\n",
        f"{plain_pre}<code>{_escape_html(example['gold'])}</code></pre>\n",
        "</div>\n",
        "</div>\n",
        # Model output section
        "<div style='background: #f5f5f5; padding: 20px; margin-bottom: 20px; "
        "border-radius: 10px;'>\n",
        f"<h2 style='margin-top: 0;'>{_escape_html(repo_name)}</h2>\n",
        f"<p style='color: #666;'>Split: {_escape_html(split)}</p>\n",
        # Prompt section (collapsed by default)
        "<details style='margin-bottom: 15px;'>\n",
        "<summary><h3 style='display: inline; margin: 0;'>Prompt</h3></summary>\n",
        white_box,
        "<div style='overflow-x: auto;'>\n",
        _render_prompt_html(example['full_prompt']),
        "</div>\n",
        "</div>\n",
        "</details>\n\n",
        # Metrics section
        "<details open style='margin-bottom: 15px;'>\n",
        "<summary><h3 style='display: inline; margin: 0;'>Metrics</h3></summary>\n",
        _render_metrics_html(example['metrics']),
        "</details>\n\n",
        # Prediction section
        "<details open style='margin-bottom: 15px;'>\n",
        "<summary><h3 style='display: inline; margin: 0;'>Prediction</h3>",
        "<span style='color: #666; font-size: 0.8em; margin-left: 10px;'>"
        f"({word_count} words)</span>",
        "</summary>\n",
        white_box,
        "<div style='overflow-x: auto;'>\n",
        f"{plain_pre}<code>{_escape_html(prediction)}</code></pre>\n",
        "</div>\n",
        "</div>\n",
        "</details>\n",
        # Close the model output section and the outer container.
        "</div>\n</div>",
    ]
    return "".join(parts)
129
+
130
# Build the Gradio interface.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Model Generation Details")
    gr.Markdown("View detailed outputs for a specific model")

    with gr.Row():
        repo_select = gr.Radio(
            choices=["Choose from list", "Custom"],
            label="Repository Selection Method",
            value="Choose from list",
            info="Select how you want to specify the repository",
        )

    with gr.Row():
        repo_dropdown = gr.Dropdown(
            choices=REPO_OPTIONS,
            label="Repository Name",
            value=REPO_OPTIONS[0] if REPO_OPTIONS else None,
            visible=True,
            info="Select from predefined repositories",
        )
        repo_custom = gr.Textbox(
            label="Custom Repository Name",
            placeholder="e.g. OpenEvals/details_custom_model_private",
            visible=False,
            info="Enter custom repository name",
        )

    with gr.Row():
        benchmark = gr.Dropdown(
            label="Benchmark",
            choices=BENCHMARKS,
            value=BENCHMARKS[0],
            info="Select the benchmark subset",
        )
        split = gr.Dropdown(
            label="Split",
            choices=[],
            info="Select the evaluation split",
        )

    # NOTE(review): the button is assumed to share the row with the index
    # input; confirm against the intended layout.
    with gr.Row():
        example_index = gr.Number(
            label="Example Index",
            value=0,
            step=1,
            precision=0,  # deliver an int so it can be used as a row index
            info="Navigate through different examples",
        )
        submit_btn = gr.Button("Show Results", variant="primary")

    def toggle_repo_input(choice):
        """Show the repository input that matches the selection method."""
        return {
            repo_dropdown: gr.update(visible=(choice == "Choose from list")),
            repo_custom: gr.update(visible=(choice == "Custom")),
        }

    def get_active_repo(selection_method, dropdown_value, custom_value):
        """Return the repository name from whichever input is active."""
        if selection_method == "Custom":
            return (custom_value or "").strip()
        return dropdown_value

    def refresh_splits(selection_method, dropdown_value, custom_value, bench):
        """Rebuild the split dropdown for the active repository and benchmark."""
        repo = get_active_repo(selection_method, dropdown_value, custom_value)
        if not repo:
            # Nothing to query yet (e.g. the custom repository box is empty).
            return gr.Dropdown(choices=[], value=None)
        return update_splits(repo, bench)

    def show_details(selection_method, dropdown_value, custom_value, bench, split_val, idx):
        """Render the selected example of the active repository."""
        repo = get_active_repo(selection_method, dropdown_value, custom_value)
        return display_model_details(repo, bench, split_val, idx)

    repo_select.change(
        fn=toggle_repo_input,
        inputs=[repo_select],
        outputs=[repo_dropdown, repo_custom],
    )

    # Keep the split choices in sync with the repository and the benchmark.
    # The page-load trigger fills the dropdown for the default selection,
    # which would otherwise stay empty until the benchmark is changed.
    split_inputs = [repo_select, repo_dropdown, repo_custom, benchmark]
    benchmark.change(fn=refresh_splits, inputs=split_inputs, outputs=split)
    repo_dropdown.change(fn=refresh_splits, inputs=split_inputs, outputs=split)
    repo_custom.submit(fn=refresh_splits, inputs=split_inputs, outputs=split)
    demo.load(fn=refresh_splits, inputs=split_inputs, outputs=split)

    # Display results
    output = gr.HTML()
    submit_btn.click(
        fn=show_details,
        inputs=[repo_select, repo_dropdown, repo_custom, benchmark, split, example_index],
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()