AdnanElAssadi committed
Commit 06f87ee · verified · 1 Parent(s): 84410cb

Update app.py

Files changed (1):
  1. app.py +450 -256
app.py CHANGED
@@ -1,256 +1,450 @@
- import gradio as gr
- import json
- import os
- from pathlib import Path
-
- def create_reranking_interface(task_data):
-     """Create a Gradio interface for reranking evaluation."""
-     samples = task_data["samples"]
-     results = {"task_name": task_data["task_name"], "task_type": "reranking", "annotations": []}
-     completed_samples = {s["id"]: False for s in samples}
-
-     def save_ranking(rankings, sample_id):
-         """Save the current set of rankings."""
-         # Check if all documents have rankings
-         all_ranked = all(r is not None and r != "" for r in rankings)
-         if not all_ranked:
-             return "⚠️ Please assign a rank to all documents before submitting", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
-
-         # Convert rankings to integers
-         processed_rankings = [int(r) for r in rankings]
-
-         # Check for duplicate rankings
-         if len(set(processed_rankings)) != len(processed_rankings):
-             return "⚠️ Each document must have a unique rank. Please review your rankings.", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
-
-         # Store this annotation
-         existing_idx = next((i for i, a in enumerate(results["annotations"]) if a["sample_id"] == sample_id), None)
-         if existing_idx is not None:
-             results["annotations"][existing_idx] = {
-                 "sample_id": sample_id,
-                 "rankings": processed_rankings
-             }
-         else:
-             results["annotations"].append({
-                 "sample_id": sample_id,
-                 "rankings": processed_rankings
-             })
-
-         completed_samples[sample_id] = True
-         success_msg = f"✅ Rankings for query '{sample_id}' successfully saved!"
-         progress = f"Progress: {sum(completed_samples.values())}/{len(samples)}"
-
-         # Auto-save results after each submission
-         output_path = f"{task_data['task_name']}_human_results.json"
-         with open(output_path, "w") as f:
-             json.dump(results, f, indent=2)
-
-         return success_msg, progress
-
-     with gr.Blocks(theme=gr.themes.Soft()) as demo:
-         gr.Markdown(f"# {task_data['task_name']} - Human Reranking Evaluation")
-
-         with gr.Accordion("Instructions", open=True):
-             gr.Markdown("""
-             ## Task Instructions
-
-             {instructions}
-
-             ### How to use this interface:
-             1. Read the query at the top
-             2. Review each document carefully
-             3. Assign a rank to each document (1 = most relevant, higher numbers = less relevant)
-             4. Each document must have a unique rank
-             5. Click "Submit Rankings" when you're done with the current query
-             6. Use "Previous" and "Next" to navigate between queries
-             7. Click "Save All Results" periodically to ensure your work is saved
-             """.format(instructions=task_data["instructions"]))
-
-         current_sample_id = gr.State(value=samples[0]["id"])
-
-         with gr.Row():
-             progress_text = gr.Textbox(label="Progress", value=f"Progress: 0/{len(samples)}", interactive=False)
-             status_box = gr.Textbox(label="Status", value="Ready to start evaluation", interactive=False)
-
-         with gr.Group():
-             gr.Markdown("## Query:")
-             query_text = gr.Textbox(value=samples[0]["query"], label="", interactive=False)
-
-         gr.Markdown("## Documents to Rank:")
-
-         # Create document displays and ranking dropdowns in synchronized pairs
-         doc_containers = []
-         ranking_dropdowns = []
-
-         with gr.Column():
-             for i, doc in enumerate(samples[0]["candidates"]):
-                 with gr.Row():
-                     doc_box = gr.Textbox(
-                         value=doc,
-                         label=f"Document {i+1}",
-                         interactive=False
-                     )
-                     dropdown = gr.Dropdown(
-                         choices=[str(j) for j in range(1, len(samples[0]["candidates"])+1)],
-                         label=f"Rank",
-                         value=""
-                     )
-                     doc_containers.append(doc_box)
-                     ranking_dropdowns.append(dropdown)
-
-         with gr.Row():
-             prev_btn = gr.Button("← Previous Query", size="sm")
-             submit_btn = gr.Button("Submit Rankings", size="lg", variant="primary")
-             next_btn = gr.Button("Next Query →", size="sm")
-
-         save_btn = gr.Button("💾 Save All Results", variant="secondary")
-
-         def load_sample(sample_id):
-             """Load a specific sample into the interface."""
-             sample = next((s for s in samples if s["id"] == sample_id), None)
-             if not sample:
-                 return [query_text.value] + [d.value for d in doc_containers] + [""] * len(ranking_dropdowns) + [current_sample_id.value, progress_text.value, status_box.value]
-
-             # Update query
-             new_query = sample["query"]
-
-             # Update documents
-             new_docs = []
-             for i, doc in enumerate(sample["candidates"]):
-                 if i < len(doc_containers):
-                     new_docs.append(doc)
-
-             # Initialize rankings
-             new_rankings = [""] * len(ranking_dropdowns)
-
-             # Check if this sample has already been annotated
-             existing_annotation = next((a for a in results["annotations"] if a["sample_id"] == sample_id), None)
-             if existing_annotation:
-                 # Restore previous rankings
-                 for i, rank in enumerate(existing_annotation["rankings"]):
-                     if i < len(new_rankings) and rank is not None:
-                         new_rankings[i] = str(rank)
-
-             # Update progress
-             current_idx = samples.index(sample)
-             new_progress = f"Progress: {sum(completed_samples.values())}/{len(samples)}"
-
-             new_status = f"Viewing query {current_idx + 1} of {len(samples)}"
-             if completed_samples[sample_id]:
-                 new_status += " (already completed)"
-
-             return [new_query] + new_docs + new_rankings + [sample["id"], new_progress, new_status]
-
-         def next_sample(current_id):
-             """Load the next sample."""
-             current_sample = next((s for s in samples if s["id"] == current_id), None)
-             if not current_sample:
-                 return current_id
-
-             current_idx = samples.index(current_sample)
-             if current_idx < len(samples) - 1:
-                 next_sample = samples[current_idx + 1]
-                 return next_sample["id"]
-             return current_id
-
-         def prev_sample(current_id):
-             """Load the previous sample."""
-             current_sample = next((s for s in samples if s["id"] == current_id), None)
-             if not current_sample:
-                 return current_id
-
-             current_idx = samples.index(current_sample)
-             if current_idx > 0:
-                 prev_sample = samples[current_idx - 1]
-                 return prev_sample["id"]
-             return current_id
-
-         def save_results():
-             """Save all collected results to a file."""
-             output_path = f"{task_data['task_name']}_human_results.json"
-             with open(output_path, "w") as f:
-                 json.dump(results, f, indent=2)
-             return f"✅ Results saved to {output_path} ({len(results['annotations'])} annotations)"
-
-         # Connect events
-         submit_btn.click(
-             save_ranking,
-             inputs=ranking_dropdowns + [current_sample_id],
-             outputs=[status_box, progress_text]
-         )
-
-         next_btn.click(
-             next_sample,
-             inputs=[current_sample_id],
-             outputs=[current_sample_id]
-         ).then(
-             load_sample,
-             inputs=[current_sample_id],
-             outputs=[query_text] + doc_containers + ranking_dropdowns + [current_sample_id, progress_text, status_box]
-         )
-
-         prev_btn.click(
-             prev_sample,
-             inputs=[current_sample_id],
-             outputs=[current_sample_id]
-         ).then(
-             load_sample,
-             inputs=[current_sample_id],
-             outputs=[query_text] + doc_containers + ranking_dropdowns + [current_sample_id, progress_text, status_box]
-         )
-
-         save_btn.click(save_results, outputs=[status_box])
-
-     return demo
-
- # Main app with file upload capability
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
-     gr.Markdown("# MTEB Human Evaluation Demo")
-
-     with gr.Tabs():
-         with gr.TabItem("Demo"):
-             gr.Markdown("""
-             ## Try the MTEB Human Evaluation Interface
-
-             This is a demonstration of the human evaluation interface for MTEB reranking tasks.
-             The example below uses the AskUbuntuDupQuestions dataset.
-             """)
-
-             # Load the example task file
-             with open("AskUbuntuDupQuestions_human_eval.json", "r") as f:
-                 example_data = json.load(f)
-
-             # Display a demo with the example data
-             reranking_demo = create_reranking_interface(example_data)
-
-         with gr.TabItem("Upload & Evaluate"):
-             gr.Markdown("""
-             ## Upload Your Own Task File
-
-             If you have a prepared task file, you can upload it here to try out the evaluation interface.
-             """)
-
-             file_input = gr.File(label="Upload a task file (JSON)")
-             load_btn = gr.Button("Load Task")
-             message = gr.Textbox(label="Status")
-             task_container = gr.HTML()
-
-             def load_custom_task(file):
-                 if not file:
-                     return "Please upload a task file"
-
-                 try:
-                     with open(file.name, "r") as f:
-                         task_data = json.load(f)
-
-                     task_interface = create_reranking_interface(task_data)
-                     # This is a placeholder - in Gradio you can't dynamically create interfaces this way
-                     # You would need a different approach for a real implementation
-                     return f"Task '{task_data['task_name']}' loaded with {len(task_data['samples'])} samples"
-                 except Exception as e:
-                     return f"Error loading task file: {str(e)}"
-
-             load_btn.click(load_custom_task, inputs=[file_input], outputs=[message])
-
- if __name__ == "__main__":
-     demo.launch()
+ import gradio as gr
+ import json
+ import os
+ from pathlib import Path
+
+ def create_reranking_interface(task_data):
+     """Create a Gradio interface for reranking evaluation."""
+     samples = task_data["samples"]
+     results = {"task_name": task_data["task_name"], "task_type": "reranking", "annotations": []}
+     completed_samples = {s["id"]: False for s in samples}
+
+     def save_ranking(rankings, sample_id):
+         """Save the current set of rankings."""
+         # Check if all documents have rankings
+         all_ranked = all(r is not None and r != "" for r in rankings)
+         if not all_ranked:
+             return "⚠️ Please assign a rank to all documents before submitting", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
+
+         # Convert rankings to integers
+         processed_rankings = [int(r) for r in rankings]
+
+         # Check for duplicate rankings
+         if len(set(processed_rankings)) != len(processed_rankings):
+             return "⚠️ Each document must have a unique rank. Please review your rankings.", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
+
+         # Store this annotation
+         existing_idx = next((i for i, a in enumerate(results["annotations"]) if a["sample_id"] == sample_id), None)
+         if existing_idx is not None:
+             results["annotations"][existing_idx] = {
+                 "sample_id": sample_id,
+                 "rankings": processed_rankings
+             }
+         else:
+             results["annotations"].append({
+                 "sample_id": sample_id,
+                 "rankings": processed_rankings
+             })
+
+         completed_samples[sample_id] = True
+         success_msg = f"✅ Rankings for query '{sample_id}' successfully saved!"
+         progress = f"Progress: {sum(completed_samples.values())}/{len(samples)}"
+
+         # Auto-save results after each submission
+         output_path = f"{task_data['task_name']}_human_results.json"
+         with open(output_path, "w") as f:
+             json.dump(results, f, indent=2)
+
+         return success_msg, progress
+
+     with gr.Blocks(theme=gr.themes.Soft()) as demo:
+         gr.Markdown(f"# {task_data['task_name']} - Human Reranking Evaluation")
+
+         with gr.Accordion("Instructions", open=True):
+             gr.Markdown("""
+             ## Task Instructions
+
+             {instructions}
+
+             ### How to use this interface:
+             1. Read the query at the top
+             2. Review each document carefully
+             3. Assign a rank to each document (1 = most relevant, higher numbers = less relevant)
+             4. Each document must have a unique rank
+             5. Click "Submit Rankings" when you're done with the current query
+             6. Use "Previous" and "Next" to navigate between queries
+             7. Click "Save All Results" periodically to ensure your work is saved
+             """.format(instructions=task_data["instructions"]))
+
+         current_sample_id = gr.State(value=samples[0]["id"])
+
+         with gr.Row():
+             progress_text = gr.Textbox(label="Progress", value=f"Progress: 0/{len(samples)}", interactive=False)
+             status_box = gr.Textbox(label="Status", value="Ready to start evaluation", interactive=False)
+
+         with gr.Group():
+             gr.Markdown("## Query:")
+             query_text = gr.Textbox(value=samples[0]["query"], label="", interactive=False)
+
+         gr.Markdown("## Documents to Rank:")
+
+         # Create document displays and ranking dropdowns in synchronized pairs
+         doc_containers = []
+         ranking_dropdowns = []
+
+         with gr.Column():
+             for i, doc in enumerate(samples[0]["candidates"]):
+                 with gr.Row():
+                     doc_box = gr.Textbox(
+                         value=doc,
+                         label=f"Document {i+1}",
+                         interactive=False
+                     )
+                     dropdown = gr.Dropdown(
+                         choices=[str(j) for j in range(1, len(samples[0]["candidates"])+1)],
+                         label=f"Rank",
+                         value=""
+                     )
+                     doc_containers.append(doc_box)
+                     ranking_dropdowns.append(dropdown)
+
+         with gr.Row():
+             prev_btn = gr.Button("← Previous Query", size="sm")
+             submit_btn = gr.Button("Submit Rankings", size="lg", variant="primary")
+             next_btn = gr.Button("Next Query →", size="sm")
+
+         save_btn = gr.Button("💾 Save All Results", variant="secondary")
+
+         def load_sample(sample_id):
+             """Load a specific sample into the interface."""
+             sample = next((s for s in samples if s["id"] == sample_id), None)
+             if not sample:
+                 return [query_text.value] + [d.value for d in doc_containers] + [""] * len(ranking_dropdowns) + [current_sample_id.value, progress_text.value, status_box.value]
+
+             # Update query
+             new_query = sample["query"]
+
+             # Update documents
+             new_docs = []
+             for i, doc in enumerate(sample["candidates"]):
+                 if i < len(doc_containers):
+                     new_docs.append(doc)
+
+             # Initialize rankings
+             new_rankings = [""] * len(ranking_dropdowns)
+
+             # Check if this sample has already been annotated
+             existing_annotation = next((a for a in results["annotations"] if a["sample_id"] == sample_id), None)
+             if existing_annotation:
+                 # Restore previous rankings
+                 for i, rank in enumerate(existing_annotation["rankings"]):
+                     if i < len(new_rankings) and rank is not None:
+                         new_rankings[i] = str(rank)
+
+             # Update progress
+             current_idx = samples.index(sample)
+             new_progress = f"Progress: {sum(completed_samples.values())}/{len(samples)}"
+
+             new_status = f"Viewing query {current_idx + 1} of {len(samples)}"
+             if completed_samples[sample_id]:
+                 new_status += " (already completed)"
+
+             return [new_query] + new_docs + new_rankings + [sample["id"], new_progress, new_status]
+
+         def next_sample(current_id):
+             """Load the next sample."""
+             current_sample = next((s for s in samples if s["id"] == current_id), None)
+             if not current_sample:
+                 return current_id
+
+             current_idx = samples.index(current_sample)
+             if current_idx < len(samples) - 1:
+                 next_sample = samples[current_idx + 1]
+                 return next_sample["id"]
+             return current_id
+
+         def prev_sample(current_id):
+             """Load the previous sample."""
+             current_sample = next((s for s in samples if s["id"] == current_id), None)
+             if not current_sample:
+                 return current_id
+
+             current_idx = samples.index(current_sample)
+             if current_idx > 0:
+                 prev_sample = samples[current_idx - 1]
+                 return prev_sample["id"]
+             return current_id
+
+         def save_results():
+             """Save all collected results to a file."""
+             output_path = f"{task_data['task_name']}_human_results.json"
+             with open(output_path, "w") as f:
+                 json.dump(results, f, indent=2)
+             return f"✅ Results saved to {output_path} ({len(results['annotations'])} annotations)"
+
+         # Connect events
+         submit_btn.click(
+             save_ranking,
+             inputs=ranking_dropdowns + [current_sample_id],
+             outputs=[status_box, progress_text]
+         )
+
+         next_btn.click(
+             next_sample,
+             inputs=[current_sample_id],
+             outputs=[current_sample_id]
+         ).then(
+             load_sample,
+             inputs=[current_sample_id],
+             outputs=[query_text] + doc_containers + ranking_dropdowns + [current_sample_id, progress_text, status_box]
+         )
+
+         prev_btn.click(
+             prev_sample,
+             inputs=[current_sample_id],
+             outputs=[current_sample_id]
+         ).then(
+             load_sample,
+             inputs=[current_sample_id],
+             outputs=[query_text] + doc_containers + ranking_dropdowns + [current_sample_id, progress_text, status_box]
+         )
+
+         save_btn.click(save_results, outputs=[status_box])
+
+     return demo
+
+ # Main app with file upload capability
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# MTEB Human Evaluation Demo")
+
+     with gr.Tabs():
+         with gr.TabItem("Demo"):
+             gr.Markdown("""
+             ## MTEB Human Evaluation Interface
+
+             This interface allows you to evaluate the relevance of documents for reranking tasks.
+             """)
+
+             # Function to get the most recent task file
+             def get_latest_task_file():
+                 # Check first in uploaded_tasks directory
+                 os.makedirs("uploaded_tasks", exist_ok=True)
+                 uploaded_tasks = [f for f in os.listdir("uploaded_tasks") if f.endswith(".json")]
+
+                 if uploaded_tasks:
+                     # Sort by modification time, newest first
+                     uploaded_tasks.sort(key=lambda x: os.path.getmtime(os.path.join("uploaded_tasks", x)), reverse=True)
+                     return os.path.join("uploaded_tasks", uploaded_tasks[0])
+
+                 # Fall back to default example
+                 return "AskUbuntuDupQuestions_human_eval.json"
+
+             # Load the task file
+             task_file = get_latest_task_file()
+
+             try:
+                 with open(task_file, "r") as f:
+                     task_data = json.load(f)
+
+                 # Show which task is currently loaded
+                 gr.Markdown(f"**Current Task: {task_data['task_name']}** ({len(task_data['samples'])} samples)")
+
+                 # Display the interface
+                 reranking_demo = create_reranking_interface(task_data)
+             except Exception as e:
+                 gr.Markdown(f"**Error loading task: {str(e)}**")
+                 gr.Markdown("Please upload a valid task file in the 'Upload & Evaluate' tab.")
+
+         with gr.TabItem("Upload & Evaluate"):
+             gr.Markdown("""
+             ## Upload Your Own Task File
+
+             If you have a prepared task file, you can upload it here to create an evaluation interface.
+             """)
+
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     file_input = gr.File(label="Upload a task file (JSON)")
+                     load_btn = gr.Button("Load Task")
+                     message = gr.Textbox(label="Status", interactive=False)
+
+                     # Add task list for previously uploaded tasks
+                     gr.Markdown("### Previous Uploads")
+
+                     # Function to list existing task files in the tasks directory
+                     def list_task_files():
+                         os.makedirs("uploaded_tasks", exist_ok=True)
+                         tasks = [f for f in os.listdir("uploaded_tasks") if f.endswith(".json")]
+                         if not tasks:
+                             return "No task files uploaded yet."
+                         return "\n".join([f"- [{t}](javascript:selectTask('{t}'))" for t in tasks])
+
+                     task_list = gr.Markdown(list_task_files())
+                     refresh_btn = gr.Button("Refresh List")
+
+                     # Add results management section
+                     gr.Markdown("### Results Management")
+
+                     # Function to list existing result files
+                     def list_result_files():
+                         results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
+                         if not results:
+                             return "No result files available yet."
+
+                         result_links = []
+                         for r in results:
+                             # Calculate completion stats
+                             try:
+                                 with open(r, "r") as f:
+                                     result_data = json.load(f)
+                                 annotation_count = len(result_data.get("annotations", []))
+                                 task_name = result_data.get("task_name", "Unknown")
+                                 result_links.append(f"- {r} ({annotation_count} annotations for {task_name})")
+                             except:
+                                 result_links.append(f"- {r}")
+
+                         return "\n".join(result_links)
+
+                     results_list = gr.Markdown(list_result_files())
+                     download_results_btn = gr.Button("Download Results")
+
+                 # Right side - will contain the actual interface
+                 with gr.Column(scale=2):
+                     task_container = gr.HTML()
+
+             # Handle file upload and storage
+             def handle_upload(file):
+                 if not file:
+                     return "Please upload a task file", task_list.value, task_container.value
+
+                 try:
+                     # Create directory if it doesn't exist
+                     os.makedirs("uploaded_tasks", exist_ok=True)
+
+                     # Read the uploaded file
+                     with open(file.name, "r") as f:
+                         task_data = json.load(f)
+
+                     # Validate task format
+                     if "task_name" not in task_data or "samples" not in task_data:
+                         return "Invalid task file format. Must contain 'task_name' and 'samples' fields.", task_list.value, task_container.value
+
+                     # Save to a consistent location
+                     task_filename = f"uploaded_tasks/{task_data['task_name']}_task.json"
+                     with open(task_filename, "w") as f:
+                         json.dump(task_data, f, indent=2)
+
+                     # Instead of trying to create the interface here,
+                     # we'll return a message with instructions
+                     return f"Task '{task_data['task_name']}' uploaded successfully with {len(task_data['samples'])} samples. Please refresh the app and use the Demo tab to evaluate it.", list_task_files(), f"""
+                     <div style="padding: 20px; background-color: #f0f0f0; border-radius: 10px;">
+                         <h3>Task uploaded successfully!</h3>
+                         <p>Task Name: {task_data['task_name']}</p>
+                         <p>Samples: {len(task_data['samples'])}</p>
+                         <p>To evaluate this task:</p>
+                         <ol>
+                             <li>Refresh the app</li>
+                             <li>The Demo tab will now use your uploaded task</li>
+                             <li>Complete your evaluations</li>
+                             <li>Results will be saved as {task_data['task_name']}_human_results.json</li>
+                         </ol>
+                     </div>
+                     """
+                 except Exception as e:
+                     return f"Error processing task file: {str(e)}", task_list.value, task_container.value
+
+             # Function to prepare results for download
+             def prepare_results_for_download():
+                 results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
+                 if not results:
+                     return None
+
+                 # Create a zip file with all results
+                 import zipfile
+                 zip_path = "mteb_human_eval_results.zip"
+                 with zipfile.ZipFile(zip_path, 'w') as zipf:
+                     for r in results:
+                         zipf.write(r)
+
+                 return zip_path
+
+             # Connect events
+             load_btn.click(handle_upload, inputs=[file_input], outputs=[message, task_list, task_container])
+             refresh_btn.click(list_task_files, outputs=[task_list])
+             download_results_btn.click(prepare_results_for_download, outputs=[gr.File(label="Download Results")])
+
+         with gr.TabItem("Results Management"):
+             gr.Markdown("""
+             ## Manage Evaluation Results
+
+             View, download, and analyze your evaluation results.
+             """)
+
+             # Function to load and display result stats
+             def get_result_stats():
+                 results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
+                 if not results:
+                     return "No result files available yet."
+
+                 stats = []
+                 for r in results:
+                     try:
+                         with open(r, "r") as f:
+                             result_data = json.load(f)
+
+                         task_name = result_data.get("task_name", "Unknown")
+                         annotations = result_data.get("annotations", [])
+                         annotation_count = len(annotations)
+
+                         # Calculate completion percentage
+                         sample_ids = set(a.get("sample_id") for a in annotations)
+
+                         # Try to get the total sample count from the corresponding task file
+                         total_samples = 0
+                         task_file = f"uploaded_tasks/{task_name}_task.json"
+                         if os.path.exists(task_file):
+                             with open(task_file, "r") as f:
+                                 task_data = json.load(f)
+                             total_samples = len(task_data.get("samples", []))
+
+                         completion = f"{len(sample_ids)}/{total_samples}" if total_samples else f"{len(sample_ids)} samples"
+
+                         stats.append(f"### {task_name}\n- Annotations: {annotation_count}\n- Completion: {completion}\n- File: {r}")
+                     except Exception as e:
+                         stats.append(f"### {r}\n- Error loading results: {str(e)}")
+
+                 return "\n\n".join(stats)
+
+             result_stats = gr.Markdown(get_result_stats())
+             refresh_results_btn = gr.Button("Refresh Results")
+
+             # Add download options
+             with gr.Row():
+                 download_all_btn = gr.Button("Download All Results (ZIP)")
+                 result_select = gr.Dropdown(choices=[f for f in os.listdir(".") if f.endswith("_human_results.json")], label="Select Result to Download")
+                 download_selected_btn = gr.Button("Download Selected")
+
+             # Add results visualization placeholder
+             gr.Markdown("### Results Visualization")
+             gr.Markdown("*Visualization features will be added in a future update.*")
+
+             # Connect events
+             refresh_results_btn.click(get_result_stats, outputs=[result_stats])
+
+             # Function to prepare all results for download as ZIP
+             def prepare_all_results():
+                 import zipfile
+                 zip_path = "mteb_human_eval_results.zip"
+                 with zipfile.ZipFile(zip_path, 'w') as zipf:
+                     for r in [f for f in os.listdir(".") if f.endswith("_human_results.json")]:
+                         zipf.write(r)
+                 return zip_path
+
+             # Function to return a single result file
+             def get_selected_result(filename):
+                 if not filename:
+                     return None
+                 if os.path.exists(filename):
+                     return filename
+                 return None
+
+             # Update dropdown when refreshing results
+             def update_result_dropdown():
+                 return gr.Dropdown.update(choices=[f for f in os.listdir(".") if f.endswith("_human_results.json")])
+
+             refresh_results_btn.click(update_result_dropdown, outputs=[result_select])
+             download_all_btn.click(prepare_all_results, outputs=[gr.File(label="Download All Results")])
+             download_selected_btn.click(get_selected_result, inputs=[result_select], outputs=[gr.File(label="Download Selected Result")])
+
+ if __name__ == "__main__":
+     demo.launch()
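
For reference, the data contract between task files and this app is implicit in the diff above: `create_reranking_interface` reads `task_name`, `instructions`, and `samples` (each sample carrying an `id`, a `query`, and a list of `candidates`); `handle_upload` stores uploads as `uploaded_tasks/{task_name}_task.json`, which `get_latest_task_file` then prefers over the bundled example; and `save_ranking` auto-saves annotations to `{task_name}_human_results.json`. The sketch below is not part of the commit; it is a minimal illustration of that contract, with field names taken from the keys `app.py` actually accesses and all sample values invented.

```python
import json
import os

# Minimal task file matching the fields app.py reads; values are made up.
task = {
    "task_name": "AskUbuntuDupQuestions",
    "instructions": "Rank each candidate by how relevant it is to the query.",
    "samples": [
        {
            "id": "sample_1",
            "query": "How do I install a .deb package?",
            "candidates": [
                "Installing .deb files from the terminal",
                "How do I upgrade my kernel?",
                "dpkg vs apt for local packages",
            ],
        },
    ],
}

# handle_upload() saves uploads here, and get_latest_task_file() loads the
# newest .json from this directory before falling back to the bundled example.
os.makedirs("uploaded_tasks", exist_ok=True)
with open("uploaded_tasks/AskUbuntuDupQuestions_task.json", "w") as f:
    json.dump(task, f, indent=2)

# Shape of the output save_ranking() auto-saves to
# {task_name}_human_results.json: rankings[i] is the rank the annotator
# assigned to candidates[i], with 1 = most relevant.
expected_results = {
    "task_name": "AskUbuntuDupQuestions",
    "task_type": "reranking",
    "annotations": [
        {"sample_id": "sample_1", "rankings": [1, 3, 2]},
    ],
}
```

With a file like this in `uploaded_tasks/`, the Demo tab picks it up on the next app restart, which is the workflow the upload handler's success message describes.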