Update app.py
app.py CHANGED
@@ -78,9 +78,19 @@ def create_reranking_interface(task_data):
            return f"Error: {str(e)}", f"Progress: {sum(completed_samples.values())}/{len(samples)}"

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
-

-
        gr.Markdown("""
        ## Task Instructions

@@ -96,77 +106,180 @@ def create_reranking_interface(task_data):
            7. Your rankings are automatically saved when you submit or navigate
            """.format(instructions=task_data.get("instructions", "Rank documents by their relevance to the query.")))

        current_sample_id = gr.State(value=samples[0]["id"])
        auto_save_enabled = gr.State(value=True)

-
-
-
-

        with gr.Group():
-
-

-

-           #
            doc_containers = []
            ranking_inputs = []
            validation_indicators = []

-
-
-           with gr.Row():
-               gr.Markdown("### Quick Ranking Options:")
-               sequential_btn = gr.Button("Rank in Order (1,2,3...)")
-               reverse_btn = gr.Button("Reverse Order (n,n-1,...)")
-               clear_btn = gr.Button("Clear All Rankings")
-
-           # Document display with better UI for ranking
            for i, doc in enumerate(samples[0]["candidates"]):
-
-
                doc_box = gr.Textbox(
                    value=doc,
                    label=f"Document {i+1}",
-                   interactive=False
                )
                doc_containers.append(doc_box)

-               with gr.Column(scale=
-               #
                rank_input = gr.Dropdown(
                    choices=[str(j) for j in range(1, len(samples[0]["candidates"])+1)],
                    label=f"Rank",
-                   value=""
                )
                ranking_inputs.append(rank_input)

-
-
                validation = gr.HTML(value="")
                validation_indicators.append(validation)

-
        prev_btn = gr.Button("← Previous Query", size="sm")
        submit_btn = gr.Button("Submit Rankings", size="lg", variant="primary")
        next_btn = gr.Button("Next Query →", size="sm")

        with gr.Row():
-           save_btn = gr.Button("💾 Save All Results", variant="secondary")
            results_info = gr.HTML(value=f"<p>Results will be saved to <code>{task_data['task_name']}_human_results.json</code></p>")

        def validate_rankings(*rankings):
-           """Validate rankings and update indicators."""
            results = []
            all_valid = True
            for rank in rankings:
                if rank is None or rank == "":
-                   results.append("
                    all_valid = False
                else:
-                   results.append("

            return results + [all_valid] # Return validation indicators and validity flag

@@ -284,6 +397,7 @@ def create_reranking_interface(task_data):

        # Define a function that collects all ranking values and validates them
        def submit_rankings(*args):
            # Get the last argument (sample_id) and the rankings
            if len(args) < 1:
                return "Error: No arguments provided", progress_text.value
@@ -305,14 +419,84 @@ def create_reranking_interface(task_data):
                if i < len(validation_indicators):
                    validation_indicators[i].update(value=result)

            # If not all valid, return error message
            if not all_valid:
                return "⚠️ Please assign a rank to all documents before submitting", progress_text.value

            # Save the validated rankings
            status, progress = save_ranking(rankings, sample_id)
            return status, progress

        # Wire up events (Gradio 3.x syntax)
        submit_btn.click(
            fn=submit_rankings,
@@ -389,6 +573,190 @@ def create_reranking_interface(task_data):
            inputs=[auto_save_toggle],
            outputs=[auto_save_enabled]
        )

    return demo

@@ -400,6 +768,45 @@ def create_main_app():
        task_container = gr.HTML()
        loaded_task_info = gr.JSON(label="Loaded Task Information", visible=False)

        tabs = gr.Tabs()

        with tabs:
@@ -408,7 +815,7 @@ def create_main_app():
                ## MTEB Human Evaluation Interface

                This interface allows you to evaluate the relevance of documents for reranking tasks.
-               """)

                # Function to get the most recent task file
                def get_latest_task_file():
@@ -447,76 +854,80 @@ def create_main_app():
                # Load the task file
                task_file = get_latest_task_file()

-
-
-
-
-
-
-
-
-
-
-
-
-
                gr.Markdown("Please upload a valid task file in the 'Upload & Evaluate' tab.")
-               else:
-                   gr.Markdown("**No task file found**")
-                   gr.Markdown("Please upload a valid task file in the 'Upload & Evaluate' tab.")

            with gr.TabItem("Upload & Evaluate"):
                gr.Markdown("""
                ## Upload Your Own Task File

                If you have a prepared task file, you can upload it here to create an evaluation interface.
-               """)

                with gr.Row():
                    with gr.Column(scale=1):
-
-
-

                        # Add task list for previously uploaded tasks
-                       gr.
-
-
-
-
-
-
-
-
-
-
-

                        # Add results management section
-                       gr.
-
-                       # Function to list existing result files
-                       def list_result_files():
-                           results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
-                           if not results:
-                               return "No result files available yet."

-
-
-
-
-
-
-
-
-
-
-

-
-
-                       results_list = gr.Markdown(list_result_files())
-                       download_results_btn = gr.Button("Download Results")

                # Handle file upload and storage
                def handle_upload(file):
@@ -540,8 +951,8 @@ def create_main_app():
                        with open(task_filename, "w") as f:
                            json.dump(task_data, f, indent=2)

-                       return f"Task '{task_data['task_name']}' uploaded successfully with {len(task_data['samples'])} samples. Please refresh the app and use the Demo tab to evaluate it.", list_task_files(), f"""
-                       <div
                        <h3>Task uploaded successfully!</h3>
                        <p>Task Name: {task_data['task_name']}</p>
                        <p>Samples: {len(task_data['samples'])}</p>
@@ -555,7 +966,7 @@ def create_main_app():
                        </div>
                        """
                    except Exception as e:
-                       return f"Error processing task file: {str(e)}", task_list.value, ""

                # Function to prepare results for download
                def prepare_results_for_download():
@@ -596,7 +1007,7 @@ def create_main_app():
                ## Manage Evaluation Results

                View, download, and analyze your evaluation results.
-               """)

                # Function to load and display result stats
                def get_result_stats():
@@ -642,14 +1053,17 @@ def create_main_app():

                    return "\n\n".join(stats)

-
-

                # Add download options
-               with gr.
-
-
-

                # Function to prepare all results for download as ZIP
                def prepare_all_results():
After the change, the affected regions of app.py read as follows.

            return f"Error: {str(e)}", f"Progress: {sum(completed_samples.values())}/{len(samples)}"

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
+       # Header section with title and progress indicators
+       with gr.Row(equal_height=True):
+           with gr.Column(scale=3):
+               gr.Markdown(f"# {task_data['task_name']} - Human Reranking Evaluation")
+           with gr.Column(scale=1):
+               progress_text = gr.Textbox(
+                   label="Progress",
+                   value=f"Progress: 0/{len(samples)}",
+                   interactive=False
+               )

+       # Instructions in a collapsible section
+       with gr.Accordion("📋 Task Instructions", open=False):
            gr.Markdown("""
            ## Task Instructions

            7. Your rankings are automatically saved when you submit or navigate
            """.format(instructions=task_data.get("instructions", "Rank documents by their relevance to the query.")))

+       # Hidden state variables
        current_sample_id = gr.State(value=samples[0]["id"])
        auto_save_enabled = gr.State(value=True)

+       # Status and control section
+       with gr.Row(equal_height=True):
+           with gr.Column(scale=3):
+               status_box = gr.Textbox(
+                   label="Status",
+                   value="Ready to start evaluation",
+                   interactive=False
+               )
+           with gr.Column(scale=1):
+               auto_save_toggle = gr.Checkbox(
+                   label="Auto-save when navigating",
+                   value=True
+               )

+       # Main content area
        with gr.Group():
+           # Query section with clear visual distinction
+           with gr.Box():
+               gr.Markdown("## 📝 Query")
+               query_text = gr.Textbox(
+                   value=samples[0]["query"],
+                   label="",
+                   interactive=False,
+                   elem_classes=["query-text"]
+               )
+
+           # Quick ranking tools in a nicely formatted bar
+           with gr.Row(equal_height=True):
+               gr.Markdown("### 🔄 Quick Ranking Tools:", elem_classes=["tool-heading"])
+               sequential_btn = gr.Button("Rank in Order (1,2,3...)", elem_classes=["tool-button"])
+               reverse_btn = gr.Button("Reverse Order (n,n-1,...)", elem_classes=["tool-button"])
+               clear_btn = gr.Button("Clear All Rankings", elem_classes=["tool-button"])

+           # Documents section with improved layout
+           gr.Markdown("## 📄 Documents to Rank")

+           # Container for documents and rankings
            doc_containers = []
            ranking_inputs = []
            validation_indicators = []

+           # Create a better visual layout for documents
+           with gr.Box():
                for i, doc in enumerate(samples[0]["candidates"]):
+                   row_class = "document-row-even" if i % 2 == 0 else "document-row-odd"
+                   with gr.Row(equal_height=True, elem_classes=["document-row", row_class]):
+                       with gr.Column(scale=1, min_width=50):
+                           gr.HTML(f"<div class='doc-number'>{i+1}</div>")
+
+                       with gr.Column(scale=6):
                            doc_box = gr.Textbox(
                                value=doc,
                                label=f"Document {i+1}",
+                               interactive=False,
+                               elem_classes=["document-text"]
                            )
                            doc_containers.append(doc_box)

+                       with gr.Column(scale=2):
+                           # Dropdown for ranking
                            rank_input = gr.Dropdown(
                                choices=[str(j) for j in range(1, len(samples[0]["candidates"])+1)],
                                label=f"Rank",
+                               value="",
+                               elem_classes=["rank-dropdown"]
                            )
                            ranking_inputs.append(rank_input)

+                       with gr.Column(scale=2):
+                           # Validation indicator
                            validation = gr.HTML(value="")
                            validation_indicators.append(validation)

+       # Navigation and submission controls
+       with gr.Row(equal_height=True):
            prev_btn = gr.Button("← Previous Query", size="sm")
            submit_btn = gr.Button("Submit Rankings", size="lg", variant="primary")
            next_btn = gr.Button("Next Query →", size="sm")

+       # Save results button
        with gr.Row():
+           save_btn = gr.Button("💾 Save All Results", variant="secondary", size="sm")
            results_info = gr.HTML(value=f"<p>Results will be saved to <code>{task_data['task_name']}_human_results.json</code></p>")

+       # CSS for styling
+       gr.HTML("""
+       <style>
+       .query-text textarea {
+           font-size: 16px !important;
+           font-weight: bold !important;
+           background-color: #f8f9fa !important;
+           border-left: 4px solid #2c7be5 !important;
+           padding-left: 10px !important;
+       }
+
+       .document-row {
+           border-bottom: 1px solid #e0e0e0;
+           padding: 10px 0;
+           margin-bottom: 5px !important;
+       }
+
+       .document-text textarea {
+           font-size: 14px !important;
+           line-height: 1.5 !important;
+       }
+
+       .rank-dropdown select {
+           font-weight: bold !important;
+           text-align: center !important;
+       }
+
+       .tool-button button {
+           min-width: 120px !important;
+       }
+
+       .tool-heading {
+           padding-top: 8px !important;
+       }
+
+       .document-row-even {
+           background-color: #f8f9fa;
+       }
+
+       .document-row-odd {
+           background-color: #ffffff;
+       }
+
+       .document-row:hover {
+           background-color: #e9ecef;
+       }
+
+       .doc-number {
+           display: flex;
+           align-items: center;
+           justify-content: center;
+           width: 30px;
+           height: 30px;
+           border-radius: 50%;
+           background-color: #2c7be5;
+           color: white;
+           font-weight: bold;
+           margin: 0 auto;
+       }
+
+       .rank-dropdown select {
+           font-weight: bold !important;
+           font-size: 16px !important;
+           text-align: center !important;
+           padding: 8px !important;
+           border-radius: 5px !important;
+           border: 2px solid #2c7be5 !important;
+       }
+
+       .rank-dropdown select:focus {
+           border-color: #007bff !important;
+           box-shadow: 0 0 0 0.2rem rgba(0, 123, 255, 0.25) !important;
+       }
+       </style>
+       """)
+
        def validate_rankings(*rankings):
+           """Validate rankings and update indicators with visual cues."""
            results = []
            all_valid = True
            for rank in rankings:
                if rank is None or rank == "":
+                   results.append('<span style="color: #dc3545; font-weight: bold;">⚠️ Missing</span>')
                    all_valid = False
                else:
+                   results.append('<span style="color: #28a745; font-weight: bold;">✓ Rank ' + str(rank) + '</span>')

            return results + [all_valid] # Return validation indicators and validity flag

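Note: validate_rankings packs the per-document HTML indicators and a trailing boolean validity flag into a single list. A minimal sketch (not part of the diff) of how a caller can split that convention:

    def split_validation(results_plus_flag):
        # Last element is the validity flag, the rest are HTML snippets.
        *indicators, all_valid = results_plus_flag
        return indicators, all_valid

    indicators, all_valid = split_validation(["✓ Rank 1", "⚠️ Missing", False])
    assert all_valid is False and len(indicators) == 2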

        # Define a function that collects all ranking values and validates them
        def submit_rankings(*args):
+           """Submit rankings with improved validation and user feedback."""
            # Get the last argument (sample_id) and the rankings
            if len(args) < 1:
                return "Error: No arguments provided", progress_text.value

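submit_rankings receives the ranking dropdown values followed by the sample id as the final positional argument, the same convention rank_by_relevance uses further down. A small illustrative sketch of that argument split, not taken from app.py:

    def split_ranking_args(*args):
        # Rankings come first; the sample id is appended as the last input.
        sample_id = args[-1]
        rankings = args[:-1]
        return rankings, sample_id

    print(split_ranking_args("1", "2", "3", "sample-0"))  # (('1', '2', '3'), 'sample-0')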
                if i < len(validation_indicators):
                    validation_indicators[i].update(value=result)

+           # Check for duplicate rankings
+           if all_valid:
+               try:
+                   processed_rankings = [int(r) for r in rankings]
+                   if len(set(processed_rankings)) != len(processed_rankings):
+                       dup_ranks = {}
+                       for i, r in enumerate(processed_rankings):
+                           if r in dup_ranks:
+                               dup_ranks[r].append(i)
+                           else:
+                               dup_ranks[r] = [i]
+
+                       # Highlight duplicates with error styling
+                       for rank, indices in dup_ranks.items():
+                           if len(indices) > 1:
+                               for idx in indices:
+                                   if idx < len(validation_indicators):
+                                       validation_indicators[idx].update(
+                                           value=f'<span style="color: #dc3545; font-weight: bold;">⚠️ Duplicate rank {rank}</span>'
+                                       )
+
+                       return "⚠️ Each document must have a unique rank. Please fix duplicate rankings.", progress_text.value
+               except:
+                   pass
+
            # If not all valid, return error message
            if not all_valid:
                return "⚠️ Please assign a rank to all documents before submitting", progress_text.value

            # Save the validated rankings
            status, progress = save_ranking(rankings, sample_id)
+
+           # Provide clear success feedback
+           if "✅" in status:
+               for i in range(len(validation_indicators)):
+                   validation_indicators[i].update(
+                       value=f'<span style="color: #28a745; font-weight: bold;">✓ Saved</span>'
+                   )
+
            return status, progress

+       # Update ranking input's event handling for immediate validation
+       def on_ranking_change(*rankings):
+           """Validate rankings whenever any ranking dropdown changes."""
+           validation_results = validate_rankings(*rankings)
+           return validation_results[:-1] # Return only the validation indicators
+
+       # Check for overlapping ranks and duplicate assignments
+       def check_for_duplicates(*rankings):
+           """Highlight duplicate rankings with visual feedback."""
+           clean_rankings = []
+           for r in rankings:
+               if r is not None and r != "":
+                   clean_rankings.append(int(r))
+
+           if len(clean_rankings) != len(set(clean_rankings)):
+               used_ranks = {}
+               for i, r in enumerate(rankings):
+                   if r is not None and r != "":
+                       rank = int(r)
+                       if rank in used_ranks:
+                           used_ranks[rank].append(i)
+                       else:
+                           used_ranks[rank] = [i]
+
+               results = []
+               for i, r in enumerate(rankings):
+                   if r is not None and r != "":
+                       rank = int(r)
+                       if len(used_ranks[rank]) > 1:
+                           results.append(f'<span style="color: #dc3545; font-weight: bold;">⚠️ Duplicate rank {rank}</span>')
+                       else:
+                           results.append(f'<span style="color: #28a745; font-weight: bold;">✓ Rank {rank}</span>')
+                   else:
+                       results.append('<span style="color: #dc3545; font-weight: bold;">⚠️ Missing</span>')
+
+               return results
+
        # Wire up events (Gradio 3.x syntax)
        submit_btn.click(
            fn=submit_rankings,

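The duplicate check above builds the rank-to-positions mapping by hand. For reference only, a shorter equivalent using collections.Counter; this is an alternative sketch, not what the app ships:

    from collections import Counter

    def duplicate_positions(rankings):
        # Positions of every ranking value that occurs more than once.
        ranks = [int(r) for r in rankings if r not in (None, "")]
        repeated = {r for r, n in Counter(ranks).items() if n > 1}
        return [i for i, r in enumerate(rankings)
                if r not in (None, "") and int(r) in repeated]

    print(duplicate_positions(["1", "2", "2", "", "3"]))  # [1, 2]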
            inputs=[auto_save_toggle],
            outputs=[auto_save_enabled]
        )
+
+       # Connect validation to ranking inputs for real-time feedback
+       for i, ranking in enumerate(ranking_inputs):
+           ranking.change(
+               fn=on_ranking_change,
+               inputs=ranking_inputs,
+               outputs=validation_indicators
+           )
+
+       # Add a real-time validation for the entire set to check for duplicates
+       def validate_all_inputs(*rankings):
+           """Check all inputs for duplicate ranks and provide feedback."""
+           validation_results = validate_rankings(*rankings)
+           all_valid = validation_results[-1]
+           validation_indicators_values = validation_results[:-1]
+
+           # Show clear button status based on validation
+           submit_status = "Ready to submit" if all_valid else "Please assign unique ranks to all documents"
+
+           return validation_indicators_values + [submit_status]
+
+       # Connect this validation to all ranking inputs
+       for ranking in ranking_inputs:
+           ranking.change(
+               fn=validate_all_inputs,
+               inputs=ranking_inputs,
+               outputs=validation_indicators + [status_box]
+           )
+
+       # Helper function for ranking - sort documents by rankings
+       def rank_by_relevance(*args):
+           """Sorts the documents by their current rankings for a clearer view."""
+           # Last argument is sample_id
+           sample_id = args[-1]
+           rankings = args[:-1]
+
+           # Check if we have valid rankings
+           valid_rankings = []
+           for i, r in enumerate(rankings):
+               if r is not None and r != "":
+                   try:
+                       valid_rankings.append((i, int(r)))
+                   except:
+                       pass
+
+           # If we don't have enough valid rankings, do nothing
+           if len(valid_rankings) < 2:
+               return [status_box.value]
+
+           # Sort by rank
+           valid_rankings.sort(key=lambda x: x[1])
+
+           # Generate message showing the ranking order
+           result = "<p><strong>Current ranking order:</strong></p><ol>"
+           for idx, _ in valid_rankings:
+               doc_text = doc_containers[idx].value
+               # Truncate if too long
+               if len(doc_text) > 100:
+                   doc_text = doc_text[:97] + "..."
+               result += f"<li>Doc {idx+1}: {doc_text}</li>"
+           result += "</ol>"
+
+           return [result]
+
+       # Add a "Show Current Ranking" button
+       with gr.Row():
+           show_ranking_btn = gr.Button("👁️ Show Current Ranking Order", variant="secondary")
+           ranking_display = gr.HTML("")
+
+       # Connect the show ranking button
+       show_ranking_btn.click(
+           fn=rank_by_relevance,
+           inputs=ranking_inputs + [current_sample_id],
+           outputs=[ranking_display]
+       )
+
+       # Add a ranking preview section that shows documents in their ranked order
+       def generate_ranking_preview(*rankings):
+           """Creates a visual preview of current rankings."""
+           # Create list of (index, rank) pairs for valid rankings
+           ranked_docs = []
+           for i, rank in enumerate(rankings):
+               if rank and rank.strip():
+                   try:
+                       ranked_docs.append((i, int(rank)))
+                   except:
+                       continue
+
+           # Sort by rank
+           ranked_docs.sort(key=lambda x: x[1])
+
+           # Generate HTML for the preview
+           if not ranked_docs:
+               return "<p><i>No rankings assigned yet. Assign ranks to see a preview.</i></p>"
+
+           html = "<div class='ranking-preview'>"
+           html += "<h3>Current Ranking Preview</h3>"
+           html += "<ol class='ranked-docs'>"
+
+           for doc_idx, rank in ranked_docs:
+               if doc_idx < len(doc_containers):
+                   doc_text = doc_containers[doc_idx].value
+                   # Truncate if too long
+                   if len(doc_text) > 100:
+                       doc_text = doc_text[:97] + "..."
+
+                   html += f"""
+                   <li class='ranked-doc'>
+                       <div class='rank-badge'>#{rank}</div>
+                       <div class='doc-index'>Document {doc_idx+1}</div>
+                       <div class='doc-content'>{doc_text}</div>
+                   </li>
+                   """
+
+           html += "</ol></div>"
+
+           # Add CSS for the preview
+           html += """
+           <style>
+           .ranking-preview {
+               margin-top: 20px;
+               padding: 10px;
+               border: 1px solid #e0e0e0;
+               border-radius: 5px;
+               background-color: #f8f9fa;
+           }
+
+           .ranked-docs {
+               list-style-type: none;
+               padding: 0;
+           }
+
+           .ranked-doc {
+               display: flex;
+               align-items: center;
+               padding: 10px;
+               margin-bottom: 5px;
+               border: 1px solid #ddd;
+               border-radius: 5px;
+               background-color: white;
+           }
+
+           .rank-badge {
+               display: flex;
+               align-items: center;
+               justify-content: center;
+               width: 40px;
+               height: 40px;
+               border-radius: 50%;
+               background-color: #2c7be5;
+               color: white;
+               font-weight: bold;
+               margin-right: 10px;
+           }
+
+           .doc-index {
+               font-weight: bold;
+               width: 120px;
+           }
+
+           .doc-content {
+               flex-grow: 1;
+               overflow: hidden;
+               text-overflow: ellipsis;
+           }
+           </style>
+           """
+
+           return html
+
+       # Add ranking preview
+       ranking_preview = gr.HTML("<p><i>No rankings assigned yet. Assign ranks to see a preview.</i></p>")
+
+       # Update the ranking preview whenever a ranking changes
+       for ranking in ranking_inputs:
+           ranking.change(
+               fn=generate_ranking_preview,
+               inputs=ranking_inputs,
+               outputs=[ranking_preview]
+           )
+
+       # Show preview section
+       with gr.Accordion("📊 Ranking Preview", open=True):
+           ranking_preview

    return demo

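The wiring above relies on the Gradio 3.x pattern of component.change(...) and button.click(...) with fn/inputs/outputs. A self-contained toy example of that pattern, assuming a Gradio 3.x install and unrelated to the app's own components:

    import gradio as gr

    with gr.Blocks() as toy:
        rank = gr.Dropdown(choices=["1", "2", "3"], label="Rank")
        status = gr.HTML()
        # Re-render the status HTML every time the dropdown value changes.
        rank.change(fn=lambda r: f"<b>Picked {r}</b>", inputs=rank, outputs=status)

    # toy.launch()  # uncomment to run locally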
        task_container = gr.HTML()
        loaded_task_info = gr.JSON(label="Loaded Task Information", visible=False)

+       # CSS for consistent styling throughout the app
+       gr.HTML("""
+       <style>
+       /* Main App Styling */
+       .tab-content {
+           padding: 15px !important;
+       }
+
+       .btn-primary {
+           background-color: #2c7be5 !important;
+       }
+
+       .btn-secondary {
+           background-color: #6c757d !important;
+       }
+
+       /* Status messages */
+       .status-message {
+           font-weight: bold !important;
+       }
+
+       /* Box styling */
+       .content-box {
+           border: 1px solid #e0e0e0;
+           border-radius: 5px;
+           padding: 15px;
+           margin-bottom: 15px;
+           background-color: #f8f9fa;
+       }
+
+       /* Section headers */
+       .section-header {
+           border-bottom: 2px solid #2c7be5;
+           padding-bottom: 5px;
+           margin-bottom: 15px;
+       }
+       </style>
+       """)
+
        tabs = gr.Tabs()

        with tabs:

                ## MTEB Human Evaluation Interface

                This interface allows you to evaluate the relevance of documents for reranking tasks.
+               """, elem_classes=["section-header"])

                # Function to get the most recent task file
                def get_latest_task_file():

                # Load the task file
                task_file = get_latest_task_file()

+               with gr.Box(elem_classes=["content-box"]):
+                   if task_file:
+                       try:
+                           with open(task_file, "r") as f:
+                               task_data = json.load(f)
+
+                           # Show which task is currently loaded
+                           gr.Markdown(f"**Current Task: {task_data['task_name']}** ({len(task_data['samples'])} samples)")
+
+                           # Display the interface
+                           demo = create_reranking_interface(task_data)
+                           task_container.update(value=f"<p>Task loaded: {task_file}</p>")
+                       except Exception as e:
+                           gr.Markdown(f"**Error loading task: {str(e)}**", elem_classes=["status-message"])
+                           gr.Markdown("Please upload a valid task file in the 'Upload & Evaluate' tab.")
+                   else:
+                       gr.Markdown("**No task file found**", elem_classes=["status-message"])
                        gr.Markdown("Please upload a valid task file in the 'Upload & Evaluate' tab.")

            with gr.TabItem("Upload & Evaluate"):
                gr.Markdown("""
                ## Upload Your Own Task File

                If you have a prepared task file, you can upload it here to create an evaluation interface.
+               """, elem_classes=["section-header"])

                with gr.Row():
                    with gr.Column(scale=1):
+                       with gr.Box(elem_classes=["content-box"]):
+                           file_input = gr.File(label="Upload a task file (JSON)")
+                           load_btn = gr.Button("Load Task", variant="primary")
+                           message = gr.Textbox(label="Status", interactive=False, elem_classes=["status-message"])

                        # Add task list for previously uploaded tasks
+                       with gr.Box(elem_classes=["content-box"]):
+                           gr.Markdown("### Previous Uploads", elem_classes=["section-header"])
+
+                           # Function to list existing task files in the tasks directory
+                           def list_task_files():
+                               os.makedirs("uploaded_tasks", exist_ok=True)
+                               tasks = [f for f in os.listdir("uploaded_tasks") if f.endswith(".json")]
+                               if not tasks:
+                                   return "No task files uploaded yet."
+                               return "\n".join([f"- {t}" for t in tasks])
+
+                           task_list = gr.Markdown(list_task_files())
+                           refresh_btn = gr.Button("Refresh List")

                        # Add results management section
+                       with gr.Box(elem_classes=["content-box"]):
+                           gr.Markdown("### Results Management", elem_classes=["section-header"])

+                           # Function to list existing result files
+                           def list_result_files():
+                               results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
+                               if not results:
+                                   return "No result files available yet."
+
+                               result_links = []
+                               for r in results:
+                                   # Calculate completion stats
+                                   try:
+                                       with open(r, "r") as f:
+                                           result_data = json.load(f)
+                                       annotation_count = len(result_data.get("annotations", []))
+                                       task_name = result_data.get("task_name", "Unknown")
+                                       result_links.append(f"- {r} ({annotation_count} annotations for {task_name})")
+                                   except:
+                                       result_links.append(f"- {r}")
+
+                               return "\n".join(result_links)

+                           results_list = gr.Markdown(list_result_files())
+                           download_results_btn = gr.Button("Download Results")

                # Handle file upload and storage
                def handle_upload(file):

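In the code shown here, handle_upload and create_reranking_interface read task_name, instructions, and for each sample its id, query and candidates from the uploaded JSON. A hypothetical minimal task file with made-up values, for illustration only:

    example_task = {
        "task_name": "demo_rerank",  # hypothetical name
        "instructions": "Rank documents by their relevance to the query.",
        "samples": [
            {
                "id": "q1",                    # made-up sample id
                "query": "What is MTEB?",      # made-up query
                "candidates": ["Doc A ...", "Doc B ...", "Doc C ..."],
            }
        ],
    }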
                        with open(task_filename, "w") as f:
                            json.dump(task_data, f, indent=2)

+                       return f"✅ Task '{task_data['task_name']}' uploaded successfully with {len(task_data['samples'])} samples. Please refresh the app and use the Demo tab to evaluate it.", list_task_files(), f"""
+                       <div class="content-box">
                        <h3>Task uploaded successfully!</h3>
                        <p>Task Name: {task_data['task_name']}</p>
                        <p>Samples: {len(task_data['samples'])}</p>

                        </div>
                        """
                    except Exception as e:
+                       return f"⚠️ Error processing task file: {str(e)}", task_list.value, ""

                # Function to prepare results for download
                def prepare_results_for_download():

                ## Manage Evaluation Results

                View, download, and analyze your evaluation results.
+               """, elem_classes=["section-header"])

                # Function to load and display result stats
                def get_result_stats():


                    return "\n\n".join(stats)

+               with gr.Box(elem_classes=["content-box"]):
+                   result_stats = gr.Markdown(get_result_stats())
+                   refresh_results_btn = gr.Button("Refresh Results", variant="secondary")

                # Add download options
+               with gr.Box(elem_classes=["content-box"]):
+                   gr.Markdown("### Download Options", elem_classes=["section-header"])
+                   with gr.Row():
+                       download_all_btn = gr.Button("Download All Results (ZIP)", variant="primary")
+                       result_select = gr.Dropdown(choices=[f for f in os.listdir(".") if f.endswith("_human_results.json")], label="Select Result to Download")
+                       download_selected_btn = gr.Button("Download Selected", variant="secondary")

                # Function to prepare all results for download as ZIP
                def prepare_all_results():