Update app.py

app.py CHANGED
@@ -97,7 +97,7 @@ def create_reranking_interface(task_data):
     """.format(instructions=task_data.get("instructions", "Rank documents by their relevance to the query.")))
 
     current_sample_id = gr.State(value=samples[0]["id"])
-
+    auto_save_enabled = gr.State(value=True)
 
     with gr.Row():
         progress_text = gr.Textbox(label="Progress", value=f"Progress: 0/{len(samples)}", interactive=False)
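Note on the `gr.State` added above: it holds a per-session value that event handlers receive as an input and overwrite by returning a new value. A minimal standalone sketch of that round trip (the `Checkbox` stands in for the app's `auto_save_toggle`; names are illustrative):

```python
import gradio as gr

with gr.Blocks() as demo:
    # Per-session value, as with auto_save_enabled in the hunk above.
    auto_save_enabled = gr.State(value=True)
    toggle = gr.Checkbox(label="Auto-save", value=True)

    def update_auto_save(enabled):
        # The returned value replaces the State's contents.
        return enabled

    toggle.change(fn=update_auto_save, inputs=[toggle], outputs=[auto_save_enabled])

if __name__ == "__main__":
    demo.launch()
```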
@@ -135,13 +135,11 @@ def create_reranking_interface(task_data):
                 doc_containers.append(doc_box)
 
             with gr.Column(scale=1):
-                # Use Radio
-
-                rank_input = gr.Radio(
+                # Use Dropdown instead of Radio for compatibility with Gradio 3.x
+                rank_input = gr.Dropdown(
                     choices=[str(j) for j in range(1, len(samples[0]["candidates"])+1)],
                     label=f"Rank",
-                    value="",
-                    interactive=True
+                    value=""
                 )
                 ranking_inputs.append(rank_input)
 
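The swap from `gr.Radio` to `gr.Dropdown` keeps the same string choices `"1"`..`"n"`, one selector per candidate document. A standalone sketch of the resulting pattern, assuming a fixed candidate count for illustration (the app derives it from `samples[0]["candidates"]`):

```python
import gradio as gr

n_candidates = 5  # illustrative; the app uses len(samples[0]["candidates"])

with gr.Blocks() as demo:
    ranking_inputs = []
    for i in range(n_candidates):
        # One rank selector per document, offering "1".."n" as strings.
        rank_input = gr.Dropdown(
            choices=[str(j) for j in range(1, n_candidates + 1)],
            label=f"Rank for document {i + 1}",
            value="",
        )
        ranking_inputs.append(rank_input)

if __name__ == "__main__":
    demo.launch()
```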
@@ -170,13 +168,13 @@ def create_reranking_interface(task_data):
             else:
                 results.append("✓")
 
-        return results
+        return results + [all_valid]  # Return validation indicators and validity flag
 
     def load_sample(sample_id):
         """Load a specific sample into the interface."""
         sample = next((s for s in samples if s["id"] == sample_id), None)
         if not sample:
-            return [query_text.value] + [d.value for d in doc_containers] + [""] * len(ranking_inputs) + validation_indicators + [sample_id, progress_text.value, status_box.value]
+            return [query_text.value] + [d.value for d in doc_containers] + [""] * len(ranking_inputs) + [""] * len(validation_indicators) + [sample_id, progress_text.value, status_box.value]
 
         # Update query
         new_query = sample["query"]
@@ -207,9 +205,10 @@ def create_reranking_interface(task_data):
             new_status += " (already completed)"
 
         # Initialize validation indicators
-        validation_results
+        validation_results = validate_rankings(*new_rankings)
+        validation_indicators_values = validation_results[:-1]  # Remove validity flag
 
-        return [new_query] + new_docs + new_rankings +
+        return [new_query] + new_docs + new_rankings + validation_indicators_values + [sample_id, new_progress, new_status]
 
     def auto_save_and_navigate(direction, current_id, auto_save, *rankings):
         """Save rankings if auto-save is enabled, then navigate."""
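This hunk and several below assume `validate_rankings` returns one indicator per document plus a trailing boolean, which callers split with `[:-1]` and `[-1]`. The function itself is outside the diff; a sketch honoring that contract (the missing/duplicate checks and wordings are assumptions, only the `"✓"` indicator and the return shape are visible in the diff):

```python
def validate_rankings(*rankings):
    """Return one indicator string per ranking, plus a trailing validity flag."""
    results = []
    seen = set()
    for r in rankings:
        if not r:
            results.append("❌ Missing")    # assumed wording
        elif r in seen:
            results.append("❌ Duplicate")  # assumed wording
        else:
            results.append("✓")
        seen.add(r)
    all_valid = all(x == "✓" for x in results)
    return results + [all_valid]  # callers split with [:-1] / [-1]
```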
@@ -222,7 +221,8 @@ def create_reranking_interface(task_data):
 
         if auto_save:
             # Only save if all rankings are provided
-            validation_results
+            validation_results = validate_rankings(*actual_rankings)
+            all_valid = validation_results[-1]  # Last item is validity flag
             if all_valid:
                 status_msg, progress_msg = save_ranking(actual_rankings, current_id)
 
@@ -265,7 +265,6 @@ def create_reranking_interface(task_data):
         try:
             with open(output_path, "w") as f:
                 json.dump(results, f, indent=2)
-            current_state.value["last_saved"] = time.time()
             return f"✅ Results saved to {output_path} ({len(results['annotations'])} annotations)"
         except Exception as e:
             return f"Error saving results: {str(e)}"
@@ -281,7 +280,7 @@ def create_reranking_interface(task_data):
 
     # Function to clear all rankings
     def clear_rankings():
-        return [""
+        return [""] * len(samples[0]["candidates"])
 
     # Define a function that collects all ranking values and validates them
     def submit_rankings(*args):
@@ -297,11 +296,14 @@ def create_reranking_interface(task_data):
         rankings = args[:len(ranking_inputs)]
 
         # First validate the rankings
-        validation_results
+        validation_results = validate_rankings(*rankings)
+        all_valid = validation_results[-1]  # Last item is validity flag
+        validation_indicators_values = validation_results[:-1]  # Remove validity flag
 
         # Update validation indicators
-        for i, result in enumerate(
-            …
+        for i, result in enumerate(validation_indicators_values):
+            if i < len(validation_indicators):
+                validation_indicators[i].update(value=result)
 
         # If not all valid, return error message
         if not all_valid:
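One caveat worth noting on the loop above: in Gradio 3.x, `update()` is usually a value returned from a callback and routed through `outputs`, rather than a method called on a component for side effects. A minimal sketch of the return-based pattern (names are illustrative):

```python
import gradio as gr

with gr.Blocks() as demo:
    indicator = gr.Textbox(label="Validation")
    check_btn = gr.Button("Validate")

    def on_validate():
        # Returning the update (or a plain value) is what changes the UI.
        return gr.Textbox.update(value="✓")

    check_btn.click(fn=on_validate, inputs=None, outputs=[indicator])

if __name__ == "__main__":
    demo.launch()
```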
@@ -311,347 +313,394 @@ def create_reranking_interface(task_data):
         status, progress = save_ranking(rankings, sample_id)
         return status, progress
 
-    #
+    # Wire up events (Gradio 3.x syntax)
     submit_btn.click(
-        submit_rankings,
+        fn=submit_rankings,
         inputs=ranking_inputs + [current_sample_id],
         outputs=[status_box, progress_text]
     )
 
-    #
+    # Auto-save and navigate events
+    def handle_next(current_id, auto_save, *rankings):
+        # First, handle auto-save
+        new_id, status, progress = auto_save_and_navigate("next", current_id, auto_save, *rankings)
+        # Then, load the new sample
+        outputs = load_sample(new_id)
+        # Add the status and progress
+        outputs[-2] = progress if status else outputs[-2]
+        outputs[-1] = status if status else outputs[-1]
+        return outputs
+
+    def handle_prev(current_id, auto_save, *rankings):
+        # First, handle auto-save
+        new_id, status, progress = auto_save_and_navigate("prev", current_id, auto_save, *rankings)
+        # Then, load the new sample
+        outputs = load_sample(new_id)
+        # Add the status and progress
+        outputs[-2] = progress if status else outputs[-2]
+        outputs[-1] = status if status else outputs[-1]
+        return outputs
+
+    # Connect navigation with Gradio 3.x syntax
     next_btn.click(
-        …
-        inputs=[
-        outputs=[current_sample_id, status_box, progress_text]
-    ).then(
-        load_sample,
-        inputs=[current_sample_id],
+        fn=handle_next,
+        inputs=[current_sample_id, auto_save_toggle] + ranking_inputs,
         outputs=[query_text] + doc_containers + ranking_inputs + validation_indicators + [current_sample_id, progress_text, status_box]
     )
 
     prev_btn.click(
-        …
-        inputs=[
-        outputs=[current_sample_id, status_box, progress_text]
-    ).then(
-        load_sample,
-        inputs=[current_sample_id],
+        fn=handle_prev,
+        inputs=[current_sample_id, auto_save_toggle] + ranking_inputs,
         outputs=[query_text] + doc_containers + ranking_inputs + validation_indicators + [current_sample_id, progress_text, status_box]
     )
 
     # Connect quick ranking buttons
     sequential_btn.click(
-        assign_sequential_ranks,
+        fn=assign_sequential_ranks,
+        inputs=None,
         outputs=ranking_inputs
     )
 
     reverse_btn.click(
-        assign_reverse_ranks,
+        fn=assign_reverse_ranks,
+        inputs=None,
         outputs=ranking_inputs
     )
 
     clear_btn.click(
-        clear_rankings,
+        fn=clear_rankings,
+        inputs=None,
         outputs=ranking_inputs
    )
 
     # Connect save button
-    save_btn.click(
-        …
-        validate_rankings,
-        inputs=ranking_inputs,
-        outputs=validation_indicators + [gr.State(value=None)]  # Add dummy output to match function return
-    )
+    save_btn.click(
+        fn=save_results,
+        inputs=None,
+        outputs=[status_box]
+    )
 
-    #
+    # Connect auto-save toggle
+    def update_auto_save(enabled):
+        return enabled
+
     auto_save_toggle.change(
-        …
+        fn=update_auto_save,
         inputs=[auto_save_toggle],
-        outputs=[
+        outputs=[auto_save_enabled]
     )
 
     return demo
 
 # Main app with file upload capability and improved task management
-…
-gr.
-
-with gr.Tabs():
-    with gr.TabItem("Demo"):
-        gr.Markdown("""
-        ## MTEB Human Evaluation Interface
-
-        This interface allows you to evaluate the relevance of documents for reranking tasks.
-        """)
-
-        # Function to get the most recent task file
-        def get_latest_task_file():
-            # Check first in uploaded_tasks directory
-            os.makedirs("uploaded_tasks", exist_ok=True)
-            uploaded_tasks = [f for f in os.listdir("uploaded_tasks") if f.endswith(".json")]
-
-            if uploaded_tasks:
-                # Sort by modification time, newest first
-                uploaded_tasks.sort(key=lambda x: os.path.getmtime(os.path.join("uploaded_tasks", x)), reverse=True)
-                task_path = os.path.join("uploaded_tasks", uploaded_tasks[0])
-
-                # Verify this is a valid task file
-                try:
-                    with open(task_path, "r") as f:
-                        task_data = json.load(f)
-                    if "task_name" in task_data and "samples" in task_data:
-                        return task_path
-                except:
-                    pass
-
-            # Look for task files in the current directory
-            current_dir_tasks = [f for f in os.listdir(".") if f.endswith("_human_eval.json")]
-            if current_dir_tasks:
-                # Sort by modification time, newest first
-                current_dir_tasks.sort(key=lambda x: os.path.getmtime(x), reverse=True)
-                return current_dir_tasks[0]
-
-            # Fall back to fixed example if available
-            if os.path.exists("AskUbuntuDupQuestions_human_eval.json"):
-                return "AskUbuntuDupQuestions_human_eval.json"
-
-            # No valid task file found
-            return None
-
-        # Load the task file
-        task_file = get_latest_task_file()
-
-        if task_file:
-            try:
-                with open(task_file, "r") as f:
-                    task_data = json.load(f)
-
-                # Show which task is currently loaded
-                gr.Markdown(f"**Current Task: {task_data['task_name']}** ({len(task_data['samples'])} samples)")
-
-                # Display the interface
-                reranking_demo = create_reranking_interface(task_data)
-            except Exception as e:
-                gr.Markdown(f"**Error loading task: {str(e)}**")
-                gr.Markdown("Please upload a valid task file in the 'Upload & Evaluate' tab.")
-        else:
-            gr.Markdown("**No task file found**")
-            gr.Markdown("Please upload a valid task file in the 'Upload & Evaluate' tab.")
 
-…
-        # Add task list for previously uploaded tasks
-        gr.Markdown("### Previous Uploads")
-
-        # Function to list existing task files in the tasks directory
-        def list_task_files():
-            os.makedirs("uploaded_tasks", exist_ok=True)
-            tasks = [f for f in os.listdir("uploaded_tasks") if f.endswith(".json")]
-            if not tasks:
-                return "No task files uploaded yet."
-            return "\n".join([f"- {t}" for t in tasks])
-
-        task_list = gr.Markdown(list_task_files())
-        refresh_btn = gr.Button("Refresh List")
-
-        # Add results management section
-        gr.Markdown("### Results Management")
-
-        # Function to list existing result files
-        def list_result_files():
-            results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
-            if not results:
-                return "No result files available yet."
-
-            result_links = []
-            for r in results:
-                # Calculate completion stats
-                try:
-                    with open(r, "r") as f:
-                        result_data = json.load(f)
-                    annotation_count = len(result_data.get("annotations", []))
-                    task_name = result_data.get("task_name", "Unknown")
-                    result_links.append(f"- {r} ({annotation_count} annotations for {task_name})")
-                except:
-                    result_links.append(f"- {r}")
-
-            return "\n".join(result_links)
-
-        results_list = gr.Markdown(list_result_files())
-        download_results_btn = gr.Button("Download Results")
-
-        # Right side - will contain the actual interface
-        with gr.Column(scale=2):
-            task_container = gr.HTML()
-            loaded_task_info = gr.JSON(label="Loaded Task Information", visible=False)
-
-        # Handle file upload and storage
-        def handle_upload(file):
-            if not file:
-                return "Please upload a task file", task_list.value, task_container.value, loaded_task_info.value
 
-…
+def create_main_app():
+    with gr.Blocks(theme=gr.themes.Soft()) as app:
+        gr.Markdown("# MTEB Human Evaluation Demo")
+
+        task_container = gr.HTML()
+        loaded_task_info = gr.JSON(label="Loaded Task Information", visible=False)
+
+        tabs = gr.Tabs()
+
+        with tabs:
+            with gr.TabItem("Demo"):
+                gr.Markdown("""
+                ## MTEB Human Evaluation Interface
+
+                This interface allows you to evaluate the relevance of documents for reranking tasks.
+                """)
+
+                # Function to get the most recent task file
+                def get_latest_task_file():
+                    # Check first in uploaded_tasks directory
                     os.makedirs("uploaded_tasks", exist_ok=True)
-…
-                "samples": len(task_data["samples"]),
-                "file_path": task_filename
-            }
-
-…
-            <div style="padding: 20px; background-color: #f0f0f0; border-radius: 10px;">
-                <h3>Task uploaded successfully!</h3>
-                <p>Task Name: {task_data['task_name']}</p>
-                <p>Samples: {len(task_data['samples'])}</p>
-                <p>To evaluate this task:</p>
-                <ol>
-                    <li>Refresh the app</li>
-                    <li>The Demo tab will now use your uploaded task</li>
-                    <li>Complete your evaluations</li>
-                    <li>Results will be saved as {task_data['task_name']}_human_results.json</li>
-                </ol>
-            </div>
-            """, task_info
-        except Exception as e:
-            return f"Error processing task file: {str(e)}", task_list.value, task_container.value, loaded_task_info.value
-
-        # Function to prepare results for download
-        def prepare_results_for_download():
-            results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
-            if not results:
+                    uploaded_tasks = [f for f in os.listdir("uploaded_tasks") if f.endswith(".json")]
+
+                    if uploaded_tasks:
+                        # Sort by modification time, newest first
+                        uploaded_tasks.sort(key=lambda x: os.path.getmtime(os.path.join("uploaded_tasks", x)), reverse=True)
+                        task_path = os.path.join("uploaded_tasks", uploaded_tasks[0])
+
+                        # Verify this is a valid task file
+                        try:
+                            with open(task_path, "r") as f:
+                                task_data = json.load(f)
+                            if "task_name" in task_data and "samples" in task_data:
+                                return task_path
+                        except:
+                            pass
+
+                    # Look for task files in the current directory
+                    current_dir_tasks = [f for f in os.listdir(".") if f.endswith("_human_eval.json")]
+                    if current_dir_tasks:
+                        # Sort by modification time, newest first
+                        current_dir_tasks.sort(key=lambda x: os.path.getmtime(x), reverse=True)
+                        return current_dir_tasks[0]
+
+                    # Fall back to fixed example if available
+                    if os.path.exists("AskUbuntuDupQuestions_human_eval.json"):
+                        return "AskUbuntuDupQuestions_human_eval.json"
+
+                    # No valid task file found
                     return None
 
-…
-            zip_path = "mteb_human_eval_results.zip"
-            with zipfile.ZipFile(zip_path, 'w') as zipf:
-                for r in results:
-                    zipf.write(r)
-…
+                # Load the task file
+                task_file = get_latest_task_file()
+
+                if task_file:
+                    try:
+                        with open(task_file, "r") as f:
+                            task_data = json.load(f)
+
+                        # Show which task is currently loaded
+                        gr.Markdown(f"**Current Task: {task_data['task_name']}** ({len(task_data['samples'])} samples)")
+
+                        # Display the interface
+                        demo = create_reranking_interface(task_data)
+                        task_container.update(value=f"<p>Task loaded: {task_file}</p>")
+                    except Exception as e:
+                        gr.Markdown(f"**Error loading task: {str(e)}**")
+                        gr.Markdown("Please upload a valid task file in the 'Upload & Evaluate' tab.")
+                else:
+                    gr.Markdown("**No task file found**")
+                    gr.Markdown("Please upload a valid task file in the 'Upload & Evaluate' tab.")
+
+            with gr.TabItem("Upload & Evaluate"):
+                gr.Markdown("""
+                ## Upload Your Own Task File
+
+                If you have a prepared task file, you can upload it here to create an evaluation interface.
+                """)
+
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        file_input = gr.File(label="Upload a task file (JSON)")
+                        load_btn = gr.Button("Load Task")
+                        message = gr.Textbox(label="Status", interactive=False)
+
+                        # Add task list for previously uploaded tasks
+                        gr.Markdown("### Previous Uploads")
+
+                        # Function to list existing task files in the tasks directory
+                        def list_task_files():
+                            os.makedirs("uploaded_tasks", exist_ok=True)
+                            tasks = [f for f in os.listdir("uploaded_tasks") if f.endswith(".json")]
+                            if not tasks:
+                                return "No task files uploaded yet."
+                            return "\n".join([f"- {t}" for t in tasks])
+
+                        task_list = gr.Markdown(list_task_files())
+                        refresh_btn = gr.Button("Refresh List")
+
+                        # Add results management section
+                        gr.Markdown("### Results Management")
+
+                        # Function to list existing result files
+                        def list_result_files():
+                            results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
+                            if not results:
+                                return "No result files available yet."
+
+                            result_links = []
+                            for r in results:
+                                # Calculate completion stats
+                                try:
+                                    with open(r, "r") as f:
+                                        result_data = json.load(f)
+                                    annotation_count = len(result_data.get("annotations", []))
+                                    task_name = result_data.get("task_name", "Unknown")
+                                    result_links.append(f"- {r} ({annotation_count} annotations for {task_name})")
+                                except:
+                                    result_links.append(f"- {r}")
+
+                            return "\n".join(result_links)
+
+                        results_list = gr.Markdown(list_result_files())
+                        download_results_btn = gr.Button("Download Results")
+
+                # Handle file upload and storage
+                def handle_upload(file):
+                    if not file:
+                        return "Please upload a task file", task_list.value, ""
+
                     try:
-…
+                        # Create directory if it doesn't exist
+                        os.makedirs("uploaded_tasks", exist_ok=True)
+
+                        # Read the uploaded file
+                        with open(file.name, "r") as f:
+                            task_data = json.load(f)
+
+                        # Validate task format
+                        if "task_name" not in task_data or "samples" not in task_data:
+                            return "Invalid task file format. Must contain 'task_name' and 'samples' fields.", task_list.value, ""
+
+                        # Save to a consistent location
+                        task_filename = f"uploaded_tasks/{task_data['task_name']}_task.json"
+                        with open(task_filename, "w") as f:
+                            json.dump(task_data, f, indent=2)
+
+                        return f"Task '{task_data['task_name']}' uploaded successfully with {len(task_data['samples'])} samples. Please refresh the app and use the Demo tab to evaluate it.", list_task_files(), f"""
+                        <div style="padding: 20px; background-color: #f0f0f0; border-radius: 10px;">
+                            <h3>Task uploaded successfully!</h3>
+                            <p>Task Name: {task_data['task_name']}</p>
+                            <p>Samples: {len(task_data['samples'])}</p>
+                            <p>To evaluate this task:</p>
+                            <ol>
+                                <li>Refresh the app</li>
+                                <li>The Demo tab will now use your uploaded task</li>
+                                <li>Complete your evaluations</li>
+                                <li>Results will be saved as {task_data['task_name']}_human_results.json</li>
+                            </ol>
+                        </div>
+                        """
+                    except Exception as e:
+                        return f"Error processing task file: {str(e)}", task_list.value, ""
+
+                # Function to prepare results for download
+                def prepare_results_for_download():
+                    results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
+                    if not results:
+                        return None
+
+                    # Create a zip file with all results
+                    import zipfile
+                    zip_path = "mteb_human_eval_results.zip"
+                    with zipfile.ZipFile(zip_path, 'w') as zipf:
+                        for r in results:
+                            zipf.write(r)
+
+                    return zip_path
+
+                # Connect events
+                load_btn.click(
+                    fn=handle_upload,
+                    inputs=[file_input],
+                    outputs=[message, task_list, task_container]
+                )
+
+                refresh_btn.click(
+                    fn=list_task_files,
+                    inputs=None,
+                    outputs=[task_list]
+                )
+
+                download_results_btn.click(
+                    fn=prepare_results_for_download,
+                    inputs=None,
+                    outputs=[gr.File(label="Download Results")]
+                )
+
+            with gr.TabItem("Results Management"):
+                gr.Markdown("""
+                ## Manage Evaluation Results
+
+                View, download, and analyze your evaluation results.
+                """)
+
+                # Function to load and display result stats
+                def get_result_stats():
+                    results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
+                    if not results:
+                        return "No result files available yet."
+
+                    stats = []
+                    for r in results:
+                        try:
+                            with open(r, "r") as f:
+                                result_data = json.load(f)
+
+                            task_name = result_data.get("task_name", "Unknown")
+                            annotations = result_data.get("annotations", [])
+                            annotation_count = len(annotations)
+
+                            # Calculate completion percentage
+                            sample_ids = set(a.get("sample_id") for a in annotations)
+
+                            # Try to get the total sample count from the corresponding task file
+                            total_samples = 0
+
+                            # Try uploaded_tasks directory first
+                            task_file = f"uploaded_tasks/{task_name}_task.json"
                             if os.path.exists(task_file):
                                 with open(task_file, "r") as f:
                                     task_data = json.load(f)
                                 total_samples = len(task_data.get("samples", []))
-…
-        def get_selected_result(filename):
-            if not filename:
+                            else:
+                                # Try human_eval file in current directory
+                                task_file = f"{task_name}_human_eval.json"
+                                if os.path.exists(task_file):
+                                    with open(task_file, "r") as f:
+                                        task_data = json.load(f)
+                                    total_samples = len(task_data.get("samples", []))
+
+                            completion = f"{len(sample_ids)}/{total_samples}" if total_samples else f"{len(sample_ids)} samples"
+
+                            stats.append(f"### {task_name}\n- Annotations: {annotation_count}\n- Completion: {completion}\n- File: {r}")
+                        except Exception as e:
+                            stats.append(f"### {r}\n- Error loading results: {str(e)}")
+
+                    return "\n\n".join(stats)
+
+                result_stats = gr.Markdown(get_result_stats())
+                refresh_results_btn = gr.Button("Refresh Results")
+
+                # Add download options
+                with gr.Row():
+                    download_all_btn = gr.Button("Download All Results (ZIP)")
+                    result_select = gr.Dropdown(choices=[f for f in os.listdir(".") if f.endswith("_human_results.json")], label="Select Result to Download")
+                    download_selected_btn = gr.Button("Download Selected")
+
+                # Function to prepare all results for download as ZIP
+                def prepare_all_results():
+                    import zipfile
+                    zip_path = "mteb_human_eval_results.zip"
+                    with zipfile.ZipFile(zip_path, 'w') as zipf:
+                        for r in [f for f in os.listdir(".") if f.endswith("_human_results.json")]:
+                            zipf.write(r)
+                    return zip_path
+
+                # Function to return a single result file
+                def get_selected_result(filename):
+                    if not filename:
+                        return None
+                    if os.path.exists(filename):
+                        return filename
                     return None
-…
+
+                # Update dropdown when refreshing results
+                def update_result_dropdown():
+                    return gr.Dropdown.update(choices=[f for f in os.listdir(".") if f.endswith("_human_results.json")])
+
+                # Connect events
+                refresh_results_btn.click(
+                    fn=get_result_stats,
+                    inputs=None,
+                    outputs=[result_stats]
+                )
+
+                refresh_results_btn.click(
+                    fn=update_result_dropdown,
+                    inputs=None,
+                    outputs=[result_select]
+                )
+
+                download_all_btn.click(
+                    fn=prepare_all_results,
+                    inputs=None,
+                    outputs=[gr.File(label="Download All Results")]
+                )
+
+                download_selected_btn.click(
+                    fn=get_selected_result,
+                    inputs=[result_select],
+                    outputs=[gr.File(label="Download Selected Result")]
+                )
+
+    return app
+
+# Create the app
+demo = create_main_app()
 
 if __name__ == "__main__":
     demo.launch()
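The download buttons added at the end of this commit return a file path from the callback into a `gr.File` output. A standalone sketch of that wiring with a pre-declared output component (file and component names are illustrative):

```python
import os
import zipfile

import gradio as gr

def bundle_results():
    """Zip all *_human_results.json files in the working directory."""
    results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
    if not results:
        return None  # gr.File shows nothing when the callback returns None
    zip_path = "mteb_human_eval_results.zip"
    with zipfile.ZipFile(zip_path, "w") as zipf:
        for r in results:
            zipf.write(r)
    return zip_path

with gr.Blocks() as demo:
    download_all_btn = gr.Button("Download All Results (ZIP)")
    download_file = gr.File(label="Download All Results")
    download_all_btn.click(fn=bundle_results, inputs=None, outputs=[download_file])

if __name__ == "__main__":
    demo.launch()
```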