AdnanElAssadi committed
Commit 06f87ee · verified · 1 Parent(s): 84410cb

Update app.py

Files changed (1):
  1. app.py +450 -256
app.py CHANGED
@@ -1,256 +1,450 @@
- import gradio as gr
- import json
- import os
- from pathlib import Path
-
- def create_reranking_interface(task_data):
-     """Create a Gradio interface for reranking evaluation."""
-     samples = task_data["samples"]
-     results = {"task_name": task_data["task_name"], "task_type": "reranking", "annotations": []}
-     completed_samples = {s["id"]: False for s in samples}
-
-     def save_ranking(rankings, sample_id):
-         """Save the current set of rankings."""
-         # Check if all documents have rankings
-         all_ranked = all(r is not None and r != "" for r in rankings)
-         if not all_ranked:
-             return "⚠️ Please assign a rank to all documents before submitting", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
-
-         # Convert rankings to integers
-         processed_rankings = [int(r) for r in rankings]
-
-         # Check for duplicate rankings
-         if len(set(processed_rankings)) != len(processed_rankings):
-             return "⚠️ Each document must have a unique rank. Please review your rankings.", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
-
-         # Store this annotation
-         existing_idx = next((i for i, a in enumerate(results["annotations"]) if a["sample_id"] == sample_id), None)
-         if existing_idx is not None:
-             results["annotations"][existing_idx] = {
-                 "sample_id": sample_id,
-                 "rankings": processed_rankings
-             }
-         else:
-             results["annotations"].append({
-                 "sample_id": sample_id,
-                 "rankings": processed_rankings
-             })
-
-         completed_samples[sample_id] = True
-         success_msg = f"✅ Rankings for query '{sample_id}' successfully saved!"
-         progress = f"Progress: {sum(completed_samples.values())}/{len(samples)}"
-
-         # Auto-save results after each submission
-         output_path = f"{task_data['task_name']}_human_results.json"
-         with open(output_path, "w") as f:
-             json.dump(results, f, indent=2)
-
-         return success_msg, progress
-
-     with gr.Blocks(theme=gr.themes.Soft()) as demo:
-         gr.Markdown(f"# {task_data['task_name']} - Human Reranking Evaluation")
-
-         with gr.Accordion("Instructions", open=True):
-             gr.Markdown("""
-             ## Task Instructions
-
-             {instructions}
-
-             ### How to use this interface:
-             1. Read the query at the top
-             2. Review each document carefully
-             3. Assign a rank to each document (1 = most relevant, higher numbers = less relevant)
-             4. Each document must have a unique rank
-             5. Click "Submit Rankings" when you're done with the current query
-             6. Use "Previous" and "Next" to navigate between queries
-             7. Click "Save All Results" periodically to ensure your work is saved
-             """.format(instructions=task_data["instructions"]))
-
-         current_sample_id = gr.State(value=samples[0]["id"])
-
-         with gr.Row():
-             progress_text = gr.Textbox(label="Progress", value=f"Progress: 0/{len(samples)}", interactive=False)
-             status_box = gr.Textbox(label="Status", value="Ready to start evaluation", interactive=False)
-
-         with gr.Group():
-             gr.Markdown("## Query:")
-             query_text = gr.Textbox(value=samples[0]["query"], label="", interactive=False)
-
-         gr.Markdown("## Documents to Rank:")
-
-         # Create document displays and ranking dropdowns in synchronized pairs
-         doc_containers = []
-         ranking_dropdowns = []
-
-         with gr.Column():
-             for i, doc in enumerate(samples[0]["candidates"]):
-                 with gr.Row():
-                     doc_box = gr.Textbox(
-                         value=doc,
-                         label=f"Document {i+1}",
-                         interactive=False
-                     )
-                     dropdown = gr.Dropdown(
-                         choices=[str(j) for j in range(1, len(samples[0]["candidates"])+1)],
-                         label=f"Rank",
-                         value=""
-                     )
-                     doc_containers.append(doc_box)
-                     ranking_dropdowns.append(dropdown)
-
-         with gr.Row():
-             prev_btn = gr.Button("← Previous Query", size="sm")
-             submit_btn = gr.Button("Submit Rankings", size="lg", variant="primary")
-             next_btn = gr.Button("Next Query →", size="sm")
-
-         save_btn = gr.Button("💾 Save All Results", variant="secondary")
-
-         def load_sample(sample_id):
-             """Load a specific sample into the interface."""
-             sample = next((s for s in samples if s["id"] == sample_id), None)
-             if not sample:
-                 return [query_text.value] + [d.value for d in doc_containers] + [""] * len(ranking_dropdowns) + [current_sample_id.value, progress_text.value, status_box.value]
-
-             # Update query
-             new_query = sample["query"]
-
-             # Update documents
-             new_docs = []
-             for i, doc in enumerate(sample["candidates"]):
-                 if i < len(doc_containers):
-                     new_docs.append(doc)
-
-             # Initialize rankings
-             new_rankings = [""] * len(ranking_dropdowns)
-
-             # Check if this sample has already been annotated
-             existing_annotation = next((a for a in results["annotations"] if a["sample_id"] == sample_id), None)
-             if existing_annotation:
-                 # Restore previous rankings
-                 for i, rank in enumerate(existing_annotation["rankings"]):
-                     if i < len(new_rankings) and rank is not None:
-                         new_rankings[i] = str(rank)
-
-             # Update progress
-             current_idx = samples.index(sample)
-             new_progress = f"Progress: {sum(completed_samples.values())}/{len(samples)}"
-
-             new_status = f"Viewing query {current_idx + 1} of {len(samples)}"
-             if completed_samples[sample_id]:
-                 new_status += " (already completed)"
-
-             return [new_query] + new_docs + new_rankings + [sample["id"], new_progress, new_status]
-
-         def next_sample(current_id):
-             """Load the next sample."""
-             current_sample = next((s for s in samples if s["id"] == current_id), None)
-             if not current_sample:
-                 return current_id
-
-             current_idx = samples.index(current_sample)
-             if current_idx < len(samples) - 1:
-                 next_sample = samples[current_idx + 1]
-                 return next_sample["id"]
-             return current_id
-
-         def prev_sample(current_id):
-             """Load the previous sample."""
-             current_sample = next((s for s in samples if s["id"] == current_id), None)
-             if not current_sample:
-                 return current_id
-
-             current_idx = samples.index(current_sample)
-             if current_idx > 0:
-                 prev_sample = samples[current_idx - 1]
-                 return prev_sample["id"]
-             return current_id
-
-         def save_results():
-             """Save all collected results to a file."""
-             output_path = f"{task_data['task_name']}_human_results.json"
-             with open(output_path, "w") as f:
-                 json.dump(results, f, indent=2)
-             return f"✅ Results saved to {output_path} ({len(results['annotations'])} annotations)"
-
-         # Connect events
-         submit_btn.click(
-             save_ranking,
-             inputs=ranking_dropdowns + [current_sample_id],
-             outputs=[status_box, progress_text]
-         )
-
-         next_btn.click(
-             next_sample,
-             inputs=[current_sample_id],
-             outputs=[current_sample_id]
-         ).then(
-             load_sample,
-             inputs=[current_sample_id],
-             outputs=[query_text] + doc_containers + ranking_dropdowns + [current_sample_id, progress_text, status_box]
-         )
-
-         prev_btn.click(
-             prev_sample,
-             inputs=[current_sample_id],
-             outputs=[current_sample_id]
-         ).then(
-             load_sample,
-             inputs=[current_sample_id],
-             outputs=[query_text] + doc_containers + ranking_dropdowns + [current_sample_id, progress_text, status_box]
-         )
-
-         save_btn.click(save_results, outputs=[status_box])
-
-     return demo
-
- # Main app with file upload capability
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
-     gr.Markdown("# MTEB Human Evaluation Demo")
-
-     with gr.Tabs():
-         with gr.TabItem("Demo"):
-             gr.Markdown("""
-             ## Try the MTEB Human Evaluation Interface
-
-             This is a demonstration of the human evaluation interface for MTEB reranking tasks.
-             The example below uses the AskUbuntuDupQuestions dataset.
-             """)
-
-             # Load the example task file
-             with open("AskUbuntuDupQuestions_human_eval.json", "r") as f:
-                 example_data = json.load(f)
-
-             # Display a demo with the example data
-             reranking_demo = create_reranking_interface(example_data)
-
-         with gr.TabItem("Upload & Evaluate"):
-             gr.Markdown("""
-             ## Upload Your Own Task File
-
-             If you have a prepared task file, you can upload it here to try out the evaluation interface.
-             """)
-
-             file_input = gr.File(label="Upload a task file (JSON)")
-             load_btn = gr.Button("Load Task")
-             message = gr.Textbox(label="Status")
-             task_container = gr.HTML()
-
-             def load_custom_task(file):
-                 if not file:
-                     return "Please upload a task file"
-
-                 try:
-                     with open(file.name, "r") as f:
-                         task_data = json.load(f)
-
-                     task_interface = create_reranking_interface(task_data)
-                     # This is a placeholder - in Gradio you can't dynamically create interfaces this way
-                     # You would need a different approach for a real implementation
-                     return f"Task '{task_data['task_name']}' loaded with {len(task_data['samples'])} samples"
-                 except Exception as e:
-                     return f"Error loading task file: {str(e)}"
-
-             load_btn.click(load_custom_task, inputs=[file_input], outputs=[message])
-
- if __name__ == "__main__":
-     demo.launch()
+ import gradio as gr
+ import json
+ import os
+ from pathlib import Path
+
+ def create_reranking_interface(task_data):
+     """Create a Gradio interface for reranking evaluation."""
+     samples = task_data["samples"]
+     results = {"task_name": task_data["task_name"], "task_type": "reranking", "annotations": []}
+     completed_samples = {s["id"]: False for s in samples}
+
+     def save_ranking(rankings, sample_id):
+         """Save the current set of rankings."""
+         # Check if all documents have rankings
+         all_ranked = all(r is not None and r != "" for r in rankings)
+         if not all_ranked:
+             return "⚠️ Please assign a rank to all documents before submitting", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
+
+         # Convert rankings to integers
+         processed_rankings = [int(r) for r in rankings]
+
+         # Check for duplicate rankings
+         if len(set(processed_rankings)) != len(processed_rankings):
+             return "⚠️ Each document must have a unique rank. Please review your rankings.", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
+
+         # Store this annotation
+         existing_idx = next((i for i, a in enumerate(results["annotations"]) if a["sample_id"] == sample_id), None)
+         if existing_idx is not None:
+             results["annotations"][existing_idx] = {
+                 "sample_id": sample_id,
+                 "rankings": processed_rankings
+             }
+         else:
+             results["annotations"].append({
+                 "sample_id": sample_id,
+                 "rankings": processed_rankings
+             })
+
+         completed_samples[sample_id] = True
+         success_msg = f"✅ Rankings for query '{sample_id}' successfully saved!"
+         progress = f"Progress: {sum(completed_samples.values())}/{len(samples)}"
+
+         # Auto-save results after each submission
+         output_path = f"{task_data['task_name']}_human_results.json"
+         with open(output_path, "w") as f:
+             json.dump(results, f, indent=2)
+
+         return success_msg, progress
+
+     with gr.Blocks(theme=gr.themes.Soft()) as demo:
+         gr.Markdown(f"# {task_data['task_name']} - Human Reranking Evaluation")
+
+         with gr.Accordion("Instructions", open=True):
+             gr.Markdown("""
+             ## Task Instructions
+
+             {instructions}
+
+             ### How to use this interface:
+             1. Read the query at the top
+             2. Review each document carefully
+             3. Assign a rank to each document (1 = most relevant, higher numbers = less relevant)
+             4. Each document must have a unique rank
+             5. Click "Submit Rankings" when you're done with the current query
+             6. Use "Previous" and "Next" to navigate between queries
+             7. Click "Save All Results" periodically to ensure your work is saved
+             """.format(instructions=task_data["instructions"]))
+
+         current_sample_id = gr.State(value=samples[0]["id"])
+
+         with gr.Row():
+             progress_text = gr.Textbox(label="Progress", value=f"Progress: 0/{len(samples)}", interactive=False)
+             status_box = gr.Textbox(label="Status", value="Ready to start evaluation", interactive=False)
+
+         with gr.Group():
+             gr.Markdown("## Query:")
+             query_text = gr.Textbox(value=samples[0]["query"], label="", interactive=False)
+
+         gr.Markdown("## Documents to Rank:")
+
+         # Create document displays and ranking dropdowns in synchronized pairs
+         doc_containers = []
+         ranking_dropdowns = []
+
+         with gr.Column():
+             for i, doc in enumerate(samples[0]["candidates"]):
+                 with gr.Row():
+                     doc_box = gr.Textbox(
+                         value=doc,
+                         label=f"Document {i+1}",
+                         interactive=False
+                     )
+                     dropdown = gr.Dropdown(
+                         choices=[str(j) for j in range(1, len(samples[0]["candidates"])+1)],
+                         label=f"Rank",
+                         value=""
+                     )
+                     doc_containers.append(doc_box)
+                     ranking_dropdowns.append(dropdown)
+
+         with gr.Row():
+             prev_btn = gr.Button("← Previous Query", size="sm")
+             submit_btn = gr.Button("Submit Rankings", size="lg", variant="primary")
+             next_btn = gr.Button("Next Query →", size="sm")
+
+         save_btn = gr.Button("💾 Save All Results", variant="secondary")
+
+         def load_sample(sample_id):
+             """Load a specific sample into the interface."""
+             sample = next((s for s in samples if s["id"] == sample_id), None)
+             if not sample:
+                 return [query_text.value] + [d.value for d in doc_containers] + [""] * len(ranking_dropdowns) + [current_sample_id.value, progress_text.value, status_box.value]
+
+             # Update query
+             new_query = sample["query"]
+
+             # Update documents
+             new_docs = []
+             for i, doc in enumerate(sample["candidates"]):
+                 if i < len(doc_containers):
+                     new_docs.append(doc)
+
+             # Initialize rankings
+             new_rankings = [""] * len(ranking_dropdowns)
+
+             # Check if this sample has already been annotated
+             existing_annotation = next((a for a in results["annotations"] if a["sample_id"] == sample_id), None)
+             if existing_annotation:
+                 # Restore previous rankings
+                 for i, rank in enumerate(existing_annotation["rankings"]):
+                     if i < len(new_rankings) and rank is not None:
+                         new_rankings[i] = str(rank)
+
+             # Update progress
+             current_idx = samples.index(sample)
+             new_progress = f"Progress: {sum(completed_samples.values())}/{len(samples)}"
+
+             new_status = f"Viewing query {current_idx + 1} of {len(samples)}"
+             if completed_samples[sample_id]:
+                 new_status += " (already completed)"
+
+             return [new_query] + new_docs + new_rankings + [sample["id"], new_progress, new_status]
+
+         def next_sample(current_id):
+             """Load the next sample."""
+             current_sample = next((s for s in samples if s["id"] == current_id), None)
+             if not current_sample:
+                 return current_id
+
+             current_idx = samples.index(current_sample)
+             if current_idx < len(samples) - 1:
+                 next_sample = samples[current_idx + 1]
+                 return next_sample["id"]
+             return current_id
+
+         def prev_sample(current_id):
+             """Load the previous sample."""
+             current_sample = next((s for s in samples if s["id"] == current_id), None)
+             if not current_sample:
+                 return current_id
+
+             current_idx = samples.index(current_sample)
+             if current_idx > 0:
+                 prev_sample = samples[current_idx - 1]
+                 return prev_sample["id"]
+             return current_id
+
+         def save_results():
+             """Save all collected results to a file."""
+             output_path = f"{task_data['task_name']}_human_results.json"
+             with open(output_path, "w") as f:
+                 json.dump(results, f, indent=2)
+             return f"✅ Results saved to {output_path} ({len(results['annotations'])} annotations)"
+
+         # Connect events
+         submit_btn.click(
+             save_ranking,
+             inputs=ranking_dropdowns + [current_sample_id],
+             outputs=[status_box, progress_text]
+         )
+
+         next_btn.click(
+             next_sample,
+             inputs=[current_sample_id],
+             outputs=[current_sample_id]
+         ).then(
+             load_sample,
+             inputs=[current_sample_id],
+             outputs=[query_text] + doc_containers + ranking_dropdowns + [current_sample_id, progress_text, status_box]
+         )
+
+         prev_btn.click(
+             prev_sample,
+             inputs=[current_sample_id],
+             outputs=[current_sample_id]
+         ).then(
+             load_sample,
+             inputs=[current_sample_id],
+             outputs=[query_text] + doc_containers + ranking_dropdowns + [current_sample_id, progress_text, status_box]
+         )
+
+         save_btn.click(save_results, outputs=[status_box])
+
+     return demo
+
+ # Main app with file upload capability
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# MTEB Human Evaluation Demo")
+
+     with gr.Tabs():
+         with gr.TabItem("Demo"):
+             gr.Markdown("""
+             ## MTEB Human Evaluation Interface
+
+             This interface allows you to evaluate the relevance of documents for reranking tasks.
+             """)
+
+             # Function to get the most recent task file
+             def get_latest_task_file():
+                 # Check first in uploaded_tasks directory
+                 os.makedirs("uploaded_tasks", exist_ok=True)
+                 uploaded_tasks = [f for f in os.listdir("uploaded_tasks") if f.endswith(".json")]
+
+                 if uploaded_tasks:
+                     # Sort by modification time, newest first
+                     uploaded_tasks.sort(key=lambda x: os.path.getmtime(os.path.join("uploaded_tasks", x)), reverse=True)
+                     return os.path.join("uploaded_tasks", uploaded_tasks[0])
+
+                 # Fall back to default example
+                 return "AskUbuntuDupQuestions_human_eval.json"
+
+             # Load the task file
+             task_file = get_latest_task_file()
+
+             try:
+                 with open(task_file, "r") as f:
+                     task_data = json.load(f)
+
+                 # Show which task is currently loaded
+                 gr.Markdown(f"**Current Task: {task_data['task_name']}** ({len(task_data['samples'])} samples)")
+
+                 # Display the interface
+                 reranking_demo = create_reranking_interface(task_data)
+             except Exception as e:
+                 gr.Markdown(f"**Error loading task: {str(e)}**")
+                 gr.Markdown("Please upload a valid task file in the 'Upload & Evaluate' tab.")
+
+         with gr.TabItem("Upload & Evaluate"):
+             gr.Markdown("""
+             ## Upload Your Own Task File
+
+             If you have a prepared task file, you can upload it here to create an evaluation interface.
+             """)
+
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     file_input = gr.File(label="Upload a task file (JSON)")
+                     load_btn = gr.Button("Load Task")
+                     message = gr.Textbox(label="Status", interactive=False)
+
+                     # Add task list for previously uploaded tasks
+                     gr.Markdown("### Previous Uploads")
+
+                     # Function to list existing task files in the tasks directory
+                     def list_task_files():
+                         os.makedirs("uploaded_tasks", exist_ok=True)
+                         tasks = [f for f in os.listdir("uploaded_tasks") if f.endswith(".json")]
+                         if not tasks:
+                             return "No task files uploaded yet."
+                         return "\n".join([f"- [{t}](javascript:selectTask('{t}'))" for t in tasks])
+
+                     task_list = gr.Markdown(list_task_files())
+                     refresh_btn = gr.Button("Refresh List")
+
+                     # Add results management section
+                     gr.Markdown("### Results Management")
+
+                     # Function to list existing result files
+                     def list_result_files():
+                         results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
+                         if not results:
+                             return "No result files available yet."
+
+                         result_links = []
+                         for r in results:
+                             # Calculate completion stats
+                             try:
+                                 with open(r, "r") as f:
+                                     result_data = json.load(f)
+                                 annotation_count = len(result_data.get("annotations", []))
+                                 task_name = result_data.get("task_name", "Unknown")
+                                 result_links.append(f"- {r} ({annotation_count} annotations for {task_name})")
+                             except:
+                                 result_links.append(f"- {r}")
+
+                         return "\n".join(result_links)
+
+                     results_list = gr.Markdown(list_result_files())
+                     download_results_btn = gr.Button("Download Results")
+
+                 # Right side - will contain the actual interface
+                 with gr.Column(scale=2):
+                     task_container = gr.HTML()
+
+             # Handle file upload and storage
+             def handle_upload(file):
+                 if not file:
+                     return "Please upload a task file", task_list.value, task_container.value
+
+                 try:
+                     # Create directory if it doesn't exist
+                     os.makedirs("uploaded_tasks", exist_ok=True)
+
+                     # Read the uploaded file
+                     with open(file.name, "r") as f:
+                         task_data = json.load(f)
+
+                     # Validate task format
+                     if "task_name" not in task_data or "samples" not in task_data:
+                         return "Invalid task file format. Must contain 'task_name' and 'samples' fields.", task_list.value, task_container.value
+
+                     # Save to a consistent location
+                     task_filename = f"uploaded_tasks/{task_data['task_name']}_task.json"
+                     with open(task_filename, "w") as f:
+                         json.dump(task_data, f, indent=2)
+
+                     # Instead of trying to create the interface here,
+                     # we'll return a message with instructions
+                     return f"Task '{task_data['task_name']}' uploaded successfully with {len(task_data['samples'])} samples. Please refresh the app and use the Demo tab to evaluate it.", list_task_files(), f"""
+                     <div style="padding: 20px; background-color: #f0f0f0; border-radius: 10px;">
+                         <h3>Task uploaded successfully!</h3>
+                         <p>Task Name: {task_data['task_name']}</p>
+                         <p>Samples: {len(task_data['samples'])}</p>
+                         <p>To evaluate this task:</p>
+                         <ol>
+                             <li>Refresh the app</li>
+                             <li>The Demo tab will now use your uploaded task</li>
+                             <li>Complete your evaluations</li>
+                             <li>Results will be saved as {task_data['task_name']}_human_results.json</li>
+                         </ol>
+                     </div>
+                     """
+                 except Exception as e:
+                     return f"Error processing task file: {str(e)}", task_list.value, task_container.value
+
+             # Function to prepare results for download
+             def prepare_results_for_download():
+                 results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
+                 if not results:
+                     return None
+
+                 # Create a zip file with all results
+                 import zipfile
+                 zip_path = "mteb_human_eval_results.zip"
+                 with zipfile.ZipFile(zip_path, 'w') as zipf:
+                     for r in results:
+                         zipf.write(r)
+
+                 return zip_path
+
+             # Connect events
+             load_btn.click(handle_upload, inputs=[file_input], outputs=[message, task_list, task_container])
+             refresh_btn.click(list_task_files, outputs=[task_list])
+             download_results_btn.click(prepare_results_for_download, outputs=[gr.File(label="Download Results")])
+
+         with gr.TabItem("Results Management"):
+             gr.Markdown("""
+             ## Manage Evaluation Results
+
+             View, download, and analyze your evaluation results.
+             """)
+
+             # Function to load and display result stats
+             def get_result_stats():
+                 results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
+                 if not results:
+                     return "No result files available yet."
+
+                 stats = []
+                 for r in results:
+                     try:
+                         with open(r, "r") as f:
+                             result_data = json.load(f)
+
+                         task_name = result_data.get("task_name", "Unknown")
+                         annotations = result_data.get("annotations", [])
+                         annotation_count = len(annotations)
+
+                         # Calculate completion percentage
+                         sample_ids = set(a.get("sample_id") for a in annotations)
+
+                         # Try to get the total sample count from the corresponding task file
+                         total_samples = 0
+                         task_file = f"uploaded_tasks/{task_name}_task.json"
+                         if os.path.exists(task_file):
+                             with open(task_file, "r") as f:
+                                 task_data = json.load(f)
+                             total_samples = len(task_data.get("samples", []))
+
+                         completion = f"{len(sample_ids)}/{total_samples}" if total_samples else f"{len(sample_ids)} samples"
+
+                         stats.append(f"### {task_name}\n- Annotations: {annotation_count}\n- Completion: {completion}\n- File: {r}")
+                     except Exception as e:
+                         stats.append(f"### {r}\n- Error loading results: {str(e)}")
+
+                 return "\n\n".join(stats)
+
+             result_stats = gr.Markdown(get_result_stats())
+             refresh_results_btn = gr.Button("Refresh Results")
+
+             # Add download options
+             with gr.Row():
+                 download_all_btn = gr.Button("Download All Results (ZIP)")
+                 result_select = gr.Dropdown(choices=[f for f in os.listdir(".") if f.endswith("_human_results.json")], label="Select Result to Download")
+                 download_selected_btn = gr.Button("Download Selected")
+
+             # Add results visualization placeholder
+             gr.Markdown("### Results Visualization")
+             gr.Markdown("*Visualization features will be added in a future update.*")
+
+             # Connect events
+             refresh_results_btn.click(get_result_stats, outputs=[result_stats])
+
+             # Function to prepare all results for download as ZIP
+             def prepare_all_results():
+                 import zipfile
+                 zip_path = "mteb_human_eval_results.zip"
+                 with zipfile.ZipFile(zip_path, 'w') as zipf:
+                     for r in [f for f in os.listdir(".") if f.endswith("_human_results.json")]:
+                         zipf.write(r)
+                 return zip_path
+
+             # Function to return a single result file
+             def get_selected_result(filename):
+                 if not filename:
+                     return None
+                 if os.path.exists(filename):
+                     return filename
+                 return None
+
+             # Update dropdown when refreshing results
+             def update_result_dropdown():
+                 return gr.Dropdown.update(choices=[f for f in os.listdir(".") if f.endswith("_human_results.json")])
+
+             refresh_results_btn.click(update_result_dropdown, outputs=[result_select])
+             download_all_btn.click(prepare_all_results, outputs=[gr.File(label="Download All Results")])
+             download_selected_btn.click(get_selected_result, inputs=[result_select], outputs=[gr.File(label="Download Selected Result")])
+
+ if __name__ == "__main__":
+     demo.launch()
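
For reference, the data contract between task files and this app is implicit in the diff above: `create_reranking_interface` reads `task_name`, `instructions`, and `samples` (each sample carrying an `id`, a `query`, and a list of `candidates`); `handle_upload` stores uploads as `uploaded_tasks/{task_name}_task.json`, which `get_latest_task_file` then prefers over the bundled example; and `save_ranking` auto-saves annotations to `{task_name}_human_results.json`. The sketch below is not part of the commit; it is a minimal illustration of that contract, with field names taken from the keys `app.py` actually accesses and all sample values invented.

```python
import json
import os

# Minimal task file matching the fields app.py reads; values are made up.
task = {
    "task_name": "AskUbuntuDupQuestions",
    "instructions": "Rank each candidate by how relevant it is to the query.",
    "samples": [
        {
            "id": "sample_1",
            "query": "How do I install a .deb package?",
            "candidates": [
                "Installing .deb files from the terminal",
                "How do I upgrade my kernel?",
                "dpkg vs apt for local packages",
            ],
        },
    ],
}

# handle_upload() saves uploads here, and get_latest_task_file() loads the
# newest .json from this directory before falling back to the bundled example.
os.makedirs("uploaded_tasks", exist_ok=True)
with open("uploaded_tasks/AskUbuntuDupQuestions_task.json", "w") as f:
    json.dump(task, f, indent=2)

# Shape of the output save_ranking() auto-saves to
# {task_name}_human_results.json: rankings[i] is the rank the annotator
# assigned to candidates[i], with 1 = most relevant.
expected_results = {
    "task_name": "AskUbuntuDupQuestions",
    "task_type": "reranking",
    "annotations": [
        {"sample_id": "sample_1", "rankings": [1, 3, 2]},
    ],
}
```

With a file like this in `uploaded_tasks/`, the Demo tab picks it up on the next app restart, which is the workflow the upload handler's success message describes.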