Spaces:

AdnanElAssadi
/

MTEB-Human-Eval-Demo

Running

App Files Files Community

AdnanElAssadi committed 30 days ago

Commit

dbcd9e2

verified ·

1 Parent(s): 48294e4

Update app.py

Browse files

Files changed (1) hide show

app.py +152 -379

app.py CHANGED Viewed

@@ -9,23 +9,55 @@ def create_reranking_interface(task_data):
     results = {"task_name": task_data["task_name"], "task_type": "reranking", "annotations": []}
     completed_samples = {s["id"]: False for s in samples}
-    # Try to load existing results
-    output_path = f"{task_data['task_name']}_human_results.json"
-    if os.path.exists(output_path):
         try:
-            with open(output_path, "r") as f:
-                existing_results = json.load(f)
-                results = existing_results
-                # Update completed samples based on existing annotations
-                for anno in results.get("annotations", []):
-                    if "sample_id" in anno:
-                        completed_samples[anno["sample_id"]] = True
         except Exception as e:
-            print(f"Error loading existing results: {str(e)}")
-    # Create the main interface
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
         gr.Markdown(f"# {task_data['task_name']} - Human Reranking Evaluation")
         with gr.Accordion("Instructions", open=True):
             gr.Markdown("""
             ## Task Instructions
@@ -34,413 +66,156 @@ def create_reranking_interface(task_data):
             ### How to use this interface:
             1. Read the query at the top
-            2. For each document, select its rank (1 = most relevant)
-            3. Make sure each document has a unique rank (1 to N)
-            4. Click "Submit Rankings" when you're done with the current query
-            5. Use "Previous" and "Next" to navigate between queries
-            6. Click "Save All Results" periodically to ensure your work is saved
             """.format(instructions=task_data["instructions"]))
-        # State variables
         current_sample_id = gr.State(value=samples[0]["id"])
-        # Progress tracking
         with gr.Row():
-            progress_text = gr.Textbox(label="Progress", value=f"Progress: {sum(completed_samples.values())}/{len(samples)}", interactive=False)
             status_box = gr.Textbox(label="Status", value="Ready to start evaluation", interactive=False)
-        # Query display
         with gr.Group():
             gr.Markdown("## Query:")
-            query_text = gr.Textbox(value=samples[0]["query"], label="", interactive=False, lines=3)
-            # Document ranking section
             gr.Markdown("## Documents to Rank:")
-            # Create simple data structure for documents
-            doc_state = gr.State(value=samples[0]["candidates"])
-            # Create dynamic HTML for the ranking interface
-            def generate_ranking_html(docs, existing_ranks=None):
-                """Generate HTML for number-based ranking interface."""
-                if not docs:
-                    return ""
-                # Use existing ranks if available
-                ranks = list(range(1, len(docs) + 1))
-                if existing_ranks and len(existing_ranks) == len(docs):
-                    ranks = existing_ranks
-                html = """
-                <style>
-                .doc-container {
-                    margin-bottom: 15px;
-                    border: 1px solid #ddd;
-                    border-radius: 8px;
-                    padding: 15px;
-                    background-color: #f9f9f9;
-                }
-                .doc-header {
-                    display: flex;
-                    align-items: center;
-                    margin-bottom: 10px;
-                }
-                .doc-rank {
-                    display: flex;
-                    align-items: center;
-                    margin-right: 15px;
-                }
-                .rank-label {
-                    font-weight: bold;
-                    margin-right: 8px;
-                    min-width: 80px;
-                }
-                .rank-input {
-                    width: 60px;
-                    padding: 5px;
-                    border: 2px solid #007bff;
-                    border-radius: 4px;
-                    text-align: center;
-                    font-size: 16px;
-                }
-                .doc-content {
-                    padding: 10px;
-                    background-color: white;
-                    border-radius: 4px;
-                    border-left: 4px solid #007bff;
-                    white-space: pre-wrap;
-                    font-family: sans-serif;
-                    line-height: 1.5;
-                }
-                </style>
-                <div id="ranking-form">
-                <input type="hidden" id="ranking-state" value="">
-                """
-                # Add each document with a number input
-                for i, doc in enumerate(docs):
-                    import html as html_lib
-                    escaped_doc = html_lib.escape(doc)
-                    current_rank = ranks[i] if i < len(ranks) else i + 1
-                    html += f"""
-                    <div class="doc-container" id="doc-{i}">
-                        <div class="doc-header">
-                            <div class="doc-rank">
-                                <span class="rank-label">Document {i+1} Rank:</span>
-                                <input type="number" class="rank-input" id="rank-{i}" value="{current_rank}"
-                                       min="1" max="{len(docs)}" data-doc-id="{i}"
-                                       onchange="updateRankings()">
-                            </div>
-                        </div>
-                        <div class="doc-content">{escaped_doc}</div>
-                    </div>
-                    """
-                # Add validation and state tracking JS
-                html += """
-                <script>
-                function updateRankings() {
-                    // Collect all rank inputs
-                    const inputs = document.querySelectorAll('.rank-input');
-                    const rankings = [];
-                    // Get values and highlight duplicates
-                    const values = new Map();
-                    const duplicates = new Set();
-                    inputs.forEach(input => {
-                        const docId = parseInt(input.getAttribute('data-doc-id'));
-                        const rank = parseInt(input.value);
-                        // Store value
-                        rankings.push({
-                            docId: docId,
-                            rank: rank
-                        });
-                        // Check for duplicates
-                        if (values.has(rank)) {
-                            duplicates.add(rank);
-                        } else {
-                            values.set(rank, docId);
-                        }
-                        // Reset styling
-                        input.style.borderColor = '#007bff';
-                    });
-                    // Highlight duplicates
-                    inputs.forEach(input => {
-                        const rank = parseInt(input.value);
-                        if (duplicates.has(rank)) {
-                            input.style.borderColor = '#ff3860';
-                        }
-                    });
-                    // Store to hidden input
-                    const stateInput = document.getElementById('ranking-state');
-                    if (stateInput) {
-                        stateInput.value = JSON.stringify(rankings);
-                    }
-                    // Update gradio text area
-                    const textArea = document.querySelector('#rankings-state-input textarea');
-                    if (textArea) {
-                        textArea.value = JSON.stringify(rankings);
-                        const event = new Event('input', { bubbles: true });
-                        textArea.dispatchEvent(event);
-                    }
-                }
-                // Initialize on page load
-                document.addEventListener('DOMContentLoaded', updateRankings);
-                // Also use a delay as a backup
-                setTimeout(updateRankings, 500);
-                </script>
-                </div>
-                """
-                return html
-            # Initial ranking HTML
-            ranking_html = gr.HTML(
-                generate_ranking_html(samples[0]["candidates"]),
-                elem_id="ranking-container"
-            )
-            # Hidden input for state
-            rankings_state = gr.Textbox(
-                value="[]",
-                visible=False,
-                elem_id="rankings-state-input"
-            )
-            # Validation message
-            validation_msg = gr.Textbox(
-                label="Validation",
-                interactive=False
-            )
-            # Navigation and submission buttons
             with gr.Row():
                 prev_btn = gr.Button("← Previous Query", size="sm")
-                validate_btn = gr.Button("Validate Rankings", variant="secondary")
                 submit_btn = gr.Button("Submit Rankings", size="lg", variant="primary")
                 next_btn = gr.Button("Next Query →", size="sm")
             save_btn = gr.Button("💾 Save All Results", variant="secondary")
-        # Function to load a sample
         def load_sample(sample_id):
-            try:
-                sample = next((s for s in samples if s["id"] == sample_id), None)
-                if not sample:
-                    return sample_id, gr.update(), gr.update(), gr.update(), gr.update(), "[]", gr.update()
-                # Get existing ranking if available
-                existing_ranking = next((anno["rankings"] for anno in results["annotations"] if anno["sample_id"] == sample_id), None)
-                # Generate HTML with existing rankings if available
-                html = generate_ranking_html(sample["candidates"], existing_ranking)
-                # Update status
-                status = "Already ranked" if completed_samples.get(sample_id, False) else "Ready to rank"
-                progress = f"Progress: {sum(completed_samples.values())}/{len(samples)}"
-                return sample_id, sample["query"], html, progress, status, "[]", ""
-            except Exception as e:
-                import traceback
-                print(traceback.format_exc())
-                return sample_id, gr.update(), gr.update(), gr.update(), f"Error: {str(e)}", "[]", ""
-        # Function to validate rankings from JSON state
-        def validate_ranking_state(state_json):
-            try:
-                if not state_json or state_json == "[]":
-                    return "Please rank all documents before submitting."
-                # Parse the state
-                state = json.loads(state_json)
-                if not state:
-                    return "No ranking data found."
-                # Extract ranks
-                ranks = [item.get("rank") for item in state if "rank" in item]
-                if not ranks:
-                    return "No valid ranks found."
-                # Check for duplicates
-                if len(set(ranks)) != len(ranks):
-                    # Find duplicates
-                    dupes = {}
-                    for r in ranks:
-                        dupes[r] = dupes.get(r, 0) + 1
-                    duplicates = [r for r, count in dupes.items() if count > 1]
-                    return f"⚠️ Duplicate ranks found: {', '.join(map(str, sorted(duplicates)))}. Each document must have a unique rank."
-                # Check for complete sequence
-                max_rank = max(ranks)
-                expected = set(range(1, max_rank + 1))
-                if set(ranks) != expected:
-                    missing = sorted(expected - set(ranks))
-                    return f"⚠️ Missing ranks: {', '.join(map(str, missing))}. Ranks must be consecutive from 1 to {max_rank}."
-                return "✅ Rankings are valid. Ready to submit."
-            except json.JSONDecodeError:
-                return "Error parsing ranking data."
-            except Exception as e:
-                return f"Error validating rankings: {str(e)}"
-        # Function to save rankings from JSON state
-        def save_ranking_state(sample_id, state_json):
-            try:
-                if not state_json or state_json == "[]":
-                    return "Please rank all documents before submitting.", progress_text.value
-                # Get the sample
-                sample = next((s for s in samples if s["id"] == sample_id), None)
-                if not sample:
-                    return "⚠️ Sample not found", progress_text.value
-                # Parse the state
-                state = json.loads(state_json)
-                # Create a rankings array in the correct order
-                num_candidates = len(sample["candidates"])
-                rankings = [0] * num_candidates
-                # Fill in rankings from state
-                for item in state:
-                    doc_id = item.get("docId")
-                    rank = item.get("rank")
-                    if doc_id is not None and doc_id < num_candidates and rank is not None:
-                        rankings[doc_id] = rank
-                # Validate rankings
-                if any(r == 0 for r in rankings):
-                    return "⚠️ Not all documents have rankings", progress_text.value
-                if sorted(rankings) != list(range(1, num_candidates + 1)):
-                    return f"⚠️ Invalid ranking sequence. Please use each number from 1 to {num_candidates} exactly once.", progress_text.value
-                # Create annotation
-                annotation = {"sample_id": sample_id, "rankings": rankings}
-                # Update or add to results
-                existing_idx = next((i for i, a in enumerate(results["annotations"]) if a["sample_id"] == sample_id), None)
-                if existing_idx is not None:
-                    results["annotations"][existing_idx] = annotation
-                else:
-                    results["annotations"].append(annotation)
-                # Mark as completed
-                completed_samples[sample_id] = True
-                # Save to file
-                with open(output_path, "w") as f:
-                    json.dump(results, f, indent=2)
-                # Update progress
-                progress = f"Progress: {sum(completed_samples.values())}/{len(samples)}"
-                return f"✅ Rankings saved successfully! ({sum(completed_samples.values())}/{len(samples)} completed)", progress
-            except Exception as e:
-                import traceback
-                print(traceback.format_exc())
-                return f"⚠️ Error saving rankings: {str(e)}", progress_text.value
-        # Function to navigate to next sample
-        def next_sample_id(current_id):
-            current_idx = next((i for i, s in enumerate(samples) if s["id"] == current_id), -1)
-            if current_idx == -1:
                 return current_id
-            next_idx = min(current_idx + 1, len(samples) - 1)
-            return samples[next_idx]["id"]
-        # Function to navigate to previous sample
-        def prev_sample_id(current_id):
-            current_idx = next((i for i, s in enumerate(samples) if s["id"] == current_id), -1)
-            if current_idx == -1:
                 return current_id
-            prev_idx = max(current_idx - 1, 0)
-            return samples[prev_idx]["id"]
-        # Function to save all results
         def save_results():
-            try:
-                with open(output_path, "w") as f:
-                    json.dump(results, f, indent=2)
-                return f"✅ Results saved to {output_path} ({len(results['annotations'])} annotations)"
-            except Exception as e:
-                return f"⚠️ Error saving results file: {str(e)}"
-        # Connect buttons
-        validate_btn.click(
-            validate_ranking_state,
-            inputs=[rankings_state],
-            outputs=[validation_msg]
-        )
         submit_btn.click(
-            save_ranking_state,
-            inputs=[current_sample_id, rankings_state],
             outputs=[status_box, progress_text]
         )
         next_btn.click(
-            next_sample_id,
-            inputs=[current_sample_id],
             outputs=[current_sample_id]
         ).then(
             load_sample,
             inputs=[current_sample_id],
-            outputs=[
-                current_sample_id,
-                query_text,
-                ranking_html,
-                progress_text,
-                status_box,
-                rankings_state,
-                validation_msg
-            ]
         )
         prev_btn.click(
-            prev_sample_id,
-            inputs=[current_sample_id],
             outputs=[current_sample_id]
         ).then(
             load_sample,
             inputs=[current_sample_id],
-            outputs=[
-                current_sample_id,
-                query_text,
-                ranking_html,
-                progress_text,
-                status_box,
-                rankings_state,
-                validation_msg
-            ]
         )
         save_btn.click(save_results, outputs=[status_box])
-        # Initialize with first sample
-        demo.load(
-            lambda: load_sample(samples[0]['id']),
-            outputs=[
-                current_sample_id,
-                query_text,
-                ranking_html,
-                progress_text,
-                status_box,
-                rankings_state,
-                validation_msg
-            ]
-        )
     return demo
 # Main app with file upload capability
@@ -493,7 +268,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             """)
             with gr.Row():
-                with gr.Column():
                     file_input = gr.File(label="Upload a task file (JSON)")
                     load_btn = gr.Button("Load Task")
                     message = gr.Textbox(label="Status", interactive=False)
@@ -539,7 +314,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                     download_results_btn = gr.Button("Download Results")
                 # Right side - will contain the actual interface
-                with gr.Column():
                     task_container = gr.HTML()
             # Handle file upload and storage
@@ -650,11 +425,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             # Add download options
             with gr.Row():
-                with gr.Column():
-                    download_all_btn = gr.Button("Download All Results (ZIP)")
-                with gr.Column():
-                    result_select = gr.Dropdown(choices=[f for f in os.listdir(".") if f.endswith("_human_results.json")], label="Select Result to Download", value=None)
-                    download_selected_btn = gr.Button("Download Selected")
             # Add results visualization placeholder
             gr.Markdown("### Results Visualization")

     results = {"task_name": task_data["task_name"], "task_type": "reranking", "annotations": []}
     completed_samples = {s["id"]: False for s in samples}
+    def save_ranking(rankings, sample_id):
+        """Save the current set of rankings."""
         try:
+            # Check if all documents have rankings
+            all_ranked = all(r is not None and r != "" for r in rankings)
+            if not all_ranked:
+                return "⚠️ Please assign a rank to all documents before submitting", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
+            # Convert rankings to integers with better error handling
+            try:
+                processed_rankings = [int(r) for r in rankings]
+            except ValueError:
+                return "⚠️ Invalid ranking value. Please use only numbers.", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
+            # Check for duplicate rankings
+            if len(set(processed_rankings)) != len(processed_rankings):
+                return "⚠️ Each document must have a unique rank. Please review your rankings.", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
+            # Store this annotation in memory
+            existing_idx = next((i for i, a in enumerate(results["annotations"]) if a["sample_id"] == sample_id), None)
+            if existing_idx is not None:
+                results["annotations"][existing_idx] = {
+                    "sample_id": sample_id,
+                    "rankings": processed_rankings
+                }
+            else:
+                results["annotations"].append({
+                    "sample_id": sample_id,
+                    "rankings": processed_rankings
+                })
+            completed_samples[sample_id] = True
+            # Try to save to file, but continue even if it fails
+            try:
+                output_path = f"{task_data['task_name']}_human_results.json"
+                with open(output_path, "w") as f:
+                    json.dump(results, f, indent=2)
+                return f"✅ Rankings saved successfully (in memory and to file)", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
+            except:
+                # If file saving fails, still mark as success since we saved in memory
+                return f"✅ Rankings saved in memory (file save failed)", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
         except Exception as e:
+            # Return specific error message
+            return f"Error: {str(e)}", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
         gr.Markdown(f"# {task_data['task_name']} - Human Reranking Evaluation")
         with gr.Accordion("Instructions", open=True):
             gr.Markdown("""
             ## Task Instructions
             ### How to use this interface:
             1. Read the query at the top
+            2. Review each document carefully
+            3. Assign a rank to each document (1 = most relevant, higher numbers = less relevant)
+            4. Each document must have a unique rank
+            5. Click "Submit Rankings" when you're done with the current query
+            6. Use "Previous" and "Next" to navigate between queries
+            7. Click "Save All Results" periodically to ensure your work is saved
             """.format(instructions=task_data["instructions"]))
         current_sample_id = gr.State(value=samples[0]["id"])
         with gr.Row():
+            progress_text = gr.Textbox(label="Progress", value=f"Progress: 0/{len(samples)}", interactive=False)
             status_box = gr.Textbox(label="Status", value="Ready to start evaluation", interactive=False)
         with gr.Group():
             gr.Markdown("## Query:")
+            query_text = gr.Textbox(value=samples[0]["query"], label="", interactive=False)
             gr.Markdown("## Documents to Rank:")
+            # Create document displays and ranking dropdowns in synchronized pairs
+            doc_containers = []
+            ranking_dropdowns = []
+            with gr.Column():
+                for i, doc in enumerate(samples[0]["candidates"]):
+                    with gr.Row():
+                        doc_box = gr.Textbox(
+                            value=doc,
+                            label=f"Document {i+1}",
+                            interactive=False
+                        )
+                        dropdown = gr.Dropdown(
+                            choices=[str(j) for j in range(1, len(samples[0]["candidates"])+1)],
+                            label=f"Rank",
+                            value=""
+                        )
+                        doc_containers.append(doc_box)
+                        ranking_dropdowns.append(dropdown)
             with gr.Row():
                 prev_btn = gr.Button("← Previous Query", size="sm")
                 submit_btn = gr.Button("Submit Rankings", size="lg", variant="primary")
                 next_btn = gr.Button("Next Query →", size="sm")
             save_btn = gr.Button("💾 Save All Results", variant="secondary")
         def load_sample(sample_id):
             """Build the component values that display the sample ``sample_id``.

             Returns:
                 A list ordered like the ``outputs`` of the navigation handlers:
                 the query text, one value per document box, one value per rank
                 dropdown, then the sample ID, progress text and status text.
             """
             current_idx = next((i for i, s in enumerate(samples) if s["id"] == sample_id), None)
             if current_idx is None:
                 # NOTE(review): `.value` is presumably the value each component
                 # was constructed with, so an unknown ID falls back to the
                 # initial view - confirm against the Gradio version in use.
                 return [query_text.value] + [d.value for d in doc_containers] + [""] * len(ranking_dropdowns) + [current_sample_id.value, progress_text.value, status_box.value]
             sample = samples[current_idx]

             # Emit exactly one value per document box. Extra candidates are
             # dropped and unused boxes are blanked; otherwise the returned
             # list would be shorter than `outputs` and every later value
             # would land in the wrong component.
             slot_count = len(doc_containers)
             new_docs = list(sample["candidates"])[:slot_count]
             new_docs += [""] * (slot_count - len(new_docs))

             # Start with blank ranks, then restore any earlier annotation.
             new_rankings = [""] * len(ranking_dropdowns)
             existing_annotation = next((a for a in results["annotations"] if a["sample_id"] == sample_id), None)
             if existing_annotation:
                 for i, rank in enumerate(existing_annotation["rankings"][:len(new_rankings)]):
                     if rank is not None:
                         new_rankings[i] = str(rank)

             new_progress = f"Progress: {sum(completed_samples.values())}/{len(samples)}"
             new_status = f"Viewing query {current_idx + 1} of {len(samples)}"
             if completed_samples[sample_id]:
                 new_status += " (already completed)"
             return [sample["query"]] + new_docs + new_rankings + [sample["id"], new_progress, new_status]
+        def next_sample(current_id):
+            """Load the next sample."""
+            current_sample = next((s for s in samples if s["id"] == current_id), None)
+            if not current_sample:
                 return current_id
+            current_idx = samples.index(current_sample)
+            if current_idx < len(samples) - 1:
+                next_sample = samples[current_idx + 1]
+                return next_sample["id"]
+            return current_id
+        def prev_sample(current_id):
+            """Load the previous sample."""
+            current_sample = next((s for s in samples if s["id"] == current_id), None)
+            if not current_sample:
                 return current_id
+            current_idx = samples.index(current_sample)
+            if current_idx > 0:
+                prev_sample = samples[current_idx - 1]
+                return prev_sample["id"]
+            return current_id
         def save_results():
             """Write every collected annotation to the task's results file.

             Returns:
                 A status message. A failed write is reported in the message
                 instead of being raised out of the click handler.
             """
             output_path = f"{task_data['task_name']}_human_results.json"
             try:
                 with open(output_path, "w") as f:
                     json.dump(results, f, indent=2)
             except (OSError, TypeError, ValueError) as e:
                 return f"⚠️ Error saving results file: {str(e)}"
             return f"✅ Results saved to {output_path} ({len(results['annotations'])} annotations)"
+        # Define a wrapper function that collects all the dropdown values into a list
+        def save_ranking_wrapper(*args):
+            # The last argument is the sample_id, all others are rankings
+            rankings = args[:-1]
+            sample_id = args[-1]
+            return save_ranking(rankings, sample_id)
+        # Connect events
         submit_btn.click(
+            save_ranking_wrapper,
+            inputs=ranking_dropdowns + [current_sample_id],
             outputs=[status_box, progress_text]
         )
         next_btn.click(
+            next_sample,
+            inputs=[current_sample_id],
             outputs=[current_sample_id]
         ).then(
             load_sample,
             inputs=[current_sample_id],
+            outputs=[query_text] + doc_containers + ranking_dropdowns + [current_sample_id, progress_text, status_box]
         )
         prev_btn.click(
+            prev_sample,
+            inputs=[current_sample_id],
             outputs=[current_sample_id]
         ).then(
             load_sample,
             inputs=[current_sample_id],
+            outputs=[query_text] + doc_containers + ranking_dropdowns + [current_sample_id, progress_text, status_box]
         )
         save_btn.click(save_results, outputs=[status_box])
     return demo
 # Main app with file upload capability
             """)
             with gr.Row():
+                with gr.Column(scale=1):
                     file_input = gr.File(label="Upload a task file (JSON)")
                     load_btn = gr.Button("Load Task")
                     message = gr.Textbox(label="Status", interactive=False)
                     download_results_btn = gr.Button("Download Results")
                 # Right side - will contain the actual interface
+                with gr.Column(scale=2):
                     task_container = gr.HTML()
             # Handle file upload and storage
             # Add download options
             with gr.Row():
+                download_all_btn = gr.Button("Download All Results (ZIP)")
+                result_select = gr.Dropdown(choices=[f for f in os.listdir(".") if f.endswith("_human_results.json")], label="Select Result to Download")
+                download_selected_btn = gr.Button("Download Selected")
             # Add results visualization placeholder
             gr.Markdown("### Results Visualization")