Spaces:

HumanEval-V
/

HumanEval-V-Benchmark-Viewer

Running

App Files Files Community

anonymous-researcher912 committed on Feb 18

Commit

968f8be

1 Parent(s): 3dc6935

update new data

Browse files

Files changed (6) hide show

app.py +30 -10
humaneval_v_test_hf/data-00000-of-00001.arrow +2 -2
humaneval_v_test_hf/dataset_info.json +64 -9
humaneval_v_test_hf/state.json +1 -1
idx_mapping.json +255 -0
sort_id.py +18 -0

app.py CHANGED Viewed

@@ -1,13 +1,20 @@
 import streamlit as st
 import datasets
 humaneval_v_data = datasets.load_from_disk("humaneval_v_test_hf")
 st.set_page_config(layout="wide", page_title="HumanEval-V Viewer")
-st.markdown("> <i>This is a viewer for the HumanEval-V benchmark, which consists of 108 coding tasks. Use the navigation buttons or enter an index to browse the tasks. If you encounter any issues, we encourage you to start a discussion [here](https://huggingface.co/datasets/HumanEval-V/HumanEval-V-Benchmark/discussions)</i>.", unsafe_allow_html=True)
 st.markdown("---")
-max_index = 108
 # Initialize session state for index if not present
 if 'index' not in st.session_state:
@@ -23,38 +30,51 @@ with buttons[1]:
     index_input = st.number_input(
         f"Go to index (1-{max_index}):",
         min_value=1,
-        max_value=108,
         value=st.session_state.index,
         key="index_input",
         help="Enter an index and jump to that index.",
         step=1
     )
-coding_task = humaneval_v_data[index_input-1]
 qid = coding_task["qid"]
-image = coding_task["image"]
 function_signature = coding_task["function_signature"]
 ground_truth = coding_task["ground_truth_solution"]
 test_script = coding_task["test_script"]
 upper_columns = st.columns([2, 7])
 with upper_columns[0]:
     st.markdown(f"### Question ID: {qid}")
-    st.image(image, use_column_width=True)
-    st.markdown("---")
 with upper_columns[1]:
     st.markdown(f"### Function Signature:")
     st.markdown(f"")
     st.markdown(f"""```python
 {function_signature}
 ```""")
-    st.markdown(f"### Test Script:")
     st.markdown(f"")
-    st.markdown(f"""```python
-{test_script}
 ```""")
     st.markdown(f"### Ground Truth Solution:")
     st.markdown(f"")
     st.markdown(f"""```python
 {ground_truth}
 ```""")

 import streamlit as st
 import datasets
+import json
+def load_json(file_path):
+    with open(file_path, "r") as f:
+        return json.load(f)
 humaneval_v_data = datasets.load_from_disk("humaneval_v_test_hf")
+idx_mapping = load_json("idx_mapping.json")
 st.set_page_config(layout="wide", page_title="HumanEval-V Viewer")
+st.markdown("> <i>This is a viewer for the **HumanEval-V** benchmark, which includes 253 coding tasks. Use the navigation buttons or enter an index to browse through the tasks. Please note that image loading may take a moment after switching to the next task. If you encounter any issues or have questions, feel free to start a discussion [here](https://huggingface.co/datasets/HumanEval-V/HumanEval-V-Benchmark/discussions)</i>.", unsafe_allow_html=True)
 st.markdown("---")
+max_index = 253
 # Initialize session state for index if not present
 if 'index' not in st.session_state:
     index_input = st.number_input(
         f"Go to index (1-{max_index}):",
         min_value=1,
+        max_value=max_index,
         value=st.session_state.index,
         key="index_input",
         help="Enter an index and jump to that index.",
         step=1
     )
+coding_task = humaneval_v_data[idx_mapping[str(index_input-1)]]
 qid = coding_task["qid"]
+diagram = coding_task["diagram"]
+diagram_description = coding_task["ground_truth_diagram_description"]
 function_signature = coding_task["function_signature"]
 ground_truth = coding_task["ground_truth_solution"]
 test_script = coding_task["test_script"]
+task_type = coding_task["task_type"]
+capability_aspects = coding_task["capability_aspects"]
 upper_columns = st.columns([2, 7])
 with upper_columns[0]:
     st.markdown(f"### Question ID: {qid}")
+    st.image(diagram, use_column_width=True)
+    st.markdown(f"")
+    st.markdown(f"### Task Type: {task_type}")
+    st.markdown(f"")
+    st.markdown(f"### Capability Aspects:")
+    st.markdown(f"")
+    st.json(capability_aspects)
 with upper_columns[1]:
     st.markdown(f"### Function Signature:")
     st.markdown(f"")
     st.markdown(f"""```python
 {function_signature}
 ```""")
+    st.markdown(f"### Ground Truth Diagram Description:")
     st.markdown(f"")
+    st.markdown(f"""```markdown
+{diagram_description}
 ```""")
     st.markdown(f"### Ground Truth Solution:")
     st.markdown(f"")
     st.markdown(f"""```python
 {ground_truth}
+```""")
+    st.markdown(f"### Test Script:")
+    st.markdown(f"")
+    st.markdown(f"""```python
+{test_script}
 ```""")

humaneval_v_test_hf/data-00000-of-00001.arrow CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:774e4cacfc259917fb5a5e8433e6cacbcac01063cb30fd3560170b3a0a9fa76e
-size 12842912

 version https://git-lfs.github.com/spec/v1
+oid sha256:4de3ea7bbf661db191acd796966a23ffa9e4cd7c57510709013266da0b715110
+size 32923016

humaneval_v_test_hf/dataset_info.json CHANGED Viewed

@@ -3,15 +3,15 @@
   "citation": "",
   "config_name": "default",
   "dataset_name": "human_eval-v-benchmark",
-  "dataset_size": 12841384,
   "description": "",
   "download_checksums": {
-    "hf://datasets/HumanEval-V/HumanEval-V-Benchmark@50af2be232641ca618f6aecce901ca5e5a83b20e/data/test-00000-of-00001.parquet": {
-      "num_bytes": 12571814,
       "checksum": null
     }
   },
-  "download_size": 12571814,
   "features": {
     "qid": {
       "dtype": "string",
@@ -21,7 +21,7 @@
       "dtype": "string",
       "_type": "Value"
     },
-    "image_description": {
       "dtype": "string",
       "_type": "Value"
     },
@@ -33,18 +33,73 @@
       "dtype": "string",
       "_type": "Value"
     },
-    "image": {
       "_type": "Image"
     }
   },
   "homepage": "",
   "license": "",
-  "size_in_bytes": 25413198,
   "splits": {
     "test": {
       "name": "test",
-      "num_bytes": 12841384,
-      "num_examples": 108,
       "dataset_name": "human_eval-v-benchmark"
     }
   },

   "citation": "",
   "config_name": "default",
   "dataset_name": "human_eval-v-benchmark",
+  "dataset_size": 32918942,
   "description": "",
   "download_checksums": {
+    "hf://datasets/HumanEval-V/HumanEval-V-Benchmark@8a3465c8cdcd25f65b3e13651e21cef9406e1799/data/test-00000-of-00001.parquet": {
+      "num_bytes": 32012630,
       "checksum": null
     }
   },
+  "download_size": 32012630,
   "features": {
     "qid": {
       "dtype": "string",
       "dtype": "string",
       "_type": "Value"
     },
+    "ground_truth_diagram_description": {
       "dtype": "string",
       "_type": "Value"
     },
       "dtype": "string",
       "_type": "Value"
     },
+    "diagram": {
       "_type": "Image"
+    },
+    "capability_aspects": {
+      "Common Sense": {
+        "feature": {
+          "dtype": "string",
+          "_type": "Value"
+        },
+        "_type": "Sequence"
+      },
+      "Data Structures": {
+        "feature": {
+          "dtype": "string",
+          "_type": "Value"
+        },
+        "_type": "Sequence"
+      },
+      "Dynamic Patterns": {
+        "feature": {
+          "dtype": "string",
+          "_type": "Value"
+        },
+        "_type": "Sequence"
+      },
+      "Geometric Objects": {
+        "feature": {
+          "dtype": "string",
+          "_type": "Value"
+        },
+        "_type": "Sequence"
+      },
+      "Mathematical Operations": {
+        "feature": {
+          "dtype": "string",
+          "_type": "Value"
+        },
+        "_type": "Sequence"
+      },
+      "Spatial Transformations": {
+        "feature": {
+          "dtype": "string",
+          "_type": "Value"
+        },
+        "_type": "Sequence"
+      },
+      "Topological Relations": {
+        "feature": {
+          "dtype": "string",
+          "_type": "Value"
+        },
+        "_type": "Sequence"
+      }
+    },
+    "task_type": {
+      "dtype": "string",
+      "_type": "Value"
     }
   },
   "homepage": "",
   "license": "",
+  "size_in_bytes": 64931572,
   "splits": {
     "test": {
       "name": "test",
+      "num_bytes": 32918942,
+      "num_examples": 253,
       "dataset_name": "human_eval-v-benchmark"
     }
   },

humaneval_v_test_hf/state.json CHANGED Viewed

@@ -4,7 +4,7 @@
       "filename": "data-00000-of-00001.arrow"
     }
   ],
-  "_fingerprint": "d8ffc8935ede93f4",
   "_format_columns": null,
   "_format_kwargs": {},
   "_format_type": null,

       "filename": "data-00000-of-00001.arrow"
     }
   ],
+  "_fingerprint": "4c03438fb70814f3",
   "_format_columns": null,
   "_format_kwargs": {},
   "_format_type": null,

idx_mapping.json ADDED Viewed

	@@ -0,0 +1,255 @@

+{
+    "0": 0,
+    "1": 1,
+    "2": 2,
+    "27": 3,
+    "28": 4,
+    "29": 5,
+    "250": 6,
+    "251": 7,
+    "252": 8,
+    "30": 9,
+    "31": 10,
+    "32": 11,
+    "33": 12,
+    "34": 13,
+    "35": 14,
+    "36": 15,
+    "37": 16,
+    "38": 17,
+    "39": 18,
+    "40": 19,
+    "41": 20,
+    "42": 21,
+    "43": 22,
+    "44": 23,
+    "45": 24,
+    "46": 25,
+    "47": 26,
+    "48": 27,
+    "49": 28,
+    "50": 29,
+    "51": 30,
+    "52": 31,
+    "3": 32,
+    "4": 33,
+    "5": 34,
+    "53": 35,
+    "54": 36,
+    "55": 37,
+    "56": 38,
+    "57": 39,
+    "58": 40,
+    "59": 41,
+    "60": 42,
+    "61": 43,
+    "62": 44,
+    "63": 45,
+    "64": 46,
+    "65": 47,
+    "66": 48,
+    "67": 49,
+    "68": 50,
+    "69": 51,
+    "70": 52,
+    "71": 53,
+    "72": 54,
+    "73": 55,
+    "74": 56,
+    "75": 57,
+    "76": 58,
+    "6": 59,
+    "7": 60,
+    "8": 61,
+    "77": 62,
+    "78": 63,
+    "79": 64,
+    "80": 65,
+    "81": 66,
+    "82": 67,
+    "83": 68,
+    "84": 69,
+    "85": 70,
+    "86": 71,
+    "87": 72,
+    "88": 73,
+    "89": 74,
+    "90": 75,
+    "91": 76,
+    "92": 77,
+    "93": 78,
+    "94": 79,
+    "95": 80,
+    "96": 81,
+    "97": 82,
+    "98": 83,
+    "99": 84,
+    "100": 85,
+    "101": 86,
+    "102": 87,
+    "103": 88,
+    "104": 89,
+    "9": 90,
+    "10": 91,
+    "11": 92,
+    "105": 93,
+    "106": 94,
+    "107": 95,
+    "108": 96,
+    "109": 97,
+    "110": 98,
+    "111": 99,
+    "112": 100,
+    "113": 101,
+    "114": 102,
+    "115": 103,
+    "116": 104,
+    "117": 105,
+    "118": 106,
+    "119": 107,
+    "120": 108,
+    "121": 109,
+    "122": 110,
+    "123": 111,
+    "124": 112,
+    "125": 113,
+    "126": 114,
+    "12": 115,
+    "13": 116,
+    "14": 117,
+    "127": 118,
+    "128": 119,
+    "129": 120,
+    "130": 121,
+    "131": 122,
+    "132": 123,
+    "133": 124,
+    "134": 125,
+    "135": 126,
+    "136": 127,
+    "137": 128,
+    "138": 129,
+    "139": 130,
+    "140": 131,
+    "141": 132,
+    "142": 133,
+    "143": 134,
+    "144": 135,
+    "145": 136,
+    "146": 137,
+    "15": 138,
+    "16": 139,
+    "17": 140,
+    "147": 141,
+    "148": 142,
+    "149": 143,
+    "150": 144,
+    "151": 145,
+    "152": 146,
+    "153": 147,
+    "154": 148,
+    "155": 149,
+    "156": 150,
+    "157": 151,
+    "158": 152,
+    "159": 153,
+    "160": 154,
+    "161": 155,
+    "162": 156,
+    "163": 157,
+    "164": 158,
+    "165": 159,
+    "166": 160,
+    "167": 161,
+    "168": 162,
+    "169": 163,
+    "170": 164,
+    "171": 165,
+    "172": 166,
+    "18": 167,
+    "19": 168,
+    "20": 169,
+    "173": 170,
+    "174": 171,
+    "175": 172,
+    "176": 173,
+    "177": 174,
+    "178": 175,
+    "179": 176,
+    "180": 177,
+    "181": 178,
+    "182": 179,
+    "183": 180,
+    "184": 181,
+    "185": 182,
+    "186": 183,
+    "187": 184,
+    "188": 185,
+    "189": 186,
+    "190": 187,
+    "191": 188,
+    "192": 189,
+    "193": 190,
+    "194": 191,
+    "195": 192,
+    "196": 193,
+    "197": 194,
+    "198": 195,
+    "199": 196,
+    "21": 197,
+    "22": 198,
+    "23": 199,
+    "200": 200,
+    "201": 201,
+    "202": 202,
+    "203": 203,
+    "204": 204,
+    "205": 205,
+    "206": 206,
+    "207": 207,
+    "208": 208,
+    "209": 209,
+    "210": 210,
+    "211": 211,
+    "212": 212,
+    "213": 213,
+    "214": 214,
+    "215": 215,
+    "216": 216,
+    "217": 217,
+    "218": 218,
+    "219": 219,
+    "220": 220,
+    "221": 221,
+    "222": 222,
+    "223": 223,
+    "24": 224,
+    "25": 225,
+    "26": 226,
+    "224": 227,
+    "225": 228,
+    "226": 229,
+    "227": 230,
+    "228": 231,
+    "229": 232,
+    "230": 233,
+    "231": 234,
+    "232": 235,
+    "233": 236,
+    "234": 237,
+    "235": 238,
+    "236": 239,
+    "237": 240,
+    "238": 241,
+    "239": 242,
+    "240": 243,
+    "241": 244,
+    "242": 245,
+    "243": 246,
+    "244": 247,
+    "245": 248,
+    "246": 249,
+    "247": 250,
+    "248": 251,
+    "249": 252
+}

sort_id.py ADDED Viewed

	@@ -0,0 +1,18 @@

+import datasets
+import json
+def get_float_qid(qid):
+    return float(qid[1:].replace('-', '.'))
+humaneval_v_data = datasets.load_from_disk("humaneval_v_test_hf")
+qid_to_idx_mapping = {x["qid"]: idx for idx, x in enumerate(humaneval_v_data)}
+reranked_data = sorted(humaneval_v_data, key=lambda x: get_float_qid(x["qid"]))
+id_idx_mappping = {x["qid"]: idx  for idx, x in enumerate(reranked_data)}
+old_to_new_id = {id_idx_mappping[x["qid"]]: qid_to_idx_mapping[x["qid"]] for x in humaneval_v_data}
+print(json.dumps(old_to_new_id, indent=4))