Replaced files with up-to-date version from BenchName
- .gitattributes +0 -0
- README.md +3 -14
- app.py +9 -1
- requirements.txt +0 -0
- src/__init__.py +0 -0
- src/content.py +14 -14
- src/evaluation/__init__.py +0 -0
- src/evaluation/base_task_metrics.py +0 -0
- src/evaluation/commit_message_generation/__init__.py +0 -0
- src/evaluation/commit_message_generation/cmg_metrics.py +0 -0
- src/evaluation/metrics.py +0 -0
- src/formatting.py +0 -0
- src/get_results_for_task.py +15 -11
- src/leaderboard_formatting.py +37 -15
- src/submission_uploader.py +2 -2
- src/tasks_content.py +42 -27
- src/utils.py +0 -0
.gitattributes
CHANGED
File without changes
README.md
CHANGED
@@ -1,8 +1,8 @@
 ---
-title: …
-emoji: …
+title: BenchName
+emoji: 🏟
 colorFrom: yellow
-colorTo: …
+colorTo: red
 sdk: gradio
 sdk_version: 4.36.1
 app_file: app.py
@@ -10,14 +10,3 @@ pinned: false
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
-
-## Citing
-```
-@article{bogomolov2024long,
-  title={Long Code Arena: a Set of Benchmarks for Long-Context Code Models},
-  author={Bogomolov, Egor and Eliseeva, Aleksandra and Galimzyanov, Timur and Glukhov, Evgeniy and Shapkin, Anton and Tigina, Maria and Golubev, Yaroslav and Kovrigin, Alexander and van Deursen, Arie and Izadi, Maliheh and Bryksin, Timofey},
-  journal={arXiv preprint arXiv:2406.11612},
-  year={2024}
-}
-```
-You can find the paper [here](https://arxiv.org/abs/2406.11612).
app.py
CHANGED
@@ -32,7 +32,7 @@ logging.basicConfig(
 )
 
 submission_uploader = SubmissionUploader(
-    dataset_id=os.environ["DATASET_ID"], private_dataset_id=os.…
+    dataset_id=os.environ["DATASET_ID"], private_dataset_id=os.getenv("PRIVATE_DATASET_ID")
 )
 
 
@@ -58,6 +58,14 @@ def get_leaderboard_for_completion_task(dataset_name: str | None):
 )
 
 
+def get_aggregated_leaderboard_for_task(task_pretty: str) -> gr.components.Dataframe:
+    return gr.components.Dataframe(
+        value=get_results_for_task(task_pretty),
+        interactive=False,
+        datatype=get_types_per_task(TASKS_PRETTY_REVERSE[task_pretty]),
+    )
+
+
 with gr.Blocks() as demo:
     # intro
     gr.HTML(INTRODUCTION_TITLE)
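The new `get_aggregated_leaderboard_for_task` helper builds a read-only table for the aggregated tab; the wiring into the `demo` layout is outside this hunk. Below is a minimal, self-contained sketch of how such a helper can be mounted inside `gr.Blocks`; the toy DataFrame, column set, and tab label are illustrative assumptions, not the Space's actual code.

```python
# Minimal sketch (assumption, not the Space's code): a read-only leaderboard tab.
import gradio as gr
import pandas as pd


def build_aggregated_leaderboard() -> gr.Dataframe:
    # Toy stand-in for the DataFrame returned by get_results_for_task(...).
    toy = pd.DataFrame(
        {
            "Model Name": ['<a target="_blank" href="https://example.com">Example Model</a>'],
            "Mean Rank": [1.0],
            "Mean Score": [0.78],
        }
    )
    # datatype lists one type per column, in order; "html" renders the link.
    return gr.Dataframe(value=toy, interactive=False, datatype=["html", "number", "number"])


with gr.Blocks() as demo:
    with gr.Tab("Aggregated Results"):
        build_aggregated_leaderboard()

if __name__ == "__main__":
    demo.launch()
```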
requirements.txt
CHANGED
File without changes
src/__init__.py
CHANGED
File without changes
src/content.py
CHANGED
@@ -3,22 +3,22 @@ from .formatting import styled_warning
 # ================================
 # = ABOUT =
 # ================================
-INTRODUCTION_TITLE = """<h1 align="center">🏟️ …
+INTRODUCTION_TITLE = """<h1 align="center">🏟️ BenchName </h1>"""
 
-INTRODUCTION_TEXT = """🏟️ **…
+INTRODUCTION_TEXT = """🏟️ **BenchName** is a suite of benchmarks for code-related tasks with large contexts, up to a whole code repository.
 It currently spans six different tasks and contains six datasets:
 
-* 🤗 [Library-based code generation](https://huggingface.co/datasets/…
-* 🤗 [CI builds repair](https://huggingface.co/datasets/…
-* 🤗 [Project-level code completion](https://huggingface.co/datasets/…
-* 🤗 [Commit message generation](https://huggingface.co/datasets/…
-* 🤗 [Bug localization](https://huggingface.co/datasets/…
-* 🤗 [Module summarization](https://huggingface.co/datasets/…
+* 🤗 [Library-based code generation](https://huggingface.co/datasets/icmlbenchname/library-based-code-generation)
+* 🤗 [CI builds repair](https://huggingface.co/datasets/icmlbenchname/ci-builds-repair)
+* 🤗 [Project-level code completion](https://huggingface.co/datasets/icmlbenchname/project-level-code-completion)
+* 🤗 [Commit message generation](https://huggingface.co/datasets/icmlbenchname/commit-message-generation)
+* 🤗 [Bug localization](https://huggingface.co/datasets/icmlbenchname/bug-localization)
+* 🤗 [Module summarization](https://huggingface.co/datasets/icmlbenchname/module-summarization)
 
-We are excited to invite you to participate in solving our benchmarks! To submit your results, please send the following materials to our 📩 email (…
+We are excited to invite you to participate in solving our benchmarks! To submit your results, please send the following materials to our 📩 email (icmlbenchname@gmail.com):
 
 * **Results**: Include the summary of your benchmark outcomes.
-* **Reproduction Package**: To ensure the integrity and reproducibility of your results, please include the code for context collection (if any), generation of predictions, and evaluating. You can follow […
+* **Reproduction Package**: To ensure the integrity and reproducibility of your results, please include the code for context collection (if any), generation of predictions, and evaluating. You can follow [baselines](https://anonymous.4open.science/r/icml-benchname-2025/README.md) as a reference.
 * **Metadata**: Model information, organization name, licence of your model, context size, and other information you find relevant.
 
 We look forward to reviewing your innovative solutions!
@@ -30,23 +30,23 @@ We look forward to reviewing your innovative solutions!
 # ================================
 LEADERBOARD_TITLE = '<h2 align="center">🏆 Leaderboard</h2>'
 
-LEADERBOARD_TEXT = """The raw results from the leaderboard are available in 🤗 […
+LEADERBOARD_TEXT = """The raw results from the leaderboard are available in 🤗 [icmlbenchname/results](https://huggingface.co/datasets/icmlbenchname/results)."""
 
 # ================================
 # = SUBMISSION =
 # ================================
 SUBMISSION_TITLE = '<h2 align="center">📩 Make A Submission</h2>'
 
-SUBMISSION_TEXT_INTRO = """Use the form below to submit new results to 🏟️ …
+SUBMISSION_TEXT_INTRO = """Use the form below to submit new results to 🏟️ BenchName. If any problems arise, don't hesitate to contact us by email `TODO` or open a discussion"""
 
 SUBMISSION_TEXT_TASK = """1. Select a task you want to submit results for."""
 
 SUBMISSION_TEXT_METADATA = """2. Fill in some metadata about your submission."""
 
 SUBMISSION_TEXT_FILES = """3. Attach one or more files with your model's predictions.
-* If several files are attached, they will be treated as separate runs of the submitted model (e.g., with different seeds), and the metrics will be averaged across runs. For baselines provided by 🏟️ …
+* If several files are attached, they will be treated as separate runs of the submitted model (e.g., with different seeds), and the metrics will be averaged across runs. For baselines provided by 🏟️ BenchName Team, the results are averaged across 3 runs.
 """
 
-SUBMISSION_TEXT_SUBMIT = """All set! A new PR to 🤗 […
+SUBMISSION_TEXT_SUBMIT = """All set! A new PR to 🤗 [icmlbenchname/results](https://huggingface.co/datasets/icmlbenchname/results) should be opened when you press "Submit" button. 🏟️ BenchName Team will review it shortly, and the results will appear in the leaderboard.
 
 ⏳ **Note:** It might take some time (up to 40 minutes) for PR to get created, since it involves computing metrics for your submission."""
src/evaluation/__init__.py
CHANGED
File without changes
src/evaluation/base_task_metrics.py
CHANGED
File without changes
src/evaluation/commit_message_generation/__init__.py
CHANGED
File without changes
src/evaluation/commit_message_generation/cmg_metrics.py
CHANGED
File without changes
src/evaluation/metrics.py
CHANGED
File without changes
src/formatting.py
CHANGED
File without changes
src/get_results_for_task.py
CHANGED
@@ -37,7 +37,7 @@ def _get_results_stub() -> pd.DataFrame:
         "ChrF": "X",
         "BERTScore": "X",
         "BERTScore (Normalized)": "X",
-        "Submitted By": "…
+        "Submitted By": "BenchName Team",
         "Resources": "",
     },
     {
@@ -49,7 +49,7 @@ def _get_results_stub() -> pd.DataFrame:
         "ChrF": "X",
         "BERTScore": "X",
         "BERTScore (Normalized)": "X",
-        "Submitted By": "…
+        "Submitted By": "BenchName Team",
         "Resources": "",
     },
 ]
@@ -77,27 +77,31 @@ def _get_results_dataset(task_id: str) -> pd.DataFrame:
         os.environ["DATASET_ID"], task_id, split="test", download_mode="force_redownload"
     ).to_pandas()
     results_df = results_df.rename(columns=COLUMNS_PRETTY, errors="ignore")
-    results_df["Context Size"] = results_df["Context Size"].map(lambda x: f"{int(x) // 1000}k" if int(x) >= 1000 else x)
-    …
 
+    if task_id != "aggregated":
+        results_df["Context Size"] = results_df["Context Size"].map(lambda x: f"{int(x) // 1000}k" if int(x) >= 1000 else x)
+        results_df["Resources"] = [_process_urls(urls) for urls in results_df["Resources"]]
+        results_df = results_df.sort_values(by=SORT_COLUMN_PER_TASK[task_id], ascending=False)
 
     for metric_column in METRICS_PER_TASK[task_id]:
         if "BERTScore" in metric_column:
             results_df[metric_column] = results_df[metric_column].map(lambda x: f"{x:.5f}")
+        elif "Mean Rank" in metric_column:
+            continue
         else:
             results_df[metric_column] = results_df[metric_column].map(lambda x: f"{x:.2f}")
 
-    …
+    if task_id == 'aggregated':
+        results_df["Model Name"] = results_df["Model"]
+    else:
+        results_df["Model Name"] = [
+            model_hyperlink(link=link, model_name=model_name) if link else model_name
+            for link, model_name in zip(results_df["model_url"], results_df["Model Name"])
+        ]
     if task_id == 'project_code_completion':
         results_df["Dataset Name"] = [_extract_dataset_name(urls) for urls in results_df["Dataset"]]
         results_df["Dataset"] = [_process_urls(urls) for urls in results_df["Dataset"]]
-        results_df["Resources"] = [_process_urls(urls) for urls in results_df["Resources"]]
     results_df = results_df[get_columns_per_task(task_id)]
-    if task_id == 'ci_builds_repair':
-        results_df = results_df.rename(columns={"Context Size": "Context"})
     return results_df
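The updated `_get_results_dataset` calls `model_hyperlink(...)`, which is defined elsewhere and not shown in this diff. A typical leaderboard helper of that name (a hedged sketch, not this repository's verbatim implementation) wraps the model name in an HTML anchor so that columns typed as `"html"` render a clickable link:

```python
# Hypothetical sketch of a model_hyperlink helper (not taken from this commit).
def model_hyperlink(link: str, model_name: str) -> str:
    # Wrap the model name in an anchor tag; the leaderboard column is typed "html".
    return f'<a target="_blank" href="{link}" style="text-decoration: underline">{model_name}</a>'


if __name__ == "__main__":
    print(model_hyperlink("https://huggingface.co/models", "Example Model"))
```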
src/leaderboard_formatting.py
CHANGED
@@ -20,16 +20,26 @@ COLUMNS_PRETTY = {
     "EM commited": "EM committed",
     "EM non_informative": "EM non-informative",
     "EM random": "EM random",
-    "EM all": "EM all",
+    "EM all": "EM all",
+    "context_composer": "Context Composer",
+    "context_length": "Context Size",
     "dataset": "Dataset",
     "CompScore": "CompScore",
     "context": "Context",
     "task_type": "Task type",
-    "date": "Date, mm/yy",
 }
 
 # Add your metrics
 METRICS_PER_TASK = {
+    "aggregated": [
+        "Mean Rank",
+        "Mean Score",
+        "Library-based CG",
+        "CI builds repair",
+        "CMG",
+        "Bug localization",
+        "Module summarization",
+    ],
     "commit_message_generation": [
         "BLEU",
         "ChrF",
@@ -49,18 +59,28 @@ METRICS_PER_TASK = {
         "EM all",
     ],
     "bug_localization": [
-        …
+        "P",
+        "R",
+        "FPR",
+        "F1-score",
+        "All_correct",
+        "All_incorrect",
+        "Output_count",
     ],
     "module_summarization": [
         "CompScore",
     ],
     "library_based_code_generation": [
-        …
-        "API Recall",
+        "API Recall\nno context",
+        "API Recall\n20 APIs",
+        "API Recall\n200 APIs",
+        "API Recall\n2,000 APIs",
+        "API Recall\nall APIs",
+        "ChrF\nno context",
+        "ChrF\n20 APIs",
+        "ChrF\n200 APIs",
+        "ChrF\n2,000 APIs",
+        "ChrF\nall APIs",
     ],
     "ci_builds_repair": [
         "Pass@1",
@@ -73,15 +93,17 @@ SORT_COLUMN_PER_TASK = {
     "project_code_completion": "EM inproject",
     "bug_localization": "Model Name",
     "module_summarization": "CompScore",
-    "library_based_code_generation": "API Recall",
+    "library_based_code_generation": "API Recall\nall APIs",
     "ci_builds_repair": "Pass@1",
 }
 
 
 def get_columns_per_task(task_id: str) -> List[str]:
     metrics_per_task = METRICS_PER_TASK[task_id]
+    if task_id == 'aggregated':
+        return ["Model Name"] + metrics_per_task
     if task_id == 'project_code_completion':
-        return ["Model Name", "Context Size", "Dataset Name", "Dataset"] + metrics_per_task + ["…
+        return ["Model Name", "Context Composer", "Context Size", "Dataset Name", "Dataset"] + metrics_per_task + ["Submitted By", "Resources"]
     if task_id == 'bug_localization':
         return ["Model Name", "Availability", "Context Size"] + metrics_per_task + ["Submitted By", "Resources"]
@@ -89,10 +111,10 @@ def get_columns_per_task(task_id: str) -> List[str]:
         return ["Model Name", "Context Size"] + metrics_per_task + ["Submitted By", "Resources"]
 
     if task_id == 'library_based_code_generation':
-        return ["Model Name"…
+        return ["Model Name"] + metrics_per_task + ["Availability", "Submitted By", "Resources"]
 
     if task_id == 'ci_builds_repair':
-        return ["Model Name", "Context Size", "Task type"] + metrics_per_task + ["…
+        return ["Model Name", "Context Size", "Task type"] + metrics_per_task + ["Availability", "Submitted By", "Resources"]
 
     return ["Model Name", "Context Size", "Availability"] + metrics_per_task + ["Submitted By", "Resources"]
@@ -100,9 +122,9 @@ def get_types_per_task(task_id: str) -> List[str]:
 def get_types_per_task(task_id: str) -> List[str]:
     metrics_per_task = METRICS_PER_TASK.get(task_id, (0, 0, 0, 0, 0))
     if task_id == 'project_code_completion':
-        return ["html", "markdown", "markdown", "html"] + ["number" for _ in metrics_per_task] + ["markdown", "…
+        return ["html", "markdown", "markdown", "markdown", "html"] + ["number" for _ in metrics_per_task] + ["markdown", "html"]
     if task_id == 'bug_localization':
         return ["html", "markdown", "markdown"] + ["number" for _ in metrics_per_task] + ["markdown", "html"]
     if task_id == 'ci_builds_repair':
-        return ["html", "markdown", "markdown"] + ["number" for _ in metrics_per_task] + ["markdown", "markdown", "…
+        return ["html", "markdown", "markdown"] + ["number" for _ in metrics_per_task] + ["markdown", "markdown", "html"]
     return ["html", "markdown", "markdown"] + ["number" for _ in metrics_per_task] + ["markdown", "html"]
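`get_columns_per_task` and `get_types_per_task` have to stay in sync: the Gradio `Dataframe` expects one datatype per column, in the same order, so every column added to a task (such as the new `Context Composer` column for code completion) needs a matching entry in the types list. A small self-contained check of that invariant, using simplified placeholder lists rather than the module's real data:

```python
# Illustration only: leaderboard columns and Gradio datatypes must align one-to-one.
metrics = ["BLEU", "ChrF", "BERTScore"]
columns = ["Model Name", "Context Size", "Availability"] + metrics + ["Submitted By", "Resources"]
types = ["html", "markdown", "markdown"] + ["number" for _ in metrics] + ["markdown", "html"]

assert len(columns) == len(types), "each leaderboard column needs exactly one datatype"
for column, dtype in zip(columns, types):
    print(f"{column!r} -> {dtype}")
```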
src/submission_uploader.py
CHANGED
@@ -31,8 +31,8 @@ class SubmissionUploader:
     """
 
     def __init__(self, dataset_id: str, private_dataset_id: str):
-        self._api = HfApi(token=os.…
-        self._fs = HfFileSystem(token=os.…
+        self._api = HfApi(token=os.getenv("HF_TOKEN"))
+        self._fs = HfFileSystem(token=os.getenv("HF_TOKEN"))
         self._results_dataset_id = dataset_id
         self._requests_dataset_id = private_dataset_id
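The old token lookups are truncated after `os.` in this view; the updated code reads the token with `os.getenv("HF_TOKEN")`, which returns `None` when the variable is unset instead of failing the way a bare `os.environ["HF_TOKEN"]` lookup would, so the uploader can still be constructed without the token defined. A standalone illustration of that difference:

```python
# Standalone illustration of os.getenv vs. os.environ for a missing variable.
import os

os.environ.pop("HF_TOKEN", None)  # simulate the variable being unset

print(os.getenv("HF_TOKEN"))  # prints: None
try:
    os.environ["HF_TOKEN"]
except KeyError:
    print("os.environ['HF_TOKEN'] raises KeyError when the variable is missing")
```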
src/tasks_content.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import Optional
 
 TASKS_PRETTY = {
+    "aggregated": "Aggregated Results",
     "library_based_code_generation": "Library-based code generation",
     "ci_builds_repair": "CI builds repair",
     "project_code_completion": "Project-level code completion",
@@ -11,24 +12,40 @@ TASKS_PRETTY = {
 TASKS_PRETTY_REVERSE = {value: key for key, value in TASKS_PRETTY.items()}
 
 TASKS_DESCRIPTIONS = {
+    "aggregated": """# Aggregated Results\n
+
+Here, we present the aggregated results across all the tasks in BenchName (except for Project-level code completion, where its specifics required a different selection of models). To get more details about each task, visit the corresponding tab.
+
+To obtain aggregated results, we first select only one metric from metric suite for each task:
+* Library-based code generation: `API Recall`
+* CI builds repair: `Pass@1`
+* Commit message generation: `chrF`
+* Bug localization: `F1-score`
+* Module summarization: `CompScore`
+
+Then, to ensure a fair comparison across tasks with different score ranges, we normalize all scores to a 0-1 scale, where zero corresponds to the worst-performing model, and 1 to the best one. Note that for mean rank, rather than using strict rankings, we implemented a ranking system with a 10% margin to account for models with similar performance.
+
+We report mean rank (with std) and mean score across the tasks from BenchName, and the scores for each task in the table below.
+""",
     "library_based_code_generation": """# Library-based code generation\n
 
-Our Library-based code generation benchmark 🤗 […
+Our Library-based code generation benchmark 🤗 [icmlbenchname/library-based-code-generation](https://huggingface.co/datasets/icmlbenchname/library-based-code-generation) includes 150 manually curated instructions asking a model to generate Python code using a particular library. Samples come from 62 Python repositories. All the samples in the dataset are based on reference example programs written by authors of the respective libraries.
 
 For evaluation, we use two metrics:
 * `ChrF`: textual similarity between the generated code and the reference program.
 * `API Recall`: share of library-specific API calls used in the reference program that appear in the generated code,
 
-…
+As a context, we pass a prefix of the list of APIs available in the target library.
+We select the prefix based on their BM-25 similarity with the provided instruction.
+
+For further details on the dataset and the baselines from the BenchName team, refer to the `library_based_code_generation` directory in [our baselines repository](https://anonymous.4open.science/r/icml-benchname-2025/).
 
 **Terms of use**. As this dataset is collected from GitHub, researchers may use it for research purposes only if any publications resulting from that research are open access (see [GitHub Acceptable Use Policies](https://docs.github.com/en/site-policy/acceptable-use-policies/github-acceptable-use-policies#7-information-usage-restrictions)).
 """,
 
     "ci_builds_repair": """# CI builds repair\n
 
-Our CI builds repair benchmark 🤗 […
+Our CI builds repair benchmark 🤗 [icmlbenchname/ci-builds-repair](https://huggingface.co/datasets/icmlbenchname/ci-builds-repair)
 includes 77 manually curated and assessed data points coming from 32 Python repositories, which are used to make a model fix a failed build.
 
 The benchmark clones the repo to the local directory, the model fixes the issue according to logs and the local repo state,
@@ -40,16 +57,14 @@ TASKS_DESCRIPTIONS = {
 * `oracle: files` — ground truth diffs are used to select files that should be corrected to fix the issue;
 * `oracle: files, lines` — ground truth diffs are used to select files and code blocks that should be corrected to fix the issue;
 
-For further details on the dataset and the baselines from the …
-If you have any questions or requests concerning this dataset, please contact us at [email protected].
+For further details on the dataset and the baselines from the BenchName team, refer to the `ci-builds-repair` directory in [our baselines repository](https://anonymous.4open.science/r/icml-benchname-2025/).
 
 **Terms of use**. As this dataset is collected from GitHub, researchers may use it for research purposes only if any publications resulting from that research are open access (see [GitHub Acceptable Use Policies](https://docs.github.com/en/site-policy/acceptable-use-policies/github-acceptable-use-policies#7-information-usage-restrictions)).
 """,
 
     "project_code_completion": """# Project-level code completion\n
 
-Our Project-level code completion benchmark 🤗 […
+Our Project-level code completion benchmark 🤗 [icmlbenchname/project-level-code-completion](https://huggingface.co/datasets/icmlbenchname/project-level-code-completion) includes four sets of samples:
 * `small-context`: 144 data points,
 * `medium-context`: 224 data points,
 * `large-context`: 270 data points,
@@ -67,16 +82,14 @@ TASKS_DESCRIPTIONS = {
 * *non-informative* — short/long lines, import/print lines, or comment lines;
 * *random* — lines that don't fit any of the previous categories.
 
-For further details on the dataset and the baselines from the …
-If you have any questions or requests concerning this dataset, please contact us at [email protected].
+For further details on the dataset and the baselines from the BenchName team, refer to the `project_level_code_completion` directory in [our baselines repository](https://anonymous.4open.science/r/icml-benchname-2025/).
 
 **Terms of use**. As this dataset is collected from GitHub, researchers may use it for research purposes only if any publications resulting from that research are open access (see [GitHub Acceptable Use Policies](https://docs.github.com/en/site-policy/acceptable-use-policies/github-acceptable-use-policies#7-information-usage-restrictions)).
 """,
 
     "commit_message_generation": """# Commit message generation\n
 
-Our Commit message generation benchmark 🤗 […
+Our Commit message generation benchmark 🤗 [icmlbenchname/commit-message-generation](https://huggingface.co/datasets/icmlbenchname/commit-message-generation) includes 163 manually curated commits with large diffs from 34 Python projects, which the model needs to generate commit messages for.
 
 We use the following metrics for evaluation:
 * [BLEU](https://huggingface.co/spaces/evaluate-metric/sacrebleu)
@@ -84,39 +97,41 @@ TASKS_DESCRIPTIONS = {
 * [ChrF](https://huggingface.co/spaces/evaluate-metric/chrf)
 * [BERTScore](https://huggingface.co/spaces/evaluate-metric/bertscore)
 
-For further details on the dataset and the baselines from the …
+For further details on the dataset and the baselines from the BenchName team, refer to the `commit_message_generation` directory in [our baselines repository](https://anonymous.4open.science/r/icml-benchname-2025/).
 
 **Note.** The leaderboard is sorted by the `ROUGE-1` metric by default.
 
-If you have any questions or requests concerning this dataset, please contact us at [email protected].
-
 **Terms of use**. As this dataset is collected from GitHub, researchers may use it for research purposes only if any publications resulting from that research are open access (see [GitHub Acceptable Use Policies](https://docs.github.com/en/site-policy/acceptable-use-policies/github-acceptable-use-policies#7-information-usage-restrictions)).
 
 """,
 
     "bug_localization": """# Bug localization\n
 
-Our Bug localization benchmark 🤗 […
+Our Bug localization benchmark 🤗 [icmlbenchname/bug-localization](https://huggingface.co/datasets/icmlbenchname/bug-localization) includes 150 manually verified bug issue descriptions with information about pull request that fix them for Python, Java, and Kotlin projects.
 The model needs to identify the files within the repository that need to be modified to address the reported bug.
-…
+
+To evaluate baseline performance, we use the following classification metrics:
+* **P** - precision to estimate how many of the predicted buggy files were correctly identified
+* **R** - recall to indicate how many of the actual buggy files were correctly found
+* **FPR** - false positive rate to indicate how many non-buggy files were incorrectly predicted as buggy
+* **F1-score** - score to provide a balance between precision and recall
+* **All correct** - percentage of cases where all buggy files were correctly identified
+* **All incorrect** - percentage of cases where all buggy files were incorrectly identified
+* **# Output** - average number of buggy files detected, to further assess performance, particularly concerning high **FPR**.
 
-For further details on the dataset and the baselines from the …
-If you have any questions or requests concerning this dataset, please contact us at [email protected].
+For further details on the dataset and the baselines from the BenchName team, refer to the `bug_localization` directory in [our baselines repository](https://anonymous.4open.science/r/icml-benchname-2025/).
 
 **Terms of use**. As this dataset is collected from GitHub, researchers may use it for research purposes only if any publications resulting from that research are open access (see [GitHub Acceptable Use Policies](https://docs.github.com/en/site-policy/acceptable-use-policies/github-acceptable-use-policies#7-information-usage-restrictions)).
 """,
 
     "module_summarization": """# Module summarization\n
-Our Module summarization benchmark 🤗 […
+Our Module summarization benchmark 🤗 [icmlbenchname/module-summarization](https://huggingface.co/datasets/icmlbenchname/module-summarization) includes 216 manually curated text files describing different documentation of open-source permissive Python projects.
 The model is required to generate such description, given the relevant context code and the intent behind the documentation.
 
 We use a novel metric for evaluation:
-* `CompScore`: the new metric based on LLM as an assessor proposed for this task. Our approach involves feeding the LLM with relevant code and two versions of documentation: the ground truth and the model-generated text. More details on how it is calculated can be found in [our baselines repository](https://…
+* `CompScore`: the new metric based on LLM as an assessor proposed for this task. Our approach involves feeding the LLM with relevant code and two versions of documentation: the ground truth and the model-generated text. More details on how it is calculated can be found in [our baselines repository](https://anonymous.4open.science/r/icml-benchname-2025/module_summarization/README.md).
 
-For further details on the dataset and the baselines from the …
-If you have any questions or requests concerning this dataset, please contact us at [email protected].
+For further details on the dataset and the baselines from the BenchName team, refer to the `module_summarization` directory in [our baselines repository](https://anonymous.4open.science/r/icml-benchname-2025/).
 
 **Terms of use**. As this dataset is collected from GitHub, researchers may use it for research purposes only if any publications resulting from that research are open access (see [GitHub Acceptable Use Policies](https://docs.github.com/en/site-policy/acceptable-use-policies/github-acceptable-use-policies#7-information-usage-restrictions)).
 """,
@@ -130,6 +145,6 @@ def get_submission_text_files_for_task(task_pretty: Optional[str]) -> str:
     task_id = TASKS_PRETTY_REVERSE[task_pretty]
 
     if task_id == "commit_message_generation":
-        return f"""**{task_pretty} Instructions:**\n\n* Please, attach files in [JSONLines format](https://jsonlines.org/). For an example, check the predictions provided by …
+        return f"""**{task_pretty} Instructions:**\n\n* Please, attach files in [JSONLines format](https://jsonlines.org/). For an example, check the predictions provided by BenchName Team in 🤗 [icmlbenchname/results](https://huggingface.co/datasets/icmlbenchname/results/tree/main/commit_message_generation/predictions). Make sure to include `"prediction"` and `"reference"` fields for each example, the rest are optional."""
 
     return f"**{task_pretty} Instructions:**\n\n* 🚧 There are no instructions for the current task yet."
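The "Aggregated Results" description added above explains the aggregation in prose: pick one metric per task, min-max normalize each task's scores to the 0-1 range, then report the mean score and a mean rank with a 10% margin for near-ties. A toy, self-contained sketch of that computation follows; the numbers are invented, and the margin is approximated by bucketing normalized scores to one decimal before ranking, which is an assumption rather than the team's exact procedure.

```python
# Toy sketch of the aggregation described in the "Aggregated Results" tab text.
import pandas as pd

# One selected metric per task; the values below are invented for illustration.
raw = pd.DataFrame(
    {
        "CI builds repair": [0.20, 0.35, 0.10],      # Pass@1
        "CMG": [30.0, 42.0, 25.0],                   # chrF
        "Module summarization": [1.10, 1.45, 0.90],  # CompScore
    },
    index=["model-a", "model-b", "model-c"],
)

# Min-max normalization per task: the worst model maps to 0, the best to 1.
normalized = (raw - raw.min()) / (raw.max() - raw.min())

# Mean score across tasks, plus a rough stand-in for the 10%-margin ranking:
# normalized scores are bucketed to one decimal so near-ties share a rank.
summary = pd.DataFrame(
    {
        "Mean Score": normalized.mean(axis=1),
        "Mean Rank": normalized.round(1).rank(ascending=False).mean(axis=1),
    }
).sort_values("Mean Rank")

print(summary)
```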
src/utils.py
CHANGED
File without changes