galtimur committed
Commit 49c07f7 · 1 Parent(s): f1b1d8d

Replaced files with up-to-date versions from BenchName
.gitattributes CHANGED
File without changes
README.md CHANGED
@@ -1,8 +1,8 @@
 ---
-title: Long Code Arena
-emoji: 🏟️
+title: BenchName
+emoji: 🚀
 colorFrom: yellow
-colorTo: purple
+colorTo: red
 sdk: gradio
 sdk_version: 4.36.1
 app_file: app.py
@@ -10,14 +10,3 @@ pinned: false
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
-
-## Citing
-```
-@article{bogomolov2024long,
-  title={Long Code Arena: a Set of Benchmarks for Long-Context Code Models},
-  author={Bogomolov, Egor and Eliseeva, Aleksandra and Galimzyanov, Timur and Glukhov, Evgeniy and Shapkin, Anton and Tigina, Maria and Golubev, Yaroslav and Kovrigin, Alexander and van Deursen, Arie and Izadi, Maliheh and Bryksin, Timofey},
-  journal={arXiv preprint arXiv:2406.11612},
-  year={2024}
-}
-```
-You can find the paper [here](https://arxiv.org/abs/2406.11612).
app.py CHANGED
@@ -32,7 +32,7 @@ logging.basicConfig(
 )
 
 submission_uploader = SubmissionUploader(
-    dataset_id=os.environ["DATASET_ID"], private_dataset_id=os.environ["PRIVATE_DATASET_ID"]
+    dataset_id=os.environ["DATASET_ID"], private_dataset_id=os.getenv("PRIVATE_DATASET_ID")
 )
 
 
@@ -58,6 +58,14 @@ def get_leaderboard_for_completion_task(dataset_name: str | None):
 )
 
 
+def get_aggregated_leaderboard_for_task(task_pretty: str) -> gr.components.Dataframe:
+    return gr.components.Dataframe(
+        value=get_results_for_task(task_pretty),
+        interactive=False,
+        datatype=get_types_per_task(TASKS_PRETTY_REVERSE[task_pretty]),
+    )
+
+
 with gr.Blocks() as demo:
     # intro
     gr.HTML(INTRODUCTION_TITLE)
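
For orientation, a minimal sketch of how the helper introduced above could be wired into the Gradio UI. The import paths and the tab layout are assumptions based on the repository layout, not part of this commit:

```python
import gradio as gr

# Assumed import paths; the functions and constants themselves exist in this repository.
from src.get_results_for_task import get_results_for_task
from src.leaderboard_formatting import get_types_per_task
from src.tasks_content import TASKS_PRETTY, TASKS_DESCRIPTIONS, TASKS_PRETTY_REVERSE


def get_aggregated_leaderboard_for_task(task_pretty: str) -> gr.components.Dataframe:
    # Same body as the helper introduced in the hunk above.
    return gr.components.Dataframe(
        value=get_results_for_task(task_pretty),
        interactive=False,
        datatype=get_types_per_task(TASKS_PRETTY_REVERSE[task_pretty]),
    )


with gr.Blocks() as demo:
    # Hypothetical tab; the label reuses the "Aggregated Results" entry
    # that this commit adds to TASKS_PRETTY in src/tasks_content.py.
    with gr.Tab(TASKS_PRETTY["aggregated"]):
        gr.Markdown(TASKS_DESCRIPTIONS["aggregated"])
        get_aggregated_leaderboard_for_task(TASKS_PRETTY["aggregated"])

demo.launch()
```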
requirements.txt CHANGED
File without changes
src/__init__.py CHANGED
File without changes
src/content.py CHANGED
@@ -3,22 +3,22 @@ from .formatting import styled_warning
 # ================================
 # =            ABOUT             =
 # ================================
-INTRODUCTION_TITLE = """<h1 align="center">🏟️ Long Code Arena</h1>"""
+INTRODUCTION_TITLE = """<h1 align="center">🏟️ BenchName </h1>"""
 
-INTRODUCTION_TEXT = """🏟️ **Long Code Arena** is a suite of benchmarks for code-related tasks with large contexts, up to a whole code repository.
+INTRODUCTION_TEXT = """🏟️ **BenchName** is a suite of benchmarks for code-related tasks with large contexts, up to a whole code repository.
 It currently spans six different tasks and contains six datasets:
 
-* 🤗 [Library-based code generation](https://huggingface.co/datasets/JetBrains-Research/lca-library-based-code-generation)
-* 🤗 [CI builds repair](https://huggingface.co/datasets/JetBrains-Research/lca-ci-builds-repair)
-* 🤗 [Project-level code completion](https://huggingface.co/datasets/JetBrains-Research/lca-project-level-code-completion)
-* 🤗 [Commit message generation](https://huggingface.co/datasets/JetBrains-Research/lca-commit-message-generation)
-* 🤗 [Bug localization](https://huggingface.co/datasets/JetBrains-Research/lca-bug-localization)
-* 🤗 [Module summarization](https://huggingface.co/datasets/JetBrains-Research/lca-module-summarization)
+* 🤗 [Library-based code generation](https://huggingface.co/datasets/icmlbenchname/library-based-code-generation)
+* 🤗 [CI builds repair](https://huggingface.co/datasets/icmlbenchname/ci-builds-repair)
+* 🤗 [Project-level code completion](https://huggingface.co/datasets/icmlbenchname/project-level-code-completion)
+* 🤗 [Commit message generation](https://huggingface.co/datasets/icmlbenchname/commit-message-generation)
+* 🤗 [Bug localization](https://huggingface.co/datasets/icmlbenchname/bug-localization)
+* 🤗 [Module summarization](https://huggingface.co/datasets/icmlbenchname/module-summarization)
 
-We are excited to invite you to participate in solving our benchmarks! To submit your results, please send the following materials to our 📩 email (lca@jetbrains.com):
+We are excited to invite you to participate in solving our benchmarks! To submit your results, please send the following materials to our 📩 email (icmlbenchname@gmail.com):
 
 * **Results**: Include the summary of your benchmark outcomes.
-* **Reproduction Package**: To ensure the integrity and reproducibility of your results, please include the code for context collection (if any), generation of predictions, and evaluating. You can follow [our baselines](https://github.com/JetBrains-Research/lca-baselines) as a reference.
+* **Reproduction Package**: To ensure the integrity and reproducibility of your results, please include the code for context collection (if any), generation of predictions, and evaluating. You can follow [baselines](https://anonymous.4open.science/r/icml-benchname-2025/README.md) as a reference.
 * **Metadata**: Model information, organization name, licence of your model, context size, and other information you find relevant.
 
 We look forward to reviewing your innovative solutions!
@@ -30,23 +30,23 @@ We look forward to reviewing your innovative solutions!
 # ================================
 LEADERBOARD_TITLE = '<h2 align="center">🏅Leaderboard</h2>'
 
-LEADERBOARD_TEXT = """The raw results from the leaderboard are available in 🤗 [JetBrains-Research/lca-results](https://huggingface.co/datasets/JetBrains-Research/lca-results)."""
+LEADERBOARD_TEXT = """The raw results from the leaderboard are available in 🤗 [icmlbenchname/results](https://huggingface.co/datasets/icmlbenchname/results)."""
 
 # ================================
 # =          SUBMISSION          =
 # ================================
 SUBMISSION_TITLE = '<h2 align="center">📩 Make A Submission</h2>'
 
-SUBMISSION_TEXT_INTRO = """Use the form below to submit new results to 🏟️ Long Code Arena. If any problems arise, don't hesitate to contact us by email `TODO` or open a discussion 💛"""
+SUBMISSION_TEXT_INTRO = """Use the form below to submit new results to 🏟️ BenchName. If any problems arise, don't hesitate to contact us by email `TODO` or open a discussion 💛"""
 
 SUBMISSION_TEXT_TASK = """1. Select a task you want to submit results for."""
 
 SUBMISSION_TEXT_METADATA = """2. Fill in some metadata about your submission."""
 
 SUBMISSION_TEXT_FILES = """3. Attach one or more files with your model's predictions.
-* If several files are attached, they will be treated as separate runs of the submitted model (e.g., with different seeds), and the metrics will be averaged across runs. For baselines provided by 🏟️ Long Code Arena Team, the results are averaged across 3 runs.
+* If several files are attached, they will be treated as separate runs of the submitted model (e.g., with different seeds), and the metrics will be averaged across runs. For baselines provided by 🏟️ BenchName Team, the results are averaged across 3 runs.
 """
 
-SUBMISSION_TEXT_SUBMIT = """All set! A new PR to 🤗 [JetBrains-Research/lca-results](https://huggingface.co/datasets/JetBrains-Research/lca-results) should be opened when you press "Submit" button. 🏟️ Long Code Arena Team will review it shortly, and the results will appear in the leaderboard.
+SUBMISSION_TEXT_SUBMIT = """All set! A new PR to 🤗 [icmlbenchname/results](https://huggingface.co/datasets/icmlbenchname/results) should be opened when you press "Submit" button. 🏟️ BenchName Team will review it shortly, and the results will appear in the leaderboard.
 
 ⏳ **Note:** It might take some time (up to 40 minutes) for PR to get created, since it involves computing metrics for your submission."""
src/evaluation/__init__.py CHANGED
File without changes
src/evaluation/base_task_metrics.py CHANGED
File without changes
src/evaluation/commit_message_generation/__init__.py CHANGED
File without changes
src/evaluation/commit_message_generation/cmg_metrics.py CHANGED
File without changes
src/evaluation/metrics.py CHANGED
File without changes
src/formatting.py CHANGED
File without changes
src/get_results_for_task.py CHANGED
@@ -37,7 +37,7 @@ def _get_results_stub() -> pd.DataFrame:
                 "ChrF": "X",
                 "BERTScore": "X",
                 "BERTScore (Normalized)": "X",
-                "Submitted By": "🏟 Long Code Arena Team",
+                "Submitted By": "BenchName Team",
                 "Resources": "",
             },
             {
@@ -49,7 +49,7 @@ def _get_results_stub() -> pd.DataFrame:
                 "ChrF": "X",
                 "BERTScore": "X",
                 "BERTScore (Normalized)": "X",
-                "Submitted By": "🏟 Long Code Arena Team",
+                "Submitted By": "BenchName Team",
                 "Resources": "",
             },
         ]
@@ -77,27 +77,31 @@ def _get_results_dataset(task_id: str) -> pd.DataFrame:
         os.environ["DATASET_ID"], task_id, split="test", download_mode="force_redownload"
     ).to_pandas()
     results_df = results_df.rename(columns=COLUMNS_PRETTY, errors="ignore")
-    results_df["Context Size"] = results_df["Context Size"].map(lambda x: f"{int(x) // 1000}k" if int(x) >= 1000 else x)
 
-    results_df = results_df.sort_values(by=SORT_COLUMN_PER_TASK[task_id], ascending=False)
+    if task_id != "aggregated":
+        results_df["Context Size"] = results_df["Context Size"].map(lambda x: f"{int(x) // 1000}k" if int(x) >= 1000 else x)
+        results_df["Resources"] = [_process_urls(urls) for urls in results_df["Resources"]]
+        results_df = results_df.sort_values(by=SORT_COLUMN_PER_TASK[task_id], ascending=False)
 
     for metric_column in METRICS_PER_TASK[task_id]:
         if "BERTScore" in metric_column:
            results_df[metric_column] = results_df[metric_column].map(lambda x: f"{x:.5f}")
+        elif "Mean Rank" in metric_column:
+            continue
         else:
            results_df[metric_column] = results_df[metric_column].map(lambda x: f"{x:.2f}")
 
-    results_df["Model Name"] = [
-        model_hyperlink(link=link, model_name=model_name) if link else model_name
-        for link, model_name in zip(results_df["model_url"], results_df["Model Name"])
-    ]
+    if task_id == 'aggregated':
+        results_df["Model Name"] = results_df["Model"]
+    else:
+        results_df["Model Name"] = [
+            model_hyperlink(link=link, model_name=model_name) if link else model_name
+            for link, model_name in zip(results_df["model_url"], results_df["Model Name"])
+        ]
     if task_id == 'project_code_completion':
        results_df["Dataset Name"] = [_extract_dataset_name(urls) for urls in results_df["Dataset"]]
        results_df["Dataset"] = [_process_urls(urls) for urls in results_df["Dataset"]]
-    results_df["Resources"] = [_process_urls(urls) for urls in results_df["Resources"]]
     results_df = results_df[get_columns_per_task(task_id)]
-    if task_id == 'ci_builds_repair':
-        results_df = results_df.rename(columns={"Context Size": "Context"})
    return results_df


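
As a quick check of the `Context Size` formatting that the new `task_id != "aggregated"` guard now wraps, the mapping (unchanged from the original lambda) behaves as follows:

```python
def format_context_size(x):
    # Same expression as the lambda used in _get_results_dataset.
    return f"{int(x) // 1000}k" if int(x) >= 1000 else x

print(format_context_size(16384))  # "16k"  (integer division truncates)
print(format_context_size(4000))   # "4k"
print(format_context_size(512))    # 512    (values below 1000 pass through unchanged)
```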
 
src/leaderboard_formatting.py CHANGED
@@ -20,16 +20,26 @@ COLUMNS_PRETTY = {
     "EM commited": "EM committed",
     "EM non_informative": "EM non-informative",
     "EM random": "EM random",
-    "EM all": "EM all",
+    "EM all": "EM all",
+    "context_composer": "Context Composer",
+    "context_length": "Context Size",
     "dataset": "Dataset",
     "CompScore": "CompScore",
     "context": "Context",
     "task_type": "Task type",
-    "date": "Date, mm/yy",
 }
 
 # Add your metrics
 METRICS_PER_TASK = {
+    "aggregated": [
+        "Mean Rank",
+        "Mean Score",
+        "Library-based CG",
+        "CI builds repair",
+        "CMG",
+        "Bug localization",
+        "Module summarization",
+    ],
     "commit_message_generation": [
         "BLEU",
         "ChrF",
@@ -49,18 +59,28 @@ METRICS_PER_TASK = {
         "EM all",
     ],
     "bug_localization": [
-        "R@1",
-        "R@2",
-        "P@2",
-        "f1-score",
-        "MAP",
+        "P",
+        "R",
+        "FPR",
+        "F1-score",
+        "All_correct",
+        "All_incorrect",
+        "Output_count",
     ],
     "module_summarization": [
         "CompScore",
     ],
     "library_based_code_generation": [
-        "ChrF",
-        "API Recall",
+        "API Recall\nno context",
+        "API Recall\n20 APIs",
+        "API Recall\n200 APIs",
+        "API Recall\n2,000 APIs",
+        "API Recall\nall APIs",
+        "ChrF\nno context",
+        "ChrF\n20 APIs",
+        "ChrF\n200 APIs",
+        "ChrF\n2,000 APIs",
+        "ChrF\nall APIs",
     ],
     "ci_builds_repair": [
         "Pass@1",
@@ -73,15 +93,17 @@ SORT_COLUMN_PER_TASK = {
     "project_code_completion": "EM inproject",
     "bug_localization": "Model Name",
     "module_summarization": "CompScore",
-    "library_based_code_generation": "API Recall",
+    "library_based_code_generation": "API Recall\nall APIs",
     "ci_builds_repair": "Pass@1",
 }
 
 
 def get_columns_per_task(task_id: str) -> List[str]:
     metrics_per_task = METRICS_PER_TASK[task_id]
+    if task_id == 'aggregated':
+        return ["Model Name"] + metrics_per_task
     if task_id == 'project_code_completion':
-        return ["Model Name", "Context Size", "Dataset Name", "Dataset"] + metrics_per_task + ["Availability", "Submitted By", "Resources"]
+        return ["Model Name", "Context Composer", "Context Size", "Dataset Name", "Dataset"] + metrics_per_task + ["Submitted By", "Resources"]
     if task_id == 'bug_localization':
         return ["Model Name", "Availability", "Context Size"] + metrics_per_task + ["Submitted By", "Resources"]
 
@@ -89,10 +111,10 @@ def get_columns_per_task(task_id: str) -> List[str]:
         return ["Model Name", "Context Size"] + metrics_per_task + ["Submitted By", "Resources"]
 
     if task_id == 'library_based_code_generation':
-        return ["Model Name", "Context"] + metrics_per_task + ["Availability", "Submitted By", "Resources"]
+        return ["Model Name"] + metrics_per_task + ["Availability", "Submitted By", "Resources"]
 
     if task_id == 'ci_builds_repair':
-        return ["Model Name", "Context Size", "Task type"] + metrics_per_task + ["Pass/golden", "Availability", "Submitted By", "Resources", "Date, mm/yy"]
+        return ["Model Name", "Context Size", "Task type"] + metrics_per_task + ["Availability", "Submitted By", "Resources"]
 
     return ["Model Name", "Context Size", "Availability"] + metrics_per_task + ["Submitted By", "Resources"]
 
@@ -100,9 +122,9 @@ def get_columns_per_task(task_id: str) -> List[str]:
 def get_types_per_task(task_id: str) -> List[str]:
     metrics_per_task = METRICS_PER_TASK.get(task_id, (0, 0, 0, 0, 0))
     if task_id == 'project_code_completion':
-        return ["html", "markdown", "markdown", "html"] + ["number" for _ in metrics_per_task] + ["markdown", "markdown", "html"]
+        return ["html", "markdown", "markdown", "markdown", "html"] + ["number" for _ in metrics_per_task] + ["markdown", "html"]
     if task_id == 'bug_localization':
         return ["html", "markdown", "markdown"] + ["number" for _ in metrics_per_task] + ["markdown", "html"]
     if task_id == 'ci_builds_repair':
-        return ["html", "markdown", "markdown"] + ["number" for _ in metrics_per_task] + ["markdown", "markdown", "markdown", "html"]
+        return ["html", "markdown", "markdown"] + ["number" for _ in metrics_per_task] + ["markdown", "markdown", "html"]
     return ["html", "markdown", "markdown"] + ["number" for _ in metrics_per_task] + ["markdown", "html"]
src/submission_uploader.py CHANGED
@@ -31,8 +31,8 @@ class SubmissionUploader:
     """
 
     def __init__(self, dataset_id: str, private_dataset_id: str):
-        self._api = HfApi(token=os.environ["HF_TOKEN"])
-        self._fs = HfFileSystem(token=os.environ["HF_TOKEN"])
+        self._api = HfApi(token=os.getenv("HF_TOKEN"))
+        self._fs = HfFileSystem(token=os.getenv("HF_TOKEN"))
         self._results_dataset_id = dataset_id
         self._requests_dataset_id = private_dataset_id
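
The only change here mirrors the `PRIVATE_DATASET_ID` change in `app.py`: `os.environ[...]` is swapped for `os.getenv(...)`. A standalone illustration of the difference in behavior when the variable is not set:

```python
import os

# os.environ[...] raises KeyError when the variable is missing,
# which would fail at import time in a Space without the secret configured:
try:
    token = os.environ["HF_TOKEN"]
except KeyError:
    token = None

# os.getenv(...) returns None (or an explicit default) instead, so
# HfApi/HfFileSystem receive token=None and fall back to their usual
# token resolution rather than crashing:
token = os.getenv("HF_TOKEN")        # None if unset
token = os.getenv("HF_TOKEN", "")    # or a supplied default
```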
 
src/tasks_content.py CHANGED
@@ -1,6 +1,7 @@
 from typing import Optional
 
 TASKS_PRETTY = {
+    "aggregated": "Aggregated Results",
     "library_based_code_generation": "Library-based code generation",
     "ci_builds_repair": "CI builds repair",
     "project_code_completion": "Project-level code completion",
@@ -11,24 +12,40 @@ TASKS_PRETTY = {
 TASKS_PRETTY_REVERSE = {value: key for key, value in TASKS_PRETTY.items()}
 
 TASKS_DESCRIPTIONS = {
+    "aggregated": """# Aggregated Results\n
+
+Here, we present the aggregated results across all the tasks in BenchName (except for Project-level code completion, where its specifics required a different selection of models). To get more details about each task, visit the corresponding tab.
+
+To obtain aggregated results, we first select only one metric from metric suite for each task:
+* Library-based code generation: `API Recall`
+* CI builds repair: `Pass@1`
+* Commit message generation: `chrF`
+* Bug localization: `F1-score`
+* Module summarization: `CompScore`
+
+Then, to ensure a fair comparison across tasks with different score ranges, we normalize all scores to a 0-1 scale, where zero corresponds to the worst-performing model, and 1 to the best one. Note that for mean rank, rather than using strict rankings, we implemented a ranking system with a 10% margin to account for models with similar performance.
+
+We report mean rank (with std) and mean score across the tasks from BenchName, and the scores for each task in the table below.
+    """,
     "library_based_code_generation": """# Library-based code generation\n
 
-Our Library-based code generation benchmark 🤗 [JetBrains-Research/lca-library-based-code-generation](https://huggingface.co/datasets/JetBrains-Research/lca-library-based-code-generation) includes 150 manually curated instructions asking a model to generate Python code using a particular library. Samples come from 62 Python repositories. All the samples in the dataset are based on reference example programs written by authors of the respective libraries.
+Our Library-based code generation benchmark 🤗 [icmlbenchname/library-based-code-generation](https://huggingface.co/datasets/icmlbenchname/library-based-code-generation) includes 150 manually curated instructions asking a model to generate Python code using a particular library. Samples come from 62 Python repositories. All the samples in the dataset are based on reference example programs written by authors of the respective libraries.
 
 For evaluation, we use two metrics:
 * `ChrF`: textual similarity between the generated code and the reference program.
 * `API Recall`: share of library-specific API calls used in the reference program that appear in the generated code,
 
-For further details on the dataset and the baselines from the 🏟️ Long Code Arena team, refer to the `library_based_code_generation` directory in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines).
-
-If you have any questions or requests concerning this dataset, please contact us at [email protected].
+As a context, we pass a prefix of the list of APIs available in the target library.
+We select the prefix based on their BM-25 similarity with the provided instruction.
+
+For further details on the dataset and the baselines from the BenchName team, refer to the `library_based_code_generation` directory in [our baselines repository](https://anonymous.4open.science/r/icml-benchname-2025/).
 
 **Terms of use**. As this dataset is collected from GitHub, researchers may use it for research purposes only if any publications resulting from that research are open access (see [GitHub Acceptable Use Policies](https://docs.github.com/en/site-policy/acceptable-use-policies/github-acceptable-use-policies#7-information-usage-restrictions)).
     """,
 
     "ci_builds_repair": """# CI builds repair\n
 
-Our CI builds repair benchmark 🤗 [JetBrains-Research/lca-ci-builds-repair](https://huggingface.co/datasets/JetBrains-Research/lca-ci-builds-repair)
+Our CI builds repair benchmark 🤗 [icmlbenchname/ci-builds-repair](https://huggingface.co/datasets/icmlbenchname/ci-builds-repair)
 includes 77 manually curated and assessed data points coming from 32 Python repositories, which are used to make a model fix a failed build.
 
 The benchmark clones the repo to the local directory, the model fixes the issue according to logs and the local repo state,
@@ -40,16 +57,14 @@ TASKS_DESCRIPTIONS = {
 * `oracle: files` – ground truth diffs are used to select files that should be corrected to fix the issue;
 * `oracle: files, lines` – ground truth diffs are used to select files and code blocks that should be corrected to fix the issue;
 
-For further details on the dataset and the baselines from the 🏟️ Long Code Arena team, refer to the `ci-builds-repair` directory in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines).
-
-If you have any questions or requests concerning this dataset, please contact us at [email protected].
+For further details on the dataset and the baselines from the BenchName team, refer to the `ci-builds-repair` directory in [our baselines repository](https://anonymous.4open.science/r/icml-benchname-2025/).
 
 **Terms of use**. As this dataset is collected from GitHub, researchers may use it for research purposes only if any publications resulting from that research are open access (see [GitHub Acceptable Use Policies](https://docs.github.com/en/site-policy/acceptable-use-policies/github-acceptable-use-policies#7-information-usage-restrictions)).
     """,
 
     "project_code_completion": """# Project-level code completion\n
 
-Our Project-level code completion benchmark 🤗 [JetBrains-Research/lca-project-level-code-completion](https://huggingface.co/datasets/JetBrains-Research/lca-project-level-code-completion) includes four sets of samples:
+Our Project-level code completion benchmark 🤗 [icmlbenchname/project-level-code-completion](https://huggingface.co/datasets/icmlbenchname/project-level-code-completion) includes four sets of samples:
 * `small-context`: 144 data points,
 * `medium-context`: 224 data points,
 * `large-context`: 270 data points,
@@ -67,16 +82,14 @@ TASKS_DESCRIPTIONS = {
 * *non-informative* – short/long lines, import/print lines, or comment lines;
 * *random* – lines that don't fit any of the previous categories.
 
-For further details on the dataset and the baselines from the 🏟️ Long Code Arena team, refer to the `project_level_code_completion` directory in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines).
-
-If you have any questions or requests concerning this dataset, please contact us at [email protected].
+For further details on the dataset and the baselines from the BenchName team, refer to the `project_level_code_completion` directory in [our baselines repository](https://anonymous.4open.science/r/icml-benchname-2025/).
 
 **Terms of use**. As this dataset is collected from GitHub, researchers may use it for research purposes only if any publications resulting from that research are open access (see [GitHub Acceptable Use Policies](https://docs.github.com/en/site-policy/acceptable-use-policies/github-acceptable-use-policies#7-information-usage-restrictions)).
     """,
 
     "commit_message_generation": """# Commit message generation\n
 
-Our Commit message generation benchmark 🤗 [JetBrains-Research/lca-commit-message-generation](https://huggingface.co/datasets/JetBrains-Research/lca-commit-message-generation) includes 163 manually curated commits with large diffs from 34 Python projects, which the model needs to generate commit messages for.
+Our Commit message generation benchmark 🤗 [icmlbenchname/commit-message-generation](https://huggingface.co/datasets/icmlbenchname/commit-message-generation) includes 163 manually curated commits with large diffs from 34 Python projects, which the model needs to generate commit messages for.
 
 We use the following metrics for evaluation:
 * [BLEU](https://huggingface.co/spaces/evaluate-metric/sacrebleu)
@@ -84,39 +97,41 @@ TASKS_DESCRIPTIONS = {
 * [ChrF](https://huggingface.co/spaces/evaluate-metric/chrf)
 * [BERTScore](https://huggingface.co/spaces/evaluate-metric/bertscore)
 
-For further details on the dataset and the baselines from the 🏟️ Long Code Arena team, refer to the `commit_message_generation` directory in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines).
+For further details on the dataset and the baselines from the BenchName team, refer to the `commit_message_generation` directory in [our baselines repository](https://anonymous.4open.science/r/icml-benchname-2025/).
 
 **Note.** The leaderboard is sorted by the `ROUGE-1` metric by default.
 
-If you have any questions or requests concerning this dataset, please contact us at [email protected].
-
 **Terms of use**. As this dataset is collected from GitHub, researchers may use it for research purposes only if any publications resulting from that research are open access (see [GitHub Acceptable Use Policies](https://docs.github.com/en/site-policy/acceptable-use-policies/github-acceptable-use-policies#7-information-usage-restrictions)).
 
     """,
 
     "bug_localization": """# Bug localization\n
 
-Our Bug localization benchmark 🤗 [JetBrains-Research/lca-bug-localization](https://huggingface.co/datasets/JetBrains-Research/lca-bug-localization) includes 150 manually verified bug issue descriptions with information about pull request that fix them for Python, Java, and Kotlin projects.
+Our Bug localization benchmark 🤗 [icmlbenchname/bug-localization](https://huggingface.co/datasets/icmlbenchname/bug-localization) includes 150 manually verified bug issue descriptions with information about pull request that fix them for Python, Java, and Kotlin projects.
 The model needs to identify the files within the repository that need to be modified to address the reported bug.
-We used information retrieval metrics such as `R@k`, `P@k`, `F1-score`, and `MAP` for evaluation, taking `k` equal to 1 and 2.
+
+To evaluate baseline performance, we use the following classification metrics:
+* **P** - precision to estimate how many of the predicted buggy files were correctly identified
+* **R** - recall to indicate how many of the actual buggy files were correctly found
+* **FPR** - false positive rate to indicate how many non-buggy files were incorrectly predicted as buggy
+* **F1-score** - score to provide a balance between precision and recall
+* **All correct** - percentage of cases where all buggy files were correctly identified
+* **All incorrect** - percentage of cases where all buggy files were incorrectly identified
+* **# Output** - average number of buggy files detected, to further assess performance, particularly concerning high **FPR**.
 
-For further details on the dataset and the baselines from the 🏟️ Long Code Arena team, refer to the `bug_localization` directory in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines).
+For further details on the dataset and the baselines from the BenchName team, refer to the `bug_localization` directory in [our baselines repository](https://anonymous.4open.science/r/icml-benchname-2025/).
 
-If you have any questions or requests concerning this dataset, please contact us at [email protected].
-
 **Terms of use**. As this dataset is collected from GitHub, researchers may use it for research purposes only if any publications resulting from that research are open access (see [GitHub Acceptable Use Policies](https://docs.github.com/en/site-policy/acceptable-use-policies/github-acceptable-use-policies#7-information-usage-restrictions)).
     """,
 
     "module_summarization": """# Module summarization\n
-Our Module summarization benchmark 🤗 [JetBrains-Research/lca-module-summarization](https://huggingface.co/datasets/JetBrains-Research/lca-module-summarization) includes 216 manually curated text files describing different documentation of open-source permissive Python projects.
+Our Module summarization benchmark 🤗 [icmlbenchname/module-summarization](https://huggingface.co/datasets/icmlbenchname/module-summarization) includes 216 manually curated text files describing different documentation of open-source permissive Python projects.
 The model is required to generate such description, given the relevant context code and the intent behind the documentation.
 
 We use a novel metric for evaluation:
-* `CompScore`: the new metric based on LLM as an assessor proposed for this task. Our approach involves feeding the LLM with relevant code and two versions of documentation: the ground truth and the model-generated text. More details on how it is calculated can be found in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines/blob/main/module_summarization/README.md).
+* `CompScore`: the new metric based on LLM as an assessor proposed for this task. Our approach involves feeding the LLM with relevant code and two versions of documentation: the ground truth and the model-generated text. More details on how it is calculated can be found in [our baselines repository](https://anonymous.4open.science/r/icml-benchname-2025/module_summarization/README.md).
 
-For further details on the dataset and the baselines from the 🏟️ Long Code Arena team, refer to the `module_summarization` directory in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines/blob/main/module_summarization/).
-
-If you have any questions or requests concerning this dataset, please contact us at [email protected].
+For further details on the dataset and the baselines from the BenchName team, refer to the `module_summarization` directory in [our baselines repository](https://anonymous.4open.science/r/icml-benchname-2025/).
 
 **Terms of use**. As this dataset is collected from GitHub, researchers may use it for research purposes only if any publications resulting from that research are open access (see [GitHub Acceptable Use Policies](https://docs.github.com/en/site-policy/acceptable-use-policies/github-acceptable-use-policies#7-information-usage-restrictions)).
     """,
@@ -130,6 +145,6 @@ def get_submission_text_files_for_task(task_pretty: Optional[str]) -> str:
     task_id = TASKS_PRETTY_REVERSE[task_pretty]
 
     if task_id == "commit_message_generation":
-        return f"""**{task_pretty} Instructions:**\n\n* Please, attach files in [JSONLines format](https://jsonlines.org/). For an example, check the predictions provided by 🏟️ Long Code Arena Team in 🤗 [JetBrains-Research/lca-results](https://huggingface.co/datasets/JetBrains-Research/lca-results/tree/main/commit_message_generation/predictions). Make sure to include `"prediction"` and `"reference"` fields for each example, the rest are optional."""
+        return f"""**{task_pretty} Instructions:**\n\n* Please, attach files in [JSONLines format](https://jsonlines.org/). For an example, check the predictions provided by BenchName Team in 🤗 [icmlbenchname/results](https://huggingface.co/datasets/icmlbenchname/results/tree/main/commit_message_generation/predictions). Make sure to include `"prediction"` and `"reference"` fields for each example, the rest are optional."""
 
     return f"**{task_pretty} Instructions:**\n\n* 🚧 There are no instructions for the current task yet."
src/utils.py CHANGED
File without changes