hysts (HF Staff) committed
Commit d06d36f · 1 Parent(s): 53d72f4
Files changed (12)
  1. .pre-commit-config.yaml +33 -0
  2. .python-version +1 -0
  3. .vscode/extensions.json +8 -0
  4. .vscode/settings.json +17 -0
  5. README.md +6 -3
  6. app.py +208 -0
  7. app_pr.py +403 -0
  8. pyproject.toml +54 -0
  9. requirements.txt +225 -0
  10. style.css +4 -0
  11. table.py +116 -0
  12. uv.lock +0 -0
.pre-commit-config.yaml ADDED
@@ -0,0 +1,33 @@
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v5.0.0
+     hooks:
+       - id: check-executables-have-shebangs
+       - id: check-json
+       - id: check-merge-conflict
+       - id: check-shebang-scripts-are-executable
+       - id: check-toml
+       - id: check-yaml
+       - id: end-of-file-fixer
+       - id: mixed-line-ending
+         args: ["--fix=lf"]
+       - id: requirements-txt-fixer
+       - id: trailing-whitespace
+   - repo: https://github.com/astral-sh/ruff-pre-commit
+     rev: v0.11.2
+     hooks:
+       - id: ruff
+         args: ["--fix"]
+       - id: ruff-format
+   - repo: https://github.com/pre-commit/mirrors-mypy
+     rev: v1.15.0
+     hooks:
+       - id: mypy
+         args: ["--ignore-missing-imports"]
+         additional_dependencies:
+           [
+             "types-python-slugify",
+             "types-pytz",
+             "types-PyYAML",
+             "types-requests",
+           ]
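Once installed, these hooks run automatically on every `git commit`. A minimal sketch of wiring them up from Python (the `pre-commit` CLI is the usual entry point; the subprocess calls below are just for illustration and assume `pre-commit` is on PATH):

```python
import subprocess

# Equivalent to `pre-commit install` followed by `pre-commit run --all-files`:
# registers the git hook, then runs every configured hook against the whole repo.
subprocess.run(["pre-commit", "install"], check=True)
subprocess.run(["pre-commit", "run", "--all-files"], check=True)
```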
.python-version ADDED
@@ -0,0 +1 @@
+ 3.10
.vscode/extensions.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "recommendations": [
+     "ms-python.python",
+     "charliermarsh.ruff",
+     "streetsidesoftware.code-spell-checker",
+     "tamasfe.even-better-toml"
+   ]
+ }
.vscode/settings.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "editor.formatOnSave": true,
+   "files.insertFinalNewline": false,
+   "[python]": {
+     "editor.defaultFormatter": "charliermarsh.ruff",
+     "editor.formatOnType": true,
+     "editor.codeActionsOnSave": {
+       "source.fixAll.ruff": "explicit",
+       "source.organizeImports": "explicit"
+     }
+   },
+   "[jupyter]": {
+     "files.insertFinalNewline": false
+   },
+   "notebook.output.scrolling": true,
+   "notebook.formatOnSave.enabled": true
+ }
README.md CHANGED
@@ -1,12 +1,15 @@
  ---
  title: ICLR2025
- emoji: 📚
+ emoji:
- colorFrom: gray
+ colorFrom: red
  colorTo: purple
  sdk: gradio
- sdk_version: 5.23.1
+ sdk_version: 5.25.2
  app_file: app.py
  pinned: false
+ hf_oauth: true
+ hf_oauth_scopes:
+   - write-discussions
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
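The new `hf_oauth` metadata enables Sign in with Hugging Face for the Space; `app_pr.py` below consumes it through Gradio's OAuth hooks. A minimal standalone sketch of that pattern (illustrative only, not part of this commit):

```python
import gradio as gr

def show_login_state(profile: gr.OAuthProfile | None) -> str:
    # Gradio injects the OAuth profile based on the parameter's type hint;
    # it is None until the visitor signs in via the LoginButton.
    return "Not logged in." if profile is None else f"Logged in as {profile.username}"

with gr.Blocks() as demo:
    gr.LoginButton()
    status = gr.Markdown()
    demo.load(fn=show_login_state, outputs=status)

if __name__ == "__main__":
    demo.launch()
```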
app.py ADDED
@@ -0,0 +1,208 @@
+ #!/usr/bin/env python
+
+ import gradio as gr
+ import polars as pl
+
+ from app_pr import demo as demo_pr
+ from table import df_orig
+
+ DESCRIPTION = "# ICLR 2025"
+
+ TUTORIAL = """\
+ #### Claiming Authorship for Papers on arXiv
+
+ If your ICLR 2025 paper is available on arXiv and listed in the table below, you can claim authorship by following these steps:
+
+ 1. Find your paper in the table.
+ 2. Click the link to the paper page in the table.
+ 3. On that page, click your name.
+ 4. Click **"Claim authorship"**.
+    - You'll be redirected to the *Papers* section of your Settings.
+ 5. Confirm the request on the redirected page.
+
+ The admin team will review your request shortly.
+ Once confirmed, your paper page will be marked as verified, and you'll be able to add a project page and a GitHub repository.
+
+ If you need further help, check out the [guide here](https://huggingface.co/docs/hub/paper-pages).
+
+
+ #### Updating Missing or Incorrect Information in the Table
+
+ If you notice any missing or incorrect information in the table, feel free to submit a PR via the "Open PR" page, which you can find at the top right of this page.
+ """
+
+ # TODO: remove this once https://github.com/gradio-app/gradio/issues/10916 https://github.com/gradio-app/gradio/issues/11001 https://github.com/gradio-app/gradio/issues/11002 are fixed  # noqa: TD002, FIX002
+ NOTE = """\
+ Note: Sorting by upvotes or comments may not work correctly due to a known bug in Gradio.
+ """
+
+
+ df_main = df_orig.select(
+     "title",
+     "authors_str",
+     "openreview_md",
+     "type",
+     "paper_page_md",
+     "upvotes",
+     "num_comments",
+     "project_page_md",
+     "github_md",
+     "Spaces",
+     "Models",
+     "Datasets",
+     "claimed",
+ )
+
+ df_main = df_main.rename(
+     {
+         "title": "Title",
+         "authors_str": "Authors",
+         "openreview_md": "OpenReview",
+         "type": "Type",
+         "paper_page_md": "Paper page",
+         "upvotes": "👍",
+         "num_comments": "💬",
+         "project_page_md": "Project page",
+         "github_md": "GitHub",
+     }
+ )
+
+ # Maps each display column to its Gradio datatype and an optional fixed width.
+ COLUMN_INFO = {
+     "Title": ("str", "40%"),
+     "Authors": ("str", "20%"),
+     "Type": ("str", None),
+     "Paper page": ("markdown", "135px"),
+     "👍": ("number", "50px"),
+     "💬": ("number", "50px"),
+     "OpenReview": ("markdown", None),
+     "Project page": ("markdown", None),
+     "GitHub": ("markdown", None),
+     "Spaces": ("markdown", None),
+     "Models": ("markdown", None),
+     "Datasets": ("markdown", None),
+     "claimed": ("markdown", None),
+ }
+
+
+ DEFAULT_COLUMNS = [
+     "Title",
+     "Type",
+     "Paper page",
+     "👍",
+     "💬",
+     "OpenReview",
+     "Project page",
+     "GitHub",
+     "Spaces",
+     "Models",
+ ]
+
+
+ def update_num_papers(df: pl.DataFrame) -> str:
+     if "claimed" in df.columns:
+         return f"{len(df)} / {len(df_main)} ({df.select(pl.col('claimed').str.contains('✅').sum()).item()} claimed)"
+     return f"{len(df)} / {len(df_main)}"
+
+
+ def update_df(
+     title_search_query: str,
+     presentation_type: str,
+     column_names: list[str],
+     case_insensitive: bool = True,
+ ) -> gr.Dataframe:
+     df = df_main.clone()
+     column_names = ["Title", *column_names]
+
+     if title_search_query:
+         if case_insensitive:
+             title_search_query = f"(?i){title_search_query}"
+         try:
+             df = df.filter(pl.col("Title").str.contains(title_search_query))
+         except pl.exceptions.ComputeError as e:
+             raise gr.Error(str(e)) from e
+     if presentation_type != "(ALL)":
+         df = df.filter(pl.col("Type").str.contains(presentation_type))
+
+     sorted_column_names = [col for col in COLUMN_INFO if col in column_names]
+     df = df.select(sorted_column_names)
+     return gr.Dataframe(
+         value=df,
+         datatype=[COLUMN_INFO[col][0] for col in sorted_column_names],
+         column_widths=[COLUMN_INFO[col][1] for col in sorted_column_names],
+     )
+
+
+ with gr.Blocks(css_paths="style.css") as demo:
+     gr.Markdown(DESCRIPTION)
+     with gr.Accordion(label="Tutorial", open=True):
+         gr.Markdown(TUTORIAL)
+     with gr.Group():
+         search_title = gr.Textbox(label="Search title")
+         presentation_type = gr.Radio(
+             label="Presentation Type",
+             choices=["(ALL)", "Oral", "Spotlight", "Poster"],
+             value="(ALL)",
+         )
+         column_names = gr.CheckboxGroup(
+             label="Columns",
+             choices=[col for col in COLUMN_INFO if col != "Title"],
+             value=[col for col in DEFAULT_COLUMNS if col != "Title"],
+         )
+
+     num_papers = gr.Textbox(label="Number of papers", value=update_num_papers(df_orig), interactive=False)
+
+     gr.Markdown(NOTE)
+     df = gr.Dataframe(
+         value=df_main,
+         datatype=[dtype for dtype, _ in COLUMN_INFO.values()],  # datatype takes the dtype strings, not the (dtype, width) tuples
+         type="polars",
+         row_count=(0, "dynamic"),
+         show_row_numbers=True,
+         interactive=False,
+         max_height=1000,
+         elem_id="table",
+         column_widths=[COLUMN_INFO[col][1] for col in COLUMN_INFO],
+     )
+
+     inputs = [
+         search_title,
+         presentation_type,
+         column_names,
+     ]
+     gr.on(
+         triggers=[
+             search_title.submit,
+             presentation_type.input,
+             column_names.input,
+         ],
+         fn=update_df,
+         inputs=inputs,
+         outputs=df,
+         api_name=False,
+     ).then(
+         fn=update_num_papers,
+         inputs=df,
+         outputs=num_papers,
+         queue=False,
+         api_name=False,
+     )
+     demo.load(
+         fn=update_df,
+         inputs=inputs,
+         outputs=df,
+         api_name=False,
+     ).then(
+         fn=update_num_papers,
+         inputs=df,
+         outputs=num_papers,
+         queue=False,
+         api_name=False,
+     )
+
+
+ with demo.route("Open PR"):
+     demo_pr.render()
+
+
+ if __name__ == "__main__":
+     demo.queue(api_open=False).launch(show_api=False)
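As context for `update_df` above: polars treats `str.contains` patterns as regular expressions, so prepending the inline flag `(?i)` is what makes the title search case-insensitive. A self-contained sketch with toy data:

```python
import polars as pl

df = pl.DataFrame({"Title": ["Scaling Laws", "Diffusion Models", "scaling up RL"]})

# "(?i)" is an inline regex flag that makes the whole pattern case-insensitive.
print(df.filter(pl.col("Title").str.contains("(?i)scaling")))
# Matches "Scaling Laws" and "scaling up RL". An invalid pattern such as "("
# raises pl.exceptions.ComputeError, which update_df surfaces as a gr.Error.
```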
app_pr.py ADDED
@@ -0,0 +1,403 @@
+ import datetime
+ import difflib
+ import json
+ import re
+ import tempfile
+
+ import gradio as gr
+ import polars as pl
+ from gradio_modal import Modal
+ from huggingface_hub import CommitOperationAdd, HfApi
+
+ from table import PATCH_REPO_ID, PATCH_REPO_PR_BRANCH, df_orig
+
+ # TODO: remove this once https://github.com/gradio-app/gradio/issues/11022 is fixed  # noqa: FIX002, TD002
+ NOTE = """\
+ #### ⚠️ Note
+ You may encounter an issue when selecting table data after using the search bar.
+ This is due to a known bug in Gradio.
+
+ The issue typically occurs when multiple rows remain after filtering.
+ If only one row remains, the selection should work as expected.
+ """
+
+ api = HfApi()
+
+ PR_VIEW_COLUMNS = [
+     "title",
+     "authors_str",
+     "openreview_md",
+     "arxiv_id",
+     "github_md",
+     "Spaces",
+     "Models",
+     "Datasets",
+     "paper_id",
+ ]
+ PR_RAW_COLUMNS = [
+     "paper_id",
+     "title",
+     "authors",
+     "arxiv_id",
+     "project_page",
+     "github",
+     "space_ids",
+     "model_ids",
+     "dataset_ids",
+ ]
+
+ df_pr_view = df_orig.with_columns(pl.lit("📝").alias("Fix")).select(["Fix", *PR_VIEW_COLUMNS])
+ df_pr_view = df_pr_view.with_columns(pl.col("arxiv_id").fill_null(""))
+ df_pr_raw = df_orig.select(PR_RAW_COLUMNS)
+
+
+ def df_pr_row_selected(
+     evt: gr.SelectData,
+ ) -> tuple[
+     Modal,
+     gr.Textbox,  # title
+     gr.Textbox,  # authors
+     gr.Textbox,  # arxiv_id
+     gr.Textbox,  # project_page
+     gr.Textbox,  # github
+     gr.Textbox,  # space_ids
+     gr.Textbox,  # model_ids
+     gr.Textbox,  # dataset_ids
+     dict | None,  # original_data
+ ]:
+     if evt.value != "📝":
+         return (
+             Modal(),
+             gr.Textbox(),  # title
+             gr.Textbox(),  # authors
+             gr.Textbox(),  # arxiv_id
+             gr.Textbox(),  # project_page
+             gr.Textbox(),  # github
+             gr.Textbox(),  # space_ids
+             gr.Textbox(),  # model_ids
+             gr.Textbox(),  # dataset_ids
+             None,  # original_data
+         )
+
+     paper_id = evt.row_value[-1]
+     row = df_pr_raw.filter(pl.col("paper_id") == paper_id)
+     original_data = row.to_dicts()[0]
+     authors = original_data["authors"]
+     space_ids = original_data["space_ids"]
+     model_ids = original_data["model_ids"]
+     dataset_ids = original_data["dataset_ids"]
+     return (
+         Modal(visible=True),
+         gr.Textbox(value=row["title"].item()),  # title
+         gr.Textbox(value="\n".join(authors)),  # authors
+         gr.Textbox(value=row["arxiv_id"].item()),  # arxiv_id
+         gr.Textbox(value=row["project_page"].item()),  # project_page
+         gr.Textbox(value=row["github"].item()),  # github
+         gr.Textbox(value="\n".join(space_ids)),  # space_ids
+         gr.Textbox(value="\n".join(model_ids)),  # model_ids
+         gr.Textbox(value="\n".join(dataset_ids)),  # dataset_ids
+         original_data,  # original_data
+     )
+
+
+ URL_PATTERN = re.compile(r"^(https?://)?([a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}(:\d+)?(/.*)?$")
+ GITHUB_PATTERN = re.compile(r"^https://github\.com/[^/\s]+/[^/\s]+(/tree/[^/\s]+/[^/\s].*)?$")
+ REPO_ID_PATTERN = re.compile(r"^[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+$")
+ ARXIV_ID_PATTERN = re.compile(r"^\d{4}\.\d{4,5}$")
+
+
+ def is_valid_url(url: str) -> bool:
+     return URL_PATTERN.match(url) is not None
+
+
+ def is_valid_github_url(url: str) -> bool:
+     return GITHUB_PATTERN.match(url) is not None
+
+
+ def is_valid_repo_id(repo_id: str) -> bool:
+     return REPO_ID_PATTERN.match(repo_id) is not None
+
+
+ def is_valid_arxiv_id(arxiv_id: str) -> bool:
+     return ARXIV_ID_PATTERN.match(arxiv_id) is not None
+
+
+ def validate_pr_data(
+     title_pr: str,
+     authors_pr: str,
+     arxiv_id_pr: str,
+     project_page_pr: str,
+     github_pr: str,
+     space_ids: list[str],
+     model_ids: list[str],
+     dataset_ids: list[str],
+ ) -> None:
+     if not title_pr:
+         raise gr.Error("Title cannot be empty", print_exception=False)
+     if not authors_pr:
+         raise gr.Error("Authors cannot be empty", print_exception=False)
+
+     if arxiv_id_pr and not is_valid_arxiv_id(arxiv_id_pr):
+         raise gr.Error(
+             "Invalid arXiv ID format. Expected format: 'YYMM.NNNNN' (e.g., '2501.01234')", print_exception=False
+         )
+     if project_page_pr and not is_valid_url(project_page_pr):
+         raise gr.Error("Project page must be a valid URL", print_exception=False)
+     if github_pr and not is_valid_github_url(github_pr):
+         raise gr.Error("GitHub must be a valid GitHub URL", print_exception=False)
+
+     for repo_id in space_ids + model_ids + dataset_ids:
+         if not is_valid_repo_id(repo_id):
+             error_msg = f"Space/Model/Dataset ID must be in the format 'org_name/repo_name'. Got: {repo_id}"
+             raise gr.Error(error_msg, print_exception=False)
+
+
+ def format_submitted_data(
+     title_pr: str,
+     authors_pr: str,
+     arxiv_id_pr: str,
+     project_page_pr: str,
+     github_pr: str,
+     space_ids_pr: str,
+     model_ids_pr: str,
+     dataset_ids_pr: str,
+ ) -> dict:
+     space_ids = [repo_id for repo_id in space_ids_pr.split("\n") if repo_id.strip()]
+     model_ids = [repo_id for repo_id in model_ids_pr.split("\n") if repo_id.strip()]
+     dataset_ids = [repo_id for repo_id in dataset_ids_pr.split("\n") if repo_id.strip()]
+
+     validate_pr_data(title_pr, authors_pr, arxiv_id_pr, project_page_pr, github_pr, space_ids, model_ids, dataset_ids)
+
+     return {
+         "title": title_pr,
+         "authors": [a for a in authors_pr.split("\n") if a.strip()],
+         "arxiv_id": arxiv_id_pr if arxiv_id_pr else None,
+         "project_page": project_page_pr if project_page_pr else None,
+         "github": github_pr if github_pr else None,
+         "space_ids": space_ids,
+         "model_ids": model_ids,
+         "dataset_ids": dataset_ids,
+     }
+
+
+ def preview_diff(
+     title_pr: str,
+     authors_pr: str,
+     arxiv_id_pr: str,
+     project_page_pr: str,
+     github_pr: str,
+     space_ids_pr: str,
+     model_ids_pr: str,
+     dataset_ids_pr: str,
+     original_data: dict,
+ ) -> tuple[gr.Markdown, gr.Button]:
+     submitted_data = format_submitted_data(
+         title_pr,
+         authors_pr,
+         arxiv_id_pr,
+         project_page_pr,
+         github_pr,
+         space_ids_pr,
+         model_ids_pr,
+         dataset_ids_pr,
+     )
+     submitted_data = {"paper_id": original_data["paper_id"], **submitted_data}
+
+     original_json = json.dumps(original_data, indent=2)
+     submitted_json = json.dumps(submitted_data, indent=2)
+     diff = difflib.unified_diff(
+         original_json.splitlines(),
+         submitted_json.splitlines(),
+         fromfile="before",
+         tofile="after",
+         lineterm="",
+     )
+     diff_str = "\n".join(diff)
+     return gr.Markdown(value=f"```diff\n{diff_str}\n```"), gr.Button(visible=True)
+
+
+ def open_pr(
+     title_pr: str,
+     authors_pr: str,
+     arxiv_id_pr: str,
+     project_page_pr: str,
+     github_pr: str,
+     space_ids_pr: str,
+     model_ids_pr: str,
+     dataset_ids_pr: str,
+     original_data: dict,
+     oauth_token: gr.OAuthToken | None,
+ ) -> gr.Markdown:
+     submitted_data = format_submitted_data(
+         title_pr,
+         authors_pr,
+         arxiv_id_pr,
+         project_page_pr,
+         github_pr,
+         space_ids_pr,
+         model_ids_pr,
+         dataset_ids_pr,
+     )
+
+     diff_dict = {key: submitted_data[key] for key in submitted_data if submitted_data[key] != original_data[key]}
+
+     if not diff_dict:
+         gr.Info("No data to submit")
+         return ""
+
+     paper_id = original_data["paper_id"]
+     diff_dict["paper_id"] = paper_id
+
+     original_json = json.dumps(original_data, indent=2)
+     submitted_json = json.dumps(submitted_data, indent=2)
+     diff = "\n".join(
+         difflib.unified_diff(
+             original_json.splitlines(),
+             submitted_json.splitlines(),
+             fromfile="before",
+             tofile="after",
+             lineterm="",
+         )
+     )
+     diff_dict["diff"] = diff
+     timestamp = datetime.datetime.now(datetime.timezone.utc)
+     diff_dict["timestamp"] = timestamp.isoformat()
+
+     with tempfile.NamedTemporaryFile(suffix=".json", mode="w", delete=False) as f:
+         json.dump(diff_dict, f, indent=2)
+         f.flush()
+
+         commit = CommitOperationAdd(f"data/{paper_id}--{timestamp.strftime('%Y-%m-%d-%H-%M-%S')}.json", f.name)
+         res = api.create_commit(
+             repo_id=PATCH_REPO_ID,
+             operations=[commit],
+             commit_message=f"Update {paper_id}",
+             repo_type="dataset",
+             revision=PATCH_REPO_PR_BRANCH,
+             create_pr=True,
+             token=oauth_token.token if oauth_token else None,
+         )
+     return gr.Markdown(value=res.pr_url, visible=True)
+
+
+ def render_open_pr_page(profile: gr.OAuthProfile | None) -> gr.Column:
+     # Only show the PR form to signed-in users.
+     return gr.Column(visible=profile is not None)
+
+
+ with gr.Blocks() as demo:
+     gr.LoginButton()
+     with gr.Column(visible=False) as open_pr_col:
+         gr.Markdown(NOTE)
+         df_pr = gr.Dataframe(
+             value=df_pr_view,
+             datatype=[
+                 "str",  # Fix
+                 "str",  # Title
+                 "str",  # Authors
+                 "markdown",  # openreview
+                 "str",  # arxiv_id
+                 "markdown",  # github
+                 "markdown",  # spaces
+                 "markdown",  # models
+                 "markdown",  # datasets
+                 "str",  # paper id
+             ],
+             column_widths=[
+                 "50px",  # Fix
+                 "40%",  # Title
+                 "20%",  # Authors
+                 None,  # openreview
+                 "100px",  # arxiv_id
+                 None,  # github
+                 None,  # spaces
+                 None,  # models
+                 None,  # datasets
+                 None,  # paper id
+             ],
+             type="polars",
+             row_count=(0, "dynamic"),
+             interactive=False,
+             max_height=1000,
+             show_search="search",
+         )
+     with Modal(visible=False) as pr_modal:
+         with gr.Group():
+             title_pr = gr.Textbox(label="Title")
+             authors_pr = gr.Textbox(label="Authors")
+             arxiv_id_pr = gr.Textbox(label="arXiv ID")
+             project_page_pr = gr.Textbox(label="Project page")
+             github_pr = gr.Textbox(label="GitHub")
+             spaces_pr = gr.Textbox(
+                 label="Spaces",
+                 info="Enter one space ID (e.g., 'org_name/space_name') per line.",
+             )
+             models_pr = gr.Textbox(
+                 label="Models",
+                 info="Enter one model ID (e.g., 'org_name/model_name') per line.",
+             )
+             datasets_pr = gr.Textbox(
+                 label="Datasets",
+                 info="Enter one dataset ID (e.g., 'org_name/dataset_name') per line.",
+             )
+         original_data = gr.State()
+         preview_diff_button = gr.Button("Preview diff")
+         diff_view = gr.Markdown()
+         open_pr_button = gr.Button("Open PR", visible=False)
+         pr_url = gr.Markdown(visible=False)
+
+     pr_modal.blur(
+         fn=lambda: (None, gr.Button(visible=False), gr.Markdown(visible=False)),
+         outputs=[diff_view, open_pr_button, pr_url],
+     )
+
+     df_pr.select(
+         fn=df_pr_row_selected,
+         outputs=[
+             pr_modal,
+             title_pr,
+             authors_pr,
+             arxiv_id_pr,
+             project_page_pr,
+             github_pr,
+             spaces_pr,
+             models_pr,
+             datasets_pr,
+             original_data,
+         ],
+     )
+     preview_diff_button.click(
+         fn=preview_diff,
+         inputs=[
+             title_pr,
+             authors_pr,
+             arxiv_id_pr,
+             project_page_pr,
+             github_pr,
+             spaces_pr,
+             models_pr,
+             datasets_pr,
+             original_data,
+         ],
+         outputs=[diff_view, open_pr_button],
+     )
+     open_pr_button.click(
+         fn=open_pr,
+         inputs=[
+             title_pr,
+             authors_pr,
+             arxiv_id_pr,
+             project_page_pr,
+             github_pr,
+             spaces_pr,
+             models_pr,
+             datasets_pr,
+             original_data,
+         ],
+         outputs=pr_url,
+     )
+
+     demo.load(fn=render_open_pr_page, outputs=open_pr_col)
+
+
+ if __name__ == "__main__":
+     demo.queue(api_open=False).launch(show_api=False)
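A quick standalone illustration of the regex validators above, with the patterns copied out of `app_pr.py` and made-up sample values:

```python
import re

# Same patterns as app_pr.py.
ARXIV_ID_PATTERN = re.compile(r"^\d{4}\.\d{4,5}$")
REPO_ID_PATTERN = re.compile(r"^[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+$")

assert ARXIV_ID_PATTERN.match("2501.01234")             # YYMM.NNNNN form passes
assert not ARXIV_ID_PATTERN.match("arXiv:2501.01234")   # prefixes are rejected
assert REPO_ID_PATTERN.match("org_name/repo_name")      # exactly one slash
assert not REPO_ID_PATTERN.match("org/name/extra")      # nested paths fail
```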
pyproject.toml ADDED
@@ -0,0 +1,54 @@
+ [project]
+ name = "iclr2025"
+ version = "0.1.0"
+ description = ""
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = [
+     "datasets>=3.5.0",
+     "gradio[oauth]>=5.25.2",
+     "gradio-modal>=0.0.4",
+     "hf-transfer>=0.1.9",
+     "polars>=1.27.1",
+ ]
+
+ [tool.ruff]
+ line-length = 119
+
+ [tool.ruff.lint]
+ select = ["ALL"]
+ ignore = [
+     "COM812",  # missing-trailing-comma
+     "D203",  # one-blank-line-before-class
+     "D213",  # multi-line-summary-second-line
+     "E501",  # line-too-long
+     "SIM117",  # multiple-with-statements
+     #
+     "D100",  # undocumented-public-module
+     "D101",  # undocumented-public-class
+     "D102",  # undocumented-public-method
+     "D103",  # undocumented-public-function
+     "D104",  # undocumented-public-package
+     "D105",  # undocumented-magic-method
+     "D107",  # undocumented-public-init
+     "EM101",  # raw-string-in-exception
+     "FBT001",  # boolean-type-hint-positional-argument
+     "FBT002",  # boolean-default-value-positional-argument
+     "PD901",  # pandas-df-variable-name
+     "PGH003",  # blanket-type-ignore
+     "PLR0913",  # too-many-arguments
+     "PLR0915",  # too-many-statements
+     "TRY003",  # raise-vanilla-args
+ ]
+ unfixable = [
+     "F401",  # unused-import
+ ]
+
+ [tool.ruff.lint.pydocstyle]
+ convention = "google"
+
+ [tool.ruff.lint.per-file-ignores]
+ "*.ipynb" = ["T201", "T203"]
+
+ [tool.ruff.format]
+ docstring-code-format = true
requirements.txt ADDED
@@ -0,0 +1,225 @@
+ # This file was autogenerated by uv via the following command:
+ #    uv pip compile pyproject.toml -o requirements.txt
+ aiofiles==24.1.0
+     # via gradio
+ aiohappyeyeballs==2.6.1
+     # via aiohttp
+ aiohttp==3.11.16
+     # via
+     #   datasets
+     #   fsspec
+ aiosignal==1.3.2
+     # via aiohttp
+ annotated-types==0.7.0
+     # via pydantic
+ anyio==4.9.0
+     # via
+     #   gradio
+     #   httpx
+     #   starlette
+ async-timeout==5.0.1
+     # via aiohttp
+ attrs==25.3.0
+     # via aiohttp
+ authlib==1.5.2
+     # via gradio
+ certifi==2025.1.31
+     # via
+     #   httpcore
+     #   httpx
+     #   requests
+ cffi==1.17.1
+     # via cryptography
+ charset-normalizer==3.4.1
+     # via requests
+ click==8.1.8
+     # via
+     #   typer
+     #   uvicorn
+ cryptography==44.0.2
+     # via authlib
+ datasets==3.5.0
+     # via iclr2025 (pyproject.toml)
+ dill==0.3.8
+     # via
+     #   datasets
+     #   multiprocess
+ exceptiongroup==1.2.2
+     # via anyio
+ fastapi==0.115.12
+     # via gradio
+ ffmpy==0.5.0
+     # via gradio
+ filelock==3.18.0
+     # via
+     #   datasets
+     #   huggingface-hub
+ frozenlist==1.5.0
+     # via
+     #   aiohttp
+     #   aiosignal
+ fsspec==2024.12.0
+     # via
+     #   datasets
+     #   gradio-client
+     #   huggingface-hub
+ gradio==5.25.2
+     # via
+     #   iclr2025 (pyproject.toml)
+     #   gradio-modal
+ gradio-client==1.8.0
+     # via gradio
+ gradio-modal==0.0.4
+     # via iclr2025 (pyproject.toml)
+ groovy==0.1.2
+     # via gradio
+ h11==0.14.0
+     # via
+     #   httpcore
+     #   uvicorn
+ hf-transfer==0.1.9
+     # via iclr2025 (pyproject.toml)
+ httpcore==1.0.8
+     # via httpx
+ httpx==0.28.1
+     # via
+     #   gradio
+     #   gradio-client
+     #   safehttpx
+ huggingface-hub==0.30.2
+     # via
+     #   datasets
+     #   gradio
+     #   gradio-client
+ idna==3.10
+     # via
+     #   anyio
+     #   httpx
+     #   requests
+     #   yarl
+ itsdangerous==2.2.0
+     # via gradio
+ jinja2==3.1.6
+     # via gradio
+ markdown-it-py==3.0.0
+     # via rich
+ markupsafe==3.0.2
+     # via
+     #   gradio
+     #   jinja2
+ mdurl==0.1.2
+     # via markdown-it-py
+ multidict==6.4.3
+     # via
+     #   aiohttp
+     #   yarl
+ multiprocess==0.70.16
+     # via datasets
+ numpy==2.2.4
+     # via
+     #   datasets
+     #   gradio
+     #   pandas
+ orjson==3.10.16
+     # via gradio
+ packaging==24.2
+     # via
+     #   datasets
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+ pandas==2.2.3
+     # via
+     #   datasets
+     #   gradio
+ pillow==11.2.1
+     # via gradio
+ polars==1.27.1
+     # via iclr2025 (pyproject.toml)
+ propcache==0.3.1
+     # via
+     #   aiohttp
+     #   yarl
+ pyarrow==19.0.1
+     # via datasets
+ pycparser==2.22
+     # via cffi
+ pydantic==2.11.3
+     # via
+     #   fastapi
+     #   gradio
+ pydantic-core==2.33.1
+     # via pydantic
+ pydub==0.25.1
+     # via gradio
+ pygments==2.19.1
+     # via rich
+ python-dateutil==2.9.0.post0
+     # via pandas
+ python-multipart==0.0.20
+     # via gradio
+ pytz==2025.2
+     # via pandas
+ pyyaml==6.0.2
+     # via
+     #   datasets
+     #   gradio
+     #   huggingface-hub
+ requests==2.32.3
+     # via
+     #   datasets
+     #   huggingface-hub
+ rich==14.0.0
+     # via typer
+ ruff==0.11.5
+     # via gradio
+ safehttpx==0.1.6
+     # via gradio
+ semantic-version==2.10.0
+     # via gradio
+ shellingham==1.5.4
+     # via typer
+ six==1.17.0
+     # via python-dateutil
+ sniffio==1.3.1
+     # via anyio
+ starlette==0.46.2
+     # via
+     #   fastapi
+     #   gradio
+ tomlkit==0.13.2
+     # via gradio
+ tqdm==4.67.1
+     # via
+     #   datasets
+     #   huggingface-hub
+ typer==0.15.2
+     # via gradio
+ typing-extensions==4.13.2
+     # via
+     #   anyio
+     #   fastapi
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   multidict
+     #   pydantic
+     #   pydantic-core
+     #   rich
+     #   typer
+     #   typing-inspection
+     #   uvicorn
+ typing-inspection==0.4.0
+     # via pydantic
+ tzdata==2025.2
+     # via pandas
+ urllib3==2.4.0
+     # via requests
+ uvicorn==0.34.1
+     # via gradio
+ websockets==15.0.1
+     # via gradio-client
+ xxhash==3.5.0
+     # via datasets
+ yarl==1.19.0
+     # via aiohttp
style.css ADDED
@@ -0,0 +1,4 @@
+ h1 {
+   text-align: center;
+   display: block;
+ }
table.py ADDED
@@ -0,0 +1,116 @@
+ import datasets
+ import polars as pl
+
+ BASE_REPO_ID = "ai-conferences/ICLR2025"
+ PATCH_REPO_ID = "ai-conferences/ICLR2025-patches"
+ PATCH_REPO_PR_BRANCH = "raw-jsons"
+ PAPER_PAGE_REPO_ID = "hysts-bot-data/paper-pages-slim"
+
+
+ def get_patch_latest_values(
+     df: pl.DataFrame, all_columns: list[str], id_col: str, timestamp_col: str = "timestamp"
+ ) -> pl.DataFrame:
+     df = df.sort(timestamp_col)
+     update_columns = [col for col in df.columns if col not in (id_col, timestamp_col)]
+
+     # Long format: one row per (timestamp, id, column, value); drop cells a patch left untouched.
+     melted = df.unpivot(on=update_columns, index=[timestamp_col, id_col])
+     melted = melted.drop_nulls()
+
+     # Keep the most recent value per (id, column), then widen back to one row per id.
+     latest_rows = (
+         melted.sort(timestamp_col)
+         .group_by([id_col, "variable"])
+         .agg(pl.col("value").last())
+         .pivot("variable", index=id_col, values="value")
+     )
+
+     for col in all_columns:
+         if col != id_col and col not in latest_rows.columns:
+             latest_rows = latest_rows.with_columns(pl.lit(None).alias(col))
+
+     return latest_rows.select([id_col] + [col for col in all_columns if col != id_col])
+
+
+ def format_author_claim_ratio(row: dict) -> str:
+     n_linked_authors = row["n_linked_authors"]
+     n_authors = row["n_authors"]
+
+     if n_linked_authors is None or n_authors is None:
+         return ""
+
+     author_linked = "✅" if n_linked_authors > 0 else ""
+     return f"{n_linked_authors}/{n_authors} {author_linked}".strip()
+
+
+ df_orig = (
+     datasets.load_dataset(BASE_REPO_ID, split="train")
+     .to_polars()
+     .rename({"paper_url": "openreview", "submission_number": "paper_id"})
+     .with_columns(
+         pl.lit([], dtype=pl.List(pl.Utf8)).alias(col_name) for col_name in ["space_ids", "model_ids", "dataset_ids"]
+     )
+ )
+ df_paper_page = (
+     datasets.load_dataset(PAPER_PAGE_REPO_ID, split="train")
+     .to_polars()
+     .drop(["summary", "author_names", "ai_keywords"])
+ )
+ df_orig = df_orig.join(df_paper_page, on="arxiv_id", how="left")
+
+ df_patches = datasets.load_dataset(PATCH_REPO_ID, revision="main", split="train").to_polars().drop("diff")
+ df_patches = get_patch_latest_values(df_patches, df_orig.columns, id_col="paper_id", timestamp_col="timestamp")
+ df_orig = (
+     df_orig.join(df_patches, on="paper_id", how="left")
+     .with_columns(
+         [pl.coalesce([pl.col(col + "_right"), pl.col(col)]).alias(col) for col in df_orig.columns if col != "paper_id"]
+     )
+     .select(df_orig.columns)
+ )
+
+ # format authors
+ df_orig = df_orig.with_columns(pl.col("authors").list.join(", ").alias("authors_str"))
+ # format links
+ df_orig = df_orig.with_columns(
+     [
+         pl.format("[link]({})", pl.col(col)).fill_null("").alias(f"{col}_md")
+         for col in ["openreview", "project_page", "github"]
+     ]
+ )
+ # format paper page link
+ df_orig = df_orig.with_columns(
+     (pl.lit("https://huggingface.co/papers/") + pl.col("arxiv_id")).alias("paper_page")
+ ).with_columns(pl.format("[{}]({})", pl.col("arxiv_id"), pl.col("paper_page")).fill_null("").alias("paper_page_md"))
+
+ # count authors
+ df_orig = df_orig.with_columns(pl.col("authors").list.len().alias("n_authors"))
+ df_orig = df_orig.with_columns(
+     pl.col("author_usernames")
+     .map_elements(lambda lst: sum(x is not None for x in lst) if lst is not None else None, return_dtype=pl.Int64)
+     .alias("n_linked_authors")
+ )
+ df_orig = df_orig.with_columns(
+     pl.struct(["n_linked_authors", "n_authors"])
+     .map_elements(format_author_claim_ratio, return_dtype=pl.Utf8)
+     .alias("claimed")
+ )
+
+ # TODO: Fix this once https://github.com/gradio-app/gradio/issues/10916 is fixed  # noqa: FIX002, TD002
+ # format numbers as strings
+ df_orig = df_orig.with_columns(
+     [pl.col(col).cast(pl.Utf8).fill_null("").alias(col) for col in ["upvotes", "num_comments"]]
+ )
+
+ # format spaces, models, datasets
+ for repo_id_col, markdown_col, base_url in [
+     ("space_ids", "Spaces", "https://huggingface.co/spaces/"),
+     ("model_ids", "Models", "https://huggingface.co/"),
+     ("dataset_ids", "Datasets", "https://huggingface.co/datasets/"),
+ ]:
+     df_orig = df_orig.with_columns(
+         pl.col(repo_id_col)
+         .map_elements(
+             lambda lst: "\n".join([f"[link]({base_url}{x})" for x in lst]) if lst is not None else None,  # noqa: B023
+             return_dtype=pl.Utf8,
+         )
+         .fill_null("")
+         .alias(markdown_col)
+     )
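To make the "latest patch wins" merge in `get_patch_latest_values` concrete, here is a toy run (it assumes the function has been copied out of `table.py`, since importing the module itself triggers the dataset downloads above):

```python
import polars as pl

# Two patches touch the same paper; a null means "field not changed by this patch".
patches = pl.DataFrame(
    {
        "paper_id": ["42", "42"],
        "timestamp": ["2025-01-01T00:00:00", "2025-02-01T00:00:00"],
        "github": ["https://github.com/old/repo", None],
        "project_page": [None, "https://example.com"],
    }
)
latest = get_patch_latest_values(patches, ["paper_id", "github", "project_page"], id_col="paper_id")
print(latest)
# One row for paper 42: github keeps the January value (the February patch
# left it null), while project_page takes the newer February value.
```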
uv.lock ADDED
The diff for this file is too large to render. See raw diff