Spaces:
Running
on
Zero
Running
on
Zero
Fix
Browse files
table.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import datasets
|
2 |
import polars as pl
|
3 |
from loguru import logger
|
|
|
4 |
|
5 |
BASE_REPO_ID = "ai-conferences/ICLR2025"
|
6 |
PATCH_REPO_ID = "ai-conferences/ICLR2025-patches"
|
@@ -8,13 +9,22 @@ PAPER_PAGE_REPO_ID = "hysts-bot-data/paper-pages-slim"
|
|
8 |
|
9 |
|
10 |
def get_patch_latest_values(
|
11 |
-
df: pl.DataFrame, all_columns: list[str], id_col: str, timestamp_col: str = "timestamp"
|
12 |
) -> pl.DataFrame:
|
13 |
df = df.sort(timestamp_col)
|
14 |
-
update_columns = [col for col in df.columns if col not in (id_col, timestamp_col)]
|
15 |
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
latest_rows = (
|
20 |
melted.sort(timestamp_col)
|
@@ -23,9 +33,16 @@ def get_patch_latest_values(
|
|
23 |
.pivot("variable", index=id_col, values="value")
|
24 |
)
|
25 |
|
26 |
-
|
27 |
-
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
return latest_rows.select([id_col] + [col for col in all_columns if col != id_col])
|
31 |
|
|
|
1 |
import datasets
|
2 |
import polars as pl
|
3 |
from loguru import logger
|
4 |
+
from polars import datatypes as pdt
|
5 |
|
6 |
BASE_REPO_ID = "ai-conferences/ICLR2025"
|
7 |
PATCH_REPO_ID = "ai-conferences/ICLR2025-patches"
|
|
|
9 |
|
10 |
|
11 |
def get_patch_latest_values(
|
12 |
+
df: pl.DataFrame, all_columns: list[str], id_col: str, timestamp_col: str = "timestamp", delimiter: str = ","
|
13 |
) -> pl.DataFrame:
|
14 |
df = df.sort(timestamp_col)
|
|
|
15 |
|
16 |
+
list_cols = [
|
17 |
+
col for col, dtype in df.schema.items() if col not in (id_col, timestamp_col) and dtype.base_type() is pdt.List
|
18 |
+
]
|
19 |
+
df = df.with_columns(
|
20 |
+
[
|
21 |
+
pl.when(pl.col(c).is_not_null()).then(pl.col(c).list.join(delimiter)).otherwise(None).alias(c)
|
22 |
+
for c in list_cols
|
23 |
+
]
|
24 |
+
)
|
25 |
+
|
26 |
+
update_columns = [col for col in df.columns if col not in (id_col, timestamp_col)]
|
27 |
+
melted = df.unpivot(on=update_columns, index=[timestamp_col, id_col]).drop_nulls()
|
28 |
|
29 |
latest_rows = (
|
30 |
melted.sort(timestamp_col)
|
|
|
33 |
.pivot("variable", index=id_col, values="value")
|
34 |
)
|
35 |
|
36 |
+
latest_rows = latest_rows.with_columns(
|
37 |
+
[
|
38 |
+
pl.when(pl.col(c).is_not_null()).then(pl.col(c).str.split(delimiter)).otherwise(None).alias(c)
|
39 |
+
for c in list_cols
|
40 |
+
]
|
41 |
+
)
|
42 |
+
|
43 |
+
missing_cols = [c for c in all_columns if c not in latest_rows.columns and c != id_col]
|
44 |
+
if missing_cols:
|
45 |
+
latest_rows = latest_rows.with_columns([pl.lit(None).alias(c) for c in missing_cols])
|
46 |
|
47 |
return latest_rows.select([id_col] + [col for col in all_columns if col != id_col])
|
48 |
|