hysts HF Staff committed on
Commit
725bd04
·
1 Parent(s): 1c00c70
Files changed (1) hide show
  1. table.py +24 -7
table.py CHANGED
@@ -1,6 +1,7 @@
1
  import datasets
2
  import polars as pl
3
  from loguru import logger
 
4
 
5
  BASE_REPO_ID = "ai-conferences/ICLR2025"
6
  PATCH_REPO_ID = "ai-conferences/ICLR2025-patches"
@@ -8,13 +9,22 @@ PAPER_PAGE_REPO_ID = "hysts-bot-data/paper-pages-slim"
8
 
9
 
10
  def get_patch_latest_values(
11
- df: pl.DataFrame, all_columns: list[str], id_col: str, timestamp_col: str = "timestamp"
12
  ) -> pl.DataFrame:
13
  df = df.sort(timestamp_col)
14
- update_columns = [col for col in df.columns if col not in (id_col, timestamp_col)]
15
 
16
- melted = df.unpivot(on=update_columns, index=[timestamp_col, id_col])
17
- melted = melted.drop_nulls()
 
 
 
 
 
 
 
 
 
 
18
 
19
  latest_rows = (
20
  melted.sort(timestamp_col)
@@ -23,9 +33,16 @@ def get_patch_latest_values(
23
  .pivot("variable", index=id_col, values="value")
24
  )
25
 
26
- for col in all_columns:
27
- if col != "id" and col not in latest_rows.columns:
28
- latest_rows = latest_rows.with_columns(pl.lit(None).alias(col))
 
 
 
 
 
 
 
29
 
30
  return latest_rows.select([id_col] + [col for col in all_columns if col != id_col])
31
 
 
1
  import datasets
2
  import polars as pl
3
  from loguru import logger
4
+ from polars import datatypes as pdt
5
 
6
  BASE_REPO_ID = "ai-conferences/ICLR2025"
7
  PATCH_REPO_ID = "ai-conferences/ICLR2025-patches"
 
9
 
10
 
11
  def get_patch_latest_values(
12
+ df: pl.DataFrame, all_columns: list[str], id_col: str, timestamp_col: str = "timestamp", delimiter: str = ","
13
  ) -> pl.DataFrame:
14
  df = df.sort(timestamp_col)
 
15
 
16
+ list_cols = [
17
+ col for col, dtype in df.schema.items() if col not in (id_col, timestamp_col) and dtype.base_type() is pdt.List
18
+ ]
19
+ df = df.with_columns(
20
+ [
21
+ pl.when(pl.col(c).is_not_null()).then(pl.col(c).list.join(delimiter)).otherwise(None).alias(c)
22
+ for c in list_cols
23
+ ]
24
+ )
25
+
26
+ update_columns = [col for col in df.columns if col not in (id_col, timestamp_col)]
27
+ melted = df.unpivot(on=update_columns, index=[timestamp_col, id_col]).drop_nulls()
28
 
29
  latest_rows = (
30
  melted.sort(timestamp_col)
 
33
  .pivot("variable", index=id_col, values="value")
34
  )
35
 
36
+ latest_rows = latest_rows.with_columns(
37
+ [
38
+ pl.when(pl.col(c).is_not_null()).then(pl.col(c).str.split(delimiter)).otherwise(None).alias(c)
39
+ for c in list_cols
40
+ ]
41
+ )
42
+
43
+ missing_cols = [c for c in all_columns if c not in latest_rows.columns and c != id_col]
44
+ if missing_cols:
45
+ latest_rows = latest_rows.with_columns([pl.lit(None).alias(c) for c in missing_cols])
46
 
47
  return latest_rows.select([id_col] + [col for col in all_columns if col != id_col])
48