lhoestq HF Staff commited on
Commit
74560e6
·
1 Parent(s): 9088a0f
Files changed (2) hide show
  1. run_job.py +8 -5
  2. start_app.py +8 -2
run_job.py CHANGED
@@ -27,11 +27,13 @@ COPY ({query}) to 'tmp' (FORMAT PARQUET, ROW_GROUP_SIZE_BYTES '100MB', ROW_GROUP
27
  CMD_SRC_DRY_RUN = CMD_SRC[:-1] + " LIMIT 5;"
28
  CMD_DST_DRY_RUN = "{query};"
29
 
 
 
30
  def sql(src: str, dst: str, query: str, config: str = "default", split: str = "train", private: bool = False, dry_run: bool = False):
31
  import os
32
  import duckdb
33
  from contextlib import nullcontext
34
- from huggingface_hub import CommitScheduler
35
 
36
  class CommitAndCleanScheduler(CommitScheduler):
37
 
@@ -58,10 +60,11 @@ def sql(src: str, dst: str, query: str, config: str = "default", split: str = "t
58
  con.sql("PRAGMA enable_progress_bar;")
59
 
60
  result = con.sql((CMD_DST_DRY_RUN if dry_run else CMD_DST).format(query=query.rstrip("\n ;")))
61
- if dry_run:
62
- print(result.df().to_markdown())
63
- else:
64
- print("done")
 
65
 
66
 
67
  if __name__ == '__main__':
 
27
  CMD_SRC_DRY_RUN = CMD_SRC[:-1] + " LIMIT 5;"
28
  CMD_DST_DRY_RUN = "{query};"
29
 
30
+ DATA_CARD = "# Dataset Card for {dst}\n\nDataset prepared from {src} using\n\n```\n{query}\n```\n"
31
+
32
  def sql(src: str, dst: str, query: str, config: str = "default", split: str = "train", private: bool = False, dry_run: bool = False):
33
  import os
34
  import duckdb
35
  from contextlib import nullcontext
36
+ from huggingface_hub import CommitScheduler, DatasetCard
37
 
38
  class CommitAndCleanScheduler(CommitScheduler):
39
 
 
60
  con.sql("PRAGMA enable_progress_bar;")
61
 
62
  result = con.sql((CMD_DST_DRY_RUN if dry_run else CMD_DST).format(query=query.rstrip("\n ;")))
63
+ DatasetCard(DATA_CARD.format(src=src, dst=dst, query=query)).push_to_hub(repo_id=dst, repo_type="dataset")
64
+ if dry_run:
65
+ print(result.df().to_markdown())
66
+ else:
67
+ print("done")
68
 
69
 
70
  if __name__ == '__main__':
start_app.py CHANGED
@@ -79,12 +79,18 @@ def run(src, config, split, dst, query, oauth_token: gr.OAuthToken | None, profi
79
  job_id = resp.json()["metadata"]["job_id"]
80
  resp = requests.get(
81
  f"https://huggingface.co/api/jobs/{username}/{job_id}/logs-stream",
82
- headers={"Authorization": f"Bearer {token}"}
83
  )
84
  for line in iter(resp.raw.readline, b""):
85
  logs += parse_log(line.decode(), pbars=pbars)
86
  yield {output_markdown: logs, progress_labels: gr.Label(pbars, visible=bool(pbars))}
87
- pbars = {"Finished" + (" ✅" if process.returncode == 0 else " with an error ❌"): 1.0}
 
 
 
 
 
 
 
88
  yield {output_markdown: logs, progress_labels: gr.Label(pbars, visible=bool(pbars))}
89
 
90
  READ_FUNCTIONS = ("pl.read_parquet", "pl.read_csv", "pl.read_json")
 
79
  job_id = resp.json()["metadata"]["job_id"]
80
  resp = requests.get(
81
  f"https://huggingface.co/api/jobs/{username}/{job_id}/logs-stream",
 
82
  )
83
  for line in iter(resp.raw.readline, b""):
84
  logs += parse_log(line.decode(), pbars=pbars)
85
  yield {output_markdown: logs, progress_labels: gr.Label(pbars, visible=bool(pbars))}
86
+ job_status = requests.get(
87
+ f"https://huggingface.co/api/jobs/{username}/{job_id}",
88
+ ).json()
89
+ if job_status["status"]["stage"] == "COMPLETED":
90
+ pbars = {"Finished ✅": 1.0}
91
+ else:
92
+ logs += f'{job_status["status"]["message"]} ({job_status["status"]["error"]})'
93
+ pbars = {"Finished with an error ❌": 1.0}
94
  yield {output_markdown: logs, progress_labels: gr.Label(pbars, visible=bool(pbars))}
95
 
96
  READ_FUNCTIONS = ("pl.read_parquet", "pl.read_csv", "pl.read_json")