asoria (HF Staff) committed
Commit aef303c · 1 Parent(s): 554bcd2

Get parquet names

Files changed (1): app.py (+28, −10)
app.py CHANGED
@@ -3,27 +3,40 @@ import duckdb
 from huggingface_hub import HfFileSystem
 from huggingface_hub.hf_file_system import safe_quote
 import pandas as pd
+import requests
 
+DATASETS_SERVER_ENDPOINT = "https://datasets-server.huggingface.co"
 PARQUET_REVISION="refs/convert/parquet"
 TABLE_WILDCARD="{table}"
 
 fs = HfFileSystem()
 duckdb.register_filesystem(fs)
 
-def greet(dataset, config, split, sql):
+def get_parquet_files(dataset, config, split):
+    response = requests.get(f"{DATASETS_SERVER_ENDPOINT}/parquet?dataset={dataset}&config={config}", timeout=60)
+    if response.status_code != 200:
+        raise Exception(response)
+
+    parquet_files = response.json()
+    file_names = [content["filename"] for content in parquet_files if content["split"] == split]
+    if len(file_names) == 0:
+        raise Exception("No parquet files found for dataset")
+    return file_names
+
+def run_command(dataset, config, split, sql):
     try:
         if TABLE_WILDCARD not in sql:
             raise Exception(f"Query must contain the {TABLE_WILDCARD} wildcard.")
-        # dataset="glue"
-        # config="mnli"
-        path=f"{config}/{dataset}-{split}.parquet" # Only from one split
-        location=f"hf://datasets/{dataset}@{safe_quote(PARQUET_REVISION)}/{path}"
+
+        parquet_files = get_parquet_files(dataset, config, split)
+        print(f"File names found: {','.join(parquet_files)}")
+        parquet_first_file = parquet_files[0]
+        print(f"Trying with the first one {parquet_first_file}")
+        location = f"hf://datasets/{dataset}@{safe_quote(PARQUET_REVISION)}/{config}/{parquet_first_file}"
         print(location)
         sql = sql.replace(TABLE_WILDCARD, f"'{location}'")
-
-        # result = duckdb.query(f"SELECT idx as id, premise as p FROM '{location}' LIMIT 2").to_df()
         result = duckdb.query(sql).to_df()
-        print("QUERY SUCCESSED")
+        print("Ok")
     except Exception as error:
         print(f"Error: {str(error)}")
         return pd.DataFrame({"Error": [f"❌ {str(error)}"]})
@@ -34,11 +47,16 @@ with gr.Blocks() as demo:
     dataset = gr.Textbox(label="dataset", placeholder="mstz/iris")
    config = gr.Textbox(label="config", placeholder="iris")
     split = gr.Textbox(label="split", placeholder="train")
-    sql = gr.Textbox(label="sql", placeholder=f"SELECT sepal_length FROM {TABLE_WILDCARD} LIMIT 3")
+    sql = gr.Textbox(
+        label="Query in SQL format",
+        placeholder=f"SELECT sepal_length FROM {TABLE_WILDCARD} LIMIT 3",
+        value=f"SELECT sepal_length FROM {TABLE_WILDCARD} LIMIT 3",
+        lines=3,
+    )
     run_button = gr.Button("Run")
     gr.Markdown("### Result")
     cached_responses_table = gr.DataFrame()
-    run_button.click(greet, inputs=[dataset, config, split, sql], outputs=cached_responses_table)
+    run_button.click(run_command, inputs=[dataset, config, split, sql], outputs=cached_responses_table)
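A note on the response shape consumed by get_parquet_files: as far as I know, the datasets-server /parquet endpoint wraps the file list in a "parquet_files" key, in which case the direct iteration over response.json() would need to become response.json()["parquet_files"]. Below is a minimal sketch of the split-filtering step under that assumption, run against a hand-written sample payload; the field values are illustrative, not taken from this commit.

# Sketch of the split filter in get_parquet_files, against a sample payload.
# The "parquet_files" wrapper key and all sample values are assumptions.
sample_response = {
    "parquet_files": [
        {"dataset": "mstz/iris", "config": "iris", "split": "train",
         "filename": "iris-train.parquet", "size": 2983},
        {"dataset": "mstz/iris", "config": "iris", "split": "test",
         "filename": "iris-test.parquet", "size": 1021},
    ]
}

split = "train"
file_names = [f["filename"] for f in sample_response["parquet_files"]
              if f["split"] == split]
assert file_names == ["iris-train.parquet"]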
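And a standalone sketch of the DuckDB side of run_command: HfFileSystem is registered as an fsspec filesystem so DuckDB can read hf:// URLs directly, and the query targets a parquet file under the refs/convert/parquet revision. The dataset, config directory, and filename below are illustrative (they mirror the mstz/iris placeholders in the UI) and are not guaranteed to exist under those exact names.

import duckdb
from huggingface_hub import HfFileSystem
from huggingface_hub.hf_file_system import safe_quote

# Register the Hugging Face filesystem so DuckDB can resolve hf:// paths.
fs = HfFileSystem()
duckdb.register_filesystem(fs)

# Illustrative location: config directory + parquet filename under the
# auto-conversion revision. Real filenames come from get_parquet_files().
revision = safe_quote("refs/convert/parquet")
location = f"hf://datasets/mstz/iris@{revision}/iris/iris-train.parquet"

df = duckdb.query(f"SELECT sepal_length FROM '{location}' LIMIT 3").to_df()
print(df)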