Spaces:
Runtime error
Runtime error
Get parquet names
Browse files
app.py
CHANGED
@@ -3,27 +3,40 @@ import duckdb
|
|
3 |
from huggingface_hub import HfFileSystem
|
4 |
from huggingface_hub.hf_file_system import safe_quote
|
5 |
import pandas as pd
|
|
|
6 |
|
|
|
7 |
PARQUET_REVISION="refs/convert/parquet"
|
8 |
TABLE_WILDCARD="{table}"
|
9 |
|
10 |
fs = HfFileSystem()
|
11 |
duckdb.register_filesystem(fs)
|
12 |
|
13 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
try:
|
15 |
if TABLE_WILDCARD not in sql:
|
16 |
raise Exception(f"Query must contains {TABLE_WILDCARD} wildcard.")
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
|
|
|
|
21 |
print(location)
|
22 |
sql = sql.replace(TABLE_WILDCARD, f"'{location}'")
|
23 |
-
|
24 |
-
# result = duckdb.query(f"SELECT idx as id, premise as p FROM '{location}' LIMIT 2").to_df()
|
25 |
result = duckdb.query(sql).to_df()
|
26 |
-
print("
|
27 |
except Exception as error:
|
28 |
print(f"Error: {str(error)}")
|
29 |
return pd.DataFrame({"Error": [f"❌ {str(error)}"]})
|
@@ -34,11 +47,16 @@ with gr.Blocks() as demo:
|
|
34 |
dataset = gr.Textbox(label="dataset", placeholder="mstz/iris")
|
35 |
config = gr.Textbox(label="config", placeholder="iris")
|
36 |
split = gr.Textbox(label="split", placeholder="train")
|
37 |
-
sql = gr.Textbox(
|
|
|
|
|
|
|
|
|
|
|
38 |
run_button = gr.Button("Run")
|
39 |
gr.Markdown("### Result")
|
40 |
cached_responses_table = gr.DataFrame()
|
41 |
-
run_button.click(
|
42 |
|
43 |
|
44 |
|
|
|
3 |
from huggingface_hub import HfFileSystem
|
4 |
from huggingface_hub.hf_file_system import safe_quote
|
5 |
import pandas as pd
|
6 |
+
import requests
|
7 |
|
8 |
+
DATASETS_SERVER_ENDPOINT = "https://datasets-server.huggingface.co"
|
9 |
PARQUET_REVISION="refs/convert/parquet"
|
10 |
TABLE_WILDCARD="{table}"
|
11 |
|
12 |
fs = HfFileSystem()
|
13 |
duckdb.register_filesystem(fs)
|
14 |
|
15 |
+
def get_parquet_files(dataset, config, split):
    """Return the parquet file names published for one split of a dataset.

    Queries the datasets-server ``/parquet`` endpoint for ``dataset`` and
    ``config`` and keeps the entries whose ``split`` field equals ``split``.

    Args:
        dataset: Dataset repository id, e.g. ``"mstz/iris"``.
        config: Configuration name sent to the endpoint.
        split: Split name used to filter the returned entries.

    Returns:
        A non-empty list with the ``filename`` of every matching entry.

    Raises:
        Exception: If the HTTP status is not 200, or if no entry matches
            ``split``.
    """
    # Let requests build the query string so both values are URL-escaped.
    response = requests.get(
        f"{DATASETS_SERVER_ENDPOINT}/parquet",
        params={"dataset": dataset, "config": config},
        timeout=60,
    )
    if response.status_code != 200:
        raise Exception(response)

    payload = response.json()
    # The endpoint wraps the listing in a "parquet_files" key; a bare list
    # is still accepted so either payload shape works.
    if isinstance(payload, dict):
        parquet_files = payload.get("parquet_files", [])
    else:
        parquet_files = payload

    file_names = [
        content["filename"]
        for content in parquet_files
        if content["split"] == split
    ]
    if not file_names:
        raise Exception("No parquet files found for dataset")
    return file_names
|
25 |
+
|
26 |
+
def run_command(dataset, config, split, sql):
|
27 |
try:
|
28 |
if TABLE_WILDCARD not in sql:
|
29 |
raise Exception(f"Query must contains {TABLE_WILDCARD} wildcard.")
|
30 |
+
|
31 |
+
parquet_files = get_parquet_files(dataset, config, split)
|
32 |
+
print(f"File names found: {','.join(parquet_files)}")
|
33 |
+
parquet_first_file = parquet_files[0]
|
34 |
+
print(f"Trying with the first one {parquet_first_file}")
|
35 |
+
location=f"hf://datasets/{dataset}@{safe_quote(PARQUET_REVISION)}/{config}/{parquet_first_file}"
|
36 |
print(location)
|
37 |
sql = sql.replace(TABLE_WILDCARD, f"'{location}'")
|
|
|
|
|
38 |
result = duckdb.query(sql).to_df()
|
39 |
+
print("Ok")
|
40 |
except Exception as error:
|
41 |
print(f"Error: {str(error)}")
|
42 |
return pd.DataFrame({"Error": [f"❌ {str(error)}"]})
|
|
|
47 |
dataset = gr.Textbox(label="dataset", placeholder="mstz/iris")
|
48 |
config = gr.Textbox(label="config", placeholder="iris")
|
49 |
split = gr.Textbox(label="split", placeholder="train")
|
50 |
+
sql = gr.Textbox(
|
51 |
+
label="Query in sql format",
|
52 |
+
placeholder=f"SELECT sepal_length FROM {TABLE_WILDCARD} LIMIT 3",
|
53 |
+
value=f"SELECT sepal_length FROM {TABLE_WILDCARD} LIMIT 3",
|
54 |
+
lines=3,
|
55 |
+
)
|
56 |
run_button = gr.Button("Run")
|
57 |
gr.Markdown("### Result")
|
58 |
cached_responses_table = gr.DataFrame()
|
59 |
+
run_button.click(run_command, inputs=[dataset, config, split, sql], outputs=cached_responses_table)
|
60 |
|
61 |
|
62 |
|