Spaces:

asoria
/

duckdb-parquet-demo

Runtime error

asoria HF Staff committed on May 24, 2023

Commit

554bcd2

1 Parent(s): 9b95e7f

Adding parameters

Files changed (1) hide show

app.py CHANGED Viewed

@@ -4,19 +4,25 @@ from huggingface_hub import HfFileSystem
 from huggingface_hub.hf_file_system import safe_quote
 import pandas as pd
-fs = HfFileSystem()
-duckdb.register_filesystem(fs)
-dataset="glue"
 PARQUET_REVISION="refs/convert/parquet"
-path=f"mnli/glue-train.parquet" # Only from one split
-# path="mnli/*.parquet" # To read all parquets but it should be grouped by split getting from datasets server
-location=f"hf://datasets/{dataset}@{safe_quote(PARQUET_REVISION)}/{path}"
-print(location)
 def greet(dataset, config, split, sql):
     try:
-        result = duckdb.query(f"SELECT idx as id, premise as p FROM '{location}' LIMIT 2").to_df()
         print("QUERY SUCCESSED")
     except Exception as error:
         print(f"Error: {str(error)}")
@@ -28,7 +34,7 @@ with gr.Blocks() as demo:
     dataset = gr.Textbox(label="dataset", placeholder="mstz/iris")
     config = gr.Textbox(label="config", placeholder="iris")
     split = gr.Textbox(label="split", placeholder="train")
-    sql = gr.Textbox(label="sql", placeholder="SELECT 1")
     run_button = gr.Button("Run")
     gr.Markdown("### Result")
     cached_responses_table = gr.DataFrame()

 from huggingface_hub.hf_file_system import safe_quote
 import pandas as pd
 PARQUET_REVISION="refs/convert/parquet"
+TABLE_WILDCARD="{table}"
+fs = HfFileSystem()
+duckdb.register_filesystem(fs)
 def greet(dataset, config, split, sql):
     try:
+        if TABLE_WILDCARD not in sql:
+            raise Exception(f"Query must contains {TABLE_WILDCARD} wildcard.")
+        # dataset="glue"
+        # config="mnli"
+        path=f"{config}/{dataset}-{split}.parquet" # Only from one split
+        location=f"hf://datasets/{dataset}@{safe_quote(PARQUET_REVISION)}/{path}"
+        print(location)
+        sql = sql.replace(TABLE_WILDCARD, f"'{location}'")
+        # result = duckdb.query(f"SELECT idx as id, premise as p FROM '{location}' LIMIT 2").to_df()
+        result = duckdb.query(sql).to_df()
         print("QUERY SUCCESSED")
     except Exception as error:
         print(f"Error: {str(error)}")
     dataset = gr.Textbox(label="dataset", placeholder="mstz/iris")
     config = gr.Textbox(label="config", placeholder="iris")
     split = gr.Textbox(label="split", placeholder="train")
+    sql = gr.Textbox(label="sql", placeholder=f"SELECT sepal_length FROM {TABLE_WILDCARD} LIMIT 3")
     run_button = gr.Button("Run")
     gr.Markdown("### Result")
     cached_responses_table = gr.DataFrame()