from gradio_huggingfacehub_search import HuggingfaceHubSearch
from huggingface_hub import HfApi
import pandas as pd
import gradio as gr
import duckdb
import requests

BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
# Upper bound (seconds) on the datasets-server request so a stalled
# connection cannot block the UI callback forever.
_REQUEST_TIMEOUT_SECONDS = 30

hf_api = HfApi()
conn = duckdb.connect()


def get_dataset_ddl(dataset_id: str) -> str:
    """Return a ``CREATE TABLE`` statement describing a Hub dataset.

    The first parquet file listed by the datasets server for *dataset_id*
    is exposed as the DuckDB view ``dataset_view`` on the module-level
    connection, and the view's columns are rendered as SQL DDL text.

    Args:
        dataset_id: Hub dataset identifier, e.g. ``"user/dataset"``.

    Returns:
        A ``CREATE TABLE dataset_view (...)`` string with one
        ``<name> <type>`` entry per column.

    Raises:
        requests.HTTPError: If the datasets server answers with an error
            status.
        ValueError: If the response lists no parquet files, or the first
            entry carries no URL.
    """
    view_name = "dataset_view"

    # ``params=`` lets requests URL-encode the dataset id instead of
    # splicing it raw into the query string.
    response = requests.get(
        f"{BASE_DATASETS_SERVER_URL}/parquet",
        params={"dataset": dataset_id},
        timeout=_REQUEST_TIMEOUT_SECONDS,
    )
    response.raise_for_status()  # Check if the request was successful

    parquet_files = response.json().get("parquet_files") or []
    if not parquet_files:
        # Previously surfaced as a bare IndexError from ``[][0]``.
        raise ValueError(f"No parquet files found for dataset '{dataset_id}'.")

    first_parquet_url = parquet_files[0].get("url")
    if not first_parquet_url:
        raise ValueError("No valid URL found for the first parquet file.")

    # The URL is embedded in a SQL string literal; double any single quote
    # so it cannot terminate the literal early.
    escaped_url = first_parquet_url.replace("'", "''")
    conn.execute(
        f"CREATE OR REPLACE VIEW {view_name} as "
        f"SELECT * FROM read_parquet('{escaped_url}');"
    )

    # Each table_info row is indexed below as [1] = column name,
    # [2] = column type.
    dataset_ddl = conn.execute(f"PRAGMA table_info('{view_name}');").fetchall()
    column_data_types = ",\n\t".join(
        f"{column[1]} {column[2]}" for column in dataset_ddl
    )

    sql_ddl = "\nCREATE TABLE {} (\n\t{}\n);\n".format(
        view_name, column_data_types
    )
    return sql_ddl


with gr.Blocks() as demo:
    gr.Markdown("# Query your HF Datasets with Natural Language 📈📊")
    dataset_name = HuggingfaceHubSearch(
        label="Hub Dataset ID",
        placeholder="Find your favorite dataset...",
        search_type="dataset",
        value="jamescalam/world-cities-geo",
    )
    query_input = gr.Textbox("", label="Ask anything...")
    btn = gr.Button("Ask 🪄")
    df = gr.DataFrame(datatype="markdown")
    ddl = gr.Text("")

    # Clicking the button renders the selected dataset's schema as DDL text.
    btn.click(
        get_dataset_ddl,
        inputs=[dataset_name],
        outputs=[ddl],
    )


if __name__ == "__main__":
    demo.launch()