File size: 1,798 Bytes
a00be78
 
 
033af05
a00be78
 
033af05
a00be78
033af05
a00be78
 
033af05
a00be78
44cb622
 
 
a00be78
 
 
 
 
 
 
 
 
44cb622
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a00be78
44cb622
a00be78
 
 
 
 
 
 
 
 
 
 
 
 
 
44cb622
a00be78
 
44cb622
 
 
a00be78
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from huggingface_hub import HfApi
import pandas as pd
import gradio as gr
import duckdb
import requests

# Root URL that get_dataset_ddl() prefixes to its "/parquet" request.
BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"

# Hub API client, created once at import time (not referenced in the visible code).
hf_api = HfApi()
# Module-level DuckDB connection opened without a database path; get_dataset_ddl()
# runs its statements on it, so the view it creates is shared by every call.
conn = duckdb.connect()


# Upper bound (seconds) on the datasets-server request so a stalled call
# cannot block the UI callback forever.
_REQUEST_TIMEOUT_SECONDS = 30


def get_dataset_ddl(dataset_id: str) -> str:
    """Build a ``CREATE TABLE`` statement describing a Hub dataset's columns.

    Asks the datasets server for the dataset's parquet files, exposes the
    first one as the DuckDB view ``dataset_view`` on the module-level
    connection, and renders that view's column names and types as DDL text.

    Args:
        dataset_id: Hub dataset identifier, e.g. ``"user/dataset"``.

    Returns:
        The DDL text (a ``CREATE TABLE dataset_view (...)`` statement).

    Raises:
        ValueError: If ``dataset_id`` is empty, the server lists no parquet
            files, or the first parquet entry has no URL.
        requests.HTTPError: If the datasets server answers with an error status.
    """
    if not dataset_id:
        raise ValueError("A dataset id is required.")

    view_name = "dataset_view"

    # Passing the id through ``params`` lets requests URL-encode it instead of
    # splicing raw user input into the query string.
    response = requests.get(
        f"{BASE_DATASETS_SERVER_URL}/parquet",
        params={"dataset": dataset_id},
        timeout=_REQUEST_TIMEOUT_SECONDS,
    )
    response.raise_for_status()  # Check if the request was successful

    parquet_files = response.json().get("parquet_files") or []
    if not parquet_files:
        # Previously an empty list surfaced as a bare IndexError.
        raise ValueError(f"No parquet files found for dataset '{dataset_id}'.")

    first_parquet_url = parquet_files[0].get("url")
    if not first_parquet_url:
        raise ValueError("No valid URL found for the first parquet file.")

    # The URL ends up inside a single-quoted SQL literal; double any embedded
    # quote so it cannot terminate the literal early.
    escaped_url = first_parquet_url.replace("'", "''")
    conn.execute(
        f"CREATE OR REPLACE VIEW {view_name} as SELECT * FROM read_parquet('{escaped_url}');"
    )
    dataset_ddl = conn.execute(f"PRAGMA table_info('{view_name}');").fetchall()

    # Each row carries the column name at index 1 and its type at index 2.
    column_data_types = ",\n\t".join(
        f"{column[1]} {column[2]}" for column in dataset_ddl
    )

    sql_ddl = """
CREATE TABLE {} (
    {}
);
    """.format(
        view_name, column_data_types
    )

    return sql_ddl


# Declarative UI: components are registered on ``demo`` in the order they are
# created inside the ``with`` block.
with gr.Blocks() as demo:
    gr.Markdown("# Query your HF Datasets with Natural Language 📈📊")
    # Hub search box used to pick the dataset; its value is the dataset id.
    dataset_name = HuggingfaceHubSearch(
        label="Hub Dataset ID",
        placeholder="Find your favorite dataset...",
        search_type="dataset",
        value="jamescalam/world-cities-geo",
    )
    # Free-text question field (rendered, but not wired to any callback here).
    query_input = gr.Textbox("", label="Ask anything...")

    btn = gr.Button("Ask 🪄")
    # Results table (rendered, but not wired to any callback here).
    df = gr.DataFrame(datatype="markdown")
    # Text field that receives the generated DDL.
    ddl = gr.Text("")

    # Clicking the button sends the selected dataset id to get_dataset_ddl and
    # shows the returned DDL text in ``ddl``.
    btn.click(
        get_dataset_ddl,
        inputs=[dataset_name],
        outputs=[ddl],
    )


# Launch the Gradio app only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    demo.launch()