File size: 6,754 Bytes
80116c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f800a8a
80116c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1d042d1
80116c5
 
 
 
 
 
 
 
 
 
 
 
 
eec2452
80116c5
 
 
 
 
 
 
 
 
 
 
 
 
f800a8a
 
 
80116c5
 
 
 
 
 
 
f800a8a
80116c5
 
 
 
db0778e
80116c5
 
 
 
 
 
 
 
508c34a
80116c5
 
 
 
db0778e
80116c5
 
 
3a4bac0
1d042d1
3a4bac0
80116c5
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from google import genai
client = genai.Client(api_key="AIzaSyD6voSAiSUim17kB90skpdisMMyFXZPxMo")
MODEL_ID = "gemini-2.0-flash-exp"
def model_response(text):

    response = client.models.generate_content(
    model=MODEL_ID,
    contents=text
)
    return response.text

def generate_dataset_queries(dataset_id,generated_glossary, schema_relationships):
  queries = model_response(f'''You are an expert in drafting BQ queries. Look at the dataset: {dataset_id}, look at the glossary: {generated_glossary} and {schema_relationships} and 
  recommend interesting data exploration queries. Format:
  Query description in the form of a single line question
  Actual query''')
  return queries

def generate_lookml(dataset_id,generated_glossary, schema_relationships):
  lookml = model_response(f'''You are an expert in drafting LookML models. Look at the dataset: {dataset_id}, look at the glossary: {generated_glossary} and {schema_relationships} and 
  recommend the LookML semantic model corresponding to the dataset. ''')
  return lookml

def run(DATASET_ID):
  dataset_description = None
  py = None
  schema_relations = None
  generated_glossary = None
  queries = None
  lookml = None
  lookml_explore = None
  dataset = model_response(f'''You are an expert in BQ public datasets. Generate a dataset schema related to {DATASET_ID}. You need to come up with atleast 5 tables with each table
containing atleast 10 columns along with their descriptions.Ensure that these tables have columns that talk about data quality issues.''')
  dataset_description = model_response(f'''Generate a succinct 3-4 line description of the dataset: {dataset}.''')
  yield dataset_description, None, None, None, None, None, None

  #get_table_doc(PROJECT_ID, DATASET_ID)

  py = model_response(f'''Based on the dataset provided: {dataset}, identify all the possible relationships
    that exist between the tables in the dataset. Discover these relationships from
    the point of view of data exploration.
    Output:
    List of relationships along with the description which is the business value of the relationship and a query
    with description that validates the relationship.

    Ensure that the column names and table names are accurate.''')
  yield dataset_description, py, schema_relations, None, None, None, None

  
  schema_relations = model_response(f'''Based on the context: {py}, generate a knowledge graph represented using ASCII art. Also generate a brief description of the graph.
    Output:
    Description of the graph listing all the relationships in markdown format
    ASCII version of the knowledge graph with nodes represented by tables and edges represented by the relationships. Edges should be annotated with the type of relationships identified - many-to-one, many-to-many, one-to-one, primary key, self joins, foreign keys etc''')
  yield dataset_description, py, schema_relations, None, None, None, None
  generated_glossary = model_response(f'''Based on the relationships identified: {schema_relations}
    and the dataset: {dataset_description}, generate glossary terms that will help business users easily find the tables in the dataset.
    ## Task
  - Your goal is to create a business glossary for the data in this dataset, aligned with the definition of business glossary specified above.
  - Provide each business term in a newline, along with the definition.
  - Include examples in the term definitions, wherever suitable.
  - Make sure the business terms are relevant as per the table and column names and descriptions, and relevant to the domain to which the data belongs.
  - Also include a few business terms around the users/clients and around 5 key metrics in the domain of the data.
  - After defining the terms, identify the relationships between the business terms identified previously.

  ## Output format
  Ensure that the output is in markdown format with proper indentation
  - Output each business glossary term definition in a newline in the folowing format:
  term: definition
  - For the business terms which are the key metrics in the business domain, mark such terms by adding "[METRIC]" in the beginning of the line, in the following format:
  [METRIC] term: definition
  - Then print a header to indicate the end of this section and start of the relationships section.
  - Then output the relationships between the business terms as follows:
  term -> [related_term1, related_term2]
    Show the relationship between the glossary term and the column broken down by each table.
    ''')
  yield dataset_description, py, schema_relations, generated_glossary, None, None, None
  queries = generate_dataset_queries(dataset, generated_glossary, schema_relations)
  yield dataset_description, py, schema_relations, generated_glossary, queries, None, None
  lookml = generate_lookml(dataset, generated_glossary, schema_relations)
  yield dataset_description, py, schema_relations, generated_glossary, queries, lookml 
  lookml_explore = model_response(f'''Given the dataset: {dataset}, schema relationships: {py} and graph:{schema_relations}, generate a data preparation pipeline that
  fixes the possible data quality issues across the tables in the dataset.''')
  yield dataset_description, py, schema_relations, generated_glossary, queries, lookml, lookml_explore
  return dataset_description, py, schema_relations, generated_glossary, queries, lookml, lookml_explore

# Modify the wrapper function to yield a tuple for Gradio outputs
def wrapper(dataset_id):
    for outputs in run(dataset_id):
        yield (
            outputs[0],
            outputs[1],
            outputs[2],  # Schema Relationships
            outputs[3],  # Generated Glossary
            outputs[4],  # Queries
            outputs[5] # LookML Model
        )

import gradio as gr

iface = gr.Interface(
        fn=wrapper,
        inputs=gr.Textbox(label="Dataset ID"),
        outputs=[
            gr.Markdown(label="Dataset description"),
            gr.Markdown(label="Knowledge Graph"),
            gr.Markdown(label="Schema Relationships"),
            gr.Markdown(label="Generated Glossary"),
            gr.Textbox(label="Queries"),
            gr.Markdown(label="LookML Model")
        ],
        live=False,
        theme = gr.themes.Ocean(),
        title="BQ knowledge engine ⚙️💡📊 (Simulator)",
        description="Provide a dataset ID to generate LookML, schema relationships, glossary, and more...", examples=['ncaa_basketball2', 'thelook_ecommerce','geo_openstreetmap','google_political_ads','noaa_historic_severe_storms','stackoverflow'],
        article = "This is a simulator that provides a sneak-peek into how BQ knowledge engine works."
    )
# Launch the app
iface.launch(share=True, debug=True)