Spaces:
Running
Running
import os

from google import genai
# The Gemini API key is read from the environment instead of being embedded in
# the source, so the secret is never committed or published with the app.
# NOTE(review): the key that was previously hard-coded here should be treated
# as leaked and revoked.
_API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _API_KEY:
    raise RuntimeError(
        "No Gemini API key found: set the GEMINI_API_KEY (or GOOGLE_API_KEY) "
        "environment variable before starting the app."
    )

# Shared client used by every model call in this module.
client = genai.Client(api_key=_API_KEY)

# Model used for all generate_content requests.
MODEL_ID = "gemini-2.0-flash-exp"
def model_response(text):
    """Send ``text`` as the prompt to ``MODEL_ID`` and return the reply text.

    Args:
        text: Prompt passed as ``contents`` to ``generate_content``.

    Returns:
        The ``text`` attribute of the response object.
    """
    reply = client.models.generate_content(model=MODEL_ID, contents=text)
    return reply.text
def generate_dataset_queries(dataset_id, generated_glossary, schema_relationships):
    """Ask the model for data-exploration BigQuery queries.

    Args:
        dataset_id: Dataset text interpolated into the prompt.
        generated_glossary: Glossary text interpolated into the prompt.
        schema_relationships: Relationship text interpolated into the prompt.

    Returns:
        Whatever ``model_response`` returns for the assembled prompt.
    """
    # Continuation lines stay at column 0 so no extra whitespace enters the prompt.
    prompt = f'''You are an expert in drafting BQ queries. Look at the dataset: {dataset_id}, look at the glossary: {generated_glossary} and {schema_relationships} and
recommend interesting data exploration queries. Format:
Query description in the form of a single line question
Actual query'''
    return model_response(prompt)
def generate_lookml(dataset_id, generated_glossary, schema_relationships):
    """Ask the model for a LookML semantic model of the dataset.

    Args:
        dataset_id: Dataset text interpolated into the prompt.
        generated_glossary: Glossary text interpolated into the prompt.
        schema_relationships: Relationship text interpolated into the prompt.

    Returns:
        Whatever ``model_response`` returns for the assembled prompt.
    """
    # Continuation lines stay at column 0 so no extra whitespace enters the prompt.
    prompt = f'''You are an expert in drafting LookML models. Look at the dataset: {dataset_id}, look at the glossary: {generated_glossary} and {schema_relationships} and
recommend the LookML semantic model corresponding to the dataset. '''
    return model_response(prompt)
def run(DATASET_ID):
    """Generate the knowledge-engine artefacts for a dataset, step by step.

    This is a generator: after each model call it yields a 7-tuple so a caller
    can display partial results while later steps are still running. The tuple
    slots are, in order:

        (dataset_description, relationships, knowledge_graph, glossary,
         queries, lookml, data_prep_pipeline)

    Slots that have not been produced yet are ``None``. Every yield has the
    same length, so consumers can index or unpack the tuples uniformly.

    The schema itself is produced by the model from ``DATASET_ID`` (the prompt
    asks it to "generate a dataset schema related to" the id); nothing in this
    function queries a real dataset.

    Args:
        DATASET_ID: Dataset name/topic interpolated into the first prompt.

    Yields:
        7-tuples of progressively filled results as described above.

    Returns:
        The final, fully populated 7-tuple (available as
        ``StopIteration.value`` or via ``yield from``; plain ``for`` loops
        ignore it).
    """
    # Prompt continuation lines are kept at column 0 on purpose so that no
    # indentation whitespace leaks into the text sent to the model.
    dataset = model_response(f'''You are an expert in BQ public datasets. Generate a dataset schema related to {DATASET_ID}. You need to come up with atleast 5 tables with each table
containing atleast 10 columns along with their descriptions.Ensure that these tables have columns that talk about data quality issues.''')

    dataset_description = model_response(f'''Generate a succinct 3-4 line description of the dataset: {dataset}.''')
    yield dataset_description, None, None, None, None, None, None

    relationships = model_response(f'''Based on the dataset provided: {dataset}, identify all the possible relationships
that exist between the tables in the dataset. Discover these relationships from
the point of view of data exploration.
Output:
List of relationships along with the description which is the business value of the relationship and a query
with description that validates the relationship.
Ensure that the column names and table names are accurate.''')
    yield dataset_description, relationships, None, None, None, None, None

    knowledge_graph = model_response(f'''Based on the context: {relationships}, generate a knowledge graph represented using ASCII art. Also generate a brief description of the graph.
Output:
Description of the graph listing all the relationships in markdown format
ASCII version of the knowledge graph with nodes represented by tables and edges represented by the relationships. Edges should be annotated with the type of relationships identified - many-to-one, many-to-many, one-to-one, primary key, self joins, foreign keys etc''')
    yield dataset_description, relationships, knowledge_graph, None, None, None, None

    generated_glossary = model_response(f'''Based on the relationships identified: {knowledge_graph}
and the dataset: {dataset_description}, generate glossary terms that will help business users easily find the tables in the dataset.
## Task
- Your goal is to create a business glossary for the data in this dataset, aligned with the definition of business glossary specified above.
- Provide each business term in a newline, along with the definition.
- Include examples in the term definitions, wherever suitable.
- Make sure the business terms are relevant as per the table and column names and descriptions, and relevant to the domain to which the data belongs.
- Also include a few business terms around the users/clients and around 5 key metrics in the domain of the data.
- After defining the terms, identify the relationships between the business terms identified previously.
## Output format
Ensure that the output is in markdown format with proper indentation
- Output each business glossary term definition in a newline in the folowing format:
term: definition
- For the business terms which are the key metrics in the business domain, mark such terms by adding "[METRIC]" in the beginning of the line, in the following format:
[METRIC] term: definition
- Then print a header to indicate the end of this section and start of the relationships section.
- Then output the relationships between the business terms as follows:
term -> [related_term1, related_term2]
Show the relationship between the glossary term and the column broken down by each table.
''')
    yield dataset_description, relationships, knowledge_graph, generated_glossary, None, None, None

    # The helpers receive the generated schema text (not the raw id) as their
    # ``dataset_id`` argument.
    queries = generate_dataset_queries(dataset, generated_glossary, knowledge_graph)
    yield dataset_description, relationships, knowledge_graph, generated_glossary, queries, None, None

    lookml = generate_lookml(dataset, generated_glossary, knowledge_graph)
    # Padded with a trailing None so this yield has the same 7 slots as all
    # the others (it used to be the only 6-tuple).
    yield dataset_description, relationships, knowledge_graph, generated_glossary, queries, lookml, None

    data_prep_pipeline = model_response(f'''Given the dataset: {dataset}, schema relationships: {relationships} and graph:{knowledge_graph}, generate a data preparation pipeline that
fixes the possible data quality issues across the tables in the dataset.''')
    yield dataset_description, relationships, knowledge_graph, generated_glossary, queries, lookml, data_prep_pipeline

    # Kept for callers that read the generator's return value.
    return dataset_description, relationships, knowledge_graph, generated_glossary, queries, lookml, data_prep_pipeline
def wrapper(dataset_id):
    """Adapt ``run`` for Gradio by yielding only the first six result slots.

    Each tuple produced by ``run(dataset_id)`` is narrowed to its elements at
    indices 0-5; any further elements are not forwarded.

    Args:
        dataset_id: Value passed straight through to ``run``.

    Yields:
        6-tuples, one per tuple yielded by ``run``.
    """
    forwarded_slots = range(6)
    for step_result in run(dataset_id):
        # Explicit indexing (not slicing) so a too-short tuple still raises
        # IndexError.
        yield tuple(step_result[slot] for slot in forwarded_slots)
import gradio as gr | |
# Single text input holding the dataset id typed (or picked) by the user.
_dataset_input = gr.Textbox(label="Dataset ID")

# One output component per element of the 6-tuples yielded by ``wrapper``,
# in the same order.
# NOTE(review): wrapper's 2nd element is the relationships text and its 3rd is
# the ASCII knowledge graph, so the "Knowledge Graph" / "Schema Relationships"
# labels look swapped -- confirm the intended pairing.
_output_panels = [
    gr.Markdown(label="Dataset description"),
    gr.Markdown(label="Knowledge Graph"),
    gr.Markdown(label="Schema Relationships"),
    gr.Markdown(label="Generated Glossary"),
    gr.Textbox(label="Queries"),
    gr.Markdown(label="LookML Model"),
]

# NOTE(review): the symbols in the title look like mis-encoded emoji -- confirm
# against the original file encoding.
iface = gr.Interface(
    fn=wrapper,
    inputs=_dataset_input,
    outputs=_output_panels,
    live=False,
    theme=gr.themes.Ocean(),
    title="BQ knowledge engine βοΈπ‘π (Simulator)",
    description="Provide a dataset ID to generate LookML, schema relationships, glossary, and more...",
    examples=[
        "ncaa_basketball2",
        "thelook_ecommerce",
        "geo_openstreetmap",
        "google_political_ads",
        "noaa_historic_severe_storms",
        "stackoverflow",
    ],
    article="This is a simulator that provides a sneak-peek into how BQ knowledge engine works.",
)

# Launch the app
iface.launch(share=True, debug=True)