bke / app.py
agentharbor's picture
Update app.py
db0778e verified
raw
history blame contribute delete
6.75 kB
import os

from google import genai
# SECURITY: the API key must not live in source control. It is read from the
# environment instead (GEMINI_API_KEY first, then GOOGLE_API_KEY). Any key that
# was previously committed to this file should be treated as leaked and revoked.
_API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _API_KEY:
    # Fail at startup with an actionable message rather than on the first request.
    raise RuntimeError(
        "No API key configured: set the GEMINI_API_KEY (or GOOGLE_API_KEY) "
        "environment variable before starting the app."
    )

# Shared client used by every model_response() call.
client = genai.Client(api_key=_API_KEY)

# Identifier of the model that all prompts are sent to.
MODEL_ID = "gemini-2.0-flash-exp"
def model_response(text):
    """Send ``text`` as the prompt to ``MODEL_ID`` and return the reply's ``.text``.

    Uses the module-level ``client``; no error handling is performed here, so
    whatever the client raises propagates to the caller.
    """
    reply = client.models.generate_content(contents=text, model=MODEL_ID)
    return reply.text
def generate_dataset_queries(dataset_id, generated_glossary, schema_relationships):
    """Ask the model for data-exploration queries over a dataset.

    The three arguments are interpolated verbatim into the prompt; the model's
    text reply is returned unchanged.
    """
    # Prompt continuation lines start at column 0 on purpose: they are part of
    # the string sent to the model.
    prompt = f'''You are an expert in drafting BQ queries. Look at the dataset: {dataset_id}, look at the glossary: {generated_glossary} and {schema_relationships} and
recommend interesting data exploration queries. Format:
Query description in the form of a single line question
Actual query'''
    return model_response(prompt)
def generate_lookml(dataset_id, generated_glossary, schema_relationships):
    """Ask the model for a LookML semantic model matching a dataset.

    The three arguments are interpolated verbatim into the prompt; the model's
    text reply is returned unchanged.
    """
    # Prompt continuation lines start at column 0 on purpose: they are part of
    # the string sent to the model.
    prompt = f'''You are an expert in drafting LookML models. Look at the dataset: {dataset_id}, look at the glossary: {generated_glossary} and {schema_relationships} and
recommend the LookML semantic model corresponding to the dataset. '''
    return model_response(prompt)
def run(DATASET_ID):
    """Generate the simulated knowledge-engine artifacts for a dataset, step by step.

    This is a generator: after each model call it yields a progress snapshot so
    a streaming consumer can display results as they become available. Every
    snapshot is a 7-tuple with fields that are not yet computed set to ``None``:

        (dataset_description, relationships, knowledge_graph,
         glossary, queries, lookml, data_prep_pipeline)

    Args:
        DATASET_ID: Name/topic of the dataset; interpolated into the first prompt.

    Yields:
        The 7-tuple described above, progressively filled in.

    Returns:
        The final, fully populated 7-tuple (available as ``StopIteration.value``
        or through ``yield from``).
    """
    # The schema itself is model-generated; everything below is derived from it.
    # Prompt continuation lines start at column 0 on purpose: they are part of
    # the strings sent to the model.
    dataset = model_response(f'''You are an expert in BQ public datasets. Generate a dataset schema related to {DATASET_ID}. You need to come up with atleast 5 tables with each table
containing atleast 10 columns along with their descriptions.Ensure that these tables have columns that talk about data quality issues.''')

    dataset_description = model_response(f'''Generate a succinct 3-4 line description of the dataset: {dataset}.''')
    yield dataset_description, None, None, None, None, None, None

    relationships = model_response(f'''Based on the dataset provided: {dataset}, identify all the possible relationships
that exist between the tables in the dataset. Discover these relationships from
the point of view of data exploration.
Output:
List of relationships along with the description which is the business value of the relationship and a query
with description that validates the relationship.
Ensure that the column names and table names are accurate.''')
    yield dataset_description, relationships, None, None, None, None, None

    knowledge_graph = model_response(f'''Based on the context: {relationships}, generate a knowledge graph represented using ASCII art. Also generate a brief description of the graph.
Output:
Description of the graph listing all the relationships in markdown format
ASCII version of the knowledge graph with nodes represented by tables and edges represented by the relationships. Edges should be annotated with the type of relationships identified - many-to-one, many-to-many, one-to-one, primary key, self joins, foreign keys etc''')
    yield dataset_description, relationships, knowledge_graph, None, None, None, None

    generated_glossary = model_response(f'''Based on the relationships identified: {knowledge_graph}
and the dataset: {dataset_description}, generate glossary terms that will help business users easily find the tables in the dataset.
## Task
- Your goal is to create a business glossary for the data in this dataset, aligned with the definition of business glossary specified above.
- Provide each business term in a newline, along with the definition.
- Include examples in the term definitions, wherever suitable.
- Make sure the business terms are relevant as per the table and column names and descriptions, and relevant to the domain to which the data belongs.
- Also include a few business terms around the users/clients and around 5 key metrics in the domain of the data.
- After defining the terms, identify the relationships between the business terms identified previously.
## Output format
Ensure that the output is in markdown format with proper indentation
- Output each business glossary term definition in a newline in the folowing format:
term: definition
- For the business terms which are the key metrics in the business domain, mark such terms by adding "[METRIC]" in the beginning of the line, in the following format:
[METRIC] term: definition
- Then print a header to indicate the end of this section and start of the relationships section.
- Then output the relationships between the business terms as follows:
term -> [related_term1, related_term2]
Show the relationship between the glossary term and the column broken down by each table.
''')
    yield dataset_description, relationships, knowledge_graph, generated_glossary, None, None, None

    # The helpers' first parameter is named dataset_id, but the full generated
    # schema is what gets passed (and interpolated into their prompts).
    queries = generate_dataset_queries(dataset, generated_glossary, knowledge_graph)
    yield dataset_description, relationships, knowledge_graph, generated_glossary, queries, None, None

    lookml = generate_lookml(dataset, generated_glossary, knowledge_graph)
    # Keep this snapshot 7 fields wide like all the others (it used to be a
    # 6-tuple, which broke any consumer indexing the last field).
    yield dataset_description, relationships, knowledge_graph, generated_glossary, queries, lookml, None

    data_prep_pipeline = model_response(f'''Given the dataset: {dataset}, schema relationships: {relationships} and graph:{knowledge_graph}, generate a data preparation pipeline that
fixes the possible data quality issues across the tables in the dataset.''')
    final_snapshot = (
        dataset_description,
        relationships,
        knowledge_graph,
        generated_glossary,
        queries,
        lookml,
        data_prep_pipeline,
    )
    yield final_snapshot
    # Kept for backward compatibility with callers that read the generator's
    # return value.
    return final_snapshot
# Adapter between run() and the Gradio interface: each progress snapshot is
# trimmed to the fields that are forwarded to the UI.
def wrapper(dataset_id):
    """Stream ``run(dataset_id)`` snapshots, keeping only the first six fields.

    Field order of each yielded tuple: dataset description, table
    relationships, schema relationships / graph, generated glossary, queries,
    LookML model. Anything past index 5 in a snapshot is dropped.
    """
    for snapshot in run(dataset_id):
        yield tuple(snapshot[position] for position in range(6))
import gradio as gr

# Gradio UI: a single text input (the dataset ID) streamed through wrapper()
# into six output components, one per field that wrapper() yields, in order.
iface = gr.Interface(
    fn=wrapper,
    inputs=gr.Textbox(label="Dataset ID"),
    outputs=[
        gr.Markdown(label="Dataset description"),
        gr.Markdown(label="Knowledge Graph"),
        gr.Markdown(label="Schema Relationships"),
        gr.Markdown(label="Generated Glossary"),
        gr.Textbox(label="Queries"),
        gr.Markdown(label="LookML Model"),
    ],
    live=False,
    theme=gr.themes.Ocean(),
    # The emoji (gear, light bulb, bar chart) are written as escapes so the
    # title cannot be corrupted again if the file is re-saved or served with
    # the wrong text encoding.
    title="BQ knowledge engine \u2699\ufe0f\U0001F4A1\U0001F4CA (Simulator)",
    description="Provide a dataset ID to generate LookML, schema relationships, glossary, and more...",
    examples=[
        'ncaa_basketball2',
        'thelook_ecommerce',
        'geo_openstreetmap',
        'google_political_ads',
        'noaa_historic_severe_storms',
        'stackoverflow',
    ],
    article="This is a simulator that provides a sneak-peek into how BQ knowledge engine works.",
)

# Launch the app
iface.launch(share=True, debug=True)