autonomous-data-exploration

Runtime error

File size: 4,815 Bytes

5ea2a69
0d628a0
5ea2a69
 
0d628a0
76b8fa2
6620ef1
 
5ea2a69
0d628a0
5ea2a69
e73d501
5ea2a69
aee4cdd
5ea2a69
 
 
0d628a0
0741da6
5ea2a69
 
 
0d628a0
0741da6
e45053f
0d628a0
 
9cf039d
0d628a0
 
 
e1fe61c
 
5ea2a69
1480aa8
 
6620ef1
 
 
0d628a0
5ea2a69
9cf039d
0d628a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6620ef1
1480aa8
6620ef1
0d628a0
6620ef1
1480aa8
6620ef1
0d628a0
 
 
 
6620ef1
 
6794b69
6620ef1
0d628a0
 
6620ef1
5ea2a69
0d628a0
 
6620ef1
0d628a0
6620ef1
0d628a0
 
 
6620ef1
6794b69
6620ef1
5ea2a69
 
 
19d8bb4
 
e45053f
19d8bb4
 
 
e45053f
0d628a0
ba1aaa5
0d628a0
 
ba1aaa5
0d628a0
62139c9
6620ef1
aee4cdd
6620ef1
 
 
 
 
 
0d628a0
 
 
 
 
 
 
5ea2a69

import os
import shutil
import gradio as gr
from transformers import ReactCodeAgent, HfEngine, Tool
import pandas as pd

from gradio import Chatbot
from streaming import stream_to_gradio
from huggingface_hub import login
from gradio.data_classes import FileData

# Log in to the Hugging Face Hub with the value of the
# HUGGINGFACEHUB_API_TOKEN environment variable (None when it is unset).
_hub_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
login(_hub_token)

# Model identifier handed to HfEngine.
_MODEL_ID = "meta-llama/Meta-Llama-3.1-70B-Instruct"
llm_engine = HfEngine(_MODEL_ID)

# Module names passed to the agent as `additional_authorized_imports`.
_AUTHORIZED_IMPORTS = [
    "numpy",
    "pandas",
    "matplotlib.pyplot",
    "seaborn",
    "scipy.stats",
]

# Agent shared by every request: no tools, capped at max_iterations=10.
agent = ReactCodeAgent(
    tools=[],
    llm_engine=llm_engine,
    additional_authorized_imports=_AUTHORIZED_IMPORTS,
    max_iterations=10,
)

# Prompt template given to the agent. It holds a single str.format
# placeholder, {structure_notes}, which interact_with_agent fills with the
# dataframe's .describe() output and its column dtypes. Because the template
# is passed through str.format, any literal curly brace added to this text
# must be doubled ("{{" / "}}").
base_prompt = """You are an expert data analyst.
According to the features you have and the data structure given below, determine which feature should be the target.
Then list 5 interesting questions that could be asked on this data, for instance about specific correlations with target variable.
Then answer these questions one by one, by finding the relevant numbers.
Meanwhile, plot some figures using matplotlib/seaborn and save them to the (already existing) folder './figures/': take care to clear each figure with plt.clf() before doing another plot.
Generate a summary of each of the plot generated.
In your final answer: summarize these correlations and trends
After each number derive real worlds insights, for instance: "Correlation between is_december and boredness is 1.3453, which suggest people are more bored in winter".
Your final answer should be a long string with at least 3 numbered and detailed parts.
You should also include 3 follow-up questions that can be answered with this analysis
Provide suggestions around what additional input needs to be provided by the user for better analysis

Structure of the data:
{structure_notes}

The data file is passed to you as the variable data_file, it is a pandas dataframe, you can use it directly.
DO NOT try to load data_file, it is already a dataframe pre-loaded in your python interpreter!
"""

# Notes text paired with ./example/titanic.csv in the UI's gr.Examples entry.
example_notes="""This data is about the Titanic wreck in 1912. I am interested in the survival statistics."""

def get_images_in_directory(directory):
    """Collect the paths of every image file found under ``directory``.

    The tree is traversed recursively with ``os.walk``. A file counts as an
    image when its extension, compared case-insensitively, is one of .png,
    .jpg, .jpeg, .gif, .bmp or .tiff.

    Args:
        directory: Root folder to search.

    Returns:
        A list of paths, each formed by joining the folder reported by
        ``os.walk`` with the file name, in traversal order.
    """
    image_suffixes = {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff'}

    found = []
    for folder, _subfolders, names in os.walk(directory):
        found.extend(
            os.path.join(folder, name)
            for name in names
            if os.path.splitext(name)[1].lower() in image_suffixes
        )
    return found

def interact_with_agent(file_input, additional_notes):
    """Run the analysis agent on an uploaded CSV and stream the conversation.

    This is a generator. Each yielded value is the complete list of chat
    messages to display so far; while the agent is still running, a temporary
    progress message is appended to the yielded list (but not stored).

    Args:
        file_input: Value passed to ``pandas.read_csv`` (the uploaded file).
        additional_notes: Optional free text appended to the prompt; ignored
            when empty or ``None``.

    Yields:
        Lists of ``gr.ChatMessage`` objects: the user prompt, every message
        produced by ``stream_to_gradio``, and one message per new image file
        that appears under ``./figures``.

    Raises:
        gr.Error: If no file was provided.
    """
    # Local import: only needed here to label figure attachments.
    import mimetypes

    if not file_input:
        # Fail with a readable message instead of an opaque read_csv error.
        raise gr.Error("Please upload a CSV file to analyze.")

    # Start each run from an empty figures folder. The folder may not exist
    # yet (e.g. on first run), which must not abort the request.
    try:
        shutil.rmtree("./figures")
    except FileNotFoundError:
        pass
    os.makedirs("./figures", exist_ok=True)

    data_file = pd.read_csv(file_input)
    data_structure_notes = f"""- Description (output of .describe()):
    {data_file.describe()}
    - Columns with dtypes:
    {data_file.dtypes}"""

    prompt = base_prompt.format(structure_notes=data_structure_notes)

    if additional_notes:
        prompt += "\nAdditional notes on the data:\n" + additional_notes

    messages = [gr.ChatMessage(role="user", content=prompt)]
    yield messages + [
        gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")
    ]

    # Figure paths already attached to the chat, so each is shown only once.
    shown_figures = set()
    for msg in stream_to_gradio(agent, prompt, data_file=data_file):
        messages.append(msg)
        # After every agent message, attach any figure files that are new.
        for image_path in get_images_in_directory("./figures"):
            if image_path in shown_figures:
                continue
            shown_figures.add(image_path)
            # The scanner also returns non-PNG images, so derive the MIME
            # type from the file name and fall back to PNG if it is unknown.
            guessed_type = mimetypes.guess_type(image_path)[0]
            messages.append(
                gr.ChatMessage(
                    role="assistant",
                    content=FileData(
                        path=image_path,
                        mime_type=guessed_type or "image/png",
                    ),
                )
            )
        yield messages + [
            gr.ChatMessage(role="assistant", content="⏳ _Still processing..._")
        ]
    # Final state without the temporary progress message.
    yield messages


# Gradio UI: a file upload, a notes textbox and a button that streams the
# output of interact_with_agent into a chatbot component.
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue=gr.themes.colors.green,
        secondary_hue=gr.themes.colors.blue,
    )
) as demo:
    # The description names the model actually configured in this file.
    gr.Markdown("""# Agentville Data analyst 📊

Drop a `.csv` file below, add notes to describe this data if needed, and **Llama-3.1-70B will analyze the file content and do the analysis for you!**""")
    file_input = gr.File(label="Your file to analyze")
    text_input = gr.Textbox(
        label="Additional notes to guide the analysis"
    )
    submit = gr.Button("Run analysis!", variant="primary")
    chatbot = gr.Chatbot(
        label="Data Analyst Agent",
        type="messages",
        avatar_images=(
            None,
            "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
        ),
    )
    # One-click example that fills both inputs; results are not cached.
    gr.Examples(
        examples=[["./example/titanic.csv", example_notes]],
        inputs=[file_input, text_input],
        cache_examples=False,
    )

    # interact_with_agent is a generator, so each yielded message list
    # updates the chatbot.
    submit.click(interact_with_agent, [file_input, text_input], [chatbot])

if __name__ == "__main__":
    demo.launch()