Spaces:

mamogasr
/

llm_engineering

Sleeping

File size: 13,857 Bytes

5fdb69e

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dataset generator\n",
    "\n",
    "Suports dataset creation for the following formats (inspired by HuggingFace dashboard):\n",
    "\n",
    "Realistic to create:\n",
    " * Tabular data\n",
    " * Text \n",
    " * Time-series\n",
    "\n",
    "Output formats included:\n",
    "\n",
    "* JSON\n",
    "* CSV\n",
    "* Parquet\n",
    "* Markdown\n",
    "\n",
    "The tool works as follows: given the business problem and the dataset requirements it generates the possible dataset along with the python code that can be executed afterwards. The code saves the created dataset to the files.\n",
    "\n",
    "Supports Chatgpt and Claude models."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "import re\n",
    "import os\n",
    "import sys\n",
    "import io\n",
    "import json\n",
    "from dotenv import load_dotenv\n",
    "from openai import OpenAI\n",
    "import anthropic\n",
    "import gradio as gr\n",
    "from pathlib import Path\n",
    "from datetime import datetime\n",
    "import requests\n",
    "import subprocess\n",
    "from IPython.display import Markdown, display, update_display"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialization\n",
    "\n",
    "load_dotenv()\n",
    "\n",
    "openai_api_key = os.getenv('OPENAI_API_KEY')\n",
    "os.environ['ANTHROPIC_API_KEY'] = os.getenv('ANTHROPIC_API_KEY', 'your-key-if-not-using-env')\n",
    "if openai_api_key:\n",
    "    print(f\"OpenAI API Key exists and begins {openai_api_key[:8]}\")\n",
    "else:\n",
    "    print(\"OpenAI API Key not set\")\n",
    "    \n",
    "OPENAI_MODEL = \"gpt-4o-mini\"\n",
    "CLAUDE_MODEL = \"claude-3-5-sonnet-20240620\"\n",
    "openai = OpenAI()\n",
    "claude = anthropic.Anthropic()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Prompts definition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "system_message = \"\"\"You are a helpful assistant whose main purpose is to generate datasets for a given business problem.\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_user_prompt_tabular(business_problem, dataset_format, file_format, num_samples):\n",
    "    \n",
    "    user_message = f\"\"\"\n",
    "    The business problem is: {business_problem}. \\n\n",
    "    The dataset is expected to be in {dataset_format}. \n",
    "    For the dataset types such as tabular or time series implement python code for creating the dataset.\n",
    "    If the generated dataset contains several entities, i.e. products, users, write the output for these entities into separate files. \n",
    "    The dependencies for python code should include only standard python libraries such as numpy, pandas and built-in libraries. \n",
    "    The output dataset is stored as a {file_format} file and contains {num_samples} samples. \\n    \n",
    "    \"\"\"\n",
    "\n",
    "    return user_message\n",
    "\n",
    "def get_user_prompt_text(business_problem, dataset_format, file_format):\n",
    "    \n",
    "    user_message = f\"\"\"\n",
    "    The business problem is: {business_problem}. \\n\n",
    "    The dataset is expected to be in {dataset_format}. \n",
    "    For the text type return the generated dataset and the python code to write the output to the files.\n",
    "    If the generated dataset contains several entities, i.e. products, users, write the output for these entities into separate files. \n",
    "    The dependencies for python code should include only standard python libraries such as numpy, pandas and built-in libraries. \n",
    "    The output dataset is stored as a {file_format} file. \\n    \n",
    "    \"\"\"\n",
    "\n",
    "    return user_message\n",
    "\n",
    "def select_user_prompt(business_problem, dataset_format, file_format, num_samples):\n",
    "    user_prompt = \"\"\n",
    "    if dataset_format == \"Text\":\n",
    "        user_prompt = get_user_prompt_text(business_problem, dataset_format, file_format)\n",
    "    elif dataset_format in [\"Tabular\", \"Time-series\"]:\n",
    "        user_prompt = get_user_prompt_tabular(business_problem, dataset_format, file_format, num_samples)\n",
    "    return user_prompt\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Calls to api to fetch the dataset requirements"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def stream_gpt(business_problem, dataset_format, file_format, num_samples):\n",
    "\n",
    "    user_prompt = select_user_prompt(\n",
    "                    business_problem, dataset_format, file_format, num_samples\n",
    "                )\n",
    "    stream = openai.chat.completions.create(\n",
    "        model=OPENAI_MODEL,\n",
    "        messages=[\n",
    "            {\"role\": \"system\", \"content\": system_message},\n",
    "            {\n",
    "                \"role\": \"user\",\n",
    "                \"content\": user_prompt,\n",
    "            },\n",
    "        ],\n",
    "        stream=True,\n",
    "    )\n",
    "\n",
    "    response = \"\"\n",
    "    for chunk in stream:\n",
    "        response += chunk.choices[0].delta.content or \"\"\n",
    "        yield response\n",
    "\n",
    "    return response\n",
    "\n",
    "\n",
    "def stream_claude(business_problem, dataset_format, file_format, num_samples):\n",
    "    user_prompt = select_user_prompt(\n",
    "                    business_problem, dataset_format, file_format, num_samples\n",
    "                )\n",
    "    result = claude.messages.stream(\n",
    "        model=CLAUDE_MODEL,\n",
    "        max_tokens=2000,\n",
    "        system=system_message,\n",
    "        messages=[\n",
    "            {\n",
    "                \"role\": \"user\",\n",
    "                \"content\": user_prompt,\n",
    "            }\n",
    "        ],\n",
    "    )\n",
    "    reply = \"\"\n",
    "    with result as stream:\n",
    "        for text in stream.text_stream:\n",
    "            reply += text\n",
    "            yield reply\n",
    "            print(text, end=\"\", flush=True)\n",
    "    return reply\n",
    "\n",
    "\n",
    "def generate_dataset(business_problem, dataset_format, file_format, num_samples, model):\n",
    "    if model == \"GPT\":\n",
    "        result = stream_gpt(business_problem, dataset_format, file_format, num_samples)\n",
    "    elif model == \"Claude\":\n",
    "        result = stream_claude(business_problem, dataset_format, file_format, num_samples)\n",
    "    else:\n",
    "        raise ValueError(\"Unknown model\")\n",
    "    for stream_so_far in result:\n",
    "        yield stream_so_far\n",
    "    return result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Extract python code from the LLM output and execute it locally"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def extract_code(text):\n",
    "    # Regular expression to find text between ``python and ``\n",
    "    match = re.search(r\"```python(.*?)```\", text, re.DOTALL)\n",
    "\n",
    "    if match:\n",
    "        code = match.group(0).strip()  # Extract and strip extra spaces\n",
    "    else:\n",
    "        code = \"\"\n",
    "        print(\"No matching substring found.\")\n",
    "\n",
    "    return code.replace(\"```python\\n\", \"\").replace(\"```\", \"\")\n",
    "\n",
    "\n",
    "def execute_code_in_virtualenv(text, python_interpreter=sys.executable):\n",
    "    \"\"\"\n",
    "    Execute the given Python code string within the specified virtual environment.\n",
    "    \n",
    "    Args:\n",
    "    - code_str: str, the Python code to execute.\n",
    "    - venv_dir: str, the directory path to the virtual environment created by pipenv.\n",
    "    \"\"\"\n",
    "    # Construct the full path to the Python interpreter in the virtual environment\n",
    "    # python_interpreter = f\"{venv_dir}/bin/python\"\n",
    "\n",
    "    # Check if executing within the specified virtual environment interpreter\n",
    "    if not python_interpreter:\n",
    "        raise EnvironmentError(\"Python interpreter not found in the specified virtual environment.\")\n",
    "\n",
    "    # Prepare the command to execute the code\n",
    "    code_str = extract_code(text)\n",
    "    command = [python_interpreter, '-c', code_str]\n",
    "\n",
    "    # Execute the command\n",
    "    try:\n",
    "        result = subprocess.run(command, check=True, capture_output=True, text=True)\n",
    "        print(\"Output:\", result.stdout)\n",
    "        print(\"Errors:\", result.stderr)\n",
    "    except subprocess.CalledProcessError as e:\n",
    "        print(f\"An error occurred while executing the code: {e}\")\n",
    "    return result.stdout\n",
    "\n",
    "# Example usage\n",
    "code_string = \"\"\"\n",
    "print('Hello from Pipenv virtual environment!')\n",
    "\"\"\"\n",
    "venv_directory = sys.executable  # replace with your actual virtualenv path\n",
    "(execute_code_in_virtualenv(code_string, venv_directory))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test example for running the code locally"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example string\n",
    "text = \"\"\"\n",
    "Some text here \n",
    "```python\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from datetime import datetime, timedelta\n",
    "\n",
    "# Parameters\n",
    "num_records = 100\n",
    "start_date = datetime(2023, 1, 1)\n",
    "item_ids = [f'item_{i}' for i in range(1, num_records+1)]\n",
    "\n",
    "# Generate dates\n",
    "dates = [start_date + timedelta(days=i) for i in range(num_records)]\n",
    "\n",
    "# Generate random views and clicks\n",
    "np.random.seed(42)  # For reproducibility\n",
    "views = np.random.poisson(lam=100, size=num_records)  # Average 100 views\n",
    "clicks = np.random.binomial(n=views, p=0.1)  # 10% click-through rate\n",
    "\n",
    "# Calculate rank based on clicks (lower is better)\n",
    "# You can also modify this function as per your ranking criteria\n",
    "ranks = [sorted(clicks, reverse=True).index(x) + 1 for x in clicks]  # Rank 1 is highest\n",
    "\n",
    "# Assemble the DataFrame\n",
    "data = {\n",
    "    'date': dates,\n",
    "    'item_id': item_ids,\n",
    "    'views': views,\n",
    "    'clicks': clicks,\n",
    "    'rank': ranks\n",
    "}\n",
    "\n",
    "df = pd.DataFrame(data)\n",
    "\n",
    "# Save to CSV\n",
    "df.to_csv('fashion_classified_ranking_dataset.csv', index=False)\n",
    "print(\"Dataset generated and saved as 'fashion_classified_ranking_dataset.csv'\")\n",
    "```\n",
    " and more text here.\n",
    "\"\"\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# execute_code_in_virtualenv(text, venv_directory)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Gradio interface"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "with gr.Blocks() as ui:\n",
    "    gr.Markdown(\"## Create a dataset for a business problem\")\n",
    "    with gr.Column():\n",
    "        business_problem = gr.Textbox(label=\"Business problem\", lines=2)\n",
    "        dataset_type = gr.Dropdown(\n",
    "            [\"Tabular\", \"Time-series\", \"Text\"], label=\"Dataset modality\"\n",
    "        )\n",
    "        dataset_format = gr.Dropdown([\"JSON\", \"csv\", \"parquet\", \"Markdown\"], label=\"Output format\")\n",
    "        num_samples = gr.Number(label=\"Number of samples (for tabular and time-series data)\", value=10, precision=0)\n",
    "        model = gr.Dropdown([\"GPT\", \"Claude\"], label=\"Select model\", value=\"GPT\")\n",
    "    with gr.Row():\n",
    "        dataset_run = gr.Button(\"Create a dataset\")\n",
    "        code_run = gr.Button(\"Execute code for a dataset\")\n",
    "    with gr.Row():\n",
    "        dataset_out = gr.Textbox(label=\"Generated Dataset\")\n",
    "        code_out = gr.Textbox(label=\"Executed code\")\n",
    "    dataset_run.click(\n",
    "        generate_dataset,\n",
    "        inputs=[business_problem, dataset_type, dataset_format, num_samples, model],\n",
    "        outputs=[dataset_out]\n",
    "    )\n",
    "    code_run.click(execute_code_in_virtualenv, inputs=[dataset_out], outputs=[code_out])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ui.launch(inbrowser=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}