licesma committed on
Commit
59f20cf
·
1 Parent(s): e029c99

Add a rag helper notebook

Browse files
rag_helper.ipynb ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Run pre-trained DeepSeek Coder 1.3B Model on Chat-GPT 4o generated dataset"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 2,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "import pandas as pd \n",
17
+ "import warnings\n",
18
+ "warnings.filterwarnings(\"ignore\")\n",
19
+ "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
20
+ "import torch\n",
21
+ "import sys\n",
22
+ "import os\n",
23
+ "import sqlite3 as sql\n",
24
+ "from huggingface_hub import snapshot_download"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 3,
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "is_google_colab=False"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 4,
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "current_path = \"./\"\n",
43
+ "\n",
44
+ "def get_path(rel_path):\n",
45
+ " return os.path.join(current_path, rel_path)\n",
46
+ "\n",
47
+ "if is_google_colab:\n",
48
+ " hugging_face_path = snapshot_download(\n",
49
+ " repo_id=\"USC-Applied-NLP-Group/SQL-Generation\",\n",
50
+ " repo_type=\"model\", \n",
51
+ " allow_patterns=[\"src/*\", \"train-data/*\", \"deepseek-coder-1.3b-instruct/*\", \"nba-data/*\"], \n",
52
+ " )\n",
53
+ " sys.path.append(hugging_face_path)\n",
54
+ " current_path = hugging_face_path"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 5,
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "from src.prompts.pre_rag_prompt import input_text"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "markdown",
68
+ "metadata": {},
69
+ "source": [
70
+ "## First load dataset into pandas dataframe"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": 6,
76
+ "metadata": {},
77
+ "outputs": [
78
+ {
79
+ "name": "stdout",
80
+ "output_type": "stream",
81
+ "text": [
82
+ "Total dataset examples: 1044\n",
83
+ "\n",
84
+ "\n",
85
+ "What is the maximum number of team rebounds recorded by the San Antonio Spurs in away games where they committed more than 20 fouls?\n",
86
+ "SELECT MAX(o.team_rebounds_away) FROM game g JOIN other_stats o ON g.game_id = o.game_id WHERE g.team_abbreviation_away = 'SAS' AND g.pf_away > 20 AND g.season_id = '22003';\n",
87
+ "13\n"
88
+ ]
89
+ }
90
+ ],
91
+ "source": [
92
+ "# Load dataset and check length\n",
93
+ "df = pd.read_csv(get_path(\"train-data/sql_train.tsv\"), sep=\"\\t\")\n",
94
+ "print(\"Total dataset examples: \" + str(len(df)))\n",
95
+ "print(\"\\n\")\n",
96
+ "\n",
97
+ "# Test sampling\n",
98
+ "sample = df.sample(n=1)\n",
99
+ "print(sample[\"natural_query\"].values[0])\n",
100
+ "print(sample[\"sql_query\"].values[0])\n",
101
+ "print(sample[\"result\"].values[0])"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "markdown",
106
+ "metadata": {},
107
+ "source": [
108
+ "## Load pre-trained DeepSeek model using transformers and pytorch packages"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": 7,
114
+ "metadata": {},
115
+ "outputs": [],
116
+ "source": [
117
+ "# Set device to cuda if available, otherwise CPU\n",
118
+ "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
119
+ "\n",
120
+ "# Load model and tokenizer\n",
121
+ "if is_google_colab:\n",
122
+ " tokenizer = AutoTokenizer.from_pretrained(get_path(\"deepseek-coder-1.3b-instruct\"))\n",
123
+ " model = AutoModelForCausalLM.from_pretrained(get_path(\"deepseek-coder-1.3b-instruct\"), torch_dtype=torch.bfloat16, device_map=device) \n",
124
+ "else:\n",
125
+ " tokenizer = AutoTokenizer.from_pretrained(\"./deepseek-coder-1.3b-instruct\")\n",
126
+ " model = AutoModelForCausalLM.from_pretrained(\"./deepseek-coder-1.3b-instruct\", torch_dtype=torch.bfloat16, device_map=device) \n",
127
+ "model.generation_config.pad_token_id = tokenizer.pad_token_id"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "markdown",
132
+ "metadata": {},
133
+ "source": [
134
+ "## Test model performance on a single example"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": 8,
140
+ "metadata": {},
141
+ "outputs": [
142
+ {
143
+ "name": "stdout",
144
+ "output_type": "stream",
145
+ "text": [
146
+ "Response:\n",
147
+ "game, other_stats\n",
148
+ "\n"
149
+ ]
150
+ }
151
+ ],
152
+ "source": [
153
+ "# Create message with sample query and run model\n",
154
+ "message=[{ 'role': 'user', 'content': input_text + sample[\"natural_query\"].values[0]}]\n",
155
+ "inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors=\"pt\").to(model.device)\n",
156
+ "outputs = model.generate(inputs, max_new_tokens=512, do_sample=False, top_k=50, top_p=0.95, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)\n",
157
+ "\n",
158
+ "# Print output\n",
159
+ "query_output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)\n",
160
+ "print(query_output)"
161
+ ]
162
+ },
163
+ {
164
+ "cell_type": "markdown",
165
+ "metadata": {},
166
+ "source": [
167
+ "# Test sample output on sqlite3 database"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": 9,
173
+ "metadata": {},
174
+ "outputs": [],
175
+ "source": [
176
+ "# Create connection to sqlite3 database\n",
177
+ "connection = sql.connect(get_path('nba-data/nba.sqlite'))\n",
178
+ "cursor = connection.cursor()\n",
179
+ "\n",
180
+ "# Execute query from model output and print result\n",
181
+ "if query_output[0:7] == \"SQLite:\":\n",
182
+ " print(\"cleaned\")\n",
183
+ " query = query_output[7:]\n",
184
+ "elif query_output[0:4] == \"SQL:\":\n",
185
+ " query = query_output[4:]\n",
186
+ "else:\n",
187
+ " query = query_output\n",
188
+ "\n",
189
+ "try:\n",
190
+ " cursor.execute(query)\n",
191
+ " rows = cursor.fetchall()\n",
192
+ " for row in rows:\n",
193
+ " print(row)\n",
194
+ "except sql.Error as e:\n",
195
+ " print(\"Query failed:\", e)"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "markdown",
200
+ "metadata": {},
201
+ "source": [
202
+ "## Create function to compare output to ground truth result from examples"
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "code",
207
+ "execution_count": 12,
208
+ "metadata": {},
209
+ "outputs": [
210
+ {
211
+ "name": "stdout",
212
+ "output_type": "stream",
213
+ "text": [
214
+ "Which team abbreviation belongs to the team based in Phoenix?\n",
215
+ "SELECT abbreviation FROM team WHERE city = 'Phoenix';\n",
216
+ "PHX\n",
217
+ "\"team\"\n",
218
+ "\n"
219
+ ]
220
+ }
221
+ ],
222
+ "source": [
223
+ "# Obtain sample\n",
224
+ "sample = df.sample(n=1)\n",
225
+ "\n",
226
+ "print(sample[\"natural_query\"].values[0])\n",
227
+ "print(sample[\"sql_query\"].values[0])\n",
228
+ "print(sample[\"result\"].values[0])\n",
229
+ "\n",
230
+ "# Create message with sample query and run model\n",
231
+ "message=[{ 'role': 'user', 'content': input_text + sample[\"natural_query\"].values[0]}]\n",
232
+ "inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors=\"pt\").to(model.device)\n",
233
+ "outputs = model.generate(inputs, max_new_tokens=512, do_sample=False, top_k=50, top_p=0.95, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)\n",
234
+ "\n",
235
+ "# Print output\n",
236
+ "query_output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)\n",
237
+ "print(query_output)\n"
238
+ ]
239
+ },
240
+ {
241
+ "cell_type": "markdown",
242
+ "metadata": {},
243
+ "source": [
244
+ "## Create function to evaluate pretrained model on full datasets"
245
+ ]
246
+ },
247
+ {
248
+ "cell_type": "code",
249
+ "execution_count": null,
250
+ "metadata": {},
251
+ "outputs": [],
252
+ "source": [
253
+ "def run_evaluation(nba_df):\n",
254
+ " for index, row in nba_df.iterrows():\n",
255
+ " # Create message with sample query and run model\n",
256
+ " message=[{ 'role': 'user', 'content': input_text + row[\"natural_query\"]}]\n",
257
+ " inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors=\"pt\").to(model.device)\n",
258
+ " outputs = model.generate(inputs, max_new_tokens=512, do_sample=False, top_k=50, top_p=0.95, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)\n",
259
+ "\n",
260
+ " # Obtain output\n",
261
+ " query_output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)\n",
262
+ "\n",
263
+ " print(\"Query: \", row[\"sql_query\"])\n",
264
+ " print(\"Response: \",query_output)\n"
265
+ ]
266
+ },
267
+ {
268
+ "cell_type": "code",
269
+ "execution_count": null,
270
+ "metadata": {},
271
+ "outputs": [],
272
+ "source": [
273
+ "run_evaluation(df)"
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "code",
278
+ "execution_count": null,
279
+ "metadata": {},
280
+ "outputs": [],
281
+ "source": []
282
+ }
283
+ ],
284
+ "metadata": {
285
+ "kernelspec": {
286
+ "display_name": "CSCI544",
287
+ "language": "python",
288
+ "name": "python3"
289
+ },
290
+ "language_info": {
291
+ "codemirror_mode": {
292
+ "name": "ipython",
293
+ "version": 3
294
+ },
295
+ "file_extension": ".py",
296
+ "mimetype": "text/x-python",
297
+ "name": "python",
298
+ "nbconvert_exporter": "python",
299
+ "pygments_lexer": "ipython3",
300
+ "version": "3.11.11"
301
+ }
302
+ },
303
+ "nbformat": 4,
304
+ "nbformat_minor": 2
305
+ }
src/prompts/__pycache__/pre_rag_prompt.cpython-311.pyc ADDED
Binary file (4.13 kB). View file