{ "cells": [ { "cell_type": "code", "execution_count": 6, "id": "9665f082-b1e2-4094-a9c4-f5fa4560e01f", "metadata": {}, "outputs": [], "source": [ "from transformers import AutoModelForCausalLM, AutoTokenizer\n", "\n", "model_name = \"gpt2\" \n", "model = AutoModelForCausalLM.from_pretrained(model_name)\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "\n", "# Ensure the tokenizer uses padding if necessary\n", "tokenizer.pad_token = tokenizer.eos_token \n" ] }, { "cell_type": "code", "execution_count": 7, "id": "8c81406c-1335-4491-b8cd-67770e86e390", "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "\n", "dataset = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\")\n" ] }, { "cell_type": "code", "execution_count": 8, "id": "2fd0c7d7-1c01-416c-af00-2d11a51663f1", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "833d3e6bacf94b4f83849b76e554c187", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/36718 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def tokenize_function(examples):\n", " return tokenizer(examples[\"text\"], truncation=True, padding=\"max_length\", max_length=512)\n", "\n", "tokenized_datasets = dataset.map(tokenize_function, batched=True)\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "85a7f1be-a72d-4b94-b232-4942616810f9", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/transformers/training_args.py:1594: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n", " warnings.warn(\n" ] } ], "source": [ "from transformers import TrainingArguments\n", "\n", "training_args = TrainingArguments(\n", " output_dir=\"./results\",\n", " evaluation_strategy=\"epoch\",\n", " save_strategy=\"epoch\",\n", " per_device_train_batch_size=8, # Adjust based on your GPU\n", " per_device_eval_batch_size=8,\n", " logging_dir=\"./logs\",\n", " logging_steps=10,\n", " num_train_epochs=1,\n", " report_to=\"none\", # Change to \"wandb\" or \"tensorboard\" if using logging\n", ")\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "cb46a328-74ef-420a-b5d7-b3159cc8f5b0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
<table border=\"1\" class=\"dataframe\">\n",
"  <tr><th>Epoch</th><th>Training Loss</th><th>Validation Loss</th></tr>\n",
"  <tr><td>1</td><td>3.239600</td><td>3.291132</td></tr>\n",
"</table>"
],
"text/plain": [
"