{ "cells": [ { "cell_type": "code", "execution_count": 6, "id": "9665f082-b1e2-4094-a9c4-f5fa4560e01f", "metadata": {}, "outputs": [], "source": [ "from transformers import AutoModelForCausalLM, AutoTokenizer\n", "\n", "model_name = \"gpt2\" \n", "model = AutoModelForCausalLM.from_pretrained(model_name)\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "\n", "# Ensure the tokenizer uses padding if necessary\n", "tokenizer.pad_token = tokenizer.eos_token \n" ] }, { "cell_type": "code", "execution_count": 7, "id": "8c81406c-1335-4491-b8cd-67770e86e390", "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "\n", "dataset = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\")\n" ] }, { "cell_type": "code", "execution_count": 8, "id": "2fd0c7d7-1c01-416c-af00-2d11a51663f1", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "833d3e6bacf94b4f83849b76e554c187", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/36718 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def tokenize_function(examples):\n", " return tokenizer(examples[\"text\"], truncation=True, padding=\"max_length\", max_length=512)\n", "\n", "tokenized_datasets = dataset.map(tokenize_function, batched=True)\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "85a7f1be-a72d-4b94-b232-4942616810f9", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/transformers/training_args.py:1594: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n", " warnings.warn(\n" ] } ], "source": [ "from transformers import TrainingArguments\n", "\n", "training_args = TrainingArguments(\n", " output_dir=\"./results\",\n", " evaluation_strategy=\"epoch\",\n", " save_strategy=\"epoch\",\n", " per_device_train_batch_size=8, # Adjust based on your GPU\n", " per_device_eval_batch_size=8,\n", " logging_dir=\"./logs\",\n", " logging_steps=10,\n", " num_train_epochs=1,\n", " report_to=\"none\", # Change to \"wandb\" or \"tensorboard\" if using logging\n", ")\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "cb46a328-74ef-420a-b5d7-b3159cc8f5b0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
<table border=\"1\" class=\"dataframe\">\n",
"  <tr><th>Epoch</th><th>Training Loss</th><th>Validation Loss</th></tr>\n",
"  <tr><td>1</td><td>3.239600</td><td>3.291132</td></tr>\n",
"</table>"
],
"text/plain": [
"