In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2" 
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Ensure the tokenizer uses padding if necessary
tokenizer.pad_token = tokenizer.eos_token  


In [7]:
from datasets import load_dataset

dataset = load_dataset("wikitext", "wikitext-2-raw-v1")


In [8]:
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

In [9]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,  # Adjust based on your GPU
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
    num_train_epochs=1,
    report_to="none",  # Change to "wandb" or "tensorboard" if using logging
)




In [10]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

trainer.train()


Epoch,Training Loss,Validation Loss
1,3.2396,3.291132


TrainOutput(global_step=4590, training_loss=3.347612351062251, metrics={'train_runtime': 4751.264, 'train_samples_per_second': 7.728, 'train_steps_per_second': 0.966, 'total_flos': 9594120830976000.0, 'train_loss': 3.347612351062251, 'epoch': 1.0})

In [11]:
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")

('fine_tuned_model/tokenizer_config.json',
 'fine_tuned_model/special_tokens_map.json',
 'fine_tuned_model/vocab.json',
 'fine_tuned_model/merges.txt',
 'fine_tuned_model/added_tokens.json',
 'fine_tuned_model/tokenizer.json')

In [12]:
import shutil

# Specify the folder to be zipped
folder_path = "fine_tuned_model" # Replace with your actual folder name
zip_name = "fine_tuned_model.zip"  # Desired zip file name

# Create a zip archive
shutil.make_archive(zip_name.replace('.zip', ''), 'zip', folder_path)

print(f"Folder '{folder_path}' has been zipped as '{zip_name}'.")

Folder 'fine_tuned_model' has been zipped as 'fine_tuned_model.zip'.


In [18]:
from transformers import pipeline

In [20]:
code_generator = pipeline("text-generation", model="fine_tuned_model", tokenizer=tokenizer)

prompt = "def quicksort(arr):"
generated_code = code_generator(prompt, max_length=200, num_return_sequences=1)

print(generated_code[0]["generated_text"])

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


def quicksort(arr): 

Proscure = 

Faced with a choice between the current and previous values, an error's resolution in a new value is not necessarily in order, since the first one is the first one that does not change. Prof will have to return a retry call for all possible errors returned from the previous value, which is equivalent to a new retry ( q @-@ f ). A simple recursion will perform only one recursion on the results. 

A recursion in alliter @-@ ordered values is done if it's possible to reorder them at all. This means a recursion in the first function of an array's contents is done if it isn 't possible to reorder them at all. This means, for example, that an array would have to be returned the same number of times in order to work as an array is. 

A recursion in
