from tokenizers import ByteLevelBPETokenizer
from transformers import (
    GPT2Config,
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    Trainer,
    TrainingArguments,
    TextDataset,
    DataCollatorForLanguageModeling
)
import os

# Step 1: Train a tokenizer from scratch
tokenizer_dir = "./tokenizer"
os.makedirs(tokenizer_dir, exist_ok=True)

# Initialize and train the tokenizer
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files="train_data.txt",
    vocab_size=1000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
)
tokenizer.save_model(tokenizer_dir)

# Load it into a Hugging Face tokenizer
hf_tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_dir)
hf_tokenizer.add_special_tokens({
    "pad_token": "<pad>",
    "bos_token": "<s>",
    "eos_token": "</s>"
})

# Step 2: Prepare the dataset (TextDataset is the legacy line-by-line helper;
# it chunks the file into fixed-length blocks of block_size tokens)
def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )

train_dataset = load_dataset("train_data.txt", hf_tokenizer)

# Step 3: Define GPT2 config for a tiny model
config = GPT2Config(
    vocab_size=hf_tokenizer.vocab_size,
    n_positions=2048,
    n_ctx=2048,
    n_embd=1024,
    n_layer=12,
    n_head=2,
    bos_token_id=hf_tokenizer.bos_token_id,
    eos_token_id=hf_tokenizer.eos_token_id,
    pad_token_id=hf_tokenizer.pad_token_id
)

# Step 4: Initialize model from scratch (random weights, no pretraining)
model = GPT2LMHeadModel(config)
model.resize_token_embeddings(len(hf_tokenizer))

# Step 5: Define data collator (mlm=False -> causal language modeling)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=hf_tokenizer,
    mlm=False,
)

# Step 6: Define training arguments
training_args = TrainingArguments(
    output_dir=".",
    overwrite_output_dir=True,
    num_train_epochs=30,
    per_device_train_batch_size=4,
    save_total_limit=0,
    logging_steps=50,
    prediction_loss_only=True,
    report_to="none"
)

# Step 7: Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Step 8: Train!
trainer.train()

# Step 9: Save everything
trainer.save_model(".")
hf_tokenizer.save_pretrained(".")
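
# Step 10 (optional): minimal inference sketch. This assumes the model and
# tokenizer were saved to "." as above; the prompt text and sampling settings
# below are illustrative, not part of the training setup.
import torch

loaded_model = GPT2LMHeadModel.from_pretrained(".")
loaded_tokenizer = GPT2TokenizerFast.from_pretrained(".")
loaded_model.eval()

inputs = loaded_tokenizer("Once upon a time", return_tensors="pt")
with torch.no_grad():
    output_ids = loaded_model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=True,
        top_k=50,
        pad_token_id=loaded_tokenizer.pad_token_id,
    )
print(loaded_tokenizer.decode(output_ids[0], skip_special_tokens=True))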