from tokenizers import ByteLevelBPETokenizer
from transformers import (
    GPT2Config,
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    Trainer,
    TrainingArguments,
    TextDataset,
    DataCollatorForLanguageModeling
)
import os
# Step 1: Train a tokenizer from scratch
tokenizer_dir = "./tokenizer"
os.makedirs(tokenizer_dir, exist_ok=True)
# Initialize and train the tokenizer
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files="train_data.txt", vocab_size=1000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>"
])
tokenizer.save_model(tokenizer_dir)
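# save_model writes vocab.json and merges.txt, the same files GPT2TokenizerFast
# expects, so the freshly trained BPE can be reloaded through transformers below.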
# Load it into a Hugging Face tokenizer
hf_tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_dir)
hf_tokenizer.add_special_tokens({
    "pad_token": "<pad>",
    "bos_token": "<s>",
    "eos_token": "</s>"
})
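# Optional sanity check (illustrative only): byte-level BPE is lossless, so a
# short string should round-trip through encode/decode, and the special tokens
# registered above should resolve to the IDs reserved during training.
print(hf_tokenizer.decode(hf_tokenizer.encode("hello world")))
print("pad/bos/eos ids:", hf_tokenizer.pad_token_id, hf_tokenizer.bos_token_id, hf_tokenizer.eos_token_id)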
# Step 2: Prepare the dataset
def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )
train_dataset = load_dataset("train_data.txt", hf_tokenizer)
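# Note: TextDataset tokenizes the whole file and slices it into contiguous
# block_size-token chunks. Recent transformers releases mark it as deprecated
# (the datasets library is the suggested replacement), but it still works for
# a small single-file corpus like this one.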
# Step 3: Define GPT2 config for a tiny model
config = GPT2Config(
    vocab_size=hf_tokenizer.vocab_size,
    n_positions=2048,
    n_ctx=2048,
    n_embd=1024,
    n_layer=12,
    n_head=2,
    bos_token_id=hf_tokenizer.bos_token_id,
    eos_token_id=hf_tokenizer.eos_token_id,
    pad_token_id=hf_tokenizer.pad_token_id
)
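# vocab_size comes from the trained tokenizer (1000 here) and n_positions caps
# the sequence length the model can attend over. n_ctx is kept for compatibility
# with older transformers versions; newer GPT2Config releases rely on n_positions.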
# Step 4: Initialize model from scratch
model = GPT2LMHeadModel(config)
model.resize_token_embeddings(len(hf_tokenizer))
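# resize_token_embeddings is a no-op here because config.vocab_size already
# matches the tokenizer, but it is a useful safety net if tokens are added later.
# Optional: report the model size; with these settings the transformer stack
# dominates, since the 1000-token embedding matrix is comparatively small.
print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")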
# Step 5: Define data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=hf_tokenizer,
    mlm=False,
)
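# With mlm=False the collator builds causal-LM batches: it stacks the examples
# and copies input_ids into labels (padding positions become -100); the model
# itself applies the one-position shift when computing the loss.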
# Step 6: Define training arguments
training_args = TrainingArguments(
    output_dir=".",
    overwrite_output_dir=True,
    num_train_epochs=30,
    per_device_train_batch_size=4,
    save_total_limit=0,
    logging_steps=50,
    prediction_loss_only=True,
    report_to="none"
)
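# output_dir="." writes checkpoints and logs next to this script, and
# report_to="none" disables external loggers such as wandb or tensorboard.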
# Step 7: Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)
# Step 8: Train!
trainer.train()
# Step 9: Save everything
trainer.save_model(".")
hf_tokenizer.save_pretrained(".")
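# Example usage after training (illustrative sketch, not part of the original
# pipeline): sample a short continuation from the trained model. The prompt
# "hello" is only a placeholder.
model.eval()
prompt_ids = hf_tokenizer("hello", return_tensors="pt").input_ids.to(model.device)
output_ids = model.generate(
    prompt_ids,
    max_new_tokens=50,
    do_sample=True,
    pad_token_id=hf_tokenizer.pad_token_id,
)
print(hf_tokenizer.decode(output_ids[0], skip_special_tokens=True))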