from tokenizers import ByteLevelBPETokenizer
from transformers import (
    GPT2Config,
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    Trainer,
    TrainingArguments,
    TextDataset,
    DataCollatorForLanguageModeling,
)
import os

# Train a byte-level BPE tokenizer from scratch on the raw text corpus
tokenizer_dir = "./tokenizer"
os.makedirs(tokenizer_dir, exist_ok=True)

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files="train_data.txt",
    vocab_size=1000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)
tokenizer.save_model(tokenizer_dir)  # writes vocab.json and merges.txt

# Wrap the trained BPE files in a GPT-2 compatible fast tokenizer
hf_tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_dir)
hf_tokenizer.add_special_tokens({
    "pad_token": "<pad>",
    "bos_token": "<s>",
    "eos_token": "</s>",
})
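
# Optional sanity check (illustrative addition, not part of the original pipeline):
# round-trip a short string to confirm the tokenizer encodes and decodes cleanly.
sample_ids = hf_tokenizer.encode("hello world")
print(hf_tokenizer.decode(sample_ids))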

# Build the training dataset of fixed-length token blocks.
# Note: TextDataset is deprecated in recent transformers releases (the
# `datasets` library is the recommended replacement), but it still works
# for a simple single-file corpus like this one.
def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )


train_dataset = load_dataset("train_data.txt", hf_tokenizer)

# Define a small GPT-2 style model configuration from scratch
config = GPT2Config(
    vocab_size=hf_tokenizer.vocab_size,
    n_positions=2048,   # maximum sequence length
    n_ctx=2048,         # mirrors n_positions (used by older transformers versions)
    n_embd=1024,        # hidden size
    n_layer=12,         # number of transformer blocks
    n_head=2,           # attention heads (must divide n_embd evenly)
    bos_token_id=hf_tokenizer.bos_token_id,
    eos_token_id=hf_tokenizer.eos_token_id,
    pad_token_id=hf_tokenizer.pad_token_id,
)

# Initialise the model with random weights and resize the embedding matrix
# to the full tokenizer length (base vocabulary plus the added special tokens)
model = GPT2LMHeadModel(config)
model.resize_token_embeddings(len(hf_tokenizer))
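
# Illustrative addition: report the model size implied by the config above.
print(f"Parameters: {model.num_parameters():,}")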

# Collator for causal language modeling: with mlm=False the labels are a copy
# of the input ids (the model handles the shift internally)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=hf_tokenizer,
    mlm=False,
)

# Training configuration: outputs land in the current directory
training_args = TrainingArguments(
    output_dir=".",
    overwrite_output_dir=True,
    num_train_epochs=30,
    per_device_train_batch_size=4,
    save_total_limit=0,
    logging_steps=50,
    prediction_loss_only=True,
    report_to="none",  # disable logging integrations such as W&B / TensorBoard
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()

# Save the final model weights and the tokenizer alongside the script
trainer.save_model(".")
hf_tokenizer.save_pretrained(".")
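
# Illustrative inference sketch (an addition, not part of the original script):
# reload the saved weights and tokenizer, then sample a short continuation.
# The prompt string and generation settings here are arbitrary examples.
loaded_model = GPT2LMHeadModel.from_pretrained(".")
loaded_tokenizer = GPT2TokenizerFast.from_pretrained(".")
inputs = loaded_tokenizer("hello", return_tensors="pt")
outputs = loaded_model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    top_k=50,
    pad_token_id=loaded_tokenizer.pad_token_id,
)
print(loaded_tokenizer.decode(outputs[0], skip_special_tokens=True))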