Spaces:
Runtime error
Runtime error
File size: 997 Bytes
2b354eb f96e8fd ce79099 f96e8fd 2b354eb cbdb918 f96e8fd 2b354eb f96e8fd 2b354eb f96e8fd 2b354eb f96e8fd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
from datasets import load_dataset
from tokenizers import Tokenizer, models, pre_tokenizers, trainers
# Step 1: Stream the dataset and save it locally
# streaming=True makes load_dataset return an iterable that fetches articles
# lazily, instead of downloading and caching the whole dump up front. The
# corpus then only lands on disk once (in the text file written below).
dataset = load_dataset(
    "wikimedia/wikipedia", "20231101.en", split="train", streaming=True
)
# Save the article texts to a UTF-8 text file, one article per write,
# each terminated by a newline.
with open("wikipedia_data.txt", "w", encoding="utf-8") as file:
    for example in dataset:
        text = example.get("text")
        if text:  # skip rows that have no (or empty) 'text' field
            # Two writes avoid building a concatenated copy of each article.
            file.write(text)
            file.write("\n")
# Step 2: Initialize the tokenizer
# WordPiece model; anything it cannot represent is mapped to "[UNK]".
tokenizer = Tokenizer(model=models.WordPiece(unk_token="[UNK]"))
# Split input into words before the model sees it. Without a pre-tokenizer
# the text is not broken on whitespace/punctuation, so learned tokens can
# span several words and long inputs degrade to "[UNK]".
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
# Special tokens and trainer
# Listing the special tokens here reserves them in the vocabulary, in this order.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)
# Train the tokenizer using the local text file
tokenizer.train(["wikipedia_data.txt"], trainer=trainer)
# Step 3: Smoke-test the trained tokenizer on a pair of sentences
first_sentence = "Let's test this tokenizer..."
second_sentence = "on a pair of sentences."
# Passing two strings encodes them together as a sequence pair.
encoding = tokenizer.encode(first_sentence, second_sentence)
print("Token IDs:", encoding.ids)
|