Itsme5 committed · verified
Commit f96e8fd · 1 Parent(s): 1116008

Update app.py

Files changed (1)
  1. app.py +16 -4
app.py CHANGED
@@ -1,13 +1,25 @@
  from tokenizers import models, trainers, Tokenizer
+ from datasets import load_dataset
 
+ # Step 1: Download the dataset and save it locally
+ dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split="train")
+
+ # Save the dataset locally to a text file
+ with open("wikipedia_data.txt", "w", encoding="utf-8") as file:
+     for example in dataset:
+         if "text" in example:  # Ensure the 'text' column exists
+             file.write(example["text"] + "\n")
+
+ # Step 2: Initialize the tokenizer
  tokenizer = Tokenizer(model=models.WordPiece(unk_token="[UNK]"))
 
+ # Special tokens and trainer
  special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
  trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)
 
- tokenizer.train(["https://datasets-server.huggingface.co/rows?dataset=wikimedia%2Fwikipedia&config=20231101.en&split=train&offset=0&length=100"],
-                 trainer=trainer)
-
+ # Train the tokenizer using the local text file
+ tokenizer.train(["wikipedia_data.txt"], trainer=trainer)
 
+ # Step 3: Test the tokenizer
  encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences.")
- print(encoding.ids)
+ print("Token IDs:", encoding.ids)
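
Note (not part of this commit): the new code materializes the full 20231101.en Wikipedia split into wikipedia_data.txt before training. As a minimal sketch of an alternative, assuming the datasets library's streaming=True mode and the tokenizers Tokenizer.train_from_iterator method, the intermediate file can be skipped; the text_iterator helper name below is illustrative, not from the commit.

from tokenizers import models, trainers, Tokenizer
from datasets import load_dataset

# Stream the split instead of writing it to disk first (illustrative alternative).
dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split="train", streaming=True)

tokenizer = Tokenizer(model=models.WordPiece(unk_token="[UNK]"))
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

def text_iterator():
    # Yield article text one record at a time; skip records without a 'text' field.
    for example in dataset:
        if "text" in example:
            yield example["text"]

# train_from_iterator consumes any iterator of strings, so no temporary file is needed.
tokenizer.train_from_iterator(text_iterator(), trainer=trainer)

encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences.")
print("Token IDs:", encoding.ids)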