Itsme5 committed · verified
Commit f96e8fd · 1 Parent(s): 1116008

Update app.py

Files changed (1)
  1. app.py +16 -4
app.py CHANGED
@@ -1,13 +1,25 @@
  from tokenizers import models, trainers, Tokenizer
+ from datasets import load_dataset
 
+ # Step 1: Download the dataset and save it locally
+ dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split="train")
+
+ # Save the dataset locally to a text file
+ with open("wikipedia_data.txt", "w", encoding="utf-8") as file:
+     for example in dataset:
+         if "text" in example:  # Ensure the 'text' column exists
+             file.write(example["text"] + "\n")
+
+ # Step 2: Initialize the tokenizer
  tokenizer = Tokenizer(model=models.WordPiece(unk_token="[UNK]"))
 
+ # Special tokens and trainer
  special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
  trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)
 
- tokenizer.train(["https://datasets-server.huggingface.co/rows?dataset=wikimedia%2Fwikipedia&config=20231101.en&split=train&offset=0&length=100"],
-                 trainer=trainer)
-
+ # Train the tokenizer using the local text file
+ tokenizer.train(["wikipedia_data.txt"], trainer=trainer)
 
+ # Step 3: Test the tokenizer
  encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences.")
- print(encoding.ids)
+ print("Token IDs:", encoding.ids)
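
Note (not part of this commit): the new code materializes the full 20231101.en Wikipedia split into wikipedia_data.txt before training. As a minimal sketch of an alternative, assuming the datasets library's streaming=True mode and the tokenizers Tokenizer.train_from_iterator method, the intermediate file can be skipped; the text_iterator helper name below is illustrative, not from the commit.

from tokenizers import models, trainers, Tokenizer
from datasets import load_dataset

# Stream the split instead of writing it to disk first (illustrative alternative).
dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split="train", streaming=True)

tokenizer = Tokenizer(model=models.WordPiece(unk_token="[UNK]"))
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

def text_iterator():
    # Yield article text one record at a time; skip records without a 'text' field.
    for example in dataset:
        if "text" in example:
            yield example["text"]

# train_from_iterator consumes any iterator of strings, so no temporary file is needed.
tokenizer.train_from_iterator(text_iterator(), trainer=trainer)

encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences.")
print("Token IDs:", encoding.ids)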