Update train_tokenizer.py
train_tokenizer.py CHANGED (+6 -7)
@@ -1,22 +1,21 @@
+# -*- coding: utf-8 -*-
 from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
 
 def train_tokenizer(iterator, vocab_size=50000, min_frequency=3):
+    # Create the Tokenizer with a BPE model and a defined unknown token
     tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
 
-    # Normalization for Greek and
-    tokenizer.normalizer = normalizers.Sequence([
-        normalizers.NFC(),
-        normalizers.StripAccents()
-    ])
+    # Normalization for Greek and Unicode (preserves accents)
+    tokenizer.normalizer = normalizers.NFC()
 
-    #
+    # Pre-tokenizer for mixed texts (Greek and English)
     tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
         pre_tokenizers.WhitespaceSplit(),
         pre_tokenizers.Punctuation(),
         pre_tokenizers.Digits(individual_digits=True)
     ])
 
-    #
+    # Define special tokens (e.g. for historical texts)
     trainer = trainers.BpeTrainer(
         vocab_size=vocab_size,
         min_frequency=min_frequency,
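The substantive change in this commit is the normalizer: the old `normalizers.Sequence([NFC(), StripAccents()])` chain is replaced by plain `normalizers.NFC()`, presumably because accent stripping is lossy for Greek, where the tonos distinguishes words. A minimal sketch of what the new normalizer does, using the library's `normalize_str` helper on a made-up sample that is not from the commit:

from tokenizers import normalizers

# The normalizer after this commit: NFC composition only, no accent stripping.
normalizer = normalizers.NFC()

# NFC composes a base letter plus a combining tonos (U+0301) into one codepoint,
# so equivalent spellings normalize to the same form while the accent survives.
decomposed = "καλημε\u0301ρα"  # made-up sample: "good morning" in decomposed form
print(normalizer.normalize_str(decomposed))  # -> καλημέρα (accent preserved)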
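The pre-tokenizer stack itself is unchanged; it only gains a comment. To see how the three stages interact, here is a hypothetical probe with `pre_tokenize_str` on a mixed Greek/English string (the sample text is illustrative only):

from tokenizers import pre_tokenizers

pre_tok = pre_tokenizers.Sequence([
    pre_tokenizers.WhitespaceSplit(),               # split on whitespace first
    pre_tokenizers.Punctuation(),                   # then isolate punctuation marks
    pre_tokenizers.Digits(individual_digits=True),  # then split digits one by one
])

# Each element is a (piece, offsets) pair; offsets elided here for brevity.
pieces = [p for p, _ in pre_tok.pre_tokenize_str("Αθήνα 1821, hello!")]
print(pieces)  # ['Αθήνα', '1', '8', '2', '1', ',', 'hello', '!']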
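The diff cuts off inside the `BpeTrainer(...)` call, so the special-token list hinted at by the new comment is not visible. Purely as orientation, a sketch of how the rest of the function might plausibly read, with an assumed `special_tokens` list, an assumed `decoders.BPEDecoder()` tail (the `decoders` import at the top suggests one is used), and a toy training call; none of this tail is confirmed by the commit:

from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers

def train_tokenizer(iterator, vocab_size=50000, min_frequency=3):
    tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

    # Normalization for Greek and Unicode (preserves accents)
    tokenizer.normalizer = normalizers.NFC()

    # Pre-tokenizer for mixed texts (Greek and English)
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
        pre_tokenizers.WhitespaceSplit(),
        pre_tokenizers.Punctuation(),
        pre_tokenizers.Digits(individual_digits=True)
    ])

    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        # Assumed special tokens; the commit's comment mentions them but the
        # diff is truncated before the list itself.
        special_tokens=["<unk>", "<s>", "</s>", "<pad>"],
    )

    # Assumed tail: a BPE decoder (the file imports `decoders`) and training.
    tokenizer.decoder = decoders.BPEDecoder()
    tokenizer.train_from_iterator(iterator, trainer=trainer)
    return tokenizer

# Hypothetical usage with a tiny in-memory corpus
corpus = ["Καλημέρα κόσμε!", "Hello world 123"]
tok = train_tokenizer(iter(corpus), vocab_size=500, min_frequency=1)
print(tok.encode("Καλημέρα world 42").tokens)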