Update app.py
Browse files
app.py
CHANGED
@@ -16,7 +16,7 @@ from datetime import datetime
|
|
16 |
DetectorFactory.seed = 0
|
17 |
|
18 |
# Ρυθμίσεις checkpointing και αποθήκευσης του tokenizer
|
19 |
-
CHECKPOINT_FILE = "checkpoint.txt"
|
20 |
TOKENIZER_DIR = "tokenizer_model"
|
21 |
TOKENIZER_FILE = os.path.join(TOKENIZER_DIR, "tokenizer.json")
|
22 |
MAX_SAMPLES = 3000000 # Όριο δειγμάτων
|
@@ -64,11 +64,11 @@ def collect_samples(dataset_name, configs, split, chunk_size):
|
|
64 |
STOP_COLLECTION = False
|
65 |
total_processed = len(load_checkpoint())
|
66 |
progress_messages = [f"🚀 Ξεκινά η συλλογή δεδομένων... Υπάρχουν ήδη {total_processed} δείγματα στο checkpoint."]
|
67 |
-
|
68 |
print(progress_messages[-1])
|
69 |
|
70 |
dataset_iterator = create_iterator(dataset_name, configs, split)
|
71 |
new_texts = []
|
|
|
72 |
|
73 |
for text in dataset_iterator:
|
74 |
if STOP_COLLECTION:
|
@@ -79,7 +79,7 @@ def collect_samples(dataset_name, configs, split, chunk_size):
|
|
79 |
new_texts.append(text)
|
80 |
total_processed += 1
|
81 |
|
82 |
-
if len(new_texts) >=
|
83 |
append_to_checkpoint(new_texts)
|
84 |
progress_messages.append(f"✅ Αποθηκεύτηκαν {total_processed} δείγματα στο checkpoint.")
|
85 |
print(progress_messages[-1])
|
@@ -90,12 +90,16 @@ def collect_samples(dataset_name, configs, split, chunk_size):
|
|
90 |
print(progress_messages[-1])
|
91 |
break
|
92 |
|
|
|
|
|
|
|
|
|
93 |
if new_texts:
|
94 |
append_to_checkpoint(new_texts)
|
95 |
progress_messages.append(f"✅ Τελικό batch αποθηκεύτηκε ({total_processed} δείγματα).")
|
96 |
print(progress_messages[-1])
|
97 |
|
98 |
-
|
99 |
|
100 |
|
101 |
def train_tokenizer_fn(dataset_name, configs, split, vocab_size, min_freq, test_text):
|
@@ -157,7 +161,7 @@ with gr.Blocks() as demo:
|
|
157 |
dataset_name = gr.Textbox(value="wikimedia/wikipedia", label="Dataset Name")
|
158 |
configs = gr.Textbox(value="20231101.el,20231101.en", label="Configs")
|
159 |
split = gr.Dropdown(choices=["train"], value="train", label="Split")
|
160 |
-
chunk_size = gr.Slider(500, 50000, value=
|
161 |
vocab_size = gr.Slider(20000, 100000, value=50000, label="Vocabulary Size")
|
162 |
min_freq = gr.Slider(1, 100, value=3, label="Minimum Frequency")
|
163 |
test_text = gr.Textbox(value="Η Ακρόπολη είναι σύμβολο της αρχαίας Ελλάδας.", label="Test Text")
|
|
|
16 |
DetectorFactory.seed = 0
|
17 |
|
18 |
# Ρυθμίσεις checkpointing και αποθήκευσης του tokenizer
|
19 |
+
CHECKPOINT_FILE = "/persistent/checkpoint.txt" if os.path.exists("/persistent") else "checkpoint.txt"
|
20 |
TOKENIZER_DIR = "tokenizer_model"
|
21 |
TOKENIZER_FILE = os.path.join(TOKENIZER_DIR, "tokenizer.json")
|
22 |
MAX_SAMPLES = 3000000 # Όριο δειγμάτων
|
|
|
64 |
STOP_COLLECTION = False
|
65 |
total_processed = len(load_checkpoint())
|
66 |
progress_messages = [f"🚀 Ξεκινά η συλλογή δεδομένων... Υπάρχουν ήδη {total_processed} δείγματα στο checkpoint."]
|
|
|
67 |
print(progress_messages[-1])
|
68 |
|
69 |
dataset_iterator = create_iterator(dataset_name, configs, split)
|
70 |
new_texts = []
|
71 |
+
buffer_size = 100_000 # Προσωρινή αποθήκευση RAM
|
72 |
|
73 |
for text in dataset_iterator:
|
74 |
if STOP_COLLECTION:
|
|
|
79 |
new_texts.append(text)
|
80 |
total_processed += 1
|
81 |
|
82 |
+
if len(new_texts) >= buffer_size:
|
83 |
append_to_checkpoint(new_texts)
|
84 |
progress_messages.append(f"✅ Αποθηκεύτηκαν {total_processed} δείγματα στο checkpoint.")
|
85 |
print(progress_messages[-1])
|
|
|
90 |
print(progress_messages[-1])
|
91 |
break
|
92 |
|
93 |
+
# Ενημέρωση του progress κάθε 10.000 δείγματα
|
94 |
+
if total_processed % 10_000 == 0:
|
95 |
+
yield f"📊 Συλλογή σε εξέλιξη: {total_processed} δείγματα..."
|
96 |
+
|
97 |
if new_texts:
|
98 |
append_to_checkpoint(new_texts)
|
99 |
progress_messages.append(f"✅ Τελικό batch αποθηκεύτηκε ({total_processed} δείγματα).")
|
100 |
print(progress_messages[-1])
|
101 |
|
102 |
+
yield "\n".join(progress_messages)
|
103 |
|
104 |
|
105 |
def train_tokenizer_fn(dataset_name, configs, split, vocab_size, min_freq, test_text):
|
|
|
161 |
dataset_name = gr.Textbox(value="wikimedia/wikipedia", label="Dataset Name")
|
162 |
configs = gr.Textbox(value="20231101.el,20231101.en", label="Configs")
|
163 |
split = gr.Dropdown(choices=["train"], value="train", label="Split")
|
164 |
+
chunk_size = gr.Slider(500, 50000, value=10000, label="Chunk Size") # Μείωση chunk_size
|
165 |
vocab_size = gr.Slider(20000, 100000, value=50000, label="Vocabulary Size")
|
166 |
min_freq = gr.Slider(1, 100, value=3, label="Minimum Frequency")
|
167 |
test_text = gr.Textbox(value="Η Ακρόπολη είναι σύμβολο της αρχαίας Ελλάδας.", label="Test Text")
|