Update app.py
app.py
CHANGED
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
import os
import gradio as gr
import requests
import time
@@ -11,173 +12,220 @@ from tokenizers import Tokenizer
from langdetect import detect, DetectorFactory
from PIL import Image
from datetime import datetime

# For reproducibility of langdetect
DetectorFactory.seed = 0

-# Settings
-CHECKPOINT_FILE = "
-TOKENIZER_DIR =
TOKENIZER_FILE = os.path.join(TOKENIZER_DIR, "tokenizer.json")
-MAX_SAMPLES =

-# Global control variable
STOP_COLLECTION = False

# Startup logging
startup_log = f"===== Application Startup at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} =====\n"
print(startup_log)

-
def load_checkpoint():
-    """Load data from the checkpoint
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, "r", encoding="utf-8") as f:
            return f.read().splitlines()
    return []

-
def append_to_checkpoint(texts):
-    """Save data
    with open(CHECKPOINT_FILE, "a", encoding="utf-8") as f:
-
-
-

def create_iterator(dataset_name, configs, split):
-    """
    configs_list = [c.strip() for c in configs.split(",") if c.strip()]
    for config in configs_list:
        try:
-            dataset = load_dataset(
-
-
-
-
        except Exception as e:
-            print(f"⚠️ Σφάλμα
-
-
-
-
    global STOP_COLLECTION
    STOP_COLLECTION = False
    total_processed = len(load_checkpoint())
-
-
-
    dataset_iterator = create_iterator(dataset_name, configs, split)
-
-
-
-
-
-
-
            break
-
-
-        total_processed += 1
-
-        if len(new_texts) >= buffer_size:
-            append_to_checkpoint(new_texts)
-            progress_messages.append(f"✅ Αποθηκεύτηκαν {total_processed} δείγματα στο checkpoint.")
-            print(progress_messages[-1])
-            new_texts = []
-
-        if total_processed >= MAX_SAMPLES:
-            progress_messages.append("⚠️ Έφτασε το όριο δειγμάτων.")
-            print(progress_messages[-1])
            break
-
-
-        if total_processed % 10_000 == 0:
-            yield f"📊 Συλλογή σε εξέλιξη: {total_processed} δείγματα..."
-
-    if new_texts:
-        append_to_checkpoint(new_texts)
-        progress_messages.append(f"✅ Τελικό batch αποθηκεύτηκε ({total_processed} δείγματα).")
-        print(progress_messages[-1])
-
-    yield "\n".join(progress_messages)
-

def train_tokenizer_fn(dataset_name, configs, split, vocab_size, min_freq, test_text):
-    """
-    print("🚀
    all_texts = load_checkpoint()
-
-
-
    trained_tokenizer = Tokenizer.from_file(TOKENIZER_FILE)
-
-    # Test
    encoded = trained_tokenizer.encode(test_text)
    decoded = trained_tokenizer.decode(encoded.ids)
-
-    #
-
-
-
-
-    plt.ylabel('Συχνότητα')
-
-    # Save and convert the image
    img_buffer = BytesIO()
    plt.savefig(img_buffer, format='png')
    plt.close()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

# Gradio Interface
with gr.Blocks() as demo:
-    gr.Markdown("## Wikipedia Tokenizer Trainer
-
    with gr.Row():
-        with gr.Column():
-            dataset_name = gr.Textbox(value="wikimedia/wikipedia", label="Dataset
-            configs = gr.Textbox(value="20231101.el,20231101.en", label="
-            split = gr.Dropdown(
-            chunk_size = gr.Slider(
-            vocab_size = gr.Slider(20000,
            min_freq = gr.Slider(1, 100, value=3, label="Minimum Frequency")
            test_text = gr.Textbox(value="Η Ακρόπολη είναι σύμβολο της αρχαίας Ελλάδας.", label="Test Text")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
import os
+import gc
import gradio as gr
import requests
import time
@@ -11,173 +12,220 @@ from tokenizers import Tokenizer
from langdetect import detect, DetectorFactory
from PIL import Image
from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor

# For reproducibility of langdetect
DetectorFactory.seed = 0

+# Settings
+CHECKPOINT_FILE = "checkpoint.txt"
+TOKENIZER_DIR = os.getcwd()  # use the current working directory
+#TOKENIZER_DIR = "tokenizer_model"
TOKENIZER_FILE = os.path.join(TOKENIZER_DIR, "tokenizer.json")
+MAX_SAMPLES = 5000000  # increased sample limit
+DEFAULT_CHUNK_SIZE = 200000  # larger chunk size
+BATCH_SIZE = 1000  # batch size for loading data
+NUM_WORKERS = 4  # number of workers for multithreaded processing

+# Global control flag
STOP_COLLECTION = False

# Startup logging
startup_log = f"===== Application Startup at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} =====\n"
print(startup_log)

def load_checkpoint():
+    """Load data from the checkpoint."""
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, "r", encoding="utf-8") as f:
            return f.read().splitlines()
    return []

def append_to_checkpoint(texts):
+    """Save data in batches."""
    with open(CHECKPOINT_FILE, "a", encoding="utf-8") as f:
+        batch = "\n".join(texts) + "\n"
+        f.write(batch)

def create_iterator(dataset_name, configs, split):
+    """Improved iterator with batched loading and caching."""
    configs_list = [c.strip() for c in configs.split(",") if c.strip()]
+
    for config in configs_list:
        try:
+            dataset = load_dataset(
+                dataset_name,
+                name=config,
+                split=split,
+                streaming=True,
+                cache_dir="./dataset_cache"  # enable caching
+            )
+
+            # Load data in batches
+            while True:
+                batch = list(dataset.take(BATCH_SIZE))
+                if not batch:
+                    break
+                dataset = dataset.skip(BATCH_SIZE)
+
+                # Process the batch with multiple threads
+                with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
+                    processed_texts = list(executor.map(process_example, batch))
+
+                yield from filter(None, processed_texts)
+
        except Exception as e:
+            print(f"⚠️ Σφάλμα φόρτωσης: {config}: {e}")
+
+def process_example(example):
+    """Process a single example with a language check."""
+    try:
+        text = example.get('text', '').strip()
+        if text and detect(text) in ['el', 'en']:  # language filter
+            return text
+        return None
+    except Exception:
+        return None
+
+def collect_samples(dataset_name, configs, split, chunk_size, max_samples):
+    """Improved data collection with large chunks."""
    global STOP_COLLECTION
    STOP_COLLECTION = False
    total_processed = len(load_checkpoint())
+
+    progress_messages = [
+        f"🚀 Εκκίνηση συλλογής... Πρόοδος: {total_processed}/{max_samples}",
+        f"⚙️ Ρυθμίσεις: Chunk Size={chunk_size}, Workers={NUM_WORKERS}"
+    ]
+
    dataset_iterator = create_iterator(dataset_name, configs, split)
+    chunk = []
+
+    while not STOP_COLLECTION and total_processed < max_samples:
+        try:
+            # Fill a chunk
+            while len(chunk) < chunk_size:
+                text = next(dataset_iterator)
+                if text:
+                    chunk.append(text)
+                    total_processed += 1
+                    if total_processed >= max_samples:
+                        break
+
+            # Save the chunk
+            if chunk:
+                append_to_checkpoint(chunk)
+                progress_messages.append(
+                    f"✅ Αποθηκεύτηκαν {len(chunk)} δείγματα (Σύνολο: {total_processed})"
+                )
+                chunk = []
+
+            # Free memory
+            gc.collect()
+
+        except StopIteration:
+            if chunk:  # flush a partially filled chunk before finishing
+                append_to_checkpoint(chunk)
+            progress_messages.append("🏁 Ολοκληρώθηκε η επεξεργασία όλων των δεδομένων!")
            break
+        except Exception as e:
+            progress_messages.append(f"⛔ Σφάλμα: {str(e)}")
            break
+
+    return "\n".join(progress_messages)

def train_tokenizer_fn(dataset_name, configs, split, vocab_size, min_freq, test_text):
+    """Improved tokenizer training that reuses the checkpoint data."""
+    print("🚀 Εκκίνηση εκπαίδευσης...")
    all_texts = load_checkpoint()
+
+    # Parallel processing for training
+    tokenizer = train_tokenizer(
+        all_texts,
+        vocab_size=vocab_size,
+        min_frequency=min_freq,
+        output_dir=TOKENIZER_DIR,
+        num_threads=NUM_WORKERS  # parallel processing
+    )
+
+    # Load and test the tokenizer
    trained_tokenizer = Tokenizer.from_file(TOKENIZER_FILE)
    encoded = trained_tokenizer.encode(test_text)
    decoded = trained_tokenizer.decode(encoded.ids)
+
+    # Plot the token-length distribution
+    fig, ax = plt.subplots()
+    ax.hist([len(t) for t in encoded.tokens], bins=20)
+    ax.set_xlabel('Μήκος Token')
+    ax.set_ylabel('Συχνότητα')
    img_buffer = BytesIO()
    plt.savefig(img_buffer, format='png')
    plt.close()
+
+    print(f"Ο tokenizer αποθηκεύτηκε στον φάκελο: {TOKENIZER_DIR}")
+    return ("✅ Εκπαίδευση ολοκληρώθηκε!", decoded, Image.open(img_buffer))
+
+def analyze_checkpoint():
+    """New data-analysis helper."""
+    texts = load_checkpoint()
+    if not texts:
+        return "Δεν βρέθηκαν δεδομένα για ανάλυση."
+
+    # Basic statistics
+    total_chars = sum(len(t) for t in texts)
+    avg_length = total_chars / len(texts) if texts else 0
+
+    # Language analysis on a sample (for speed)
+    sample = texts[:1000]
+    languages = {}
+    for t in sample:
+        try:
+            lang = detect(t)
+            languages[lang] = languages.get(lang, 0) + 1
+        except Exception:
+            continue
+
+    report = [
+        f"📊 Σύνολο δειγμάτων: {len(texts)}",
+        f"📝 Μέσο μήκος: {avg_length:.1f} χαρακτήρες",
+        f"🌍 Γλώσσες (δείγμα {len(sample)}):",
+        *[f"- {k}: {v} ({100 * v / len(sample):.1f}%)" for k, v in languages.items()]
+    ]
+
+    return "\n".join(report)

# Gradio Interface
with gr.Blocks() as demo:
+    gr.Markdown("## Βελτιωμένος Wikipedia Tokenizer Trainer")
+
    with gr.Row():
+        with gr.Column(scale=2):
+            dataset_name = gr.Textbox(value="wikimedia/wikipedia", label="Dataset")
+            configs = gr.Textbox(value="20231101.el,20231101.en", label="Configurations")
+            split = gr.Dropdown(["train"], value="train", label="Split")
+            chunk_size = gr.Slider(10000, 500000, value=200000, step=10000, label="Chunk Size")
+            vocab_size = gr.Slider(20000, 200000, value=50000, step=10000, label="Vocabulary Size")
            min_freq = gr.Slider(1, 100, value=3, label="Minimum Frequency")
            test_text = gr.Textbox(value="Η Ακρόπολη είναι σύμβολο της αρχαίας Ελλάδας.", label="Test Text")
+            max_samples = gr.Slider(10000, 10000000, value=5000000, step=100000, label="Maximum Samples")
+
+            with gr.Row():
+                start_btn = gr.Button("Start", variant="primary")
+                stop_btn = gr.Button("Stop", variant="stop")
+                restart_btn = gr.Button("Restart")
+
+            analyze_btn = gr.Button("Analyze Data")
+            train_btn = gr.Button("Train Tokenizer", variant="primary")
+
+        with gr.Column(scale=3):
+            progress = gr.Textbox(label="Πρόοδος", lines=10, interactive=False)
+            gr.Markdown("### Αποτελέσματα")
+            decoded_text = gr.Textbox(label="Αποκωδικοποιημένο Κείμενο")
+            token_distribution = gr.Image(label="Κατανομή Tokens")
+
+    # Event handlers
+    start_btn.click(collect_samples, [dataset_name, configs, split, chunk_size, max_samples], progress)
+    stop_btn.click(lambda: "⏹️ Διακοπή συλλογής...", None, progress, queue=False)
+    restart_btn.click(lambda: "🔄 Επαναφορά...", None, progress).then(restart_collection, None, progress)
+    analyze_btn.click(analyze_checkpoint, None, progress)
+    train_btn.click(train_tokenizer_fn, [dataset_name, configs, split, vocab_size, min_freq, test_text],
+                    [progress, decoded_text, token_distribution])
+
+demo.queue(concurrency_count=4).launch()
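Note: train_tokenizer_fn calls train_tokenizer(...), and load_dataset, plt and BytesIO are also used, but their imports and definition sit in the unchanged lines 7-11 that this diff does not show. Purely as a sketch of what such a helper could look like (an assumption, not the committed implementation), a train_tokenizer matching the call site above can be written with the tokenizers library; it relies on the module's existing "import os", and the num_threads argument is accepted only to mirror the call, since the trainer parallelises internally.

from tokenizers import Tokenizer, models, pre_tokenizers, trainers

def train_tokenizer(texts, vocab_size, min_frequency, output_dir, num_threads=None):
    # Hypothetical helper mirroring the call in train_tokenizer_fn above.
    # num_threads is unused here; the Rust-backed trainer manages its own threads.
    tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=["<unk>", "<pad>", "<s>", "</s>"],
    )
    tokenizer.train_from_iterator(texts, trainer=trainer)
    os.makedirs(output_dir, exist_ok=True)
    # Saving as tokenizer.json matches TOKENIZER_FILE, which train_tokenizer_fn reloads.
    tokenizer.save(os.path.join(output_dir, "tokenizer.json"))
    return tokenizer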
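Likewise, restart_btn is wired to restart_collection, and the Stop button only posts a message without touching STOP_COLLECTION; neither handler appears in the shown hunk. If they are not defined elsewhere in the file, handlers along these lines (an assumption, not part of the commit) would make the buttons functional:

def stop_collection():
    # Hypothetical handler: collect_samples checks this flag on every loop iteration.
    global STOP_COLLECTION
    STOP_COLLECTION = True
    return "⏹️ Διακοπή συλλογής..."

def restart_collection():
    # Hypothetical handler: clear the flag and the checkpoint so collection starts fresh.
    global STOP_COLLECTION
    STOP_COLLECTION = False
    if os.path.exists(CHECKPOINT_FILE):
        os.remove(CHECKPOINT_FILE)
    return "🔄 Επαναφορά..."

# e.g. stop_btn.click(stop_collection, None, progress, queue=False)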