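"""Gradio app for training a subword tokenizer on a streamed Hugging Face
dataset (Wikipedia by default), optionally mixed with user-uploaded text
files, then inspecting the result on a test sentence.

Relies on a local `train_tokenizer` module whose `train_tokenizer` function
is expected to take `(text_iterator, vocab_size, min_freq)` and return an
object with a `.save(path)` method, such as a `tokenizers.Tokenizer`.
"""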
import gradio as gr
import requests
import tempfile
import os
from io import BytesIO

import matplotlib.pyplot as plt
from PIL import Image  # used to hand the rendered histogram to gr.Image

from datasets import load_dataset
from tokenizers import Tokenizer

from train_tokenizer import train_tokenizer

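# The Hub's datasets-server enumerates the configs and splits of a dataset.
# The JSON consumed below looks roughly like this (abridged):
#   {"splits": [{"dataset": "wikimedia/wikipedia",
#                "config": "20231101.el",
#                "split": "train"}, ...]}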
def fetch_splits(dataset_name):
    try:
        response = requests.get(
            f"https://datasets-server.huggingface.co/splits?dataset={dataset_name}",
            timeout=10
        )
        response.raise_for_status()
        data = response.json()

        # Group split names by config, e.g. {"20231101.el": ["train"]}.
        splits_info = {}
        for split in data['splits']:
            config = split['config']
            split_name = split['split']
            if config not in splits_info:
                splits_info[config] = []
            splits_info[config].append(split_name)

        return {
            "splits": splits_info,
            "viewer_template": f"https://huggingface.co/datasets/{dataset_name}/embed/viewer/{{config}}/{{split}}"
        }
    except Exception as e:
        raise gr.Error(f"Error fetching splits: {str(e)}")

def update_components(dataset_name):
    if not dataset_name:
        return [gr.update(choices=[], value=None), gr.update(choices=[]), gr.update(value="")]

    try:
        splits_data = fetch_splits(dataset_name)
        config_choices = list(splits_data['splits'].keys())

        first_config = config_choices[0] if config_choices else None
        iframe_html = f"""
        <iframe
            src="{splits_data['viewer_template'].format(config=first_config, split='train')}"
            frameborder="0"
            width="100%"
            height="560"
        ></iframe>
        """ if first_config else "No data available"

        return [
            gr.update(choices=config_choices, value=first_config),
            gr.update(choices=splits_data['splits'].get(first_config, [])),
            gr.update(value=iframe_html)
        ]
    except Exception as e:
        raise gr.Error(f"Error: {str(e)}")

def update_split_choices(dataset_name, config):
    if not dataset_name or not config:
        return gr.update(choices=[])

    try:
        splits_data = fetch_splits(dataset_name)
        return gr.update(choices=splits_data['splits'].get(config, []))
    except Exception:
        return gr.update(choices=[])

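# `streaming=True` yields examples lazily over HTTP instead of downloading
# the whole dataset first, so even a full Wikipedia dump can feed the
# tokenizer trainer without requiring local disk space for the corpus.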
def create_iterator(dataset_name, config, split):
    try:
        dataset = load_dataset(
            dataset_name,
            name=config,
            split=split,
            streaming=True
        )
        for example in dataset:
            yield example.get('text', '')
    except Exception as e:
        raise gr.Error(f"Error loading dataset: {str(e)}")

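# End-to-end pipeline: stream dataset text (plus any uploaded files), train
# a tokenizer, round-trip it through a JSON file, then encode the test
# sentence and plot the distribution of token lengths.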
def train_and_test(dataset_name, config, split, vocab_size, min_freq, test_text,
                   custom_files, progress=gr.Progress()):
    try:
        dataset_iterator = create_iterator(dataset_name, config, split)

        def combined_iterator():
            # Dataset text first, then the contents of any uploaded files.
            for text in dataset_iterator:
                if text:
                    yield text

            if custom_files:
                for file_path in custom_files:
                    try:
                        with open(file_path, 'r', encoding='utf-8') as f:
                            content = f.read()
                            if content:
                                yield content
                    except Exception as file_error:
                        print(f"Error reading file {file_path}: {file_error}")

        progress(0.2, desc="Training tokenizer...")
        tokenizer = train_tokenizer(combined_iterator(), vocab_size, min_freq)

        # Round-trip through a JSON file to verify the tokenizer serializes.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".json") as f:
            tokenizer.save(f.name)
        trained_tokenizer = Tokenizer.from_file(f.name)
        os.unlink(f.name)

        encoded = trained_tokenizer.encode(test_text)
        decoded = trained_tokenizer.decode(encoded.ids)

        # Histogram of token lengths in the encoded test text.
        token_lengths = [len(t) for t in encoded.tokens]
        fig = plt.figure()
        plt.hist(token_lengths, bins=20)
        plt.xlabel('Token length')
        plt.ylabel('Frequency')
        img_buffer = BytesIO()
        plt.savefig(img_buffer, format='png')
        plt.close(fig)
        img_buffer.seek(0)

        return {
            "Original Text": test_text,
            "Decoded": decoded,
            "Token Count": len(encoded.tokens),
            "Unknown Tokens": sum(1 for t in encoded.tokens if t == "<unk>")
        }, Image.open(img_buffer)

    except Exception as e:
        raise gr.Error(f"Training error: {str(e)}")

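# UI layout: inputs on the left, dataset preview and results on the right.
# Note: component updates above use gr.update(...), which works on both
# Gradio 3.x and 4.x; the older gr.Dropdown.update(...) form was removed
# in Gradio 4.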
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## Wikipedia Tokenizer Trainer")

    with gr.Row():
        with gr.Column():
            dataset_name = gr.Textbox(
                label="Dataset Name",
                value="wikimedia/wikipedia",
                placeholder="e.g. 'wikimedia/wikipedia'"
            )
            config = gr.Dropdown(
                label="Config (e.g. '20231101.el' for Greek or '20231101.en' for English)",
                choices=[],
                interactive=True
            )
            split = gr.Dropdown(
                label="Split",
                choices=[],
                value="train"
            )
            vocab_size = gr.Slider(20000, 100000, value=50000, label="Vocabulary Size")
            min_freq = gr.Slider(1, 100, value=3, label="Minimum Frequency")
            test_text = gr.Textbox(
                # Greek sample: "The Acropolis is a symbol of ancient Greek
                # cultural heritage."
                value='Η Ακρόπολη είναι σύμβολο της αρχαίας ελληνικής πολιτισμικής κληρονομιάς.',
                label="Test Text"
            )
            custom_files = gr.File(
                label="Custom Greek Texts",
                file_count="multiple",
                type="filepath"  # return plain paths so train_and_test can open() them
            )
            train_btn = gr.Button("Train", variant="primary")

        with gr.Column():
            preview = gr.HTML(label="Dataset Preview")
            results_json = gr.JSON(label="Results")
            results_plot = gr.Image(label="Token Length Distribution")

    dataset_name.change(
        fn=update_components,
        inputs=dataset_name,
        outputs=[config, split, preview]
    )

    config.change(
        fn=update_split_choices,
        inputs=[dataset_name, config],
        outputs=split
    )

    train_btn.click(
        fn=train_and_test,
        inputs=[dataset_name, config, split, vocab_size, min_freq, test_text, custom_files],
        outputs=[results_json, results_plot]
    )

if __name__ == "__main__":
    demo.queue().launch()  # queuing is required for gr.Progress tracking