tymbos committed on
Commit
0430da2
·
verified ·
1 Parent(s): 5e7614b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -0
app.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ from io import BytesIO
5
+ from train_tokenizer import train_tokenizer
6
+ from tokenizers import Tokenizer
7
+ from datasets import load_dataset
8
+ import tempfile
9
+ import os
10
+
11
def create_iterator(files=None, dataset_name=None, split="train", streaming=True):
    """Lazily yield training text from a dataset or from uploaded files.

    When ``dataset_name`` is truthy it takes precedence: the dataset is loaded
    with ``load_dataset`` and the ``'text'`` field of every example is yielded.
    Otherwise each entry of ``files`` is opened as UTF-8 text and yielded line
    by line with surrounding whitespace stripped. With neither source the
    generator yields nothing.

    Args:
        files: Iterable of uploads. Each entry may be a path (``str`` or
            ``os.PathLike``) or any object exposing the path as ``.name``.
        dataset_name: Dataset identifier passed to ``load_dataset``.
        split: Dataset split passed to ``load_dataset``.
        streaming: Forwarded to ``load_dataset`` as ``streaming``.

    Yields:
        One text string per dataset example or per file line.
    """
    if dataset_name:
        dataset = load_dataset(dataset_name, split=split, streaming=streaming)
        for example in dataset:
            yield example['text']
    elif files:
        for file in files:
            # Plain paths are used as-is; ``Path.name`` is only the basename,
            # so path-like objects must not go through the ``.name`` lookup.
            if isinstance(file, (str, os.PathLike)):
                path = file
            else:
                path = file.name
            with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    yield line.strip()
21
+
22
def enhanced_validation(tokenizer, test_text):
    """Encode ``test_text`` with ``tokenizer`` and summarise the result.

    Args:
        tokenizer: Object providing ``encode(text)`` (whose result exposes
            ``ids`` and ``tokens``) and ``decode(ids)``.
        test_text: Sample text to tokenize.

    Returns:
        dict with the keys:
            ``roundtrip_success``: whether decoding the ids reproduces
                ``test_text`` exactly.
            ``unknown_tokens``: count and percentage of ``"<unk>"`` tokens,
                formatted as a string.
            ``average_token_length``: mean token length, formatted as a string.
            ``code_coverage``: for each probe symbol, ``True`` only if the
                symbol occurs in ``test_text`` AND appears as a standalone
                token (so a symbol absent from the text reports ``False``).
            ``token_length_distribution``: PNG bytes of a token-length
                histogram.
    """
    encoded = tokenizer.encode(test_text)
    decoded = tokenizer.decode(encoded.ids)
    tokens = encoded.tokens

    # Unknown-token analysis
    unknown_tokens = sum(1 for t in tokens if t == "<unk>")
    unknown_percent = unknown_tokens / len(tokens) * 100 if tokens else 0

    # Token length distribution
    token_lengths = [len(t) for t in tokens]
    avg_length = np.mean(token_lengths) if token_lengths else 0

    # Code-symbol coverage check
    code_symbols = ['{', '}', '(', ')', ';', '//', 'printf']
    code_coverage = {sym: sym in test_text and sym in tokens for sym in code_symbols}

    # Build the histogram on an explicit figure/axes pair instead of pyplot's
    # implicit "current figure", so overlapping requests cannot draw into each
    # other's plot, and always release the figure even if rendering fails.
    fig, ax = plt.subplots()
    try:
        ax.hist(token_lengths, bins=20)
        ax.set_xlabel('Token Length')
        ax.set_ylabel('Frequency')
        img_buffer = BytesIO()
        fig.savefig(img_buffer, format='png')
    finally:
        plt.close(fig)

    return {
        "roundtrip_success": test_text == decoded,
        "unknown_tokens": f"{unknown_tokens} ({unknown_percent:.2f}%)",
        "average_token_length": f"{avg_length:.2f}",
        "code_coverage": code_coverage,
        "token_length_distribution": img_buffer.getvalue()
    }
54
+
55
def train_and_test(files, dataset_name, split, vocab_size, min_freq, test_text):
    """Train a tokenizer from the chosen corpus and validate it on ``test_text``.

    This is the click handler for the training button; its return value feeds
    the two output components (JSON metrics, histogram image) in that order.

    Args:
        files: Uploaded files, or a falsy value when none were provided.
        dataset_name: Dataset identifier; takes precedence over ``files``
            when non-empty.
        split: Dataset split name; blank falls back to ``"train"``.
        vocab_size: Target vocabulary size (cast to ``int``).
        min_freq: Minimum frequency threshold (cast to ``int``).
        test_text: Text used for the post-training validation.

    Returns:
        A ``(metrics, histogram)`` tuple: ``metrics`` is the validation dict
        without the image entry, ``histogram`` is the token-length histogram
        decoded into an image array.

    Raises:
        gr.Error: If neither files nor a dataset name is given, or if
            training fails.
    """
    dataset_name = (dataset_name or "").strip()
    split = (split or "").strip() or "train"

    # Input validation
    if not files and not dataset_name:
        raise gr.Error("Πρέπει να παρέχετε αρχεία ή όνομα dataset!")

    # Lazy (streaming) iterator over the training text
    iterator = create_iterator(files, dataset_name, split)

    try:
        # Slider values can arrive as floats; the trainer parameters are counts.
        tokenizer = train_tokenizer(iterator, int(vocab_size), int(min_freq))
    except Exception as e:
        raise gr.Error(f"Σφάλμα εκπαίδευσης: {str(e)}") from e

    # Save and reload so validation runs on the serialized tokenizer.
    # The handle is closed before use (an open handle blocks re-opening and
    # deleting on some platforms) and the file is removed even on failure.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
    tmp.close()
    try:
        tokenizer.save(tmp.name)
        trained_tokenizer = Tokenizer.from_file(tmp.name)
    finally:
        os.unlink(tmp.name)

    # Extended validation
    validation = enhanced_validation(trained_tokenizer, test_text)

    metrics = {k: v for k, v in validation.items() if k != "token_length_distribution"}
    # The image output cannot take raw PNG bytes; decode them into an array.
    histogram = plt.imread(BytesIO(validation["token_length_distribution"]), format="png")

    # One value per output component, in order (a dict keyed by plain strings
    # is not a valid return value for a multi-output event handler).
    return metrics, histogram
81
+
82
# Gradio interface: inputs (corpus source, training parameters, test text) in
# the left column, validation results in the right column.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## Προχωρημένος BPE Tokenizer Trainer")

    with gr.Row():
        with gr.Column():
            # Two alternative corpus sources; a non-empty dataset name takes
            # precedence over uploaded files in the training handler.
            with gr.Tab("Local Files"):
                file_input = gr.File(file_count="multiple", label="Ανέβασμα αρχείων")
            with gr.Tab("Hugging Face Dataset"):
                dataset_name = gr.Textbox(label="Όνομα Dataset (π.χ. 'wikitext', 'codeparrot/github-code')")
                split = gr.Textbox(value="train", label="Split")

            vocab_size = gr.Slider(1000, 100000, value=32000, label="Μέγεθος Λεξιλογίου")
            # step=1 keeps the minimum frequency an integer; without an
            # explicit step the slider derives a fractional one from the range.
            min_freq = gr.Slider(1, 100, value=2, step=1, label="Ελάχιστη Συχνότητα")
            test_text = gr.Textbox(
                value='function helloWorld() { console.log("Γειά σου Κόσμε!"); } // Ελληνικά + κώδικας',
                label="Test Text"
            )
            train_btn = gr.Button("Εκπαίδευση Tokenizer", variant="primary")

        with gr.Column():
            results_json = gr.JSON(label="Μετρικές")
            results_plot = gr.Image(label="Κατανομή Μηκών Tokens")

    # Wire the button: six inputs in handler-argument order, two outputs.
    train_btn.click(
        fn=train_and_test,
        inputs=[file_input, dataset_name, split, vocab_size, min_freq, test_text],
        outputs=[results_json, results_plot]
    )

if __name__ == "__main__":
    demo.launch()