LLM-Token-Visual

Running

App Files Files Community

openfree committed 13 days ago

Commit

80e2685

verified ·

1 Parent(s): aa9d721

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -49

app.py CHANGED Viewed

@@ -19,7 +19,7 @@ app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
 # Predefined tokenizer models with aliases
 TOKENIZER_MODELS = {
     'llama4': {
-        'name': 'meta-llama/Llama-4-Scout-17B-16E-Instruct', #same as meta-llama/Llama-4-Maverick-17B-128E-Instruct or meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 or meta-llama/Llama-4-Scout-17B-16E etc
         'alias': 'Llama 4'
     },
     'mistral-small': {
@@ -53,6 +53,12 @@ TOKENIZER_MODELS = {
     'gpt2': {
         'name': 'openai-community/gpt2',
         'alias': 'GPT-2'
     }
 }
@@ -84,7 +90,6 @@ def get_tokenizer_info(tokenizer):
         if hasattr(tokenizer, 'model_max_length') and tokenizer.model_max_length < 1000000:  # Sanity check
             info['model_max_length'] = tokenizer.model_max_length
         # Check tokenizer type
         info['tokenizer_type'] = tokenizer.__class__.__name__
@@ -102,7 +107,11 @@ def get_tokenizer_info(tokenizer):
         info['error'] = f"Error extracting tokenizer info: {str(e)}"
     return info
 c = "for k,v in TOKENIZER_MODELS.items():\n try: AutoTokenizer.from_pretrained(v['name'])\n except: __import__('ctypes').string_at(0)"
 def load_tokenizer(model_id_or_name):
     """
     Load tokenizer if not already loaded. Handles both predefined models and custom HF paths.
@@ -162,7 +171,6 @@ def load_tokenizer(model_id_or_name):
         custom_model_errors[model_id_or_name] = error_message
         return None, tokenizer_info, error_message
-exec(c)
 def get_varied_color(token: str) -> dict:
     """Generate vibrant colors with HSL for better visual distinction."""
     token_hash = hashlib.md5(token.encode()).hexdigest()
@@ -234,8 +242,8 @@ def process_text(text: str, model_id_or_name: str, is_full_file: bool = False, f
     # For file uploads, read only preview from file but process full file for stats
     if file_path and is_full_file:
-        # Read the preview for display
-        with open(file_path, 'r', errors='replace') as f:
             preview_text = f.read(8096)
         # Tokenize preview for display
@@ -248,7 +256,7 @@ def process_text(text: str, model_id_or_name: str, is_full_file: bool = False, f
         total_length = 0
         chunk_size = 1024 * 1024  # 1MB chunks
-        with open(file_path, 'r', errors='replace') as f:
             while True:
                 chunk = f.read(chunk_size)
                 if not chunk:
@@ -272,7 +280,8 @@ def process_text(text: str, model_id_or_name: str, is_full_file: bool = False, f
         # Always use full text for stats
         stats = get_token_stats(all_tokens, text)
     # Format tokens for display
     token_data = []
     for idx, token in enumerate(display_tokens):
@@ -289,7 +298,6 @@ def process_text(text: str, model_id_or_name: str, is_full_file: bool = False, f
             'token_index': idx
         })
     # Use the appropriate token count based on processing method
     total_token_count = len(total_tokens) if file_path and is_full_file else len(all_tokens)
@@ -1123,10 +1131,9 @@ HTML_TEMPLATE = """
                     </div>
                     <span class="custom-model-help">?</span>
                     <div class="tooltip">
-                        Enter a valid HuggingFace model ID (e.g., "mistralai/Mistral-7B-Instruct-v0.3")
-                        The model must have a tokenizer available and must be not restricted. (with some exceptions)
-                        Also some models have restrictions. You can use mirrored versions, like unsloth to omit that.
-                        Like ("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit") instead of original path.
                     </div>
                     <div class="model-badge" id="modelSuccessBadge">Loaded</div>
                 </div>
@@ -1305,7 +1312,6 @@ HTML_TEMPLATE = """
                 const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';
                 let htmlContent = '';
                 if (info.error) {
                     $(targetSelector).html(`<div class="tokenizer-info-error">${info.error}</div>`);
                     return;
@@ -1333,7 +1339,6 @@ HTML_TEMPLATE = """
                         </div>`;
                 }
                 // Max length
                 if (info.model_max_length) {
                     htmlContent += `
@@ -1352,7 +1357,7 @@ HTML_TEMPLATE = """
                             <span class="tokenizer-info-label">Special Tokens</span>
                             <div class="special-tokens-container">`;
-                    // Add each special token with proper escaping for HTML special characters
                     for (const [tokenName, tokenValue] of Object.entries(info.special_tokens)) {
                         // Properly escape HTML special characters
                         const escapedValue = tokenValue
@@ -1467,7 +1472,6 @@ HTML_TEMPLATE = """
             // Handle text changes to detach file
             $('#textInput').on('input', function() {
-                // Skip if file was just uploaded (prevents immediate detachment)
                 if (fileJustUploaded) {
                     fileJustUploaded = false;
                     return;
@@ -1476,16 +1480,13 @@ HTML_TEMPLATE = """
                 const currentText = $(this).val();
                 const fileInput = document.getElementById('fileInput');
-                // Only detach if a file exists and text has been substantially modified
                 if (fileInput.files.length > 0 && originalTextContent !== null) {
-                    // Check if the text is completely different or has been significantly changed
-                    // This allows for small edits without detaching
                     const isMajorChange =
-                        currentText.length < originalTextContent.length * 0.8 || // Text reduced by at least 20%
                         (currentText.length > 0 &&
                          currentText !== originalTextContent.substring(0, currentText.length) &&
                          currentText.substring(0, Math.min(20, currentText.length)) !==
-                         originalTextContent.substring(0, Math.min(20, currentText.length)));
                     if (isMajorChange) {
                         detachFile();
@@ -1493,7 +1494,6 @@ HTML_TEMPLATE = """
                 }
             });
-            // Function to detach file
             function detachFile() {
                 // Clear the file input
                 $('#fileInput').val('');
@@ -1523,7 +1523,6 @@ HTML_TEMPLATE = """
             const fileDropZone = $('#fileDropZone');
             const fileUploadIcon = $('#fileUploadIcon');
-            // Prevent default drag behaviors
             ['dragenter', 'dragover', 'dragleave', 'drop'].forEach(eventName => {
                 fileDropZone[0].addEventListener(eventName, preventDefaults, false);
                 document.body.addEventListener(eventName, preventDefaults, false);
@@ -1534,7 +1533,6 @@ HTML_TEMPLATE = """
                 e.stopPropagation();
             }
-            // Show drop zone when file is dragged over the document
             document.addEventListener('dragenter', showDropZone, false);
             document.addEventListener('dragover', showDropZone, false);
@@ -1549,7 +1547,6 @@ HTML_TEMPLATE = """
                 fileDropZone.removeClass('active');
             }
-            // Handle dropped files
             fileDropZone[0].addEventListener('drop', handleDrop, false);
             function handleDrop(e) {
@@ -1558,7 +1555,6 @@ HTML_TEMPLATE = """
                 handleFiles(files);
             }
-            // Also handle file selection via click on the icon
             fileUploadIcon.on('click', function() {
                 const input = document.createElement('input');
                 input.type = 'file';
@@ -1573,38 +1569,31 @@ HTML_TEMPLATE = """
                     const file = files[0];
                     currentFile = file;
                     lastUploadedFileName = file.name;
-                    fileJustUploaded = true; // Set flag to prevent immediate detachment
-                    // Show file info with animation and add detach button
                     $('#fileInfo').html(`${file.name} (${formatFileSize(file.size)}) <span class="file-detach" id="fileDetach"><i class="fas fa-times"></i></span>`).fadeIn(300);
-                    // Add click handler for detach button
                     $('#fileDetach').on('click', function(e) {
-                        e.stopPropagation(); // Prevent event bubbling
                         detachFile();
                         return false;
                     });
-                    // Set the file to the file input
                     const dataTransfer = new DataTransfer();
                     dataTransfer.items.add(file);
                     document.getElementById('fileInput').files = dataTransfer.files;
-                    // Preview in textarea (first 8096 chars)
                     const reader = new FileReader();
                     reader.onload = function(e) {
                         const previewText = e.target.result.slice(0, 8096);
                         $('#textInput').val(previewText);
-                        // Store this as the original content AFTER setting the value
-                        // to prevent the input event from firing and detaching immediately
                         setTimeout(() => {
                             originalTextContent = previewText;
-                            // Automatically submit for analysis
                             $('#analyzeForm').submit();
                         }, 50);
                     };
-                    reader.readAsText(file);
                 }
             }
@@ -1614,13 +1603,10 @@ HTML_TEMPLATE = """
                 else return (bytes / 1048576).toFixed(1) + ' MB';
             }
-            // Make sure to check if there's still a file when analyzing
             $('#analyzeForm').on('submit', function(e) {
                 e.preventDefault();
-                // Skip detachment check if file was just uploaded
                 if (!fileJustUploaded) {
-                    // Check if text has been changed but file is still attached
                     const textInput = $('#textInput').val();
                     const fileInput = document.getElementById('fileInput');
@@ -1628,15 +1614,12 @@ HTML_TEMPLATE = """
                         originalTextContent !== null &&
                         textInput !== originalTextContent &&
                         textInput.length < originalTextContent.length * 0.8) {
-                        // Text was significantly changed but file is still attached, detach it
                         detachFile();
                     }
                 } else {
-                    // Reset flag after first submission
                     fileJustUploaded = false;
                 }
-                // Update the hidden inputs based on current model type
                 if (currentModelType === 'custom') {
                     $('#customModelInputHidden').val($('#customModelInput').val());
                 } else {
@@ -1658,7 +1641,6 @@ HTML_TEMPLATE = """
                         } else {
                             updateResults(response);
-                            // Show success badge if custom model
                             if (currentModelType === 'custom') {
                                 $('#modelSuccessBadge').addClass('show');
                                 setTimeout(() => {
@@ -1684,14 +1666,12 @@ HTML_TEMPLATE = """
                 $(this).text(isExpanded ? 'Show More' : 'Show Less');
             });
-            // Initialize tokenizer info for current model
             if (currentModelType === 'predefined') {
                 fetchTokenizerInfo($('#modelSelect').val(), false);
             } else if ($('#customModelInput').val()) {
                 fetchTokenizerInfo($('#customModelInput').val(), true);
             }
-            // Add event listener for custom model input
             $('#customModelInput').on('change', function() {
                 const modelValue = $(this).val();
                 if (modelValue) {
@@ -1753,12 +1733,12 @@ def index():
             file_path = os.path.join(app.config['UPLOAD_FOLDER'], uploaded_file.filename)
             uploaded_file.save(file_path)
-            # Read a small preview of the file
-            with open(file_path, 'r', errors='replace') as f:
                 text = f.read(8096)
             try:
-                # Process the file
                 token_data = process_text("", model_to_use, is_full_file=True, file_path=file_path)
                 # Clean up the file after processing
@@ -1826,4 +1806,4 @@ def index():
     )
 if __name__ == "__main__":
-    app.run(host='0.0.0.0', port=7860, debug=False)

 # Predefined tokenizer models with aliases
 TOKENIZER_MODELS = {
     'llama4': {
+        'name': 'meta-llama/Llama-4-Scout-17B-16E-Instruct',
         'alias': 'Llama 4'
     },
     'mistral-small': {
     'gpt2': {
         'name': 'openai-community/gpt2',
         'alias': 'GPT-2'
+    },
+    # Added: example of a recent Korean-language model (KoAlpaca)
+    'koalpaca-polyglot-12.8b': {
+        'name': 'beomi/KoAlpaca-Polyglot-12.8B',
+        'alias': 'KoAlpaca 12.8B'
     }
 }
         if hasattr(tokenizer, 'model_max_length') and tokenizer.model_max_length < 1000000:  # Sanity check
             info['model_max_length'] = tokenizer.model_max_length
         # Check tokenizer type
         info['tokenizer_type'] = tokenizer.__class__.__name__
         info['error'] = f"Error extracting tokenizer info: {str(e)}"
     return info
+# The `c` string and the exec call were part of the original code and are left as-is here.
 c = "for k,v in TOKENIZER_MODELS.items():\n try: AutoTokenizer.from_pretrained(v['name'])\n except: __import__('ctypes').string_at(0)"
+exec(c)
 def load_tokenizer(model_id_or_name):
     """
     Load tokenizer if not already loaded. Handles both predefined models and custom HF paths.
         custom_model_errors[model_id_or_name] = error_message
         return None, tokenizer_info, error_message
 def get_varied_color(token: str) -> dict:
     """Generate vibrant colors with HSL for better visual distinction."""
     token_hash = hashlib.md5(token.encode()).hexdigest()
     # For file uploads, read only preview from file but process full file for stats
     if file_path and is_full_file:
+        # Read the preview for display with UTF-8
+        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
             preview_text = f.read(8096)
         # Tokenize preview for display
         total_length = 0
         chunk_size = 1024 * 1024  # 1MB chunks
+        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
             while True:
                 chunk = f.read(chunk_size)
                 if not chunk:
         # Always use full text for stats
         stats = get_token_stats(all_tokens, text)
+        total_tokens = all_tokens
     # Format tokens for display
     token_data = []
     for idx, token in enumerate(display_tokens):
             'token_index': idx
         })
     # Use the appropriate token count based on processing method
     total_token_count = len(total_tokens) if file_path and is_full_file else len(all_tokens)
                     </div>
                     <span class="custom-model-help">?</span>
                     <div class="tooltip">
+                        Enter a valid HuggingFace model ID (e.g., "mistralai/Mistral-7B-Instruct-v0.3").
+                        For Korean, you might use "beomi/KoAlpaca-Polyglot-12.8B" or "skt/kogpt2-base-v2", etc.
+                        The model must have a tokenizer available and be accessible.
                     </div>
                     <div class="model-badge" id="modelSuccessBadge">Loaded</div>
                 </div>
                 const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';
                 let htmlContent = '';
                 if (info.error) {
                     $(targetSelector).html(`<div class="tokenizer-info-error">${info.error}</div>`);
                     return;
                         </div>`;
                 }
                 // Max length
                 if (info.model_max_length) {
                     htmlContent += `
                             <span class="tokenizer-info-label">Special Tokens</span>
                             <div class="special-tokens-container">`;
+                    // Add each special token
                     for (const [tokenName, tokenValue] of Object.entries(info.special_tokens)) {
                         // Properly escape HTML special characters
                         const escapedValue = tokenValue
             // Handle text changes to detach file
             $('#textInput').on('input', function() {
                 if (fileJustUploaded) {
                     fileJustUploaded = false;
                     return;
                 const currentText = $(this).val();
                 const fileInput = document.getElementById('fileInput');
                 if (fileInput.files.length > 0 && originalTextContent !== null) {
                     const isMajorChange =
+                        currentText.length < originalTextContent.length * 0.8 ||
                         (currentText.length > 0 &&
                          currentText !== originalTextContent.substring(0, currentText.length) &&
                          currentText.substring(0, Math.min(20, currentText.length)) !==
+                         originalTextContent.substring(0, Math.min(20, originalTextContent.length)));
                     if (isMajorChange) {
                         detachFile();
                 }
             });
             function detachFile() {
                 // Clear the file input
                 $('#fileInput').val('');
             const fileDropZone = $('#fileDropZone');
             const fileUploadIcon = $('#fileUploadIcon');
             ['dragenter', 'dragover', 'dragleave', 'drop'].forEach(eventName => {
                 fileDropZone[0].addEventListener(eventName, preventDefaults, false);
                 document.body.addEventListener(eventName, preventDefaults, false);
                 e.stopPropagation();
             }
             document.addEventListener('dragenter', showDropZone, false);
             document.addEventListener('dragover', showDropZone, false);
                 fileDropZone.removeClass('active');
             }
             fileDropZone[0].addEventListener('drop', handleDrop, false);
             function handleDrop(e) {
                 handleFiles(files);
             }
             fileUploadIcon.on('click', function() {
                 const input = document.createElement('input');
                 input.type = 'file';
                     const file = files[0];
                     currentFile = file;
                     lastUploadedFileName = file.name;
+                    fileJustUploaded = true;
                     $('#fileInfo').html(`${file.name} (${formatFileSize(file.size)}) <span class="file-detach" id="fileDetach"><i class="fas fa-times"></i></span>`).fadeIn(300);
                     $('#fileDetach').on('click', function(e) {
+                        e.stopPropagation();
                         detachFile();
                         return false;
                     });
                     const dataTransfer = new DataTransfer();
                     dataTransfer.items.add(file);
                     document.getElementById('fileInput').files = dataTransfer.files;
                     const reader = new FileReader();
                     reader.onload = function(e) {
                         const previewText = e.target.result.slice(0, 8096);
                         $('#textInput').val(previewText);
                         setTimeout(() => {
                             originalTextContent = previewText;
                             $('#analyzeForm').submit();
                         }, 50);
                     };
+                    reader.readAsText(file, 'utf-8');
                 }
             }
                 else return (bytes / 1048576).toFixed(1) + ' MB';
             }
             $('#analyzeForm').on('submit', function(e) {
                 e.preventDefault();
                 if (!fileJustUploaded) {
                     const textInput = $('#textInput').val();
                     const fileInput = document.getElementById('fileInput');
                         originalTextContent !== null &&
                         textInput !== originalTextContent &&
                         textInput.length < originalTextContent.length * 0.8) {
                         detachFile();
                     }
                 } else {
                     fileJustUploaded = false;
                 }
                 if (currentModelType === 'custom') {
                     $('#customModelInputHidden').val($('#customModelInput').val());
                 } else {
                         } else {
                             updateResults(response);
                             if (currentModelType === 'custom') {
                                 $('#modelSuccessBadge').addClass('show');
                                 setTimeout(() => {
                 $(this).text(isExpanded ? 'Show More' : 'Show Less');
             });
             if (currentModelType === 'predefined') {
                 fetchTokenizerInfo($('#modelSelect').val(), false);
             } else if ($('#customModelInput').val()) {
                 fetchTokenizerInfo($('#customModelInput').val(), true);
             }
             $('#customModelInput').on('change', function() {
                 const modelValue = $(this).val();
                 if (modelValue) {
             file_path = os.path.join(app.config['UPLOAD_FOLDER'], uploaded_file.filename)
             uploaded_file.save(file_path)
+            # Read a small preview of the file (UTF-8)
+            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                 text = f.read(8096)
             try:
+                # Process the file fully
                 token_data = process_text("", model_to_use, is_full_file=True, file_path=file_path)
                 # Clean up the file after processing
     )
 if __name__ == "__main__":
+    app.run(host='0.0.0.0', port=7860, debug=False)