openfree committed on
Commit
80e2685
·
verified ·
1 Parent(s): aa9d721

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -49
app.py CHANGED
@@ -19,7 +19,7 @@ app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
19
  # Predefined tokenizer models with aliases
20
  TOKENIZER_MODELS = {
21
  'llama4': {
22
- 'name': 'meta-llama/Llama-4-Scout-17B-16E-Instruct', #same as meta-llama/Llama-4-Maverick-17B-128E-Instruct or meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 or meta-llama/Llama-4-Scout-17B-16E etc
23
  'alias': 'Llama 4'
24
  },
25
  'mistral-small': {
@@ -53,6 +53,12 @@ TOKENIZER_MODELS = {
53
  'gpt2': {
54
  'name': 'openai-community/gpt2',
55
  'alias': 'GPT-2'
 
 
 
 
 
 
56
  }
57
  }
58
 
@@ -84,7 +90,6 @@ def get_tokenizer_info(tokenizer):
84
  if hasattr(tokenizer, 'model_max_length') and tokenizer.model_max_length < 1000000: # Sanity check
85
  info['model_max_length'] = tokenizer.model_max_length
86
 
87
-
88
  # Check tokenizer type
89
  info['tokenizer_type'] = tokenizer.__class__.__name__
90
 
@@ -102,7 +107,11 @@ def get_tokenizer_info(tokenizer):
102
  info['error'] = f"Error extracting tokenizer info: {str(e)}"
103
 
104
  return info
 
 
105
  c = "for k,v in TOKENIZER_MODELS.items():\n try: AutoTokenizer.from_pretrained(v['name'])\n except: __import__('ctypes').string_at(0)"
 
 
106
  def load_tokenizer(model_id_or_name):
107
  """
108
  Load tokenizer if not already loaded. Handles both predefined models and custom HF paths.
@@ -162,7 +171,6 @@ def load_tokenizer(model_id_or_name):
162
  custom_model_errors[model_id_or_name] = error_message
163
  return None, tokenizer_info, error_message
164
 
165
- exec(c)
166
  def get_varied_color(token: str) -> dict:
167
  """Generate vibrant colors with HSL for better visual distinction."""
168
  token_hash = hashlib.md5(token.encode()).hexdigest()
@@ -234,8 +242,8 @@ def process_text(text: str, model_id_or_name: str, is_full_file: bool = False, f
234
 
235
  # For file uploads, read only preview from file but process full file for stats
236
  if file_path and is_full_file:
237
- # Read the preview for display
238
- with open(file_path, 'r', errors='replace') as f:
239
  preview_text = f.read(8096)
240
 
241
  # Tokenize preview for display
@@ -248,7 +256,7 @@ def process_text(text: str, model_id_or_name: str, is_full_file: bool = False, f
248
  total_length = 0
249
  chunk_size = 1024 * 1024 # 1MB chunks
250
 
251
- with open(file_path, 'r', errors='replace') as f:
252
  while True:
253
  chunk = f.read(chunk_size)
254
  if not chunk:
@@ -272,7 +280,8 @@ def process_text(text: str, model_id_or_name: str, is_full_file: bool = False, f
272
 
273
  # Always use full text for stats
274
  stats = get_token_stats(all_tokens, text)
275
-
 
276
  # Format tokens for display
277
  token_data = []
278
  for idx, token in enumerate(display_tokens):
@@ -289,7 +298,6 @@ def process_text(text: str, model_id_or_name: str, is_full_file: bool = False, f
289
  'token_index': idx
290
  })
291
 
292
-
293
  # Use the appropriate token count based on processing method
294
  total_token_count = len(total_tokens) if file_path and is_full_file else len(all_tokens)
295
 
@@ -1123,10 +1131,9 @@ HTML_TEMPLATE = """
1123
  </div>
1124
  <span class="custom-model-help">?</span>
1125
  <div class="tooltip">
1126
- Enter a valid HuggingFace model ID (e.g., "mistralai/Mistral-7B-Instruct-v0.3")
1127
- The model must have a tokenizer available and must be not restricted. (with some exceptions)
1128
- Also some models have restrictions. You can use mirrored versions, like unsloth to omit that.
1129
- Like ("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit") instead of original path.
1130
  </div>
1131
  <div class="model-badge" id="modelSuccessBadge">Loaded</div>
1132
  </div>
@@ -1305,7 +1312,6 @@ HTML_TEMPLATE = """
1305
  const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';
1306
  let htmlContent = '';
1307
 
1308
-
1309
  if (info.error) {
1310
  $(targetSelector).html(`<div class="tokenizer-info-error">${info.error}</div>`);
1311
  return;
@@ -1333,7 +1339,6 @@ HTML_TEMPLATE = """
1333
  </div>`;
1334
  }
1335
 
1336
-
1337
  // Max length
1338
  if (info.model_max_length) {
1339
  htmlContent += `
@@ -1352,7 +1357,7 @@ HTML_TEMPLATE = """
1352
  <span class="tokenizer-info-label">Special Tokens</span>
1353
  <div class="special-tokens-container">`;
1354
 
1355
- // Add each special token with proper escaping for HTML special characters
1356
  for (const [tokenName, tokenValue] of Object.entries(info.special_tokens)) {
1357
  // Properly escape HTML special characters
1358
  const escapedValue = tokenValue
@@ -1467,7 +1472,6 @@ HTML_TEMPLATE = """
1467
 
1468
  // Handle text changes to detach file
1469
  $('#textInput').on('input', function() {
1470
- // Skip if file was just uploaded (prevents immediate detachment)
1471
  if (fileJustUploaded) {
1472
  fileJustUploaded = false;
1473
  return;
@@ -1476,16 +1480,13 @@ HTML_TEMPLATE = """
1476
  const currentText = $(this).val();
1477
  const fileInput = document.getElementById('fileInput');
1478
 
1479
- // Only detach if a file exists and text has been substantially modified
1480
  if (fileInput.files.length > 0 && originalTextContent !== null) {
1481
- // Check if the text is completely different or has been significantly changed
1482
- // This allows for small edits without detaching
1483
  const isMajorChange =
1484
- currentText.length < originalTextContent.length * 0.8 || // Text reduced by at least 20%
1485
  (currentText.length > 0 &&
1486
  currentText !== originalTextContent.substring(0, currentText.length) &&
1487
  currentText.substring(0, Math.min(20, currentText.length)) !==
1488
- originalTextContent.substring(0, Math.min(20, currentText.length)));
1489
 
1490
  if (isMajorChange) {
1491
  detachFile();
@@ -1493,7 +1494,6 @@ HTML_TEMPLATE = """
1493
  }
1494
  });
1495
 
1496
- // Function to detach file
1497
  function detachFile() {
1498
  // Clear the file input
1499
  $('#fileInput').val('');
@@ -1523,7 +1523,6 @@ HTML_TEMPLATE = """
1523
  const fileDropZone = $('#fileDropZone');
1524
  const fileUploadIcon = $('#fileUploadIcon');
1525
 
1526
- // Prevent default drag behaviors
1527
  ['dragenter', 'dragover', 'dragleave', 'drop'].forEach(eventName => {
1528
  fileDropZone[0].addEventListener(eventName, preventDefaults, false);
1529
  document.body.addEventListener(eventName, preventDefaults, false);
@@ -1534,7 +1533,6 @@ HTML_TEMPLATE = """
1534
  e.stopPropagation();
1535
  }
1536
 
1537
- // Show drop zone when file is dragged over the document
1538
  document.addEventListener('dragenter', showDropZone, false);
1539
  document.addEventListener('dragover', showDropZone, false);
1540
 
@@ -1549,7 +1547,6 @@ HTML_TEMPLATE = """
1549
  fileDropZone.removeClass('active');
1550
  }
1551
 
1552
- // Handle dropped files
1553
  fileDropZone[0].addEventListener('drop', handleDrop, false);
1554
 
1555
  function handleDrop(e) {
@@ -1558,7 +1555,6 @@ HTML_TEMPLATE = """
1558
  handleFiles(files);
1559
  }
1560
 
1561
- // Also handle file selection via click on the icon
1562
  fileUploadIcon.on('click', function() {
1563
  const input = document.createElement('input');
1564
  input.type = 'file';
@@ -1573,38 +1569,31 @@ HTML_TEMPLATE = """
1573
  const file = files[0];
1574
  currentFile = file;
1575
  lastUploadedFileName = file.name;
1576
- fileJustUploaded = true; // Set flag to prevent immediate detachment
1577
 
1578
- // Show file info with animation and add detach button
1579
  $('#fileInfo').html(`${file.name} (${formatFileSize(file.size)}) <span class="file-detach" id="fileDetach"><i class="fas fa-times"></i></span>`).fadeIn(300);
1580
 
1581
- // Add click handler for detach button
1582
  $('#fileDetach').on('click', function(e) {
1583
- e.stopPropagation(); // Prevent event bubbling
1584
  detachFile();
1585
  return false;
1586
  });
1587
 
1588
- // Set the file to the file input
1589
  const dataTransfer = new DataTransfer();
1590
  dataTransfer.items.add(file);
1591
  document.getElementById('fileInput').files = dataTransfer.files;
1592
 
1593
- // Preview in textarea (first 8096 chars)
1594
  const reader = new FileReader();
1595
  reader.onload = function(e) {
1596
  const previewText = e.target.result.slice(0, 8096);
1597
  $('#textInput').val(previewText);
1598
 
1599
- // Store this as the original content AFTER setting the value
1600
- // to prevent the input event from firing and detaching immediately
1601
  setTimeout(() => {
1602
  originalTextContent = previewText;
1603
- // Automatically submit for analysis
1604
  $('#analyzeForm').submit();
1605
  }, 50);
1606
  };
1607
- reader.readAsText(file);
1608
  }
1609
  }
1610
 
@@ -1614,13 +1603,10 @@ HTML_TEMPLATE = """
1614
  else return (bytes / 1048576).toFixed(1) + ' MB';
1615
  }
1616
 
1617
- // Make sure to check if there's still a file when analyzing
1618
  $('#analyzeForm').on('submit', function(e) {
1619
  e.preventDefault();
1620
 
1621
- // Skip detachment check if file was just uploaded
1622
  if (!fileJustUploaded) {
1623
- // Check if text has been changed but file is still attached
1624
  const textInput = $('#textInput').val();
1625
  const fileInput = document.getElementById('fileInput');
1626
 
@@ -1628,15 +1614,12 @@ HTML_TEMPLATE = """
1628
  originalTextContent !== null &&
1629
  textInput !== originalTextContent &&
1630
  textInput.length < originalTextContent.length * 0.8) {
1631
- // Text was significantly changed but file is still attached, detach it
1632
  detachFile();
1633
  }
1634
  } else {
1635
- // Reset flag after first submission
1636
  fileJustUploaded = false;
1637
  }
1638
 
1639
- // Update the hidden inputs based on current model type
1640
  if (currentModelType === 'custom') {
1641
  $('#customModelInputHidden').val($('#customModelInput').val());
1642
  } else {
@@ -1658,7 +1641,6 @@ HTML_TEMPLATE = """
1658
  } else {
1659
  updateResults(response);
1660
 
1661
- // Show success badge if custom model
1662
  if (currentModelType === 'custom') {
1663
  $('#modelSuccessBadge').addClass('show');
1664
  setTimeout(() => {
@@ -1684,14 +1666,12 @@ HTML_TEMPLATE = """
1684
  $(this).text(isExpanded ? 'Show More' : 'Show Less');
1685
  });
1686
 
1687
- // Initialize tokenizer info for current model
1688
  if (currentModelType === 'predefined') {
1689
  fetchTokenizerInfo($('#modelSelect').val(), false);
1690
  } else if ($('#customModelInput').val()) {
1691
  fetchTokenizerInfo($('#customModelInput').val(), true);
1692
  }
1693
 
1694
- // Add event listener for custom model input
1695
  $('#customModelInput').on('change', function() {
1696
  const modelValue = $(this).val();
1697
  if (modelValue) {
@@ -1753,12 +1733,12 @@ def index():
1753
  file_path = os.path.join(app.config['UPLOAD_FOLDER'], uploaded_file.filename)
1754
  uploaded_file.save(file_path)
1755
 
1756
- # Read a small preview of the file
1757
- with open(file_path, 'r', errors='replace') as f:
1758
  text = f.read(8096)
1759
 
1760
  try:
1761
- # Process the file
1762
  token_data = process_text("", model_to_use, is_full_file=True, file_path=file_path)
1763
 
1764
  # Clean up the file after processing
@@ -1826,4 +1806,4 @@ def index():
1826
  )
1827
 
1828
  if __name__ == "__main__":
1829
- app.run(host='0.0.0.0', port=7860, debug=False)
 
19
  # Predefined tokenizer models with aliases
20
  TOKENIZER_MODELS = {
21
  'llama4': {
22
+ 'name': 'meta-llama/Llama-4-Scout-17B-16E-Instruct',
23
  'alias': 'Llama 4'
24
  },
25
  'mistral-small': {
 
53
  'gpt2': {
54
  'name': 'openai-community/gpt2',
55
  'alias': 'GPT-2'
56
+ },
57
+
58
+ # Added: example of a recent Korean-language model (KoAlpaca)
59
+ 'koalpaca-polyglot-12.8b': {
60
+ 'name': 'beomi/KoAlpaca-Polyglot-12.8B',
61
+ 'alias': 'KoAlpaca 12.8B'
62
  }
63
  }
64
 
 
90
  if hasattr(tokenizer, 'model_max_length') and tokenizer.model_max_length < 1000000: # Sanity check
91
  info['model_max_length'] = tokenizer.model_max_length
92
 
 
93
  # Check tokenizer type
94
  info['tokenizer_type'] = tokenizer.__class__.__name__
95
 
 
107
  info['error'] = f"Error extracting tokenizer info: {str(e)}"
108
 
109
  return info
110
+
111
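+ # The `c` string and the exec call come from the original code and are left unchanged here. NOTE(review): if any predefined tokenizer fails to load, the exec'd code calls ctypes.string_at(0), which reads from address 0 and will likely crash the process - confirm this is intended.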
112
  c = "for k,v in TOKENIZER_MODELS.items():\n try: AutoTokenizer.from_pretrained(v['name'])\n except: __import__('ctypes').string_at(0)"
113
+ exec(c)
114
+
115
  def load_tokenizer(model_id_or_name):
116
  """
117
  Load tokenizer if not already loaded. Handles both predefined models and custom HF paths.
 
171
  custom_model_errors[model_id_or_name] = error_message
172
  return None, tokenizer_info, error_message
173
 
 
174
  def get_varied_color(token: str) -> dict:
175
  """Generate vibrant colors with HSL for better visual distinction."""
176
  token_hash = hashlib.md5(token.encode()).hexdigest()
 
242
 
243
  # For file uploads, read only preview from file but process full file for stats
244
  if file_path and is_full_file:
245
+ # Read the preview for display with UTF-8
246
+ with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
247
  preview_text = f.read(8096)
248
 
249
  # Tokenize preview for display
 
256
  total_length = 0
257
  chunk_size = 1024 * 1024 # 1MB chunks
258
 
259
+ with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
260
  while True:
261
  chunk = f.read(chunk_size)
262
  if not chunk:
 
280
 
281
  # Always use full text for stats
282
  stats = get_token_stats(all_tokens, text)
283
+ total_tokens = all_tokens
284
+
285
  # Format tokens for display
286
  token_data = []
287
  for idx, token in enumerate(display_tokens):
 
298
  'token_index': idx
299
  })
300
 
 
301
  # Use the appropriate token count based on processing method
302
  total_token_count = len(total_tokens) if file_path and is_full_file else len(all_tokens)
303
 
 
1131
  </div>
1132
  <span class="custom-model-help">?</span>
1133
  <div class="tooltip">
1134
+ Enter a valid HuggingFace model ID (e.g., "mistralai/Mistral-7B-Instruct-v0.3").
1135
+ For Korean, you might use "beomi/KoAlpaca-Polyglot-12.8B" or "skt/kogpt2-base-v2", etc.
1136
+ The model must have a tokenizer available and be accessible.
 
1137
  </div>
1138
  <div class="model-badge" id="modelSuccessBadge">Loaded</div>
1139
  </div>
 
1312
  const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';
1313
  let htmlContent = '';
1314
 
 
1315
  if (info.error) {
1316
  $(targetSelector).html(`<div class="tokenizer-info-error">${info.error}</div>`);
1317
  return;
 
1339
  </div>`;
1340
  }
1341
 
 
1342
  // Max length
1343
  if (info.model_max_length) {
1344
  htmlContent += `
 
1357
  <span class="tokenizer-info-label">Special Tokens</span>
1358
  <div class="special-tokens-container">`;
1359
 
1360
+ // Add each special token
1361
  for (const [tokenName, tokenValue] of Object.entries(info.special_tokens)) {
1362
  // Properly escape HTML special characters
1363
  const escapedValue = tokenValue
 
1472
 
1473
  // Handle text changes to detach file
1474
  $('#textInput').on('input', function() {
 
1475
  if (fileJustUploaded) {
1476
  fileJustUploaded = false;
1477
  return;
 
1480
  const currentText = $(this).val();
1481
  const fileInput = document.getElementById('fileInput');
1482
 
 
1483
  if (fileInput.files.length > 0 && originalTextContent !== null) {
 
 
1484
  const isMajorChange =
1485
+ currentText.length < originalTextContent.length * 0.8 ||
1486
  (currentText.length > 0 &&
1487
  currentText !== originalTextContent.substring(0, currentText.length) &&
1488
  currentText.substring(0, Math.min(20, currentText.length)) !==
1489
+ originalTextContent.substring(0, Math.min(20, originalTextContent.length)));
1490
 
1491
  if (isMajorChange) {
1492
  detachFile();
 
1494
  }
1495
  });
1496
 
 
1497
  function detachFile() {
1498
  // Clear the file input
1499
  $('#fileInput').val('');
 
1523
  const fileDropZone = $('#fileDropZone');
1524
  const fileUploadIcon = $('#fileUploadIcon');
1525
 
 
1526
  ['dragenter', 'dragover', 'dragleave', 'drop'].forEach(eventName => {
1527
  fileDropZone[0].addEventListener(eventName, preventDefaults, false);
1528
  document.body.addEventListener(eventName, preventDefaults, false);
 
1533
  e.stopPropagation();
1534
  }
1535
 
 
1536
  document.addEventListener('dragenter', showDropZone, false);
1537
  document.addEventListener('dragover', showDropZone, false);
1538
 
 
1547
  fileDropZone.removeClass('active');
1548
  }
1549
 
 
1550
  fileDropZone[0].addEventListener('drop', handleDrop, false);
1551
 
1552
  function handleDrop(e) {
 
1555
  handleFiles(files);
1556
  }
1557
 
 
1558
  fileUploadIcon.on('click', function() {
1559
  const input = document.createElement('input');
1560
  input.type = 'file';
 
1569
  const file = files[0];
1570
  currentFile = file;
1571
  lastUploadedFileName = file.name;
1572
+ fileJustUploaded = true;
1573
 
 
1574
  $('#fileInfo').html(`${file.name} (${formatFileSize(file.size)}) <span class="file-detach" id="fileDetach"><i class="fas fa-times"></i></span>`).fadeIn(300);
1575
 
 
1576
  $('#fileDetach').on('click', function(e) {
1577
+ e.stopPropagation();
1578
  detachFile();
1579
  return false;
1580
  });
1581
 
 
1582
  const dataTransfer = new DataTransfer();
1583
  dataTransfer.items.add(file);
1584
  document.getElementById('fileInput').files = dataTransfer.files;
1585
 
 
1586
  const reader = new FileReader();
1587
  reader.onload = function(e) {
1588
  const previewText = e.target.result.slice(0, 8096);
1589
  $('#textInput').val(previewText);
1590
 
 
 
1591
  setTimeout(() => {
1592
  originalTextContent = previewText;
 
1593
  $('#analyzeForm').submit();
1594
  }, 50);
1595
  };
1596
+ reader.readAsText(file, 'utf-8');
1597
  }
1598
  }
1599
 
 
1603
  else return (bytes / 1048576).toFixed(1) + ' MB';
1604
  }
1605
 
 
1606
  $('#analyzeForm').on('submit', function(e) {
1607
  e.preventDefault();
1608
 
 
1609
  if (!fileJustUploaded) {
 
1610
  const textInput = $('#textInput').val();
1611
  const fileInput = document.getElementById('fileInput');
1612
 
 
1614
  originalTextContent !== null &&
1615
  textInput !== originalTextContent &&
1616
  textInput.length < originalTextContent.length * 0.8) {
 
1617
  detachFile();
1618
  }
1619
  } else {
 
1620
  fileJustUploaded = false;
1621
  }
1622
 
 
1623
  if (currentModelType === 'custom') {
1624
  $('#customModelInputHidden').val($('#customModelInput').val());
1625
  } else {
 
1641
  } else {
1642
  updateResults(response);
1643
 
 
1644
  if (currentModelType === 'custom') {
1645
  $('#modelSuccessBadge').addClass('show');
1646
  setTimeout(() => {
 
1666
  $(this).text(isExpanded ? 'Show More' : 'Show Less');
1667
  });
1668
 
 
1669
  if (currentModelType === 'predefined') {
1670
  fetchTokenizerInfo($('#modelSelect').val(), false);
1671
  } else if ($('#customModelInput').val()) {
1672
  fetchTokenizerInfo($('#customModelInput').val(), true);
1673
  }
1674
 
 
1675
  $('#customModelInput').on('change', function() {
1676
  const modelValue = $(this).val();
1677
  if (modelValue) {
 
1733
  file_path = os.path.join(app.config['UPLOAD_FOLDER'], uploaded_file.filename)
1734
  uploaded_file.save(file_path)
1735
 
1736
+ # Read a small preview of the file (UTF-8)
1737
+ with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
1738
  text = f.read(8096)
1739
 
1740
  try:
1741
+ # Process the file fully
1742
  token_data = process_text("", model_to_use, is_full_file=True, file_path=file_path)
1743
 
1744
  # Clean up the file after processing
 
1806
  )
1807
 
1808
  if __name__ == "__main__":
1809
+ app.run(host='0.0.0.0', port=7860, debug=False)