openfree committed · Commit 623f1f7 · verified · 1 Parent(s): 80e2685

Update app.py

Files changed (1)
  1. app.py +25 -16

app.py CHANGED
@@ -184,12 +184,18 @@ def get_varied_color(token: str) -> dict:
         'text': f'hsl({hue}, {saturation}%, {text_lightness}%)'
     }
 
-def fix_token(token: str) -> str:
-    """Fix token for display with improved space visualization."""
-    if token.startswith('Ġ'):
-        space_count = token.count('Ġ')
-        return '·' * space_count + token[space_count:]
-    return token
+def fix_token(token: str, tokenizer) -> str:
+    """
+    Before the token is shown in the UI, decode it into a
+    human-readable form via tokenizer.decode().
+    """
+    if not token.strip():
+        return token
+
+    # Look up the ID for this (sub-word) token, then decode it back
+    token_id = tokenizer.convert_tokens_to_ids(token)
+    decoded = tokenizer.decode([token_id], clean_up_tokenization_spaces=False)
+    return decoded
 
 def get_token_stats(tokens: list, original_text: str) -> dict:
     """Calculate enhanced statistics about the tokens."""
@@ -286,14 +292,23 @@ def process_text(text: str, model_id_or_name: str, is_full_file: bool = False, f
     token_data = []
     for idx, token in enumerate(display_tokens):
         colors = get_varied_color(token)
-        fixed_token = fix_token(token)
+        # Replace the raw token with its decoded form
+        decoded_token = fix_token(token, tokenizer)
+
         # Compute the numerical token ID from the tokenizer
         token_id = tokenizer.convert_tokens_to_ids(token)
+
+        # Detect a newline simply by checking whether decoded_token ends with '\n' (adjust as needed)
+        newline_flag = decoded_token.endswith('\n')
+
+        # Display string for the UI (trailing '\n' stripped, etc.)
+        display_str = decoded_token[:-1] if newline_flag else decoded_token
+
         token_data.append({
-            'original': token,
-            'display': fixed_token[:-1] if fixed_token.endswith('Ċ') else fixed_token,
+            'original': token,       # raw token
+            'display': display_str,  # human-readable decoded token
             'colors': colors,
-            'newline': fixed_token.endswith('Ċ'),
+            'newline': newline_flag,
             'token_id': token_id,
             'token_index': idx
         })
@@ -1549,12 +1564,6 @@ HTML_TEMPLATE = """
 
     fileDropZone[0].addEventListener('drop', handleDrop, false);
 
-    function handleDrop(e) {
-        const dt = e.dataTransfer;
-        const files = dt.files;
-        handleFiles(files);
-    }
-
     fileUploadIcon.on('click', function() {
         const input = document.createElement('input');
         input.type = 'file';
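
Below is a minimal sketch of how the decode-based fix_token behaves with a byte-level BPE tokenizer. It assumes the transformers library is installed and uses "gpt2" purely as an illustrative checkpoint; the helper mirrors the function added in this commit.

from transformers import AutoTokenizer

# Illustrative checkpoint only; any byte-level BPE tokenizer shows the same effect.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

def fix_token(token: str, tokenizer) -> str:
    """Decode a single sub-word token into a human-readable string."""
    if not token.strip():
        return token
    token_id = tokenizer.convert_tokens_to_ids(token)
    return tokenizer.decode([token_id], clean_up_tokenization_spaces=False)

for tok in tokenizer.tokenize("Hello world\n"):
    print(repr(tok), "->", repr(fix_token(tok, tokenizer)))

# For a GPT-2-style tokenizer this prints, e.g., 'Ġworld' -> ' world' and
# 'Ċ' -> '\n', which is why the old Ġ/Ċ special-casing in the display code
# is no longer needed.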