Spaces:

mebubo
/

gpted

Sleeping

mebubo committed on Oct 4, 2024

Commit

6735ae4

1 Parent(s): 91f2f92

refactor: separate text processing logic into a new module for better organization

Files changed (2) hide show

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 #%%
-from dataclasses import dataclass
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from pprint import pprint
@@ -73,34 +73,6 @@ for word, avg_logprob in words:
 # %%
-@dataclass
-class Word:
-    tokens: list[int]
-    text: str
-    logprob: float
-    first_token_index: int
-def split_into_words(tokens, log_probs) -> list[Word]:
-    words = []
-    current_word = []
-    current_log_probs = []
-    current_word_first_token_index = 0
-    for i, (token, logprob) in enumerate(zip(tokens, log_probs)):
-        if not token.startswith(chr(9601)) and token.isalpha():
-            current_word.append(token)
-            current_log_probs.append(logprob)
-        else:
-            if current_word:
-                words.append(Word(current_word, "".join(current_word), sum(current_log_probs), current_word_first_token_index))
-            current_word = [token]
-            current_log_probs = [logprob]
-            current_word_first_token_index = i
-    if current_word:
-        words.append(Word(current_word, "".join(current_word), sum(current_log_probs), current_word_first_token_index))
-    return words
 words = split_into_words(tokens[1:], token_log_probs)

 #%%
+from text_processing import split_into_words, Word
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from pprint import pprint
 # %%
 words = split_into_words(tokens[1:], token_log_probs)

text_processing.py ADDED Viewed

+from dataclasses import dataclass
+@dataclass
+class Word:
+    tokens: list[int]
+    text: str
+    logprob: float
+    first_token_index: int
+def split_into_words(tokens, log_probs) -> list[Word]:
+    words = []
+    current_word = []
+    current_log_probs = []
+    current_word_first_token_index = 0
+    for i, (token, logprob) in enumerate(zip(tokens, log_probs)):
+        if not token.startswith(chr(9601)) and token.isalpha():
+            current_word.append(token)
+            current_log_probs.append(logprob)
+        else:
+            if current_word:
+                words.append(Word(current_word, "".join(current_word), sum(current_log_probs), current_word_first_token_index))
+            current_word = [token]
+            current_log_probs = [logprob]
+            current_word_first_token_index = i
+    if current_word:
+        words.append(Word(current_word, "".join(current_word), sum(current_log_probs), current_word_first_token_index))
+    return words