Ruurd committed
Commit 205d52f · 1 Parent(s): 80f8fa5

Implement reasoning blocks and fix eos token showing

Files changed (1)
1. app.py +24 -9
app.py CHANGED
@@ -45,8 +45,6 @@ def chat_with_model(messages):
         yield messages + [{"role": "assistant", "content": "⚠️ No model loaded."}]
         return
 
-
-
     pad_id = current_tokenizer.pad_token_id
     if pad_id is None:
         pad_id = current_tokenizer.unk_token_id or 0
@@ -58,11 +56,8 @@ def chat_with_model(messages):
     inputs = current_tokenizer(prompt, return_tensors="pt")
     inputs = {k: v.to(device) for k, v in inputs.items()}
 
-
-    # streamer = TextIteratorStreamer(current_tokenizer, skip_prompt=True, skip_special_tokens=False)
     streamer = RichTextStreamer(current_tokenizer, skip_prompt=True, skip_special_tokens=False)
 
-
     generation_kwargs = dict(
         **inputs,
         max_new_tokens=256,
@@ -78,22 +73,42 @@ def chat_with_model(messages):
     output_text = ""
     messages = messages.copy()
     messages.append({"role": "assistant", "content": ""})
+    in_think = False
 
     for token_info in streamer:
         token_str = token_info["token"]
+        token_id = token_info["token_id"]
         is_special = token_info["is_special"]
-        output_text += token_str
+
+        # Skip appending the EOS token to output
+        if token_id == current_tokenizer.eos_token_id:
+            break
+
+        # Detect reasoning block
+        if "<think>" in token_str:
+            in_think = True
+            token_str = token_str.replace("<think>", "")
+            output_text += "*"
+
+        if "</think>" in token_str:
+            in_think = False
+            token_str = token_str.replace("</think>", "")
+            output_text += token_str + "*"
+        else:
+            output_text += token_str
+
         messages[-1]["content"] = output_text
         yield messages
 
-        if is_special and token_info["token_id"] == current_tokenizer.eos_token_id:
-            break
+    if in_think:
+        output_text += "*"
+        messages[-1]["content"] = output_text
+        yield messages
 
     current_model.to("cpu")
     torch.cuda.empty_cache()
 
 
-
 # Globals
 current_model = None
 current_tokenizer = None
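
For context, here is a minimal, self-contained sketch of what the new streaming loop does, separate from the app itself: it stops before the EOS token instead of appending it, and it renders any <think>...</think> reasoning span as markdown italics, closing the span if generation ends while still inside it. The render_stream helper, the hard-coded EOS_ID, and the plain token dicts standing in for RichTextStreamer output are illustrative assumptions, not part of the commit.

# Minimal sketch (not the app's actual code) of the reasoning-block handling
# added in this commit, applied to a plain list of token dicts instead of a
# RichTextStreamer. render_stream, EOS_ID, and the sample tokens are assumptions.

EOS_ID = 2  # assumed eos_token_id for this sketch

def render_stream(token_infos):
    """Accumulate streamed tokens, wrapping <think>...</think> spans in markdown italics."""
    output_text = ""
    in_think = False
    for info in token_infos:
        token_str = info["token"]
        # Stop before the EOS token so it never shows up in the chat output
        if info["token_id"] == EOS_ID:
            break
        # Opening tag: drop it and start an italic span
        if "<think>" in token_str:
            in_think = True
            token_str = token_str.replace("<think>", "")
            output_text += "*"
        # Closing tag: drop it and close the italic span
        if "</think>" in token_str:
            in_think = False
            token_str = token_str.replace("</think>", "")
            output_text += token_str + "*"
        else:
            output_text += token_str
        yield output_text
    # If generation ended mid-reasoning, close the dangling italic span
    if in_think:
        yield output_text + "*"

tokens = [
    {"token": "<think>", "token_id": 10},
    {"token": "plan the answer", "token_id": 11},
    {"token": "</think>", "token_id": 12},
    {"token": "Hello!", "token_id": 13},
    {"token": "</s>", "token_id": EOS_ID},
]
print(list(render_stream(tokens))[-1])  # -> *plan the answer*Hello!

Running the example prints "*plan the answer*Hello!": the reasoning text comes out italicized and the EOS token never reaches the chat output, which matches the two behaviors named in the commit message.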