Add break after end-of-sequence token
app.py
CHANGED
@@ -62,7 +62,7 @@ with gr.Blocks() as demo:
             yield messages + [{"role": "assistant", "content": "⚠️ No model loaded."}]
             return

-        current_model.to("cuda")
+        current_model = current_model.half().to("cuda")

         prompt = format_prompt(messages)
         inputs = current_tokenizer(prompt, return_tensors="pt").to(current_model.device)
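The first hunk casts the model to half precision as it is moved onto the GPU for a request. A minimal sketch of this load-on-demand pattern, assuming a placeholder checkpoint and the offload step that the second hunk adds (the Space's real model list is not visible in this diff):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# "gpt2" is a placeholder; the Space's actual model_choices are not shown here.
current_tokenizer = AutoTokenizer.from_pretrained("gpt2")
current_model = AutoModelForCausalLM.from_pretrained("gpt2")  # weights start on CPU

# Cast floating-point parameters to fp16 (roughly halving GPU memory) and move
# the module to CUDA. nn.Module.half() converts in place and returns the module,
# so the reassignment matches the diff but is not strictly required.
current_model = current_model.half().to("cuda")

# ... generate a reply ...

current_model.to("cpu")   # offload once the request is served
torch.cuda.empty_cache()  # release PyTorch's cached CUDA memory back to the device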
@@ -79,10 +79,18 @@ with gr.Blocks() as demo:
             output_scores=False
         ).sequences[0][inputs['input_ids'].shape[-1]:]: # skip input tokens
             output_ids.append(token_id.item())
-            decoded = current_tokenizer.decode(output_ids, skip_special_tokens=
+            decoded = current_tokenizer.decode(output_ids, skip_special_tokens=False)
+            if output_ids[-1] == current_tokenizer.eos_token_id:
+                current_model.to("cpu")
+                torch.cuda.empty_cache()
+                return
             messages[-1]["content"] = decoded
             yield messages

+        current_model.to("cpu")
+        torch.cuda.empty_cache()
+        return
+
     with gr.Row():
         model_selector = gr.Dropdown(choices=model_choices, label="Select Model")
         model_status = gr.Textbox(label="Model Status", interactive=False)
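The second hunk is what the commit title describes: once the newest token id equals the tokenizer's eos_token_id, the generator stops before yielding the decoded text (which, with skip_special_tokens=False, would now include the literal EOS marker) and offloads the model. A sketch of the full generator around these hunks; format_prompt, the max_new_tokens value, and the appended assistant turn are assumptions reconstructed from context:

import torch

def bot_respond(messages):
    # Sketch of the generator these hunks patch. format_prompt, max_new_tokens,
    # and the empty assistant message are assumptions; the globals are the app's.
    global current_model
    current_model = current_model.half().to("cuda")

    prompt = format_prompt(messages)
    inputs = current_tokenizer(prompt, return_tensors="pt").to(current_model.device)

    messages = messages + [{"role": "assistant", "content": ""}]
    output_ids = []
    for token_id in current_model.generate(
        **inputs,
        max_new_tokens=512,            # assumed; the real limit is outside the hunk
        return_dict_in_generate=True,  # needed for the .sequences access below
        output_scores=False,
    ).sequences[0][inputs["input_ids"].shape[-1]:]:  # skip input tokens
        output_ids.append(token_id.item())
        decoded = current_tokenizer.decode(output_ids, skip_special_tokens=False)
        if output_ids[-1] == current_tokenizer.eos_token_id:
            current_model.to("cpu")    # EOS reached: free the GPU and stop
            torch.cuda.empty_cache()   # before the EOS marker is displayed
            return
        messages[-1]["content"] = decoded
        yield messages

    current_model.to("cpu")            # reached only if the token limit ended
    torch.cuda.empty_cache()           # generation without an EOS token
    return

Note that generate() runs to completion before the loop starts, so the loop replays tokens rather than streaming them live, and generate() typically stops at the first EOS on its own. The in-loop check therefore mainly keeps the EOS marker out of the displayed reply, while the duplicated cleanup after the loop covers generations that end by hitting the token limit instead.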