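"""Gradio chat demo that streams DeepSeek-R1-Distill-Qwen-1.5B on CPU and reports tokens per second."""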
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread
import time

model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Simple ChatML-style template; add the assistant header only when a generation prompt is requested.
tokenizer.chat_template = (
    "{% for message in messages %}<|im_start|>{{ message.role }}\n"
    "{{ message.content }}<|im_end|>\n{% endfor %}"
    "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
)

# Load the model on CPU in full precision; low_cpu_mem_usage avoids a duplicate copy in RAM while loading.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    device_map="cpu",
    low_cpu_mem_usage=True,
)

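# TextIteratorStreamer subclass that also tracks throughput (tokens per second) while streaming.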
class DeepStreamer(TextIteratorStreamer):
    def __init__(self, tokenizer):
        super().__init__(tokenizer, skip_prompt=True, skip_special_tokens=True)
        self.token_count = 0
        self.start_time = None

    def put(self, value):
        # generate() calls put() first with the prompt ids, then once per newly generated token.
        if self.start_time is None:
            # First call carries the prompt: start the clock but don't count it.
            self.start_time = time.time()
        else:
            self.token_count += 1
        return super().put(value)

    def get_tps(self):
        if self.start_time is None:
            return 0
        elapsed = time.time() - self.start_time
        return self.token_count / elapsed if elapsed > 0 else 0


def format_response(text, tps=None):
    return f"{text}\n\n**Tokens per second:** {tps:.2f}" if tps else text


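# chat_response rebuilds the conversation, then runs model.generate on a background thread
# so the streamer can hand partial text back to Gradio as tokens arrive.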
def chat_response(message, history, max_tokens=512):
    # Rebuild the conversation as role/content messages (history is a list of (user, assistant) pairs).
    messages = []
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    formatted_input = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = tokenizer(formatted_input, return_tensors="pt").to('cpu')

    streamer = DeepStreamer(tokenizer)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Run generation on a worker thread; the streamer feeds decoded tokens back to this generator.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial_response = ""
    try:
        for token in streamer:
            partial_response += token
            yield format_response(partial_response, streamer.get_tps())

        # Final update with the overall tokens-per-second figure.
        yield format_response(partial_response, streamer.get_tps())
    finally:
        thread.join()


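# Wire chat_response into a chat UI; because chat_response is a generator, Gradio streams the reply.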
demo = gr.ChatInterface(
    fn=chat_response,
    title="DeepSeek-R1-Distill-Qwen-1.5B on CPU",
    description="Running on CPU, so expect slow token generation.",
    examples=[
        "Discuss the future of renewable energy",
        "What's the history of the Roman Empire?",
        "What's the capital of China?",
        "Tell me a fun fact about space"
    ]
)


if __name__ == "__main__":
    demo.queue().launch()