# phi-4 / app.py
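"""Gradio ZeroGPU Space that streams chat completions from the
NyxKrage/Microsoft_Phi-4 checkpoint via transformers."""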
import spaces  # imported first so ZeroGPU can set up its CUDA handling before torch loads

import os
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Clear any stale ZeroGPU offload cache left by previous runs.
os.system("rm -rf /data-nvme/zerogpu-offload/*")
# Let the CUDA caching allocator grow segments instead of failing on
# fragmented memory.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# Load the model once at startup. torch_dtype="auto" keeps the dtype stored in
# the checkpoint; trust_remote_code is enabled in case the repo ships custom
# modeling code. The streamer is created per request inside respond() so that
# concurrent chats never share one.
model = AutoModelForCausalLM.from_pretrained(
    "NyxKrage/Microsoft_Phi-4",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("NyxKrage/Microsoft_Phi-4")
@spaces.GPU
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    seed,
):
    # Rebuild the conversation: system prompt, then alternating user/assistant
    # turns from the chat history, then the new user message.
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # Render the conversation with the model's chat template; the generation
    # prompt cues the model to continue as the assistant.
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")
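
    # For reference, the rendered prompt looks roughly like this (the exact
    # special tokens depend on this checkpoint's chat template; assumed here):
    #   <|im_start|>system<|im_sep|>You are a friendly Chatbot.<|im_end|>
    #   <|im_start|>user<|im_sep|>Hello!<|im_end|>
    #   <|im_start|>assistant<|im_sep|>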
    # torch.manual_seed also seeds the CUDA RNG, so GPU sampling is
    # reproducible for a given seed value.
    torch.manual_seed(seed)

    # A fresh streamer per request; skip_prompt keeps the echoed prompt out of
    # the streamed text, which is why no return_full_text flag is needed
    # (that is a pipeline argument that generate() would reject anyway).
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=max_tokens,
        temperature=temperature,
        streamer=streamer,
        top_p=top_p,
        do_sample=True,
    )

    # Run generation in a background thread so the streamer can be consumed
    # here without blocking; yield the accumulated text as it grows.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    response = ""
    for new_text in streamer:
        response += new_text
        yield response
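
# gr.ChatInterface (below) calls respond(message, history, *additional_inputs);
# because respond is a generator, each yield re-renders the in-progress
# assistant reply in the chat window.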
with gr.Blocks() as demo:
    with gr.Row():
        gr.LoginButton(min_width=100)
        gr.Markdown("""
        This is the space I built.
        As of 2025/1/7, this is the first phi-4 space.
        If this has helped you and you have money to spare, could you send me $1? I am facing a financial crisis.
        If you do, I will pass the kindness on.
        This is my bank card number: 5592921230414708
        Thank you!!
        """)
    with gr.Row():
        with gr.Column():
            # Generation settings; ChatInterface forwards these to respond()
            # in this order, after the message and history arguments.
            system_message = gr.Textbox(value="You are a friendly Chatbot.", label="System message")
            max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
            temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
            top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
            seed = gr.Slider(minimum=0, maximum=20091114, value=42, step=1, label="Seed")
        with gr.Column():
            gr.ChatInterface(
                respond,
                additional_inputs=[system_message, max_tokens, temperature, top_p, seed],
            )
if __name__ == "__main__":
    demo.launch()
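
# On a Gradio Space the runtime executes app.py directly, so demo.launch()
# above runs both there and for local `python app.py` runs.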