# Spaces: merterbak/phi-4
# Running on Zero
# File size: 7,490 Bytes

import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread

# Hugging Face Hub repository ids of the two checkpoints this app serves.
phi4_model_path = "microsoft/phi-4"
phi4_mini_model_path = "microsoft/Phi-4-mini-instruct"

# Run on the first CUDA device when PyTorch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Both checkpoints and their tokenizers are loaded once, at import time, and
# the models are moved to `device` immediately after loading.
phi4_model = AutoModelForCausalLM.from_pretrained(phi4_model_path, torch_dtype="auto")
phi4_model = phi4_model.to(device)
phi4_tokenizer = AutoTokenizer.from_pretrained(phi4_model_path)

phi4_mini_model = AutoModelForCausalLM.from_pretrained(phi4_mini_model_path, torch_dtype="auto")
phi4_mini_model = phi4_mini_model.to(device)
phi4_mini_tokenizer = AutoTokenizer.from_pretrained(phi4_mini_model_path)

# System prompt placed at the start of every conversation, for both models.
_SYSTEM_MESSAGE = "You are a friendly and knowledgeable assistant, here to help with any questions or tasks."

# Chat-template markers that must never be shown in the chat window.
_CHAT_MARKERS = (
    "<|im_start|>",
    "<|im_sep|>",
    "<|im_end|>",
    "<|end|>",
    "<|system|>",
    "<|user|>",
    "<|assistant|>",
)


def _build_prompt(model_name, history, user_message):
    """Render the system prompt, prior turns and the new user turn as one prompt string.

    "Phi-4" uses the ``<|im_start|>role<|im_sep|>content<|im_end|>`` layout;
    any other model name uses the ``<|role|>content<|end|>`` layout of
    Phi-4-mini-instruct. The result ends with the opening of an assistant turn
    so that generation continues as the assistant.
    """
    if model_name == "Phi-4":
        def render(role, content):
            return f"<|im_start|>{role}<|im_sep|>{content}<|im_end|>"

        assistant_opening = "<|im_start|>assistant<|im_sep|>"
    else:
        def render(role, content):
            return f"<|{role}|>{content}<|end|>"

        assistant_opening = "<|assistant|>"

    parts = [render("system", _SYSTEM_MESSAGE)]
    for message in history:
        role = message["role"]
        content = message["content"]
        # Empty assistant turns and unknown roles are left out of the prompt.
        if role == "user" or (role == "assistant" and content):
            parts.append(render(role, content))
    parts.append(render("user", user_message))
    parts.append(assistant_opening)
    return "".join(parts)


def _strip_chat_markers(text):
    """Return *text* with every chat-template marker removed."""
    for marker in _CHAT_MARKERS:
        text = text.replace(marker, "")
    return text


@spaces.GPU(duration=60)
def generate_response(user_message, model_name, max_tokens, temperature, top_k, top_p, repetition_penalty, history_state):
    """Stream the selected model's reply to ``user_message``.

    Args:
        user_message: Text typed by the user; blank input leaves the chat unchanged.
        model_name: Either "Phi-4" or "Phi-4-mini-instruct".
        max_tokens: Upper bound on newly generated tokens (cast to int).
        temperature: Sampling temperature.
        top_k: Top-k cutoff (cast to int).
        top_p: Nucleus-sampling cutoff.
        repetition_penalty: Repetition penalty passed to ``generate``.
        history_state: Prior conversation as a list of
            ``{"role": ..., "content": ...}`` dicts; it is not mutated.

    Yields:
        ``(messages, messages)`` pairs, one for the chatbot and one for the
        stored history. Both elements are the same list, whose last entry holds
        the assistant text received so far.

    Raises:
        ValueError: If ``model_name`` is not one of the two supported names.
    """
    history_state = history_state or []

    if not user_message or not user_message.strip():
        # This function is a generator, so a plain `return value` would be
        # discarded; yield the unchanged history explicitly instead.
        yield history_state, history_state
        return

    # Select the model/tokenizer pair.
    if model_name == "Phi-4":
        model, tokenizer = phi4_model, phi4_tokenizer
    elif model_name == "Phi-4-mini-instruct":
        model, tokenizer = phi4_mini_model, phi4_mini_tokenizer
    else:
        raise ValueError(f"Unknown model name: {model_name!r}")

    prompt = _build_prompt(model_name, history_state, user_message)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Sampling is turned off only when all three controls sit at their neutral
    # settings (temperature 1.0, top-k of 100 or more, top-p 1.0).
    do_sample = not (temperature == 1.0 and top_k >= 100 and top_p == 1.0)

    # skip_special_tokens is forwarded to tokenizer.decode, so special tokens
    # are dropped at decode time rather than only by string replacement below.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "max_new_tokens": int(max_tokens),
        "do_sample": do_sample,
        "repetition_penalty": repetition_penalty,
        "streamer": streamer,
    }
    if do_sample:
        # These only matter when sampling; passing them to a greedy run just
        # produces "unused generation flag" warnings.
        generation_kwargs.update(
            temperature=temperature,
            top_k=int(top_k),
            top_p=top_p,
        )

    # generate() blocks until completion, so it runs in a worker thread while
    # this generator consumes the streamer.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream the response
    assistant_response = ""
    new_history = history_state + [
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": ""},
    ]
    for chunk in streamer:
        # Fallback cleanup for markers that still arrive as plain text.
        assistant_response += _strip_chat_markers(chunk)
        new_history[-1]["content"] = assistant_response.strip()
        yield new_history, new_history

    # The streamer is exhausted, so generation is finishing; reap the worker.
    thread.join()

    yield new_history, new_history

# Prompt placed into the message box by each example button, keyed by the
# button's caption.
example_messages = {
    "Learn about physics":
        "Explain Newton’s laws of motion.",
    "Discover space facts":
        "What are some interesting facts about black holes?",
    "Write a factorial function":
        "Write a Python function to calculate the factorial of a number.",
}

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # ---- Page header ------------------------------------------------------
    gr.Markdown(
        """
        # Phi-4 Models Chatbot 
        Welcome to the Phi-4 Chatbot! You can chat with Microsoft's Phi-4 or Phi-4-mini-instruct models. Adjust the settings on the left to customize the model's responses.
        """
    )

    # Conversation carried between calls: a list of role/content message dicts.
    history_state = gr.State([])

    with gr.Row():
        # ---- Left column: model choice and generation settings -------------
        with gr.Column(scale=1):
            gr.Markdown("### Settings")
            model_dropdown = gr.Dropdown(
                label="Select Model",
                choices=["Phi-4", "Phi-4-mini-instruct"],
                value="Phi-4",
            )
            max_tokens_slider = gr.Slider(label="Max Tokens", minimum=64, maximum=4096, step=50, value=512)
            with gr.Accordion("Advanced Settings", open=False):
                temperature_slider = gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, value=1.0)
                top_k_slider = gr.Slider(label="Top-k", minimum=1, maximum=100, step=1, value=50)
                top_p_slider = gr.Slider(label="Top-p", minimum=0.1, maximum=1.0, value=0.9)
                repetition_penalty_slider = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, value=1.0)

        # ---- Right column: chat window, message box and example buttons ----
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(label="Chat", type="messages")
            with gr.Row():
                user_input = gr.Textbox(label="Your message", placeholder="Type your message here...", scale=3)
                submit_button = gr.Button("Send", variant="primary", scale=1)
                clear_button = gr.Button("Clear", scale=1)
            gr.Markdown("**Try these examples:**")
            with gr.Row():
                example1_button = gr.Button("Learn about physics")
                example2_button = gr.Button("Discover space facts")
                example3_button = gr.Button("Write a factorial function")

    # ---- Event wiring -------------------------------------------------------
    # Order matches the parameter list of generate_response.
    generation_inputs = [
        user_input,
        model_dropdown,
        max_tokens_slider,
        temperature_slider,
        top_k_slider,
        top_p_slider,
        repetition_penalty_slider,
        history_state,
    ]

    # "Send" streams the reply into the chat and the stored history, and only
    # afterwards empties the message box.
    send_event = submit_button.click(
        fn=generate_response,
        inputs=generation_inputs,
        outputs=[chatbot, history_state],
    )
    send_event.then(fn=lambda: gr.update(value=""), inputs=None, outputs=user_input)

    # "Clear" resets both the visible chat and the stored history.
    clear_button.click(fn=lambda: ([], []), inputs=None, outputs=[chatbot, history_state])

    def _example_filler(caption):
        """Build a zero-argument handler that puts the example prompt for *caption* into the message box."""
        return lambda: gr.update(value=example_messages[caption])

    # Each example button only fills the message box; the user still presses "Send".
    example1_button.click(fn=_example_filler("Learn about physics"), inputs=None, outputs=user_input)
    example2_button.click(fn=_example_filler("Discover space facts"), inputs=None, outputs=user_input)
    example3_button.click(fn=_example_filler("Write a factorial function"), inputs=None, outputs=user_input)

# Start serving the app defined above. ssr_mode=False is passed straight to
# launch() — presumably it turns off Gradio's server-side rendering; confirm
# against the Gradio docs for the installed version.
demo.launch(ssr_mode=False)