import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoProcessor, Llama4ForConditionalGeneration

import torch

# from transformers import BitsAndBytesConfig
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     llm_int8_enable_fp32_cpu_offload=True,
# )
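# To fit the model in less VRAM, one option is to re-enable the 4-bit config
# above and pass quantization_config=bnb_config to from_pretrained below.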

#Qwen/Qwen2.5-14B-Instruct-1M
#Qwen/Qwen2-0.5B
# model_name = "bartowski/simplescaling_s1-32B-GGUF"
# subfolder = "Qwen-0.5B-GRPO/checkpoint-1868"
# filename = "simplescaling_s1-32B-Q4_K_S.gguf"
# model_name = "simplescaling/s1.1-32B"
# model_name = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF"
model_name = "unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-bnb-4bit"
model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
filename = "Llama-4-Scout-17B-16E-Instruct-UD-IQ2_XXS.gguf"
torch_dtype = torch.bfloat16 # could be torch.float16 or torch.bfloat16 torch.float32 too
cache_dir = "/data"
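# /data is the persistent-storage mount on Hugging Face Spaces, so downloaded
# weights are reused across restarts (assumes persistent storage is enabled).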

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     # subfolder=subfolder,
#     gguf_file=filename,
#     torch_dtype=torch_dtype,
#     device_map="auto",
#     cache_dir = cache_dir,
# )
model = Llama4ForConditionalGeneration.from_pretrained(
    model_name,
    # default is eager attention
    # attn_implementation="flex_attention",
    # gguf_file=filename,
    cache_dir=cache_dir,
    torch_dtype=torch_dtype,
    # quantization_config=bnb_config,
    device_map="auto",
)
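# device_map="auto" lets accelerate shard the checkpoint across the available
# GPU(s), offloading to CPU if it does not fit; bfloat16 halves memory use
# relative to float32.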
# processor = AutoProcessor.from_pretrained(model_name, cache_dir = cache_dir)
processor = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    # gguf_file=filename,
    # subfolder=subfolder,
)
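# AutoTokenizer is enough for text-only chat; the commented AutoProcessor line
# above would be needed if image inputs were passed to this multimodal model.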
SYSTEM_PROMPT = "You are a friendly Chatbot."
# """
# Respond in the following format:
# <reasoning>
# ...
# </reasoning>
# <answer>
# ...
# </answer>
# """

@spaces.GPU
def generate(prompt, history):
    messages = [
        # {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt}
    ]
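    # Note: only the latest prompt is sent to the model; prior turns in `history`
    # are ignored. A minimal sketch for multi-turn context (assuming gradio's
    # default tuple-style history) would be:
    # messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    # for user_msg, bot_msg in history:
    #     messages.append({"role": "user", "content": user_msg})
    #     if bot_msg:
    #         messages.append({"role": "assistant", "content": bot_msg})
    # messages.append({"role": "user", "content": prompt})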
    # text = tokenizer.apply_chat_template(
    #     messages,
    #     # tokenize=False,
    #     tokenize=True,
    #     add_generation_prompt=True
    # )
    # model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    
    # generated_ids = model.generate(
    #     **model_inputs,
    #     max_new_tokens=512
    # )
    # generated_ids = [
    #     output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    # ]
    
    # response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # return response
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        # tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
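    # With return_dict=True the chat template returns a BatchEncoding containing
    # input_ids and attention_mask, already wrapped in the model's chat format.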
    outputs = model.generate(
        **inputs.to(model.device),
        max_new_tokens=100,
    )
    response = processor.batch_decode(
        outputs[:, inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True,
    )[0]
    return response


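# gr.ChatInterface calls generate(message, history) on every user turn and
# renders the returned string as the assistant reply.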
chat_interface = gr.ChatInterface(
    fn=generate,
)
chat_interface.launch(share=True)
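# share=True requests a public gradio link; when running as a Hugging Face
# Space the app is already served publicly, so the flag is typically redundant there.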