import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoProcessor, Llama4ForConditionalGeneration

import torch

# from transformers import BitsAndBytesConfig
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     llm_int8_enable_fp32_cpu_offload=True,
# )

#Qwen/Qwen2.5-14B-Instruct-1M
#Qwen/Qwen2-0.5B
# model_name = "bartowski/simplescaling_s1-32B-GGUF"
# subfolder = "Qwen-0.5B-GRPO/checkpoint-1868"
# filename = "simplescaling_s1-32B-Q4_K_S.gguf"
# model_name = "simplescaling/s1.1-32B"
# model_name = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF"
# model_name = "unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-bnb-4bit"
model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
filename = "Llama-4-Scout-17B-16E-Instruct-UD-IQ2_XXS.gguf"
torch_dtype = torch.bfloat16  # could also be torch.float16 or torch.float32
cache_dir = "/data"

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     # subfolder=subfolder,
#     gguf_file=filename,
#     torch_dtype=torch_dtype,
#     device_map="auto",
#     cache_dir = cache_dir,
# )
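# Load Llama 4 Scout with FlexAttention; device_map="auto" lets accelerate
# place the weights across the available GPUs (with CPU offload if needed).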
model = Llama4ForConditionalGeneration.from_pretrained(
    model_name,
    attn_implementation="flex_attention",
    # gguf_file=filename,
    cache_dir=cache_dir,
    torch_dtype=torch_dtype,
    # quantization_config=bnb_config,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    # gguf_file=filename,
    # subfolder=subfolder,
)
SYSTEM_PROMPT = "You are a friendly Chatbot."
# """
# Respond in the following format:
# <reasoning>
# ...
# </reasoning>
# <answer>
# ...
# </answer>
# """

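# ZeroGPU decorator: a GPU is attached only for the duration of each call.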
@spaces.GPU
def generate(prompt, history):
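    # history is supplied by gr.ChatInterface but ignored here; only the
    # current prompt (plus the system prompt) is sent to the model.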
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt}
    ]
    # text = tokenizer.apply_chat_template(
    #     messages,
    #     # tokenize=False,
    #     tokenize=True,
    #     add_generation_prompt=True
    # )
    # model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    
    # generated_ids = model.generate(
    #     **model_inputs,
    #     max_new_tokens=512
    # )
    # generated_ids = [
    #     output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    # ]
    
    # response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # return response
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
    )
    # Decode only the newly generated tokens, dropping the prompt portion.
    response = processor.batch_decode(
        outputs[:, inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True,
    )[0]
    return response


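# ChatInterface passes (message, history) to generate() and displays the returned string.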
chat_interface = gr.ChatInterface(
    fn=generate,
)
chat_interface.launch(share=True)