import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoProcessor, Llama4ForConditionalGeneration
import torch
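# Gradio chat Space: loads a quantized Llama 4 Scout GGUF checkpoint with
# transformers and requests a GPU per call via the @spaces.GPU decorator.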
# Previously tried checkpoints, kept for reference:
# Qwen/Qwen2.5-14B-Instruct-1M
# Qwen/Qwen2-0.5B
# model_name = "bartowski/simplescaling_s1-32B-GGUF"
# subfolder = "Qwen-0.5B-GRPO/checkpoint-1868"
# filename = "simplescaling_s1-32B-Q4_K_S.gguf"
# model_name = "simplescaling/s1.1-32B"

# Active checkpoint: quantized Llama 4 Scout GGUF served through transformers
model_name = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF"
filename = "Llama-4-Scout-17B-16E-Instruct-UD-IQ2_XXS.gguf"
torch_dtype = torch.bfloat16  # torch.float16 or torch.float32 also work
cache_dir = "/data"  # download/cache directory for model weights
# Earlier generic loader, left commented out:
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     # subfolder=subfolder,
#     gguf_file=filename,
#     torch_dtype=torch_dtype,
#     device_map="auto",
#     cache_dir=cache_dir,
# )
model = Llama4ForConditionalGeneration.from_pretrained(
    model_name,
    attn_implementation="flex_attention",
    gguf_file=filename,
    torch_dtype=torch_dtype,
    device_map="auto",
    cache_dir=cache_dir,
)
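# Note: transformers dequantizes GGUF weights to torch_dtype at load time, so
# memory use tracks the dequantized bf16 model, not the IQ2_XXS file size.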
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    gguf_file=filename,
    # subfolder=subfolder,
)
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""
@spaces.GPU
def generate(prompt, history):
    # history is supplied by gr.ChatInterface but not used here; each turn is
    # answered from the system prompt and the latest user message only.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    # Render the chat template to a plain string (tokenize=False) so the
    # tokenizer call below can build input_ids and attention_mask in one pass.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512,
    )
    # Drop the prompt tokens so only the newly generated completion is decoded.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response
chat_interface = gr.ChatInterface(
    fn=generate,
)
chat_interface.launch(share=True)