import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoProcessor, Llama4ForConditionalGeneration
import torch
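# Optional 4-bit quantization via bitsandbytes (kept for reference, not enabled):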
# from transformers import BitsAndBytesConfig
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     llm_int8_enable_fp32_cpu_offload=True,
# )
#Qwen/Qwen2.5-14B-Instruct-1M
#Qwen/Qwen2-0.5B
# model_name = "bartowski/simplescaling_s1-32B-GGUF"
# subfolder = "Qwen-0.5B-GRPO/checkpoint-1868"
# filename = "simplescaling_s1-32B-Q4_K_S.gguf"
# model_name = "simplescaling/s1.1-32B"
# model_name = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF"
model_name = "unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-bnb-4bit"
model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
filename = "Llama-4-Scout-17B-16E-Instruct-UD-IQ2_XXS.gguf"
torch_dtype = torch.bfloat16  # torch.float16 or torch.float32 would also work
cache_dir = "/data"
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     # subfolder=subfolder,
#     gguf_file=filename,
#     torch_dtype=torch_dtype,
#     device_map="auto",
#     cache_dir=cache_dir,
# )
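# Load Llama 4 Scout in bfloat16 and shard it across available devices with device_map="auto".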
model = Llama4ForConditionalGeneration.from_pretrained(
    model_name,
    # default is eager attention
    # attn_implementation="flex_attention",
    # gguf_file=filename,
    cache_dir=cache_dir,
    torch_dtype=torch_dtype,
    # quantization_config=bnb_config,
    device_map="auto",
)
# processor = AutoProcessor.from_pretrained(model_name, cache_dir = cache_dir)
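# A plain tokenizer is enough for text-only chat; it still provides apply_chat_template.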
processor = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
# , gguf_file=filename
# , subfolder=subfolder
SYSTEM_PROMPT = "You are a friendly Chatbot."
# """
# Respond in the following format:
# <reasoning>
# ...
# </reasoning>
# <answer>
# ...
# </answer>
# """
@spaces.GPU
def generate(prompt, history):
    messages = [
        # {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt}
    ]
    # text = tokenizer.apply_chat_template(
    #     messages,
    #     # tokenize=False,
    #     tokenize=True,
    #     add_generation_prompt=True
    # )
    # model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    # generated_ids = model.generate(
    #     **model_inputs,
    #     max_new_tokens=512
    # )
    # generated_ids = [
    #     output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    # ]
    # response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # return response
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        # tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    outputs = model.generate(
        **inputs.to(model.device),
        max_new_tokens=100,
    )
    # Decode only the newly generated tokens (everything after the prompt).
    response = processor.batch_decode(
        outputs[:, inputs["input_ids"].shape[-1]:], skip_special_tokens=True
    )[0]
    return response
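# Wire the handler into a minimal Gradio chat UI.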
chat_interface = gr.ChatInterface(
    fn=generate,
)
chat_interface.launch(share=True)