Update app.py
app.py CHANGED

@@ -1,6 +1,8 @@
 import gradio as gr
 import spaces
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoProcessor, Llama4ForConditionalGeneration
+
 import torch
 
 #Qwen/Qwen2.5-14B-Instruct-1M
@@ -14,14 +16,23 @@ filename = "Llama-4-Scout-17B-16E-Instruct-UD-IQ2_XXS.gguf"
 torch_dtype = torch.bfloat16 # could be torch.float16 or torch.bfloat16 torch.float32 too
 cache_dir = "/data"
 
-model = AutoModelForCausalLM.from_pretrained(
+# model = AutoModelForCausalLM.from_pretrained(
+#     model_name,
+#     # subfolder=subfolder,
+#     gguf_file=filename,
+#     torch_dtype=torch_dtype,
+#     device_map="auto",
+#     cache_dir = cache_dir,
+# )
+model = Llama4ForConditionalGeneration.from_pretrained(
     model_name,
-    # subfolder=subfolder,
+    attn_implementation="flex_attention",
     gguf_file=filename,
     torch_dtype=torch_dtype,
     device_map="auto",
     cache_dir = cache_dir,
 )
+
 tokenizer = AutoTokenizer.from_pretrained(model_name
     , gguf_file=filename
     # , subfolder=subfolder
@@ -44,7 +55,8 @@ def generate(prompt, history):
     ]
     text = tokenizer.apply_chat_template(
         messages,
-        tokenize=False,
+        # tokenize=False,
+        tokenize=True,
         add_generation_prompt=True
     )
     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
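For context, the change swaps the generic AutoModelForCausalLM GGUF path for the dedicated Llama 4 class with FlexAttention. Below is a minimal sketch of that loading path, assuming `pip install gguf` and a PyTorch recent enough to provide torch.nn.attention.flex_attention; the repo id shown is hypothetical, since `model_name` is defined earlier in app.py, outside this diff:

import torch
from transformers import Llama4ForConditionalGeneration

model_name = "some-org/Llama-4-Scout-17B-16E-Instruct-GGUF"  # hypothetical; set earlier in app.py
filename = "Llama-4-Scout-17B-16E-Instruct-UD-IQ2_XXS.gguf"

# transformers dequantizes GGUF weights on load, so the 2-bit file still
# materializes at the requested torch_dtype in memory
model = Llama4ForConditionalGeneration.from_pretrained(
    model_name,
    attn_implementation="flex_attention",  # requires a recent PyTorch
    gguf_file=filename,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)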
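A note on the `tokenize=True` switch in the last hunk: with it, `apply_chat_template` returns token ids rather than a prompt string, while the unchanged `model_inputs = tokenizer([text], ...)` line still re-tokenizes the result. A sketch of the usual single-step pattern, assuming the stock `apply_chat_template` signature:

# With tokenize=True the template call can return tensors directly,
# so the separate tokenizer([text], ...) re-tokenization can be dropped.
model_inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,  # yields {"input_ids": ..., "attention_mask": ...}
).to(model.device)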
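The chat-template hunk sits inside `def generate(prompt, history)`, per the hunk header. The generate/decode tail is outside this diff; a hedged sketch of how such a callback typically finishes, with `max_new_tokens` as an assumed value:

# Assumed continuation (not shown in the diff): generate, then decode only
# the newly produced tokens so the prompt is not echoed back to the user.
output_ids = model.generate(**model_inputs, max_new_tokens=512)
new_tokens = output_ids[:, model_inputs["input_ids"].shape[1]:]
return tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]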
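Finally, `generate(prompt, history)` matches Gradio's chat-callback signature, and `import spaces` suggests a ZeroGPU Space; the wiring below is an assumption, not part of this commit:

# Hypothetical wiring (outside this diff), consistent with the imports;
# on ZeroGPU hardware the callback is usually decorated with @spaces.GPU.
demo = gr.ChatInterface(generate)
demo.launch()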