Update app.py
Browse files
app.py
CHANGED
@@ -34,7 +34,8 @@ cache_dir = "/data"
|
|
34 |
# )
|
35 |
model = Llama4ForConditionalGeneration.from_pretrained(
|
36 |
model_name,
|
37 |
-
|
|
|
38 |
# gguf_file=filename,
|
39 |
cache_dir = cache_dir,
|
40 |
torch_dtype=torch_dtype,
|
@@ -83,11 +84,11 @@ def generate(prompt, history):
|
|
83 |
inputs = processor.apply_chat_template(
|
84 |
messages,
|
85 |
add_generation_prompt=True,
|
86 |
-
tokenize=True,
|
87 |
return_dict=True,
|
88 |
return_tensors="pt",
|
89 |
).to(gpu_model.device)
|
90 |
-
outputs =
|
91 |
**inputs,
|
92 |
max_new_tokens=512,
|
93 |
)
|
|
|
34 |
# )
|
35 |
model = Llama4ForConditionalGeneration.from_pretrained(
|
36 |
model_name,
|
37 |
+
# flex_attention is only needed for image
|
38 |
+
# attn_implementation="flex_attention",
|
39 |
# gguf_file=filename,
|
40 |
cache_dir = cache_dir,
|
41 |
torch_dtype=torch_dtype,
|
|
|
84 |
inputs = processor.apply_chat_template(
|
85 |
messages,
|
86 |
add_generation_prompt=True,
|
87 |
+
# tokenize=True,
|
88 |
return_dict=True,
|
89 |
return_tensors="pt",
|
90 |
).to(gpu_model.device)
|
91 |
+
outputs = model.generate(
|
92 |
**inputs,
|
93 |
max_new_tokens=512,
|
94 |
)
|