s1.1-32B

Runtime error

bobber committed on 23 days ago

Commit

79b9a75

verified ·

1 Parent(s): 9268605

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -34,7 +34,8 @@ cache_dir = "/data"
 # )
 model = Llama4ForConditionalGeneration.from_pretrained(
     model_name,
-    attn_implementation="flex_attention",
     # gguf_file=filename,
     cache_dir = cache_dir,
     torch_dtype=torch_dtype,
@@ -83,11 +84,11 @@ def generate(prompt, history):
     inputs = processor.apply_chat_template(
         messages,
         add_generation_prompt=True,
-        tokenize=True,
         return_dict=True,
         return_tensors="pt",
     ).to(gpu_model.device)
-    outputs = gpu_model.generate(
         **inputs,
         max_new_tokens=512,
     )

 # )
 model = Llama4ForConditionalGeneration.from_pretrained(
     model_name,
+    # flex_attention is only needed for image
+    # attn_implementation="flex_attention",
     # gguf_file=filename,
     cache_dir = cache_dir,
     torch_dtype=torch_dtype,
     inputs = processor.apply_chat_template(
         messages,
         add_generation_prompt=True,
+        # tokenize=True,
         return_dict=True,
         return_tensors="pt",
     ).to(gpu_model.device)
+    outputs = model.generate(
         **inputs,
         max_new_tokens=512,
     )