Update app.py
app.py
CHANGED
@@ -41,8 +41,7 @@ model = Llama4ForConditionalGeneration.from_pretrained(
     # quantization_config=bnb_config,
     device_map="auto",
 )
-
-tokenizer = AutoProcessor.from_pretrained(model_name
+processor = AutoProcessor.from_pretrained(model_name, cache_dir = cache_dir)
     # , gguf_file=filename
     # , subfolder=subfolder
 )
@@ -81,7 +80,7 @@ def generate(prompt, history):
 
     # response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
     # return response
-    inputs =
+    inputs = processor.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=True,
@@ -92,7 +91,7 @@ def generate(prompt, history):
         **inputs,
         max_new_tokens=512,
     )
-    response =
+    response = processor.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])[0]
 
 
 chat_interface = gr.ChatInterface(
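For context, a minimal sketch of how the updated generate() flow reads after this change. The processor rename, apply_chat_template call, max_new_tokens=512, and the batch_decode slicing are taken from the diff; the messages construction, the return_dict/return_tensors keyword arguments, and the .to(model.device) move are assumptions not shown in the visible hunks.

    # Sketch of the resulting chat generation flow (assumed shape, not the exact file contents).
    def generate(prompt, history):
        # Build a single-turn message list; actual history handling is not shown in the diff.
        messages = [{"role": "user", "content": prompt}]

        # Tokenize the chat prompt with the renamed `processor` (assumed kwargs beyond tokenize=True).
        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)

        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
        )

        # Decode only the newly generated tokens, skipping the prompt portion.
        response = processor.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])[0]
        return response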