Change LoRA size from 256 to 512, also back to bidirectional_masked
llama_diffusion_model.py  +3 -3
@@ -192,7 +192,7 @@ class CustomTransformerModel(PreTrainedModel):
         self.llama.resize_token_embeddings(config.vocab_size)

         for i, layer in enumerate(self.llama.model.layers):
-            layer.self_attn = BidirectionalLlamaAttention(layer.self_attn, masking='
+            layer.self_attn = BidirectionalLlamaAttention(layer.self_attn, masking='bidirectional_masked')

         # Freeze Llama to retain pre-trained knowledge
         for param in self.llama.parameters():
@@ -202,8 +202,8 @@ class CustomTransformerModel(PreTrainedModel):
             param.requires_grad = True

         lora_config = LoraConfig(
-            r=256,
-            lora_alpha=256,
+            r=512,
+            lora_alpha=512,
             lora_dropout=0.0,
             target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # Llama-3 uses these attention modules
             bias="none",
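For context, a minimal sketch of how a LoraConfig like the one in this commit is typically attached with peft's get_peft_model. The checkpoint name below is a placeholder, and the Space actually wraps its own CustomTransformerModel / BidirectionalLlamaAttention classes rather than a bare AutoModelForCausalLM; only the LoraConfig values mirror the diff.

```python
# Hedged sketch, not the Space's code: freeze a Llama backbone and attach the
# updated LoRA adapters with peft.
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

# Placeholder checkpoint; the Space resizes embeddings and swaps in its own
# bidirectional attention before this step.
base = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B")
for param in base.parameters():
    param.requires_grad = False  # freeze the backbone, as in the diff

lora_config = LoraConfig(
    r=512,            # LoRA rank, raised from 256 in this commit
    lora_alpha=512,   # alpha kept equal to r, so the LoRA scaling alpha / r stays 1.0
    lora_dropout=0.0,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # Llama-3 attention projections
    bias="none",
)

peft_model = get_peft_model(base, lora_config)
peft_model.print_trainable_parameters()  # only the LoRA adapter weights remain trainable
```

Keeping lora_alpha equal to r leaves the adapter scaling factor at 1.0, so doubling the rank to 512 increases adapter capacity without changing the relative weight of the LoRA update.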