New masking implementation
llama_diffusion_model.py (+6 -5)
@@ -97,8 +97,8 @@ class CustomTransformerModel(PreTrainedModel):
         self.llama = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B", torch_dtype=torch.float16, device_map="auto", token=hf_token)
         self.llama.resize_token_embeddings(config.vocab_size)
 
-        for i, layer in enumerate(self.llama.model.layers):
-            layer.self_attn = BidirectionalLlamaAttention(layer.self_attn, masking=config.masking_type)
+        # for i, layer in enumerate(self.llama.model.layers):
+        #     layer.self_attn = BidirectionalLlamaAttention(layer.self_attn, masking=config.masking_type)
 
         for param in self.llama.parameters():
             param.requires_grad = False
@@ -113,7 +113,7 @@ class CustomTransformerModel(PreTrainedModel):
 
         self.llama = get_peft_model(self.llama, lora_config)
         self.llama.print_trainable_parameters()
-        self.llama = self.llama.to(torch.float16)
+        # self.llama = self.llama.to(torch.float16)
 
     def forward(self, input_ids, labels=None, **kwargs):
         batch_size, seq_len = input_ids.shape
@@ -121,8 +121,8 @@ class CustomTransformerModel(PreTrainedModel):
 
         # Build attention mask
         device = input_ids.device
-
-        masking_type = getattr(self.config, "masking_type", "
+
+        masking_type = getattr(self.config, "masking_type", "bidirectional_masked")
         if masking_type == 'bidirectional':
             base_mask = torch.ones(seq_len, seq_len, dtype=torch.bool, device=device)
         elif masking_type == 'bidirectional_masked':
@@ -142,6 +142,7 @@ class CustomTransformerModel(PreTrainedModel):
             input_ids,
             attention_mask=attention_mask,
             output_hidden_states=True,
+            use_cache=False,
             **kwargs
         )
 
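For context on what the new forward pass is doing: the mask is now selected at call time from config.masking_type instead of by swapping each layer's self-attention module for BidirectionalLlamaAttention. The repository's full mask-building code is not visible in these hunks, so the following is only a minimal sketch of how the boolean (seq_len, seq_len) base mask from the 'bidirectional' branch above could be expanded into the additive 4D mask a Hugging Face Llama forward pass accepts; the helper name build_attention_mask, the expansion step, and the fallback for 'bidirectional_masked' are assumptions, not code from this commit.

import torch

def build_attention_mask(input_ids, masking_type="bidirectional_masked", dtype=torch.float16):
    # Hypothetical helper (not from this repo): turns a boolean (seq_len, seq_len)
    # base mask into the additive (batch, 1, seq_len, seq_len) mask that the
    # Hugging Face Llama forward pass accepts.
    batch_size, seq_len = input_ids.shape
    device = input_ids.device

    if masking_type == "bidirectional":
        # Every position may attend to every other position, as in the diff above.
        base_mask = torch.ones(seq_len, seq_len, dtype=torch.bool, device=device)
    else:
        # The 'bidirectional_masked' branch is not shown in these hunks;
        # a fully bidirectional mask stands in for it here as a placeholder.
        base_mask = torch.ones(seq_len, seq_len, dtype=torch.bool, device=device)

    # 0.0 where attention is allowed, a large negative value where it is blocked.
    additive = torch.zeros(seq_len, seq_len, dtype=dtype, device=device)
    additive = additive.masked_fill(~base_mask, torch.finfo(dtype).min)
    return additive.expand(batch_size, 1, seq_len, seq_len)

# Usage matching the forward call in the diff:
# attention_mask = build_attention_mask(input_ids, getattr(config, "masking_type", "bidirectional_masked"))
# outputs = model.llama(input_ids, attention_mask=attention_mask, output_hidden_states=True, use_cache=False)

With a fully or partially bidirectional mask there is nothing for a KV cache to reuse across decoding steps, which is consistent with the forward call now passing use_cache=False alongside output_hidden_states=True.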