# plamo-embedding-1b / modeling_plamo.py
# kaitos255's picture
# initial commit
# e29924d
# raw
# history blame
# 38.9 kB
import enum
from typing import Any, List, NamedTuple, Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import functional as F
from transformers import AutoTokenizer, PretrainedConfig, PreTrainedModel
from transformers.modeling_attn_mask_utils import (
_prepare_4d_causal_attention_mask,
_prepare_4d_causal_attention_mask_for_sdpa,
)
from transformers.modeling_outputs import BaseModelOutputWithPast
from transformers.tokenization_utils_base import BatchEncoding
def _swiglu(h: torch.Tensor) -> torch.Tensor:
    """Apply SwiGLU gating to a fused projection.

    The last dimension of ``h`` is split into two equal halves; the first half
    goes through SiLU and is multiplied element-wise with the second half.
    """
    gate, up = torch.chunk(h, 2, dim=-1)
    return F.silu(gate) * up
class PlamoAttentionCache:
    """Growable key/value cache for one attention layer.

    Both tensors are 4-D. ``append_cache`` concatenates new entries along
    dimension 2, and ``sequence_length`` reports the size of that dimension.
    """

    def __init__(self, key: torch.Tensor, value: torch.Tensor) -> None:
        # Unpacking four names doubles as the rank check for `key`.
        batch, _heads, length, _channels = key.shape
        assert value.dim() == 4
        assert value.shape[0] == batch
        assert value.shape[2] == length
        self.key = key
        self.value = value

    def _validate(self, cache: torch.Tensor, new_tensor: torch.Tensor) -> None:
        """Check that ``new_tensor`` can be concatenated to ``cache`` along dim 2."""
        assert cache.dim() == 4
        assert new_tensor.dim() == 4
        # Every axis except the concatenation axis (2) has to agree.
        for axis in (0, 1, 3):
            assert cache.shape[axis] == new_tensor.shape[axis]

    def append_cache(self, k: torch.Tensor, v: torch.Tensor) -> None:
        """Extend the stored key and value tensors with ``k`` and ``v``."""
        self._validate(self.key, k)
        self._validate(self.value, v)
        assert k.shape[2] == v.shape[2]
        self.key = torch.cat((self.key, k), dim=2)
        self.value = torch.cat((self.value, v), dim=2)

    def sequence_length(self) -> int:
        """Return the current size of the cached dimension (dim 2 of ``key``)."""
        return self.key.shape[2]
# Per-layer cache type: currently just the attention key/value cache.
PlamoLayerCache = PlamoAttentionCache
# Whole-model cache: one entry per decoder layer (PlamoDecoder.forward indexes it by layer position).
PlamoCache = list[PlamoLayerCache]
class DecoderInput(NamedTuple):
    """Bundle of arguments consumed by ``PlamoDecoder.forward``.

    The models in this file build it positionally, so field order is part of
    the interface.
    """

    # Activations fed to the first decoder layer.
    hidden_states: torch.Tensor
    # Position indices forwarded unchanged to every layer.
    position_ids: torch.Tensor
    # Mask forwarded unchanged to every layer (None means no explicit mask).
    attention_mask: Optional[torch.Tensor] = None
    # Existing per-layer caches, indexed by layer position.
    past_key_values: Optional[PlamoCache] = None
    # When truthy, the input of every layer is collected.
    output_hidden_states: Optional[bool] = False
    # When truthy, per-layer attention weights are collected.
    output_attentions: Optional[bool] = False
    # When truthy, per-layer caches are collected and returned.
    use_cache: Optional[bool] = False
    # Enables activation checkpointing (only honoured while the decoder is in training mode).
    gradient_checkpointing: bool = False
    # Not read by PlamoDecoder.forward in this file.
    input_ids: Optional[torch.Tensor] = None
class DecoderOutput(NamedTuple):
    """Result tuple returned by ``PlamoDecoder.forward``."""

    # Output of the last decoder layer.
    hidden_states: torch.Tensor
    # Inputs of each layer when output_hidden_states was requested, else None.
    all_hidden_states: Optional[Tuple[torch.Tensor, ...]]
    # Per-layer attention weights when output_attentions was requested, else None.
    all_self_attns: Optional[Tuple[torch.Tensor, ...]]
    # Per-layer caches when use_cache was requested, else None.
    next_decoder_cache: Optional[PlamoCache]
class LinearType(str, enum.Enum):
    """String-valued choices for ``PlamoConfig.linear_type``.

    NOTE(review): nothing in this file branches on the value; the meaning of the
    fp8 variants is presumably defined elsewhere — confirm before relying on it.
    """

    Normal = "normal"
    Fp8 = "fp8"
    Fp8Retain = "fp8-retain"
class PlamoConfig(PretrainedConfig):  # type: ignore
    """Configuration container for the Plamo models defined in this file.

    Every constructor argument is stored unchanged as an attribute of the same
    name. The tokenizer class, the special-token ids, ``tie_word_embeddings`` and
    any extra ``**kwargs`` are forwarded to ``PretrainedConfig.__init__``.
    """

    model_type: str = "plamo"

    def __init__(
        self,
        vocab_size: int = 32000,
        hidden_size: int = 4096,
        intermediate_size: int = 13312,
        num_hidden_layers: int = 32,
        num_attention_heads: int = 32,
        num_key_value_heads: int = 4,
        hidden_size_per_head: int = 128,
        max_position_embeddings: int = 2048,
        initializer_range: float = 0.02,
        rms_norm_eps: float = 1e-6,
        use_cache: bool = True,
        tokenizer_class: str = "PlamoTokenizer",
        pad_token_id: Optional[int] = None,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        tie_word_embeddings: bool = False,
        n_expert: Optional[int] = None,
        k_expert: Optional[int] = None,
        expert_dropout: float = 0.0,
        capacity_factor: float = 1.0,
        group_size: int = 1024,
        sparse_step: Optional[int] = None,
        sparse_intermediate_size: Optional[int] = None,
        shared_intermediate_size: Optional[int] = None,
        linear_type: LinearType = LinearType.Normal,
        fp8_accum_dtype: Optional[str] = None,
        eval_attention_n_bit: Optional[int] = None,
        eval_mlp_n_bit: Optional[int] = None,
        eval_offload_moe: bool = False,
        attention_dropout: float = 0.0,
        **kwargs: Any,
    ) -> None:
        # Core transformer dimensions (read by Attention, DenseMLP and the model classes).
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_size_per_head = hidden_size_per_head
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.num_key_value_heads = num_key_value_heads

        # Mixture-of-experts related settings. Within this file only `sparse_step`
        # is consulted (by `is_sparse`); the rest are stored but not read here.
        self.n_expert = n_expert
        self.k_expert = k_expert
        self.sparse_intermediate_size = sparse_intermediate_size
        self.shared_intermediate_size = shared_intermediate_size
        self.expert_dropout = expert_dropout
        self.capacity_factor = capacity_factor
        self.group_size = group_size
        self.sparse_step = sparse_step

        # Linear-layer variant settings; stored but not read in this file.
        self.linear_type = linear_type
        self.fp8_accum_dtype = fp8_accum_dtype

        # Evaluation-only options; PlamoModel.__init__ asserts they are left at their defaults.
        self.eval_attention_n_bit = eval_attention_n_bit
        self.eval_mlp_n_bit = eval_mlp_n_bit
        self.eval_offload_moe = eval_offload_moe

        self.attention_dropout = attention_dropout

        super().__init__(
            tokenizer_class=tokenizer_class,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
# Copied from transformers.models.bart.modeling_bart._make_causal_mask
def _make_causal_mask(
    input_ids_shape: Tuple[int, int],
    dtype: torch.dtype,
    device: torch.device,
    past_key_values_length: int = 0,
) -> torch.Tensor:
    """
    Build an additive causal mask of shape
    `[bsz, 1, tgt_len, tgt_len + past_key_values_length]`.

    Visible positions hold 0 and hidden positions hold the most negative finite
    value of `dtype`. Columns belonging to the past are always visible.
    """
    batch_size, target_len = input_ids_shape
    lowest = torch.finfo(dtype).min

    causal = torch.full((target_len, target_len), lowest, device=device)
    positions = torch.arange(target_len, device=device)
    # Entry (row, col) is visible when col < row + 1, i.e. on or below the diagonal.
    visible = positions < (positions + 1).view(target_len, 1)
    causal.masked_fill_(visible, 0)
    causal = causal.to(dtype)

    if past_key_values_length > 0:
        past_block = torch.zeros(
            target_len, past_key_values_length, dtype=dtype, device=device
        )
        causal = torch.cat([past_block, causal], dim=-1)

    total_len = target_len + past_key_values_length
    return causal[None, None, :, :].expand(batch_size, 1, target_len, total_len)
# Copied from transformers.models.bart.modeling_bart._expand_mask
def _expand_mask(
    mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None
) -> torch.Tensor:
    """
    Turn a `[bsz, seq_len]` keep-mask into an additive
    `[bsz, 1, tgt_seq_len, src_seq_len]` mask.

    Entries equal to 1 in `mask` become 0; every other entry becomes the most
    negative finite value of `dtype`.
    """
    batch_size, source_len = mask.size()
    if tgt_len is None:
        tgt_len = source_len

    broadcast = mask[:, None, None, :].expand(batch_size, 1, tgt_len, source_len)
    inverted = 1.0 - broadcast.to(dtype)
    blocked = inverted.to(torch.bool)
    return inverted.masked_fill(blocked, torch.finfo(dtype).min)  # type: ignore
class RotaryEmbedding(torch.nn.Module):
    """Cache of rotary-embedding cosine/sine tables.

    Tables are stored as non-persistent buffers shaped ``[1, 1, seq_len, dim]``
    and rebuilt on demand when a longer sequence is requested.
    """

    def __init__(
        self,
        dim: int,
        max_position_embeddings: int = 2048,
        base: int = 10000,
        device: Optional[torch.device] = None,
    ) -> None:
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base

        exponents = torch.arange(0, self.dim, 2).float().to(device) / self.dim
        inv_freq = 1.0 / (self.base**exponents)
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings,
            device=self.inv_freq.device,
            dtype=torch.get_default_dtype(),
        )

    def _set_cos_sin_cache(self, seq_len: int, device: Any, dtype: Any) -> None:
        """(Re)build the cos/sin tables for positions ``0 .. seq_len - 1``."""
        self.max_seq_len_cached = seq_len
        positions = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype  # type: ignore
        )
        angles = torch.einsum("i,j->ij", positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        table = torch.cat((angles, angles), dim=-1)
        for buffer_name, values in (("cos_cached", table.cos()), ("sin_cached", table.sin())):
            self.register_buffer(
                buffer_name, values[None, None, :, :].to(dtype), persistent=False
            )

    def forward(
        self, x: torch.Tensor, seq_len: int
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the (cos, sin) tables truncated to ``seq_len`` and cast to ``x.dtype``.

        ``x`` is only used for its device and dtype.
        """
        needs_rebuild = seq_len > self.max_seq_len_cached
        if needs_rebuild:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        cos = self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype)  # type: ignore
        sin = self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype)  # type: ignore
        return cos, sin
def _rotate_half(x: torch.Tensor) -> torch.Tensor:
    """Swap the two halves of the last dimension, negating the half that moves to the front."""
    split_at = x.shape[-1] // 2
    front, back = x[..., :split_at], x[..., split_at:]
    return torch.cat((-back, front), dim=-1)
def _rotary_pos_emb(
    x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, position_ids: torch.Tensor
) -> torch.Tensor:
    """Rotate ``x`` using the cos/sin table rows selected by ``position_ids``."""
    # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
    cos_table = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
    sin_table = sin.squeeze(1).squeeze(0)  # [seq_len, dim]

    # Gather one row per position, then add a singleton head axis for broadcasting.
    cos_at_pos = cos_table[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
    sin_at_pos = sin_table[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]

    return (x * cos_at_pos) + (_rotate_half(x) * sin_at_pos)
def _rms_norm(
    hidden_states: torch.Tensor, weight: Optional[torch.Tensor], eps: float
) -> torch.Tensor:
    """RMS-normalise ``hidden_states`` over its last dimension.

    The statistics are computed in float32 and the result is cast back to the
    input dtype before the optional ``weight`` is applied.
    """
    original_dtype = hidden_states.dtype
    upcast = hidden_states.to(torch.float32)
    mean_square = upcast.pow(2).mean(-1, keepdim=True)
    normalised = (upcast * torch.rsqrt(mean_square + eps)).to(original_dtype)
    if weight is None:
        return normalised
    return weight * normalised
class RMSNorm(nn.Module):
    """RMS normalisation layer with a learnable per-feature scale initialised to one."""

    def __init__(
        self,
        hidden_size: int,
        eps: float = 1e-6,
        device: Optional[Union[torch.device, str]] = None,
    ) -> None:
        super().__init__()
        self.variance_epsilon = eps
        initial_scale = torch.ones(hidden_size, device=device)
        self.weight = nn.Parameter(initial_scale)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Normalise ``hidden_states`` over the last dimension and apply the scale."""
        return _rms_norm(
            hidden_states=hidden_states,
            weight=self.weight,
            eps=self.variance_epsilon,
        )
class Attention(torch.nn.Module):
    """Grouped-query self-attention with RMS-normalised queries/keys and rotary embeddings.

    Queries use ``config.num_attention_heads`` heads, while keys and values use
    ``config.num_key_value_heads`` heads that are repeated to match the queries.
    ``is_causal`` (True here) selects causal attention when no explicit mask is
    supplied. An explicit ``attention_mask`` is used as-is, so it is expected to
    already encode any causal structure that is wanted.
    """

    def __init__(self, config: PlamoConfig) -> None:
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        head_dim = config.hidden_size_per_head
        self.max_position_embeddings = config.max_position_embeddings

        self.q_num_heads = config.num_attention_heads
        self.qk_dim = self.v_dim = head_dim
        self.k_num_heads = self.v_num_heads = config.num_key_value_heads
        assert self.q_num_heads % self.k_num_heads == 0
        self.n_group = self.q_num_heads // self.k_num_heads

        self.q_proj_dim = self.q_num_heads * self.qk_dim
        self.k_proj_dim = self.k_num_heads * self.qk_dim
        self.v_proj_dim = self.k_num_heads * self.v_dim
        # Query, key and value projections are fused into a single linear layer.
        self.qkv_proj = nn.Linear(
            self.hidden_size,
            self.q_proj_dim + self.k_proj_dim + self.v_proj_dim,
            bias=False,
        )
        self.o_proj = nn.Linear(
            self.q_num_heads * self.v_dim, self.hidden_size, bias=False
        )
        self.rotary_emb = RotaryEmbedding(
            self.qk_dim, max_position_embeddings=self.max_position_embeddings
        )
        # Learnable per-head scales applied after the weight-free RMS norm of q and k.
        self.q_weight = torch.nn.Parameter(torch.ones((self.q_num_heads, self.qk_dim)))
        self.k_weight = torch.nn.Parameter(torch.ones((self.k_num_heads, self.qk_dim)))
        self.is_causal = True
        self.attention_dropout = config.attention_dropout

    @staticmethod
    def _expand_kv(t: torch.Tensor, repeat: int, target: int) -> torch.Tensor:
        """Repeat each key/value head ``repeat`` times and keep the first ``target`` heads."""
        t = torch.repeat_interleave(t, repeat, dim=1)
        return t[:, :target]

    def _eager_attention(
        self,
        query_states: torch.Tensor,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
        causal: bool,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Explicit softmax attention that also returns the attention weights."""
        scores = torch.matmul(query_states, key_states.transpose(-1, -2))
        scores = scores * (self.qk_dim**-0.5)
        lowest = torch.finfo(scores.dtype).min
        if causal:
            q_len, kv_len = scores.shape[-2:]
            visible = torch.ones(
                q_len, kv_len, dtype=torch.bool, device=scores.device
            ).tril()
            scores = scores.masked_fill(~visible, lowest)
        if attention_mask is not None:
            if attention_mask.dtype == torch.bool:
                # Boolean masks mark the positions that may be attended to.
                scores = scores.masked_fill(~attention_mask, lowest)
            else:
                scores = scores + attention_mask
        weights = torch.softmax(scores, dim=-1, dtype=torch.float32).to(
            query_states.dtype
        )
        dropped = F.dropout(
            weights, p=self.attention_dropout, training=self.training
        )
        return torch.matmul(dropped, value_states), weights

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        past_key_value: Optional[PlamoLayerCache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[PlamoLayerCache]]:
        """Run self-attention over ``hidden_states``.

        Args:
            hidden_states: Input of shape ``[batch, seq_len, hidden_size]``.
            attention_mask: Optional mask handed to the attention kernel. Non-boolean
                masks are cast to the query dtype.
            position_ids: Required indices into the rotary tables.
            past_key_value: Existing cache for this layer; it is extended in place.
            output_attentions: When True, attention is computed explicitly so the
                weights can be returned (the fused kernel does not expose them).
            use_cache: When True and no cache is given, an empty one is created.

        Returns:
            ``(attn_output, attn_weights, past_key_value)``; ``attn_weights`` is None
            unless ``output_attentions`` is True.
        """
        bsz, q_len, _ = hidden_states.size()

        qkv = self.qkv_proj(hidden_states)
        query_states, key_states, value_states = torch.split(
            qkv, [self.q_proj_dim, self.k_proj_dim, self.v_proj_dim], dim=-1
        )
        query_states = query_states.view(
            bsz, q_len, self.q_num_heads, self.qk_dim
        ).transpose(1, 2)
        key_states = key_states.view(
            bsz, q_len, self.k_num_heads, self.qk_dim
        ).transpose(1, 2)
        value_states = value_states.view(
            bsz, q_len, self.v_num_heads, self.v_dim
        ).transpose(1, 2)

        attn_dtype = query_states.dtype

        query_states = (
            _rms_norm(query_states, None, 1e-6) * self.q_weight[None, :, None]
        )
        key_states = _rms_norm(key_states, None, 1e-6) * self.k_weight[None, :, None]

        if use_cache and past_key_value is None:
            # Start from an empty cache so the append below handles both cases uniformly.
            past_key_value = PlamoAttentionCache(
                torch.zeros(
                    (bsz, self.k_num_heads, 0, self.qk_dim),
                    dtype=key_states.dtype,
                    device=key_states.device,
                ),
                torch.zeros(
                    (bsz, self.v_num_heads, 0, self.v_dim),
                    dtype=value_states.dtype,
                    device=value_states.device,
                ),
            )

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value.sequence_length()
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        assert position_ids is not None
        query_states = _rotary_pos_emb(query_states, cos, sin, position_ids)
        key_states = _rotary_pos_emb(key_states, cos, sin, position_ids)
        # [bsz, nh, t, hd]

        if past_key_value is not None:
            # reuse k, v, self_attention
            past_key_value.append_cache(key_states, value_states)
            key_states = past_key_value.key
            value_states = past_key_value.value

        # expand shared kv
        assert self.k_num_heads == self.v_num_heads
        key_states = self._expand_kv(key_states, self.n_group, self.q_num_heads)
        value_states = self._expand_kv(value_states, self.n_group, self.q_num_heads)

        query_states = query_states.to(attn_dtype)
        key_states = key_states.to(attn_dtype)
        value_states = value_states.to(attn_dtype)

        if attention_mask is not None and attention_mask.dtype != torch.bool:
            attention_mask = attention_mask.to(attn_dtype)

        # The fused kernel raises when `is_causal=True` is combined with an explicit
        # mask, and its built-in causal mask is aligned to the top-left corner, which
        # is only right when queries and keys cover the same positions. So the flag is
        # used only in that case; with a cached prefix an explicit mask aligned to the
        # last key is built instead (a single query may simply see every key).
        use_causal = False
        if self.is_causal and attention_mask is None:
            total_len = key_states.shape[-2]
            if q_len == total_len:
                use_causal = True
            elif q_len > 1:
                attention_mask = torch.ones(
                    q_len, total_len, dtype=torch.bool, device=query_states.device
                ).tril(diagonal=total_len - q_len)

        attn_weights: Optional[torch.Tensor] = None
        if output_attentions:
            attn_output, attn_weights = self._eager_attention(
                query_states, key_states, value_states, attention_mask, use_causal
            )
        else:
            attn_output = F.scaled_dot_product_attention(
                query_states,
                key_states,
                value_states,
                attn_mask=attention_mask,
                is_causal=use_causal,
                dropout_p=self.attention_dropout if self.training else 0.0,
            )

        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, q_len, self.q_num_heads * self.v_dim)
        attn_output = self.o_proj(attn_output)

        return attn_output, attn_weights, past_key_value
class DenseMLP(nn.Module):
    """Feed-forward block: fused gate/up projection, SwiGLU gating, down projection."""

    def __init__(self, config: PlamoConfig) -> None:
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        # One matmul yields both the gate and the up halves, hence twice the width.
        fused_width = self.intermediate_size * 2
        self.gate_up_proj = torch.nn.Linear(self.hidden_size, fused_width, bias=False)
        self.down_proj = torch.nn.Linear(
            self.intermediate_size, self.hidden_size, bias=False
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the feed-forward block to ``x``."""
        gated = _swiglu(self.gate_up_proj(x))
        return self.down_proj(gated)  # type: ignore
def MLP(config: PlamoConfig, is_sparse: bool) -> torch.nn.Module:
    """Create the feed-forward module for a decoder layer.

    ``is_sparse`` is accepted but not consulted: a ``DenseMLP`` is always built.
    """
    dense_mlp = DenseMLP(config)
    return dense_mlp
class PlamoDecoderLayer(torch.nn.Module):
    """Pre-norm transformer block: attention and MLP, each wrapped in a residual connection."""

    def __init__(self, config: PlamoConfig, is_sparse: bool) -> None:
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.self_attn = Attention(config)
        self.mlp = MLP(config, is_sparse=is_sparse)
        norm_eps = config.rms_norm_eps
        self.norm = RMSNorm(config.hidden_size, eps=norm_eps)
        self.norm2 = RMSNorm(config.hidden_size, eps=norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[PlamoLayerCache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
    ) -> Tuple[Any, ...]:
        """Apply the block.

        Returns a tuple starting with the new hidden states, followed by the
        attention weights if ``output_attentions`` is truthy and then the layer
        cache if ``use_cache`` is truthy.
        """
        # from LlamaDecoder
        # Attention sub-block on the normalised input, added back to the raw input.
        attn_out, attn_weights, layer_cache = self.self_attn(
            hidden_states=self.norm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )
        after_attention = hidden_states + attn_out

        # Feed-forward sub-block with its own norm and residual.
        mlp_out = self.mlp(self.norm2(after_attention))
        block_output = after_attention + mlp_out

        outputs: Any = (block_output,)
        if output_attentions:
            outputs = outputs + (attn_weights,)
        if use_cache:
            outputs = outputs + (layer_cache,)
        return outputs  # type: ignore
def is_sparse(config: PlamoConfig, i: int) -> bool:
    """Report whether layer ``i`` is flagged sparse under ``config.sparse_step``.

    No step means never; a step of 1 means always; otherwise a layer is flagged
    when its index leaves remainder 1 modulo the step.
    """
    step = config.sparse_step
    if step is None:
        return False
    return step == 1 or (i % step) == 1
class PlamoDecoder(torch.nn.Module):
    """Stack of ``PlamoDecoderLayer`` modules applied one after another."""

    def __init__(self, config: PlamoConfig) -> None:
        super().__init__()
        self.layers = torch.nn.ModuleList(
            [
                PlamoDecoderLayer(config, is_sparse=is_sparse(config, i))
                for i in range(config.num_hidden_layers)
            ]
        )

    def forward(self, x: DecoderInput) -> DecoderOutput:
        """Run every layer on ``x.hidden_states`` and collect the requested extras.

        Returns a ``DecoderOutput`` with the final hidden states plus, when
        requested through ``x``, the per-layer inputs, attention weights and caches.
        """
        all_hidden_states: Optional[Tuple[torch.Tensor, ...]] = (
            () if x.output_hidden_states else None
        )
        all_self_attns: Optional[Tuple[torch.Tensor, ...]] = (
            () if x.output_attentions else None
        )
        next_decoder_cache: Optional[PlamoCache] = [] if x.use_cache else None
        hidden_states = x.hidden_states

        for idx, decoder_layer in enumerate(self.layers):
            if x.output_hidden_states:
                # Records the *input* of each layer; the final output is not appended here.
                assert all_hidden_states is not None
                all_hidden_states += (hidden_states,)

            past_key_value = (
                x.past_key_values[idx] if x.past_key_values is not None else None
            )

            if self.training and x.gradient_checkpointing:

                def create_custom_forward(module):  # type: ignore
                    def custom_forward(*inputs):  # type: ignore
                        # `inputs` already ends with None for past_key_value; the
                        # trailing None here lands on the layer's `use_cache` slot.
                        return module(*inputs, x.output_attentions, None)

                    return custom_forward

                # Positional order must match PlamoDecoderLayer.forward:
                # hidden_states, attention_mask, position_ids, past_key_value.
                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(decoder_layer),  # type: ignore
                    hidden_states,
                    x.attention_mask,
                    x.position_ids,
                    None,
                    use_reentrant=False,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=x.attention_mask,
                    position_ids=x.position_ids,
                    past_key_value=past_key_value,
                    output_attentions=x.output_attentions,
                    use_cache=x.use_cache,
                )

            hidden_states = layer_outputs[0]

            if x.use_cache:
                # The cache follows the attention weights when those are returned too.
                cache = layer_outputs[2 if x.output_attentions else 1]
                assert cache is not None
                assert next_decoder_cache is not None
                # `+=` with a tuple extends the list by that single cache.
                next_decoder_cache += (cache,)

            if x.output_attentions:
                assert layer_outputs[1] is not None
                assert all_self_attns is not None
                all_self_attns += (layer_outputs[1],)
        return DecoderOutput(
            hidden_states, all_hidden_states, all_self_attns, next_decoder_cache
        )
class PlamoPreTrainedModel(PreTrainedModel):  # type: ignore
    """Shared base class: class-level loading settings and weight initialisation."""

    config_class = PlamoConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _supports_sdpa = True
    _no_split_modules: List[str] = ["PlamoDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]

    def _init_weights(self, module: torch.nn.Module) -> None:
        """Initialise linear and embedding weights from N(0, ``initializer_range``).

        Linear biases and the embedding row at ``padding_idx`` are zeroed. Other
        module types are left untouched.
        """
        std = self.config.initializer_range
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return
        module.weight.data.normal_(mean=0.0, std=std)
        if isinstance(module, nn.Linear):
            if module.bias is not None:
                module.bias.data.zero_()
        elif module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()

    def _set_gradient_checkpointing(
        self, module: torch.nn.Module, value: bool = False
    ) -> None:
        """Store the gradient-checkpointing flag on ``module``."""
        setattr(module, "gradient_checkpointing", value)
class PlamoModel(PlamoPreTrainedModel):
    """Plamo backbone: token embedding, decoder stack and final RMS norm."""

    def __init__(self, config: PlamoConfig):
        super().__init__(config)
        # Evaluation-only config options are not supported by this implementation.
        assert config.eval_attention_n_bit is None
        assert config.eval_mlp_n_bit is None
        assert not config.eval_offload_moe

        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        self.layers = PlamoDecoder(config)  # type: ignore
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> torch.nn.Embedding:
        """Return the token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value: torch.nn.Embedding) -> None:
        """Replace the token embedding module."""
        self.embed_tokens = value

    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
    def _prepare_decoder_attention_mask(
        self,
        attention_mask: torch.Tensor,
        input_shape: Tuple[int, int],
        inputs_embeds: Optional[torch.Tensor],
        past_key_values_length: int,
    ) -> Optional[torch.Tensor]:
        """Combine a causal mask with an optional padding mask (additive, 4-D).

        Not called by ``forward`` in this file, which uses the transformers mask
        helper instead. Kept for callers that may still rely on it.
        """
        # create causal mask
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        combined_attention_mask: Optional[torch.Tensor] = None
        if input_shape[-1] > 1:
            assert inputs_embeds is not None
            combined_attention_mask = _make_causal_mask(
                input_shape,
                inputs_embeds.dtype,
                device=inputs_embeds.device,
                past_key_values_length=past_key_values_length,
            )

        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            assert inputs_embeds is not None
            expanded_attn_mask = _expand_mask(
                attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
            ).to(inputs_embeds.device)
            combined_attention_mask = (
                expanded_attn_mask
                if combined_attention_mask is None
                else expanded_attn_mask + combined_attention_mask
            )

        return combined_attention_mask

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[PlamoCache] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """Encode a batch of token ids (or precomputed embeddings).

        Exactly one of ``input_ids`` and ``inputs_embeds`` must be given.
        Flags left as None fall back to the values stored in ``self.config``.

        Raises:
            ValueError: if both or neither of ``input_ids`` / ``inputs_embeds``
                are supplied.
        """
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
            )
        if input_ids is not None:
            batch_size, seq_length = input_ids.shape
            device = input_ids.device
        elif inputs_embeds is not None:
            batch_size, seq_length = inputs_embeds.shape[:2]
            device = inputs_embeds.device
        else:
            raise ValueError(
                "You have to specify either decoder_input_ids or decoder_inputs_embeds"
            )

        seq_length_with_past = seq_length
        past_key_values_length = 0

        if past_key_values is not None:
            past_key_values_length = past_key_values[0].sequence_length()
            seq_length_with_past = seq_length_with_past + past_key_values_length

        if position_ids is None:
            # Default positions continue right after the cached prefix.
            position_ids = torch.arange(
                past_key_values_length,
                seq_length + past_key_values_length,
                dtype=torch.long,
                device=device,
            )
            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
        else:
            position_ids = position_ids.view(-1, seq_length).long()

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        # In training without a mask or cache the mask stays None and the attention
        # layers fall back to their own `is_causal` handling.
        if (
            attention_mask is not None
            or not self.training
            or past_key_values is not None
        ):
            if attention_mask is None:
                attention_mask = torch.ones(
                    (batch_size, seq_length_with_past),
                    dtype=torch.bool,
                    device=inputs_embeds.device,
                )
            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
                attention_mask,
                (batch_size, seq_length),
                inputs_embeds,
                past_key_values_length,
            )

        hidden_states = inputs_embeds

        if self.gradient_checkpointing and self.training:
            # The checkpointed path never passes or returns caches.
            if use_cache:
                use_cache = False

        # decoder layers
        out = self.layers(
            DecoderInput(
                hidden_states,
                position_ids,
                attention_mask,
                past_key_values,
                output_hidden_states,
                output_attentions,
                use_cache,
                self.gradient_checkpointing,
            )
        )
        assert isinstance(out, DecoderOutput)
        hidden_states = out.hidden_states
        all_hidden_states = out.all_hidden_states
        all_self_attns = out.all_self_attns
        next_decoder_cache = out.next_decoder_cache

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            assert all_hidden_states is not None
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
                if v is not None
            )
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
class ModifiedAttention(Attention):
    """``Attention`` variant whose ``is_causal`` flag is switched off after construction."""

    def __init__(self, config: PlamoConfig, **kwargs):
        # Extra keyword arguments are passed straight through to the parent constructor.
        super().__init__(config, **kwargs)
        setattr(self, "is_causal", False)
# Maps `config._attn_implementation` to the attention class built by
# ModifiedPlamoDecoderLayer. Only "sdpa" is registered, so any other
# implementation name fails the lookup with a KeyError.
PLAMO_ATTENTION_CLASSES = {
    "sdpa": ModifiedAttention,
}
class ModifiedPlamoDecoderLayer(PlamoDecoderLayer):
    """Decoder layer whose attention class is picked from ``PLAMO_ATTENTION_CLASSES``.

    The parent constructor is deliberately bypassed so the default ``Attention``
    module is never built; ``forward`` is inherited unchanged.
    """

    def __init__(self, config: PlamoConfig, is_sparse: bool):
        nn.Module.__init__(self)
        self.config = config
        self.hidden_size = config.hidden_size
        attention_cls = PLAMO_ATTENTION_CLASSES[config._attn_implementation]
        self.self_attn = attention_cls(config=config)
        self.mlp = MLP(config, is_sparse=is_sparse)
        norm_eps = config.rms_norm_eps
        self.norm = RMSNorm(config.hidden_size, eps=norm_eps)
        self.norm2 = RMSNorm(config.hidden_size, eps=norm_eps)
class ModifiedPlamoDecoder(PlamoDecoder):
    """Decoder stack built from ``ModifiedPlamoDecoderLayer``; ``forward`` is inherited."""

    def __init__(self, config: PlamoConfig) -> None:
        # Skip PlamoDecoder.__init__ so the default layers are never instantiated.
        nn.Module.__init__(self)
        decoder_layers = []
        for layer_idx in range(config.num_hidden_layers):
            layer = ModifiedPlamoDecoderLayer(
                config, is_sparse=is_sparse(config, layer_idx)
            )
            decoder_layers.append(layer)
        self.layers = nn.ModuleList(decoder_layers)
class PlamoBiModel(PlamoModel):
    """Embedding model: Plamo backbone using ``ModifiedAttention`` plus text-encoding helpers.

    NOTE(review): ``forward`` still builds its mask with the *causal* mask helpers
    although ``ModifiedAttention`` turns ``is_causal`` off. Confirm this matches how
    the released weights were trained before changing the masking.
    """

    _no_split_modules = ["ModifiedPlamoDecoderLayer"]

    def __init__(self, config: PlamoConfig):
        # Bypass PlamoModel.__init__ so the default decoder stack is not built.
        PlamoPreTrainedModel.__init__(self, config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        self.layers = ModifiedPlamoDecoder(config)
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.gradient_checkpointing = False
        self._attn_implementation = config._attn_implementation
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[PlamoCache] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """Encode a batch of token ids (or precomputed embeddings).

        Exactly one of ``input_ids`` and ``inputs_embeds`` must be given.
        Flags left as None fall back to the values stored in ``self.config``.

        Raises:
            ValueError: if both or neither of ``input_ids`` / ``inputs_embeds``
                are supplied.
        """
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
            )
        if input_ids is not None:
            batch_size, seq_length = input_ids.shape
            device = input_ids.device
        elif inputs_embeds is not None:
            batch_size, seq_length = inputs_embeds.shape[:2]
            device = inputs_embeds.device
        else:
            raise ValueError(
                "You have to specify either decoder_input_ids or decoder_inputs_embeds"
            )

        seq_length_with_past = seq_length
        past_key_values_length = 0
        if past_key_values is not None:
            past_key_values_length = past_key_values[0].sequence_length()
            seq_length_with_past = seq_length_with_past + past_key_values_length

        if position_ids is None:
            # Default positions continue right after the cached prefix.
            position_ids = torch.arange(
                past_key_values_length,
                seq_length + past_key_values_length,
                dtype=torch.long,
                device=device,
            )
            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
        else:
            position_ids = position_ids.view(-1, seq_length).long()

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if self._attn_implementation == "sdpa" and not output_attentions:
            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
                attention_mask,
                (batch_size, seq_length),
                inputs_embeds,
                past_key_values_length,
            )
        else:
            attention_mask = _prepare_4d_causal_attention_mask(
                attention_mask,
                (batch_size, seq_length),
                inputs_embeds,
                past_key_values_length,
                # PlamoConfig declares no `sliding_window`; it is only present when
                # supplied as an extra config entry, so default to None.
                sliding_window=getattr(self.config, "sliding_window", None),
            )

        hidden_states = inputs_embeds

        if self.gradient_checkpointing and self.training:
            # The checkpointed path never passes or returns caches.
            if use_cache:
                use_cache = False

        out = self.layers(
            DecoderInput(
                hidden_states,
                position_ids,
                attention_mask,
                past_key_values,
                output_hidden_states,
                output_attentions,
                use_cache,
                self.gradient_checkpointing,
            )
        )
        assert isinstance(out, DecoderOutput)
        hidden_states = out.hidden_states
        all_hidden_states = out.all_hidden_states
        all_self_attns = out.all_self_attns
        next_decoder_cache = out.next_decoder_cache

        hidden_states = self.norm(hidden_states)

        if output_hidden_states:
            assert all_hidden_states is not None
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
                if v is not None
            )
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    def _tokenize(
        self,
        texts: List[str],
        tokenizer: AutoTokenizer,
        add_special_tokens: bool = True,
    ) -> BatchEncoding:
        """Tokenize ``texts`` into a left-padded, truncated PyTorch batch.

        Side effect: sets ``pad_token`` and ``padding_side`` on the tokenizer
        that was passed in.
        """
        tokenizer.pad_token = tokenizer.eos_token
        # Left padding keeps each sequence's real tokens at the end of its row,
        # which `_tokenize_with_instruction` relies on.
        tokenizer.padding_side = "left"
        return tokenizer(
            texts,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=self.config.max_length,
            add_special_tokens=add_special_tokens,
        )

    def _tokenize_with_instruction(
        self,
        sentences: List[str],
        tokenizer: AutoTokenizer,
        instruction: str,
        add_special_tokens: bool = True,
    ) -> Tuple[BatchEncoding, torch.Tensor]:
        """Tokenize ``instruction + sentence`` and build a mask over the sentence tokens.

        Returns the tokenized batch and a mask of the same shape as its attention
        mask. In each row the last ``n`` entries are 1, where ``n`` is the token
        count of the sentence tokenized on its own without special tokens.
        """
        sentence_features = self._tokenize(
            sentences, tokenizer, add_special_tokens=False
        )
        sentences_with_instruction = [instruction + sentence for sentence in sentences]
        sentence_features_with_instruction = self._tokenize(
            sentences_with_instruction, tokenizer, add_special_tokens
        )

        embed_mask_list = []
        for sentence_mask, full_mask in zip(
            sentence_features["attention_mask"],
            sentence_features_with_instruction["attention_mask"],
        ):
            n_tokens = int(sentence_mask.sum().item())
            mask = torch.zeros_like(full_mask)
            if n_tokens > 0:
                # Scalar assignment cannot hit a shape mismatch if the count
                # exceeds the row length.
                mask[-n_tokens:] = 1
            embed_mask_list.append(mask.unsqueeze(0))
        embed_mask = torch.cat(embed_mask_list, dim=0)
        return sentence_features_with_instruction, embed_mask

    def _mean_pooling(
        self,
        sentence_features: BatchEncoding,
        last_hidden_state: torch.Tensor,
        embed_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Average ``last_hidden_state`` over the positions selected by the mask.

        Uses ``embed_mask`` when given, otherwise the batch's attention mask.
        The divisor is clamped to at least 1 so an all-zero row yields zeros.
        """
        if embed_mask is None:
            mask = sentence_features["attention_mask"]
        else:
            mask = embed_mask

        sum_hidden = (
            last_hidden_state * mask.unsqueeze(-1).type_as(last_hidden_state)
        ).sum(dim=1)
        lengths = mask.sum(dim=1, keepdim=True).clamp(min=1)
        return sum_hidden / lengths

    def encode(
        self,
        sentences: Union[str, List[str]],
        tokenizer: AutoTokenizer,
        instruction: str,
    ) -> torch.Tensor:
        """Embed one sentence or a list of sentences, each prefixed with ``instruction``.

        Returns one mean-pooled vector per sentence. Pooling covers only the
        positions marked by ``_tokenize_with_instruction``.
        """
        if isinstance(sentences, str):
            sentences = [sentences]

        sentence_features, embed_mask = self._tokenize_with_instruction(
            sentences,
            tokenizer,
            instruction=instruction,
        )
        sentence_features = sentence_features.to(self.device)
        embed_mask = embed_mask.to(self.device)

        reps = self(**sentence_features)
        return self._mean_pooling(sentence_features, reps.last_hidden_state, embed_mask)

    def encode_document(
        self,
        sentences: Union[str, List[str]],
        tokenizer: AutoTokenizer,
    ) -> torch.Tensor:
        """Embed documents; uses an empty instruction prefix."""
        default_document_instruction = ""
        return self.encode(sentences, tokenizer, default_document_instruction)

    def encode_query(
        self,
        sentences: Union[str, List[str]],
        tokenizer: AutoTokenizer,
    ) -> torch.Tensor:
        """Embed search queries; prefixes each with the default Japanese retrieval instruction."""
        default_query_instruction = "次の文章に対して、関連する文章を検索してください: "
        return self.encode(sentences, tokenizer, default_query_instruction)