@@ -432,93 +432,6 @@ class AriaTextMoELayer(nn.Module):
|
|||||||
return output + shared_expert_output
|
return output + shared_expert_output
|
||||||
|
|
||||||
|
|
||||||
class AriaTextRotaryEmbedding(nn.Module):
    """Produce the cos/sin tables used for rotary position embeddings (RoPE).

    The inverse frequencies are obtained from the function registered in
    `ROPE_INIT_FUNCTIONS` under `self.rope_type`. That function returns a tuple
    `(inv_freq, attention_scaling)`; `inv_freq` is stored as a non-persistent
    buffer and `attention_scaling` multiplies the final cos/sin values.

    Args:
        dim, max_position_embeddings, base, scaling_factor, rope_type:
            Legacy parameterization, only read when `config` is None. They are
            forwarded to the RoPE init function as keyword arguments.
        device: Forwarded to the RoPE init function when building `inv_freq`.
        config: Model config. When given, `rope_type` is read from
            `config.rope_scaling` (key `"rope_type"`, falling back to the
            legacy key `"type"`) and the maximum length is read from
            `config.max_position_embeddings`.
    """

    def __init__(
        self,
        dim=None,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        rope_type="default",
        config: Optional[AriaTextConfig] = None,
    ):
        super().__init__()
        # TODO (joao): remove the `if` below, only used for BC
        self.rope_kwargs = {}
        if config is None:
            logger.warning_once(
                "`AriaTextRotaryEmbedding` can now be fully parameterized by passing the model config through the "
                "`config` argument. All other arguments will be removed in v4.46"
            )
            self.rope_kwargs = {
                "rope_type": rope_type,
                "factor": scaling_factor,
                "dim": dim,
                "base": base,
                "max_position_embeddings": max_position_embeddings,
            }
            self.rope_type = rope_type
            self.max_seq_len_cached = max_position_embeddings
            self.original_max_seq_len = max_position_embeddings
        else:
            # BC: "rope_type" was originally "type"
            if config.rope_scaling is not None:
                self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
            else:
                self.rope_type = "default"
            self.max_seq_len_cached = config.max_position_embeddings
            self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # Plain attribute (not a buffer): kept so the dynamic variant can restore the initial frequencies.
        self.original_inv_freq = self.inv_freq

    def _dynamic_frequency_update(self, position_ids, device):
        """
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        """
        seq_len = torch.max(position_ids) + 1
        if seq_len > self.max_seq_len_cached:  # growth
            inv_freq, self.attention_scaling = self.rope_init_fn(
                self.config, device, seq_len=seq_len, **self.rope_kwargs
            )
            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
            self.max_seq_len_cached = seq_len

        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
            # `original_inv_freq` is a plain attribute, so `nn.Module.to()` moves the `inv_freq` buffer but
            # not this saved copy. Move it explicitly, otherwise the restored buffer may sit on a different
            # device than the inputs after the model has been relocated.
            self.original_inv_freq = self.original_inv_freq.to(device)
            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
            self.max_seq_len_cached = self.original_max_seq_len

    @torch.no_grad()
    def forward(self, x, position_ids):
        """Return `(cos, sin)` for the given positions.

        Args:
            x: Tensor used only for its `device` and `dtype`.
            position_ids: Indexed as `position_ids[:, None, :]`, i.e. a 2D tensor
                whose first dimension is the batch.

        Returns:
            Tuple `(cos, sin)`, both cast to `x.dtype`.
        """
        if "dynamic" in self.rope_type:
            self._dynamic_frequency_update(position_ids, device=x.device)

        # Core RoPE block
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()

        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
        cos = cos * self.attention_scaling
        sin = sin * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
|
||||||
|
|
||||||
|
|
||||||
def rotate_half(x):
|
def rotate_half(x):
|
||||||
"""Rotates half the hidden dims of the input."""
|
"""Rotates half the hidden dims of the input."""
|
||||||
x1 = x[..., : x.shape[-1] // 2]
|
x1 = x[..., : x.shape[-1] // 2]
|
||||||
@@ -594,9 +507,6 @@ class AriaTextAttention(nn.Module):
|
|||||||
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
|
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
|
||||||
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
|
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
|
||||||
|
|
||||||
# TODO (joao): remove in v4.46 (RoPE is computed in the model, not in the decoder layers)
|
|
||||||
self.rotary_emb = AriaTextRotaryEmbedding(config=self.config)
|
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
@@ -606,7 +516,7 @@ class AriaTextAttention(nn.Module):
|
|||||||
output_attentions: bool = False,
|
output_attentions: bool = False,
|
||||||
use_cache: bool = False,
|
use_cache: bool = False,
|
||||||
cache_position: Optional[torch.LongTensor] = None,
|
cache_position: Optional[torch.LongTensor] = None,
|
||||||
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
|
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
||||||
bsz, q_len, _ = hidden_states.size()
|
bsz, q_len, _ = hidden_states.size()
|
||||||
@@ -620,15 +530,6 @@ class AriaTextAttention(nn.Module):
|
|||||||
key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
|
key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
|
||||||
value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
|
value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
|
||||||
|
|
||||||
if position_embeddings is None:
|
|
||||||
logger.warning_once(
|
|
||||||
"The attention layers in this model are transitioning from computing the RoPE embeddings internally "
|
|
||||||
"through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
|
|
||||||
"`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
|
|
||||||
"removed and `position_embeddings` will be mandatory."
|
|
||||||
)
|
|
||||||
cos, sin = self.rotary_emb(value_states, position_ids)
|
|
||||||
else:
|
|
||||||
cos, sin = position_embeddings
|
cos, sin = position_embeddings
|
||||||
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
|
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
|
||||||
|
|
||||||
@@ -692,7 +593,7 @@ class AriaTextFlashAttention2(AriaTextAttention):
|
|||||||
output_attentions: bool = False,
|
output_attentions: bool = False,
|
||||||
use_cache: bool = False,
|
use_cache: bool = False,
|
||||||
cache_position: Optional[torch.LongTensor] = None,
|
cache_position: Optional[torch.LongTensor] = None,
|
||||||
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
|
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
|
||||||
**kwargs: Unpack[FlashAttentionKwargs],
|
**kwargs: Unpack[FlashAttentionKwargs],
|
||||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
||||||
if isinstance(past_key_value, StaticCache):
|
if isinstance(past_key_value, StaticCache):
|
||||||
@@ -716,15 +617,6 @@ class AriaTextFlashAttention2(AriaTextAttention):
|
|||||||
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
|
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
|
||||||
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
|
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
|
||||||
|
|
||||||
if position_embeddings is None:
|
|
||||||
logger.warning_once(
|
|
||||||
"The attention layers in this model are transitioning from computing the RoPE embeddings internally "
|
|
||||||
"through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
|
|
||||||
"`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
|
|
||||||
"removed and `position_embeddings` will be mandatory."
|
|
||||||
)
|
|
||||||
cos, sin = self.rotary_emb(value_states, position_ids)
|
|
||||||
else:
|
|
||||||
cos, sin = position_embeddings
|
cos, sin = position_embeddings
|
||||||
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
|
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
|
||||||
|
|
||||||
@@ -807,7 +699,7 @@ class AriaTextSdpaAttention(AriaTextAttention):
|
|||||||
output_attentions: bool = False,
|
output_attentions: bool = False,
|
||||||
use_cache: bool = False,
|
use_cache: bool = False,
|
||||||
cache_position: Optional[torch.LongTensor] = None,
|
cache_position: Optional[torch.LongTensor] = None,
|
||||||
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
|
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
||||||
if output_attentions:
|
if output_attentions:
|
||||||
@@ -838,15 +730,6 @@ class AriaTextSdpaAttention(AriaTextAttention):
|
|||||||
key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
|
key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
|
||||||
value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
|
value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
|
||||||
|
|
||||||
if position_embeddings is None:
|
|
||||||
logger.warning_once(
|
|
||||||
"The attention layers in this model are transitioning from computing the RoPE embeddings internally "
|
|
||||||
"through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
|
|
||||||
"`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
|
|
||||||
"removed and `position_embeddings` will be mandatory."
|
|
||||||
)
|
|
||||||
cos, sin = self.rotary_emb(value_states, position_ids)
|
|
||||||
else:
|
|
||||||
cos, sin = position_embeddings
|
cos, sin = position_embeddings
|
||||||
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
|
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
|
||||||
|
|
||||||
@@ -928,7 +811,7 @@ class AriaTextDecoderLayer(nn.Module):
|
|||||||
output_attentions: Optional[bool] = False,
|
output_attentions: Optional[bool] = False,
|
||||||
use_cache: Optional[bool] = False,
|
use_cache: Optional[bool] = False,
|
||||||
cache_position: Optional[torch.LongTensor] = None,
|
cache_position: Optional[torch.LongTensor] = None,
|
||||||
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
|
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
|
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
|
||||||
"""
|
"""
|
||||||
@@ -1067,6 +950,93 @@ class AriaPreTrainedModel(PreTrainedModel):
|
|||||||
nn.init.trunc_normal_(module.query, std=std)
|
nn.init.trunc_normal_(module.query, std=std)
|
||||||
|
|
||||||
|
|
||||||
|
class AriaTextRotaryEmbedding(nn.Module):
    """Produce the cos/sin tables used for rotary position embeddings (RoPE).

    The inverse frequencies are obtained from the function registered in
    `ROPE_INIT_FUNCTIONS` under `self.rope_type`. That function returns a tuple
    `(inv_freq, attention_scaling)`; `inv_freq` is stored as a non-persistent
    buffer and `attention_scaling` multiplies the final cos/sin values.

    Args:
        dim, max_position_embeddings, base, scaling_factor, rope_type:
            Legacy parameterization, only read when `config` is None. They are
            forwarded to the RoPE init function as keyword arguments.
        device: Forwarded to the RoPE init function when building `inv_freq`.
        config: Model config. When given, `rope_type` is read from
            `config.rope_scaling` (key `"rope_type"`, falling back to the
            legacy key `"type"`) and the maximum length is read from
            `config.max_position_embeddings`.
    """

    def __init__(
        self,
        dim=None,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        rope_type="default",
        config: Optional[AriaTextConfig] = None,
    ):
        super().__init__()
        # TODO (joao): remove the `if` below, only used for BC
        self.rope_kwargs = {}
        if config is None:
            logger.warning_once(
                "`AriaTextRotaryEmbedding` can now be fully parameterized by passing the model config through the "
                "`config` argument. All other arguments will be removed in v4.46"
            )
            self.rope_kwargs = {
                "rope_type": rope_type,
                "factor": scaling_factor,
                "dim": dim,
                "base": base,
                "max_position_embeddings": max_position_embeddings,
            }
            self.rope_type = rope_type
            self.max_seq_len_cached = max_position_embeddings
            self.original_max_seq_len = max_position_embeddings
        else:
            # BC: "rope_type" was originally "type"
            if config.rope_scaling is not None:
                self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
            else:
                self.rope_type = "default"
            self.max_seq_len_cached = config.max_position_embeddings
            self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # Plain attribute (not a buffer): kept so the dynamic variant can restore the initial frequencies.
        self.original_inv_freq = self.inv_freq

    def _dynamic_frequency_update(self, position_ids, device):
        """
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        """
        seq_len = torch.max(position_ids) + 1
        if seq_len > self.max_seq_len_cached:  # growth
            inv_freq, self.attention_scaling = self.rope_init_fn(
                self.config, device, seq_len=seq_len, **self.rope_kwargs
            )
            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
            self.max_seq_len_cached = seq_len

        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
            # `original_inv_freq` is a plain attribute, so `nn.Module.to()` moves the `inv_freq` buffer but
            # not this saved copy. Move it explicitly, otherwise the restored buffer may sit on a different
            # device than the inputs after the model has been relocated.
            self.original_inv_freq = self.original_inv_freq.to(device)
            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
            self.max_seq_len_cached = self.original_max_seq_len

    @torch.no_grad()
    def forward(self, x, position_ids):
        """Return `(cos, sin)` for the given positions.

        Args:
            x: Tensor used only for its `device` and `dtype`.
            position_ids: Indexed as `position_ids[:, None, :]`, i.e. a 2D tensor
                whose first dimension is the batch.

        Returns:
            Tuple `(cos, sin)`, both cast to `x.dtype`.
        """
        if "dynamic" in self.rope_type:
            self._dynamic_frequency_update(position_ids, device=x.device)

        # Core RoPE block
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()

        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
        cos = cos * self.attention_scaling
        sin = sin * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
||||||
|
|
||||||
|
|
||||||
ARIA_TEXT_INPUTS_DOCSTRING = r"""
|
ARIA_TEXT_INPUTS_DOCSTRING = r"""
|
||||||
Args:
|
Args:
|
||||||
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
||||||
|
|||||||
Reference in New Issue
Block a user