More model refactoring! (#35359)
* cohere * style * phi3 * style * small fix * small fix * phi3 longrope * oups * Update rope (only for phi3 still) * Update test_modeling_rope_utils.py * Update modeling_phi3.py * fix * fix copies * style * Fix copied from bad renaming
This commit is contained in:
@@ -279,25 +279,20 @@ def _compute_longrope_parameters(
|
|||||||
# `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
|
# `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
|
||||||
# values to compute the default attention scaling factor, instead of using `factor`.
|
# values to compute the default attention scaling factor, instead of using `factor`.
|
||||||
if hasattr(config, "original_max_position_embeddings"):
|
if hasattr(config, "original_max_position_embeddings"):
|
||||||
if seq_len and seq_len < config.original_max_position_embeddings:
|
original_max_position_embeddings = config.original_max_position_embeddings
|
||||||
expanded_max_position_embeddings = config.original_max_position_embeddings
|
factor = config.max_position_embeddings / config.original_max_position_embeddings
|
||||||
else:
|
|
||||||
expanded_max_position_embeddings = config.max_position_embeddings
|
|
||||||
max_position_embeddings = config.original_max_position_embeddings
|
|
||||||
factor = expanded_max_position_embeddings / max_position_embeddings
|
|
||||||
else:
|
else:
|
||||||
max_position_embeddings = config.max_position_embeddings
|
original_max_position_embeddings = config.max_position_embeddings
|
||||||
expanded_max_position_embeddings = max_position_embeddings * factor
|
|
||||||
|
|
||||||
# Sets the attention factor as suggested in the paper
|
# Sets the attention factor as suggested in the paper
|
||||||
if attention_factor is None:
|
if attention_factor is None:
|
||||||
if factor <= 1.0:
|
if factor <= 1.0:
|
||||||
attention_factor = 1.0
|
attention_factor = 1.0
|
||||||
else:
|
else:
|
||||||
attention_factor = math.sqrt(1 + math.log(factor) / math.log(max_position_embeddings))
|
attention_factor = math.sqrt(1 + math.log(factor) / math.log(original_max_position_embeddings))
|
||||||
|
|
||||||
# Compute the inverse frequencies -- scaled based on the target sequence length
|
# Compute the inverse frequencies -- scaled based on the target sequence length
|
||||||
if expanded_max_position_embeddings > max_position_embeddings:
|
if seq_len and seq_len > original_max_position_embeddings:
|
||||||
ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device)
|
ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device)
|
||||||
else:
|
else:
|
||||||
ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device)
|
ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device)
|
||||||
|
|||||||
@@ -723,11 +723,7 @@ class AriaPreTrainedModel(PreTrainedModel):
|
|||||||
|
|
||||||
|
|
||||||
class AriaTextRotaryEmbedding(nn.Module):
|
class AriaTextRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: AriaTextConfig, device=None):
|
||||||
self,
|
|
||||||
config: AriaTextConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -120,11 +120,7 @@ class HybridMambaAttentionDynamicCache(modeling_jamba.HybridMambaAttentionDynami
|
|||||||
|
|
||||||
|
|
||||||
class BambaRotaryEmbedding(nn.Module):
|
class BambaRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: BambaConfig, device=None):
|
||||||
self,
|
|
||||||
config: BambaConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -1,3 +1,9 @@
|
|||||||
|
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||||
|
# This file was automatically generated from src/transformers/models/cohere/modular_cohere.py.
|
||||||
|
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
||||||
|
# the file from the modular. If any change should be done, please apply the change to the
|
||||||
|
# modular_cohere.py file directly. One of our CI enforces this.
|
||||||
|
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
# Copyright 2024 Cohere team. All rights reserved.
|
# Copyright 2024 Cohere team. All rights reserved.
|
||||||
#
|
#
|
||||||
@@ -20,13 +26,10 @@
|
|||||||
|
|
||||||
# This file is based on the LLama model definition file in transformers
|
# This file is based on the LLama model definition file in transformers
|
||||||
|
|
||||||
"""PyTorch Cohere model."""
|
|
||||||
|
|
||||||
import math
|
from typing import Callable, List, Optional, Tuple, Union
|
||||||
from typing import List, Optional, Tuple, Union
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.utils.checkpoint
|
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
from ...activations import ACT2FN
|
from ...activations import ACT2FN
|
||||||
@@ -34,31 +37,21 @@ from ...cache_utils import Cache, DynamicCache, StaticCache
|
|||||||
from ...generation import GenerationMixin
|
from ...generation import GenerationMixin
|
||||||
from ...modeling_attn_mask_utils import AttentionMaskConverter
|
from ...modeling_attn_mask_utils import AttentionMaskConverter
|
||||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||||
from ...modeling_outputs import (
|
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
|
||||||
BaseModelOutputWithPast,
|
|
||||||
CausalLMOutputWithPast,
|
|
||||||
)
|
|
||||||
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
|
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
|
||||||
from ...modeling_utils import PreTrainedModel
|
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||||
from ...processing_utils import Unpack
|
from ...processing_utils import Unpack
|
||||||
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
|
|
||||||
from ...utils import (
|
from ...utils import (
|
||||||
|
LossKwargs,
|
||||||
add_start_docstrings,
|
add_start_docstrings,
|
||||||
add_start_docstrings_to_model_forward,
|
add_start_docstrings_to_model_forward,
|
||||||
is_flash_attn_2_available,
|
|
||||||
is_flash_attn_greater_or_equal_2_10,
|
|
||||||
logging,
|
logging,
|
||||||
replace_return_docstrings,
|
replace_return_docstrings,
|
||||||
)
|
)
|
||||||
from .configuration_cohere import CohereConfig
|
from .configuration_cohere import CohereConfig
|
||||||
|
|
||||||
|
|
||||||
if is_flash_attn_2_available():
|
|
||||||
from ...modeling_flash_attention_utils import _flash_attention_forward
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
_CONFIG_FOR_DOC = "CohereConfig"
|
_CONFIG_FOR_DOC = "CohereConfig"
|
||||||
|
|
||||||
|
|
||||||
@@ -79,49 +72,17 @@ class CohereLayerNorm(nn.Module):
|
|||||||
return hidden_states.to(input_dtype)
|
return hidden_states.to(input_dtype)
|
||||||
|
|
||||||
|
|
||||||
ALL_LAYERNORM_LAYERS.append(CohereLayerNorm)
|
|
||||||
|
|
||||||
|
|
||||||
class CohereRotaryEmbedding(nn.Module):
|
class CohereRotaryEmbedding(nn.Module):
|
||||||
# Note: the forward pass of this RoPE is slightly different from Llama's, resulting in different `sin`/`cos` for
|
def __init__(self, config: CohereConfig, device=None):
|
||||||
# the same parameterization. The differences are highlighted with a comment.
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
dim=None,
|
|
||||||
max_position_embeddings=2048,
|
|
||||||
base=10000,
|
|
||||||
device=None,
|
|
||||||
scaling_factor=1.0,
|
|
||||||
rope_type="default",
|
|
||||||
config: Optional[CohereConfig] = None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
# TODO (joao): remove the `if` below, only used for BC
|
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
if config is None:
|
# BC: "rope_type" was originally "type"
|
||||||
logger.warning_once(
|
if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
|
||||||
"`CohereRotaryEmbedding` can now be fully parameterized by passing the model config through the "
|
self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
|
||||||
"`config` argument. All other arguments will be removed in v4.46"
|
|
||||||
)
|
|
||||||
self.rope_kwargs = {
|
|
||||||
"rope_type": rope_type,
|
|
||||||
"factor": scaling_factor,
|
|
||||||
"dim": dim,
|
|
||||||
"base": base,
|
|
||||||
"max_position_embeddings": max_position_embeddings,
|
|
||||||
}
|
|
||||||
self.rope_type = rope_type
|
|
||||||
self.max_seq_len_cached = max_position_embeddings
|
|
||||||
self.original_max_seq_len = max_position_embeddings
|
|
||||||
else:
|
else:
|
||||||
# BC: "rope_type" was originally "type"
|
self.rope_type = "default"
|
||||||
if config.rope_scaling is not None:
|
self.max_seq_len_cached = config.max_position_embeddings
|
||||||
self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
|
self.original_max_seq_len = config.max_position_embeddings
|
||||||
else:
|
|
||||||
self.rope_type = "default"
|
|
||||||
self.max_seq_len_cached = config.max_position_embeddings
|
|
||||||
self.original_max_seq_len = config.max_position_embeddings
|
|
||||||
|
|
||||||
self.config = config
|
self.config = config
|
||||||
self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
|
self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
|
||||||
@@ -161,7 +122,7 @@ class CohereRotaryEmbedding(nn.Module):
|
|||||||
device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
|
device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
|
||||||
with torch.autocast(device_type=device_type, enabled=False):
|
with torch.autocast(device_type=device_type, enabled=False):
|
||||||
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
|
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
|
||||||
emb = torch.repeat_interleave(freqs, 2, dim=-1) # This line differs from Llama's implementation
|
emb = torch.repeat_interleave(freqs, 2, dim=-1) # diff from Llama: we interleave() instead of cat()
|
||||||
cos = emb.cos()
|
cos = emb.cos()
|
||||||
sin = emb.sin()
|
sin = emb.sin()
|
||||||
|
|
||||||
@@ -172,6 +133,60 @@ class CohereRotaryEmbedding(nn.Module):
|
|||||||
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
||||||
|
|
||||||
|
|
||||||
|
class CohereMLP(nn.Module):
|
||||||
|
def __init__(self, config):
|
||||||
|
super().__init__()
|
||||||
|
self.config = config
|
||||||
|
self.hidden_size = config.hidden_size
|
||||||
|
self.intermediate_size = config.intermediate_size
|
||||||
|
self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
|
||||||
|
self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
|
||||||
|
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
|
||||||
|
self.act_fn = ACT2FN[config.hidden_act]
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
|
||||||
|
return down_proj
|
||||||
|
|
||||||
|
|
||||||
|
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
|
||||||
|
"""
|
||||||
|
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
|
||||||
|
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
|
||||||
|
"""
|
||||||
|
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
|
||||||
|
if n_rep == 1:
|
||||||
|
return hidden_states
|
||||||
|
hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
|
||||||
|
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
|
||||||
|
|
||||||
|
|
||||||
|
def eager_attention_forward(
|
||||||
|
module: nn.Module,
|
||||||
|
query: torch.Tensor,
|
||||||
|
key: torch.Tensor,
|
||||||
|
value: torch.Tensor,
|
||||||
|
attention_mask: Optional[torch.Tensor],
|
||||||
|
scaling: float,
|
||||||
|
dropout: float = 0.0,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
|
key_states = repeat_kv(key, module.num_key_value_groups)
|
||||||
|
value_states = repeat_kv(value, module.num_key_value_groups)
|
||||||
|
|
||||||
|
attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
|
||||||
|
if attention_mask is not None:
|
||||||
|
causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
|
||||||
|
attn_weights = attn_weights + causal_mask
|
||||||
|
|
||||||
|
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
|
||||||
|
attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
|
||||||
|
attn_output = torch.matmul(attn_weights, value_states)
|
||||||
|
attn_output = attn_output.transpose(1, 2).contiguous()
|
||||||
|
|
||||||
|
return attn_output, attn_weights
|
||||||
|
|
||||||
|
|
||||||
def rotate_half(x):
|
def rotate_half(x):
|
||||||
# Split and rotate. Note that this function is different from e.g. Llama.
|
# Split and rotate. Note that this function is different from e.g. Llama.
|
||||||
x1 = x[..., ::2]
|
x1 = x[..., ::2]
|
||||||
@@ -210,36 +225,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
|
|||||||
return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype)
|
return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype)
|
||||||
|
|
||||||
|
|
||||||
class CohereMLP(nn.Module):
|
|
||||||
def __init__(self, config):
|
|
||||||
super().__init__()
|
|
||||||
self.config = config
|
|
||||||
self.hidden_size = config.hidden_size
|
|
||||||
self.intermediate_size = config.intermediate_size
|
|
||||||
self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
|
|
||||||
self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
|
|
||||||
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
|
|
||||||
self.act_fn = ACT2FN[config.hidden_act]
|
|
||||||
|
|
||||||
# Ignore copy
|
|
||||||
def forward(self, x):
|
|
||||||
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
|
|
||||||
return down_proj
|
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.llama.modeling_llama.repeat_kv
|
|
||||||
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
|
|
||||||
"""
|
|
||||||
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
|
|
||||||
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
|
|
||||||
"""
|
|
||||||
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
|
|
||||||
if n_rep == 1:
|
|
||||||
return hidden_states
|
|
||||||
hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
|
|
||||||
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
|
|
||||||
|
|
||||||
|
|
||||||
class CohereAttention(nn.Module):
|
class CohereAttention(nn.Module):
|
||||||
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
||||||
|
|
||||||
@@ -247,334 +232,97 @@ class CohereAttention(nn.Module):
|
|||||||
super().__init__()
|
super().__init__()
|
||||||
self.config = config
|
self.config = config
|
||||||
self.layer_idx = layer_idx
|
self.layer_idx = layer_idx
|
||||||
if layer_idx is None:
|
self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
|
||||||
logger.warning_once(
|
self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
|
||||||
f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
|
self.scaling = self.head_dim**-0.5
|
||||||
"lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
|
|
||||||
"when creating this class."
|
|
||||||
)
|
|
||||||
|
|
||||||
self.attention_dropout = config.attention_dropout
|
self.attention_dropout = config.attention_dropout
|
||||||
self.hidden_size = config.hidden_size
|
|
||||||
self.num_heads = config.num_attention_heads
|
|
||||||
self.head_dim = self.hidden_size // self.num_heads
|
|
||||||
self.num_key_value_heads = config.num_key_value_heads
|
|
||||||
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
|
|
||||||
self.max_position_embeddings = config.max_position_embeddings
|
|
||||||
self.rope_theta = config.rope_theta
|
|
||||||
self.is_causal = True
|
self.is_causal = True
|
||||||
|
|
||||||
|
self.q_proj = nn.Linear(
|
||||||
|
config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
|
||||||
|
)
|
||||||
|
self.k_proj = nn.Linear(
|
||||||
|
config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
|
||||||
|
)
|
||||||
|
self.v_proj = nn.Linear(
|
||||||
|
config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
|
||||||
|
)
|
||||||
|
self.o_proj = nn.Linear(
|
||||||
|
config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
|
||||||
|
)
|
||||||
self.use_qk_norm = config.use_qk_norm
|
self.use_qk_norm = config.use_qk_norm
|
||||||
|
|
||||||
if (self.head_dim * self.num_heads) != self.hidden_size:
|
|
||||||
raise ValueError(
|
|
||||||
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
|
|
||||||
f" and `num_heads`: {self.num_heads})."
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.use_qk_norm:
|
if self.use_qk_norm:
|
||||||
# When sharding the model using Tensor Parallelism, need to be careful to use n_local_heads
|
# When sharding the model using Tensor Parallelism, need to be careful to use n_local_heads
|
||||||
self.q_norm = CohereLayerNorm(hidden_size=(self.num_heads, self.head_dim), eps=config.layer_norm_eps)
|
self.q_norm = CohereLayerNorm(
|
||||||
|
hidden_size=(config.num_attention_heads, self.head_dim), eps=config.layer_norm_eps
|
||||||
|
)
|
||||||
self.k_norm = CohereLayerNorm(
|
self.k_norm = CohereLayerNorm(
|
||||||
hidden_size=(self.num_key_value_heads, self.head_dim), eps=config.layer_norm_eps
|
hidden_size=(config.num_key_value_heads, self.head_dim), eps=config.layer_norm_eps
|
||||||
)
|
)
|
||||||
|
|
||||||
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
|
|
||||||
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
|
|
||||||
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
|
|
||||||
self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias)
|
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
attention_mask: Optional[torch.Tensor] = None,
|
position_embeddings: Tuple[torch.Tensor, torch.Tensor],
|
||||||
position_ids: Optional[torch.LongTensor] = None,
|
attention_mask: Optional[torch.Tensor],
|
||||||
past_key_value: Optional[Cache] = None,
|
past_key_value: Optional[Cache] = None,
|
||||||
output_attentions: bool = False,
|
|
||||||
use_cache: bool = False,
|
|
||||||
cache_position: Optional[torch.LongTensor] = None,
|
cache_position: Optional[torch.LongTensor] = None,
|
||||||
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
|
**kwargs: Unpack[FlashAttentionKwargs],
|
||||||
**kwargs,
|
|
||||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
||||||
bsz, q_len, _ = hidden_states.size()
|
input_shape = hidden_states.shape[:-1]
|
||||||
|
hidden_shape = (*input_shape, -1, self.head_dim)
|
||||||
|
|
||||||
query_states = self.q_proj(hidden_states)
|
query_states = self.q_proj(hidden_states).view(hidden_shape)
|
||||||
key_states = self.k_proj(hidden_states)
|
key_states = self.k_proj(hidden_states).view(hidden_shape)
|
||||||
value_states = self.v_proj(hidden_states)
|
value_states = self.v_proj(hidden_states).view(hidden_shape)
|
||||||
|
|
||||||
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
|
if self.use_qk_norm: # main diff from Llama
|
||||||
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim)
|
|
||||||
if self.use_qk_norm:
|
|
||||||
query_states = self.q_norm(query_states)
|
query_states = self.q_norm(query_states)
|
||||||
key_states = self.k_norm(key_states)
|
key_states = self.k_norm(key_states)
|
||||||
|
|
||||||
query_states = query_states.transpose(1, 2)
|
|
||||||
key_states = key_states.transpose(1, 2)
|
|
||||||
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
|
|
||||||
|
|
||||||
cos, sin = position_embeddings
|
|
||||||
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
|
|
||||||
|
|
||||||
if past_key_value is not None:
|
|
||||||
# sin and cos are specific to RoPE models; position_ids needed for the static cache
|
|
||||||
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
|
|
||||||
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
|
||||||
|
|
||||||
key_states = repeat_kv(key_states, self.num_key_value_groups)
|
|
||||||
value_states = repeat_kv(value_states, self.num_key_value_groups)
|
|
||||||
|
|
||||||
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
|
|
||||||
|
|
||||||
if attention_mask is not None: # no matter the length, we just slice it
|
|
||||||
causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
|
|
||||||
attn_weights = attn_weights + causal_mask
|
|
||||||
|
|
||||||
# upcast attention to fp32
|
|
||||||
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
|
|
||||||
attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
|
|
||||||
attn_output = torch.matmul(attn_weights, value_states)
|
|
||||||
|
|
||||||
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
|
|
||||||
raise ValueError(
|
|
||||||
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
|
|
||||||
f" {attn_output.size()}"
|
|
||||||
)
|
|
||||||
|
|
||||||
attn_output = attn_output.transpose(1, 2).contiguous()
|
|
||||||
|
|
||||||
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
|
|
||||||
|
|
||||||
attn_output = self.o_proj(attn_output)
|
|
||||||
|
|
||||||
if not output_attentions:
|
|
||||||
attn_weights = None
|
|
||||||
|
|
||||||
return attn_output, attn_weights, past_key_value
|
|
||||||
|
|
||||||
|
|
||||||
# NO LONGER EXIST Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Cohere
|
|
||||||
# TODO cyril: modular
|
|
||||||
class CohereFlashAttention2(CohereAttention):
|
|
||||||
"""
|
|
||||||
Cohere flash attention module. This module inherits from `CohereAttention` as the weights of the module stays
|
|
||||||
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
|
|
||||||
flash attention and deal with padding tokens in case the input contains any of them.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
super().__init__(*args, **kwargs)
|
|
||||||
|
|
||||||
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
|
|
||||||
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
|
|
||||||
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
|
|
||||||
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
|
|
||||||
|
|
||||||
# Ignore copy
|
|
||||||
def forward(
|
|
||||||
self,
|
|
||||||
hidden_states: torch.Tensor,
|
|
||||||
attention_mask: Optional[torch.LongTensor] = None,
|
|
||||||
position_ids: Optional[torch.LongTensor] = None,
|
|
||||||
past_key_value: Optional[Cache] = None,
|
|
||||||
output_attentions: bool = False,
|
|
||||||
use_cache: bool = False,
|
|
||||||
cache_position: Optional[torch.LongTensor] = None,
|
|
||||||
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
|
|
||||||
**kwargs,
|
|
||||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
|
||||||
if isinstance(past_key_value, StaticCache):
|
|
||||||
raise ValueError(
|
|
||||||
"`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
|
|
||||||
"make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
|
|
||||||
)
|
|
||||||
output_attentions = False
|
|
||||||
|
|
||||||
bsz, q_len, _ = hidden_states.size()
|
|
||||||
|
|
||||||
query_states = self.q_proj(hidden_states)
|
|
||||||
key_states = self.k_proj(hidden_states)
|
|
||||||
value_states = self.v_proj(hidden_states)
|
|
||||||
|
|
||||||
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
|
|
||||||
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim)
|
|
||||||
if self.use_qk_norm:
|
|
||||||
query_states = self.q_norm(query_states)
|
|
||||||
key_states = self.k_norm(key_states)
|
|
||||||
|
|
||||||
query_states = query_states.transpose(1, 2)
|
|
||||||
key_states = key_states.transpose(1, 2)
|
|
||||||
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
|
|
||||||
|
|
||||||
cos, sin = position_embeddings
|
|
||||||
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
|
|
||||||
|
|
||||||
if past_key_value is not None:
|
|
||||||
# sin and cos are specific to RoPE models; position_ids needed for the static cache
|
|
||||||
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
|
|
||||||
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
|
||||||
|
|
||||||
# TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
|
|
||||||
# to be able to avoid many of these transpose/reshape/view.
|
|
||||||
query_states = query_states.transpose(1, 2)
|
query_states = query_states.transpose(1, 2)
|
||||||
key_states = key_states.transpose(1, 2)
|
key_states = key_states.transpose(1, 2)
|
||||||
value_states = value_states.transpose(1, 2)
|
value_states = value_states.transpose(1, 2)
|
||||||
|
|
||||||
dropout_rate = self.attention_dropout if self.training else 0.0
|
|
||||||
|
|
||||||
# In PEFT, usually we cast the layer norms in float32 for training stability reasons
|
|
||||||
# therefore the input hidden states gets silently casted in float32. Hence, we need
|
|
||||||
# cast them back in the correct dtype just to be sure everything works as expected.
|
|
||||||
# This might slowdown training & inference so it is recommended to not cast the LayerNorms
|
|
||||||
# in fp32. (CohereLayerNorm handles it correctly)
|
|
||||||
|
|
||||||
input_dtype = query_states.dtype
|
|
||||||
if input_dtype == torch.float32:
|
|
||||||
if torch.is_autocast_enabled():
|
|
||||||
target_dtype = torch.get_autocast_gpu_dtype()
|
|
||||||
# Handle the case where the model is quantized
|
|
||||||
elif hasattr(self.config, "_pre_quantization_dtype"):
|
|
||||||
target_dtype = self.config._pre_quantization_dtype
|
|
||||||
else:
|
|
||||||
target_dtype = self.q_proj.weight.dtype
|
|
||||||
|
|
||||||
logger.warning_once(
|
|
||||||
f"The input hidden states seems to be silently casted in float32, this might be related to"
|
|
||||||
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
|
|
||||||
f" {target_dtype}."
|
|
||||||
)
|
|
||||||
|
|
||||||
query_states = query_states.to(target_dtype)
|
|
||||||
key_states = key_states.to(target_dtype)
|
|
||||||
value_states = value_states.to(target_dtype)
|
|
||||||
|
|
||||||
attn_output = _flash_attention_forward(
|
|
||||||
query_states,
|
|
||||||
key_states,
|
|
||||||
value_states,
|
|
||||||
attention_mask,
|
|
||||||
q_len,
|
|
||||||
dropout=dropout_rate,
|
|
||||||
use_top_left_mask=self._flash_attn_uses_top_left_mask,
|
|
||||||
is_causal=self.is_causal,
|
|
||||||
)
|
|
||||||
|
|
||||||
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
|
|
||||||
attn_output = self.o_proj(attn_output)
|
|
||||||
|
|
||||||
if not output_attentions:
|
|
||||||
attn_weights = None
|
|
||||||
|
|
||||||
return attn_output, attn_weights, past_key_value
|
|
||||||
|
|
||||||
|
|
||||||
class CohereSdpaAttention(CohereAttention):
|
|
||||||
"""
|
|
||||||
Cohere attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
|
|
||||||
`CohereAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
|
|
||||||
SDPA API.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def forward(
|
|
||||||
self,
|
|
||||||
hidden_states: torch.Tensor,
|
|
||||||
attention_mask: Optional[torch.Tensor] = None,
|
|
||||||
position_ids: Optional[torch.LongTensor] = None,
|
|
||||||
past_key_value: Optional[Cache] = None,
|
|
||||||
output_attentions: bool = False,
|
|
||||||
use_cache: bool = False,
|
|
||||||
cache_position: Optional[torch.LongTensor] = None,
|
|
||||||
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
|
|
||||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
|
||||||
if output_attentions:
|
|
||||||
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
|
|
||||||
logger.warning_once(
|
|
||||||
"CohereModel is using CohereSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
|
|
||||||
'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
|
|
||||||
)
|
|
||||||
return super().forward(
|
|
||||||
hidden_states=hidden_states,
|
|
||||||
attention_mask=attention_mask,
|
|
||||||
position_ids=position_ids,
|
|
||||||
past_key_value=past_key_value,
|
|
||||||
output_attentions=output_attentions,
|
|
||||||
use_cache=use_cache,
|
|
||||||
cache_position=cache_position,
|
|
||||||
position_embeddings=position_embeddings,
|
|
||||||
)
|
|
||||||
|
|
||||||
bsz, q_len, _ = hidden_states.size()
|
|
||||||
|
|
||||||
query_states = self.q_proj(hidden_states)
|
|
||||||
key_states = self.k_proj(hidden_states)
|
|
||||||
value_states = self.v_proj(hidden_states)
|
|
||||||
|
|
||||||
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
|
|
||||||
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim)
|
|
||||||
if self.use_qk_norm:
|
|
||||||
query_states = self.q_norm(query_states)
|
|
||||||
key_states = self.k_norm(key_states)
|
|
||||||
|
|
||||||
query_states = query_states.transpose(1, 2)
|
|
||||||
key_states = key_states.transpose(1, 2)
|
|
||||||
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
|
|
||||||
|
|
||||||
cos, sin = position_embeddings
|
cos, sin = position_embeddings
|
||||||
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
|
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
|
||||||
|
|
||||||
if past_key_value is not None:
|
if past_key_value is not None:
|
||||||
# sin and cos are specific to RoPE models; cache_position needed for the static cache
|
# sin and cos are specific to RoPE models; position_ids needed for the static cache
|
||||||
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
|
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
|
||||||
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
||||||
|
|
||||||
key_states = repeat_kv(key_states, self.num_key_value_groups)
|
attention_interface: Callable = eager_attention_forward
|
||||||
value_states = repeat_kv(value_states, self.num_key_value_groups)
|
if self.config._attn_implementation != "eager":
|
||||||
|
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
|
||||||
|
logger.warning_once(
|
||||||
|
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
|
||||||
|
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||||
|
|
||||||
causal_mask = attention_mask
|
attn_output, attn_weights = attention_interface(
|
||||||
# if attention_mask is not None and cache_position is not None:
|
self,
|
||||||
if attention_mask is not None:
|
|
||||||
causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
|
|
||||||
|
|
||||||
# SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
|
|
||||||
# Reference: https://github.com/pytorch/pytorch/issues/112577.
|
|
||||||
if query_states.device.type == "cuda" and causal_mask is not None:
|
|
||||||
query_states = query_states.contiguous()
|
|
||||||
key_states = key_states.contiguous()
|
|
||||||
value_states = value_states.contiguous()
|
|
||||||
|
|
||||||
# We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
|
|
||||||
# in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
|
|
||||||
is_causal = True if causal_mask is None and q_len > 1 else False
|
|
||||||
|
|
||||||
attn_output = torch.nn.functional.scaled_dot_product_attention(
|
|
||||||
query_states,
|
query_states,
|
||||||
key_states,
|
key_states,
|
||||||
value_states,
|
value_states,
|
||||||
attn_mask=causal_mask,
|
attention_mask,
|
||||||
dropout_p=self.attention_dropout if self.training else 0.0,
|
dropout=0.0 if not self.training else self.attention_dropout,
|
||||||
is_causal=is_causal,
|
scaling=self.scaling,
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
attn_output = attn_output.transpose(1, 2).contiguous()
|
attn_output = attn_output.reshape(*input_shape, -1).contiguous()
|
||||||
attn_output = attn_output.view(bsz, q_len, self.hidden_size)
|
|
||||||
|
|
||||||
attn_output = self.o_proj(attn_output)
|
attn_output = self.o_proj(attn_output)
|
||||||
|
return attn_output, attn_weights
|
||||||
return attn_output, None, past_key_value
|
|
||||||
|
|
||||||
|
|
||||||
COHERE_ATTENTION_CLASSES = {
|
|
||||||
"eager": CohereAttention,
|
|
||||||
"flash_attention_2": CohereFlashAttention2,
|
|
||||||
"sdpa": CohereSdpaAttention,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class CohereDecoderLayer(nn.Module):
|
class CohereDecoderLayer(nn.Module):
|
||||||
def __init__(self, config: CohereConfig, layer_idx: int):
|
def __init__(self, config: CohereConfig, layer_idx: int):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.hidden_size = config.hidden_size
|
self.hidden_size = config.hidden_size
|
||||||
|
self.self_attn = CohereAttention(config=config, layer_idx=layer_idx)
|
||||||
self.self_attn = COHERE_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
|
|
||||||
|
|
||||||
self.mlp = CohereMLP(config)
|
self.mlp = CohereMLP(config)
|
||||||
self.input_layernorm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)
|
self.input_layernorm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)
|
||||||
|
|
||||||
@@ -583,11 +331,12 @@ class CohereDecoderLayer(nn.Module):
|
|||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
attention_mask: Optional[torch.Tensor] = None,
|
attention_mask: Optional[torch.Tensor] = None,
|
||||||
position_ids: Optional[torch.LongTensor] = None,
|
position_ids: Optional[torch.LongTensor] = None,
|
||||||
past_key_value: Optional[Tuple[torch.Tensor]] = None,
|
past_key_value: Optional[Cache] = None,
|
||||||
output_attentions: Optional[bool] = False,
|
output_attentions: Optional[bool] = False,
|
||||||
use_cache: Optional[bool] = False,
|
use_cache: Optional[bool] = False,
|
||||||
cache_position: Optional[torch.LongTensor] = None,
|
cache_position: Optional[torch.LongTensor] = None,
|
||||||
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
|
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
|
||||||
|
**kwargs: Unpack[FlashAttentionKwargs],
|
||||||
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
|
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
@@ -595,13 +344,13 @@ class CohereDecoderLayer(nn.Module):
|
|||||||
attention_mask (`torch.FloatTensor`, *optional*):
|
attention_mask (`torch.FloatTensor`, *optional*):
|
||||||
attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
|
attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
|
||||||
query_sequence_length, key_sequence_length)` if default attention is used.
|
query_sequence_length, key_sequence_length)` if default attention is used.
|
||||||
|
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
|
||||||
output_attentions (`bool`, *optional*):
|
output_attentions (`bool`, *optional*):
|
||||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
||||||
returned tensors for more detail.
|
returned tensors for more detail.
|
||||||
use_cache (`bool`, *optional*):
|
use_cache (`bool`, *optional*):
|
||||||
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
|
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
|
||||||
(see `past_key_values`).
|
(see `past_key_values`).
|
||||||
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
|
|
||||||
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
|
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
|
||||||
Indices depicting the position of the input sequence tokens in the sequence
|
Indices depicting the position of the input sequence tokens in the sequence
|
||||||
position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
|
position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
|
||||||
@@ -613,7 +362,7 @@ class CohereDecoderLayer(nn.Module):
|
|||||||
hidden_states = self.input_layernorm(hidden_states)
|
hidden_states = self.input_layernorm(hidden_states)
|
||||||
|
|
||||||
# Self Attention
|
# Self Attention
|
||||||
hidden_states_attention, self_attn_weights, present_key_value = self.self_attn(
|
hidden_states_attention, self_attn_weights = self.self_attn(
|
||||||
hidden_states=hidden_states,
|
hidden_states=hidden_states,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
@@ -622,6 +371,7 @@ class CohereDecoderLayer(nn.Module):
|
|||||||
use_cache=use_cache,
|
use_cache=use_cache,
|
||||||
cache_position=cache_position,
|
cache_position=cache_position,
|
||||||
position_embeddings=position_embeddings,
|
position_embeddings=position_embeddings,
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Fully Connected
|
# Fully Connected
|
||||||
@@ -631,19 +381,16 @@ class CohereDecoderLayer(nn.Module):
|
|||||||
hidden_states = residual + hidden_states_attention + hidden_states_mlp
|
hidden_states = residual + hidden_states_attention + hidden_states_mlp
|
||||||
|
|
||||||
outputs = (hidden_states,)
|
outputs = (hidden_states,)
|
||||||
|
|
||||||
if output_attentions:
|
if output_attentions:
|
||||||
outputs += (self_attn_weights,)
|
outputs += (self_attn_weights,)
|
||||||
|
|
||||||
if use_cache:
|
|
||||||
outputs += (present_key_value,)
|
|
||||||
|
|
||||||
return outputs
|
return outputs
|
||||||
|
|
||||||
|
|
||||||
COHERE_START_DOCSTRING = r"""
|
COHERE_START_DOCSTRING = r"""
|
||||||
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
||||||
library implements for all its model (such as downloading or saving, resizing the input embeddings etc.).
|
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
|
||||||
|
etc.)
|
||||||
|
|
||||||
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
||||||
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
||||||
@@ -661,7 +408,6 @@ COHERE_START_DOCSTRING = r"""
|
|||||||
"The bare Cohere Model outputting raw hidden-states without any specific head on top.",
|
"The bare Cohere Model outputting raw hidden-states without any specific head on top.",
|
||||||
COHERE_START_DOCSTRING,
|
COHERE_START_DOCSTRING,
|
||||||
)
|
)
|
||||||
# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->Cohere
|
|
||||||
class CoherePreTrainedModel(PreTrainedModel):
|
class CoherePreTrainedModel(PreTrainedModel):
|
||||||
config_class = CohereConfig
|
config_class = CohereConfig
|
||||||
base_model_prefix = "model"
|
base_model_prefix = "model"
|
||||||
@@ -754,6 +500,10 @@ COHERE_INPUTS_DOCSTRING = r"""
|
|||||||
more detail.
|
more detail.
|
||||||
return_dict (`bool`, *optional*):
|
return_dict (`bool`, *optional*):
|
||||||
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
||||||
|
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
|
||||||
|
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
|
||||||
|
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
|
||||||
|
the complete sequence length.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@@ -761,8 +511,6 @@ COHERE_INPUTS_DOCSTRING = r"""
|
|||||||
"The bare Cohere Model outputting raw hidden-states without any specific head on top.",
|
"The bare Cohere Model outputting raw hidden-states without any specific head on top.",
|
||||||
COHERE_START_DOCSTRING,
|
COHERE_START_DOCSTRING,
|
||||||
)
|
)
|
||||||
# copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Cohere, LLAMA->COHERE
|
|
||||||
# TODO cyril: modular
|
|
||||||
class CohereModel(CoherePreTrainedModel):
|
class CohereModel(CoherePreTrainedModel):
|
||||||
"""
|
"""
|
||||||
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`CohereDecoderLayer`]
|
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`CohereDecoderLayer`]
|
||||||
@@ -771,7 +519,6 @@ class CohereModel(CoherePreTrainedModel):
|
|||||||
config: CohereConfig
|
config: CohereConfig
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Ignore copy
|
|
||||||
def __init__(self, config: CohereConfig):
|
def __init__(self, config: CohereConfig):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
self.padding_idx = config.pad_token_id
|
self.padding_idx = config.pad_token_id
|
||||||
@@ -800,7 +547,7 @@ class CohereModel(CoherePreTrainedModel):
|
|||||||
input_ids: torch.LongTensor = None,
|
input_ids: torch.LongTensor = None,
|
||||||
attention_mask: Optional[torch.Tensor] = None,
|
attention_mask: Optional[torch.Tensor] = None,
|
||||||
position_ids: Optional[torch.LongTensor] = None,
|
position_ids: Optional[torch.LongTensor] = None,
|
||||||
past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
|
past_key_values: Optional[Cache] = None,
|
||||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||||
use_cache: Optional[bool] = None,
|
use_cache: Optional[bool] = None,
|
||||||
output_attentions: Optional[bool] = None,
|
output_attentions: Optional[bool] = None,
|
||||||
@@ -1023,11 +770,13 @@ class CohereModel(CoherePreTrainedModel):
|
|||||||
return causal_mask
|
return causal_mask
|
||||||
|
|
||||||
|
|
||||||
# TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with Llama->Cohere
|
class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
|
||||||
|
|
||||||
|
|
||||||
class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
|
class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
|
||||||
_tied_weights_keys = ["lm_head.weight"]
|
_tied_weights_keys = ["lm_head.weight"]
|
||||||
|
_tp_plan = {"lm_head": "colwise_rep"}
|
||||||
|
|
||||||
# Ignore copy
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
self.model = CohereModel(config)
|
self.model = CohereModel(config)
|
||||||
@@ -1035,6 +784,7 @@ class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
|
|||||||
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
||||||
self.logit_scale = config.logit_scale
|
self.logit_scale = config.logit_scale
|
||||||
self.tie_word_embeddings = config.tie_word_embeddings
|
self.tie_word_embeddings = config.tie_word_embeddings
|
||||||
|
|
||||||
# Initialize weights and apply final processing
|
# Initialize weights and apply final processing
|
||||||
self.post_init()
|
self.post_init()
|
||||||
|
|
||||||
@@ -1056,7 +806,6 @@ class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
|
|||||||
def get_decoder(self):
|
def get_decoder(self):
|
||||||
return self.model
|
return self.model
|
||||||
|
|
||||||
# Ignore copy
|
|
||||||
@add_start_docstrings_to_model_forward(COHERE_INPUTS_DOCSTRING)
|
@add_start_docstrings_to_model_forward(COHERE_INPUTS_DOCSTRING)
|
||||||
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
|
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
|
||||||
def forward(
|
def forward(
|
||||||
@@ -1064,7 +813,7 @@ class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
|
|||||||
input_ids: torch.LongTensor = None,
|
input_ids: torch.LongTensor = None,
|
||||||
attention_mask: Optional[torch.Tensor] = None,
|
attention_mask: Optional[torch.Tensor] = None,
|
||||||
position_ids: Optional[torch.LongTensor] = None,
|
position_ids: Optional[torch.LongTensor] = None,
|
||||||
past_key_values: Optional[List[torch.FloatTensor]] = None,
|
past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
|
||||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||||
labels: Optional[torch.LongTensor] = None,
|
labels: Optional[torch.LongTensor] = None,
|
||||||
use_cache: Optional[bool] = None,
|
use_cache: Optional[bool] = None,
|
||||||
@@ -1073,7 +822,7 @@ class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
|
|||||||
return_dict: Optional[bool] = None,
|
return_dict: Optional[bool] = None,
|
||||||
cache_position: Optional[torch.LongTensor] = None,
|
cache_position: Optional[torch.LongTensor] = None,
|
||||||
num_logits_to_keep: int = 0,
|
num_logits_to_keep: int = 0,
|
||||||
**loss_kwargs,
|
**kwargs: Unpack[KwargsForCausalLM],
|
||||||
) -> Union[Tuple, CausalLMOutputWithPast]:
|
) -> Union[Tuple, CausalLMOutputWithPast]:
|
||||||
r"""
|
r"""
|
||||||
Args:
|
Args:
|
||||||
@@ -1123,16 +872,17 @@ class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
|
|||||||
output_hidden_states=output_hidden_states,
|
output_hidden_states=output_hidden_states,
|
||||||
return_dict=return_dict,
|
return_dict=return_dict,
|
||||||
cache_position=cache_position,
|
cache_position=cache_position,
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
hidden_states = outputs[0]
|
hidden_states = outputs[0]
|
||||||
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
|
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
|
||||||
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
|
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
|
||||||
logits = logits * self.logit_scale
|
logits = logits * self.logit_scale # main diff from Llama
|
||||||
|
|
||||||
loss = None
|
loss = None
|
||||||
if labels is not None:
|
if labels is not None:
|
||||||
loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
|
loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
|
||||||
|
|
||||||
if not return_dict:
|
if not return_dict:
|
||||||
output = (logits,) + outputs[1:]
|
output = (logits,) + outputs[1:]
|
||||||
|
|||||||
393
src/transformers/models/cohere/modular_cohere.py
Normal file
393
src/transformers/models/cohere/modular_cohere.py
Normal file
@@ -0,0 +1,393 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2024 Cohere team. All rights reserved.
|
||||||
|
#
|
||||||
|
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||||
|
# and OPT implementations in this library. It has been modified from its
|
||||||
|
# original forms to accommodate minor architectural differences compared
|
||||||
|
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# This file is based on the LLama model definition file in transformers
|
||||||
|
|
||||||
|
"""PyTorch Cohere model."""
|
||||||
|
|
||||||
|
from typing import Callable, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.utils.checkpoint
|
||||||
|
from torch import nn
|
||||||
|
|
||||||
|
from ...cache_utils import Cache
|
||||||
|
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||||
|
from ...modeling_outputs import CausalLMOutputWithPast
|
||||||
|
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
|
||||||
|
from ...processing_utils import Unpack
|
||||||
|
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
|
||||||
|
from ...utils import LossKwargs, logging
|
||||||
|
from ..llama.modeling_llama import (
|
||||||
|
LlamaAttention,
|
||||||
|
LlamaForCausalLM,
|
||||||
|
LlamaMLP,
|
||||||
|
LlamaModel,
|
||||||
|
LlamaRotaryEmbedding,
|
||||||
|
eager_attention_forward,
|
||||||
|
)
|
||||||
|
from .configuration_cohere import CohereConfig
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
_CONFIG_FOR_DOC = "CohereConfig"
|
||||||
|
|
||||||
|
|
||||||
|
class CohereLayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learned scale and no learned bias.

    The statistics and the scaling are computed in float32; the result is cast
    back to the dtype of the input before being returned.
    """

    def __init__(self, hidden_size=None, eps=1e-5, bias=False):
        """The hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dim"""
        super().__init__()
        # `bias` is accepted for signature compatibility but is not used by this module.
        self.variance_epsilon = eps
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def forward(self, hidden_states):
        """Normalize `hidden_states` along its last dimension and apply the learned scale."""
        original_dtype = hidden_states.dtype
        upcast = hidden_states.to(torch.float32)

        # Centre once and reuse the centred tensor for both the variance and the output.
        centered = upcast - upcast.mean(-1, keepdim=True)
        variance = centered.pow(2).mean(-1, keepdim=True)
        normalized = centered * torch.rsqrt(variance + self.variance_epsilon)

        scaled = self.weight.to(torch.float32) * normalized
        return scaled.to(original_dtype)
|
||||||
|
|
||||||
|
|
||||||
|
ALL_LAYERNORM_LAYERS.append(CohereLayerNorm)
|
||||||
|
|
||||||
|
|
||||||
|
class CohereRotaryEmbedding(LlamaRotaryEmbedding):
    """Rotary embedding that interleaves the frequencies instead of concatenating them as Llama does."""

    @torch.no_grad()
    def forward(self, x, position_ids):
        """Return the `(cos, sin)` rotary tables for `position_ids`, cast to the dtype of `x`.

        `x` is only consulted for its device and dtype.
        """
        if "dynamic" in self.rope_type:
            self._dynamic_frequency_update(position_ids, device=x.device)

        # Core RoPE block
        batch_size = position_ids.shape[0]
        inv_freq = self.inv_freq[None, :, None].float().expand(batch_size, -1, 1)
        positions = position_ids[:, None, :].float()

        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
        device_type = x.device.type
        if not isinstance(device_type, str) or device_type == "mps":
            device_type = "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq.float() @ positions.float()).transpose(1, 2)
            # diff from Llama: we interleave() instead of cat()
            emb = torch.repeat_interleave(freqs, 2, dim=-1)
            cos, sin = emb.cos(), emb.sin()

        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
        scale = self.attention_scaling
        return (cos * scale).to(dtype=x.dtype), (sin * scale).to(dtype=x.dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def rotate_half(x):
    """Rotate adjacent pairs of the last dimension: `(a, b)` becomes `(-b, a)`.

    Note that this pairs neighbouring elements (even/odd indices), which is
    different from e.g. Llama.
    """
    evens, odds = x[..., 0::2], x[..., 1::2]
    # Stack each pair as (-odd, even) on a new trailing axis, then merge it back.
    return torch.stack((-odds, evens), dim=-1).flatten(-2)
|
||||||
|
|
||||||
|
|
||||||
|
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Rotate the query and key tensors with the given rotary position tables.

    The rotation is computed in float32 and both results are cast back to the
    original dtype of `q`.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension along which `cos` and `sin` are unsqueezed so that they broadcast against `q` and `k`.
            For example, if `cos` and `sin` have the shape [batch_size, seq_len, head_dim] and `q` and `k` have
            the shape [batch_size, heads, seq_len, head_dim], use `unsqueeze_dim=1`. If `q` and `k` have the
            shape [batch_size, seq_len, heads, head_dim], use `unsqueeze_dim=2`.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    out_dtype = q.dtype
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)

    def _rotate(states):
        # Upcast first so the multiply-add runs in float32, then restore the query dtype.
        states = states.float()
        return ((states * cos_b) + (rotate_half(states) * sin_b)).to(dtype=out_dtype)

    return _rotate(q), _rotate(k)
|
||||||
|
|
||||||
|
|
||||||
|
class CohereMLP(LlamaMLP):
    """Llama MLP whose gate, up and down projections are re-created without bias."""

    def __init__(self, config):
        super().__init__(config)
        # Both sizes are read from the instance after the parent constructor has run.
        hidden, inner = self.hidden_size, self.intermediate_size
        self.gate_proj = nn.Linear(hidden, inner, bias=False)
        self.up_proj = nn.Linear(hidden, inner, bias=False)
        self.down_proj = nn.Linear(inner, hidden, bias=False)
|
||||||
|
|
||||||
|
|
||||||
|
class CohereAttention(LlamaAttention):
    """Multi-headed attention from 'Attention Is All You Need' paper

    Extends the Llama attention with an optional layer normalization of the
    query and key states (enabled by `config.use_qk_norm`).
    """

    def __init__(self, config: CohereConfig, layer_idx: Optional[int] = None):
        super().__init__(config, layer_idx)
        self.use_qk_norm = config.use_qk_norm
        if self.use_qk_norm:
            # When sharding the model using Tensor Parallelism, need to be careful to use n_local_heads
            self.q_norm = CohereLayerNorm(
                hidden_size=(config.num_attention_heads, self.head_dim), eps=config.layer_norm_eps
            )
            self.k_norm = CohereLayerNorm(
                hidden_size=(config.num_key_value_heads, self.head_dim), eps=config.layer_norm_eps
            )

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Project, optionally normalize, rotate and attend over `hidden_states`.

        Args:
            hidden_states (`torch.Tensor`):
                Input to the projections; its last dimension is split into heads of size `self.head_dim`.
                NOTE(review): presumably `(batch, seq_len, hidden_size)` -- confirm against the caller.
            position_embeddings (`Tuple[torch.Tensor, torch.Tensor]`):
                The `(cos, sin)` rotary tables applied to the query and key states.
            attention_mask (`torch.Tensor`, *optional*):
                Forwarded unchanged to the selected attention function.
            past_key_value (`Cache`, *optional*):
                When given, its `update` method is called with the new key/value states and the states it
                returns are the ones used for attention.
            cache_position (`torch.LongTensor`, *optional*):
                Passed to the cache update together with `sin` and `cos`.
            **kwargs:
                Forwarded to the attention function. `output_attentions` is also read from here to decide
                whether SDPA has to fall back to the eager implementation.

        Returns:
            `(attn_output, attn_weights)`: the output of `o_proj` and whatever the attention function
            returned as attention weights.
        """
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape)
        key_states = self.k_proj(hidden_states).view(hidden_shape)
        value_states = self.v_proj(hidden_states).view(hidden_shape)

        if self.use_qk_norm:  # main diff from Llama
            query_states = self.q_norm(query_states)
            key_states = self.k_norm(key_states)

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Eager attention is the default; any other configured implementation is looked up in the
        # registry, except SDPA with `output_attentions=True`, which keeps the eager default.
        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        # Merge the per-head outputs back into a single trailing dimension before the output projection.
        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights
|
||||||
|
|
||||||
|
|
||||||
|
class CohereDecoderLayer(nn.Module):
    """One Cohere decoder block.

    A single layer norm feeds both the self-attention module and the MLP; the two results are
    added to the un-normalized input in one step.
    """

    def __init__(self, config: CohereConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        # Sub-modules are registered in this order: attention, MLP, input norm.
        self.self_attn = CohereAttention(config=config, layer_idx=layer_idx)
        self.mlp = CohereMLP(config)
        self.input_layernorm = CohereLayerNorm(hidden_size=config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Run the layer on `hidden_states` and return the updated hidden states.

        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.

        Returns:
            A 1-tuple `(hidden_states,)`, or `(hidden_states, attention_weights)` when `output_attentions` is truthy.
        """
        # Both branches below read the same normalized activations.
        normed_states = self.input_layernorm(hidden_states)

        # Attention branch
        attn_branch, attn_weights = self.self_attn(
            hidden_states=normed_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )

        # Feed-forward branch
        mlp_branch = self.mlp(normed_states)

        # Residual + attention + MLP, summed left to right.
        combined = hidden_states + attn_branch + mlp_branch

        if output_attentions:
            return (combined, attn_weights)
        return (combined,)
|
||||||
|
|
||||||
|
|
||||||
|
class CohereModel(LlamaModel):
    """`LlamaModel` subclass that uses Cohere decoder layers, rotary embedding and final layer norm."""

    def __init__(self, config: CohereConfig):
        super().__init__(config)
        # Assigned after the parent constructor runs, so any same-named attributes it set are replaced.
        decoder_stack = [CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        self.layers = nn.ModuleList(decoder_stack)
        self.rotary_emb = CohereRotaryEmbedding(config=config)
        self.norm = CohereLayerNorm(hidden_size=config.hidden_size, eps=config.layer_norm_eps)
|
||||||
|
|
||||||
|
|
||||||
|
# Keyword-argument type combining `FlashAttentionKwargs` and `LossKwargs`; it declares no fields of its own.
# Used to annotate the `**kwargs` of `CohereForCausalLM.forward`.
class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
|
||||||
|
|
||||||
|
|
||||||
|
class CohereForCausalLM(LlamaForCausalLM):
    """Causal LM head on top of `CohereModel`; logits are multiplied by `config.logit_scale`."""

    def __init__(self, config):
        super().__init__(config)
        self.model = CohereModel(config)
        self.logit_scale = config.logit_scale
        self.tie_word_embeddings = config.tie_word_embeddings

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        num_logits_to_keep: int = 0,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            num_logits_to_keep (`int`, *optional*):
                Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.

        Returns:

        Example:

        ```python
        >> from transformers import AutoTokenizer, CohereForCausalLM

        >> model = CohereForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01")
        >> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")

        >> prompt = "Hey, are you conscious? Can you talk to me?"
        >> inputs = tokenizer(prompt, return_tensors="pt")

        >> # Generate
        >> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        # Any flag left as `None` falls back to the value stored on the config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Run the decoder; index 0 of its output holds the final hidden states.
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            **kwargs,
        )
        last_hidden = outputs[0]

        # Project only the trailing `num_logits_to_keep` positions (`-0:` selects every position).
        kept_hidden = last_hidden[:, -num_logits_to_keep:, :]
        logits = self.lm_head(kept_hidden)
        logits = logits * self.logit_scale  # main diff from Llama

        if labels is None:
            loss = None
        else:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        if not return_dict:
            # Tuple output: logits replace the first decoder output, loss is prepended when available.
            output = (logits,) + outputs[1:]
            if loss is None:
                return output
            return (loss,) + output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
|
||||||
@@ -28,10 +28,13 @@ import torch.nn as nn
|
|||||||
from ...activations import ACT2FN
|
from ...activations import ACT2FN
|
||||||
from ...cache_utils import Cache, HybridCache
|
from ...cache_utils import Cache, HybridCache
|
||||||
from ...generation import GenerationMixin
|
from ...generation import GenerationMixin
|
||||||
|
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||||
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
|
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
|
||||||
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
|
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
|
||||||
from ...modeling_utils import PreTrainedModel
|
from ...modeling_utils import PreTrainedModel
|
||||||
|
from ...processing_utils import Unpack
|
||||||
from ...utils import (
|
from ...utils import (
|
||||||
|
LossKwargs,
|
||||||
add_start_docstrings,
|
add_start_docstrings,
|
||||||
add_start_docstrings_to_model_forward,
|
add_start_docstrings_to_model_forward,
|
||||||
is_flash_attn_2_available,
|
is_flash_attn_2_available,
|
||||||
@@ -46,50 +49,20 @@ if is_flash_attn_2_available():
|
|||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
_CONFIG_FOR_DOC = "Cohere2Config"
|
_CONFIG_FOR_DOC = "Cohere2Config"
|
||||||
|
|
||||||
|
|
||||||
class Cohere2RotaryEmbedding(nn.Module):
|
class Cohere2RotaryEmbedding(nn.Module):
|
||||||
# Note: the forward pass of this RoPE is slightly different from Llama's, resulting in different `sin`/`cos` for
|
def __init__(self, config: Cohere2Config, device=None):
|
||||||
# the same parameterization. The differences are highlighted with a comment.
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
dim=None,
|
|
||||||
max_position_embeddings=2048,
|
|
||||||
base=10000,
|
|
||||||
device=None,
|
|
||||||
scaling_factor=1.0,
|
|
||||||
rope_type="default",
|
|
||||||
config: Optional[Cohere2Config] = None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
# TODO (joao): remove the `if` below, only used for BC
|
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
if config is None:
|
# BC: "rope_type" was originally "type"
|
||||||
logger.warning_once(
|
if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
|
||||||
"`Cohere2RotaryEmbedding` can now be fully parameterized by passing the model config through the "
|
self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
|
||||||
"`config` argument. All other arguments will be removed in v4.46"
|
|
||||||
)
|
|
||||||
self.rope_kwargs = {
|
|
||||||
"rope_type": rope_type,
|
|
||||||
"factor": scaling_factor,
|
|
||||||
"dim": dim,
|
|
||||||
"base": base,
|
|
||||||
"max_position_embeddings": max_position_embeddings,
|
|
||||||
}
|
|
||||||
self.rope_type = rope_type
|
|
||||||
self.max_seq_len_cached = max_position_embeddings
|
|
||||||
self.original_max_seq_len = max_position_embeddings
|
|
||||||
else:
|
else:
|
||||||
# BC: "rope_type" was originally "type"
|
self.rope_type = "default"
|
||||||
if config.rope_scaling is not None:
|
self.max_seq_len_cached = config.max_position_embeddings
|
||||||
self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
|
self.original_max_seq_len = config.max_position_embeddings
|
||||||
else:
|
|
||||||
self.rope_type = "default"
|
|
||||||
self.max_seq_len_cached = config.max_position_embeddings
|
|
||||||
self.original_max_seq_len = config.max_position_embeddings
|
|
||||||
|
|
||||||
self.config = config
|
self.config = config
|
||||||
self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
|
self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
|
||||||
@@ -129,7 +102,7 @@ class Cohere2RotaryEmbedding(nn.Module):
|
|||||||
device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
|
device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
|
||||||
with torch.autocast(device_type=device_type, enabled=False):
|
with torch.autocast(device_type=device_type, enabled=False):
|
||||||
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
|
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
|
||||||
emb = torch.repeat_interleave(freqs, 2, dim=-1) # This line differs from Llama's implementation
|
emb = torch.repeat_interleave(freqs, 2, dim=-1) # diff from Llama: we interleave() instead of cat()
|
||||||
cos = emb.cos()
|
cos = emb.cos()
|
||||||
sin = emb.sin()
|
sin = emb.sin()
|
||||||
|
|
||||||
@@ -157,6 +130,18 @@ class Cohere2LayerNorm(nn.Module):
|
|||||||
return hidden_states.to(input_dtype)
|
return hidden_states.to(input_dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head `n_rep` times along the head dimension.

    Equivalent to torch.repeat_interleave(x, dim=1, repeats=n_rep): the input of shape (batch,
    num_key_value_heads, seqlen, head_dim) becomes (batch, num_key_value_heads * n_rep, seqlen, head_dim).
    With `n_rep == 1` the input tensor itself is returned.
    """
    # Unpack first so that an input that is not 4-D fails here, even when n_rep == 1.
    bsz, kv_heads, seq_len, dim_per_head = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # Insert a repeat axis right after the head axis, broadcast it, then fold it into the head axis.
    tiled = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim_per_head)
    return tiled.reshape(bsz, kv_heads * n_rep, seq_len, dim_per_head)
|
||||||
|
|
||||||
|
|
||||||
def rotate_half(x):
|
def rotate_half(x):
|
||||||
# Split and rotate. Note that this function is different from e.g. Llama.
|
# Split and rotate. Note that this function is different from e.g. Llama.
|
||||||
x1 = x[..., ::2]
|
x1 = x[..., ::2]
|
||||||
@@ -195,18 +180,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
|
|||||||
return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype)
|
return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype)
|
||||||
|
|
||||||
|
|
||||||
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head `n_rep` times along the head dimension.

    Equivalent to torch.repeat_interleave(x, dim=1, repeats=n_rep): the input of shape (batch,
    num_key_value_heads, seqlen, head_dim) becomes (batch, num_key_value_heads * n_rep, seqlen, head_dim).
    With `n_rep == 1` the input tensor itself is returned.
    """
    # Unpack first so that an input that is not 4-D fails here, even when n_rep == 1.
    bsz, kv_heads, seq_len, dim_per_head = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # Insert a repeat axis right after the head axis, broadcast it, then fold it into the head axis.
    tiled = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim_per_head)
    return tiled.reshape(bsz, kv_heads * n_rep, seq_len, dim_per_head)
|
|
||||||
|
|
||||||
|
|
||||||
def eager_attention_forward(
|
def eager_attention_forward(
|
||||||
config: Cohere2Config,
|
config: Cohere2Config,
|
||||||
query: torch.Tensor,
|
query: torch.Tensor,
|
||||||
@@ -425,7 +398,6 @@ class Cohere2MLP(nn.Module):
|
|||||||
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
|
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
|
||||||
self.act_fn = ACT2FN[config.hidden_act]
|
self.act_fn = ACT2FN[config.hidden_act]
|
||||||
|
|
||||||
# Ignore copy
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
|
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
|
||||||
return down_proj
|
return down_proj
|
||||||
@@ -436,7 +408,6 @@ class Cohere2DecoderLayer(nn.Module):
|
|||||||
super().__init__()
|
super().__init__()
|
||||||
self.hidden_size = config.hidden_size
|
self.hidden_size = config.hidden_size
|
||||||
self.self_attn = Cohere2Attention(config, layer_idx)
|
self.self_attn = Cohere2Attention(config, layer_idx)
|
||||||
|
|
||||||
self.mlp = Cohere2MLP(config)
|
self.mlp = Cohere2MLP(config)
|
||||||
self.input_layernorm = Cohere2LayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)
|
self.input_layernorm = Cohere2LayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)
|
||||||
self.config = config
|
self.config = config
|
||||||
@@ -521,7 +492,8 @@ class Cohere2DecoderLayer(nn.Module):
|
|||||||
|
|
||||||
COHERE2_START_DOCSTRING = r"""
|
COHERE2_START_DOCSTRING = r"""
|
||||||
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
||||||
library implements for all its model (such as downloading or saving, resizing the input embeddings etc.).
|
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
|
||||||
|
etc.)
|
||||||
|
|
||||||
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
||||||
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
||||||
@@ -874,11 +846,13 @@ class Cohere2Model(Cohere2PreTrainedModel):
|
|||||||
return causal_mask
|
return causal_mask
|
||||||
|
|
||||||
|
|
||||||
# TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with Llama->Cohere2
|
class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
|
||||||
|
|
||||||
|
|
||||||
class Cohere2ForCausalLM(Cohere2PreTrainedModel, GenerationMixin):
|
class Cohere2ForCausalLM(Cohere2PreTrainedModel, GenerationMixin):
|
||||||
_tied_weights_keys = ["lm_head.weight"]
|
_tied_weights_keys = ["lm_head.weight"]
|
||||||
|
_tp_plan = {"lm_head": "colwise_rep"}
|
||||||
|
|
||||||
# Ignore copy
|
|
||||||
def __init__(self, config: Cohere2Config):
|
def __init__(self, config: Cohere2Config):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
self.model = Cohere2Model(config)
|
self.model = Cohere2Model(config)
|
||||||
@@ -886,6 +860,7 @@ class Cohere2ForCausalLM(Cohere2PreTrainedModel, GenerationMixin):
|
|||||||
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
||||||
self.logit_scale = config.logit_scale
|
self.logit_scale = config.logit_scale
|
||||||
self.tie_word_embeddings = config.tie_word_embeddings
|
self.tie_word_embeddings = config.tie_word_embeddings
|
||||||
|
|
||||||
# Initialize weights and apply final processing
|
# Initialize weights and apply final processing
|
||||||
self.post_init()
|
self.post_init()
|
||||||
|
|
||||||
@@ -907,7 +882,6 @@ class Cohere2ForCausalLM(Cohere2PreTrainedModel, GenerationMixin):
|
|||||||
def get_decoder(self):
|
def get_decoder(self):
|
||||||
return self.model
|
return self.model
|
||||||
|
|
||||||
# Ignore copy
|
|
||||||
@add_start_docstrings_to_model_forward(COHERE2_INPUTS_DOCSTRING)
|
@add_start_docstrings_to_model_forward(COHERE2_INPUTS_DOCSTRING)
|
||||||
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
|
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
|
||||||
def forward(
|
def forward(
|
||||||
@@ -915,7 +889,7 @@ class Cohere2ForCausalLM(Cohere2PreTrainedModel, GenerationMixin):
|
|||||||
input_ids: torch.LongTensor = None,
|
input_ids: torch.LongTensor = None,
|
||||||
attention_mask: Optional[torch.Tensor] = None,
|
attention_mask: Optional[torch.Tensor] = None,
|
||||||
position_ids: Optional[torch.LongTensor] = None,
|
position_ids: Optional[torch.LongTensor] = None,
|
||||||
past_key_values: Optional[List[torch.FloatTensor]] = None,
|
past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
|
||||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||||
labels: Optional[torch.LongTensor] = None,
|
labels: Optional[torch.LongTensor] = None,
|
||||||
use_cache: Optional[bool] = None,
|
use_cache: Optional[bool] = None,
|
||||||
@@ -924,7 +898,7 @@ class Cohere2ForCausalLM(Cohere2PreTrainedModel, GenerationMixin):
|
|||||||
return_dict: Optional[bool] = None,
|
return_dict: Optional[bool] = None,
|
||||||
cache_position: Optional[torch.LongTensor] = None,
|
cache_position: Optional[torch.LongTensor] = None,
|
||||||
num_logits_to_keep: int = 0,
|
num_logits_to_keep: int = 0,
|
||||||
**loss_kwargs,
|
**kwargs: Unpack[KwargsForCausalLM],
|
||||||
) -> Union[Tuple, CausalLMOutputWithPast]:
|
) -> Union[Tuple, CausalLMOutputWithPast]:
|
||||||
r"""
|
r"""
|
||||||
Args:
|
Args:
|
||||||
@@ -974,16 +948,17 @@ class Cohere2ForCausalLM(Cohere2PreTrainedModel, GenerationMixin):
|
|||||||
output_hidden_states=output_hidden_states,
|
output_hidden_states=output_hidden_states,
|
||||||
return_dict=return_dict,
|
return_dict=return_dict,
|
||||||
cache_position=cache_position,
|
cache_position=cache_position,
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
hidden_states = outputs[0]
|
hidden_states = outputs[0]
|
||||||
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
|
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
|
||||||
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
|
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
|
||||||
logits = logits * self.logit_scale
|
logits = logits * self.logit_scale # main diff from Llama
|
||||||
|
|
||||||
loss = None
|
loss = None
|
||||||
if labels is not None:
|
if labels is not None:
|
||||||
loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
|
loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
|
||||||
|
|
||||||
if not return_dict:
|
if not return_dict:
|
||||||
output = (logits,) + outputs[1:]
|
output = (logits,) + outputs[1:]
|
||||||
|
|||||||
@@ -110,11 +110,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
|
|||||||
|
|
||||||
# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Falcon
|
# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Falcon
|
||||||
class FalconRotaryEmbedding(nn.Module):
|
class FalconRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: FalconConfig, device=None):
|
||||||
self,
|
|
||||||
config: FalconConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -92,11 +92,7 @@ class GemmaMLP(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class GemmaRotaryEmbedding(nn.Module):
|
class GemmaRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: GemmaConfig, device=None):
|
||||||
self,
|
|
||||||
config: GemmaConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -324,11 +324,7 @@ class Gemma2DecoderLayer(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class Gemma2RotaryEmbedding(nn.Module):
|
class Gemma2RotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: Gemma2Config, device=None):
|
||||||
self,
|
|
||||||
config: Gemma2Config,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -62,7 +62,6 @@ class GlmMLP(nn.Module):
|
|||||||
self.config = config
|
self.config = config
|
||||||
self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False)
|
self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False)
|
||||||
self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
|
self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
|
||||||
|
|
||||||
self.activation_fn = ACT2FN[config.hidden_act]
|
self.activation_fn = ACT2FN[config.hidden_act]
|
||||||
|
|
||||||
def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
|
def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
|
||||||
@@ -256,11 +255,7 @@ class GlmRMSNorm(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class GlmRotaryEmbedding(nn.Module):
|
class GlmRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: GlmConfig, device=None):
|
||||||
self,
|
|
||||||
config: GlmConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -491,11 +491,7 @@ class GPTNeoXSdpaAttention(GPTNeoXAttention):
|
|||||||
|
|
||||||
# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->GPTNeoX
|
# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->GPTNeoX
|
||||||
class GPTNeoXRotaryEmbedding(nn.Module):
|
class GPTNeoXRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: GPTNeoXConfig, device=None):
|
||||||
self,
|
|
||||||
config: GPTNeoXConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -225,11 +225,7 @@ class GPTNeoXJapaneseAttention(nn.Module):
|
|||||||
|
|
||||||
# Copied from transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXRotaryEmbedding with GPTNeoX->GPTNeoXJapanese
|
# Copied from transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXRotaryEmbedding with GPTNeoX->GPTNeoXJapanese
|
||||||
class GPTNeoXJapaneseRotaryEmbedding(nn.Module):
|
class GPTNeoXJapaneseRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: GPTNeoXJapaneseConfig, device=None):
|
||||||
self,
|
|
||||||
config: GPTNeoXJapaneseConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -309,11 +309,7 @@ class GraniteDecoderLayer(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class GraniteRotaryEmbedding(nn.Module):
|
class GraniteRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: GraniteConfig, device=None):
|
||||||
self,
|
|
||||||
config: GraniteConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -158,11 +158,7 @@ ALL_LAYERNORM_LAYERS.append(GraniteMoeRMSNorm)
|
|||||||
|
|
||||||
# Copied from transformers.models.granite.modeling_granite.GraniteRotaryEmbedding with Granite->GraniteMoe
|
# Copied from transformers.models.granite.modeling_granite.GraniteRotaryEmbedding with Granite->GraniteMoe
|
||||||
class GraniteMoeRotaryEmbedding(nn.Module):
|
class GraniteMoeRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: GraniteMoeConfig, device=None):
|
||||||
self,
|
|
||||||
config: GraniteMoeConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -386,11 +386,7 @@ class JetMoeRMSNorm(nn.Module):
|
|||||||
|
|
||||||
# Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with Gemma->JetMoe
|
# Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with Gemma->JetMoe
|
||||||
class JetMoeRotaryEmbedding(nn.Module):
|
class JetMoeRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: JetMoeConfig, device=None):
|
||||||
self,
|
|
||||||
config: JetMoeConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -80,11 +80,7 @@ ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm)
|
|||||||
|
|
||||||
|
|
||||||
class LlamaRotaryEmbedding(nn.Module):
|
class LlamaRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: LlamaConfig, device=None):
|
||||||
self,
|
|
||||||
config: LlamaConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -365,11 +365,7 @@ class MimiLayerScale(nn.Module):
|
|||||||
|
|
||||||
# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Mimi
|
# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Mimi
|
||||||
class MimiRotaryEmbedding(nn.Module):
|
class MimiRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: MimiConfig, device=None):
|
||||||
self,
|
|
||||||
config: MimiConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
@@ -1063,7 +1059,7 @@ class MimiTransformerModel(nn.Module):
|
|||||||
attentions=all_self_attns,
|
attentions=all_self_attns,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask
|
# Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask with Phi3->Mimi
|
||||||
def _update_causal_mask(
|
def _update_causal_mask(
|
||||||
self,
|
self,
|
||||||
attention_mask: torch.Tensor,
|
attention_mask: torch.Tensor,
|
||||||
@@ -1073,6 +1069,14 @@ class MimiTransformerModel(nn.Module):
|
|||||||
output_attentions: bool,
|
output_attentions: bool,
|
||||||
):
|
):
|
||||||
if self.config._attn_implementation == "flash_attention_2":
|
if self.config._attn_implementation == "flash_attention_2":
|
||||||
|
if attention_mask is not None and past_key_values is not None:
|
||||||
|
is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
|
||||||
|
if is_padding_right:
|
||||||
|
raise ValueError(
|
||||||
|
"You are attempting to perform batched generation with padding_side='right'"
|
||||||
|
" this may lead to unexpected behaviour for Flash Attention version of Mimi. Make sure to "
|
||||||
|
" call `tokenizer.padding_side = 'left'` before tokenizing the input. "
|
||||||
|
)
|
||||||
if attention_mask is not None and 0.0 in attention_mask:
|
if attention_mask is not None and 0.0 in attention_mask:
|
||||||
return attention_mask
|
return attention_mask
|
||||||
return None
|
return None
|
||||||
|
|||||||
@@ -270,11 +270,7 @@ class MistralDecoderLayer(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class MistralRotaryEmbedding(nn.Module):
|
class MistralRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: MistralConfig, device=None):
|
||||||
self,
|
|
||||||
config: MistralConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -392,11 +392,7 @@ class MixtralDecoderLayer(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class MixtralRotaryEmbedding(nn.Module):
|
class MixtralRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: MixtralConfig, device=None):
|
||||||
self,
|
|
||||||
config: MixtralConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -308,11 +308,7 @@ class MoshiLinear(nn.Module):
|
|||||||
|
|
||||||
# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Moshi
|
# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Moshi
|
||||||
class MoshiRotaryEmbedding(nn.Module):
|
class MoshiRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: MoshiConfig, device=None):
|
||||||
self,
|
|
||||||
config: MoshiConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
@@ -1292,7 +1288,7 @@ class MoshiDepthDecoder(MoshiPreTrainedModel, GenerationMixin):
|
|||||||
attentions=all_self_attns,
|
attentions=all_self_attns,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask
|
# Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask with Phi3->Moshi
|
||||||
def _update_causal_mask(
|
def _update_causal_mask(
|
||||||
self,
|
self,
|
||||||
attention_mask: torch.Tensor,
|
attention_mask: torch.Tensor,
|
||||||
@@ -1302,6 +1298,14 @@ class MoshiDepthDecoder(MoshiPreTrainedModel, GenerationMixin):
|
|||||||
output_attentions: bool,
|
output_attentions: bool,
|
||||||
):
|
):
|
||||||
if self.config._attn_implementation == "flash_attention_2":
|
if self.config._attn_implementation == "flash_attention_2":
|
||||||
|
if attention_mask is not None and past_key_values is not None:
|
||||||
|
is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
|
||||||
|
if is_padding_right:
|
||||||
|
raise ValueError(
|
||||||
|
"You are attempting to perform batched generation with padding_side='right'"
|
||||||
|
" this may lead to unexpected behaviour for Flash Attention version of Moshi. Make sure to "
|
||||||
|
" call `tokenizer.padding_side = 'left'` before tokenizing the input. "
|
||||||
|
)
|
||||||
if attention_mask is not None and 0.0 in attention_mask:
|
if attention_mask is not None and 0.0 in attention_mask:
|
||||||
return attention_mask
|
return attention_mask
|
||||||
return None
|
return None
|
||||||
@@ -1596,7 +1600,7 @@ class MoshiModel(MoshiPreTrainedModel):
|
|||||||
attentions=all_self_attns,
|
attentions=all_self_attns,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask
|
# Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask with Phi3->Moshi
|
||||||
def _update_causal_mask(
|
def _update_causal_mask(
|
||||||
self,
|
self,
|
||||||
attention_mask: torch.Tensor,
|
attention_mask: torch.Tensor,
|
||||||
@@ -1606,6 +1610,14 @@ class MoshiModel(MoshiPreTrainedModel):
|
|||||||
output_attentions: bool,
|
output_attentions: bool,
|
||||||
):
|
):
|
||||||
if self.config._attn_implementation == "flash_attention_2":
|
if self.config._attn_implementation == "flash_attention_2":
|
||||||
|
if attention_mask is not None and past_key_values is not None:
|
||||||
|
is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
|
||||||
|
if is_padding_right:
|
||||||
|
raise ValueError(
|
||||||
|
"You are attempting to perform batched generation with padding_side='right'"
|
||||||
|
" this may lead to unexpected behaviour for Flash Attention version of Moshi. Make sure to "
|
||||||
|
" call `tokenizer.padding_side = 'left'` before tokenizing the input. "
|
||||||
|
)
|
||||||
if attention_mask is not None and 0.0 in attention_mask:
|
if attention_mask is not None and 0.0 in attention_mask:
|
||||||
return attention_mask
|
return attention_mask
|
||||||
return None
|
return None
|
||||||
|
|||||||
@@ -274,11 +274,7 @@ class OlmoDecoderLayer(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class OlmoRotaryEmbedding(nn.Module):
|
class OlmoRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: OlmoConfig, device=None):
|
||||||
self,
|
|
||||||
config: OlmoConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -275,11 +275,7 @@ class Olmo2DecoderLayer(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class Olmo2RotaryEmbedding(nn.Module):
|
class Olmo2RotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: Olmo2Config, device=None):
|
||||||
self,
|
|
||||||
config: Olmo2Config,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -158,11 +158,7 @@ ALL_LAYERNORM_LAYERS.append(OlmoeRMSNorm)
|
|||||||
|
|
||||||
# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Olmoe
|
# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Olmoe
|
||||||
class OlmoeRotaryEmbedding(nn.Module):
|
class OlmoeRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: OlmoeConfig, device=None):
|
||||||
self,
|
|
||||||
config: OlmoeConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -57,11 +57,7 @@ _CONFIG_FOR_DOC = "PersimmonConfig"
|
|||||||
|
|
||||||
# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Persimmon
|
# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Persimmon
|
||||||
class PersimmonRotaryEmbedding(nn.Module):
|
class PersimmonRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: PersimmonConfig, device=None):
|
||||||
self,
|
|
||||||
config: PersimmonConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -270,11 +270,7 @@ class PhiDecoderLayer(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class PhiRotaryEmbedding(nn.Module):
|
class PhiRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: PhiConfig, device=None):
|
||||||
self,
|
|
||||||
config: PhiConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
311
src/transformers/models/phi3/modular_phi3.py
Normal file
311
src/transformers/models/phi3/modular_phi3.py
Normal file
@@ -0,0 +1,311 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""PyTorch Phi-3 model."""
|
||||||
|
|
||||||
|
from typing import Callable, Optional, Tuple
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.utils.checkpoint
|
||||||
|
from torch import nn
|
||||||
|
|
||||||
|
from ...activations import ACT2FN
|
||||||
|
from ...cache_utils import Cache
|
||||||
|
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||||
|
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
|
||||||
|
from ...processing_utils import Unpack
|
||||||
|
from ...utils import logging
|
||||||
|
from ..mistral.modeling_mistral import (
|
||||||
|
MistralDecoderLayer,
|
||||||
|
MistralForCausalLM,
|
||||||
|
MistralForSequenceClassification,
|
||||||
|
MistralForTokenClassification,
|
||||||
|
MistralPreTrainedModel,
|
||||||
|
MistralRotaryEmbedding,
|
||||||
|
apply_rotary_pos_emb,
|
||||||
|
eager_attention_forward,
|
||||||
|
)
|
||||||
|
from .configuration_phi3 import Phi3Config
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level logger, named after this module.
logger = logging.get_logger(__name__)

# Checkpoint and config identifiers; the `_FOR_DOC` suffix suggests they feed docstring tooling (confirm).
_CHECKPOINT_FOR_DOC = "microsoft/Phi-3-mini-4k-instruct"
_CONFIG_FOR_DOC = "Phi3Config"
|
||||||
|
|
||||||
|
|
||||||
|
class Phi3MLP(nn.Module):
    """Gated feed-forward block whose gate and up projections share one fused linear layer."""

    def __init__(self, config):
        super().__init__()

        self.config = config
        hidden, inner = config.hidden_size, config.intermediate_size
        # One matmul yields both halves; `forward` splits them along the last dimension.
        self.gate_up_proj = nn.Linear(hidden, 2 * inner, bias=False)
        self.down_proj = nn.Linear(inner, hidden, bias=False)
        self.activation_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        """Project up, gate with the activation, then project back down."""
        fused = self.gate_up_proj(hidden_states)

        # First half is the gate, second half is the value being gated.
        gate, up = fused.chunk(2, dim=-1)
        gated = up * self.activation_fn(gate)

        return self.down_proj(gated)
|
||||||
|
|
||||||
|
|
||||||
|
class Phi3Attention(nn.Module):
    """Multi-head self-attention using a single fused query/key/value projection."""

    def __init__(self, config: Phi3Config, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        # Width of the query slice, and of each of the key / value slices, in the fused projection.
        query_width = config.num_attention_heads * self.head_dim
        kv_width = config.num_key_value_heads * self.head_dim
        self.o_proj = nn.Linear(query_width, config.hidden_size, bias=False)
        self.qkv_proj = nn.Linear(config.hidden_size, query_width + 2 * kv_width, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Run self-attention over `hidden_states`.

        Args:
            hidden_states: layer input; every dimension except the last is preserved in the output.
            position_embeddings: `(cos, sin)` pair applied to the query and key states.
            attention_mask: mask handed unchanged to the attention implementation.
            past_key_value: optional cache, updated with this layer's key/value states.
            cache_position: forwarded to the cache update.
            kwargs: forwarded to the attention implementation.

        Returns:
            `(attn_output, attn_weights)` as produced by the selected attention implementation, with the
            output passed through `o_proj`.
        """
        leading_shape = hidden_states.shape[:-1]
        per_head_shape = (*leading_shape, -1, self.head_dim)

        # Slice the fused projection into its query / key / value parts.
        fused = self.qkv_proj(hidden_states)
        query_end = self.config.num_attention_heads * self.head_dim
        key_end = query_end + self.num_key_value_heads * self.head_dim
        query_states = fused[..., :query_end]
        key_states = fused[..., query_end:key_end]
        value_states = fused[..., key_end:]

        query_states = query_states.view(per_head_shape).transpose(1, 2)
        key_states = key_states.view(per_head_shape).transpose(1, 2)
        value_states = value_states.view(per_head_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Eager is the default; it is also the fallback when sdpa is asked for attention weights.
        attention_interface: Callable = eager_attention_forward
        implementation = self.config._attn_implementation
        if implementation == "eager":
            pass
        elif implementation == "sdpa" and kwargs.get("output_attentions", False):
            logger.warning_once(
                "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
                'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
        else:
            attention_interface = ALL_ATTENTION_FUNCTIONS[implementation]

        dropout = self.attention_dropout if self.training else 0.0
        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=dropout,
            scaling=self.scaling,
            sliding_window=getattr(self.config, "sliding_window", None),
            **kwargs,
        )

        # Merge the head dimensions back into one feature dimension before the output projection.
        attn_output = attn_output.reshape(*leading_shape, -1).contiguous()
        return self.o_proj(attn_output), attn_weights
|
||||||
|
|
||||||
|
|
||||||
|
class Phi3DecoderLayer(MistralDecoderLayer):
    # Decoder layer that swaps in the Phi3 attention / MLP modules and applies dropout on both residual branches.

    def __init__(self, config: Phi3Config, layer_idx: int):
        super().__init__(config, layer_idx)
        self.config = config
        # Replace the attention and MLP modules with the Phi3 variants.
        self.self_attn = Phi3Attention(config=config, layer_idx=layer_idx)
        self.mlp = Phi3MLP(config)
        self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
        self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Apply the self-attention and MLP sub-blocks, each wrapped in a residual connection with dropout.

        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer.
            attention_mask (`torch.FloatTensor`, *optional*):
                Mask forwarded to the self-attention module.
            position_ids (`torch.LongTensor`, *optional*):
                Position indices, forwarded to the self-attention module.
            past_key_value (`Cache`, *optional*):
                Cached key/value states, forwarded to the self-attention module.
            output_attentions (`bool`, *optional*):
                When truthy, the attention weights are appended to the returned tuple.
            use_cache (`bool`, *optional*):
                Forwarded to the self-attention module.
            cache_position (`torch.LongTensor`, *optional*):
                Forwarded to the self-attention module.
            position_embeddings (`Tuple[torch.Tensor, torch.Tensor]`, *optional*):
                Forwarded to the self-attention module, which unpacks it as `(cos, sin)`.
            kwargs (`dict`, *optional*):
                Extra keyword arguments, forwarded to the self-attention module.

        Returns:
            `(hidden_states,)`, or `(hidden_states, attention_weights)` when `output_attentions` is truthy.
        """
        # Attention sub-block: norm -> attention -> dropout -> residual add.
        skip = hidden_states
        normed = self.input_layernorm(hidden_states)
        attn_out, attn_weights = self.self_attn(
            hidden_states=normed,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = skip + self.resid_attn_dropout(attn_out)  # main diff with Llama

        # MLP sub-block: norm -> MLP -> dropout -> residual add.
        skip = hidden_states
        mlp_out = self.mlp(self.post_attention_layernorm(hidden_states))
        hidden_states = skip + self.resid_mlp_dropout(mlp_out)  # main diff with Llama

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
|
||||||
|
|
||||||
|
|
||||||
|
class Phi3RotaryEmbedding(MistralRotaryEmbedding):
    # Rotary embedding that, for the "longrope" type, switches `inv_freq` between two frequency sets
    # depending on the largest position id seen in the current call.

    def __init__(self, config: Phi3Config, device=None):
        super().__init__(config, device)

    def _longrope_frequency_update(self, position_ids, device):
        """Longrope uses long factor if sequence is larger than original pretraining length, short otherwise.

        Args:
            position_ids: tensor of position indices; its maximum plus one is taken as the sequence length.
            device: device the selected frequencies should live on. When `None`, no move is attempted.
        """
        seq_len = torch.max(position_ids) + 1
        if hasattr(self.config, "original_max_position_embeddings"):
            original_max_position_embeddings = self.config.original_max_position_embeddings
        else:
            original_max_position_embeddings = self.config.max_position_embeddings

        if seq_len > original_max_position_embeddings:
            # The long frequencies are computed once, on first use, then cached on the module.
            if not hasattr(self, "long_inv_freq"):
                self.long_inv_freq, _ = self.rope_init_fn(
                    self.config, device, seq_len=original_max_position_embeddings + 1
                )
            # The cached tensor may have been created on another device than the one in use now.
            if device is not None:
                self.long_inv_freq = self.long_inv_freq.to(device)
            self.register_buffer("inv_freq", self.long_inv_freq, persistent=False)
        else:
            # NOTE(review): `original_inv_freq` is set by the parent class (not visible here); presumably it is a
            # plain attribute that `module.to()` does not move, unlike the `inv_freq` buffer. Move it explicitly
            # so the re-registered buffer matches the device of the inputs.
            if device is not None:
                self.original_inv_freq = self.original_inv_freq.to(device)
            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)

    @torch.no_grad()
    def forward(self, x, position_ids):
        """Return `(cos, sin)` for `position_ids`, cast to the dtype of `x`."""
        if "dynamic" in self.rope_type:
            self._dynamic_frequency_update(position_ids, device=x.device)
        elif self.rope_type == "longrope":
            self._longrope_frequency_update(position_ids, device=x.device)

        # Core RoPE block
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()

        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
        cos = cos * self.attention_scaling
        sin = sin * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
||||||
|
|
||||||
|
|
||||||
|
class Phi3PreTrainedModel(MistralPreTrainedModel):
    # Only the version marker is overridden; everything else comes from the base class.
    _version = "0.0.5"
|
||||||
|
|
||||||
|
|
||||||
|
class Phi3ForCausalLM(MistralForCausalLM, Phi3PreTrainedModel):
    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        num_logits_to_keep=None,
        **kwargs,
    ):
        """
        Build the model inputs for one generation step, dropping the cache when the sequence first grows past
        `config.original_max_position_embeddings`.

        All arguments are forwarded to the inherited `prepare_inputs_for_generation`; only `past_key_values`
        may be replaced by `None` beforehand.
        """
        # Overwritten -- this model may need to switch between short and long rope, invalidating the cache in the
        # process

        # The first time the input length crosses the short/long switching point, force the cache to be
        # recomputed. This makes that single step slower, but cached states built before the switch
        # should not be reused after it.
        if (
            past_key_values
            and self.config.rope_scaling
            and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1
        ):
            past_length = cache_position[0]
            if past_length <= self.config.original_max_position_embeddings:
                past_key_values = None

        # Delegate on the current instance. The previous code built a new, argument-less
        # `Phi3PreTrainedModel()` and called the method on that object instead of on `self`.
        model_inputs = super().prepare_inputs_for_generation(
            input_ids=input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            position_ids=position_ids,
            use_cache=use_cache,
            num_logits_to_keep=num_logits_to_keep,
            **kwargs,
        )
        return model_inputs
|
||||||
|
|
||||||
|
|
||||||
|
class Phi3ForSequenceClassification(MistralForSequenceClassification):
    # No overrides: behavior is inherited unchanged from the base class.
    pass
|
||||||
|
|
||||||
|
|
||||||
|
class Phi3ForTokenClassification(MistralForTokenClassification):
    # No overrides: behavior is inherited unchanged from the base class.
    pass
|
||||||
@@ -1173,7 +1173,7 @@ class PhimoeModel(PhimoePreTrainedModel):
|
|||||||
router_logits=all_router_logits,
|
router_logits=all_router_logits,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask
|
# Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask with Phi3->Phimoe
|
||||||
def _update_causal_mask(
|
def _update_causal_mask(
|
||||||
self,
|
self,
|
||||||
attention_mask: torch.Tensor,
|
attention_mask: torch.Tensor,
|
||||||
@@ -1183,6 +1183,14 @@ class PhimoeModel(PhimoePreTrainedModel):
|
|||||||
output_attentions: bool,
|
output_attentions: bool,
|
||||||
):
|
):
|
||||||
if self.config._attn_implementation == "flash_attention_2":
|
if self.config._attn_implementation == "flash_attention_2":
|
||||||
|
if attention_mask is not None and past_key_values is not None:
|
||||||
|
is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
|
||||||
|
if is_padding_right:
|
||||||
|
raise ValueError(
|
||||||
|
"You are attempting to perform batched generation with padding_side='right'"
|
||||||
|
" this may lead to unexpected behaviour for Flash Attention version of Phimoe. Make sure to "
|
||||||
|
" call `tokenizer.padding_side = 'left'` before tokenizing the input. "
|
||||||
|
)
|
||||||
if attention_mask is not None and 0.0 in attention_mask:
|
if attention_mask is not None and 0.0 in attention_mask:
|
||||||
return attention_mask
|
return attention_mask
|
||||||
return None
|
return None
|
||||||
|
|||||||
@@ -283,11 +283,7 @@ class Qwen2DecoderLayer(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class Qwen2RotaryEmbedding(nn.Module):
|
class Qwen2RotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: Qwen2Config, device=None):
|
||||||
self,
|
|
||||||
config: Qwen2Config,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -167,11 +167,7 @@ class Qwen2MoeRMSNorm(nn.Module):
|
|||||||
|
|
||||||
# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2Moe
|
# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2Moe
|
||||||
class Qwen2MoeRotaryEmbedding(nn.Module):
|
class Qwen2MoeRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: Qwen2MoeConfig, device=None):
|
||||||
self,
|
|
||||||
config: Qwen2MoeConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
@@ -1064,7 +1060,7 @@ class Qwen2MoeModel(Qwen2MoePreTrainedModel):
|
|||||||
router_logits=all_router_logits,
|
router_logits=all_router_logits,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask
|
# Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask with Phi3->Qwen2Moe
|
||||||
def _update_causal_mask(
|
def _update_causal_mask(
|
||||||
self,
|
self,
|
||||||
attention_mask: torch.Tensor,
|
attention_mask: torch.Tensor,
|
||||||
@@ -1074,6 +1070,14 @@ class Qwen2MoeModel(Qwen2MoePreTrainedModel):
|
|||||||
output_attentions: bool,
|
output_attentions: bool,
|
||||||
):
|
):
|
||||||
if self.config._attn_implementation == "flash_attention_2":
|
if self.config._attn_implementation == "flash_attention_2":
|
||||||
|
if attention_mask is not None and past_key_values is not None:
|
||||||
|
is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
|
||||||
|
if is_padding_right:
|
||||||
|
raise ValueError(
|
||||||
|
"You are attempting to perform batched generation with padding_side='right'"
|
||||||
|
" this may lead to unexpected behaviour for Flash Attention version of Qwen2Moe. Make sure to "
|
||||||
|
" call `tokenizer.padding_side = 'left'` before tokenizing the input. "
|
||||||
|
)
|
||||||
if attention_mask is not None and 0.0 in attention_mask:
|
if attention_mask is not None and 0.0 in attention_mask:
|
||||||
return attention_mask
|
return attention_mask
|
||||||
return None
|
return None
|
||||||
|
|||||||
@@ -1160,7 +1160,7 @@ class Qwen2VLModel(Qwen2VLPreTrainedModel):
|
|||||||
attentions=all_self_attns,
|
attentions=all_self_attns,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask
|
# Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask with Phi3->Qwen2VL
|
||||||
def _update_causal_mask(
|
def _update_causal_mask(
|
||||||
self,
|
self,
|
||||||
attention_mask: torch.Tensor,
|
attention_mask: torch.Tensor,
|
||||||
@@ -1170,6 +1170,14 @@ class Qwen2VLModel(Qwen2VLPreTrainedModel):
|
|||||||
output_attentions: bool,
|
output_attentions: bool,
|
||||||
):
|
):
|
||||||
if self.config._attn_implementation == "flash_attention_2":
|
if self.config._attn_implementation == "flash_attention_2":
|
||||||
|
if attention_mask is not None and past_key_values is not None:
|
||||||
|
is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
|
||||||
|
if is_padding_right:
|
||||||
|
raise ValueError(
|
||||||
|
"You are attempting to perform batched generation with padding_side='right'"
|
||||||
|
" this may lead to unexpected behaviour for Flash Attention version of Qwen2VL. Make sure to "
|
||||||
|
" call `tokenizer.padding_side = 'left'` before tokenizing the input. "
|
||||||
|
)
|
||||||
if attention_mask is not None and 0.0 in attention_mask:
|
if attention_mask is not None and 0.0 in attention_mask:
|
||||||
return attention_mask
|
return attention_mask
|
||||||
return None
|
return None
|
||||||
|
|||||||
@@ -63,11 +63,7 @@ _CONFIG_FOR_DOC = "StableLmConfig"
|
|||||||
|
|
||||||
# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->StableLm
|
# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->StableLm
|
||||||
class StableLmRotaryEmbedding(nn.Module):
|
class StableLmRotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: StableLmConfig, device=None):
|
||||||
self,
|
|
||||||
config: StableLmConfig,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -274,11 +274,7 @@ class Starcoder2DecoderLayer(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class Starcoder2RotaryEmbedding(nn.Module):
|
class Starcoder2RotaryEmbedding(nn.Module):
|
||||||
def __init__(
|
def __init__(self, config: Starcoder2Config, device=None):
|
||||||
self,
|
|
||||||
config: Starcoder2Config,
|
|
||||||
device=None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.rope_kwargs = {}
|
self.rope_kwargs = {}
|
||||||
# BC: "rope_type" was originally "type"
|
# BC: "rope_type" was originally "type"
|
||||||
|
|||||||
@@ -459,6 +459,9 @@ class Phi3ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
|
|||||||
"long_factor": [5.0 for _ in range(n_factors)],
|
"long_factor": [5.0 for _ in range(n_factors)],
|
||||||
}
|
}
|
||||||
input_tensor = ids_tensor([1, 4090], config.vocab_size)
|
input_tensor = ids_tensor([1, 4090], config.vocab_size)
|
||||||
|
# Make sure we don't have padding tokens. If this is the case, then the actual number of "true" tokens may be shorter
|
||||||
|
# than `config.original_max_position_embeddings + 5`, invalidating this test
|
||||||
|
input_tensor[input_tensor == config.pad_token_id] += 1
|
||||||
model = Phi3ForCausalLM(config)
|
model = Phi3ForCausalLM(config)
|
||||||
model.to(torch_device)
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|||||||
@@ -311,10 +311,10 @@ class RopeTest(unittest.TestCase):
|
|||||||
self.assertEqual(config.rope_theta, 10000.0)
|
self.assertEqual(config.rope_theta, 10000.0)
|
||||||
self.assertFalse(hasattr(config, "partial_rotary_factor"))
|
self.assertFalse(hasattr(config, "partial_rotary_factor"))
|
||||||
|
|
||||||
# longrope applies scaling on EACH inv frequency, `short_factor` or `long_factor`, depending on `factor`
|
# longrope applies scaling on EACH inv frequency, `short_factor` or `long_factor`, depending on the seq_len
|
||||||
dim = config.hidden_size // config.num_attention_heads
|
dim = config.hidden_size // config.num_attention_heads
|
||||||
short_factor = [2.0] * (dim // 2) # scaling applied when factor == 1.0
|
short_factor = [2.0] * (dim // 2) # scaling applied when seq_len <= max_position_embeddings
|
||||||
long_factor = torch.ones(dim // 2).cumsum(0).tolist() # scaling applied when factor > 1.0
|
long_factor = torch.ones(dim // 2).cumsum(0).tolist() # scaling applied when seq_len > max_position_embeddings
|
||||||
|
|
||||||
rope_fn = ROPE_INIT_FUNCTIONS["default"]
|
rope_fn = ROPE_INIT_FUNCTIONS["default"]
|
||||||
default_inv_freq, _ = rope_fn(config=config, device=torch_device)
|
default_inv_freq, _ = rope_fn(config=config, device=torch_device)
|
||||||
@@ -353,26 +353,18 @@ class RopeTest(unittest.TestCase):
|
|||||||
# Verify that "TypeError: '<' not supported between instances of 'NoneType' and 'int'" is not raised.
|
# Verify that "TypeError: '<' not supported between instances of 'NoneType' and 'int'" is not raised.
|
||||||
rope_config_validation(config)
|
rope_config_validation(config)
|
||||||
|
|
||||||
# Check 2: Factor == 1.0 -> short factor is applied to the default frequencies
|
# Check 2: seq_len == 0 -> short factor is applied to the default frequencies
|
||||||
factor = 1.0
|
|
||||||
config.rope_scaling = {
|
config.rope_scaling = {
|
||||||
"rope_type": "longrope",
|
"rope_type": "longrope",
|
||||||
"factor": factor,
|
"factor": 1.0,
|
||||||
"short_factor": short_factor,
|
"short_factor": short_factor,
|
||||||
"long_factor": long_factor,
|
"long_factor": long_factor,
|
||||||
}
|
}
|
||||||
inv_freq, _ = rope_fn(config=config, device=torch_device)
|
inv_freq, _ = rope_fn(config=config, device=torch_device, seq_len=0)
|
||||||
torch.testing.assert_close(inv_freq, default_inv_freq / torch.tensor(short_factor).to(torch_device))
|
torch.testing.assert_close(inv_freq, default_inv_freq / torch.tensor(short_factor).to(torch_device))
|
||||||
|
|
||||||
# Check 3: Factor > 1.0 -> long factor is applied to the default frequencies
|
# Check 3: seq_len > max_position_embeddings -> long factor is applied to the default frequencies
|
||||||
factor = 10.0
|
inv_freq, _ = rope_fn(config=config, device=torch_device, seq_len=config.max_position_embeddings + 1)
|
||||||
config.rope_scaling = {
|
|
||||||
"rope_type": "longrope",
|
|
||||||
"factor": factor,
|
|
||||||
"short_factor": short_factor,
|
|
||||||
"long_factor": long_factor,
|
|
||||||
}
|
|
||||||
inv_freq, _ = rope_fn(config=config, device=torch_device)
|
|
||||||
torch.testing.assert_close(inv_freq, default_inv_freq / torch.tensor(long_factor).to(torch_device))
|
torch.testing.assert_close(inv_freq, default_inv_freq / torch.tensor(long_factor).to(torch_device))
|
||||||
|
|
||||||
def test_llama3_rope_numerically(self):
|
def test_llama3_rope_numerically(self):
|
||||||
|
|||||||
@@ -51,6 +51,8 @@ SPECIAL_CASES_TO_ALLOW = {
|
|||||||
# generation configs (TODO joao)
|
# generation configs (TODO joao)
|
||||||
"Gemma2Config": ["tie_word_embeddings", "cache_implementation"],
|
"Gemma2Config": ["tie_word_embeddings", "cache_implementation"],
|
||||||
"Cohere2Config": ["cache_implementation"],
|
"Cohere2Config": ["cache_implementation"],
|
||||||
|
# Dropout with this value was declared but never used
|
||||||
|
"Phi3Config": ["embd_pdrop"],
|
||||||
# used to compute the property `self.chunk_length`
|
# used to compute the property `self.chunk_length`
|
||||||
"EncodecConfig": ["overlap"],
|
"EncodecConfig": ["overlap"],
|
||||||
# used to compute the property `self.layers_block_type`
|
# used to compute the property `self.layers_block_type`
|
||||||
|
|||||||
Reference in New Issue
Block a user