From 2e113422b3504fe6de821bb9911b24273b11aa9c Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 23 Jul 2024 10:42:55 +0100 Subject: [PATCH] Llama: RoPE refactor (#32135) Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/__init__.py | 2 + src/transformers/modeling_rope_utils.py | 451 ++++++++++++++++++ .../models/chameleon/modeling_chameleon.py | 18 +- .../models/cohere/modeling_cohere.py | 9 +- .../models/jamba/modeling_jamba.py | 2 +- .../models/jetmoe/modeling_jetmoe.py | 2 +- .../models/llama/configuration_llama.py | 105 ++-- .../models/llama/modeling_llama.py | 282 +++++------ .../models/mistral/modeling_mistral.py | 11 +- .../models/mixtral/modeling_mixtral.py | 2 +- src/transformers/models/olmo/modeling_olmo.py | 18 +- .../models/persimmon/modeling_persimmon.py | 2 +- src/transformers/models/phi/modeling_phi.py | 2 +- src/transformers/models/phi3/modeling_phi3.py | 2 +- .../models/qwen2_moe/modeling_qwen2_moe.py | 2 +- .../models/stablelm/modeling_stablelm.py | 2 +- .../models/starcoder2/modeling_starcoder2.py | 2 +- src/transformers/utils/dummy_pt_objects.py | 3 + tests/models/llama/test_modeling_llama.py | 78 +-- tests/utils/test_modeling_rope_utils.py | 120 +++++ 20 files changed, 830 insertions(+), 285 deletions(-) create mode 100644 src/transformers/modeling_rope_utils.py create mode 100644 tests/utils/test_modeling_rope_utils.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index fe31cd3c23..bc6e786358 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1295,6 +1295,7 @@ else: ) _import_structure["modeling_flash_attention_utils"] = [] _import_structure["modeling_outputs"] = [] + _import_structure["modeling_rope_utils"] = ["ROPE_INIT_FUNCTIONS"] _import_structure["modeling_utils"] = ["PreTrainedModel"] # PyTorch models structure @@ -6010,6 +6011,7 @@ if TYPE_CHECKING: 
WatermarkLogitsProcessor, WhisperTimeStampLogitsProcessor, ) + from .modeling_rope_utils import ROPE_INIT_FUNCTIONS from .modeling_utils import PreTrainedModel from .models.albert import ( AlbertForMaskedLM, diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py new file mode 100644 index 0000000000..33055d2bf9 --- /dev/null +++ b/src/transformers/modeling_rope_utils.py @@ -0,0 +1,451 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Optional, Tuple + +from .configuration_utils import PretrainedConfig +from .utils import is_torch_available, logging + + +logger = logging.get_logger(__name__) + + +if is_torch_available(): + import torch + + +def _compute_default_rope_parameters( + config: Optional[PretrainedConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + **rope_kwargs, +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. 
+ Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + if config is not None and len(rope_kwargs) > 0: + raise ValueError( + "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " + f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" + ) + if len(rope_kwargs) > 0: + base = rope_kwargs["base"] + dim = rope_kwargs["dim"] + elif config is not None: + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim)) + return inv_freq, attention_factor + + +def _compute_linear_scaling_rope_parameters( + config: Optional[PretrainedConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + **rope_kwargs, +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
+ """ + if config is not None and len(rope_kwargs) > 0: + raise ValueError( + "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " + f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" + ) + if len(rope_kwargs) > 0: + factor = rope_kwargs["factor"] + elif config is not None: + factor = config.rope_scaling["factor"] + + # Gets the default RoPE parameters + inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs) + + # Then applies linear scaling to the frequencies. + # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so + # applying scaling to the inverse frequencies is equivalent. + inv_freq /= factor + return inv_freq, attention_factor + + +def _compute_dynamic_ntk_parameters( + config: Optional[PretrainedConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + **rope_kwargs, +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length, used to update the dynamic RoPE at inference time. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
+ """ + if config is not None and len(rope_kwargs) > 0: + raise ValueError( + "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " + f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" + ) + if len(rope_kwargs) > 0: + base = rope_kwargs["base"] + dim = rope_kwargs["dim"] + max_position_embeddings = rope_kwargs["max_position_embeddings"] + factor = rope_kwargs["factor"] + elif config is not None: + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) + max_position_embeddings = config.max_position_embeddings + factor = config.rope_scaling["factor"] + + attention_factor = 1.0 # Unused in this type of RoPE + + # seq_len: default to max_position_embeddings, e.g. at init time + seq_len = seq_len if seq_len is not None else max_position_embeddings + + # Compute the inverse frequencies + base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim)) + return inv_freq, attention_factor + + +def _compute_yarn_parameters( + config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies with NTK scaling. Please refer to the + [original paper](https://arxiv.org/abs/2309.00071) + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. 
+ Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin. + """ + # No need to keep BC with yarn, unreleased when this new pattern was created. + if len(rope_kwargs) > 0: + raise ValueError( + f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}" + ) + + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) + max_position_embeddings = config.max_position_embeddings + factor = config.rope_scaling["factor"] + + # Sets the attention factor as suggested in the paper + attention_factor = config.rope_scaling.get("attention_factor") + if attention_factor is None: + attention_factor = 0.1 * math.log(factor) + 1.0 + + # Optional config options + # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly) + beta_fast = config.rope_scaling.get("beta_fast") or 32 + beta_slow = config.rope_scaling.get("beta_slow") or 1 + + # Compute the inverse frequencies + def find_correction_dim(num_rotations, dim, base, max_position_embeddings): + """Inverse dimension formula to find the dimension based on the number of rotations""" + return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) + + def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings): + """Find dimension range bounds based on rotations""" + low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings)) + high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings)) + return max(low, 0), min(high, dim - 1) + + def linear_ramp_mask(min, max, dim): + if min == max: + max += 0.001 # Prevent singularity + + linear_func = (torch.arange(dim, 
dtype=torch.float32) - min) / (max - min) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim) + inv_freq_extrapolation = 1.0 / pos_freqs + inv_freq_interpolation = 1.0 / (factor * pos_freqs) + + low, high = find_correction_range(beta_fast, beta_slow, dim, base, max_position_embeddings) + + # Get n-dimensional rotational scaling corrected for extrapolation + inv_freq_mask = 1 - linear_ramp_mask(low, high, dim // 2).float().to(device) + inv_freq = inv_freq_interpolation * (1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask + + return inv_freq, attention_factor + + +def _compute_longrope_parameters( + config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies with LongRoPE scaling. Please refer to the + [original implementation](https://github.com/microsoft/LongRoPE) + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin. + """ + # No need to keep BC with longrope, unreleased when this new pattern was created. 
+ if len(rope_kwargs) > 0: + raise ValueError( + "Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got " + f"{rope_kwargs}" + ) + + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) + long_factor = config.rope_scaling["long_factor"] + short_factor = config.rope_scaling["short_factor"] + factor = config.rope_scaling.get("factor") + attention_factor = config.rope_scaling.get("attention_factor") + + # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a + # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two + # values to compute the default attention scaling factor, instead of using `factor`. + if hasattr(config, "original_max_position_embeddings"): + max_position_embeddings = config.original_max_position_embeddings + expanded_max_position_embeddings = config.max_position_embeddings + factor = expanded_max_position_embeddings / max_position_embeddings + else: + max_position_embeddings = config.max_position_embeddings + expanded_max_position_embeddings = max_position_embeddings * factor + + # Sets the attention factor as suggested in the paper + if attention_factor is None: + if factor <= 1.0: + attention_factor = 1.0 + else: + attention_factor = math.sqrt(1 + math.log(factor) / math.log(max_position_embeddings)) + + # Compute the inverse frequencies -- scaled based on the target sequence length + if expanded_max_position_embeddings > max_position_embeddings: + ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device) + else: + ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device) + inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim + inv_freq = 1.0 / (ext_factors * base**inv_freq_shape) + + 
return inv_freq, attention_factor + + +# This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters +# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE +# parameterizations, as long as the callable has the same signature. +ROPE_INIT_FUNCTIONS = { + "default": _compute_default_rope_parameters, + "linear": _compute_linear_scaling_rope_parameters, + "dynamic": _compute_dynamic_ntk_parameters, + "yarn": _compute_yarn_parameters, + "longrope": _compute_longrope_parameters, +} + + +def _check_received_keys(rope_type: str, received_keys: set, required_keys: set, optional_keys: Optional[set] = None): + """Compare the received keys in `config.rope_scaling` against the expected and optional keys""" + missing_keys = required_keys - received_keys + if missing_keys: + raise KeyError(f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}") + + if optional_keys is not None: + unused_keys = received_keys - required_keys - optional_keys + else: + unused_keys = received_keys - required_keys + if unused_keys: + raise KeyError(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}") + + +def _validate_default_rope_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling["rope_type"] + required_keys = {"rope_type"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys) + + +def _validate_linear_scaling_rope_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling["rope_type"] + required_keys = {"rope_type", "factor"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys) + + factor = rope_scaling["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + raise ValueError(f"`rope_scaling`'s 
factor field must be a float >= 1, got {factor}") + + +def _validate_yarn_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling["rope_type"] + required_keys = {"rope_type", "factor"} + optional_keys = {"attention_factor", "beta_fast", "beta_slow"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys, optional_keys) + + factor = rope_scaling["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + attention_factor = rope_scaling.get("attention_factor") + if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0): + raise ValueError( + f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" + ) + beta_fast = rope_scaling.get("beta_fast") + if beta_fast is not None and not isinstance(beta_fast, float): + raise ValueError(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}") + beta_slow = rope_scaling.get("beta_slow") + if beta_slow is not None and not isinstance(beta_slow, float): + raise ValueError(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}") + + if (beta_fast or 32) < (beta_slow or 1): + raise ValueError( + f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} " + f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)" + ) + + +def _validate_longrope_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling["rope_type"] + required_keys = {"rope_type", "short_factor", "long_factor"} + optional_keys = {"attention_factor", "factor"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys, optional_keys) + + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, 
"partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) + + short_factor = rope_scaling.get("short_factor") + if not (isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor)): + raise ValueError(f"`rope_scaling`'s short_factor field must be a list of numbers, got {short_factor}") + if not len(short_factor) == dim // 2: + raise ValueError(f"`rope_scaling`'s short_factor field must have length {dim // 2}, got {len(short_factor)}") + + long_factor = rope_scaling.get("long_factor") + if not (isinstance(long_factor, list) and all(isinstance(x, (int, float)) for x in long_factor)): + raise ValueError(f"`rope_scaling`'s long_factor field must be a list of numbers, got {long_factor}") + if not len(long_factor) == dim // 2: + raise ValueError(f"`rope_scaling`'s long_factor field must have length {dim // 2}, got {len(long_factor)}") + + # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over + # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_scaling` and is + # unique to longrope (= undesirable) + if hasattr(config, "original_max_position_embeddings"): + logger.warning_once( + "This model has set a `original_max_position_embeddings` field, to be used together with " + "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`" + " with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, " + "as it is compatible with most model architectures." 
+ ) + else: + factor = rope_scaling.get("factor") + if factor is None: + raise ValueError("Missing required keys in `rope_scaling`: 'factor'") + elif not isinstance(factor, float) or factor < 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + attention_factor = rope_scaling.get("attention_factor") + if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0): + raise ValueError( + f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" + ) + + +# Like `ROPE_INIT_FUNCTIONS`, this validation function mapping can be dynamically updated for custom RoPE types. +ROPE_VALIDATION_FUNCTIONS = { + "default": _validate_default_rope_parameters, + "linear": _validate_linear_scaling_rope_parameters, + "dynamic": _validate_linear_scaling_rope_parameters, # `dynamic` has the same validation pattern as `linear` + "yarn": _validate_yarn_parameters, + "longrope": _validate_longrope_parameters, +} + + +def rope_config_validation(config: PretrainedConfig): + """ + Validate the RoPE config arguments, given a `PretrainedConfig` object + """ + rope_scaling = getattr(config, "rope_scaling", None) # not a default parameter in `PretrainedConfig` + if rope_scaling is None: + return + + possible_rope_types = set(ROPE_INIT_FUNCTIONS.keys()) + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" + if rope_type is None: + raise ValueError( + f"rope_scaling must contain a non-None 'rope_type' field. 
Possible options are {possible_rope_types}" + ) + + validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type) + if validation_fn is not None: + validation_fn(config) + else: + raise ValueError( + f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'" + ) diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index 1eea9b2249..cd10850ae3 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -80,7 +80,8 @@ class ChameleonRMSNorm(nn.Module): ALL_LAYERNORM_LAYERS.append(ChameleonRMSNorm) -# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Chameleon +# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Chameleon +# TODO(joao): add me back asap :) class ChameleonRotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): super().__init__() @@ -110,7 +111,8 @@ class ChameleonRotaryEmbedding(nn.Module): return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) -# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Chameleon +# copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Chameleon +# TODO(joao): add me back asap :) class ChameleonLinearScalingRotaryEmbedding(ChameleonRotaryEmbedding): """ChameleonRotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" @@ -121,7 +123,8 @@ class ChameleonLinearScalingRotaryEmbedding(ChameleonRotaryEmbedding): return cos, sin -# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Chameleon +# copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Chameleon +# TODO(joao): add me back asap :) class ChameleonDynamicNTKScalingRotaryEmbedding(ChameleonRotaryEmbedding): """ChameleonRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" @@ -265,7 +268,8 @@ class ChameleonAttention(nn.Module): self.k_norm = ChameleonLayerNorm((self.num_key_value_heads, self.head_dim)) self._init_rope() - # Copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope with Llama->Chameleon + # copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope with Llama->Chameleon + # TODO(joao): add me back asap :) def _init_rope(self): if self.config.rope_scaling is None: self.rotary_emb = ChameleonRotaryEmbedding( @@ -358,7 +362,8 @@ class ChameleonAttention(nn.Module): return attn_output, attn_weights, past_key_value -# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Chameleon +# copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Chameleon +# TODO(joao): add me back asap :) class ChameleonFlashAttention2(ChameleonAttention): """ Chameleon flash attention module. 
This module inherits from `ChameleonAttention` as the weights of the module stays @@ -576,7 +581,8 @@ CHAMELEON_ATTENTION_CLASSES = { } -# Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Chameleon, LLAMA->CHAMELEON +# copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Chameleon, LLAMA->CHAMELEON +# TODO(joao): add me back asap :) class ChameleonDecoderLayer(nn.Module): def __init__(self, config: ChameleonConfig, layer_idx: int): super().__init__() diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 5322c2334d..6532c656d4 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -295,7 +295,8 @@ class CohereAttention(nn.Module): return attn_output, attn_weights, past_key_value -# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Cohere +# copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Cohere +# TODO(joao): add me back asap :) class CohereFlashAttention2(CohereAttention): """ Cohere flash attention module. This module inherits from `CohereAttention` as the weights of the module stays @@ -409,7 +410,8 @@ class CohereFlashAttention2(CohereAttention): return attn_output, attn_weights, past_key_value -# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention Llama->Cohere +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention Llama->Cohere +# TODO(joao): add me back asap :) class CohereSdpaAttention(CohereAttention): """ Cohere attention module using torch.nn.functional.scaled_dot_product_attention. 
This module inherits from @@ -697,7 +699,8 @@ COHERE_INPUTS_DOCSTRING = r""" "The bare Cohere Model outputting raw hidden-states without any specific head on top.", COHERE_START_DOCSTRING, ) -# Copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Cohere +# copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Cohere +# TODO(joao): add me back asap :) class CohereModel(CoherePreTrainedModel): """ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`CohereDecoderLayer`] diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py index 28d732628a..6a03dc82a6 100755 --- a/src/transformers/models/jamba/modeling_jamba.py +++ b/src/transformers/models/jamba/modeling_jamba.py @@ -1624,7 +1624,7 @@ class JambaForSequenceClassification(JambaPreTrainedModel): @add_start_docstrings_to_model_forward(JAMBA_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py index 16d8335e0a..fa15393a40 100644 --- a/src/transformers/models/jetmoe/modeling_jetmoe.py +++ b/src/transformers/models/jetmoe/modeling_jetmoe.py @@ -1363,7 +1363,7 @@ class JetMoeForSequenceClassification(JetMoePreTrainedModel): @add_start_docstrings_to_model_forward(JETMOE_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/llama/configuration_llama.py 
b/src/transformers/models/llama/configuration_llama.py index 843731eeff..7c987ec85a 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -20,10 +20,7 @@ """LLaMA model configuration""" from ...configuration_utils import PretrainedConfig -from ...utils import logging - - -logger = logging.get_logger(__name__) +from ...modeling_rope_utils import rope_config_validation class LlamaConfig(PretrainedConfig): @@ -84,22 +81,35 @@ class LlamaConfig(PretrainedConfig): rope_theta (`float`, *optional*, defaults to 10000.0): The base period of the RoPE embeddings. rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports three scaling - strategies: linear, dynamic and yarn. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. - For the `yarn` strategy, the dictionary may also contain the following fields: - `original_max_position_embeddings` (`int`, *optional*): - The original maximum sequence length. This is used to scale the RoPE embeddings. + Dictionary containing the scaling configuration for the RoPE embeddings. IMPORTANT: RoPE scaling expects + `max_position_embeddings` to remain unchanged -- some methods, like 'longrope', require the original value + to determine which scaling to apply. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope'], + with 'default' being the original RoPE implementation. 
+ `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + `max_position_embeddings`. `attention_factor` (`float`, *optional*): - The attention scaling factor. If unspecified, it defaults to `0.1 ln(s) + 1`, where `s` is the `original_max_position_embeddings/max_position_embeddings` ratio. + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to the value recommended by the implementation, using the + `factor` field to infer the suggested value. `beta_fast` (`float`, *optional*): - Parameter to set the boundary for extrapolation (only) in the linear ramp function. + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. `beta_slow` (`float`, *optional*): - Parameter to set the boundary for interpolation (only) in the linear ramp function. + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `max_position_embeddings` * `factor`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (> + `max_position_embeddings` * `factor`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -167,11 +177,13 @@ class LlamaConfig(PretrainedConfig): self.use_cache = use_cache self.rope_theta = rope_theta self.rope_scaling = rope_scaling - self._rope_scaling_validation() self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias + # Validate the correctness of rotary position embeddings parameters + rope_config_validation(self) + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, @@ -179,60 +191,3 @@ class LlamaConfig(PretrainedConfig): tie_word_embeddings=tie_word_embeddings, **kwargs, ) - - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. - """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) < 2: - raise ValueError( - "`rope_scaling` must be a dictionary with a minimum of two fields, `type` and `factor`, " - f"got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic", "yarn"]: - raise ValueError( - f"`rope_scaling`'s type field must be one of ['linear', 'dynamic', 'yarn'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") - - if rope_scaling_type != "yarn": - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) > 6: - raise ValueError( - "`rope_scaling` with type " - f"{rope_scaling_type}" - " must be a dictionary with a maximum of six fields, `type`, `factor`," - "`original_max_position_embeddings`, `attention_factor`, `beta_fast`, `beta_slow`, " - f"got {self.rope_scaling}" - ) - original_max_position_embeddings = 
self.rope_scaling.get("original_max_position_embeddings", None) - attention_factor = self.rope_scaling.get("attention_factor", None) - beta_fast = self.rope_scaling.get("beta_fast", None) - beta_slow = self.rope_scaling.get("beta_slow", None) - - if original_max_position_embeddings is not None and not isinstance(original_max_position_embeddings, int): - raise ValueError( - f"`rope_scaling`'s original_max_position_embeddings field must be an int, got {original_max_position_embeddings}" - ) - if attention_factor is not None and not isinstance(attention_factor, float) or attention_factor < 0: - raise ValueError( - f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" - ) - if beta_fast is not None and not isinstance(beta_fast, float): - raise ValueError(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}") - if beta_slow is not None and not isinstance(beta_slow, float): - raise ValueError(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}") - - b_fast = beta_fast if beta_fast is not None else 32 - b_slow = beta_slow if beta_slow is not None else 1 - if b_fast < b_slow: - raise ValueError( - f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={b_fast} and beta_slow={b_slow}" - ) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index b624a2d92d..3115cee78f 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -37,6 +37,7 @@ from ...modeling_outputs import ( SequenceClassifierOutputWithPast, TokenClassifierOutput, ) +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel from ...pytorch_utils import ALL_LAYERNORM_LAYERS from ...utils import ( @@ -75,24 +76,77 @@ ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm) class LlamaRotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, 
base=10000, device=None, scaling_factor=1.0): + def __init__( + self, + dim=None, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + rope_type="default", + config: Optional[LlamaConfig] = None, + ): super().__init__() - self.scaling_factor = scaling_factor - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + # TODO (joao): remove the `if` below, only used for BC + self.rope_kwargs = {} + if config is None: + logger.warning_once( + "`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the " + "`config` argument. All other arguments will be removed in v4.45" + ) + self.rope_kwargs = { + "rope_type": rope_type, + "factor": scaling_factor, + "dim": dim, + "base": base, + "max_position_embeddings": max_position_embeddings, + } + self.rope_type = rope_type + self.max_seq_len_cached = max_position_embeddings + self.original_max_seq_len = max_position_embeddings + else: + # BC: "rope_type" was originally "type" + if config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling["type"]) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs) self.register_buffer("inv_freq", inv_freq, persistent=False) - # For BC we register cos and sin cached - self.max_seq_len_cached = max_position_embeddings + self.original_inv_freq = self.inv_freq + + def _dynamic_frequency_update(self, position_ids, device): + """ + dynamic RoPE layers should recompute `inv_freq` in the following situations: + 1 - growing beyond the cached sequence length 
(allow scaling) + 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) + """ + seq_len = torch.max(position_ids) + 1 + if seq_len > self.max_seq_len_cached: # growth + inv_freq, self.attention_scaling = self.rope_init_fn( + self.config, device, seq_len=seq_len, **self.rope_kwargs + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation + self.max_seq_len_cached = seq_len + + if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset + self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) + self.max_seq_len_cached = self.original_max_seq_len @torch.no_grad() def forward(self, x, position_ids): - # x: [bs, num_attention_heads, seq_len, head_size] + if "dynamic" in self.rope_type: + self._dynamic_frequency_update(position_ids, device=x.device) + + # Core RoPE block inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) position_ids_expanded = position_ids[:, None, :].float() - # Force float32 since bfloat16 loses precision on long contexts - # See https://github.com/huggingface/transformers/pull/29285 + # Force float32 (see https://github.com/huggingface/transformers/pull/29285) device_type = x.device.type device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" with torch.autocast(device_type=device_type, enabled=False): @@ -100,107 +154,37 @@ class LlamaRotaryEmbedding(nn.Module): emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() sin = emb.sin() + + # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention + cos = cos * self.attention_scaling + sin = sin * self.attention_scaling + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): """LlamaRotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" - def forward(self, x, position_ids): - # difference to the original RoPE: a scaling factor is aplied to the position ids - position_ids = position_ids.float() / self.scaling_factor - cos, sin = super().forward(x, position_ids) - return cos, sin + def __init__(self, *args, **kwargs): + logger.warning_once( + "`LlamaLinearScalingRotaryEmbedding` is deprecated an will be removed in v4.45. Please use " + "`LlamaRotaryEmbedding`, which now also does linear scaling (simply pass the model config to __init__)." + ) + kwargs["rope_type"] = "linear" + super().__init__(*args, **kwargs) class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" - def forward(self, x, position_ids): - # difference to the original RoPE: inv_freq is recomputed when the sequence length > original length - seq_len = torch.max(position_ids) + 1 - if seq_len > self.max_position_embeddings: - base = self.base * ( - (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / ( - base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(x.device) / self.dim) - ) - self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: this may break with compilation - - cos, sin = super().forward(x, position_ids) - return cos, sin - - -class LlamaYarnScalingRotaryEmbedding(LlamaRotaryEmbedding): - def __init__( - self, - dim, - max_position_embeddings=2048, - base=10000, - scaling_factor=1, - original_max_position_embeddings=2048, - attention_factor=None, - beta_fast=32, - beta_slow=1, - device=None, - ): - super().__init__(dim, max_position_embeddings, base, device, scaling_factor) - - self.original_max_position_embeddings = original_max_position_embeddings - self.attention_factor = attention_factor - self.beta_fast = beta_fast - 
self.beta_slow = beta_slow - - if self.attention_factor is None: - # Recommended attention factor for LLaMA models. - # For more details please refer to https://arxiv.org/pdf/2309.00071, Eq. 22. - self.attention_factor = 0.1 * math.log(scaling_factor) + 1.0 - - self.compute_yarn_scaling(device) - - # Inverse dimension formula to find the dimension based on the number of rotations - def find_correction_dim(self, num_rotations, dim, base=10000, max_position_embeddings=2048): - return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) - - # Find dimension range bounds based on rotations - def find_correction_range(self, low_rot, high_rot, dim, base=10000, max_position_embeddings=2048): - low = math.floor(self.find_correction_dim(low_rot, dim, base, max_position_embeddings)) - high = math.ceil(self.find_correction_dim(high_rot, dim, base, max_position_embeddings)) - return max(low, 0), min(high, dim - 1) - - def linear_ramp_mask(self, min, max, dim): - if min == max: - max += 0.001 # Prevent singularity - - linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) - ramp_func = torch.clamp(linear_func, 0, 1) - return ramp_func - - def forward(self, x, position_ids=None): - # Difference to the original RoPE: applies a scaling factor computed with - # the YaRN method (NTK-by-Parts + Attn Scaling) - # x: [bs, num_attention_heads, seq_len, head_size] - cos, sin = super().forward(x, position_ids) - cos = cos * self.mscale - sin = sin * self.mscale - return cos, sin - - def compute_yarn_scaling(self, device): - pos_freqs = self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim) - inv_freq_extrapolation = 1.0 / pos_freqs - inv_freq_interpolation = 1.0 / (self.scaling_factor * pos_freqs) - - low, high = self.find_correction_range( - self.beta_fast, self.beta_slow, self.dim, self.base, self.original_max_position_embeddings + def __init__(self, *args, **kwargs): + logger.warning_once( + 
"`LlamaDynamicNTKScalingRotaryEmbedding` is deprecated an will be removed in v4.45. Please use " + "`LlamaRotaryEmbedding`, which now also does dynamic ntk scaling (simply pass the model config to " + "__init__)." ) - # Get n-dimensional rotational scaling corrected for extrapolation - inv_freq_mask = 1 - self.linear_ramp_mask(low, high, self.dim // 2).float().to(device) - inv_freq = inv_freq_interpolation * (1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask - - self.register_buffer("inv_freq", inv_freq) - # Get n-dimensional magnitude scaling corrected for interpolation - self.mscale = self.attention_factor + kwargs["rope_type"] = "dynamic" + super().__init__(*args, **kwargs) def rotate_half(x): @@ -317,51 +301,9 @@ class LlamaAttention(nn.Module): self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias) - self._init_rope() - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = LlamaRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - # Yarn parameters - kwargs = { - "dim": self.config.rope_scaling.get("original_max_position_embeddings", None), - "max_position_embeddings": self.config.rope_scaling.get("attention_factor", None), - "base": self.config.rope_scaling.get("beta_fast", None), - "scaling_factor": self.config.rope_scaling.get("beta_slow", None), - } - kwargs = {k: v for k, v in kwargs.items() if v is not None} - - if scaling_type == "linear": - self.rotary_emb = LlamaLinearScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - 
scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == "dynamic": - self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == "yarn": - self.rotary_emb = LlamaYarnScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - **kwargs, - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + # TODO (joao): remove in v4.45 (RoPE is computed in the model, not in the decoder layers) + self.rotary_emb = LlamaRotaryEmbedding(config=self.config) def forward( self, @@ -372,6 +314,7 @@ class LlamaAttention(nn.Module): output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() @@ -402,7 +345,16 @@ class LlamaAttention(nn.Module): key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, position_ids) + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." 
+ ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -471,6 +423,7 @@ class LlamaFlashAttention2(LlamaAttention): output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if isinstance(past_key_value, StaticCache): raise ValueError( @@ -493,7 +446,16 @@ class LlamaFlashAttention2(LlamaAttention): key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, position_ids) + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." 
+ ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -573,6 +535,7 @@ class LlamaSdpaAttention(LlamaAttention): output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if output_attentions: @@ -589,6 +552,7 @@ class LlamaSdpaAttention(LlamaAttention): output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, ) bsz, q_len, _ = hidden_states.size() @@ -601,7 +565,16 @@ class LlamaSdpaAttention(LlamaAttention): key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, position_ids) + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." 
+ ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -671,6 +644,7 @@ class LlamaDecoderLayer(nn.Module): output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 **kwargs, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -688,6 +662,9 @@ class LlamaDecoderLayer(nn.Module): past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): Indices depicting the position of the input sequence tokens in the sequence + position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. 
kwargs (`dict`, *optional*): Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code into the model @@ -705,6 +682,7 @@ class LlamaDecoderLayer(nn.Module): output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, **kwargs, ) hidden_states = residual + hidden_states @@ -867,6 +845,7 @@ class LlamaModel(LlamaPreTrainedModel): [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = LlamaRotaryEmbedding(config=config) self.gradient_checkpointing = False # Initialize weights and apply final processing @@ -933,10 +912,11 @@ class LlamaModel(LlamaPreTrainedModel): causal_mask = self._update_causal_mask( attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions ) - - # embed positions hidden_states = inputs_embeds + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None @@ -956,6 +936,7 @@ class LlamaModel(LlamaPreTrainedModel): output_attentions, use_cache, cache_position, + position_embeddings, ) else: layer_outputs = decoder_layer( @@ -966,6 +947,7 @@ class LlamaModel(LlamaPreTrainedModel): output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, ) hidden_states = layer_outputs[0] @@ -1280,7 +1262,7 @@ class LlamaForSequenceClassification(LlamaPreTrainedModel): @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: 
Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index 8e2f4dd5a4..dd814cd75f 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -85,7 +85,8 @@ class MistralRotaryEmbedding(nn.Module): self.register_buffer("inv_freq", inv_freq, persistent=False) @torch.no_grad() - # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding.forward + # copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding.forward + # TODO(joao): add me back asap :) def forward(self, x, position_ids): # x: [bs, num_attention_heads, seq_len, head_size] inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) @@ -396,7 +397,8 @@ class MistralFlashAttention2(MistralAttention): return attn_output, attn_weights, past_key_value -# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +# TODO(joao): add me back asap :) class MistralSdpaAttention(MistralAttention): """ Mistral attention module using torch.nn.functional.scaled_dot_product_attention. 
This module inherits from @@ -492,7 +494,8 @@ MISTRAL_ATTENTION_CLASSES = { } -# Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Mistral, LLAMA->MISTRAL +# copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Mistral, LLAMA->MISTRAL +# TODO(joao): add me back asap :) class MistralDecoderLayer(nn.Module): def __init__(self, config: MistralConfig, layer_idx: int): super().__init__() @@ -1146,7 +1149,7 @@ class MistralForSequenceClassification(MistralPreTrainedModel): @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 4b88afcded..82320de793 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -1362,7 +1362,7 @@ class MixtralForSequenceClassification(MixtralPreTrainedModel): @add_start_docstrings_to_model_forward(MIXTRAL_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index 59c9b3bf1b..a56baf0653 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -74,7 +74,8 @@ class OlmoLayerNorm(nn.Module): ALL_LAYERNORM_LAYERS.append(OlmoLayerNorm) -# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with 
Llama->Olmo +# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Olmo +# TODO(joao): add me back asap :) class OlmoRotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): super().__init__() @@ -104,7 +105,8 @@ class OlmoRotaryEmbedding(nn.Module): return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) -# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Olmo +# copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Olmo +# TODO(joao): add me back asap :) class OlmoLinearScalingRotaryEmbedding(OlmoRotaryEmbedding): """OlmoRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" @@ -115,7 +117,8 @@ class OlmoLinearScalingRotaryEmbedding(OlmoRotaryEmbedding): return cos, sin -# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Olmo +# copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Olmo +# TODO(joao): add me back asap :) class OlmoDynamicNTKScalingRotaryEmbedding(OlmoRotaryEmbedding): """OlmoRotaryEmbedding extended with Dynamic NTK scaling. 
Credits to the Reddit users /u/bloc97 and /u/emozilla""" @@ -202,7 +205,8 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: class OlmoAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" - # Copied from transformers.models.llama.modeling_llama.LlamaAttention.__init__ with Llama->Olmo + # copied from transformers.models.llama.modeling_llama.LlamaAttention.__init__ with Llama->Olmo + # TODO(joao): add me back asap :) def __init__(self, config: OlmoConfig, layer_idx: Optional[int] = None): super().__init__() self.config = config @@ -549,7 +553,8 @@ class OlmoDecoderLayer(nn.Module): self.input_layernorm = OlmoLayerNorm(config.hidden_size) self.post_attention_layernorm = OlmoLayerNorm(config.hidden_size) - # Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer.forward + # copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer.forward + # TODO(joao): add me back asap :) def forward( self, hidden_states: torch.Tensor, @@ -768,7 +773,8 @@ class OlmoModel(OlmoPreTrainedModel): self.embed_tokens = value @add_start_docstrings_to_model_forward(OLMO_INPUTS_DOCSTRING) - # Copied from transformers.models.llama.modeling_llama.LlamaModel.forward + # copied from transformers.models.llama.modeling_llama.LlamaModel.forward + # TODO(joao): add me back asap :) def forward( self, input_ids: torch.LongTensor = None, diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index fc1b729fa6..af22145e3e 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -999,7 +999,7 @@ class PersimmonForSequenceClassification(PersimmonPreTrainedModel): @add_start_docstrings_to_model_forward(PERSIMMON_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = 
None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 7ad34a5780..f80453d3f7 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -1282,7 +1282,7 @@ class PhiForSequenceClassification(PhiPreTrainedModel): @add_start_docstrings_to_model_forward(PHI_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index b7d05bbed6..a32f8531e4 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -1278,7 +1278,7 @@ class Phi3ForSequenceClassification(Phi3PreTrainedModel): @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index c20d74fb18..d88b5c357e 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -1370,7 +1370,7 @@ class Qwen2MoeForSequenceClassification(Qwen2MoePreTrainedModel): @add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, 
attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index a172183618..ea50a20ede 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -1275,7 +1275,7 @@ class StableLmForSequenceClassification(StableLmPreTrainedModel): @add_start_docstrings_to_model_forward(STABLELM_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index 430befd24a..af532b139c 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -1153,7 +1153,7 @@ class Starcoder2ForSequenceClassification(Starcoder2PreTrainedModel): @add_start_docstrings_to_model_forward(STARCODER2_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 81d4c21055..de739c6e70 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -485,6 +485,9 @@ class WhisperTimeStampLogitsProcessor(metaclass=DummyObject): requires_backends(self, ["torch"]) +ROPE_INIT_FUNCTIONS = None + + class 
PreTrainedModel(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index de7eb7e441..85d352fc81 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -51,12 +51,7 @@ if is_torch_available(): LlamaModel, LlamaTokenizer, ) - from transformers.models.llama.modeling_llama import ( - LlamaDynamicNTKScalingRotaryEmbedding, - LlamaLinearScalingRotaryEmbedding, - LlamaRotaryEmbedding, - LlamaYarnScalingRotaryEmbedding, - ) + from transformers.models.llama.modeling_llama import LlamaLinearScalingRotaryEmbedding, LlamaRotaryEmbedding class LlamaModelTester: @@ -431,9 +426,6 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi def test_model_rope_scaling(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() - hidden_size = config.hidden_size - num_heads = config.num_attention_heads - head_dim = hidden_size // num_heads scaling_factor = 10 short_input_length = 10 long_input_length = int(config.max_position_embeddings * 1.5) @@ -446,11 +438,7 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi position_ids_long = position_ids_long.unsqueeze(0) # Sanity check original RoPE - original_rope = LlamaRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - ).to(torch_device) + original_rope = LlamaRotaryEmbedding(config=config).to(torch_device) original_cos_short, original_sin_short = original_rope(x, position_ids_short) original_cos_long, original_sin_long = original_rope(x, position_ids_long) torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :]) @@ -458,12 +446,8 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi # Sanity check linear RoPE scaling # New position "x" should match original position with index "x/scaling_factor" - 
linear_scaling_rope = LlamaLinearScalingRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - scaling_factor=scaling_factor, - ).to(torch_device) + config.rope_scaling = {"type": "linear", "factor": scaling_factor} + linear_scaling_rope = LlamaRotaryEmbedding(config=config).to(torch_device) linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :]) @@ -476,12 +460,8 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi # Sanity check Dynamic NTK RoPE scaling # Scaling should only be observed after a long input is fed. We can observe that the frequencies increase # with scaling_factor (or that `inv_freq` decreases) - ntk_scaling_rope = LlamaDynamicNTKScalingRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - scaling_factor=scaling_factor, - ).to(torch_device) + config.rope_scaling = {"type": "dynamic", "factor": scaling_factor} + ntk_scaling_rope = LlamaRotaryEmbedding(config=config).to(torch_device) ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) torch.testing.assert_close(ntk_cos_short, original_cos_short) @@ -493,12 +473,9 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) # Sanity check Yarn RoPE scaling - yarn_scaling_rope = LlamaYarnScalingRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - scaling_factor=scaling_factor, - ).to(torch_device) + # Scaling should be over the entire input + config.rope_scaling = {"type": "yarn", "factor": scaling_factor} + 
yarn_scaling_rope = LlamaRotaryEmbedding(config=config).to(torch_device) yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :]) @@ -512,6 +489,43 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi with self.assertRaises(AssertionError): torch.testing.assert_close(yarn_sin_long, original_sin_long) + def test_rope_class_retrocompatibility(self): + # Delete me when we remove compatibility for the old API :) + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + scaling_factor = 10 + short_input_length = 10 + long_input_length = int(config.max_position_embeddings * 1.5) + config.rope_scaling = {"type": "linear", "factor": 10} + + # Inputs + x = torch.randn(1, dtype=torch.float32, device=torch_device) # used exclusively to get the dtype and the device + position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) + position_ids_short = position_ids_short.unsqueeze(0) + position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) + position_ids_long = position_ids_long.unsqueeze(0) + + # Old API -- under the hood, "type": "linear" is set and `LlamaRotaryEmbedding` is called + old_api_rope = LlamaLinearScalingRotaryEmbedding( + config.hidden_size // config.num_attention_heads, + max_position_embeddings=config.max_position_embeddings, + base=config.rope_theta, + scaling_factor=scaling_factor, + ).to(torch_device) + old_cos_short, old_sin_short = old_api_rope(x, position_ids_short) + old_cos_long, old_sin_long = old_api_rope(x, position_ids_long) + + # New API + config.rope_scaling = {"type": "linear", "factor": scaling_factor} + new_api_rope = LlamaRotaryEmbedding(config=config).to(torch_device) + new_cos_short, new_sin_short = new_api_rope(x, position_ids_short) + new_cos_long, new_sin_long 
= new_api_rope(x, position_ids_long) + + # The results should match + torch.testing.assert_close(old_cos_short, new_cos_short) + torch.testing.assert_close(old_sin_short, new_sin_short) + torch.testing.assert_close(old_cos_long, new_cos_long) + torch.testing.assert_close(old_sin_long, new_sin_long) + @require_flash_attn @require_torch_gpu @require_bitsandbytes diff --git a/tests/utils/test_modeling_rope_utils.py b/tests/utils/test_modeling_rope_utils.py new file mode 100644 index 0000000000..847323d9bf --- /dev/null +++ b/tests/utils/test_modeling_rope_utils.py @@ -0,0 +1,120 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import unittest + +from transformers import LlamaConfig +from transformers.testing_utils import is_torch_available, require_torch, torch_device + + +if is_torch_available(): + import torch + + from transformers import ROPE_INIT_FUNCTIONS + from transformers.modeling_rope_utils import rope_config_validation + + +@require_torch +class RopeTest(unittest.TestCase): + def test_rope_validation(self): + config = LlamaConfig() + all_rope_types = ROPE_INIT_FUNCTIONS.keys() + + # The base config is always valid (default RoPE) + rope_config_validation(config) + + # If we explicitly set the other RoPE types, then validation should fail + for rope_type in all_rope_types: + if rope_type != "default": + config.rope_scaling = {"rope_type": rope_type} + with self.assertRaises(KeyError): + rope_config_validation(config) + + # Parameters are exclusive to their own RoPE type, and should raise an exception if incorrectly passed + valid_param_mapping = { + "factor": ["linear", "dynamic", "yarn", "longrope"], + "attention_factor": ["yarn", "longrope"], + "beta_fast": ["yarn"], + "beta_slow": ["yarn"], + "short_factor": ["longrope"], + "long_factor": ["longrope"], + } + for rope_type in all_rope_types: + if rope_type == "default": + continue # checked above + for param, valid_rope_types in valid_param_mapping.items(): + # Set `param` with a dummy value -- we want to test the dict key + config.rope_scaling = {"rope_type": rope_type, param: True} + if rope_type in valid_rope_types: + continue + else: + with self.assertRaises(KeyError): + rope_config_validation(config) + + def test_default_rope_function_bc(self): + config = LlamaConfig() + device = torch_device + + rope_kwargs = { + "rope_type": "default", + "dim": config.hidden_size // config.num_attention_heads, + "max_position_embeddings": config.max_position_embeddings, + "base": config.rope_theta, + } + + rope_fn = ROPE_INIT_FUNCTIONS["default"] + config_freqs = rope_fn(config=config, device=device)[0] + kwargs_freqs = 
rope_fn(**rope_kwargs, device=device)[0] + torch.testing.assert_close(config_freqs, kwargs_freqs) + + def test_linear_rope_function_bc(self): + config = LlamaConfig() + config.rope_scaling = {"rope_type": "linear", "factor": 10.0} + device = torch_device + + rope_kwargs = { + "rope_type": "linear", + "dim": config.hidden_size // config.num_attention_heads, + "max_position_embeddings": config.max_position_embeddings, + "base": config.rope_theta, + "factor": 10.0, + } + + rope_fn = ROPE_INIT_FUNCTIONS["linear"] + config_freqs = rope_fn(config=config, device=device)[0] + kwargs_freqs = rope_fn(**rope_kwargs, device=device)[0] + torch.testing.assert_close(config_freqs, kwargs_freqs) + + def test_dynamic_rope_function_bc(self): + config = LlamaConfig() + config.rope_scaling = {"rope_type": "dynamic", "factor": 10.0} + device = torch_device + + rope_kwargs = { + "rope_type": "dynamic", + "dim": config.hidden_size // config.num_attention_heads, + "max_position_embeddings": config.max_position_embeddings, + "base": config.rope_theta, + "factor": 10.0, + } + + rope_fn = ROPE_INIT_FUNCTIONS["dynamic"] + config_freqs = rope_fn(config=config, device=device)[0] + kwargs_freqs = rope_fn(**rope_kwargs, device=device)[0] + torch.testing.assert_close(config_freqs, kwargs_freqs) + + +# TODO(joao): numerical checks for the different RoPE fns