v4.45.2

Cache: revert DynamicCache init for BC (#33861)
* tmp commit * tmp commit * make fixup * missing removal * fix condition * fix end-to-end compilation * if -> elif * BC * BC * use @deprecate_kwarg("num_hidden_layers", version="4.47.0") * wups the import * 🥴 --------- Co-authored-by: Arthur Zucker <arthur.zucker@gmail.com>
2024-10-07 19:28:07 +02:00 · 2024-10-07 16:48:52 +02:00 · 2024-10-07 15:47:59 +02:00 · 2024-10-07 15:41:22 +02:00 · 2024-10-07 15:29:36 +02:00 · 2024-10-07 15:29:02 +02:00
15 changed files with 184 additions and 103 deletions
--- a/setup.py
+++ b/setup.py
@@ -436,7 +436,7 @@ install_requires = [

 setup(
    name="transformers",
-    version="4.45.1",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="4.45.2",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
    author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
    author_email="transformers@huggingface.co",
    description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",
--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@@ -18,7 +18,7 @@
 # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
 # in the namespace without actually importing anything (and especially none of the backends).

-__version__ = "4.45.1"
+__version__ = "4.45.2"

 from typing import TYPE_CHECKING

--- a/src/transformers/cache_utils.py
+++ b/src/transformers/cache_utils.py
@@ -10,6 +10,7 @@ from packaging import version

 from .configuration_utils import PretrainedConfig
 from .utils import is_hqq_available, is_quanto_available, is_torchdynamo_compiling, logging
+from .utils.deprecation import deprecate_kwarg


 if is_quanto_available():
@@ -17,6 +18,7 @@ if is_quanto_available():
    if quanto_version >= version.parse("0.2.0"):
        from quanto import AffineQuantizer, MaxOptimizer, qint2, qint4

+
 if is_hqq_available():
    from hqq.core.quantize import Quantizer as HQQQuantizer

@@ -360,15 +362,12 @@ class DynamicCache(Cache):
        ```
    """

+    @deprecate_kwarg("num_hidden_layers", version="4.47.0")
    def __init__(self, num_hidden_layers: Optional[int] = None) -> None:
        super().__init__()
-        if num_hidden_layers is None:
-            self.key_cache: List[torch.Tensor] = []
-            self.value_cache: List[torch.Tensor] = []
-        else:
-            self.key_cache: List[torch.Tensor] = [[] for _ in range(num_hidden_layers)]
-            self.value_cache: List[torch.Tensor] = [[] for _ in range(num_hidden_layers)]
        self._seen_tokens = 0  # Used in `generate` to keep tally of how many tokens the cache has seen
+        self.key_cache: List[torch.Tensor] = []
+        self.value_cache: List[torch.Tensor] = []

    def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]:
        """
@@ -424,11 +423,13 @@ class DynamicCache(Cache):

        # Update the cache
        if len(self.key_cache) <= layer_idx:
+            # There may be skipped layers, fill them with empty lists
+            for _ in range(len(self.key_cache), layer_idx):
+                self.key_cache.append([])
+                self.value_cache.append([])
            self.key_cache.append(key_states)
            self.value_cache.append(value_states)
-        # content on layer cache can be a tensor and checking not tensor causes errors
-        # so we explicitly check for the empty list
-        elif self.key_cache[layer_idx] == []:
+        elif len(self.key_cache[layer_idx]) == 0:  # fills previously skipped layers; checking for tensor causes errors
            self.key_cache[layer_idx] = key_states
            self.value_cache[layer_idx] = value_states
        else:
@@ -440,9 +441,13 @@ class DynamicCache(Cache):
    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
        # TODO: deprecate this function in favor of `cache_position`
-        if len(self.key_cache) <= layer_idx or (len(self.key_cache) > layer_idx and self.key_cache[layer_idx] == []):
-            return 0
-        return self.key_cache[layer_idx].shape[-2]
+        is_empty_layer = (
+            len(self.key_cache) == 0  # no cache in any layer
+            or len(self.key_cache) <= layer_idx  # skipped `layer_idx` and hasn't run a layer with cache after it
+            or len(self.key_cache[layer_idx]) == 0  # the layer has no cache
+        )
+        layer_seq_length = self.key_cache[layer_idx].shape[-2] if not is_empty_layer else 0
+        return layer_seq_length

    def get_max_length(self) -> Optional[int]:
        """Returns the maximum sequence length of the cached states. DynamicCache does not have a maximum length."""
@@ -457,12 +462,13 @@ class DynamicCache(Cache):
        return legacy_cache

    @classmethod
+    @deprecate_kwarg("num_hidden_layers", version="4.47.0")
    def from_legacy_cache(
        cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, num_hidden_layers: int = None
    ) -> "DynamicCache":
        """Converts a cache in the legacy cache format into an equivalent `DynamicCache`. Used for
        backward compatibility."""
-        cache = cls(num_hidden_layers)
+        cache = cls()
        if past_key_values is not None:
            for layer_idx in range(len(past_key_values)):
                key_states, value_states = past_key_values[layer_idx]
@@ -485,12 +491,15 @@ class DynamicCache(Cache):
                self.key_cache[idx] = self.key_cache[idx][..., :max_length, :]
                self.value_cache[idx] = self.value_cache[idx][..., :max_length, :]

-    def batch_split(self, full_batch_size: int, split_size: int, num_hidden_layers: int) -> List["DynamicCache"]:
+    @deprecate_kwarg("num_hidden_layers", version="4.47.0")
+    def batch_split(
+        self, full_batch_size: int, split_size: int, num_hidden_layers: int = None
+    ) -> List["DynamicCache"]:
        """Split the current instance into a list of `DynamicCache` by the batch size. This will be used by
        `_split_model_inputs()` in `generation.utils`"""
        out = []
        for i in range(0, full_batch_size, split_size):
-            current_split = DynamicCache(num_hidden_layers)
+            current_split = DynamicCache()
            current_split._seen_tokens = self._seen_tokens
            current_split.key_cache = [tensor[i : i + split_size] for tensor in self.key_cache]
            current_split.value_cache = [tensor[i : i + split_size] for tensor in self.value_cache]
@@ -498,10 +507,11 @@ class DynamicCache(Cache):
        return out

    @classmethod
-    def from_batch_splits(cls, splits: List["DynamicCache"], num_hidden_layers: int) -> "DynamicCache":
+    @deprecate_kwarg("num_hidden_layers", version="4.47.0")
+    def from_batch_splits(cls, splits: List["DynamicCache"], num_hidden_layers: int = None) -> "DynamicCache":
        """This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in
        `generation.utils`"""
-        cache = cls(num_hidden_layers)
+        cache = cls()
        for idx in range(len(splits[0])):
            key_cache = [current.key_cache[idx] for current in splits if current.key_cache[idx] != []]
            value_cache = [current.key_cache[idx] for current in splits if current.key_cache[idx] != []]
@@ -617,7 +627,9 @@ class OffloadedCache(DynamicCache):
            self._seen_tokens += key_states.shape[-2]

        # Update the cache
-        if len(self.key_cache) <= layer_idx:
+        if len(self.key_cache) < layer_idx:
+            raise ValueError("OffloadedCache does not support model usage where layers are skipped. Use DynamicCache.")
+        elif len(self.key_cache) == layer_idx:
            self.key_cache.append(key_states)
            self.value_cache.append(value_states)
            self.original_device.append(key_states.device)
@@ -676,7 +688,9 @@ class QuantizedCache(DynamicCache):
        if layer_idx == 0:
            self._seen_tokens += key_states.shape[-2]

-        if len(self.key_cache) <= layer_idx:
+        if len(self.key_cache) < layer_idx:
+            raise ValueError("QuantizedCache does not support model usage where layers are skipped. Use DynamicCache.")
+        elif len(self.key_cache) == layer_idx:
            self._quantized_key_cache.append(self._quantize(key_states.contiguous(), axis=self.axis_key))
            self._quantized_value_cache.append(self._quantize(value_states.contiguous(), axis=self.axis_value))
            self.key_cache.append(torch.zeros(0, dtype=key_states.dtype, device=key_states.device))
@@ -1408,12 +1422,12 @@ class EncoderDecoderCache(Cache):

    @classmethod
    def from_legacy_cache(
-        cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, num_hidden_layers: int = None
+        cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    ) -> "EncoderDecoderCache":
        """Converts a cache in the legacy cache format into an equivalent `EncoderDecoderCache`."""
        cache = cls(
-            self_attention_cache=DynamicCache(num_hidden_layers),
-            cross_attention_cache=DynamicCache(num_hidden_layers),
+            self_attention_cache=DynamicCache(),
+            cross_attention_cache=DynamicCache(),
        )
        if past_key_values is not None:
            for layer_idx in range(len(past_key_values)):
@@ -1471,14 +1485,12 @@ class EncoderDecoderCache(Cache):
        self.check_dynamic_cache(self.crop.__name__)
        self.self_attention_cache.crop(maximum_length)

-    def batch_split(
-        self, full_batch_size: int, split_size: int, num_hidden_layers: int
-    ) -> "List[EncoderDecoderCache]":
+    def batch_split(self, full_batch_size: int, split_size: int) -> "List[EncoderDecoderCache]":
        """Split the current instance into a list of `DynamicCache` by the batch size. This will be used by
        `_split_model_inputs()` in `generation.utils`"""
        self.check_dynamic_cache(self.batch_split.__name__)
-        self_attention_cache = self.self_attention_cache.batch_split(full_batch_size, split_size, num_hidden_layers)
-        cross_attention_cache = self.cross_attention_cache.batch_split(full_batch_size, split_size, num_hidden_layers)
+        self_attention_cache = self.self_attention_cache.batch_split(full_batch_size, split_size)
+        cross_attention_cache = self.cross_attention_cache.batch_split(full_batch_size, split_size)

        out = []
        for self_attn, cross_attn in zip(self_attention_cache, cross_attention_cache):
@@ -1486,11 +1498,11 @@ class EncoderDecoderCache(Cache):
        return out

    @classmethod
-    def from_batch_splits(cls, splits: List["EncoderDecoderCache"], num_hidden_layers: int) -> "EncoderDecoderCache":
+    def from_batch_splits(cls, splits: List["EncoderDecoderCache"]) -> "EncoderDecoderCache":
        """This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in
        `generation.utils`"""
-        self_attention_cache = DynamicCache(num_hidden_layers)
-        cross_attention_cache = DynamicCache(num_hidden_layers)
+        self_attention_cache = DynamicCache()
+        cross_attention_cache = DynamicCache()
        for idx in range(len(splits[0])):
            layer_keys = torch.cat([current.self_attention_cache.key_cache[idx] for current in splits], dim=0)
            layer_values = torch.cat([current.self_attention_cache.value_cache[idx] for current in splits], dim=0)
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -380,11 +380,14 @@ class PretrainedConfig(PushToHubMixin):

        non_default_generation_parameters = self._get_non_default_generation_parameters()
        if len(non_default_generation_parameters) > 0:
-            raise ValueError(
+            # TODO (joao): this should be an exception if the user has modified the loaded config. See #33886
+            warnings.warn(
                "Some non-default generation parameters are set in the model config. These should go into either a) "
                "`model.generation_config` (as opposed to `model.config`); OR b) a GenerationConfig file "
-                "(https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) "
-                f"\nNon-default generation parameters: {str(non_default_generation_parameters)}"
+                "(https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model)."
+                "This warning will become an exception in the future."
+                f"\nNon-default generation parameters: {str(non_default_generation_parameters)}",
+                UserWarning,
            )

        os.makedirs(save_directory, exist_ok=True)
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -1602,11 +1602,10 @@ class GenerationMixin:
        # Use DynamicCache() instance by default. This will avoid back and forth from legacy format that
        # keeps copying the cache thus using much more memory
        else:
-            num_hidden_layers = self.config.get_text_config().num_hidden_layers
            model_kwargs[cache_name] = (
-                DynamicCache(num_hidden_layers)
+                DynamicCache()
                if not requires_cross_attention_cache
-                else EncoderDecoderCache(DynamicCache(num_hidden_layers), DynamicCache(num_hidden_layers))
+                else EncoderDecoderCache(DynamicCache(), DynamicCache())
            )

    def _supports_num_logits_to_keep(self) -> bool:
--- a/src/transformers/modeling_rope_utils.py
+++ b/src/transformers/modeling_rope_utils.py
@@ -360,13 +360,23 @@ ROPE_INIT_FUNCTIONS = {
 }


-def _check_received_keys(rope_type: str, received_keys: set, required_keys: set, optional_keys: Optional[set] = None):
+def _check_received_keys(
+    rope_type: str,
+    received_keys: set,
+    required_keys: set,
+    optional_keys: Optional[set] = None,
+    ignore_keys: Optional[set] = None,
+):
    """Compare the received keys in `config.rope_scaling` against the expected and optional keys"""
    # BC: "rope_type" was originally "type" -- let's check for "rope_type" when "type" is present
    if "type" in received_keys:
        received_keys -= {"type"}
        required_keys.add("rope_type")

+    # Some models need to store model-specific keys, and we don't want to throw warning at them
+    if ignore_keys is not None:
+        received_keys -= ignore_keys
+
    missing_keys = required_keys - received_keys
    if missing_keys:
        raise KeyError(f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}")
@@ -379,47 +389,47 @@ def _check_received_keys(rope_type: str, received_keys: set, required_keys: set,
        logger.warning(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}")


-def _validate_default_rope_parameters(config: PretrainedConfig):
+def _validate_default_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type"}
    received_keys = set(rope_scaling.keys())
-    _check_received_keys(rope_type, received_keys, required_keys)
+    _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys)


-def _validate_linear_scaling_rope_parameters(config: PretrainedConfig):
+def _validate_linear_scaling_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "factor"}
    received_keys = set(rope_scaling.keys())
-    _check_received_keys(rope_type, received_keys, required_keys)
+    _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys)

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")


-def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig):
+def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "factor"}
    # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
    optional_keys = {"original_max_position_embeddings"}
    received_keys = set(rope_scaling.keys())
-    _check_received_keys(rope_type, received_keys, required_keys, optional_keys)
+    _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys)

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")


-def _validate_yarn_parameters(config: PretrainedConfig):
+def _validate_yarn_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "factor"}
    optional_keys = {"attention_factor", "beta_fast", "beta_slow"}
    received_keys = set(rope_scaling.keys())
-    _check_received_keys(rope_type, received_keys, required_keys, optional_keys)
+    _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys)

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
@@ -444,14 +454,14 @@ def _validate_yarn_parameters(config: PretrainedConfig):
        )


-def _validate_longrope_parameters(config: PretrainedConfig):
+def _validate_longrope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "short_factor", "long_factor"}
    # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
    optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"}
    received_keys = set(rope_scaling.keys())
-    _check_received_keys(rope_type, received_keys, required_keys, optional_keys)
+    _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys)

    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
@@ -494,12 +504,12 @@ def _validate_longrope_parameters(config: PretrainedConfig):
                )


-def _validate_llama3_parameters(config: PretrainedConfig):
+def _validate_llama3_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "factor", "original_max_position_embeddings", "low_freq_factor", "high_freq_factor"}
    received_keys = set(rope_scaling.keys())
-    _check_received_keys(rope_type, received_keys, required_keys)
+    _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys)

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
@@ -541,7 +551,7 @@ ROPE_VALIDATION_FUNCTIONS = {
 }


-def rope_config_validation(config: PretrainedConfig):
+def rope_config_validation(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    """
    Validate the RoPE config arguments, given a `PretrainedConfig` object
    """
@@ -553,7 +563,7 @@ def rope_config_validation(config: PretrainedConfig):
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", "default"))
    validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type)
    if validation_fn is not None:
-        validation_fn(config)
+        validation_fn(config, ignore_keys=ignore_keys)
    else:
        logger.warning(
            f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'"
--- a/src/transformers/models/idefics2/modeling_idefics2.py
+++ b/src/transformers/models/idefics2/modeling_idefics2.py
@@ -1348,17 +1348,18 @@ class Idefics2Model(Idefics2PreTrainedModel):
        past_seen_tokens = 0
        # kept for BC (non `Cache` `past_key_values` inputs)
        return_legacy_cache = False
-        if use_cache and not isinstance(past_key_values, Cache):
-            return_legacy_cache = True
-            if past_key_values is None:
-                past_key_values = DynamicCache()
-            else:
-                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-                logger.warning_once(
-                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
-                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
-                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
-                )
+        if use_cache:
+            if not isinstance(past_key_values, Cache):
+                return_legacy_cache = True
+                if past_key_values is None:
+                    past_key_values = DynamicCache()
+                else:
+                    past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                    logger.warning_once(
+                        "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                        "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                        "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                    )
            past_seen_tokens = past_key_values.get_seq_length()

        if inputs_embeds is not None and input_ids is None and past_seen_tokens == 0:
--- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py
@@ -235,11 +235,13 @@ class Qwen2VLConfig(PretrainedConfig):

        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
-        # and change type from 'mrope' to 'default'
+        # and change type from 'mrope' to 'default' because `mrope` does default RoPE calculations
+        # one can set it to "linear"/"dynamic" etc. to have scaled RoPE
+        # TODO: @raushan update config in the hub
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            if self.rope_scaling["type"] == "mrope":
                self.rope_scaling["type"] = "default"
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
-        rope_config_validation(self)
+        rope_config_validation(self, ignore_keys={"mrope_section"})

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
--- a/src/transformers/models/siglip/modeling_siglip.py
+++ b/src/transformers/models/siglip/modeling_siglip.py
@@ -279,13 +279,13 @@ class SiglipVisionEmbeddings(nn.Module):
        """

        num_patches = embeddings.shape[1]
-        num_positions = self.position_embeddings.shape[1]
+        num_positions = self.position_embedding.weight.shape[0]

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
-            return self.position_embeddings
+            return self.position_embedding(self.position_ids)

-        patch_pos_embed = self.position_embeddings
+        patch_pos_embed = self.position_embedding.weight.unsqueeze(0)

        dim = embeddings.shape[-1]

--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1613,14 +1613,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):

        self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)

-        if "clean_up_tokenization_spaces" not in kwargs:
-            warnings.warn(
-                "`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This "
-                "behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. "
-                "For more details check this issue: https://github.com/huggingface/transformers/issues/31884",
-                FutureWarning,
-            )
-
        # By default, cleaning tokenization spaces for both fast and slow tokenizers
        self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", False)

--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -1846,13 +1846,14 @@ class GenerationTesterMixin:
                input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict
            )
            set_seed(seed)
-            num_hidden_layers = config.get_text_config().num_hidden_layers
            if config.is_encoder_decoder:
                cache_cls = EncoderDecoderCache
-                past_key_values = cache_cls(DynamicCache(num_hidden_layers), DynamicCache(num_hidden_layers))
+                past_key_values = cache_cls(DynamicCache(), DynamicCache())
+                past_key_values = cache_cls(DynamicCache(), DynamicCache())
            else:
                cache_cls = DynamicCache
-                past_key_values = cache_cls(num_hidden_layers)
+                past_key_values = cache_cls()
+
            new_results = model.generate(
                input_ids,
                attention_mask=attention_mask,
@@ -3797,6 +3798,29 @@ class GenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTestsMi
        self.assertEqual(generated_text_no_padding, generated_text_with_padding)
        self.assertEqual(generated_text_no_padding, "Ich muss diese Aufgabe vor Ende des Tages beenden.")

+    def test_generate_compile_fullgraph_tiny(self):
+        """
+        Tests that we can call end-to-end generation with a tiny model (i.e. doesn't crash)
+        NOTE: this test is quite slow (~20s on a consumer desktop), but it is important that we keep it as part of the
+        non-slow tests to prevent regressions!
+        """
+        model = AutoModelForCausalLM.from_pretrained(
+            "hf-internal-testing/tiny-random-LlamaForCausalLM", torch_dtype=torch.bfloat16, device_map="auto"
+        )
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-LlamaForCausalLM")
+
+        # compile generate
+        compiled_generate = torch.compile(model.generate, fullgraph=True, mode="reduce-overhead")
+
+        # compiled generate does NOT accept parameterization except a) model inputs b) a generation config
+        generation_config = copy.deepcopy(model.generation_config)
+        generation_config.pad_token_id = model.config.eos_token_id
+
+        model_inputs = tokenizer(["Write a poem about the market crashing in summer"], return_tensors="pt")
+        model_inputs = model_inputs.to(model.device)
+        gen_out = compiled_generate(**model_inputs, generation_config=generation_config)
+        self.assertTrue(gen_out.shape[1] > model_inputs["input_ids"].shape[1])  # some text was generated
+

@require_torch
 class TokenHealingTestCase(unittest.TestCase):
--- a/tests/models/mllama/test_modeling_mllama.py
+++ b/tests/models/mllama/test_modeling_mllama.py
@@ -383,45 +383,73 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester
        pass

    @unittest.skip(reason="Failing test, need to fix")
-    def test_beam_sample_generate_dict_output():
+    def test_beam_sample_generate_dict_output(self):
        pass

    @unittest.skip(reason="Failing test, need to fix")
-    def test_beam_search_generate_dict_output():
+    def test_beam_search_generate_dict_output(self):
        pass

    @unittest.skip(reason="Failing test, need to fix")
-    def test_constrained_beam_search_generate_dict_output():
+    def test_constrained_beam_search_generate_dict_output(self):
        pass

    @unittest.skip(reason="Failing test, need to fix")
-    def test_dola_decoding_sample():
+    def test_dola_decoding_sample(self):
        pass

    @unittest.skip(reason="Failing test, need to fix")
-    def test_generate_methods_with_num_logits_to_keep():
+    def test_generate_methods_with_num_logits_to_keep(self):
        pass

    @unittest.skip(reason="Failing test, need to fix")
-    def test_greedy_generate_dict_outputs():
+    def test_greedy_generate_dict_outputs(self):
        pass

    @unittest.skip(reason="Failing test, need to fix")
-    def test_group_beam_search_generate_dict_output():
+    def test_group_beam_search_generate_dict_output(self):
        pass

    @unittest.skip(reason="Failing test, need to fix")
-    def test_model_parallel_beam_search():
+    def test_model_parallel_beam_search(self):
        pass

-    @unittest.skip(reason="Failing test, need to fix")
-    def test_new_cache_format_2():
-        pass
+    @is_flaky()  # TODO (joao, raushan) - investigate why this test is flaky (probably depends on the model initialization)
+    def test_new_cache_format_0(self):
+        super().test_new_cache_format_0()
+
+    @is_flaky()  # TODO (joao, raushan) - investigate why this test is flaky (probably depends on the model initialization)
+    def test_new_cache_format_1(self):
+        super().test_new_cache_format_1()
+
+    @is_flaky()  # TODO (joao, raushan) - investigate why this test is flaky (probably depends on the model initialization)
+    def test_new_cache_format_2(self):
+        super().test_new_cache_format_2()

    @unittest.skip(reason="Failing test, need to fix")
-    def test_sample_generate_dict_output():
+    def test_sample_generate_dict_output(self):
        pass

+    def test_generate_text_only_with_cache(self):
+        """
+        Tests that our cached generation with text-only inputs works. When mllama was introduced, this feature
+        required cache modifications (because layers are skipped in practice). This test should prevent regressions.
+        """
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            inputs = self._prepare_for_class(inputs_dict, model_class)
+
+            input_ids = inputs["input_ids"]
+            del inputs["input_ids"]
+            del inputs["pixel_values"]
+
+            model.generate(input_ids, use_cache=True)
+

@require_torch
 class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
--- a/tests/utils/test_cache_utils.py
+++ b/tests/utils/test_cache_utils.py
@@ -53,7 +53,7 @@ class CacheTest(unittest.TestCase):
    def test_dynamic_cache_retrocompatibility(self):
        """Tests that we can convert back and forth between the legacy cache format and DynamicCache"""
        legacy_cache = ()
-        new_cache = DynamicCache(num_hidden_layers=10)
+        new_cache = DynamicCache()

        # Creates a new cache with 10 layers in both formats
        for layer_idx in range(10):
@@ -83,7 +83,7 @@ class CacheTest(unittest.TestCase):
                )

        # Test 1: We can convert from legacy to new with no changes
-        from_legacy = DynamicCache.from_legacy_cache(legacy_cache, num_hidden_layers=10)
+        from_legacy = DynamicCache.from_legacy_cache(legacy_cache)
        for layer_idx in range(10):
            for key_value_idx in range(2):
                self.assertTrue(
@@ -103,7 +103,7 @@ class CacheTest(unittest.TestCase):
        legacy_reorder_fn = GPT2LMHeadModel._reorder_cache  # An example of a legacy `_reorder_cache` function

        legacy_cache = ()
-        new_cache = DynamicCache(num_hidden_layers=10)
+        new_cache = DynamicCache()

        # Creates a new cache with 10 layers in both formats
        for layer_idx in range(10):
@@ -240,9 +240,7 @@ class CacheIntegrationTest(unittest.TestCase):
        set_seed(0)
        gen_out_legacy = model.generate(**inputs, do_sample=True, max_new_tokens=256)
        set_seed(0)
-        gen_out = model.generate(
-            **inputs, do_sample=True, max_new_tokens=256, past_key_values=DynamicCache(model.config.num_hidden_layers)
-        )
+        gen_out = model.generate(**inputs, do_sample=True, max_new_tokens=256, past_key_values=DynamicCache())
        self.assertListEqual(gen_out_legacy.tolist(), gen_out.tolist())

        decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
@@ -270,9 +268,7 @@ class CacheIntegrationTest(unittest.TestCase):
            model.device
        )

-        gen_out = model.generate(
-            **inputs, do_sample=False, max_new_tokens=10, past_key_values=DynamicCache(model.config.num_hidden_layers)
-        )
+        gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10, past_key_values=DynamicCache())
        decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
        expected_text = ["A sequence: 1, 2, 3, 4, 5, 6, 7, 8,", "A sequence: A, B, C, D, E, F, G, H"]
        self.assertListEqual(decoded, expected_text)
--- a/tests/utils/test_configuration_utils.py
+++ b/tests/utils/test_configuration_utils.py
@@ -313,11 +313,12 @@ class ConfigTestUtils(unittest.TestCase):
        old_configuration = old_transformers.models.auto.AutoConfig.from_pretrained(repo)
        self.assertEqual(old_configuration.hidden_size, 768)

-    def test_saving_config_with_custom_generation_kwargs_raises_exception(self):
+    def test_saving_config_with_custom_generation_kwargs_raises_warning(self):
        config = BertConfig(min_length=3)  # `min_length = 3` is a non-default generation kwarg
        with tempfile.TemporaryDirectory() as tmp_dir:
-            with self.assertRaises(ValueError):
+            with self.assertWarns(UserWarning) as cm:
                config.save_pretrained(tmp_dir)
+            self.assertIn("min_length", str(cm.warning))

    def test_get_non_default_generation_parameters(self):
        config = BertConfig()
--- a/tests/utils/test_modeling_rope_utils.py
+++ b/tests/utils/test_modeling_rope_utils.py
@@ -65,6 +65,19 @@ class RopeTest(unittest.TestCase):
                    with self.assertRaises(KeyError):
                        rope_config_validation(config)

+        # Any other parameters passed to RoPE will raise a warning that a particular key is not used
+        # But sometimes we can have model-specific RoPE kwargs and bypass warning with `ignore_keys`
+        model_specific_kwarg = "mrope_sections"  # e.g. in Qwen2-VL
+
+        for rope_type in all_rope_types:
+            if rope_type == "default":
+                config.rope_scaling = {"rope_type": rope_type, model_specific_kwarg: True}
+                rope_config_validation(config, ignore_keys={model_specific_kwarg})
+                with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs:
+                    rope_config_validation(config)
+                    self.assertEqual(len(logs.output), 1)
+                    self.assertIn(model_specific_kwarg, logs.output[0])
+
    def test_default_rope_function_bc(self):
        config = LlamaConfig()
        device = torch_device
Author	SHA1	Message	Date
Arthur Zucker	53fad641cf	v4.45.2 [CI: some checks failed — Release - Conda / build_and_package (push): cancelled; Secret Leaks / trufflehog (push): cancelled]	2024-10-07 19:28:07 +02:00
Joao Gante	2fd49d2b28	Cache: revert DynamicCache init for BC (#33861 ) * tmp commit * tmp commit * make fixup * missing removal * fix condition * fix end-to-end compilation * if -> elif * BC * BC * use @deprecate_kwarg("num_hidden_layers", version="4.47.0") * wups the import * 🥴 --------- Co-authored-by: Arthur Zucker <arthur.zucker@gmail.com>	2024-10-07 16:48:52 +02:00
Arthur Zucker	5df4ca826d	wow	2024-10-07 15:47:59 +02:00
Arthur Zucker	277ed58f06	fix post cherry-picking!	2024-10-07 15:41:22 +02:00
Kaito	b1c237fc4e	Fix Tensor + Embedding error in some cases when using SiglipVisionModel (#33994 ) Fix Tensor + Embedding error in some cases Co-authored-by: kaitolucifer <kaito.o@ghelia.com>	2024-10-07 15:29:36 +02:00
Arthur	ae5f4916de	properly fix and RUN_SLOW (#33965 ) * properly fix and RUN_SLOW * lots of models were affected * fix-copies * more fixes	2024-10-07 15:29:02 +02:00
Joao Gante	6ea04aaad8	Cache: revert DynamicCache init for BC (#33861 ) * tmp commit * tmp commit * make fixup * missing removal * fix condition * fix end-to-end compilation * if -> elif * BC * BC * use @deprecate_kwarg("num_hidden_layers", version="4.47.0") * wups the import * 🥴 --------- Co-authored-by: Arthur Zucker <arthur.zucker@gmail.com>	2024-10-07 15:21:31 +02:00
Arthur	be968434fd	hot fix `self.position_embeddings->self.position_embedding` (#33958 )	2024-10-07 15:20:35 +02:00
Raushan Turganbay	333ec0a523	Ignore keys on `validate_rope` (#33753 ) * ignore keys on check rope * add tests * fix tests, so maybe better leave at logger lvl	2024-10-07 15:20:35 +02:00
Joao Gante	3576fec8a3	Config: lower `save_pretrained` exception to warning (#33906 ) * lower to warning * msg * make fixup * rm extra comma	2024-10-07 15:20:35 +02:00
aroun-coumar	f0686f567a	Fixes for issue #33763 in idefics2 model (#33766 )	2024-10-07 15:20:35 +02:00
Ita Zaporozhets	27f03e0a7b	remove warning v2 (#33761 )	2024-10-07 15:20:34 +02:00