[cache] make all classes cache compatible finally (#38635)

* dump * push other models * fix simple greedy generation * xmod * add fsmt and clean up some mentions of old cache format * gpt-bigcode now follows standards * delete tuple cache reference in generation * fix some models * fix some models * fix mambas and support cache in tapas * fix some more tests * fix copies * delete `_reorder_cache` * another fix copies * fix typos and delete unnecessary test * fix rag generate, needs special cache reordering * fix tapas and superglue * reformer create special cache * recurrent gemma `reorder_cache` was a no-op, delete * fix-copies * fix blip and musicgen pipeline tests * fix reformer * fix reformer, again... * delete `_supports_cache_class` * delete `supports_quantized_cache` * fix failing tests * fix copies * some minor clean up * style * style * fix copies * fix tests * fix copies * create causal mask now needs positions? * fix copies * style * Update tests/test_modeling_common.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * clean-up of non-generative model after merging main * check `is_decoder` for cache * delete transpose for scores * remove tuple cache from docs everywhere * fix tests * fix copies * fix copies once more * properly deprecate `encoder_attention_mask` in Bert-like models * import `deprecate_kwarg` where needed * fix copies again * fix copies * delete `next_decoder_cache` * fix copies asks to update for PLM * fix copies * rebasing had a few new models, fix them and merge asap! * fix copies once more * fix slow tests * fix tests and update PLM checkpoint * add read token and revert accidentally removed line * oh come on, style * just skip it, read token has no access to PLM yet --------- Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
2025-07-16 17:00:17 +05:00
parent 6cb43defd0
commit c8524aeb07
268 changed files with 5707 additions and 6831 deletions
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -1001,7 +1001,7 @@ class GenerationTesterMixin:
                self.skipTest(reason="Stateful models don't support contrastive search generation")

            # won't fix: FSMT and Reformer have a different cache variable type (and format).
-            if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]):
+            if any(model_name in model_class.__name__.lower() for model_name in ["reformer"]):
                self.skipTest(reason="Won't fix: old model with different cache format")

            config, inputs_dict = self.prepare_config_and_inputs_for_generate()
@@ -1030,7 +1030,7 @@ class GenerationTesterMixin:
                self.skipTest(reason="Stateful models don't support contrastive search generation")

            # won't fix: FSMT and Reformer have a different cache variable type (and format).
-            if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]):
+            if any(model_name in model_class.__name__.lower() for model_name in ["reformer"]):
                self.skipTest(reason="Won't fix: old model with different cache format")

            config, inputs_dict = self.prepare_config_and_inputs_for_generate()
@@ -1070,10 +1070,8 @@ class GenerationTesterMixin:
            if model_class._is_stateful:
                self.skipTest(reason="Stateful models don't support contrastive search generation")

-            if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer", "speech2text"]):
+            if any(model_name in model_class.__name__.lower() for model_name in ["reformer"]):
                self.skipTest(reason="Won't fix: old model with different cache format")
-            if any(model_name in model_class.__name__.lower() for model_name in ["gptbigcode"]):
-                self.skipTest(reason="TODO: fix me")

            config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1)

@@ -1112,22 +1110,16 @@ class GenerationTesterMixin:
        for model_class in self.all_generative_model_classes:
            if model_class._is_stateful:
                self.skipTest(reason="Stateful models don't support assisted generation")
-            if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]):
+            if any(model_name in model_class.__name__.lower() for model_name in ["reformer"]):
                self.skipTest(reason="Won't fix: old model with different cache format")
            if any(
                model_name in model_class.__name__.lower()
                for model_name in [
-                    "bigbirdpegasus",
-                    "led",
-                    "mega",
                    "moshi",
-                    "speech2text",
                    "git",
                    "prophetnet",
-                    "seamlessm4t",
-                    "clvp",
                    "mllama",  # special cache sizes
-                    "blip2",  # overridden `generate()`
+                    "blip2",  # overridden `generate()` for all BLIP models
                    "instructblip",
                    "instructblipvideo",
                ]
@@ -1196,23 +1188,16 @@ class GenerationTesterMixin:
        for model_class in self.all_generative_model_classes:
            if model_class._is_stateful:
                self.skipTest(reason="Stateful models don't support assisted generation")
-            if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]):
+            if any(model_name in model_class.__name__.lower() for model_name in ["reformer"]):
                self.skipTest(reason="Won't fix: old model with different cache format")
            if any(
                model_name in model_class.__name__.lower()
                for model_name in [
-                    "bigbirdpegasus",
-                    "led",
-                    "mega",
                    "moshi",
-                    "speech2text",
                    "git",
                    "prophetnet",
-                    "seamlessm4t",
-                    "clvp",
-                    "fuyu",
                    "mllama",  # special cache sizes
-                    "blip2",  # overridden `generate()`
+                    "blip2",  # overridden `generate()` for all BLIP models
                    "instructblip",
                    "instructblipvideo",
                    # All models below: shouldn't suggest image tokens. Can be fixed by passing `suppress_ids` to candidate generator: @joaa @raushan
@@ -1340,22 +1325,16 @@ class GenerationTesterMixin:
        for model_class in self.all_generative_model_classes:
            if model_class._is_stateful:
                self.skipTest(reason="Stateful models don't support assisted generation")
-            if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]):
+            if any(model_name in model_class.__name__.lower() for model_name in ["reformer"]):
                self.skipTest(reason="Won't fix: old model with different cache format")
            if any(
                model_name in model_class.__name__.lower()
                for model_name in [
-                    "bigbirdpegasus",
-                    "led",
-                    "mega",
                    "moshi",
-                    "speech2text",
                    "git",
                    "prophetnet",
-                    "seamlessm4t",
-                    "clvp",
                    "mllama",  # special cache sizes
-                    "blip2",  # overridden `generate()`
+                    "blip2",  # overridden `generate()` for all BLIP models
                    "instructblip",
                    "instructblipvideo",
                ]
@@ -2059,12 +2038,15 @@ class GenerationTesterMixin:
    @pytest.mark.generate
    def test_generate_with_quant_cache(self):
        for model_class in self.all_generative_model_classes:
-            if not model_class._supports_quantized_cache:
+            config, inputs_dict = self.prepare_config_and_inputs_for_generate()
+
+            if (
+                config.get_text_config(decoder=True).is_encoder_decoder
+                or not model_class._supports_default_dynamic_cache()
+            ):
                self.skipTest(reason="This model does not support the quantized cache format")

-            config, inputs_dict = self.prepare_config_and_inputs_for_generate()
            config.is_decoder = True
-
            model = model_class(config).to(torch_device).eval()
            generation_kwargs = {
                "max_new_tokens": 5,
@@ -2509,14 +2491,10 @@ class GenerationTesterMixin:
        # Past Key Value States -- a few notes here:
        # 1. Its inner sequence length is with respect to the inputs of the latest forward pass, hence the "-1"
        # 2. We ignore models that have unique cache structures (e.g. mamba) or are in need of refatoring to match the
-        #    standard cache format (e.g.gptbigcode )
+        #    standard cache format (e.g. mamba architecture)
        models_without_standard_cache = (
            "bamba",
-            "ctrl",
-            "fsmt",
            "granitemoehybrid",
-            "gptbigcode",
-            "mega",
            "reformer",
            "jamba",
            "mamba",