[cache] make all classes cache compatible finally (#38635)

* dump

* push other models

* fix simple greedy generation

* xmod

* add fsmt and clean up some mentions of old cache format

* gpt-bigcode now follows standards

* delete tuple cache reference in generation

* fix some models

* fix some models

* fix mambas and support cache in tapas

* fix some more tests

* fix copies

* delete `_reorder_cache`

* another fix copies

* fix typos and delete unnecessary test

* fix rag generate, needs special cache reordering

* fix tapas and superglue

* reformer create special cache

* recurrent gemma `reorder_cache` was a no-op, delete

* fix-copies

* fix blip and musicgen pipeline tests

* fix reformer

* fix reformer, again...

* delete `_supports_cache_class`

* delete `supports_quantized_cache`

* fix failing tests

* fix copies

* some minor clean up

* style

* style

* fix copies

* fix tests

* fix copies

* create causal mask now needs positions?

* fix copies

* style

* Update tests/test_modeling_common.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* clean-up of non-generative model after merging main

* check `is_decoder` for cache

* delete transpose for scores

* remove tuple cache from docs everywhere

* fix tests

* fix copies

* fix copies once more

* properly deprecate `encoder_attention_mask` in Bert-like models

* import `deprecate_kwarg` where needed

* fix copies again

* fix copies

* delete `next_decoder_cache`

* fix copies asks to update for PLM

* fix copies

* rebasing had a few new models, fix them and merge asap!

* fix copies once more

* fix slow tests

* fix tests and update PLM checkpoint

* add read token and revert accidentally removed line

* oh come on, style

* just skip it, read token has no access to PLM yet

---------

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
This commit is contained in:
Raushan Turganbay
2025-07-16 17:00:17 +05:00
committed by GitHub
parent 6cb43defd0
commit c8524aeb07
268 changed files with 5707 additions and 6831 deletions

View File

@@ -1001,7 +1001,7 @@ class GenerationTesterMixin:
self.skipTest(reason="Stateful models don't support contrastive search generation")
# won't fix: FSMT and Reformer have a different cache variable type (and format).
if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]):
if any(model_name in model_class.__name__.lower() for model_name in ["reformer"]):
self.skipTest(reason="Won't fix: old model with different cache format")
config, inputs_dict = self.prepare_config_and_inputs_for_generate()
@@ -1030,7 +1030,7 @@ class GenerationTesterMixin:
self.skipTest(reason="Stateful models don't support contrastive search generation")
# won't fix: FSMT and Reformer have a different cache variable type (and format).
if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]):
if any(model_name in model_class.__name__.lower() for model_name in ["reformer"]):
self.skipTest(reason="Won't fix: old model with different cache format")
config, inputs_dict = self.prepare_config_and_inputs_for_generate()
@@ -1070,10 +1070,8 @@ class GenerationTesterMixin:
if model_class._is_stateful:
self.skipTest(reason="Stateful models don't support contrastive search generation")
if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer", "speech2text"]):
if any(model_name in model_class.__name__.lower() for model_name in ["reformer"]):
self.skipTest(reason="Won't fix: old model with different cache format")
if any(model_name in model_class.__name__.lower() for model_name in ["gptbigcode"]):
self.skipTest(reason="TODO: fix me")
config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1)
@@ -1112,22 +1110,16 @@ class GenerationTesterMixin:
for model_class in self.all_generative_model_classes:
if model_class._is_stateful:
self.skipTest(reason="Stateful models don't support assisted generation")
if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]):
if any(model_name in model_class.__name__.lower() for model_name in ["reformer"]):
self.skipTest(reason="Won't fix: old model with different cache format")
if any(
model_name in model_class.__name__.lower()
for model_name in [
"bigbirdpegasus",
"led",
"mega",
"moshi",
"speech2text",
"git",
"prophetnet",
"seamlessm4t",
"clvp",
"mllama", # special cache sizes
"blip2", # overridden `generate()`
"blip2", # overridden `generate()` all BLIP models
"instructblip",
"instructblipvideo",
]
@@ -1196,23 +1188,16 @@ class GenerationTesterMixin:
for model_class in self.all_generative_model_classes:
if model_class._is_stateful:
self.skipTest(reason="Stateful models don't support assisted generation")
if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]):
if any(model_name in model_class.__name__.lower() for model_name in ["reformer"]):
self.skipTest(reason="Won't fix: old model with different cache format")
if any(
model_name in model_class.__name__.lower()
for model_name in [
"bigbirdpegasus",
"led",
"mega",
"moshi",
"speech2text",
"git",
"prophetnet",
"seamlessm4t",
"clvp",
"fuyu",
"mllama", # special cache sizes
"blip2", # overridden `generate()`
"blip2", # overridden `generate()` for all BLIP models
"instructblip",
"instructblipvideo",
# All models below: shouldn't suggest image tokens. Can be fixed by passing `suppress_ids` to candidate generator: @joaa @raushan
@@ -1340,22 +1325,16 @@ class GenerationTesterMixin:
for model_class in self.all_generative_model_classes:
if model_class._is_stateful:
self.skipTest(reason="Stateful models don't support assisted generation")
if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]):
if any(model_name in model_class.__name__.lower() for model_name in ["reformer"]):
self.skipTest(reason="Won't fix: old model with different cache format")
if any(
model_name in model_class.__name__.lower()
for model_name in [
"bigbirdpegasus",
"led",
"mega",
"moshi",
"speech2text",
"git",
"prophetnet",
"seamlessm4t",
"clvp",
"mllama", # special cache sizes
"blip2", # overridden `generate()`
"blip2", # overridden `generate()` for all BLIP models
"instructblip",
"instructblipvideo",
]
@@ -2059,12 +2038,15 @@ class GenerationTesterMixin:
@pytest.mark.generate
def test_generate_with_quant_cache(self):
for model_class in self.all_generative_model_classes:
if not model_class._supports_quantized_cache:
config, inputs_dict = self.prepare_config_and_inputs_for_generate()
if (
config.get_text_config(decoder=True).is_encoder_decoder
or not model_class._supports_default_dynamic_cache()
):
self.skipTest(reason="This model does not support the quantized cache format")
config, inputs_dict = self.prepare_config_and_inputs_for_generate()
config.is_decoder = True
model = model_class(config).to(torch_device).eval()
generation_kwargs = {
"max_new_tokens": 5,
@@ -2509,14 +2491,10 @@ class GenerationTesterMixin:
# Past Key Value States -- a few notes here:
# 1. Its inner sequence length is with respect to the inputs of the latest forward pass, hence the "-1"
# 2. We ignore models that have unique cache structures (e.g. mamba) or are in need of refactoring to match the
# standard cache format (e.g.gptbigcode )
# standard cache format (e.g. mamba architecture)
models_without_standard_cache = (
"bamba",
"ctrl",
"fsmt",
"granitemoehybrid",
"gptbigcode",
"mega",
"reformer",
"jamba",
"mamba",