🚨Early-error🚨 config will error out if output_attentions=True and the attn implementation is wrong (#38288)

* Protect ParallelInterface

* early error out on output attention setting for no warning in modeling

* modular update

* fixup

* update model tests

* update

* oops

* set model's config

* more cases

* ??

* properly fix

* fixup

* update

* last ones

* update

* fix?

* fix wrong merge commit

* fix hub test

* nits

* wow I am tired

* updates

* fix pipeline!

---------

Co-authored-by: Lysandre <hi@lysand.re>
This commit is contained in:
Arthur
2025-05-23 17:17:38 +02:00
committed by GitHub
parent 896833c183
commit f5d45d89c4
71 changed files with 157 additions and 144 deletions

View File

@@ -1157,7 +1157,8 @@ class GenerationTesterMixin:
self.skipTest(reason=f"{model_class.__name__} doesn't support caching")
config.is_decoder = True
model = model_class(config).to(torch_device).eval()
model = model_class._from_config(config, attn_implementation="eager").to(torch_device).eval()
config = model.config
# Sets assisted generation arguments such that:
# a) no EOS is generated, to ensure generation doesn't break early
# b) the assistant model always generates two tokens when it is called, to ensure the input preparation of
@@ -1187,6 +1188,7 @@ class GenerationTesterMixin:
assistant_model = model_class(config).to(torch_device).eval()
else:
assistant_model = model
assistant_model.config._attn_implementation = "eager"
assistant_model.generation_config.num_assistant_tokens = 2 # see b)
assistant_model.generation_config.num_assistant_tokens_schedule = "constant" # see b)
generation_kwargs.update({"assistant_model": assistant_model})
@@ -1367,7 +1369,8 @@ class GenerationTesterMixin:
self.skipTest(reason=f"{model_class.__name__} doesn't support caching")
config.is_decoder = True
model = model_class(config).to(torch_device).eval()
model = model_class._from_config(config, attn_implementation="eager").to(torch_device).eval()
config = model.config
# Sets assisted generation arguments such that:
# a) no EOS is generated, to ensure generation doesn't break early
# b) the assistant model always generates two tokens when it is called, to ensure the input preparation of