From 2f50230c59ec9f17431236ed6625082cc385c76c Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Thu, 26 Jun 2025 18:48:14 +0200 Subject: [PATCH] fix `t5gemma` tests (#39052) * fix * fix * fix * fix * fix --------- Co-authored-by: ydshieh --- src/transformers/models/t5gemma/modeling_t5gemma.py | 10 +++++++--- src/transformers/models/t5gemma/modular_t5gemma.py | 9 +++++++-- tests/models/t5gemma/test_modeling_t5gemma.py | 8 ++++++++ tests/test_modeling_common.py | 6 +++++- 4 files changed, 27 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/t5gemma/modeling_t5gemma.py b/src/transformers/models/t5gemma/modeling_t5gemma.py index a6cec1c099..a7d60d2fa7 100644 --- a/src/transformers/models/t5gemma/modeling_t5gemma.py +++ b/src/transformers/models/t5gemma/modeling_t5gemma.py @@ -41,7 +41,7 @@ from ...modeling_outputs import ( from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack -from ...utils import auto_docstring, can_return_tuple, logging +from ...utils import auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging from .configuration_t5gemma import T5GemmaConfig, T5GemmaModuleConfig @@ -1112,7 +1112,7 @@ class T5GemmaForConditionalGeneration(T5GemmaPreTrainedModel, GenerationMixin): self.model = T5GemmaModel(config) self.vocab_size = config.decoder.vocab_size self.lm_head = T5GemmaLMHead(config.decoder.hidden_size, self.vocab_size) - self.loss_type = "ForMaskedLMLoss" + self.loss_type = "ForMaskedLM" self.post_init() @@ -1169,10 +1169,14 @@ class T5GemmaForConditionalGeneration(T5GemmaPreTrainedModel, GenerationMixin): (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. """ if self.training and self.config._attn_implementation != "eager": - logger.warning_once( + msg = ( "It is strongly recommended to train T5Gemma models with the `eager` attention implementation " f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('', attn_implementation='eager')`." ) + if is_torchdynamo_compiling(): + raise ValueError(msg) + else: + logger.warning_once(msg) if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: # get decoder inputs from shifting lm labels to the right diff --git a/src/transformers/models/t5gemma/modular_t5gemma.py b/src/transformers/models/t5gemma/modular_t5gemma.py index aea5f3f749..b3dbe761a2 100644 --- a/src/transformers/models/t5gemma/modular_t5gemma.py +++ b/src/transformers/models/t5gemma/modular_t5gemma.py @@ -37,6 +37,7 @@ from ...utils import ( auto_docstring, can_return_tuple, is_torch_flex_attn_available, + is_torchdynamo_compiling, logging, ) from ..gemma2.configuration_gemma2 import Gemma2Config @@ -1058,7 +1059,7 @@ class T5GemmaForConditionalGeneration(T5GemmaPreTrainedModel, GenerationMixin): self.model = T5GemmaModel(config) self.vocab_size = config.decoder.vocab_size self.lm_head = T5GemmaLMHead(config.decoder.hidden_size, self.vocab_size) - self.loss_type = "ForMaskedLMLoss" + self.loss_type = "ForMaskedLM" self.post_init() @@ -1115,10 +1116,14 @@ class T5GemmaForConditionalGeneration(T5GemmaPreTrainedModel, GenerationMixin): (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. """ if self.training and self.config._attn_implementation != "eager": - logger.warning_once( + msg = ( "It is strongly recommended to train T5Gemma models with the `eager` attention implementation " f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('', attn_implementation='eager')`." ) + if is_torchdynamo_compiling(): + raise ValueError(msg) + else: + logger.warning_once(msg) if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: # get decoder inputs from shifting lm labels to the right diff --git a/tests/models/t5gemma/test_modeling_t5gemma.py b/tests/models/t5gemma/test_modeling_t5gemma.py index ba49e91330..fd61e5e5c5 100644 --- a/tests/models/t5gemma/test_modeling_t5gemma.py +++ b/tests/models/t5gemma/test_modeling_t5gemma.py @@ -595,6 +595,11 @@ class T5GemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi # used in `test_torch_compile_for_training` _torch_compile_train_cls = T5GemmaForConditionalGeneration if is_torch_available() else None + # `t5gemma` will give warning or raise error if it is not `eager` during training. + _torch_compile_train_attn_implementation = "eager" + + # won't fix + test_torchscript = False def setUp(self): self.model_tester = T5GemmaModelTester(self) @@ -1584,6 +1589,9 @@ class T5GemmaEncoderOnlyModelTest(ModelTesterMixin, unittest.TestCase): is_encoder_decoder = False model_split_percents = [0.4, 0.5] + # won't fix + test_torchscript = False + def setUp(self): self.model_tester = T5GemmaEncoderOnlyModelTester(self) self.config_tester = ConfigTester( diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 2c734cfd61..b362525555 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -3748,7 +3748,7 @@ class ModelTesterMixin: self.skipTest( "PaliGemma-like models currently (transformers==4.41.0) requires an attention_mask input" ) - if config.model_type in ["modernbert", "gemma3"]: + if config.model_type in ["modernbert", "gemma3", "t5gemma"]: self.skipTest( reason=f"{config.model_type} currently (transformers==4.52.0) automatically adds an attention_mask input" ) @@ -4414,6 +4414,10 @@ class ModelTesterMixin: config, _ = self.model_tester.prepare_config_and_inputs_for_common() cls = self._torch_compile_train_cls + attn_implementation = getattr(self, "_torch_compile_train_attn_implementation", None) + if attn_implementation is not None: + config._attn_implementation = attn_implementation + model = cls(config).to(torch_device) inputs = {