[gemma 3] multimodal checkpoints + AutoModelForCausalLM (#36741)

2025-03-19 15:04:19 +00:00
parent b11050d6a2
commit 7c233980f4
3 changed files with 34 additions and 0 deletions
--- a/src/transformers/models/auto/auto_factory.py
+++ b/src/transformers/models/auto/auto_factory.py
@@ -444,6 +444,11 @@ class _BaseAutoModelClass:
            f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
        )

+    @classmethod
+    def _prepare_config_for_auto_class(cls, config: PretrainedConfig) -> PretrainedConfig:
+        """Additional autoclass-specific config post-loading manipulation. May be overridden in subclasses."""
+        return config
+
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        config = kwargs.pop("config", None)
@@ -539,6 +544,10 @@ class _BaseAutoModelClass:
            if kwargs_orig.get("quantization_config", None) is not None:
                kwargs["quantization_config"] = kwargs_orig["quantization_config"]

+        # AutoClass-specific config manipulation
+        config = copy.deepcopy(config)
+        config = cls._prepare_config_for_auto_class(config)
+
        has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map
        has_local_code = type(config) in cls._model_mapping.keys()
        trust_remote_code = resolve_trust_remote_code(
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -17,6 +17,7 @@
 import warnings
 from collections import OrderedDict

+from ...configuration_utils import PretrainedConfig
 from ...utils import logging
 from .auto_factory import (
    _BaseAutoBackboneClass,
@@ -1658,6 +1659,17 @@ _AutoModelWithLMHead = auto_class_update(_AutoModelWithLMHead, head_doc="languag
 class AutoModelForCausalLM(_BaseAutoModelClass):
    _model_mapping = MODEL_FOR_CAUSAL_LM_MAPPING

+    @classmethod
+    def _prepare_config_for_auto_class(cls, config: PretrainedConfig) -> PretrainedConfig:
+        """
+        Additional autoclass-specific config post-loading manipulation. In this specific autoclass, if the config has
+        a nested text decoder section, uses that section instead.
+
+        Under the hood, multimodal models mapped by AutoModelForCausalLM assume the text decoder receives its own
+        config, rather than the config for the whole model. This is used e.g. to load the text-only part of a VLM.
+        """
+        return config.get_text_config(decoder=True)
+

 AutoModelForCausalLM = auto_class_update(AutoModelForCausalLM, head_doc="causal language modeling")

--- a/tests/models/gemma3/test_modeling_gemma3.py
+++ b/tests/models/gemma3/test_modeling_gemma3.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 """Testing suite for the PyTorch Gemma3 model."""

+import tempfile
 import unittest

 import pytest
@@ -339,6 +340,18 @@ class Gemma3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, unitte
    def test_flex_attention_with_grads(self):
        pass

+    def test_automodelforcausallm(self):
+        """
+        Regression test for #36741 -- make sure `AutoModelForCausalLM` works with a Gemma3 config, i.e. that
+        `AutoModelForCausalLM.from_pretrained` pulls the text config before loading the model
+        """
+        config = self.model_tester.get_config()
+        model = Gemma3ForConditionalGeneration(config)
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir)
+            for_causal_lm = AutoModelForCausalLM.from_pretrained(tmp_dir)
+            self.assertIsInstance(for_causal_lm, Gemma3ForCausalLM)
+

@slow
@require_torch_gpu