From 893ad04fad145904ccb71e4e858e4134c32226b6 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 5 Nov 2024 11:34:01 +0100 Subject: [PATCH] Load sub-configs from composite configs (#34410) * save/load sub-configs * nit forgot these * fix copies * move test to common * use dict for sub-configs * add load-save-laod test * clean up modeling check * oops this are correct keys * fix some tests, missed some composite configs * this model was missed --- src/transformers/configuration_utils.py | 21 ++++- src/transformers/modeling_utils.py | 17 ++-- .../models/align/configuration_align.py | 42 +-------- .../models/altclip/configuration_altclip.py | 23 +---- .../models/bark/configuration_bark.py | 47 +++------- .../models/blip/configuration_blip.py | 42 +-------- .../models/blip_2/configuration_blip_2.py | 44 ++-------- .../bridgetower/configuration_bridgetower.py | 36 +------- .../chameleon/configuration_chameleon.py | 2 + .../configuration_chinese_clip.py | 42 +-------- .../models/clap/configuration_clap.py | 42 +-------- .../models/clip/configuration_clip.py | 42 +-------- .../models/clipseg/configuration_clipseg.py | 42 +-------- .../models/clvp/configuration_clvp.py | 28 ++---- .../models/dbrx/configuration_dbrx.py | 43 ++-------- .../configuration_encoder_decoder.py | 4 +- .../configuration_fastspeech2_conformer.py | 4 +- .../models/flava/configuration_flava.py | 85 +++---------------- .../models/git/configuration_git.py | 22 +---- .../models/groupvit/configuration_groupvit.py | 42 +-------- .../models/idefics/configuration_idefics.py | 16 ++-- .../models/idefics2/configuration_idefics2.py | 34 ++------ .../models/idefics3/configuration_idefics3.py | 33 ++----- .../configuration_instructblip.py | 48 ++--------- .../configuration_instructblipvideo.py | 47 ++-------- .../modular_instructblipvideo.py | 7 +- .../models/kosmos2/configuration_kosmos2.py | 55 +++--------- .../models/llava/configuration_llava.py | 4 +- .../llava_next/configuration_llava_next.py | 4 +- 
.../configuration_llava_next_video.py | 4 +- .../modular_llava_next_video.py | 4 +- .../configuration_llava_onevision.py | 4 +- .../models/mllama/configuration_mllama.py | 47 ++-------- .../models/moshi/configuration_moshi.py | 2 +- .../models/mpt/configuration_mpt.py | 28 ++---- .../models/musicgen/configuration_musicgen.py | 6 ++ .../configuration_musicgen_melody.py | 6 ++ .../models/owlv2/configuration_owlv2.py | 56 +----------- .../models/owlvit/configuration_owlvit.py | 56 +----------- .../paligemma/configuration_paligemma.py | 4 +- .../qwen2_audio/configuration_qwen2_audio.py | 4 +- .../models/qwen2_vl/configuration_qwen2_vl.py | 22 +---- .../models/siglip/configuration_siglip.py | 42 +-------- .../configuration_speech_encoder_decoder.py | 1 + .../video_llava/configuration_video_llava.py | 4 +- .../models/vipllava/configuration_vipllava.py | 4 +- .../configuration_vision_encoder_decoder.py | 1 + .../configuration_vision_text_dual_encoder.py | 1 + .../models/x_clip/configuration_x_clip.py | 42 +-------- tests/models/align/test_modeling_align.py | 9 ++ tests/models/altclip/test_modeling_altclip.py | 9 ++ tests/models/blip/test_modeling_blip.py | 7 ++ tests/models/blip_2/test_modeling_blip_2.py | 7 ++ tests/models/clap/test_modeling_clap.py | 7 ++ tests/models/clip/test_modeling_clip.py | 7 ++ tests/models/clipseg/test_modeling_clipseg.py | 7 ++ tests/models/clvp/test_modeling_clvp.py | 8 +- tests/models/flava/test_modeling_flava.py | 7 ++ .../models/groupvit/test_modeling_groupvit.py | 7 ++ .../models/idefics2/test_modeling_idefics2.py | 7 +- .../models/idefics3/test_modeling_idefics3.py | 7 +- .../test_modeling_instructblip.py | 9 ++ .../test_modeling_instructblipvideo.py | 7 ++ tests/models/kosmos2/test_modeling_kosmos2.py | 7 +- tests/models/llava/test_modeling_llava.py | 8 +- .../llava_next/test_modeling_llava_next.py | 8 +- .../test_modeling_llava_next_video.py | 8 +- .../test_modeling_llava_onevision.py | 8 +- tests/models/mllama/test_modeling_mllama.py | 
7 +- tests/models/owlv2/test_modeling_owlv2.py | 7 ++ tests/models/owlvit/test_modeling_owlvit.py | 7 ++ .../models/qwen2_vl/test_modeling_qwen2_vl.py | 3 + tests/models/siglip/test_modeling_siglip.py | 5 +- .../video_llava/test_modeling_video_llava.py | 8 +- .../models/vipllava/test_modeling_vipllava.py | 8 +- tests/models/x_clip/test_modeling_x_clip.py | 7 ++ tests/test_configuration_common.py | 50 ++++++++++- tests/test_modeling_common.py | 14 ++- 78 files changed, 464 insertions(+), 1052 deletions(-) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 1d892c49a2..60f9f34cf8 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -190,6 +190,8 @@ class PretrainedConfig(PushToHubMixin): """ model_type: str = "" + base_config_key: str = "" + sub_configs: Dict[str, "PretrainedConfig"] = {} is_composition: bool = False attribute_map: Dict[str, str] = {} _auto_class: Optional[str] = None @@ -543,11 +545,22 @@ class PretrainedConfig(PushToHubMixin): cls._set_token_in_kwargs(kwargs, token) config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + if cls.base_config_key and cls.base_config_key in config_dict: + config_dict = config_dict[cls.base_config_key] + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) + # sometimes the config has no `base_config_key` if the config is used in several composite models + # e.g. LlamaConfig. 
In that case we try to see if there is match in `model_type` before raising a warning + for k, v in config_dict.items(): + if isinstance(v, dict) and v.get("model_type") == cls.model_type: + config_dict = v + + # raise warning only if we still can't see a match in `model_type` + if config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) return cls.from_dict(config_dict, **kwargs) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 6c332a7a6a..0df59d1db8 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1608,15 +1608,14 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix # Below we check if a config is composite and manually prepare a dict of attn impl if not already passed as a dict. # Later each sub-module will dispatch with its own attn impl, by calling `XXXModel._from_config(config.text_config)` # If any of sub-modules doesn't support requested attn, an error will be raised. 
See https://github.com/huggingface/transformers/pull/32238 - for key in config: - if isinstance(getattr(config, key), PretrainedConfig): - sub_config = getattr(config, key) - curr_attn_implementation = ( - requested_attn_implementation - if not isinstance(requested_attn_implementation, dict) - else requested_attn_implementation.get(key, None) - ) - sub_config._attn_implementation_internal = curr_attn_implementation + for key in config.sub_configs.keys(): + sub_config = getattr(config, key) + curr_attn_implementation = ( + requested_attn_implementation + if not isinstance(requested_attn_implementation, dict) + else requested_attn_implementation.get(key, None) + ) + sub_config._attn_implementation_internal = curr_attn_implementation if use_flash_attention_2: logger.warning_once( diff --git a/src/transformers/models/align/configuration_align.py b/src/transformers/models/align/configuration_align.py index 99fa81b4a9..a22ab1dc40 100644 --- a/src/transformers/models/align/configuration_align.py +++ b/src/transformers/models/align/configuration_align.py @@ -14,8 +14,7 @@ # limitations under the License. 
"""ALIGN model configuration""" -import os -from typing import TYPE_CHECKING, List, Union +from typing import TYPE_CHECKING, List if TYPE_CHECKING: @@ -95,6 +94,7 @@ class AlignTextConfig(PretrainedConfig): ```""" model_type = "align_text_model" + base_config_key = "text_config" def __init__( self, @@ -133,24 +133,6 @@ class AlignTextConfig(PretrainedConfig): self.use_cache = use_cache self.pad_token_id = pad_token_id - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from AlignConfig - if config_dict.get("model_type") == "align": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class AlignVisionConfig(PretrainedConfig): r""" @@ -223,6 +205,7 @@ class AlignVisionConfig(PretrainedConfig): ```""" model_type = "align_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -272,24 +255,6 @@ class AlignVisionConfig(PretrainedConfig): self.drop_connect_rate = drop_connect_rate self.num_hidden_layers = sum(num_block_repeats) * 4 - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from AlignConfig - if config_dict.get("model_type") == "align": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class AlignConfig(PretrainedConfig): r""" @@ -340,6 +305,7 @@ class AlignConfig(PretrainedConfig): ```""" model_type = "align" + sub_configs = {"text_config": AlignTextConfig, "vision_config": AlignVisionConfig} def __init__( self, diff --git a/src/transformers/models/altclip/configuration_altclip.py b/src/transformers/models/altclip/configuration_altclip.py index 7333fa63a3..3c8e91bd47 100755 --- a/src/transformers/models/altclip/configuration_altclip.py +++ b/src/transformers/models/altclip/configuration_altclip.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""AltCLIP model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -199,6 +196,7 @@ class AltCLIPVisionConfig(PretrainedConfig): ```""" model_type = "altclip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -233,24 +231,6 @@ class AltCLIPVisionConfig(PretrainedConfig): self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from AltCLIPConfig - if config_dict.get("model_type") == "altclip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class AltCLIPConfig(PretrainedConfig): r""" @@ -298,6 +278,7 @@ class AltCLIPConfig(PretrainedConfig): ```""" model_type = "altclip" + sub_configs = {"text_config": AltCLIPTextConfig, "vision_config": AltCLIPVisionConfig} def __init__( self, text_config=None, vision_config=None, projection_dim=768, logit_scale_init_value=2.6592, **kwargs diff --git a/src/transformers/models/bark/configuration_bark.py b/src/transformers/models/bark/configuration_bark.py index 6dd08b65e8..a498d1dd19 100644 --- a/src/transformers/models/bark/configuration_bark.py +++ b/src/transformers/models/bark/configuration_bark.py @@ -14,12 +14,11 @@ # limitations under the License. 
"""BARK model configuration""" -import os -from typing import Dict, Optional, Union +from typing import Dict from ...configuration_utils import PretrainedConfig from ...utils import add_start_docstrings, logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -64,7 +63,6 @@ BARK_SUBMODELCONFIG_START_DOCSTRING = """ class BarkSubModelConfig(PretrainedConfig): - model_type = "bark_module" keys_to_ignore_at_inference = ["past_key_values"] attribute_map = { @@ -101,38 +99,6 @@ class BarkSubModelConfig(PretrainedConfig): super().__init__(**kwargs) - @classmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - cache_dir: Optional[Union[str, os.PathLike]] = None, - force_download: bool = False, - local_files_only: bool = False, - token: Optional[Union[str, bool]] = None, - revision: str = "main", - **kwargs, - ) -> "PretrainedConfig": - kwargs["cache_dir"] = cache_dir - kwargs["force_download"] = force_download - kwargs["local_files_only"] = local_files_only - kwargs["revision"] = revision - - cls._set_token_in_kwargs(kwargs, token) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the config dict if we are loading from Bark - if config_dict.get("model_type") == "bark": - config_dict = config_dict[f"{cls.model_type}_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - @add_start_docstrings( BARK_SUBMODELCONFIG_START_DOCSTRING.format(config="BarkSemanticConfig", model="BarkSemanticModel"), @@ -154,6 +120,7 @@ class BarkSubModelConfig(PretrainedConfig): ) class BarkSemanticConfig(BarkSubModelConfig): model_type = "semantic" + base_config_key = "semantic_config" @add_start_docstrings( @@ -176,6 +143,7 @@ class BarkSemanticConfig(BarkSubModelConfig): ) class BarkCoarseConfig(BarkSubModelConfig): model_type = "coarse_acoustics" + base_config_key = "coarse_acoustics_config" @add_start_docstrings( @@ -203,6 +171,7 @@ class BarkCoarseConfig(BarkSubModelConfig): ) class BarkFineConfig(BarkSubModelConfig): model_type = "fine_acoustics" + base_config_key = "fine_acoustics_config" def __init__(self, tie_word_embeddings=True, n_codes_total=8, n_codes_given=1, **kwargs): self.n_codes_total = n_codes_total @@ -265,6 +234,12 @@ class BarkConfig(PretrainedConfig): """ model_type = "bark" + sub_configs = { + "semantic_config": BarkSemanticConfig, + "coarse_acoustics_config": BarkCoarseConfig, + "fine_acoustics_config": BarkFineConfig, + "codec_config": AutoConfig, + } def __init__( self, diff --git a/src/transformers/models/blip/configuration_blip.py b/src/transformers/models/blip/configuration_blip.py index 4772738be1..18db71eb14 100644 --- a/src/transformers/models/blip/configuration_blip.py +++ b/src/transformers/models/blip/configuration_blip.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""Blip model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -96,6 +93,7 @@ class BlipTextConfig(PretrainedConfig): ```""" model_type = "blip_text_model" + base_config_key = "text_config" def __init__( self, @@ -146,24 +144,6 @@ class BlipTextConfig(PretrainedConfig): self.use_cache = use_cache self.label_smoothing = label_smoothing - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from BlipConfig - if config_dict.get("model_type") == "blip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class BlipVisionConfig(PretrainedConfig): r""" @@ -215,6 +195,7 @@ class BlipVisionConfig(PretrainedConfig): ```""" model_type = "blip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -245,24 +226,6 @@ class BlipVisionConfig(PretrainedConfig): self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from BlipConfig - if config_dict.get("model_type") == "blip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class BlipConfig(PretrainedConfig): r""" @@ -316,6 +279,7 @@ class BlipConfig(PretrainedConfig): ```""" model_type = "blip" + sub_configs = {"text_config": BlipTextConfig, "vision_config": BlipVisionConfig} def __init__( self, diff --git a/src/transformers/models/blip_2/configuration_blip_2.py b/src/transformers/models/blip_2/configuration_blip_2.py index 16fa4aec38..d690d22338 100644 --- a/src/transformers/models/blip_2/configuration_blip_2.py +++ b/src/transformers/models/blip_2/configuration_blip_2.py @@ -14,13 +14,12 @@ # limitations under the License. 
"""BLIP-2 model configuration""" -import os -from typing import Optional, Union +from typing import Optional from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -76,6 +75,7 @@ class Blip2VisionConfig(PretrainedConfig): ```""" model_type = "blip_2_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -106,24 +106,6 @@ class Blip2VisionConfig(PretrainedConfig): self.hidden_act = hidden_act self.qkv_bias = qkv_bias - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Blip2Config - if config_dict.get("model_type") == "blip-2": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class Blip2QFormerConfig(PretrainedConfig): r""" @@ -190,6 +172,7 @@ class Blip2QFormerConfig(PretrainedConfig): ```""" model_type = "blip_2_qformer" + base_config_key = "qformer_config" def __init__( self, @@ -229,24 +212,6 @@ class Blip2QFormerConfig(PretrainedConfig): self.encoder_hidden_size = encoder_hidden_size self.use_qformer_text_input = use_qformer_text_input - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the qformer config dict if we are loading from Blip2Config - if config_dict.get("model_type") == "blip-2": - config_dict = config_dict["qformer_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class Blip2Config(PretrainedConfig): r""" @@ -306,6 +271,7 @@ class Blip2Config(PretrainedConfig): ```""" model_type = "blip-2" + sub_configs = {"text_config": AutoConfig, "qformer_config": Blip2QFormerConfig, "vision_config": Blip2VisionConfig} def __init__( self, diff --git a/src/transformers/models/bridgetower/configuration_bridgetower.py b/src/transformers/models/bridgetower/configuration_bridgetower.py index 4985b6ef89..de49283493 100644 --- a/src/transformers/models/bridgetower/configuration_bridgetower.py +++ b/src/transformers/models/bridgetower/configuration_bridgetower.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""BridgeTower model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -68,6 +65,7 @@ class BridgeTowerVisionConfig(PretrainedConfig): ```""" model_type = "bridgetower_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -95,21 +93,6 @@ class BridgeTowerVisionConfig(PretrainedConfig): self.share_layernorm = share_layernorm self.remove_last_layer = remove_last_layer - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "bridgetower": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class BridgeTowerTextConfig(PretrainedConfig): r""" @@ -175,6 +158,7 @@ class BridgeTowerTextConfig(PretrainedConfig): ```""" model_type = "bridgetower_text_model" + base_config_key = "text_config" def __init__( self, @@ -217,21 +201,6 @@ class BridgeTowerTextConfig(PretrainedConfig): self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "bridgetower": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class BridgeTowerConfig(PretrainedConfig): r""" @@ -288,6 +257,7 @@ class BridgeTowerConfig(PretrainedConfig): ```""" model_type = "bridgetower" + sub_configs = {"text_config": BridgeTowerTextConfig, "vision_config": BridgeTowerVisionConfig} def __init__( self, diff --git a/src/transformers/models/chameleon/configuration_chameleon.py b/src/transformers/models/chameleon/configuration_chameleon.py index 67de37f2d0..9842127e7b 100644 --- a/src/transformers/models/chameleon/configuration_chameleon.py +++ b/src/transformers/models/chameleon/configuration_chameleon.py @@ -62,6 +62,7 @@ class ChameleonVQVAEConfig(PretrainedConfig): """ model_type = "chameleon_vqgan" + base_config_key = "vq_config" def __init__( self, @@ -187,6 +188,7 @@ class ChameleonConfig(PretrainedConfig): ```""" model_type = "chameleon" + sub_configs = {"vq_config": ChameleonVQVAEConfig} keys_to_ignore_at_inference = ["past_key_values"] def __init__( diff --git a/src/transformers/models/chinese_clip/configuration_chinese_clip.py b/src/transformers/models/chinese_clip/configuration_chinese_clip.py index 5b37044fab..d50d6c842b 100644 --- a/src/transformers/models/chinese_clip/configuration_chinese_clip.py +++ b/src/transformers/models/chinese_clip/configuration_chinese_clip.py @@ -14,9 +14,8 @@ # limitations under the License. 
"""Chinese-CLIP model configuration""" -import os from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Mapping, Optional, Union +from typing import TYPE_CHECKING, Any, Mapping, Optional if TYPE_CHECKING: @@ -102,6 +101,7 @@ class ChineseCLIPTextConfig(PretrainedConfig): ```""" model_type = "chinese_clip_text_model" + base_config_key = "text_config" def __init__( self, @@ -141,24 +141,6 @@ class ChineseCLIPTextConfig(PretrainedConfig): self.position_embedding_type = position_embedding_type self.use_cache = use_cache - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from ChineseCLIPConfig - if config_dict.get("model_type") == "chinese_clip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class ChineseCLIPVisionConfig(PretrainedConfig): r""" @@ -215,6 +197,7 @@ class ChineseCLIPVisionConfig(PretrainedConfig): ```""" model_type = "chinese_clip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -249,24 +232,6 @@ class ChineseCLIPVisionConfig(PretrainedConfig): self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from ChineseCLIPConfig - if config_dict.get("model_type") == "chinese_clip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class ChineseCLIPConfig(PretrainedConfig): r""" @@ -316,6 +281,7 @@ class ChineseCLIPConfig(PretrainedConfig): ```""" model_type = "chinese_clip" + sub_configs = {"text_config": ChineseCLIPTextConfig, "vision_config": ChineseCLIPVisionConfig} def __init__( self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 1425e2a862..b2added7f0 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""CLAP model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -94,6 +91,7 @@ class ClapTextConfig(PretrainedConfig): ```""" model_type = "clap_text_model" + base_config_key = "text_config" def __init__( self, @@ -137,24 +135,6 @@ class ClapTextConfig(PretrainedConfig): self.projection_hidden_act = projection_hidden_act self.projection_dim = projection_dim - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from ClapConfig - if config_dict.get("model_type") == "clap": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class ClapAudioConfig(PretrainedConfig): r""" @@ -245,6 +225,7 @@ class ClapAudioConfig(PretrainedConfig): ```""" model_type = "clap_audio_model" + base_config_key = "audio_config" def __init__( self, @@ -307,24 +288,6 @@ class ClapAudioConfig(PretrainedConfig): self.initializer_factor = initializer_factor self.projection_hidden_act = projection_hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the audio config dict if we are loading from ClapConfig - if config_dict.get("model_type") == "clap": - config_dict = config_dict["audio_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class ClapConfig(PretrainedConfig): r""" @@ -377,6 +340,7 @@ class ClapConfig(PretrainedConfig): ```""" model_type = "clap" + sub_configs = {"text_config": ClapTextConfig, "audio_config": ClapAudioConfig} def __init__( self, diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index 8e027f5c3f..2e1f2deede 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -14,9 +14,8 @@ # limitations under the License. 
"""CLIP model configuration""" -import os from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Mapping, Optional, Union +from typing import TYPE_CHECKING, Any, Mapping, Optional if TYPE_CHECKING: @@ -93,6 +92,7 @@ class CLIPTextConfig(PretrainedConfig): ```""" model_type = "clip_text_model" + base_config_key = "text_config" def __init__( self, @@ -130,24 +130,6 @@ class CLIPTextConfig(PretrainedConfig): self.initializer_factor = initializer_factor self.attention_dropout = attention_dropout - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from CLIPConfig - if config_dict.get("model_type") == "clip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class CLIPVisionConfig(PretrainedConfig): r""" @@ -205,6 +187,7 @@ class CLIPVisionConfig(PretrainedConfig): ```""" model_type = "clip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -239,24 +222,6 @@ class CLIPVisionConfig(PretrainedConfig): self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from CLIPConfig - if config_dict.get("model_type") == "clip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class CLIPConfig(PretrainedConfig): r""" @@ -305,6 +270,7 @@ class CLIPConfig(PretrainedConfig): ```""" model_type = "clip" + sub_configs = {"text_config": CLIPTextConfig, "vision_config": CLIPVisionConfig} def __init__( self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py index 0ac8196fc7..5474840f35 100644 --- a/src/transformers/models/clipseg/configuration_clipseg.py +++ b/src/transformers/models/clipseg/configuration_clipseg.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""CLIPSeg model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -84,6 +81,7 @@ class CLIPSegTextConfig(PretrainedConfig): ```""" model_type = "clipseg_text_model" + base_config_key = "text_config" def __init__( self, @@ -117,24 +115,6 @@ class CLIPSegTextConfig(PretrainedConfig): self.initializer_factor = initializer_factor self.attention_dropout = attention_dropout - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from CLIPSegConfig - if config_dict.get("model_type") == "clipseg": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class CLIPSegVisionConfig(PretrainedConfig): r""" @@ -190,6 +170,7 @@ class CLIPSegVisionConfig(PretrainedConfig): ```""" model_type = "clipseg_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -222,24 +203,6 @@ class CLIPSegVisionConfig(PretrainedConfig): self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from CLIPSegConfig - if config_dict.get("model_type") == "clipseg": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class CLIPSegConfig(PretrainedConfig): r""" @@ -306,6 +269,7 @@ class CLIPSegConfig(PretrainedConfig): ```""" model_type = "clipseg" + sub_configs = {"text_config": CLIPSegTextConfig, "vision_config": CLIPSegVisionConfig} def __init__( self, diff --git a/src/transformers/models/clvp/configuration_clvp.py b/src/transformers/models/clvp/configuration_clvp.py index d17a04c861..8fd0e15080 100644 --- a/src/transformers/models/clvp/configuration_clvp.py +++ b/src/transformers/models/clvp/configuration_clvp.py @@ -91,6 +91,7 @@ class ClvpEncoderConfig(PretrainedConfig): ```""" model_type = "clvp_encoder" + base_config_key = ["text_config", "speech_config"] def __init__( self, @@ -141,7 +142,7 @@ class ClvpEncoderConfig(PretrainedConfig): # make sure to have the config_type be either "text_config" or "speech_config" # this is to make sure that we can load only text or speech configs from the nested ClvpConfig. - if config_type not in ["text_config", "speech_config"]: + if config_type not in cls.base_config_key: raise ValueError( f"We can only load either 'text_config' or 'speech_config' but you are trying to load" f"{config_type}" ) @@ -253,6 +254,7 @@ class ClvpDecoderConfig(PretrainedConfig): ```""" model_type = "clvp_decoder" + base_config_key = "decoder_config" def __init__( self, @@ -314,24 +316,6 @@ class ClvpDecoderConfig(PretrainedConfig): super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the speech config dict if we are loading from ClvpConfig - if config_dict.get("model_type") == "clvp": - config_dict = config_dict["decoder_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and 
config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class ClvpConfig(PretrainedConfig): r""" @@ -386,7 +370,11 @@ class ClvpConfig(PretrainedConfig): ```""" model_type = "clvp" - is_composition = True + sub_configs = { + "text_config": ClvpEncoderConfig, + "speech_config": ClvpEncoderConfig, + "decoder_config": ClvpDecoderConfig, + } def __init__( self, diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index dde5232ae5..302b5e6a55 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -41,6 +41,8 @@ class DbrxAttentionConfig(PretrainedConfig): rope_theta (`float`, *optional*, defaults to 10000.0): The base frequency for rope. 
""" + base_config_key = "attn_config" + def __init__( self, attn_pdrop: float = 0.0, @@ -55,29 +57,12 @@ class DbrxAttentionConfig(PretrainedConfig): self.kv_n_heads = kv_n_heads self.rope_theta = rope_theta - for k in ["model_type"]: + for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash"]: if k in kwargs: kwargs.pop(k) if len(kwargs) != 0: raise ValueError(f"Found unknown {kwargs=}") - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "dbrx": - config_dict = config_dict["attn_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class DbrxFFNConfig(PretrainedConfig): """Configuration class for Dbrx FFN. @@ -100,6 +85,8 @@ class DbrxFFNConfig(PretrainedConfig): moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights. 
""" + base_config_key = "ffn_config" + def __init__( self, ffn_act_fn: dict = None, @@ -122,29 +109,12 @@ class DbrxFFNConfig(PretrainedConfig): self.moe_loss_weight = moe_loss_weight self.moe_normalize_expert_weights = moe_normalize_expert_weights - for k in ["model_type"]: + for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash"]: if k in kwargs: kwargs.pop(k) if len(kwargs) != 0: raise ValueError(f"Found unknown {kwargs=}") - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "dbrx": - config_dict = config_dict["ffn_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class DbrxConfig(PretrainedConfig): r""" @@ -202,6 +172,7 @@ class DbrxConfig(PretrainedConfig): """ model_type = "dbrx" + sub_configs = {"attn_config": DbrxAttentionConfig, "ffn_config": DbrxFFNConfig} attribute_map = { "num_attention_heads": "n_heads", "hidden_size": "d_model", diff --git a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py index ab5d49b32f..5190ed51ff 100644 --- a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py @@ -17,6 +17,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ..auto import AutoConfig logger = logging.get_logger(__name__) @@ -70,6 +71,7 @@ class EncoderDecoderConfig(PretrainedConfig): ```""" model_type = "encoder-decoder" + sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig} is_composition = True def __init__(self, **kwargs): @@ -84,8 +86,6 @@ class EncoderDecoderConfig(PretrainedConfig): decoder_config = kwargs.pop("decoder") decoder_model_type = decoder_config.pop("model_type") - from ..auto.configuration_auto import AutoConfig - self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config) self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config) self.is_encoder_decoder = True diff --git a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py index ade5b8b266..59a1b02975 100644 --- a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +++ b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py @@ -164,6 +164,7 @@ class FastSpeech2ConformerConfig(PretrainedConfig): ```""" model_type = "fastspeech2_conformer" + 
base_config_key = "model_config" attribute_map = {"num_hidden_layers": "encoder_layers", "num_attention_heads": "encoder_num_attention_heads"} def __init__( @@ -377,6 +378,7 @@ class FastSpeech2ConformerHifiGanConfig(PretrainedConfig): ```""" model_type = "hifigan" + base_config_key = "vocoder_config" def __init__( self, @@ -453,7 +455,7 @@ class FastSpeech2ConformerWithHifiGanConfig(PretrainedConfig): """ model_type = "fastspeech2_conformer_with_hifigan" - is_composition = True + sub_configs = {"model_config": FastSpeech2ConformerConfig, "vocoder_config": FastSpeech2ConformerHifiGanConfig} def __init__( self, diff --git a/src/transformers/models/flava/configuration_flava.py b/src/transformers/models/flava/configuration_flava.py index b6349361c0..47cdb488a2 100644 --- a/src/transformers/models/flava/configuration_flava.py +++ b/src/transformers/models/flava/configuration_flava.py @@ -14,8 +14,7 @@ # limitations under the License. """FLAVA model configurations""" -import os -from typing import Any, Dict, Union +from typing import Any, Dict from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -86,6 +85,7 @@ class FlavaImageConfig(PretrainedConfig): ```""" model_type = "flava_image_model" + base_config_key = "image_config" def __init__( self, @@ -124,24 +124,6 @@ class FlavaImageConfig(PretrainedConfig): self.mask_token = mask_token self.vocab_size = vocab_size - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the image config dict if we are loading from FlavaConfig - if config_dict.get("model_type") == "flava": - config_dict = config_dict["image_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type 
{config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class FlavaTextConfig(PretrainedConfig): r""" @@ -216,6 +198,7 @@ class FlavaTextConfig(PretrainedConfig): ```""" model_type = "flava_text_model" + base_config_key = "text_config" def __init__( self, @@ -254,24 +237,6 @@ class FlavaTextConfig(PretrainedConfig): self.qkv_bias = qkv_bias self.pad_token_id = pad_token_id - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from FlavaConfig - if config_dict.get("model_type") == "flava": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class FlavaMultimodalConfig(PretrainedConfig): r""" @@ -327,6 +292,7 @@ class FlavaMultimodalConfig(PretrainedConfig): ```""" model_type = "flava_multimodal_model" + base_config_key = "multimodal_config" def __init__( self, @@ -357,27 +323,10 @@ class FlavaMultimodalConfig(PretrainedConfig): self.qkv_bias = qkv_bias self.use_cls_token = use_cls_token - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the multimodal config dict if we are loading from FlavaConfig - if config_dict.get("model_type") == "flava": - config_dict = config_dict["multimodal_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class FlavaImageCodebookConfig(PretrainedConfig): model_type = "flava_image_codebook" + base_config_key = "image_codebook_config" r""" [`FlavaImageCodebookConfig`] is the configuration class to store the configuration of a [`FlavaImageCodebook`]. 
It @@ -442,24 +391,6 @@ class FlavaImageCodebookConfig(PretrainedConfig): self.freeze = freeze self.initializer_range = initializer_range - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the image codebook config dict if we are loading from FlavaConfig - if config_dict.get("model_type") == "flava": - config_dict = config_dict["image_codebook_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class FlavaConfig(PretrainedConfig): r""" @@ -532,6 +463,12 @@ class FlavaConfig(PretrainedConfig): """ model_type = "flava" + sub_configs = { + "text_config": FlavaTextConfig, + "image_config": FlavaImageConfig, + "multimodal_config": FlavaMultimodalConfig, + "image_codebook_config": FlavaImageCodebookConfig, + } def __init__( self, diff --git a/src/transformers/models/git/configuration_git.py b/src/transformers/models/git/configuration_git.py index ecaea17ff9..1be3e7067b 100644 --- a/src/transformers/models/git/configuration_git.py +++ b/src/transformers/models/git/configuration_git.py @@ -13,8 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os -from typing import Union from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -72,6 +70,7 @@ class GitVisionConfig(PretrainedConfig): ```""" model_type = "git_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -102,24 +101,6 @@ class GitVisionConfig(PretrainedConfig): self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from GITConfig - if config_dict.get("model_type") == "git": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class GitConfig(PretrainedConfig): r""" @@ -186,6 +167,7 @@ class GitConfig(PretrainedConfig): ```""" model_type = "git" + sub_configs = {"vision_config": GitVisionConfig} def __init__( self, diff --git a/src/transformers/models/groupvit/configuration_groupvit.py b/src/transformers/models/groupvit/configuration_groupvit.py index e608fbcdbe..e85e4fc918 100644 --- a/src/transformers/models/groupvit/configuration_groupvit.py +++ b/src/transformers/models/groupvit/configuration_groupvit.py @@ -14,9 +14,8 @@ # limitations under the License. 
"""GroupViT model configuration""" -import os from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Mapping, Optional, Union +from typing import TYPE_CHECKING, Any, Mapping, Optional from ...configuration_utils import PretrainedConfig from ...onnx import OnnxConfig @@ -86,6 +85,7 @@ class GroupViTTextConfig(PretrainedConfig): ```""" model_type = "groupvit_text_model" + base_config_key = "text_config" def __init__( self, @@ -121,24 +121,6 @@ class GroupViTTextConfig(PretrainedConfig): self.initializer_factor = initializer_factor self.attention_dropout = attention_dropout - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from GroupViTConfig - if config_dict.get("model_type") == "groupvit": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class GroupViTVisionConfig(PretrainedConfig): r""" @@ -197,6 +179,7 @@ class GroupViTVisionConfig(PretrainedConfig): ```""" model_type = "groupvit_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -246,24 +229,6 @@ class GroupViTVisionConfig(PretrainedConfig): self.assign_eps = assign_eps self.assign_mlp_ratio = assign_mlp_ratio - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from GroupViTConfig - if config_dict.get("model_type") == "groupvit": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class GroupViTConfig(PretrainedConfig): r""" @@ -292,6 +257,7 @@ class GroupViTConfig(PretrainedConfig): """ model_type = "groupvit" + sub_configs = {"text_config": GroupViTTextConfig, "vision_config": GroupViTVisionConfig} def __init__( self, diff --git a/src/transformers/models/idefics/configuration_idefics.py b/src/transformers/models/idefics/configuration_idefics.py index 56b6025a8e..e34a576440 100644 --- a/src/transformers/models/idefics/configuration_idefics.py +++ b/src/transformers/models/idefics/configuration_idefics.py @@ -38,7 +38,7 @@ class IdeficsVisionConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. 
Args: - hidden_size (`int`, *optional*, defaults to 768): + embed_dim (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. (elsewhere referred to as `hidden_size`) image_size (`int`, *optional*, defaults to 224): The size (resolution) of each image. @@ -50,12 +50,12 @@ class IdeficsVisionConfig(PretrainedConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. - image_num_channels (`int`, *optional*, defaults to `3`): + num_channels (`int`, *optional*, defaults to 3): Number of image channels. hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. @@ -64,11 +64,9 @@ class IdeficsVisionConfig(PretrainedConfig): initializer_factor (`float`, *optional*, defaults to 1.0): A factor for initializing all weight matrices (should be kept to 1.0, used internally for initialization testing). - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
""" - model_type = "idefics" + model_type = "idefics_vision" attribute_map = { "hidden_size": "embed_dim", } @@ -119,7 +117,7 @@ class IdeficsPerceiverConfig(PretrainedConfig): Args: use_resampler (`bool`, *optional*, defaults to `False`): Whether or not to use the resampler - resampler_n_latents (`int`, *optional*, defaults to ): + resampler_n_latents (`int`, *optional*, defaults to 64): Number of latent embeddings to resample ("compress") the input sequence to (usually < 128). resampler_depth (`int`, *optional*, defaults to 6): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3). @@ -131,7 +129,7 @@ class IdeficsPerceiverConfig(PretrainedConfig): Whether or not to use qk layer norms in perceiver """ - model_type = "idefics" + model_type = "idefics_perciever" def __init__( self, @@ -235,7 +233,7 @@ class IdeficsConfig(PretrainedConfig): ```""" model_type = "idefics" - is_composition = False + sub_configs = {"perceiver_config": IdeficsPerceiverConfig, "vision_config": IdeficsVisionConfig} def __init__( self, diff --git a/src/transformers/models/idefics2/configuration_idefics2.py b/src/transformers/models/idefics2/configuration_idefics2.py index 64743d1cd4..408d374c77 100644 --- a/src/transformers/models/idefics2/configuration_idefics2.py +++ b/src/transformers/models/idefics2/configuration_idefics2.py @@ -13,12 +13,9 @@ # limitations under the License. 
"""Idefics2 model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -76,7 +73,8 @@ class Idefics2VisionConfig(PretrainedConfig): >>> configuration = model.config ```""" - model_type = "idefics2" + model_type = "idefics2_vision" + base_config_key = "vision_config" def __init__( self, @@ -107,24 +105,6 @@ class Idefics2VisionConfig(PretrainedConfig): self.hidden_act = hidden_act self.initializer_range = initializer_range - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Idefics2Config - if config_dict.get("model_type") == "idefics2": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class Idefics2PerceiverConfig(PretrainedConfig): r""" @@ -152,7 +132,7 @@ class Idefics2PerceiverConfig(PretrainedConfig): The dropout ratio for the attention probabilities. 
""" - model_type = "idefics2" + model_type = "idefics2_perceiver" def __init__( self, @@ -220,7 +200,11 @@ class Idefics2Config(PretrainedConfig): ```""" model_type = "idefics2" - is_composition = True + sub_configs = { + "text_config": AutoConfig, + "perceiver_config": Idefics2PerceiverConfig, + "vision_config": Idefics2VisionConfig, + } def __init__( self, diff --git a/src/transformers/models/idefics3/configuration_idefics3.py b/src/transformers/models/idefics3/configuration_idefics3.py index 45afe685f5..4b10d8d2d0 100644 --- a/src/transformers/models/idefics3/configuration_idefics3.py +++ b/src/transformers/models/idefics3/configuration_idefics3.py @@ -13,12 +13,9 @@ # limitations under the License. """Idefics3 model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -57,8 +54,7 @@ class Idefics3VisionConfig(PretrainedConfig): The epsilon used by the layer normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - intializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation for initializing all weight matrices in the model. 
+ initializer_range (``, *optional*, defaults to 0.02): Example: @@ -76,7 +72,8 @@ class Idefics3VisionConfig(PretrainedConfig): >>> configuration = model.config ```""" - model_type = "idefics3" + model_type = "idefics3_vision" + base_config_key = "vision_config" def __init__( self, @@ -107,24 +104,6 @@ class Idefics3VisionConfig(PretrainedConfig): self.hidden_act = hidden_act self.initializer_range = initializer_range - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Idefics3Config - if config_dict.get("model_type") == "idefics3": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class Idefics3Config(PretrainedConfig): r""" @@ -165,7 +144,7 @@ class Idefics3Config(PretrainedConfig): ```""" model_type = "idefics3" - is_composition = True + sub_configs = {"text_config": AutoConfig, "vision_config": Idefics3VisionConfig} def __init__( self, @@ -204,4 +183,4 @@ class Idefics3Config(PretrainedConfig): self.text_config = text_config self.scale_factor = scale_factor - super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings) + super().__init__(**kwargs, pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings) diff --git a/src/transformers/models/instructblip/configuration_instructblip.py b/src/transformers/models/instructblip/configuration_instructblip.py index a274212a94..6124dba3a0 100644 --- a/src/transformers/models/instructblip/configuration_instructblip.py +++ b/src/transformers/models/instructblip/configuration_instructblip.py @@ -14,13 +14,10 @@ # limitations under the License. """InstructBLIP model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -78,6 +75,7 @@ class InstructBlipVisionConfig(PretrainedConfig): ```""" model_type = "instructblip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -108,24 +106,6 @@ class InstructBlipVisionConfig(PretrainedConfig): self.hidden_act = hidden_act self.qkv_bias = qkv_bias - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from InstructBlipConfig - if 
config_dict.get("model_type") == "instructblip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class InstructBlipQFormerConfig(PretrainedConfig): r""" @@ -192,6 +172,7 @@ class InstructBlipQFormerConfig(PretrainedConfig): ```""" model_type = "instructblip_qformer" + base_config_key = "qformer_config" def __init__( self, @@ -229,24 +210,6 @@ class InstructBlipQFormerConfig(PretrainedConfig): self.cross_attention_frequency = cross_attention_frequency self.encoder_hidden_size = encoder_hidden_size - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the qformer config dict if we are loading from InstructBlipConfig - if config_dict.get("model_type") == "instructblip": - config_dict = config_dict["qformer_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class InstructBlipConfig(PretrainedConfig): r""" @@ -305,6 +268,11 @@ class InstructBlipConfig(PretrainedConfig): ```""" model_type = "instructblip" + sub_configs = { + "text_config": AutoConfig, + "qformer_config": InstructBlipQFormerConfig, + "vision_config": InstructBlipVisionConfig, + } def __init__( self, diff --git a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py index e7c8eeccef..14687a96e5 100644 --- a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py @@ -19,13 +19,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -from typing import Union from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -83,6 +81,7 @@ class InstructBlipVideoVisionConfig(PretrainedConfig): ```""" model_type = "instructblipvideo_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -113,24 +112,6 @@ class InstructBlipVideoVisionConfig(PretrainedConfig): self.hidden_act = hidden_act self.qkv_bias = qkv_bias - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from InstructBlipVideoConfig - if config_dict.get("model_type") == "instructblipvideo": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and 
config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class InstructBlipVideoQFormerConfig(PretrainedConfig): r""" @@ -197,6 +178,7 @@ class InstructBlipVideoQFormerConfig(PretrainedConfig): ```""" model_type = "instructblipvideo_qformer" + base_config_key = "qformer_config" def __init__( self, @@ -234,24 +216,6 @@ class InstructBlipVideoQFormerConfig(PretrainedConfig): self.cross_attention_frequency = cross_attention_frequency self.encoder_hidden_size = encoder_hidden_size - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the qformer config dict if we are loading from InstructBlipVideoConfig - if config_dict.get("model_type") == "instructblipvideo": - config_dict = config_dict["qformer_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class InstructBlipVideoConfig(PretrainedConfig): r""" @@ -310,6 +274,11 @@ class InstructBlipVideoConfig(PretrainedConfig): ```""" model_type = "instructblipvideo" + sub_configs = { + "text_config": AutoConfig, + "qformer_config": InstructBlipVideoQFormerConfig, + "vision_config": InstructBlipVideoVisionConfig, + } def __init__( self, diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py index 63c6c48685..b0dc8a2157 100644 --- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py @@ -32,7 +32,7 @@ from transformers.models.instructblip.modeling_instructblip import ( from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -103,6 +103,11 @@ class InstructBlipVideoConfig(PretrainedConfig): ```""" model_type = "instructblipvideo" + sub_configs = { + "text_config": AutoConfig, + "qformer_config": InstructBlipVideoQFormerConfig, + "vision_config": InstructBlipVideoVisionConfig, + } def __init__( self, diff --git a/src/transformers/models/kosmos2/configuration_kosmos2.py b/src/transformers/models/kosmos2/configuration_kosmos2.py index e49074f806..921ec336c0 100644 --- a/src/transformers/models/kosmos2/configuration_kosmos2.py +++ b/src/transformers/models/kosmos2/configuration_kosmos2.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""KOSMOS-2 model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -61,7 +58,7 @@ class Kosmos2TextConfig(PretrainedConfig): layerdrop (`float`, *optional*, defaults to 0.0): The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) for more details. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. @@ -69,9 +66,16 @@ class Kosmos2TextConfig(PretrainedConfig): Scale embeddings by diving by sqrt(embed_dim). use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). + pad_token_id (`int`, *optional*, defaults to 1): + Token id used for padding. + bos_token_id (`int`, *optional*, defaults to 0): + Token id used for beginning of string. + eos_token_id (`int`, *optional*, defaults to 2): + Token id used for end of string. 
```""" model_type = "kosmos_2_text_model" + base_config_key = "text_config" keys_to_ignore_at_inference = ["past_key_values"] attribute_map = { "num_attention_heads": "attention_heads", @@ -124,24 +128,6 @@ class Kosmos2TextConfig(PretrainedConfig): self.scale_embedding = scale_embedding self.use_cache = use_cache - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from Kosmos2Config - if config_dict.get("model_type") == "kosmos-2": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class Kosmos2VisionConfig(PretrainedConfig): r""" @@ -171,18 +157,19 @@ class Kosmos2VisionConfig(PretrainedConfig): hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
- initializer_factor (`float`, *optional*, defaults to 1): + initializer_factor (`float`, *optional*, defaults to 1.0): A factor for initializing all weight matrices (should be kept to 1, used internally for initialization testing). ```""" model_type = "kosmos_2_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -215,24 +202,6 @@ class Kosmos2VisionConfig(PretrainedConfig): self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Kosmos2Config - if config_dict.get("model_type") == "kosmos-2": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class Kosmos2Config(PretrainedConfig): r""" @@ -267,7 +236,7 @@ class Kosmos2Config(PretrainedConfig): ```""" model_type = "kosmos-2" - is_composition = True + sub_configs = {"text_config": Kosmos2TextConfig, "vision_config": Kosmos2VisionConfig} def __init__( self, diff --git a/src/transformers/models/llava/configuration_llava.py b/src/transformers/models/llava/configuration_llava.py index 3a4cb09855..05034f5cfc 100644 --- a/src/transformers/models/llava/configuration_llava.py +++ b/src/transformers/models/llava/configuration_llava.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -73,7 +73,7 @@ class LlavaConfig(PretrainedConfig): ```""" model_type = "llava" - is_composition = True + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/llava_next/configuration_llava_next.py b/src/transformers/models/llava_next/configuration_llava_next.py index e8768dde85..54616edbf9 100644 --- a/src/transformers/models/llava_next/configuration_llava_next.py +++ b/src/transformers/models/llava_next/configuration_llava_next.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -78,7 +78,7 @@ class LlavaNextConfig(PretrainedConfig): ```""" model_type = "llava_next" - is_composition = False + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/llava_next_video/configuration_llava_next_video.py b/src/transformers/models/llava_next_video/configuration_llava_next_video.py index 0e4e39b4b3..2fe889da60 100644 --- 
a/src/transformers/models/llava_next_video/configuration_llava_next_video.py +++ b/src/transformers/models/llava_next_video/configuration_llava_next_video.py @@ -21,7 +21,7 @@ from ...configuration_utils import PretrainedConfig -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig class LlavaNextVideoConfig(PretrainedConfig): @@ -86,7 +86,7 @@ class LlavaNextVideoConfig(PretrainedConfig): ```""" model_type = "llava_next_video" - is_composition = True + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index 8018afa724..002b450c2a 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -31,7 +31,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import ( logging, ) -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -99,7 +99,7 @@ class LlavaNextVideoConfig(PretrainedConfig): ```""" model_type = "llava_next_video" - is_composition = True + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/llava_onevision/configuration_llava_onevision.py b/src/transformers/models/llava_onevision/configuration_llava_onevision.py index eef86c6c8c..46b65b35b1 100644 --- a/src/transformers/models/llava_onevision/configuration_llava_onevision.py +++ b/src/transformers/models/llava_onevision/configuration_llava_onevision.py @@ -18,7 +18,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import ( logging, ) -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -81,7 +81,7 @@ class 
LlavaOnevisionConfig(PretrainedConfig): ```""" model_type = "llava_onevision" - is_composition = False + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/mllama/configuration_mllama.py b/src/transformers/models/mllama/configuration_mllama.py index 539fc61ba4..635ca50320 100644 --- a/src/transformers/models/mllama/configuration_mllama.py +++ b/src/transformers/models/mllama/configuration_mllama.py @@ -13,8 +13,7 @@ # limitations under the License. """Mllama model configuration""" -import os -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional from ...configuration_utils import PretrainedConfig from ...modeling_rope_utils import rope_config_validation @@ -59,7 +58,7 @@ class MllamaVisionConfig(PretrainedConfig): The size (resolution) of each image *tile*. patch_size (`int`, *optional*, defaults to 14): The size (resolution) of each patch. - norm_eps (`float`, *optional*, defaults to 1e-5): + norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. max_num_tiles (`int`, *optional*, defaults to 4): Maximum number of tiles for image splitting. 
@@ -88,6 +87,7 @@ class MllamaVisionConfig(PretrainedConfig): ```""" model_type = "mllama_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -137,23 +137,6 @@ class MllamaVisionConfig(PretrainedConfig): def max_aspect_ratio_id(self) -> int: return len(self.supported_aspect_ratios) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "mllama": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class MllamaTextConfig(PretrainedConfig): r""" @@ -178,12 +161,12 @@ class MllamaTextConfig(PretrainedConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 32): Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*): + num_key_value_heads (`int`, *optional*, defaults to 8): This is the number of key_value heads that should be used to implement Grouped Query Attention. If not specified, will default to `num_attention_heads`. intermediate_size (`int`, *optional*, defaults to 14336): Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. - rope_theta (`float`, *optional*, defaults to 500000.0): + rope_theta (`float`, *optional*, defaults to `500000.0`): The base period of the RoPE embeddings. 
rope_scaling (`Dict`, *optional*): Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type @@ -259,6 +242,7 @@ class MllamaTextConfig(PretrainedConfig): ```""" model_type = "mllama_text_model" + base_config_key = "text_config" def __init__( self, @@ -311,23 +295,6 @@ class MllamaTextConfig(PretrainedConfig): **kwargs, ) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "mllama": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class MllamaConfig(PretrainedConfig): r""" @@ -370,7 +337,7 @@ class MllamaConfig(PretrainedConfig): ```""" model_type = "mllama" - is_composition = True + sub_configs = {"text_config": MllamaTextConfig, "vision_config": MllamaVisionConfig} def __init__( self, diff --git a/src/transformers/models/moshi/configuration_moshi.py b/src/transformers/models/moshi/configuration_moshi.py index 654e4e82a4..1b31141f02 100644 --- a/src/transformers/models/moshi/configuration_moshi.py +++ b/src/transformers/models/moshi/configuration_moshi.py @@ -235,8 +235,8 @@ class MoshiConfig(PretrainedConfig): ```""" model_type = "moshi" - is_composition = True keys_to_ignore_at_inference = ["past_key_values"] + sub_configs = {"audio_encoder_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/mpt/configuration_mpt.py b/src/transformers/models/mpt/configuration_mpt.py index ed822c813b..8ee3f8c0c0 100644 --- a/src/transformers/models/mpt/configuration_mpt.py +++ b/src/transformers/models/mpt/configuration_mpt.py @@ -41,22 +41,22 @@ class MptAttentionConfig(PretrainedConfig): Args: attn_type (`str`, *optional*, defaults to `"multihead_attention"`): type of attention to use. Options: `"multihead_attention"`, `"multiquery_attention"`. - attn_pdrop (`float`, *optional*, defaults to 0.0): + attn_pdrop (`float`, *optional*, defaults to `0.0`): The dropout probability for the attention layers. attn_impl (`str`, *optional*, defaults to `"torch"`): The attention implementation to use. One of `"torch"`, `"flash"`, or `"triton"`. clip_qkv (`float`, *optional*): If not `None`, clip the queries, keys, and values in the attention layer to this value. - softmax_scale (`float`, *optional*, defaults to `None`): + softmax_scale (`float`, *optional*): If not `None`, scale the softmax in the attention layer by this value. If `None`, will default to `1/sqrt(hidden_size)`. 
- prefix_lm (`bool`, *optional*, defaults to `False`)): + prefix_lm (`bool`, *optional*, defaults to `False`): Whether the model should operate as a Prefix LM. This requires passing an extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix can attend to one another bi-directionally. Tokens outside the prefix use causal attention. qk_ln (`bool`, *optional*, defaults to `False`): Whether to apply layer normalization to the queries and keys in the attention layer. - attn_uses_sequence_id (`bool`, *optional*, defaults to `False`)): + attn_uses_sequence_id (`bool`, *optional*, defaults to `False`): Whether to restrict attention to tokens that have the same token_type_ids. When the model is in `train` mode, this requires passing an extra *token_type_ids* argument which indicates which sub-sequence each token belongs to. Defaults to `False` meaning any provided *token_type_ids* will be ignored. @@ -66,6 +66,8 @@ class MptAttentionConfig(PretrainedConfig): The maximum value of the alibi bias. """ + base_config_key = "attn_config" + def __init__( self, attn_type="multihead_attention", @@ -97,23 +99,6 @@ class MptAttentionConfig(PretrainedConfig): f"`attn_type` has to be either `multihead_attention` or `multiquery_attention`. Received: {attn_type}" ) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "mpt": - config_dict = config_dict["attn_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class MptConfig(PretrainedConfig): """ @@ -188,6 +173,7 @@ class MptConfig(PretrainedConfig): """ model_type = "mpt" + sub_configs = {"attn_config": MptAttentionConfig} attribute_map = { "num_attention_heads": "n_heads", "hidden_size": "d_model", diff --git a/src/transformers/models/musicgen/configuration_musicgen.py b/src/transformers/models/musicgen/configuration_musicgen.py index 0d282355de..00c0307219 100644 --- a/src/transformers/models/musicgen/configuration_musicgen.py +++ b/src/transformers/models/musicgen/configuration_musicgen.py @@ -76,6 +76,7 @@ class MusicgenDecoderConfig(PretrainedConfig): """ model_type = "musicgen_decoder" + base_config_key = "decoder_config" keys_to_ignore_at_inference = ["past_key_values"] def __init__( @@ -189,6 +190,11 @@ class MusicgenConfig(PretrainedConfig): ```""" model_type = "musicgen" + sub_configs = { + "text_encoder": AutoConfig, + "audio_encoder": AutoConfig, + "decoder": MusicgenDecoderConfig, + } is_composition = True def __init__(self, **kwargs): diff --git a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py index 8a77cea025..e65ad50021 100644 --- a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py @@ -78,6 +78,7 @@ class MusicgenMelodyDecoderConfig(PretrainedConfig): """ model_type = "musicgen_melody_decoder" + base_config_key = "decoder_config" keys_to_ignore_at_inference = ["past_key_values"] def __init__( @@ -195,6 +196,11 @@ class MusicgenMelodyConfig(PretrainedConfig): ```""" model_type = "musicgen_melody" + sub_configs = { + "text_encoder": AutoConfig, + "audio_encoder": AutoConfig, + "decoder": MusicgenMelodyDecoderConfig, + } is_composition = True def __init__( diff --git a/src/transformers/models/owlv2/configuration_owlv2.py 
b/src/transformers/models/owlv2/configuration_owlv2.py index 43019553c5..f9085eaf9c 100644 --- a/src/transformers/models/owlv2/configuration_owlv2.py +++ b/src/transformers/models/owlv2/configuration_owlv2.py @@ -14,8 +14,7 @@ # limitations under the License. """OWLv2 model configuration""" -import os -from typing import TYPE_CHECKING, Dict, Union +from typing import TYPE_CHECKING, Dict if TYPE_CHECKING: @@ -90,6 +89,7 @@ class Owlv2TextConfig(PretrainedConfig): ```""" model_type = "owlv2_text_model" + base_config_key = "text_config" def __init__( self, @@ -123,24 +123,6 @@ class Owlv2TextConfig(PretrainedConfig): self.initializer_range = initializer_range self.initializer_factor = initializer_factor - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from Owlv2Config - if config_dict.get("model_type") == "owlv2": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - # Copied from transformers.models.owlvit.configuration_owlvit.OwlViTVisionConfig with OwlViT->Owlv2, owlvit-base-patch32->owlv2-base-patch16, owlvit->owlv2, OWL-ViT->OWLv2, 32->16 class Owlv2VisionConfig(PretrainedConfig): @@ -197,6 +179,7 @@ class Owlv2VisionConfig(PretrainedConfig): ```""" model_type = "owlv2_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -229,24 +212,6 @@ class Owlv2VisionConfig(PretrainedConfig): self.initializer_range = initializer_range self.initializer_factor = initializer_factor - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Owlv2Config - if config_dict.get("model_type") == "owlv2": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - # Copied from transformers.models.owlvit.configuration_owlvit.OwlViTConfig with OwlViT->Owlv2, owlvit-base-patch32->owlv2-base-patch16, owlvit->owlv2, OWL-ViT->OWLv2 class Owlv2Config(PretrainedConfig): @@ -276,6 +241,7 @@ class Owlv2Config(PretrainedConfig): """ model_type = "owlv2" + sub_configs = {"text_config": Owlv2TextConfig, "vision_config": Owlv2VisionConfig} def __init__( self, @@ -304,20 +270,6 @@ class Owlv2Config(PretrainedConfig): self.return_dict = return_dict self.initializer_factor = 1.0 - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - @classmethod def from_text_vision_configs(cls, text_config: Dict, vision_config: Dict, **kwargs): r""" diff --git a/src/transformers/models/owlvit/configuration_owlvit.py b/src/transformers/models/owlvit/configuration_owlvit.py index 877b348f32..8be707ce99 100644 --- a/src/transformers/models/owlvit/configuration_owlvit.py +++ b/src/transformers/models/owlvit/configuration_owlvit.py @@ -14,9 +14,8 @@ # limitations under the License. 
"""OWL-ViT model configuration""" -import os from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional if TYPE_CHECKING: @@ -92,6 +91,7 @@ class OwlViTTextConfig(PretrainedConfig): ```""" model_type = "owlvit_text_model" + base_config_key = "text_config" def __init__( self, @@ -125,24 +125,6 @@ class OwlViTTextConfig(PretrainedConfig): self.initializer_range = initializer_range self.initializer_factor = initializer_factor - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from OwlViTConfig - if config_dict.get("model_type") == "owlvit": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class OwlViTVisionConfig(PretrainedConfig): r""" @@ -198,6 +180,7 @@ class OwlViTVisionConfig(PretrainedConfig): ```""" model_type = "owlvit_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -230,24 +213,6 @@ class OwlViTVisionConfig(PretrainedConfig): self.initializer_range = initializer_range self.initializer_factor = initializer_factor - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from OwlViTConfig - if config_dict.get("model_type") == "owlvit": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class OwlViTConfig(PretrainedConfig): r""" @@ -276,6 +241,7 @@ class OwlViTConfig(PretrainedConfig): """ model_type = "owlvit" + sub_configs = {"text_config": OwlViTTextConfig, "vision_config": OwlViTVisionConfig} def __init__( self, @@ -304,20 +270,6 @@ class OwlViTConfig(PretrainedConfig): self.return_dict = return_dict self.initializer_factor = 1.0 - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - @classmethod def from_text_vision_configs(cls, text_config: Dict, vision_config: Dict, **kwargs): r""" diff --git a/src/transformers/models/paligemma/configuration_paligemma.py b/src/transformers/models/paligemma/configuration_paligemma.py index 64598436db..de60c50129 100644 --- a/src/transformers/models/paligemma/configuration_paligemma.py +++ b/src/transformers/models/paligemma/configuration_paligemma.py @@ -17,7 +17,7 @@ import warnings from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -73,7 +73,7 @@ class PaliGemmaConfig(PretrainedConfig): ```""" model_type = "paligemma" - is_composition = False + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py 
b/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py index deb276f334..925aa60a8d 100644 --- a/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py +++ b/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -157,7 +157,7 @@ class Qwen2AudioConfig(PretrainedConfig): ```""" model_type = "qwen2_audio" - is_composition = False + sub_configs = {"text_config": AutoConfig, "audio_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index 1349006e76..55042327de 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""Qwen2VL model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...modeling_rope_utils import rope_config_validation from ...utils import logging @@ -27,6 +24,7 @@ logger = logging.get_logger(__name__) class Qwen2VLVisionConfig(PretrainedConfig): model_type = "qwen2_vl" + base_config_key = "vision_config" def __init__( self, @@ -55,23 +53,6 @@ class Qwen2VLVisionConfig(PretrainedConfig): self.spatial_merge_size = spatial_merge_size self.temporal_patch_size = temporal_patch_size - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "qwen2_vl": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class Qwen2VLConfig(PretrainedConfig): r""" @@ -180,6 +161,7 @@ class Qwen2VLConfig(PretrainedConfig): ```""" model_type = "qwen2_vl" + sub_configs = {"vision_config": Qwen2VLVisionConfig} keys_to_ignore_at_inference = ["past_key_values"] def __init__( diff --git a/src/transformers/models/siglip/configuration_siglip.py b/src/transformers/models/siglip/configuration_siglip.py index 73622373cb..cc8fae93cd 100644 --- a/src/transformers/models/siglip/configuration_siglip.py +++ b/src/transformers/models/siglip/configuration_siglip.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""Siglip model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -79,6 +76,7 @@ class SiglipTextConfig(PretrainedConfig): ```""" model_type = "siglip_text_model" + base_config_key = "text_config" def __init__( self, @@ -110,24 +108,6 @@ class SiglipTextConfig(PretrainedConfig): self.hidden_act = hidden_act self.attention_dropout = attention_dropout - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from SiglipConfig - if config_dict.get("model_type") == "siglip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class SiglipVisionConfig(PretrainedConfig): r""" @@ -178,6 +158,7 @@ class SiglipVisionConfig(PretrainedConfig): ```""" model_type = "siglip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -206,24 +187,6 @@ class SiglipVisionConfig(PretrainedConfig): self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from SiglipConfig - if config_dict.get("model_type") == "siglip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class SiglipConfig(PretrainedConfig): r""" @@ -268,6 +231,7 @@ class SiglipConfig(PretrainedConfig): ```""" model_type = "siglip" + sub_configs = {"text_config": SiglipTextConfig, "vision_config": SiglipVisionConfig} def __init__(self, text_config=None, vision_config=None, **kwargs): super().__init__(**kwargs) diff --git a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py index 32a58ec558..d7e0211610 100644 --- a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py @@ -71,6 +71,7 @@ class SpeechEncoderDecoderConfig(PretrainedConfig): ```""" model_type = "speech-encoder-decoder" + sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig} is_composition = True def __init__(self, **kwargs): diff --git a/src/transformers/models/video_llava/configuration_video_llava.py b/src/transformers/models/video_llava/configuration_video_llava.py index 8738a02585..87d96ca24f 100644 --- a/src/transformers/models/video_llava/configuration_video_llava.py +++ b/src/transformers/models/video_llava/configuration_video_llava.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -78,7 +78,7 @@ class VideoLlavaConfig(PretrainedConfig): ```""" model_type = "video_llava" - is_composition = False + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/vipllava/configuration_vipllava.py b/src/transformers/models/vipllava/configuration_vipllava.py index f88be5adfb..f26c2b2f50 100644 --- a/src/transformers/models/vipllava/configuration_vipllava.py +++ 
b/src/transformers/models/vipllava/configuration_vipllava.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -72,7 +72,7 @@ class VipLlavaConfig(PretrainedConfig): ```""" model_type = "vipllava" - is_composition = False + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py index a4aa663f98..59678f2573 100644 --- a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py @@ -78,6 +78,7 @@ class VisionEncoderDecoderConfig(PretrainedConfig): ```""" model_type = "vision-encoder-decoder" + sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig} is_composition = True def __init__(self, **kwargs): diff --git a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py index 4cea34ca23..0d79720e1a 100644 --- a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py @@ -75,6 +75,7 @@ class VisionTextDualEncoderConfig(PretrainedConfig): ```""" model_type = "vision-text-dual-encoder" + sub_configs = {"vision_config": AutoConfig, "text_config": AutoConfig} is_composition = True def __init__(self, projection_dim=512, logit_scale_init_value=2.6592, **kwargs): diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py index 
827046b6c3..3d3b92d2c8 100644 --- a/src/transformers/models/x_clip/configuration_x_clip.py +++ b/src/transformers/models/x_clip/configuration_x_clip.py @@ -14,9 +14,6 @@ # limitations under the License. """X-CLIP model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -79,6 +76,7 @@ class XCLIPTextConfig(PretrainedConfig): ```""" model_type = "xclip_text_model" + base_config_key = "text_config" def __init__( self, @@ -112,24 +110,6 @@ class XCLIPTextConfig(PretrainedConfig): self.initializer_factor = initializer_factor self.attention_dropout = attention_dropout - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from XCLIPConfig - if config_dict.get("model_type") == "xclip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class XCLIPVisionConfig(PretrainedConfig): r""" @@ -195,6 +175,7 @@ class XCLIPVisionConfig(PretrainedConfig): ```""" model_type = "xclip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -239,24 +220,6 @@ class XCLIPVisionConfig(PretrainedConfig): self.hidden_act = hidden_act self.drop_path_rate = drop_path_rate - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from XCLIPConfig - if config_dict.get("model_type") == "xclip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class XCLIPConfig(PretrainedConfig): r""" @@ -295,6 +258,7 @@ class XCLIPConfig(PretrainedConfig): """ model_type = "xclip" + sub_configs = {"text_config": XCLIPTextConfig, "vision_config": XCLIPVisionConfig} def __init__( self, diff --git a/tests/models/align/test_modeling_align.py b/tests/models/align/test_modeling_align.py index ddeb585a75..3c7e679686 100644 --- a/tests/models/align/test_modeling_align.py +++ b/tests/models/align/test_modeling_align.py @@ -457,11 +457,20 @@ class AlignModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = AlignModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=AlignConfig, + has_text_modality=False, + common_properties=["projection_dim", "temperature_init_value"], + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="Start to fail after using torch `cu118`.") def test_multi_gpu_data_parallel_forward(self): super().test_multi_gpu_data_parallel_forward() diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index 0175e562ed..658e2e38d9 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -452,11 +452,20 @@ class AltCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) def setUp(self): self.model_tester = AltCLIPModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=AltCLIPConfig, + has_text_modality=False, + common_properties=["projection_dim", "logit_scale_init_value"], + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + 
self.config_tester.run_common_tests() + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py index d542757cbf..7e1dbbe6bb 100644 --- a/tests/models/blip/test_modeling_blip.py +++ b/tests/models/blip/test_modeling_blip.py @@ -449,11 +449,18 @@ class BlipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = BlipModelTester(self) + common_properties = ["logit_scale_init_value", "image_text_hidden_size", "projection_dim", "label_smoothing"] + self.config_tester = ConfigTester( + self, config_class=BlipConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 1ec9c2e1c0..0943661b96 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -482,6 +482,13 @@ class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationT def setUp(self): self.model_tester = Blip2ForConditionalGenerationDecoderOnlyModelTester(self) + common_properties = ["image_token_index", "num_query_tokens", "image_text_hidden_size"] + self.config_tester = ConfigTester( + self, config_class=Blip2Config, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_for_conditional_generation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git 
a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 9f8cc62d2e..60b77d0efa 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -515,11 +515,18 @@ class ClapModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = ClapModelTester(self) + common_properties = ["logit_scale_init_value", "projection_hidden_act", "projection_dim"] + self.config_tester = ConfigTester( + self, config_class=ClapConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index a7c8c8ef84..fa5de84e06 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -745,11 +745,18 @@ class CLIPModelTest(CLIPModelTesterMixin, PipelineTesterMixin, unittest.TestCase def setUp(self): self.model_tester = CLIPModelTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=CLIPConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 75ffa7ad23..b2b047bb50 100644 --- 
a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -472,11 +472,18 @@ class CLIPSegModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) def setUp(self): self.model_tester = CLIPSegModelTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=CLIPSegConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + def test_model_for_image_segmentation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_for_image_segmentation(*config_and_inputs) diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py index 12e5850006..a212b4781d 100644 --- a/tests/models/clvp/test_modeling_clvp.py +++ b/tests/models/clvp/test_modeling_clvp.py @@ -414,7 +414,13 @@ class ClvpModelForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase) def setUp(self): self.model_tester = ClvpModelForConditionalGenerationTester(self) - self.clvp_config_tester = ConfigTester(self, config_class=ClvpConfig, hidden_size=32) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.clvp_config_tester = ConfigTester( + self, config_class=ClvpConfig, has_text_modality=False, common_properties=common_properties, hidden_size=32 + ) + + def test_config(self): + self.clvp_config_tester.run_common_tests() def tearDown(self): super().tearDown() diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py index d8c8f385e9..1c35fd705c 100644 --- a/tests/models/flava/test_modeling_flava.py +++ b/tests/models/flava/test_modeling_flava.py @@ -931,11 +931,18 @@ class 
FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = self.class_for_tester(self) + common_properties = ["projection_dim", "logit_scale_init_value", "init_codebook"] + self.config_tester = ConfigTester( + self, config_class=FlavaConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py index ce31bc44a6..88b55ec56d 100644 --- a/tests/models/groupvit/test_modeling_groupvit.py +++ b/tests/models/groupvit/test_modeling_groupvit.py @@ -559,11 +559,18 @@ class GroupViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase def setUp(self): self.model_tester = GroupViTModelTester(self) + common_properties = ["projection_dim", "projection_intermediate_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=GroupViTConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="hidden_states are tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index 3dcd0bf5fb..ae8c91f29d 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -185,7 +185,12 @@ class Idefics2ModelTest(ModelTesterMixin, 
unittest.TestCase): def setUp(self): self.model_tester = Idefics2VisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Idefics2Config, has_text_modality=False) + self.config_tester = ConfigTester( + self, config_class=Idefics2Config, has_text_modality=False, common_properties=["image_token_id"] + ) + + def test_config(self): + self.config_tester.run_common_tests() @unittest.skip(reason="input_embeds cannot be passed in without input_ids") def test_inputs_embeds(): diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py index 598f588247..5bfd4c3f3c 100644 --- a/tests/models/idefics3/test_modeling_idefics3.py +++ b/tests/models/idefics3/test_modeling_idefics3.py @@ -168,7 +168,12 @@ class Idefics3ModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = Idefics3VisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Idefics3Config, has_text_modality=False) + self.config_tester = ConfigTester( + self, config_class=Idefics3Config, has_text_modality=False, common_properties=["image_token_id"] + ) + + def test_config(self): + self.config_tester.run_common_tests() @unittest.skip(reason="input_embeds cannot be passed in without input_ids") def test_inputs_embeds(): diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index f06caeb037..e77577dad7 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -486,6 +486,15 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene def setUp(self): self.model_tester = InstructBlipForConditionalGenerationDecoderOnlyModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=InstructBlipConfig, + has_text_modality=False, + common_properties=["num_query_tokens", "image_token_index"], + ) + + def 
test_config(self): + self.config_tester.run_common_tests() def test_for_conditional_generation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 7e0bf4eaf0..3be5f89325 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -510,11 +510,18 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest( def setUp(self): self.model_tester = InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester(self) + common_properties = ["num_query_tokens", "video_token_index"] + self.config_tester = ConfigTester( + self, config_class=InstructBlipVideoConfig, has_text_modality=False, common_properties=common_properties + ) def test_for_conditional_generation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 43266a750b..7ede47a348 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -304,7 +304,12 @@ class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) def setUp(self): self.model_tester = Kosmos2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=Kosmos2Config, hidden_size=37) + self.config_tester = ConfigTester( + self, config_class=Kosmos2Config, has_text_modality=False, common_properties=["latent_query_num"] + ) + + def test_config(self): + self.config_tester.run_common_tests() # overwrite from 
common to skip `image_to_text_projection.latent_query` def test_initialization(self): diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index 9810ff7c2a..1359e16a3d 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -194,7 +194,13 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM def setUp(self): self.model_tester = LlavaVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=LlavaConfig, has_text_modality=False) + common_properties = ["image_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=LlavaConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs def test_inputs_embeds(self): diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index 2146c94c18..7ce57dcba3 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -223,7 +223,13 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes def setUp(self): self.model_tester = LlavaNextVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=LlavaNextConfig, has_text_modality=False) + common_properties = ["image_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=LlavaNextConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git 
a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py index 89cdce65ec..3ebb5752bd 100644 --- a/tests/models/llava_next_video/test_modeling_llava_next_video.py +++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -240,7 +240,13 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati def setUp(self): self.model_tester = LlavaNextVideoVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=LlavaNextVideoConfig, has_text_modality=False) + common_properties = ["image_token_index", "video_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=LlavaNextVideoConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/llava_onevision/test_modeling_llava_onevision.py b/tests/models/llava_onevision/test_modeling_llava_onevision.py index 7a5781fa03..a217eee2c7 100644 --- a/tests/models/llava_onevision/test_modeling_llava_onevision.py +++ b/tests/models/llava_onevision/test_modeling_llava_onevision.py @@ -226,7 +226,13 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati def setUp(self): self.model_tester = LlavaOnevisionVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=LlavaOnevisionConfig, has_text_modality=False) + common_properties = ["image_token_index", "video_token_index", "vision_feature_layer"] + self.config_tester = ConfigTester( + self, config_class=LlavaOnevisionConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_initialization(self): config, inputs_dict = 
self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/mllama/test_modeling_mllama.py b/tests/models/mllama/test_modeling_mllama.py index 91f2169a02..9ed5d67822 100644 --- a/tests/models/mllama/test_modeling_mllama.py +++ b/tests/models/mllama/test_modeling_mllama.py @@ -272,7 +272,12 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester def setUp(self): self.model_tester = MllamaVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=MllamaConfig, has_text_modality=False) + self.config_tester = ConfigTester( + self, config_class=MllamaConfig, has_text_modality=False, common_properties=["image_token_index"] + ) + + def test_config(self): + self.config_tester.run_common_tests() # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs def test_inputs_embeds(self): diff --git a/tests/models/owlv2/test_modeling_owlv2.py b/tests/models/owlv2/test_modeling_owlv2.py index 48070c7bb8..df763aed48 100644 --- a/tests/models/owlv2/test_modeling_owlv2.py +++ b/tests/models/owlv2/test_modeling_owlv2.py @@ -447,6 +447,13 @@ class Owlv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = Owlv2ModelTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=Owlv2Config, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py index a08fae0bc6..e0599a50fb 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -442,6 +442,13 @@ class OwlViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): 
self.model_tester = OwlViTModelTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=OwlViTConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 6c04ba40df..afd45dc016 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -232,6 +232,9 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas self.model_tester = Qwen2VLVisionText2TextModelTester(self) self.config_tester = ConfigTester(self, config_class=Qwen2VLConfig, has_text_modality=False) + def test_config(self): + self.config_tester.run_common_tests() + def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/siglip/test_modeling_siglip.py b/tests/models/siglip/test_modeling_siglip.py index 2fe06b1511..61ac78f102 100644 --- a/tests/models/siglip/test_modeling_siglip.py +++ b/tests/models/siglip/test_modeling_siglip.py @@ -667,9 +667,12 @@ class SiglipModelTest(SiglipModelTesterMixin, PipelineTesterMixin, unittest.Test test_disk_offload_bin = False _is_composite = True - # Copied from tests.models.clip.test_modeling_clip.CLIPModelTest.setUp with CLIP->Siglip def setUp(self): self.model_tester = SiglipModelTester(self) + self.config_tester = ConfigTester(self, config_class=SiglipConfig, has_text_modality=False) + + def test_config(self): + self.config_tester.run_common_tests() # Copied from tests.models.clip.test_modeling_clip.CLIPModelTest.test_model def test_model(self): diff --git a/tests/models/video_llava/test_modeling_video_llava.py 
b/tests/models/video_llava/test_modeling_video_llava.py index 0044ef0272..4da6dc19ad 100644 --- a/tests/models/video_llava/test_modeling_video_llava.py +++ b/tests/models/video_llava/test_modeling_video_llava.py @@ -217,7 +217,13 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe def setUp(self): self.model_tester = VideoLlavaVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=VideoLlavaConfig, has_text_modality=False) + common_properties = ["image_token_index", "video_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=VideoLlavaConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" diff --git a/tests/models/vipllava/test_modeling_vipllava.py b/tests/models/vipllava/test_modeling_vipllava.py index e2f9ae1ccf..25670d782a 100644 --- a/tests/models/vipllava/test_modeling_vipllava.py +++ b/tests/models/vipllava/test_modeling_vipllava.py @@ -179,7 +179,13 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest def setUp(self): self.model_tester = VipLlavaVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=VipLlavaConfig, has_text_modality=False) + common_properties = ["image_token_index", "vision_feature_layers", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=VipLlavaConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs def test_inputs_embeds(self): diff --git a/tests/models/x_clip/test_modeling_x_clip.py 
b/tests/models/x_clip/test_modeling_x_clip.py index 8b91019bae..04dd2d9d29 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -547,6 +547,13 @@ class XCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = XCLIPModelTester(self) + common_properties = ["projection_dim", "prompt_layers", "prompt_num_attention_heads"] + self.config_tester = ConfigTester( + self, config_class=XCLIPConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py index 81c6a008b1..4dbbdedbbc 100644 --- a/tests/test_configuration_common.py +++ b/tests/test_configuration_common.py @@ -17,12 +17,17 @@ import copy import json import os import tempfile +from pathlib import Path from transformers import is_torch_available +from transformers.utils import direct_transformers_import from .utils.test_configuration_utils import config_common_kwargs +transformers_module = direct_transformers_import(Path(__file__).parent) + + class ConfigTester: def __init__(self, parent, config_class=None, has_text_modality=True, common_properties=None, **kwargs): self.parent = parent @@ -35,9 +40,10 @@ class ConfigTester: config = self.config_class(**self.inputs_dict) common_properties = ( ["hidden_size", "num_attention_heads", "num_hidden_layers"] - if self.common_properties is None + if self.common_properties is None and not self.config_class.sub_configs else self.common_properties ) + common_properties = [] if common_properties is None else common_properties # Add common fields for text models if self.has_text_modality: @@ -110,6 +116,44 @@ class ConfigTester: self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) + def 
create_and_test_config_from_and_save_pretrained_composite(self): + """ + Tests that composite or nested configs can be loaded and saved correctly. In case the config + has a sub-config, we should be able to call `sub_config.from_pretrained('general_config_file')` + and get a result same as if we loaded the whole config and obtained `config.sub_config` from it. + """ + config = self.config_class(**self.inputs_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + config.save_pretrained(tmpdirname) + general_config_loaded = self.config_class.from_pretrained(tmpdirname) + general_config_dict = config.to_dict() + + # Iterate over all sub_configs if there are any and load them with their own classes + sub_configs = self.config_class.sub_configs + for sub_config_key, sub_class in sub_configs.items(): + if sub_class.__name__ == "AutoConfig": + sub_class = sub_class.for_model(**general_config_dict[sub_config_key]).__class__ + sub_config_loaded = sub_class.from_pretrained(tmpdirname) + else: + sub_config_loaded = sub_class.from_pretrained(tmpdirname) + + # Pop `transformers_version`, it never exists when a config is part of a general composite config + # Verify that loading with subconfig class results in same dict as if we loaded with general composite config class + sub_config_loaded_dict = sub_config_loaded.to_dict() + sub_config_loaded_dict.pop("transformers_version", None) + self.parent.assertEqual(sub_config_loaded_dict, general_config_dict[sub_config_key]) + + # Verify that the loaded config type is same as in the general config + type_from_general_config = type(getattr(general_config_loaded, sub_config_key)) + self.parent.assertTrue(isinstance(sub_config_loaded, type_from_general_config)) + + # Now save only the sub-config and load it back to make sure the whole load-save-load pipeline works + with tempfile.TemporaryDirectory() as tmpdirname2: + sub_config_loaded.save_pretrained(tmpdirname2) + sub_config_loaded_2 = sub_class.from_pretrained(tmpdirname2) +
self.parent.assertEqual(sub_config_loaded.to_dict(), sub_config_loaded_2.to_dict()) + def create_and_test_config_with_num_labels(self): config = self.config_class(**self.inputs_dict, num_labels=5) self.parent.assertEqual(len(config.id2label), 5) @@ -128,6 +172,9 @@ class ConfigTester: self.parent.assertIsNotNone(config) def check_config_arguments_init(self): + if self.config_class.sub_configs: + return # TODO: @raushan composite models are not consistent in how they set general params + kwargs = copy.deepcopy(config_common_kwargs) config = self.config_class(**kwargs) wrong_values = [] @@ -153,6 +200,7 @@ class ConfigTester: self.create_and_test_config_to_json_file() self.create_and_test_config_from_and_save_pretrained() self.create_and_test_config_from_and_save_pretrained_subfolder() + self.create_and_test_config_from_and_save_pretrained_composite() self.create_and_test_config_with_num_labels() self.check_config_can_be_init_without_params() self.check_config_arguments_init() diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 13c4d5155b..c7a11ff0ac 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -3802,22 +3802,18 @@ class ModelTesterMixin: self.skipTest("Model is not a composite model.") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - sub_configs = { - key: getattr(config, key) for key in config if isinstance(getattr(config, key), PretrainedConfig) - } # set eager as it will be the one supported in all models # we just need to test if passing 'attn_implementation' as a dict fails or not attn_implementation_per_subconfig = {} - for key, sub_config in sub_configs.items(): + for key in config.sub_configs.keys(): attn_implementation_per_subconfig[key] = "eager" config._attn_implementation = attn_implementation_per_subconfig model = model_class(config) - for key in model.config: - if isinstance(getattr(model.config, key), PretrainedConfig): - sub_config = 
getattr(model.config, key) - self.assertTrue(sub_config._attn_implementation == "eager") + for key in config.sub_configs.keys(): + sub_config = getattr(model.config, key) + self.assertTrue(sub_config._attn_implementation == "eager") for name, submodule in model.named_modules(): class_name = submodule.__class__.__name__ @@ -3826,7 +3822,7 @@ class ModelTesterMixin: or "SdpaSelfAttention" in class_name or "FlashAttention" in class_name ): - raise ValueError("The eager model should not have SDPA/FA2 attention layers") + raise ValueError(f"The eager model should not have SDPA/FA2 attention layers but got {class_name}") @require_torch_sdpa def test_sdpa_can_dispatch_non_composite_models(self):