Load sub-configs from composite configs (#34410)
* save/load sub-configs * nit forgot these * fix copies * move test to common * use dict for sub-configs * add load-save-load test * clean up modeling check * oops these are correct keys * fix some tests, missed some composite configs * this model was missed
This commit is contained in:
committed by
GitHub
parent
5e1fd4e204
commit
893ad04fad
@@ -190,6 +190,8 @@ class PretrainedConfig(PushToHubMixin):
|
||||
"""
|
||||
|
||||
model_type: str = ""
|
||||
base_config_key: str = ""
|
||||
sub_configs: Dict[str, "PretrainedConfig"] = {}
|
||||
is_composition: bool = False
|
||||
attribute_map: Dict[str, str] = {}
|
||||
_auto_class: Optional[str] = None
|
||||
@@ -543,11 +545,22 @@ class PretrainedConfig(PushToHubMixin):
|
||||
cls._set_token_in_kwargs(kwargs, token)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
if cls.base_config_key and cls.base_config_key in config_dict:
|
||||
config_dict = config_dict[cls.base_config_key]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
# sometimes the config has no `base_config_key` if the config is used in several composite models
|
||||
# e.g. LlamaConfig. In that case we try to see if there is match in `model_type` before raising a warning
|
||||
for k, v in config_dict.items():
|
||||
if isinstance(v, dict) and v.get("model_type") == cls.model_type:
|
||||
config_dict = v
|
||||
|
||||
# raise warning only if we still can't see a match in `model_type`
|
||||
if config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
@@ -1608,15 +1608,14 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
|
||||
# Below we check if a config is composite and manually prepare a dict of attn impl if not already passed as a dict.
|
||||
# Later each sub-module will dispatch with its own attn impl, by calling `XXXModel._from_config(config.text_config)`
|
||||
# If any of sub-modules doesn't support requested attn, an error will be raised. See https://github.com/huggingface/transformers/pull/32238
|
||||
for key in config:
|
||||
if isinstance(getattr(config, key), PretrainedConfig):
|
||||
sub_config = getattr(config, key)
|
||||
curr_attn_implementation = (
|
||||
requested_attn_implementation
|
||||
if not isinstance(requested_attn_implementation, dict)
|
||||
else requested_attn_implementation.get(key, None)
|
||||
)
|
||||
sub_config._attn_implementation_internal = curr_attn_implementation
|
||||
for key in config.sub_configs.keys():
|
||||
sub_config = getattr(config, key)
|
||||
curr_attn_implementation = (
|
||||
requested_attn_implementation
|
||||
if not isinstance(requested_attn_implementation, dict)
|
||||
else requested_attn_implementation.get(key, None)
|
||||
)
|
||||
sub_config._attn_implementation_internal = curr_attn_implementation
|
||||
|
||||
if use_flash_attention_2:
|
||||
logger.warning_once(
|
||||
|
||||
@@ -14,8 +14,7 @@
|
||||
# limitations under the License.
|
||||
"""ALIGN model configuration"""
|
||||
|
||||
import os
|
||||
from typing import TYPE_CHECKING, List, Union
|
||||
from typing import TYPE_CHECKING, List
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -95,6 +94,7 @@ class AlignTextConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "align_text_model"
|
||||
base_config_key = "text_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -133,24 +133,6 @@ class AlignTextConfig(PretrainedConfig):
|
||||
self.use_cache = use_cache
|
||||
self.pad_token_id = pad_token_id
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the text config dict if we are loading from AlignConfig
|
||||
if config_dict.get("model_type") == "align":
|
||||
config_dict = config_dict["text_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class AlignVisionConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -223,6 +205,7 @@ class AlignVisionConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "align_vision_model"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -272,24 +255,6 @@ class AlignVisionConfig(PretrainedConfig):
|
||||
self.drop_connect_rate = drop_connect_rate
|
||||
self.num_hidden_layers = sum(num_block_repeats) * 4
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from AlignConfig
|
||||
if config_dict.get("model_type") == "align":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class AlignConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -340,6 +305,7 @@ class AlignConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "align"
|
||||
sub_configs = {"text_config": AlignTextConfig, "vision_config": AlignVisionConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -14,9 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""AltCLIP model configuration"""
|
||||
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
|
||||
@@ -199,6 +196,7 @@ class AltCLIPVisionConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "altclip_vision_model"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -233,24 +231,6 @@ class AltCLIPVisionConfig(PretrainedConfig):
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.hidden_act = hidden_act
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from AltCLIPConfig
|
||||
if config_dict.get("model_type") == "altclip":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class AltCLIPConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -298,6 +278,7 @@ class AltCLIPConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "altclip"
|
||||
sub_configs = {"text_config": AltCLIPTextConfig, "vision_config": AltCLIPVisionConfig}
|
||||
|
||||
def __init__(
|
||||
self, text_config=None, vision_config=None, projection_dim=768, logit_scale_init_value=2.6592, **kwargs
|
||||
|
||||
@@ -14,12 +14,11 @@
|
||||
# limitations under the License.
|
||||
"""BARK model configuration"""
|
||||
|
||||
import os
|
||||
from typing import Dict, Optional, Union
|
||||
from typing import Dict
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import add_start_docstrings, logging
|
||||
from ..auto import CONFIG_MAPPING
|
||||
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@@ -64,7 +63,6 @@ BARK_SUBMODELCONFIG_START_DOCSTRING = """
|
||||
|
||||
|
||||
class BarkSubModelConfig(PretrainedConfig):
|
||||
model_type = "bark_module"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
attribute_map = {
|
||||
@@ -101,38 +99,6 @@ class BarkSubModelConfig(PretrainedConfig):
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls,
|
||||
pretrained_model_name_or_path: Union[str, os.PathLike],
|
||||
cache_dir: Optional[Union[str, os.PathLike]] = None,
|
||||
force_download: bool = False,
|
||||
local_files_only: bool = False,
|
||||
token: Optional[Union[str, bool]] = None,
|
||||
revision: str = "main",
|
||||
**kwargs,
|
||||
) -> "PretrainedConfig":
|
||||
kwargs["cache_dir"] = cache_dir
|
||||
kwargs["force_download"] = force_download
|
||||
kwargs["local_files_only"] = local_files_only
|
||||
kwargs["revision"] = revision
|
||||
|
||||
cls._set_token_in_kwargs(kwargs, token)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the config dict if we are loading from Bark
|
||||
if config_dict.get("model_type") == "bark":
|
||||
config_dict = config_dict[f"{cls.model_type}_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
BARK_SUBMODELCONFIG_START_DOCSTRING.format(config="BarkSemanticConfig", model="BarkSemanticModel"),
|
||||
@@ -154,6 +120,7 @@ class BarkSubModelConfig(PretrainedConfig):
|
||||
)
|
||||
class BarkSemanticConfig(BarkSubModelConfig):
|
||||
model_type = "semantic"
|
||||
base_config_key = "semantic_config"
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
@@ -176,6 +143,7 @@ class BarkSemanticConfig(BarkSubModelConfig):
|
||||
)
|
||||
class BarkCoarseConfig(BarkSubModelConfig):
|
||||
model_type = "coarse_acoustics"
|
||||
base_config_key = "coarse_acoustics_config"
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
@@ -203,6 +171,7 @@ class BarkCoarseConfig(BarkSubModelConfig):
|
||||
)
|
||||
class BarkFineConfig(BarkSubModelConfig):
|
||||
model_type = "fine_acoustics"
|
||||
base_config_key = "fine_acoustics_config"
|
||||
|
||||
def __init__(self, tie_word_embeddings=True, n_codes_total=8, n_codes_given=1, **kwargs):
|
||||
self.n_codes_total = n_codes_total
|
||||
@@ -265,6 +234,12 @@ class BarkConfig(PretrainedConfig):
|
||||
"""
|
||||
|
||||
model_type = "bark"
|
||||
sub_configs = {
|
||||
"semantic_config": BarkSemanticConfig,
|
||||
"coarse_acoustics_config": BarkCoarseConfig,
|
||||
"fine_acoustics_config": BarkFineConfig,
|
||||
"codec_config": AutoConfig,
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -14,9 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""Blip model configuration"""
|
||||
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
|
||||
@@ -96,6 +93,7 @@ class BlipTextConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "blip_text_model"
|
||||
base_config_key = "text_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -146,24 +144,6 @@ class BlipTextConfig(PretrainedConfig):
|
||||
self.use_cache = use_cache
|
||||
self.label_smoothing = label_smoothing
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the text config dict if we are loading from BlipConfig
|
||||
if config_dict.get("model_type") == "blip":
|
||||
config_dict = config_dict["text_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class BlipVisionConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -215,6 +195,7 @@ class BlipVisionConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "blip_vision_model"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -245,24 +226,6 @@ class BlipVisionConfig(PretrainedConfig):
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.hidden_act = hidden_act
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from BlipConfig
|
||||
if config_dict.get("model_type") == "blip":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class BlipConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -316,6 +279,7 @@ class BlipConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "blip"
|
||||
sub_configs = {"text_config": BlipTextConfig, "vision_config": BlipVisionConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -14,13 +14,12 @@
|
||||
# limitations under the License.
|
||||
"""BLIP-2 model configuration"""
|
||||
|
||||
import os
|
||||
from typing import Optional, Union
|
||||
from typing import Optional
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
|
||||
from ...utils import logging
|
||||
from ..auto import CONFIG_MAPPING
|
||||
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@@ -76,6 +75,7 @@ class Blip2VisionConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "blip_2_vision_model"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -106,24 +106,6 @@ class Blip2VisionConfig(PretrainedConfig):
|
||||
self.hidden_act = hidden_act
|
||||
self.qkv_bias = qkv_bias
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from Blip2Config
|
||||
if config_dict.get("model_type") == "blip-2":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class Blip2QFormerConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -190,6 +172,7 @@ class Blip2QFormerConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "blip_2_qformer"
|
||||
base_config_key = "qformer_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -229,24 +212,6 @@ class Blip2QFormerConfig(PretrainedConfig):
|
||||
self.encoder_hidden_size = encoder_hidden_size
|
||||
self.use_qformer_text_input = use_qformer_text_input
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the qformer config dict if we are loading from Blip2Config
|
||||
if config_dict.get("model_type") == "blip-2":
|
||||
config_dict = config_dict["qformer_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class Blip2Config(PretrainedConfig):
|
||||
r"""
|
||||
@@ -306,6 +271,7 @@ class Blip2Config(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "blip-2"
|
||||
sub_configs = {"text_config": AutoConfig, "qformer_config": Blip2QFormerConfig, "vision_config": Blip2VisionConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -14,9 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""BridgeTower model configuration"""
|
||||
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
|
||||
@@ -68,6 +65,7 @@ class BridgeTowerVisionConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "bridgetower_vision_model"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -95,21 +93,6 @@ class BridgeTowerVisionConfig(PretrainedConfig):
|
||||
self.share_layernorm = share_layernorm
|
||||
self.remove_last_layer = remove_last_layer
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
if config_dict.get("model_type") == "bridgetower":
|
||||
config_dict = config_dict["text_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class BridgeTowerTextConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -175,6 +158,7 @@ class BridgeTowerTextConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "bridgetower_text_model"
|
||||
base_config_key = "text_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -217,21 +201,6 @@ class BridgeTowerTextConfig(PretrainedConfig):
|
||||
self.bos_token_id = bos_token_id
|
||||
self.eos_token_id = eos_token_id
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
if config_dict.get("model_type") == "bridgetower":
|
||||
config_dict = config_dict["text_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class BridgeTowerConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -288,6 +257,7 @@ class BridgeTowerConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "bridgetower"
|
||||
sub_configs = {"text_config": BridgeTowerTextConfig, "vision_config": BridgeTowerVisionConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -62,6 +62,7 @@ class ChameleonVQVAEConfig(PretrainedConfig):
|
||||
"""
|
||||
|
||||
model_type = "chameleon_vqgan"
|
||||
base_config_key = "vq_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -187,6 +188,7 @@ class ChameleonConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "chameleon"
|
||||
sub_configs = {"vq_config": ChameleonVQVAEConfig}
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
|
||||
@@ -14,9 +14,8 @@
|
||||
# limitations under the License.
|
||||
"""Chinese-CLIP model configuration"""
|
||||
|
||||
import os
|
||||
from collections import OrderedDict
|
||||
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
|
||||
from typing import TYPE_CHECKING, Any, Mapping, Optional
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -102,6 +101,7 @@ class ChineseCLIPTextConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "chinese_clip_text_model"
|
||||
base_config_key = "text_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -141,24 +141,6 @@ class ChineseCLIPTextConfig(PretrainedConfig):
|
||||
self.position_embedding_type = position_embedding_type
|
||||
self.use_cache = use_cache
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from ChineseCLIPConfig
|
||||
if config_dict.get("model_type") == "chinese_clip":
|
||||
config_dict = config_dict["text_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class ChineseCLIPVisionConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -215,6 +197,7 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "chinese_clip_vision_model"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -249,24 +232,6 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.hidden_act = hidden_act
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from ChineseCLIPConfig
|
||||
if config_dict.get("model_type") == "chinese_clip":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class ChineseCLIPConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -316,6 +281,7 @@ class ChineseCLIPConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "chinese_clip"
|
||||
sub_configs = {"text_config": ChineseCLIPTextConfig, "vision_config": ChineseCLIPVisionConfig}
|
||||
|
||||
def __init__(
|
||||
self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
|
||||
|
||||
@@ -14,9 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""CLAP model configuration"""
|
||||
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
|
||||
@@ -94,6 +91,7 @@ class ClapTextConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "clap_text_model"
|
||||
base_config_key = "text_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -137,24 +135,6 @@ class ClapTextConfig(PretrainedConfig):
|
||||
self.projection_hidden_act = projection_hidden_act
|
||||
self.projection_dim = projection_dim
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the text config dict if we are loading from ClapConfig
|
||||
if config_dict.get("model_type") == "clap":
|
||||
config_dict = config_dict["text_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class ClapAudioConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -245,6 +225,7 @@ class ClapAudioConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "clap_audio_model"
|
||||
base_config_key = "audio_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -307,24 +288,6 @@ class ClapAudioConfig(PretrainedConfig):
|
||||
self.initializer_factor = initializer_factor
|
||||
self.projection_hidden_act = projection_hidden_act
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the audio config dict if we are loading from ClapConfig
|
||||
if config_dict.get("model_type") == "clap":
|
||||
config_dict = config_dict["audio_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class ClapConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -377,6 +340,7 @@ class ClapConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "clap"
|
||||
sub_configs = {"text_config": ClapTextConfig, "audio_config": ClapAudioConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -14,9 +14,8 @@
|
||||
# limitations under the License.
|
||||
"""CLIP model configuration"""
|
||||
|
||||
import os
|
||||
from collections import OrderedDict
|
||||
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
|
||||
from typing import TYPE_CHECKING, Any, Mapping, Optional
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -93,6 +92,7 @@ class CLIPTextConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "clip_text_model"
|
||||
base_config_key = "text_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -130,24 +130,6 @@ class CLIPTextConfig(PretrainedConfig):
|
||||
self.initializer_factor = initializer_factor
|
||||
self.attention_dropout = attention_dropout
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the text config dict if we are loading from CLIPConfig
|
||||
if config_dict.get("model_type") == "clip":
|
||||
config_dict = config_dict["text_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class CLIPVisionConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -205,6 +187,7 @@ class CLIPVisionConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "clip_vision_model"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -239,24 +222,6 @@ class CLIPVisionConfig(PretrainedConfig):
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.hidden_act = hidden_act
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from CLIPConfig
|
||||
if config_dict.get("model_type") == "clip":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class CLIPConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -305,6 +270,7 @@ class CLIPConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "clip"
|
||||
sub_configs = {"text_config": CLIPTextConfig, "vision_config": CLIPVisionConfig}
|
||||
|
||||
def __init__(
|
||||
self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
|
||||
|
||||
@@ -14,9 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""CLIPSeg model configuration"""
|
||||
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
|
||||
@@ -84,6 +81,7 @@ class CLIPSegTextConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "clipseg_text_model"
|
||||
base_config_key = "text_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -117,24 +115,6 @@ class CLIPSegTextConfig(PretrainedConfig):
|
||||
self.initializer_factor = initializer_factor
|
||||
self.attention_dropout = attention_dropout
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the text config dict if we are loading from CLIPSegConfig
|
||||
if config_dict.get("model_type") == "clipseg":
|
||||
config_dict = config_dict["text_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class CLIPSegVisionConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -190,6 +170,7 @@ class CLIPSegVisionConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "clipseg_vision_model"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -222,24 +203,6 @@ class CLIPSegVisionConfig(PretrainedConfig):
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.hidden_act = hidden_act
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from CLIPSegConfig
|
||||
if config_dict.get("model_type") == "clipseg":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class CLIPSegConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -306,6 +269,7 @@ class CLIPSegConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "clipseg"
|
||||
sub_configs = {"text_config": CLIPSegTextConfig, "vision_config": CLIPSegVisionConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -91,6 +91,7 @@ class ClvpEncoderConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "clvp_encoder"
|
||||
base_config_key = ["text_config", "speech_config"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -141,7 +142,7 @@ class ClvpEncoderConfig(PretrainedConfig):
|
||||
|
||||
# make sure to have the config_type be either "text_config" or "speech_config"
|
||||
# this is to make sure that we can load only text or speech configs from the nested ClvpConfig.
|
||||
if config_type not in ["text_config", "speech_config"]:
|
||||
if config_type not in cls.base_config_key:
|
||||
raise ValueError(
|
||||
f"We can only load either 'text_config' or 'speech_config' but you are trying to load" f"{config_type}"
|
||||
)
|
||||
@@ -253,6 +254,7 @@ class ClvpDecoderConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "clvp_decoder"
|
||||
base_config_key = "decoder_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -314,24 +316,6 @@ class ClvpDecoderConfig(PretrainedConfig):
|
||||
|
||||
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the speech config dict if we are loading from ClvpConfig
|
||||
if config_dict.get("model_type") == "clvp":
|
||||
config_dict = config_dict["decoder_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class ClvpConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -386,7 +370,11 @@ class ClvpConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "clvp"
|
||||
is_composition = True
|
||||
sub_configs = {
|
||||
"text_config": ClvpEncoderConfig,
|
||||
"speech_config": ClvpEncoderConfig,
|
||||
"decoder_config": ClvpDecoderConfig,
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -41,6 +41,8 @@ class DbrxAttentionConfig(PretrainedConfig):
|
||||
rope_theta (`float`, *optional*, defaults to 10000.0): The base frequency for rope.
|
||||
"""
|
||||
|
||||
base_config_key = "attn_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
attn_pdrop: float = 0.0,
|
||||
@@ -55,29 +57,12 @@ class DbrxAttentionConfig(PretrainedConfig):
|
||||
self.kv_n_heads = kv_n_heads
|
||||
self.rope_theta = rope_theta
|
||||
|
||||
for k in ["model_type"]:
|
||||
for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash"]:
|
||||
if k in kwargs:
|
||||
kwargs.pop(k)
|
||||
if len(kwargs) != 0:
|
||||
raise ValueError(f"Found unknown {kwargs=}")
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
if config_dict.get("model_type") == "dbrx":
|
||||
config_dict = config_dict["attn_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class DbrxFFNConfig(PretrainedConfig):
|
||||
"""Configuration class for Dbrx FFN.
|
||||
@@ -100,6 +85,8 @@ class DbrxFFNConfig(PretrainedConfig):
|
||||
moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights.
|
||||
"""
|
||||
|
||||
base_config_key = "ffn_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
ffn_act_fn: dict = None,
|
||||
@@ -122,29 +109,12 @@ class DbrxFFNConfig(PretrainedConfig):
|
||||
self.moe_loss_weight = moe_loss_weight
|
||||
self.moe_normalize_expert_weights = moe_normalize_expert_weights
|
||||
|
||||
for k in ["model_type"]:
|
||||
for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash"]:
|
||||
if k in kwargs:
|
||||
kwargs.pop(k)
|
||||
if len(kwargs) != 0:
|
||||
raise ValueError(f"Found unknown {kwargs=}")
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
if config_dict.get("model_type") == "dbrx":
|
||||
config_dict = config_dict["ffn_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class DbrxConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -202,6 +172,7 @@ class DbrxConfig(PretrainedConfig):
|
||||
"""
|
||||
|
||||
model_type = "dbrx"
|
||||
sub_configs = {"attn_config": DbrxAttentionConfig, "ffn_config": DbrxFFNConfig}
|
||||
attribute_map = {
|
||||
"num_attention_heads": "n_heads",
|
||||
"hidden_size": "d_model",
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
from ..auto import AutoConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@@ -70,6 +71,7 @@ class EncoderDecoderConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "encoder-decoder"
|
||||
sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig}
|
||||
is_composition = True
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
@@ -84,8 +86,6 @@ class EncoderDecoderConfig(PretrainedConfig):
|
||||
decoder_config = kwargs.pop("decoder")
|
||||
decoder_model_type = decoder_config.pop("model_type")
|
||||
|
||||
from ..auto.configuration_auto import AutoConfig
|
||||
|
||||
self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config)
|
||||
self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config)
|
||||
self.is_encoder_decoder = True
|
||||
|
||||
@@ -164,6 +164,7 @@ class FastSpeech2ConformerConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "fastspeech2_conformer"
|
||||
base_config_key = "model_config"
|
||||
attribute_map = {"num_hidden_layers": "encoder_layers", "num_attention_heads": "encoder_num_attention_heads"}
|
||||
|
||||
def __init__(
|
||||
@@ -377,6 +378,7 @@ class FastSpeech2ConformerHifiGanConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "hifigan"
|
||||
base_config_key = "vocoder_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -453,7 +455,7 @@ class FastSpeech2ConformerWithHifiGanConfig(PretrainedConfig):
|
||||
"""
|
||||
|
||||
model_type = "fastspeech2_conformer_with_hifigan"
|
||||
is_composition = True
|
||||
sub_configs = {"model_config": FastSpeech2ConformerConfig, "vocoder_config": FastSpeech2ConformerHifiGanConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -14,8 +14,7 @@
|
||||
# limitations under the License.
|
||||
"""FLAVA model configurations"""
|
||||
|
||||
import os
|
||||
from typing import Any, Dict, Union
|
||||
from typing import Any, Dict
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
@@ -86,6 +85,7 @@ class FlavaImageConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "flava_image_model"
|
||||
base_config_key = "image_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -124,24 +124,6 @@ class FlavaImageConfig(PretrainedConfig):
|
||||
self.mask_token = mask_token
|
||||
self.vocab_size = vocab_size
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the image config dict if we are loading from FlavaConfig
|
||||
if config_dict.get("model_type") == "flava":
|
||||
config_dict = config_dict["image_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class FlavaTextConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -216,6 +198,7 @@ class FlavaTextConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "flava_text_model"
|
||||
base_config_key = "text_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -254,24 +237,6 @@ class FlavaTextConfig(PretrainedConfig):
|
||||
self.qkv_bias = qkv_bias
|
||||
self.pad_token_id = pad_token_id
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the text config dict if we are loading from FlavaConfig
|
||||
if config_dict.get("model_type") == "flava":
|
||||
config_dict = config_dict["text_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class FlavaMultimodalConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -327,6 +292,7 @@ class FlavaMultimodalConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "flava_multimodal_model"
|
||||
base_config_key = "multimodal_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -357,27 +323,10 @@ class FlavaMultimodalConfig(PretrainedConfig):
|
||||
self.qkv_bias = qkv_bias
|
||||
self.use_cls_token = use_cls_token
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the multimodal config dict if we are loading from FlavaConfig
|
||||
if config_dict.get("model_type") == "flava":
|
||||
config_dict = config_dict["multimodal_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class FlavaImageCodebookConfig(PretrainedConfig):
|
||||
model_type = "flava_image_codebook"
|
||||
base_config_key = "image_codebook_config"
|
||||
|
||||
r"""
|
||||
[`FlavaImageCodebookConfig`] is the configuration class to store the configuration of a [`FlavaImageCodebook`]. It
|
||||
@@ -442,24 +391,6 @@ class FlavaImageCodebookConfig(PretrainedConfig):
|
||||
self.freeze = freeze
|
||||
self.initializer_range = initializer_range
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the image codebook config dict if we are loading from FlavaConfig
|
||||
if config_dict.get("model_type") == "flava":
|
||||
config_dict = config_dict["image_codebook_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class FlavaConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -532,6 +463,12 @@ class FlavaConfig(PretrainedConfig):
|
||||
"""
|
||||
|
||||
model_type = "flava"
|
||||
sub_configs = {
|
||||
"text_config": FlavaTextConfig,
|
||||
"image_config": FlavaImageConfig,
|
||||
"multimodal_config": FlavaMultimodalConfig,
|
||||
"image_codebook_config": FlavaImageCodebookConfig,
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -13,8 +13,6 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
@@ -72,6 +70,7 @@ class GitVisionConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "git_vision_model"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -102,24 +101,6 @@ class GitVisionConfig(PretrainedConfig):
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.hidden_act = hidden_act
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from GITConfig
|
||||
if config_dict.get("model_type") == "git":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class GitConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -186,6 +167,7 @@ class GitConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "git"
|
||||
sub_configs = {"vision_config": GitVisionConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -14,9 +14,8 @@
|
||||
# limitations under the License.
|
||||
"""GroupViT model configuration"""
|
||||
|
||||
import os
|
||||
from collections import OrderedDict
|
||||
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
|
||||
from typing import TYPE_CHECKING, Any, Mapping, Optional
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...onnx import OnnxConfig
|
||||
@@ -86,6 +85,7 @@ class GroupViTTextConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "groupvit_text_model"
|
||||
base_config_key = "text_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -121,24 +121,6 @@ class GroupViTTextConfig(PretrainedConfig):
|
||||
self.initializer_factor = initializer_factor
|
||||
self.attention_dropout = attention_dropout
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the text config dict if we are loading from GroupViTConfig
|
||||
if config_dict.get("model_type") == "groupvit":
|
||||
config_dict = config_dict["text_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class GroupViTVisionConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -197,6 +179,7 @@ class GroupViTVisionConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "groupvit_vision_model"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -246,24 +229,6 @@ class GroupViTVisionConfig(PretrainedConfig):
|
||||
self.assign_eps = assign_eps
|
||||
self.assign_mlp_ratio = assign_mlp_ratio
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from GroupViTConfig
|
||||
if config_dict.get("model_type") == "groupvit":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class GroupViTConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -292,6 +257,7 @@ class GroupViTConfig(PretrainedConfig):
|
||||
"""
|
||||
|
||||
model_type = "groupvit"
|
||||
sub_configs = {"text_config": GroupViTTextConfig, "vision_config": GroupViTVisionConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -38,7 +38,7 @@ class IdeficsVisionConfig(PretrainedConfig):
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
Args:
|
||||
hidden_size (`int`, *optional*, defaults to 768):
|
||||
embed_dim (`int`, *optional*, defaults to 768):
|
||||
Dimensionality of the encoder layers and the pooler layer. (elsewhere referred to as `hidden_size`)
|
||||
image_size (`int`, *optional*, defaults to 224):
|
||||
The size (resolution) of each image.
|
||||
@@ -50,12 +50,12 @@ class IdeficsVisionConfig(PretrainedConfig):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
image_num_channels (`int`, *optional*, defaults to `3`):
|
||||
num_channels (`int`, *optional*, defaults to 3):
|
||||
Number of image channels.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
|
||||
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
|
||||
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the layer normalization layers.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
@@ -64,11 +64,9 @@ class IdeficsVisionConfig(PretrainedConfig):
|
||||
initializer_factor (`float`, *optional*, defaults to 1.0):
|
||||
A factor for initializing all weight matrices (should be kept to 1.0, used internally for initialization
|
||||
testing).
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
"""
|
||||
|
||||
model_type = "idefics"
|
||||
model_type = "idefics_vision"
|
||||
attribute_map = {
|
||||
"hidden_size": "embed_dim",
|
||||
}
|
||||
@@ -119,7 +117,7 @@ class IdeficsPerceiverConfig(PretrainedConfig):
|
||||
Args:
|
||||
use_resampler (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to use the resampler
|
||||
resampler_n_latents (`int`, *optional*, defaults to ):
|
||||
resampler_n_latents (`int`, *optional*, defaults to 64):
|
||||
Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
|
||||
resampler_depth (`int`, *optional*, defaults to 6):
|
||||
Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
|
||||
@@ -131,7 +129,7 @@ class IdeficsPerceiverConfig(PretrainedConfig):
|
||||
Whether or not to use qk layer norms in perceiver
|
||||
"""
|
||||
|
||||
model_type = "idefics"
|
||||
model_type = "idefics_perciever"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -235,7 +233,7 @@ class IdeficsConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "idefics"
|
||||
is_composition = False
|
||||
sub_configs = {"perceiver_config": IdeficsPerceiverConfig, "vision_config": IdeficsVisionConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -13,12 +13,9 @@
|
||||
# limitations under the License.
|
||||
"""Idefics2 model configuration"""
|
||||
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
from ..auto import CONFIG_MAPPING
|
||||
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@@ -76,7 +73,8 @@ class Idefics2VisionConfig(PretrainedConfig):
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "idefics2"
|
||||
model_type = "idefics2_vision"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -107,24 +105,6 @@ class Idefics2VisionConfig(PretrainedConfig):
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from Idefics2Config
|
||||
if config_dict.get("model_type") == "idefics2":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class Idefics2PerceiverConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -152,7 +132,7 @@ class Idefics2PerceiverConfig(PretrainedConfig):
|
||||
The dropout ratio for the attention probabilities.
|
||||
"""
|
||||
|
||||
model_type = "idefics2"
|
||||
model_type = "idefics2_perceiver"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -220,7 +200,11 @@ class Idefics2Config(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "idefics2"
|
||||
is_composition = True
|
||||
sub_configs = {
|
||||
"text_config": AutoConfig,
|
||||
"perceiver_config": Idefics2PerceiverConfig,
|
||||
"vision_config": Idefics2VisionConfig,
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -13,12 +13,9 @@
|
||||
# limitations under the License.
|
||||
"""Idefics3 model configuration"""
|
||||
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
from ..auto import CONFIG_MAPPING
|
||||
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@@ -57,8 +54,7 @@ class Idefics3VisionConfig(PretrainedConfig):
|
||||
The epsilon used by the layer normalization layers.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
intializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation for initializing all weight matrices in the model.
|
||||
initializer_range (`<fill_type>`, *optional*, defaults to 0.02): <fill_docstring>
|
||||
|
||||
Example:
|
||||
|
||||
@@ -76,7 +72,8 @@ class Idefics3VisionConfig(PretrainedConfig):
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "idefics3"
|
||||
model_type = "idefics3_vision"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -107,24 +104,6 @@ class Idefics3VisionConfig(PretrainedConfig):
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from Idefics3Config
|
||||
if config_dict.get("model_type") == "idefics3":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class Idefics3Config(PretrainedConfig):
|
||||
r"""
|
||||
@@ -165,7 +144,7 @@ class Idefics3Config(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "idefics3"
|
||||
is_composition = True
|
||||
sub_configs = {"text_config": AutoConfig, "vision_config": Idefics3VisionConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -204,4 +183,4 @@ class Idefics3Config(PretrainedConfig):
|
||||
self.text_config = text_config
|
||||
self.scale_factor = scale_factor
|
||||
|
||||
super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
|
||||
super().__init__(**kwargs, pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings)
|
||||
|
||||
@@ -14,13 +14,10 @@
|
||||
# limitations under the License.
|
||||
"""InstructBLIP model configuration"""
|
||||
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
|
||||
from ...utils import logging
|
||||
from ..auto import CONFIG_MAPPING
|
||||
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@@ -78,6 +75,7 @@ class InstructBlipVisionConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "instructblip_vision_model"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -108,24 +106,6 @@ class InstructBlipVisionConfig(PretrainedConfig):
|
||||
self.hidden_act = hidden_act
|
||||
self.qkv_bias = qkv_bias
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from InstructBlipConfig
|
||||
if config_dict.get("model_type") == "instructblip":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class InstructBlipQFormerConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -192,6 +172,7 @@ class InstructBlipQFormerConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "instructblip_qformer"
|
||||
base_config_key = "qformer_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -229,24 +210,6 @@ class InstructBlipQFormerConfig(PretrainedConfig):
|
||||
self.cross_attention_frequency = cross_attention_frequency
|
||||
self.encoder_hidden_size = encoder_hidden_size
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the qformer config dict if we are loading from InstructBlipConfig
|
||||
if config_dict.get("model_type") == "instructblip":
|
||||
config_dict = config_dict["qformer_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class InstructBlipConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -305,6 +268,11 @@ class InstructBlipConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "instructblip"
|
||||
sub_configs = {
|
||||
"text_config": AutoConfig,
|
||||
"qformer_config": InstructBlipQFormerConfig,
|
||||
"vision_config": InstructBlipVisionConfig,
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -19,13 +19,11 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
|
||||
from ...utils import logging
|
||||
from ..auto import CONFIG_MAPPING
|
||||
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@@ -83,6 +81,7 @@ class InstructBlipVideoVisionConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "instructblipvideo_vision_model"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -113,24 +112,6 @@ class InstructBlipVideoVisionConfig(PretrainedConfig):
|
||||
self.hidden_act = hidden_act
|
||||
self.qkv_bias = qkv_bias
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from InstructBlipVideoConfig
|
||||
if config_dict.get("model_type") == "instructblipvideo":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class InstructBlipVideoQFormerConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -197,6 +178,7 @@ class InstructBlipVideoQFormerConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "instructblipvideo_qformer"
|
||||
base_config_key = "qformer_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -234,24 +216,6 @@ class InstructBlipVideoQFormerConfig(PretrainedConfig):
|
||||
self.cross_attention_frequency = cross_attention_frequency
|
||||
self.encoder_hidden_size = encoder_hidden_size
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the qformer config dict if we are loading from InstructBlipVideoConfig
|
||||
if config_dict.get("model_type") == "instructblipvideo":
|
||||
config_dict = config_dict["qformer_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class InstructBlipVideoConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -310,6 +274,11 @@ class InstructBlipVideoConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "instructblipvideo"
|
||||
sub_configs = {
|
||||
"text_config": AutoConfig,
|
||||
"qformer_config": InstructBlipVideoQFormerConfig,
|
||||
"vision_config": InstructBlipVideoVisionConfig,
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -32,7 +32,7 @@ from transformers.models.instructblip.modeling_instructblip import (
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
|
||||
from ...utils import logging
|
||||
from ..auto import CONFIG_MAPPING
|
||||
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@@ -103,6 +103,11 @@ class InstructBlipVideoConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "instructblipvideo"
|
||||
sub_configs = {
|
||||
"text_config": AutoConfig,
|
||||
"qformer_config": InstructBlipVideoQFormerConfig,
|
||||
"vision_config": InstructBlipVideoVisionConfig,
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -14,9 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""KOSMOS-2 model configuration"""
|
||||
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
|
||||
@@ -61,7 +58,7 @@ class Kosmos2TextConfig(PretrainedConfig):
|
||||
layerdrop (`float`, *optional*, defaults to 0.0):
|
||||
The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
|
||||
for more details.
|
||||
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
|
||||
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the layer normalization layers.
|
||||
init_std (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
@@ -69,9 +66,16 @@ class Kosmos2TextConfig(PretrainedConfig):
|
||||
Scale embeddings by dividing by sqrt(embed_dim).
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models).
|
||||
pad_token_id (`int`, *optional*, defaults to 1):
|
||||
Token id used for padding.
|
||||
bos_token_id (`int`, *optional*, defaults to 0):
|
||||
Token id used for beginning of string.
|
||||
eos_token_id (`int`, *optional*, defaults to 2):
|
||||
Token id used for end of string.
|
||||
```"""
|
||||
|
||||
model_type = "kosmos_2_text_model"
|
||||
base_config_key = "text_config"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {
|
||||
"num_attention_heads": "attention_heads",
|
||||
@@ -124,24 +128,6 @@ class Kosmos2TextConfig(PretrainedConfig):
|
||||
self.scale_embedding = scale_embedding
|
||||
self.use_cache = use_cache
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the text config dict if we are loading from Kosmos2Config
|
||||
if config_dict.get("model_type") == "kosmos-2":
|
||||
config_dict = config_dict["text_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class Kosmos2VisionConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -171,18 +157,19 @@ class Kosmos2VisionConfig(PretrainedConfig):
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
|
||||
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
|
||||
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the layer normalization layers.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
initializer_factor (`float`, *optional*, defaults to 1):
|
||||
initializer_factor (`float`, *optional*, defaults to 1.0):
|
||||
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
|
||||
testing).
|
||||
```"""
|
||||
|
||||
model_type = "kosmos_2_vision_model"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -215,24 +202,6 @@ class Kosmos2VisionConfig(PretrainedConfig):
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.hidden_act = hidden_act
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from Kosmos2Config
|
||||
if config_dict.get("model_type") == "kosmos-2":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class Kosmos2Config(PretrainedConfig):
|
||||
r"""
|
||||
@@ -267,7 +236,7 @@ class Kosmos2Config(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "kosmos-2"
|
||||
is_composition = True
|
||||
sub_configs = {"text_config": Kosmos2TextConfig, "vision_config": Kosmos2VisionConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
from ..auto import CONFIG_MAPPING
|
||||
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@@ -73,7 +73,7 @@ class LlavaConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "llava"
|
||||
is_composition = True
|
||||
sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
from ..auto import CONFIG_MAPPING
|
||||
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@@ -78,7 +78,7 @@ class LlavaNextConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "llava_next"
|
||||
is_composition = False
|
||||
sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ..auto import CONFIG_MAPPING
|
||||
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||
|
||||
|
||||
class LlavaNextVideoConfig(PretrainedConfig):
|
||||
@@ -86,7 +86,7 @@ class LlavaNextVideoConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "llava_next_video"
|
||||
is_composition = True
|
||||
sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -31,7 +31,7 @@ from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import (
|
||||
logging,
|
||||
)
|
||||
from ..auto import CONFIG_MAPPING
|
||||
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@@ -99,7 +99,7 @@ class LlavaNextVideoConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "llava_next_video"
|
||||
is_composition = True
|
||||
sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -18,7 +18,7 @@ from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import (
|
||||
logging,
|
||||
)
|
||||
from ..auto import CONFIG_MAPPING
|
||||
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@@ -81,7 +81,7 @@ class LlavaOnevisionConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "llava_onevision"
|
||||
is_composition = False
|
||||
sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -13,8 +13,7 @@
|
||||
# limitations under the License.
|
||||
"""Mllama model configuration"""
|
||||
|
||||
import os
|
||||
from typing import Dict, List, Optional, Union
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...modeling_rope_utils import rope_config_validation
|
||||
@@ -59,7 +58,7 @@ class MllamaVisionConfig(PretrainedConfig):
|
||||
The size (resolution) of each image *tile*.
|
||||
patch_size (`int`, *optional*, defaults to 14):
|
||||
The size (resolution) of each patch.
|
||||
norm_eps (`float`, *optional*, defaults to 1e-5):
|
||||
norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the layer normalization layers.
|
||||
max_num_tiles (`int`, *optional*, defaults to 4):
|
||||
Maximum number of tiles for image splitting.
|
||||
@@ -88,6 +87,7 @@ class MllamaVisionConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "mllama_vision_model"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -137,23 +137,6 @@ class MllamaVisionConfig(PretrainedConfig):
|
||||
def max_aspect_ratio_id(self) -> int:
|
||||
return len(self.supported_aspect_ratios)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
if config_dict.get("model_type") == "mllama":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class MllamaTextConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -178,12 +161,12 @@ class MllamaTextConfig(PretrainedConfig):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 32):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
num_key_value_heads (`int`, *optional*):
|
||||
num_key_value_heads (`int`, *optional*, defaults to 8):
|
||||
This is the number of key_value heads that should be used to implement Grouped Query Attention. If not
|
||||
specified, will default to `num_attention_heads`.
|
||||
intermediate_size (`int`, *optional*, defaults to 14336):
|
||||
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
|
||||
rope_theta (`float`, *optional*, defaults to 500000.0):
|
||||
rope_theta (`float`, *optional*, defaults to `500000.0`):
|
||||
The base period of the RoPE embeddings.
|
||||
rope_scaling (`Dict`, *optional*):
|
||||
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
|
||||
@@ -259,6 +242,7 @@ class MllamaTextConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "mllama_text_model"
|
||||
base_config_key = "text_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -311,23 +295,6 @@ class MllamaTextConfig(PretrainedConfig):
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
if config_dict.get("model_type") == "mllama":
|
||||
config_dict = config_dict["text_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class MllamaConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -370,7 +337,7 @@ class MllamaConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "mllama"
|
||||
is_composition = True
|
||||
sub_configs = {"text_config": MllamaTextConfig, "vision_config": MllamaVisionConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -235,8 +235,8 @@ class MoshiConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "moshi"
|
||||
is_composition = True
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
sub_configs = {"audio_encoder_config": AutoConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -41,22 +41,22 @@ class MptAttentionConfig(PretrainedConfig):
|
||||
Args:
|
||||
attn_type (`str`, *optional*, defaults to `"multihead_attention"`):
|
||||
type of attention to use. Options: `"multihead_attention"`, `"multiquery_attention"`.
|
||||
attn_pdrop (`float`, *optional*, defaults to 0.0):
|
||||
attn_pdrop (`float`, *optional*, defaults to `0.0`):
|
||||
The dropout probability for the attention layers.
|
||||
attn_impl (`str`, *optional*, defaults to `"torch"`):
|
||||
The attention implementation to use. One of `"torch"`, `"flash"`, or `"triton"`.
|
||||
clip_qkv (`float`, *optional*):
|
||||
If not `None`, clip the queries, keys, and values in the attention layer to this value.
|
||||
softmax_scale (`float`, *optional*, defaults to `None`):
|
||||
softmax_scale (`float`, *optional*):
|
||||
If not `None`, scale the softmax in the attention layer by this value. If `None`, will default to
|
||||
`1/sqrt(hidden_size)`.
|
||||
prefix_lm (`bool`, *optional*, defaults to `False`)):
|
||||
prefix_lm (`bool`, *optional*, defaults to `False`):
|
||||
Whether the model should operate as a Prefix LM. This requires passing an extra `prefix_mask` argument
|
||||
which indicates which tokens belong to the prefix. Tokens in the prefix can attend to one another
|
||||
bi-directionally. Tokens outside the prefix use causal attention.
|
||||
qk_ln (`bool`, *optional*, defaults to `False`):
|
||||
Whether to apply layer normalization to the queries and keys in the attention layer.
|
||||
attn_uses_sequence_id (`bool`, *optional*, defaults to `False`)):
|
||||
attn_uses_sequence_id (`bool`, *optional*, defaults to `False`):
|
||||
Whether to restrict attention to tokens that have the same token_type_ids. When the model is in `train`
|
||||
mode, this requires passing an extra *token_type_ids* argument which indicates which sub-sequence each
|
||||
token belongs to. Defaults to `False` meaning any provided *token_type_ids* will be ignored.
|
||||
@@ -66,6 +66,8 @@ class MptAttentionConfig(PretrainedConfig):
|
||||
The maximum value of the alibi bias.
|
||||
"""
|
||||
|
||||
base_config_key = "attn_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
attn_type="multihead_attention",
|
||||
@@ -97,23 +99,6 @@ class MptAttentionConfig(PretrainedConfig):
|
||||
f"`attn_type` has to be either `multihead_attention` or `multiquery_attention`. Received: {attn_type}"
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
if config_dict.get("model_type") == "mpt":
|
||||
config_dict = config_dict["attn_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class MptConfig(PretrainedConfig):
|
||||
"""
|
||||
@@ -188,6 +173,7 @@ class MptConfig(PretrainedConfig):
|
||||
"""
|
||||
|
||||
model_type = "mpt"
|
||||
sub_configs = {"attn_config": MptAttentionConfig}
|
||||
attribute_map = {
|
||||
"num_attention_heads": "n_heads",
|
||||
"hidden_size": "d_model",
|
||||
|
||||
@@ -76,6 +76,7 @@ class MusicgenDecoderConfig(PretrainedConfig):
|
||||
"""
|
||||
|
||||
model_type = "musicgen_decoder"
|
||||
base_config_key = "decoder_config"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
@@ -189,6 +190,11 @@ class MusicgenConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "musicgen"
|
||||
sub_configs = {
|
||||
"text_encoder": AutoConfig,
|
||||
"audio_encoder": AutoConfig,
|
||||
"decoder": MusicgenDecoderConfig,
|
||||
}
|
||||
is_composition = True
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
|
||||
@@ -78,6 +78,7 @@ class MusicgenMelodyDecoderConfig(PretrainedConfig):
|
||||
"""
|
||||
|
||||
model_type = "musicgen_melody_decoder"
|
||||
base_config_key = "decoder_config"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
@@ -195,6 +196,11 @@ class MusicgenMelodyConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "musicgen_melody"
|
||||
sub_configs = {
|
||||
"text_encoder": AutoConfig,
|
||||
"audio_encoder": AutoConfig,
|
||||
"decoder": MusicgenMelodyDecoderConfig,
|
||||
}
|
||||
is_composition = True
|
||||
|
||||
def __init__(
|
||||
|
||||
@@ -14,8 +14,7 @@
|
||||
# limitations under the License.
|
||||
"""OWLv2 model configuration"""
|
||||
|
||||
import os
|
||||
from typing import TYPE_CHECKING, Dict, Union
|
||||
from typing import TYPE_CHECKING, Dict
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -90,6 +89,7 @@ class Owlv2TextConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "owlv2_text_model"
|
||||
base_config_key = "text_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -123,24 +123,6 @@ class Owlv2TextConfig(PretrainedConfig):
|
||||
self.initializer_range = initializer_range
|
||||
self.initializer_factor = initializer_factor
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the text config dict if we are loading from Owlv2Config
|
||||
if config_dict.get("model_type") == "owlv2":
|
||||
config_dict = config_dict["text_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
# Copied from transformers.models.owlvit.configuration_owlvit.OwlViTVisionConfig with OwlViT->Owlv2, owlvit-base-patch32->owlv2-base-patch16, owlvit->owlv2, OWL-ViT->OWLv2, 32->16
|
||||
class Owlv2VisionConfig(PretrainedConfig):
|
||||
@@ -197,6 +179,7 @@ class Owlv2VisionConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "owlv2_vision_model"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -229,24 +212,6 @@ class Owlv2VisionConfig(PretrainedConfig):
|
||||
self.initializer_range = initializer_range
|
||||
self.initializer_factor = initializer_factor
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from Owlv2Config
|
||||
if config_dict.get("model_type") == "owlv2":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
# Copied from transformers.models.owlvit.configuration_owlvit.OwlViTConfig with OwlViT->Owlv2, owlvit-base-patch32->owlv2-base-patch16, owlvit->owlv2, OWL-ViT->OWLv2
|
||||
class Owlv2Config(PretrainedConfig):
|
||||
@@ -276,6 +241,7 @@ class Owlv2Config(PretrainedConfig):
|
||||
"""
|
||||
|
||||
model_type = "owlv2"
|
||||
sub_configs = {"text_config": Owlv2TextConfig, "vision_config": Owlv2VisionConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -304,20 +270,6 @@ class Owlv2Config(PretrainedConfig):
|
||||
self.return_dict = return_dict
|
||||
self.initializer_factor = 1.0
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
@classmethod
|
||||
def from_text_vision_configs(cls, text_config: Dict, vision_config: Dict, **kwargs):
|
||||
r"""
|
||||
|
||||
@@ -14,9 +14,8 @@
|
||||
# limitations under the License.
|
||||
"""OWL-ViT model configuration"""
|
||||
|
||||
import os
|
||||
from collections import OrderedDict
|
||||
from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional, Union
|
||||
from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -92,6 +91,7 @@ class OwlViTTextConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "owlvit_text_model"
|
||||
base_config_key = "text_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -125,24 +125,6 @@ class OwlViTTextConfig(PretrainedConfig):
|
||||
self.initializer_range = initializer_range
|
||||
self.initializer_factor = initializer_factor
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the text config dict if we are loading from OwlViTConfig
|
||||
if config_dict.get("model_type") == "owlvit":
|
||||
config_dict = config_dict["text_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class OwlViTVisionConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -198,6 +180,7 @@ class OwlViTVisionConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "owlvit_vision_model"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -230,24 +213,6 @@ class OwlViTVisionConfig(PretrainedConfig):
|
||||
self.initializer_range = initializer_range
|
||||
self.initializer_factor = initializer_factor
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from OwlViTConfig
|
||||
if config_dict.get("model_type") == "owlvit":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class OwlViTConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -276,6 +241,7 @@ class OwlViTConfig(PretrainedConfig):
|
||||
"""
|
||||
|
||||
model_type = "owlvit"
|
||||
sub_configs = {"text_config": OwlViTTextConfig, "vision_config": OwlViTVisionConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -304,20 +270,6 @@ class OwlViTConfig(PretrainedConfig):
|
||||
self.return_dict = return_dict
|
||||
self.initializer_factor = 1.0
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
@classmethod
|
||||
def from_text_vision_configs(cls, text_config: Dict, vision_config: Dict, **kwargs):
|
||||
r"""
|
||||
|
||||
@@ -17,7 +17,7 @@ import warnings
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
from ..auto import CONFIG_MAPPING
|
||||
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@@ -73,7 +73,7 @@ class PaliGemmaConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "paligemma"
|
||||
is_composition = False
|
||||
sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
from ..auto import CONFIG_MAPPING
|
||||
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@@ -157,7 +157,7 @@ class Qwen2AudioConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "qwen2_audio"
|
||||
is_composition = False
|
||||
sub_configs = {"text_config": AutoConfig, "audio_config": AutoConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -14,9 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""Qwen2VL model configuration"""
|
||||
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...modeling_rope_utils import rope_config_validation
|
||||
from ...utils import logging
|
||||
@@ -27,6 +24,7 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
class Qwen2VLVisionConfig(PretrainedConfig):
|
||||
model_type = "qwen2_vl"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -55,23 +53,6 @@ class Qwen2VLVisionConfig(PretrainedConfig):
|
||||
self.spatial_merge_size = spatial_merge_size
|
||||
self.temporal_patch_size = temporal_patch_size
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
if config_dict.get("model_type") == "qwen2_vl":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class Qwen2VLConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -180,6 +161,7 @@ class Qwen2VLConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "qwen2_vl"
|
||||
sub_configs = {"vision_config": Qwen2VLVisionConfig}
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
|
||||
@@ -14,9 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""Siglip model configuration"""
|
||||
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
|
||||
@@ -79,6 +76,7 @@ class SiglipTextConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "siglip_text_model"
|
||||
base_config_key = "text_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -110,24 +108,6 @@ class SiglipTextConfig(PretrainedConfig):
|
||||
self.hidden_act = hidden_act
|
||||
self.attention_dropout = attention_dropout
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the text config dict if we are loading from SiglipConfig
|
||||
if config_dict.get("model_type") == "siglip":
|
||||
config_dict = config_dict["text_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class SiglipVisionConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -178,6 +158,7 @@ class SiglipVisionConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "siglip_vision_model"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -206,24 +187,6 @@ class SiglipVisionConfig(PretrainedConfig):
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.hidden_act = hidden_act
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from SiglipConfig
|
||||
if config_dict.get("model_type") == "siglip":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class SiglipConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -268,6 +231,7 @@ class SiglipConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "siglip"
|
||||
sub_configs = {"text_config": SiglipTextConfig, "vision_config": SiglipVisionConfig}
|
||||
|
||||
def __init__(self, text_config=None, vision_config=None, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@@ -71,6 +71,7 @@ class SpeechEncoderDecoderConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "speech-encoder-decoder"
|
||||
sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig}
|
||||
is_composition = True
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
from ..auto import CONFIG_MAPPING
|
||||
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@@ -78,7 +78,7 @@ class VideoLlavaConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "video_llava"
|
||||
is_composition = False
|
||||
sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
from ..auto import CONFIG_MAPPING
|
||||
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@@ -72,7 +72,7 @@ class VipLlavaConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "vipllava"
|
||||
is_composition = False
|
||||
sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -78,6 +78,7 @@ class VisionEncoderDecoderConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "vision-encoder-decoder"
|
||||
sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig}
|
||||
is_composition = True
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
|
||||
@@ -75,6 +75,7 @@ class VisionTextDualEncoderConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "vision-text-dual-encoder"
|
||||
sub_configs = {"vision_config": AutoConfig, "text_config": AutoConfig}
|
||||
is_composition = True
|
||||
|
||||
def __init__(self, projection_dim=512, logit_scale_init_value=2.6592, **kwargs):
|
||||
|
||||
@@ -14,9 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""X-CLIP model configuration"""
|
||||
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
|
||||
@@ -79,6 +76,7 @@ class XCLIPTextConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "xclip_text_model"
|
||||
base_config_key = "text_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -112,24 +110,6 @@ class XCLIPTextConfig(PretrainedConfig):
|
||||
self.initializer_factor = initializer_factor
|
||||
self.attention_dropout = attention_dropout
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the text config dict if we are loading from XCLIPConfig
|
||||
if config_dict.get("model_type") == "xclip":
|
||||
config_dict = config_dict["text_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class XCLIPVisionConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -195,6 +175,7 @@ class XCLIPVisionConfig(PretrainedConfig):
|
||||
```"""
|
||||
|
||||
model_type = "xclip_vision_model"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -239,24 +220,6 @@ class XCLIPVisionConfig(PretrainedConfig):
|
||||
self.hidden_act = hidden_act
|
||||
self.drop_path_rate = drop_path_rate
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from XCLIPConfig
|
||||
if config_dict.get("model_type") == "xclip":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class XCLIPConfig(PretrainedConfig):
|
||||
r"""
|
||||
@@ -295,6 +258,7 @@ class XCLIPConfig(PretrainedConfig):
|
||||
"""
|
||||
|
||||
model_type = "xclip"
|
||||
sub_configs = {"text_config": XCLIPTextConfig, "vision_config": XCLIPVisionConfig}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -457,11 +457,20 @@ class AlignModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = AlignModelTester(self)
|
||||
self.config_tester = ConfigTester(
|
||||
self,
|
||||
config_class=AlignConfig,
|
||||
has_text_modality=False,
|
||||
common_properties=["projection_dim", "temperature_init_value"],
|
||||
)
|
||||
|
||||
def test_model(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_model(*config_and_inputs)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
@unittest.skip(reason="Start to fail after using torch `cu118`.")
|
||||
def test_multi_gpu_data_parallel_forward(self):
|
||||
super().test_multi_gpu_data_parallel_forward()
|
||||
|
||||
@@ -452,11 +452,20 @@ class AltCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = AltCLIPModelTester(self)
|
||||
self.config_tester = ConfigTester(
|
||||
self,
|
||||
config_class=AltCLIPConfig,
|
||||
has_text_modality=False,
|
||||
common_properties=["projection_dim", "logit_scale_init_value"],
|
||||
)
|
||||
|
||||
def test_model(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_model(*config_and_inputs)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
@unittest.skip(reason="Hidden_states is tested in individual model tests")
|
||||
def test_hidden_states_output(self):
|
||||
pass
|
||||
|
||||
@@ -449,11 +449,18 @@ class BlipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = BlipModelTester(self)
|
||||
common_properties = ["logit_scale_init_value", "image_text_hidden_size", "projection_dim", "label_smoothing"]
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=BlipConfig, has_text_modality=False, common_properties=common_properties
|
||||
)
|
||||
|
||||
def test_model(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_model(*config_and_inputs)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
@unittest.skip(reason="Hidden_states is tested in individual model tests")
|
||||
def test_hidden_states_output(self):
|
||||
pass
|
||||
|
||||
@@ -482,6 +482,13 @@ class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationT
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = Blip2ForConditionalGenerationDecoderOnlyModelTester(self)
|
||||
common_properties = ["image_token_index", "num_query_tokens", "image_text_hidden_size"]
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=Blip2Config, has_text_modality=False, common_properties=common_properties
|
||||
)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
def test_for_conditional_generation(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
|
||||
@@ -515,11 +515,18 @@ class ClapModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = ClapModelTester(self)
|
||||
common_properties = ["logit_scale_init_value", "projection_hidden_act", "projection_dim"]
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=ClapConfig, has_text_modality=False, common_properties=common_properties
|
||||
)
|
||||
|
||||
def test_model(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_model(*config_and_inputs)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
@unittest.skip(reason="Hidden_states is tested in individual model tests")
|
||||
def test_hidden_states_output(self):
|
||||
pass
|
||||
|
||||
@@ -745,11 +745,18 @@ class CLIPModelTest(CLIPModelTesterMixin, PipelineTesterMixin, unittest.TestCase
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = CLIPModelTester(self)
|
||||
common_properties = ["projection_dim", "logit_scale_init_value"]
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=CLIPConfig, has_text_modality=False, common_properties=common_properties
|
||||
)
|
||||
|
||||
def test_model(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_model(*config_and_inputs)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
@unittest.skip(reason="Hidden_states is tested in individual model tests")
|
||||
def test_hidden_states_output(self):
|
||||
pass
|
||||
|
||||
@@ -472,11 +472,18 @@ class CLIPSegModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = CLIPSegModelTester(self)
|
||||
common_properties = ["projection_dim", "logit_scale_init_value"]
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=CLIPSegConfig, has_text_modality=False, common_properties=common_properties
|
||||
)
|
||||
|
||||
def test_model(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_model(*config_and_inputs)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
def test_model_for_image_segmentation(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_model_for_image_segmentation(*config_and_inputs)
|
||||
|
||||
@@ -414,7 +414,13 @@ class ClvpModelForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase)
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = ClvpModelForConditionalGenerationTester(self)
|
||||
self.clvp_config_tester = ConfigTester(self, config_class=ClvpConfig, hidden_size=32)
|
||||
common_properties = ["projection_dim", "logit_scale_init_value"]
|
||||
self.clvp_config_tester = ConfigTester(
|
||||
self, config_class=ClvpConfig, has_text_modality=False, common_properties=common_properties, hidden_size=32
|
||||
)
|
||||
|
||||
def test_config(self):
|
||||
self.clvp_config_tester.run_common_tests()
|
||||
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
|
||||
@@ -931,11 +931,18 @@ class FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = self.class_for_tester(self)
|
||||
common_properties = ["projection_dim", "logit_scale_init_value", "init_codebook"]
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=FlavaConfig, has_text_modality=False, common_properties=common_properties
|
||||
)
|
||||
|
||||
def test_model(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
self.model_tester.create_and_check_model(*config_and_inputs)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
@unittest.skip(reason="tested in individual model tests")
|
||||
def test_hidden_states_output(self):
|
||||
pass
|
||||
|
||||
@@ -559,11 +559,18 @@ class GroupViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = GroupViTModelTester(self)
|
||||
common_properties = ["projection_dim", "projection_intermediate_dim", "logit_scale_init_value"]
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=GroupViTConfig, has_text_modality=False, common_properties=common_properties
|
||||
)
|
||||
|
||||
def test_model(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_model(*config_and_inputs)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
@unittest.skip(reason="hidden_states are tested in individual model tests")
|
||||
def test_hidden_states_output(self):
|
||||
pass
|
||||
|
||||
@@ -185,7 +185,12 @@ class Idefics2ModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = Idefics2VisionText2TextModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=Idefics2Config, has_text_modality=False)
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=Idefics2Config, has_text_modality=False, common_properties=["image_token_id"]
|
||||
)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
@unittest.skip(reason="input_embeds cannot be passed in without input_ids")
|
||||
def test_inputs_embeds():
|
||||
|
||||
@@ -168,7 +168,12 @@ class Idefics3ModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = Idefics3VisionText2TextModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=Idefics3Config, has_text_modality=False)
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=Idefics3Config, has_text_modality=False, common_properties=["image_token_id"]
|
||||
)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
@unittest.skip(reason="input_embeds cannot be passed in without input_ids")
|
||||
def test_inputs_embeds():
|
||||
|
||||
@@ -486,6 +486,15 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = InstructBlipForConditionalGenerationDecoderOnlyModelTester(self)
|
||||
self.config_tester = ConfigTester(
|
||||
self,
|
||||
config_class=InstructBlipConfig,
|
||||
has_text_modality=False,
|
||||
common_properties=["num_query_tokens", "image_token_index"],
|
||||
)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
def test_for_conditional_generation(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
|
||||
@@ -510,11 +510,18 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester(self)
|
||||
common_properties = ["num_query_tokens", "video_token_index"]
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=InstructBlipVideoConfig, has_text_modality=False, common_properties=common_properties
|
||||
)
|
||||
|
||||
def test_for_conditional_generation(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
@unittest.skip(reason="Hidden_states is tested in individual model tests")
|
||||
def test_hidden_states_output(self):
|
||||
pass
|
||||
|
||||
@@ -304,7 +304,12 @@ class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = Kosmos2ModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=Kosmos2Config, hidden_size=37)
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=Kosmos2Config, has_text_modality=False, common_properties=["latent_query_num"]
|
||||
)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
# overwrite from common to skip `image_to_text_projection.latent_query`
|
||||
def test_initialization(self):
|
||||
|
||||
@@ -194,7 +194,13 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = LlavaVisionText2TextModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=LlavaConfig, has_text_modality=False)
|
||||
common_properties = ["image_token_index", "vision_feature_layer", "image_seq_length"]
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=LlavaConfig, has_text_modality=False, common_properties=common_properties
|
||||
)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
|
||||
def test_inputs_embeds(self):
|
||||
|
||||
@@ -223,7 +223,13 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = LlavaNextVisionText2TextModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=LlavaNextConfig, has_text_modality=False)
|
||||
common_properties = ["image_token_index", "vision_feature_layer", "image_seq_length"]
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=LlavaNextConfig, has_text_modality=False, common_properties=common_properties
|
||||
)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
def test_initialization(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
@@ -240,7 +240,13 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = LlavaNextVideoVisionText2TextModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=LlavaNextVideoConfig, has_text_modality=False)
|
||||
common_properties = ["image_token_index", "video_token_index", "vision_feature_layer", "image_seq_length"]
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=LlavaNextVideoConfig, has_text_modality=False, common_properties=common_properties
|
||||
)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
def test_initialization(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
@@ -226,7 +226,13 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = LlavaOnevisionVisionText2TextModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=LlavaOnevisionConfig, has_text_modality=False)
|
||||
common_properties = ["image_token_index", "video_token_index", "vision_feature_layer"]
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=LlavaOnevisionConfig, has_text_modality=False, common_properties=common_properties
|
||||
)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
def test_initialization(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
@@ -272,7 +272,12 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = MllamaVisionText2TextModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=MllamaConfig, has_text_modality=False)
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=MllamaConfig, has_text_modality=False, common_properties=["image_token_index"]
|
||||
)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
|
||||
def test_inputs_embeds(self):
|
||||
|
||||
@@ -447,6 +447,13 @@ class Owlv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = Owlv2ModelTester(self)
|
||||
common_properties = ["projection_dim", "logit_scale_init_value"]
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=Owlv2Config, has_text_modality=False, common_properties=common_properties
|
||||
)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
def test_model(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
|
||||
@@ -442,6 +442,13 @@ class OwlViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = OwlViTModelTester(self)
|
||||
common_properties = ["projection_dim", "logit_scale_init_value"]
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=OwlViTConfig, has_text_modality=False, common_properties=common_properties
|
||||
)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
def test_model(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
|
||||
@@ -232,6 +232,9 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
|
||||
self.model_tester = Qwen2VLVisionText2TextModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=Qwen2VLConfig, has_text_modality=False)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
def test_initialization(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
|
||||
@@ -667,9 +667,12 @@ class SiglipModelTest(SiglipModelTesterMixin, PipelineTesterMixin, unittest.Test
|
||||
test_disk_offload_bin = False
|
||||
_is_composite = True
|
||||
|
||||
# Copied from tests.models.clip.test_modeling_clip.CLIPModelTest.setUp with CLIP->Siglip
|
||||
def setUp(self):
|
||||
self.model_tester = SiglipModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=SiglipConfig, has_text_modality=False)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
# Copied from tests.models.clip.test_modeling_clip.CLIPModelTest.test_model
|
||||
def test_model(self):
|
||||
|
||||
@@ -217,7 +217,13 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = VideoLlavaVisionText2TextModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=VideoLlavaConfig, has_text_modality=False)
|
||||
common_properties = ["image_token_index", "video_token_index", "vision_feature_layer", "image_seq_length"]
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=VideoLlavaConfig, has_text_modality=False, common_properties=common_properties
|
||||
)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
|
||||
@@ -179,7 +179,13 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = VipLlavaVisionText2TextModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=VipLlavaConfig, has_text_modality=False)
|
||||
common_properties = ["image_token_index", "vision_feature_layers", "image_seq_length"]
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=VipLlavaConfig, has_text_modality=False, common_properties=common_properties
|
||||
)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
|
||||
def test_inputs_embeds(self):
|
||||
|
||||
@@ -547,6 +547,13 @@ class XCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = XCLIPModelTester(self)
|
||||
common_properties = ["projection_dim", "prompt_layers", "prompt_num_attention_heads"]
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=XCLIPConfig, has_text_modality=False, common_properties=common_properties
|
||||
)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
def test_model(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
|
||||
@@ -17,12 +17,17 @@ import copy
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from transformers import is_torch_available
|
||||
from transformers.utils import direct_transformers_import
|
||||
|
||||
from .utils.test_configuration_utils import config_common_kwargs
|
||||
|
||||
|
||||
transformers_module = direct_transformers_import(Path(__file__).parent)
|
||||
|
||||
|
||||
class ConfigTester:
|
||||
def __init__(self, parent, config_class=None, has_text_modality=True, common_properties=None, **kwargs):
|
||||
self.parent = parent
|
||||
@@ -35,9 +40,10 @@ class ConfigTester:
|
||||
config = self.config_class(**self.inputs_dict)
|
||||
common_properties = (
|
||||
["hidden_size", "num_attention_heads", "num_hidden_layers"]
|
||||
if self.common_properties is None
|
||||
if self.common_properties is None and not self.config_class.sub_configs
|
||||
else self.common_properties
|
||||
)
|
||||
common_properties = [] if common_properties is None else common_properties
|
||||
|
||||
# Add common fields for text models
|
||||
if self.has_text_modality:
|
||||
@@ -110,6 +116,44 @@ class ConfigTester:
|
||||
|
||||
self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
|
||||
|
||||
def create_and_test_config_from_and_save_pretrained_composite(self):
|
||||
"""
|
||||
Tests that composite or nested configs can be loaded and saved correctly. In case the config
|
||||
has a sub-config, we should be able to call `sub_config.from_pretrained('general_config_file')`
|
||||
and get the same result as if we had loaded the whole config and obtained `config.sub_config` from it.
|
||||
"""
|
||||
config = self.config_class(**self.inputs_dict)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
config.save_pretrained(tmpdirname)
|
||||
general_config_loaded = self.config_class.from_pretrained(tmpdirname)
|
||||
general_config_dict = config.to_dict()
|
||||
|
||||
# Iterate over all sub_configs if there are any and load them with their own classes
|
||||
sub_configs = self.config_class.sub_configs
|
||||
for sub_config_key, sub_class in sub_configs.items():
|
||||
if sub_class.__name__ == "AutoConfig":
|
||||
sub_class = sub_class.for_model(**general_config_dict[sub_config_key]).__class__
|
||||
sub_config_loaded = sub_class.from_pretrained(tmpdirname)
|
||||
else:
|
||||
sub_config_loaded = sub_class.from_pretrained(tmpdirname)
|
||||
|
||||
# Pop `transformers_version`, it never exists when a config is part of a general composite config
|
||||
# Verify that loading with subconfig class results in same dict as if we loaded with general composite config class
|
||||
sub_config_loaded_dict = sub_config_loaded.to_dict()
|
||||
sub_config_loaded_dict.pop("transformers_version", None)
|
||||
self.parent.assertEqual(sub_config_loaded_dict, general_config_dict[sub_config_key])
|
||||
|
||||
# Verify that the loaded config type is the same as in the general config
|
||||
type_from_general_config = type(getattr(general_config_loaded, sub_config_key))
|
||||
self.parent.assertTrue(isinstance(sub_config_loaded, type_from_general_config))
|
||||
|
||||
# Now save only the sub-config and load it back to make sure the whole load-save-load pipeline works
|
||||
with tempfile.TemporaryDirectory() as tmpdirname2:
|
||||
sub_config_loaded.save_pretrained(tmpdirname2)
|
||||
sub_config_loaded_2 = sub_class.from_pretrained(tmpdirname2)
|
||||
self.parent.assertEqual(sub_config_loaded.to_dict(), sub_config_loaded_2.to_dict())
|
||||
|
||||
def create_and_test_config_with_num_labels(self):
|
||||
config = self.config_class(**self.inputs_dict, num_labels=5)
|
||||
self.parent.assertEqual(len(config.id2label), 5)
|
||||
@@ -128,6 +172,9 @@ class ConfigTester:
|
||||
self.parent.assertIsNotNone(config)
|
||||
|
||||
def check_config_arguments_init(self):
|
||||
if self.config_class.sub_configs:
|
||||
return # TODO: @raushan composite models are not consistent in how they set general params
|
||||
|
||||
kwargs = copy.deepcopy(config_common_kwargs)
|
||||
config = self.config_class(**kwargs)
|
||||
wrong_values = []
|
||||
@@ -153,6 +200,7 @@ class ConfigTester:
|
||||
self.create_and_test_config_to_json_file()
|
||||
self.create_and_test_config_from_and_save_pretrained()
|
||||
self.create_and_test_config_from_and_save_pretrained_subfolder()
|
||||
self.create_and_test_config_from_and_save_pretrained_composite()
|
||||
self.create_and_test_config_with_num_labels()
|
||||
self.check_config_can_be_init_without_params()
|
||||
self.check_config_arguments_init()
|
||||
|
||||
@@ -3802,22 +3802,18 @@ class ModelTesterMixin:
|
||||
self.skipTest("Model is not a composite model.")
|
||||
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
sub_configs = {
|
||||
key: getattr(config, key) for key in config if isinstance(getattr(config, key), PretrainedConfig)
|
||||
}
|
||||
|
||||
# set eager as it will be the one supported in all models
|
||||
# we just need to test if passing 'attn_implementation' as a dict fails or not
|
||||
attn_implementation_per_subconfig = {}
|
||||
for key, sub_config in sub_configs.items():
|
||||
for key in config.sub_configs.keys():
|
||||
attn_implementation_per_subconfig[key] = "eager"
|
||||
|
||||
config._attn_implementation = attn_implementation_per_subconfig
|
||||
model = model_class(config)
|
||||
for key in model.config:
|
||||
if isinstance(getattr(model.config, key), PretrainedConfig):
|
||||
sub_config = getattr(model.config, key)
|
||||
self.assertTrue(sub_config._attn_implementation == "eager")
|
||||
for key in config.sub_configs.keys():
|
||||
sub_config = getattr(model.config, key)
|
||||
self.assertTrue(sub_config._attn_implementation == "eager")
|
||||
|
||||
for name, submodule in model.named_modules():
|
||||
class_name = submodule.__class__.__name__
|
||||
@@ -3826,7 +3822,7 @@ class ModelTesterMixin:
|
||||
or "SdpaSelfAttention" in class_name
|
||||
or "FlashAttention" in class_name
|
||||
):
|
||||
raise ValueError("The eager model should not have SDPA/FA2 attention layers")
|
||||
raise ValueError(f"The eager model should not have SDPA/FA2 attention layers but got {class_name}")
|
||||
|
||||
@require_torch_sdpa
|
||||
def test_sdpa_can_dispatch_non_composite_models(self):
|
||||
|
||||
Reference in New Issue
Block a user