Load sub-configs from composite configs (#34410)
* save/load sub-configs * nit forgot these * fix copies * move test to common * use dict for sub-configs * add load-save-load test * clean up modeling check * oops these are correct keys * fix some tests, missed some composite configs * this model was missed
This commit is contained in:
committed by
GitHub
parent
5e1fd4e204
commit
893ad04fad
@@ -190,6 +190,8 @@ class PretrainedConfig(PushToHubMixin):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
model_type: str = ""
|
model_type: str = ""
|
||||||
|
base_config_key: str = ""
|
||||||
|
sub_configs: Dict[str, "PretrainedConfig"] = {}
|
||||||
is_composition: bool = False
|
is_composition: bool = False
|
||||||
attribute_map: Dict[str, str] = {}
|
attribute_map: Dict[str, str] = {}
|
||||||
_auto_class: Optional[str] = None
|
_auto_class: Optional[str] = None
|
||||||
@@ -543,11 +545,22 @@ class PretrainedConfig(PushToHubMixin):
|
|||||||
cls._set_token_in_kwargs(kwargs, token)
|
cls._set_token_in_kwargs(kwargs, token)
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||||
|
if cls.base_config_key and cls.base_config_key in config_dict:
|
||||||
|
config_dict = config_dict[cls.base_config_key]
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
||||||
logger.warning(
|
# sometimes the config has no `base_config_key` if the config is used in several composite models
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
# e.g. LlamaConfig. In that case we try to see if there is match in `model_type` before raising a warning
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
for k, v in config_dict.items():
|
||||||
)
|
if isinstance(v, dict) and v.get("model_type") == cls.model_type:
|
||||||
|
config_dict = v
|
||||||
|
|
||||||
|
# raise warning only if we still can't see a match in `model_type`
|
||||||
|
if config_dict["model_type"] != cls.model_type:
|
||||||
|
logger.warning(
|
||||||
|
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||||
|
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
||||||
|
)
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
return cls.from_dict(config_dict, **kwargs)
|
||||||
|
|
||||||
|
|||||||
@@ -1608,15 +1608,14 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
|
|||||||
# Below we check if a config is composite and manually prepare a dict of attn impl if not already passed as a dict.
|
# Below we check if a config is composite and manually prepare a dict of attn impl if not already passed as a dict.
|
||||||
# Later each sub-module will dispatch with its own attn impl, by calling `XXXModel._from_config(config.text_config)`
|
# Later each sub-module will dispatch with its own attn impl, by calling `XXXModel._from_config(config.text_config)`
|
||||||
# If any of sub-modules doesn't support requested attn, an error will be raised. See https://github.com/huggingface/transformers/pull/32238
|
# If any of sub-modules doesn't support requested attn, an error will be raised. See https://github.com/huggingface/transformers/pull/32238
|
||||||
for key in config:
|
for key in config.sub_configs.keys():
|
||||||
if isinstance(getattr(config, key), PretrainedConfig):
|
sub_config = getattr(config, key)
|
||||||
sub_config = getattr(config, key)
|
curr_attn_implementation = (
|
||||||
curr_attn_implementation = (
|
requested_attn_implementation
|
||||||
requested_attn_implementation
|
if not isinstance(requested_attn_implementation, dict)
|
||||||
if not isinstance(requested_attn_implementation, dict)
|
else requested_attn_implementation.get(key, None)
|
||||||
else requested_attn_implementation.get(key, None)
|
)
|
||||||
)
|
sub_config._attn_implementation_internal = curr_attn_implementation
|
||||||
sub_config._attn_implementation_internal = curr_attn_implementation
|
|
||||||
|
|
||||||
if use_flash_attention_2:
|
if use_flash_attention_2:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
|
|||||||
@@ -14,8 +14,7 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""ALIGN model configuration"""
|
"""ALIGN model configuration"""
|
||||||
|
|
||||||
import os
|
from typing import TYPE_CHECKING, List
|
||||||
from typing import TYPE_CHECKING, List, Union
|
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@@ -95,6 +94,7 @@ class AlignTextConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "align_text_model"
|
model_type = "align_text_model"
|
||||||
|
base_config_key = "text_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -133,24 +133,6 @@ class AlignTextConfig(PretrainedConfig):
|
|||||||
self.use_cache = use_cache
|
self.use_cache = use_cache
|
||||||
self.pad_token_id = pad_token_id
|
self.pad_token_id = pad_token_id
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the text config dict if we are loading from AlignConfig
|
|
||||||
if config_dict.get("model_type") == "align":
|
|
||||||
config_dict = config_dict["text_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class AlignVisionConfig(PretrainedConfig):
|
class AlignVisionConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -223,6 +205,7 @@ class AlignVisionConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "align_vision_model"
|
model_type = "align_vision_model"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -272,24 +255,6 @@ class AlignVisionConfig(PretrainedConfig):
|
|||||||
self.drop_connect_rate = drop_connect_rate
|
self.drop_connect_rate = drop_connect_rate
|
||||||
self.num_hidden_layers = sum(num_block_repeats) * 4
|
self.num_hidden_layers = sum(num_block_repeats) * 4
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the vision config dict if we are loading from AlignConfig
|
|
||||||
if config_dict.get("model_type") == "align":
|
|
||||||
config_dict = config_dict["vision_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class AlignConfig(PretrainedConfig):
|
class AlignConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -340,6 +305,7 @@ class AlignConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "align"
|
model_type = "align"
|
||||||
|
sub_configs = {"text_config": AlignTextConfig, "vision_config": AlignVisionConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -14,9 +14,6 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""AltCLIP model configuration"""
|
"""AltCLIP model configuration"""
|
||||||
|
|
||||||
import os
|
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
|
|
||||||
@@ -199,6 +196,7 @@ class AltCLIPVisionConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "altclip_vision_model"
|
model_type = "altclip_vision_model"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -233,24 +231,6 @@ class AltCLIPVisionConfig(PretrainedConfig):
|
|||||||
self.layer_norm_eps = layer_norm_eps
|
self.layer_norm_eps = layer_norm_eps
|
||||||
self.hidden_act = hidden_act
|
self.hidden_act = hidden_act
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the vision config dict if we are loading from AltCLIPConfig
|
|
||||||
if config_dict.get("model_type") == "altclip":
|
|
||||||
config_dict = config_dict["vision_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class AltCLIPConfig(PretrainedConfig):
|
class AltCLIPConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -298,6 +278,7 @@ class AltCLIPConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "altclip"
|
model_type = "altclip"
|
||||||
|
sub_configs = {"text_config": AltCLIPTextConfig, "vision_config": AltCLIPVisionConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, text_config=None, vision_config=None, projection_dim=768, logit_scale_init_value=2.6592, **kwargs
|
self, text_config=None, vision_config=None, projection_dim=768, logit_scale_init_value=2.6592, **kwargs
|
||||||
|
|||||||
@@ -14,12 +14,11 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""BARK model configuration"""
|
"""BARK model configuration"""
|
||||||
|
|
||||||
import os
|
from typing import Dict
|
||||||
from typing import Dict, Optional, Union
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import add_start_docstrings, logging
|
from ...utils import add_start_docstrings, logging
|
||||||
from ..auto import CONFIG_MAPPING
|
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
@@ -64,7 +63,6 @@ BARK_SUBMODELCONFIG_START_DOCSTRING = """
|
|||||||
|
|
||||||
|
|
||||||
class BarkSubModelConfig(PretrainedConfig):
|
class BarkSubModelConfig(PretrainedConfig):
|
||||||
model_type = "bark_module"
|
|
||||||
keys_to_ignore_at_inference = ["past_key_values"]
|
keys_to_ignore_at_inference = ["past_key_values"]
|
||||||
|
|
||||||
attribute_map = {
|
attribute_map = {
|
||||||
@@ -101,38 +99,6 @@ class BarkSubModelConfig(PretrainedConfig):
|
|||||||
|
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(
|
|
||||||
cls,
|
|
||||||
pretrained_model_name_or_path: Union[str, os.PathLike],
|
|
||||||
cache_dir: Optional[Union[str, os.PathLike]] = None,
|
|
||||||
force_download: bool = False,
|
|
||||||
local_files_only: bool = False,
|
|
||||||
token: Optional[Union[str, bool]] = None,
|
|
||||||
revision: str = "main",
|
|
||||||
**kwargs,
|
|
||||||
) -> "PretrainedConfig":
|
|
||||||
kwargs["cache_dir"] = cache_dir
|
|
||||||
kwargs["force_download"] = force_download
|
|
||||||
kwargs["local_files_only"] = local_files_only
|
|
||||||
kwargs["revision"] = revision
|
|
||||||
|
|
||||||
cls._set_token_in_kwargs(kwargs, token)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the config dict if we are loading from Bark
|
|
||||||
if config_dict.get("model_type") == "bark":
|
|
||||||
config_dict = config_dict[f"{cls.model_type}_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
@add_start_docstrings(
|
@add_start_docstrings(
|
||||||
BARK_SUBMODELCONFIG_START_DOCSTRING.format(config="BarkSemanticConfig", model="BarkSemanticModel"),
|
BARK_SUBMODELCONFIG_START_DOCSTRING.format(config="BarkSemanticConfig", model="BarkSemanticModel"),
|
||||||
@@ -154,6 +120,7 @@ class BarkSubModelConfig(PretrainedConfig):
|
|||||||
)
|
)
|
||||||
class BarkSemanticConfig(BarkSubModelConfig):
|
class BarkSemanticConfig(BarkSubModelConfig):
|
||||||
model_type = "semantic"
|
model_type = "semantic"
|
||||||
|
base_config_key = "semantic_config"
|
||||||
|
|
||||||
|
|
||||||
@add_start_docstrings(
|
@add_start_docstrings(
|
||||||
@@ -176,6 +143,7 @@ class BarkSemanticConfig(BarkSubModelConfig):
|
|||||||
)
|
)
|
||||||
class BarkCoarseConfig(BarkSubModelConfig):
|
class BarkCoarseConfig(BarkSubModelConfig):
|
||||||
model_type = "coarse_acoustics"
|
model_type = "coarse_acoustics"
|
||||||
|
base_config_key = "coarse_acoustics_config"
|
||||||
|
|
||||||
|
|
||||||
@add_start_docstrings(
|
@add_start_docstrings(
|
||||||
@@ -203,6 +171,7 @@ class BarkCoarseConfig(BarkSubModelConfig):
|
|||||||
)
|
)
|
||||||
class BarkFineConfig(BarkSubModelConfig):
|
class BarkFineConfig(BarkSubModelConfig):
|
||||||
model_type = "fine_acoustics"
|
model_type = "fine_acoustics"
|
||||||
|
base_config_key = "fine_acoustics_config"
|
||||||
|
|
||||||
def __init__(self, tie_word_embeddings=True, n_codes_total=8, n_codes_given=1, **kwargs):
|
def __init__(self, tie_word_embeddings=True, n_codes_total=8, n_codes_given=1, **kwargs):
|
||||||
self.n_codes_total = n_codes_total
|
self.n_codes_total = n_codes_total
|
||||||
@@ -265,6 +234,12 @@ class BarkConfig(PretrainedConfig):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
model_type = "bark"
|
model_type = "bark"
|
||||||
|
sub_configs = {
|
||||||
|
"semantic_config": BarkSemanticConfig,
|
||||||
|
"coarse_acoustics_config": BarkCoarseConfig,
|
||||||
|
"fine_acoustics_config": BarkFineConfig,
|
||||||
|
"codec_config": AutoConfig,
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -14,9 +14,6 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""Blip model configuration"""
|
"""Blip model configuration"""
|
||||||
|
|
||||||
import os
|
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
|
|
||||||
@@ -96,6 +93,7 @@ class BlipTextConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "blip_text_model"
|
model_type = "blip_text_model"
|
||||||
|
base_config_key = "text_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -146,24 +144,6 @@ class BlipTextConfig(PretrainedConfig):
|
|||||||
self.use_cache = use_cache
|
self.use_cache = use_cache
|
||||||
self.label_smoothing = label_smoothing
|
self.label_smoothing = label_smoothing
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the text config dict if we are loading from BlipConfig
|
|
||||||
if config_dict.get("model_type") == "blip":
|
|
||||||
config_dict = config_dict["text_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class BlipVisionConfig(PretrainedConfig):
|
class BlipVisionConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -215,6 +195,7 @@ class BlipVisionConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "blip_vision_model"
|
model_type = "blip_vision_model"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -245,24 +226,6 @@ class BlipVisionConfig(PretrainedConfig):
|
|||||||
self.layer_norm_eps = layer_norm_eps
|
self.layer_norm_eps = layer_norm_eps
|
||||||
self.hidden_act = hidden_act
|
self.hidden_act = hidden_act
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the vision config dict if we are loading from BlipConfig
|
|
||||||
if config_dict.get("model_type") == "blip":
|
|
||||||
config_dict = config_dict["vision_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class BlipConfig(PretrainedConfig):
|
class BlipConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -316,6 +279,7 @@ class BlipConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "blip"
|
model_type = "blip"
|
||||||
|
sub_configs = {"text_config": BlipTextConfig, "vision_config": BlipVisionConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -14,13 +14,12 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""BLIP-2 model configuration"""
|
"""BLIP-2 model configuration"""
|
||||||
|
|
||||||
import os
|
from typing import Optional
|
||||||
from typing import Optional, Union
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
|
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
from ..auto import CONFIG_MAPPING
|
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
@@ -76,6 +75,7 @@ class Blip2VisionConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "blip_2_vision_model"
|
model_type = "blip_2_vision_model"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -106,24 +106,6 @@ class Blip2VisionConfig(PretrainedConfig):
|
|||||||
self.hidden_act = hidden_act
|
self.hidden_act = hidden_act
|
||||||
self.qkv_bias = qkv_bias
|
self.qkv_bias = qkv_bias
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the vision config dict if we are loading from Blip2Config
|
|
||||||
if config_dict.get("model_type") == "blip-2":
|
|
||||||
config_dict = config_dict["vision_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class Blip2QFormerConfig(PretrainedConfig):
|
class Blip2QFormerConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -190,6 +172,7 @@ class Blip2QFormerConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "blip_2_qformer"
|
model_type = "blip_2_qformer"
|
||||||
|
base_config_key = "qformer_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -229,24 +212,6 @@ class Blip2QFormerConfig(PretrainedConfig):
|
|||||||
self.encoder_hidden_size = encoder_hidden_size
|
self.encoder_hidden_size = encoder_hidden_size
|
||||||
self.use_qformer_text_input = use_qformer_text_input
|
self.use_qformer_text_input = use_qformer_text_input
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the qformer config dict if we are loading from Blip2Config
|
|
||||||
if config_dict.get("model_type") == "blip-2":
|
|
||||||
config_dict = config_dict["qformer_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class Blip2Config(PretrainedConfig):
|
class Blip2Config(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -306,6 +271,7 @@ class Blip2Config(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "blip-2"
|
model_type = "blip-2"
|
||||||
|
sub_configs = {"text_config": AutoConfig, "qformer_config": Blip2QFormerConfig, "vision_config": Blip2VisionConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -14,9 +14,6 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""BridgeTower model configuration"""
|
"""BridgeTower model configuration"""
|
||||||
|
|
||||||
import os
|
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
|
|
||||||
@@ -68,6 +65,7 @@ class BridgeTowerVisionConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "bridgetower_vision_model"
|
model_type = "bridgetower_vision_model"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -95,21 +93,6 @@ class BridgeTowerVisionConfig(PretrainedConfig):
|
|||||||
self.share_layernorm = share_layernorm
|
self.share_layernorm = share_layernorm
|
||||||
self.remove_last_layer = remove_last_layer
|
self.remove_last_layer = remove_last_layer
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
if config_dict.get("model_type") == "bridgetower":
|
|
||||||
config_dict = config_dict["text_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class BridgeTowerTextConfig(PretrainedConfig):
|
class BridgeTowerTextConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -175,6 +158,7 @@ class BridgeTowerTextConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "bridgetower_text_model"
|
model_type = "bridgetower_text_model"
|
||||||
|
base_config_key = "text_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -217,21 +201,6 @@ class BridgeTowerTextConfig(PretrainedConfig):
|
|||||||
self.bos_token_id = bos_token_id
|
self.bos_token_id = bos_token_id
|
||||||
self.eos_token_id = eos_token_id
|
self.eos_token_id = eos_token_id
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
if config_dict.get("model_type") == "bridgetower":
|
|
||||||
config_dict = config_dict["text_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class BridgeTowerConfig(PretrainedConfig):
|
class BridgeTowerConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -288,6 +257,7 @@ class BridgeTowerConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "bridgetower"
|
model_type = "bridgetower"
|
||||||
|
sub_configs = {"text_config": BridgeTowerTextConfig, "vision_config": BridgeTowerVisionConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -62,6 +62,7 @@ class ChameleonVQVAEConfig(PretrainedConfig):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
model_type = "chameleon_vqgan"
|
model_type = "chameleon_vqgan"
|
||||||
|
base_config_key = "vq_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -187,6 +188,7 @@ class ChameleonConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "chameleon"
|
model_type = "chameleon"
|
||||||
|
sub_configs = {"vq_config": ChameleonVQVAEConfig}
|
||||||
keys_to_ignore_at_inference = ["past_key_values"]
|
keys_to_ignore_at_inference = ["past_key_values"]
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
|||||||
@@ -14,9 +14,8 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""Chinese-CLIP model configuration"""
|
"""Chinese-CLIP model configuration"""
|
||||||
|
|
||||||
import os
|
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
|
from typing import TYPE_CHECKING, Any, Mapping, Optional
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@@ -102,6 +101,7 @@ class ChineseCLIPTextConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "chinese_clip_text_model"
|
model_type = "chinese_clip_text_model"
|
||||||
|
base_config_key = "text_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -141,24 +141,6 @@ class ChineseCLIPTextConfig(PretrainedConfig):
|
|||||||
self.position_embedding_type = position_embedding_type
|
self.position_embedding_type = position_embedding_type
|
||||||
self.use_cache = use_cache
|
self.use_cache = use_cache
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the vision config dict if we are loading from ChineseCLIPConfig
|
|
||||||
if config_dict.get("model_type") == "chinese_clip":
|
|
||||||
config_dict = config_dict["text_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class ChineseCLIPVisionConfig(PretrainedConfig):
|
class ChineseCLIPVisionConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -215,6 +197,7 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "chinese_clip_vision_model"
|
model_type = "chinese_clip_vision_model"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -249,24 +232,6 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
|
|||||||
self.layer_norm_eps = layer_norm_eps
|
self.layer_norm_eps = layer_norm_eps
|
||||||
self.hidden_act = hidden_act
|
self.hidden_act = hidden_act
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the vision config dict if we are loading from ChineseCLIPConfig
|
|
||||||
if config_dict.get("model_type") == "chinese_clip":
|
|
||||||
config_dict = config_dict["vision_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class ChineseCLIPConfig(PretrainedConfig):
|
class ChineseCLIPConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -316,6 +281,7 @@ class ChineseCLIPConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "chinese_clip"
|
model_type = "chinese_clip"
|
||||||
|
sub_configs = {"text_config": ChineseCLIPTextConfig, "vision_config": ChineseCLIPVisionConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
|
self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
|
||||||
|
|||||||
@@ -14,9 +14,6 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""CLAP model configuration"""
|
"""CLAP model configuration"""
|
||||||
|
|
||||||
import os
|
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
|
|
||||||
@@ -94,6 +91,7 @@ class ClapTextConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "clap_text_model"
|
model_type = "clap_text_model"
|
||||||
|
base_config_key = "text_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -137,24 +135,6 @@ class ClapTextConfig(PretrainedConfig):
|
|||||||
self.projection_hidden_act = projection_hidden_act
|
self.projection_hidden_act = projection_hidden_act
|
||||||
self.projection_dim = projection_dim
|
self.projection_dim = projection_dim
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the text config dict if we are loading from ClapConfig
|
|
||||||
if config_dict.get("model_type") == "clap":
|
|
||||||
config_dict = config_dict["text_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class ClapAudioConfig(PretrainedConfig):
|
class ClapAudioConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -245,6 +225,7 @@ class ClapAudioConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "clap_audio_model"
|
model_type = "clap_audio_model"
|
||||||
|
base_config_key = "audio_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -307,24 +288,6 @@ class ClapAudioConfig(PretrainedConfig):
|
|||||||
self.initializer_factor = initializer_factor
|
self.initializer_factor = initializer_factor
|
||||||
self.projection_hidden_act = projection_hidden_act
|
self.projection_hidden_act = projection_hidden_act
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the audio config dict if we are loading from ClapConfig
|
|
||||||
if config_dict.get("model_type") == "clap":
|
|
||||||
config_dict = config_dict["audio_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class ClapConfig(PretrainedConfig):
|
class ClapConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -377,6 +340,7 @@ class ClapConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "clap"
|
model_type = "clap"
|
||||||
|
sub_configs = {"text_config": ClapTextConfig, "audio_config": ClapAudioConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -14,9 +14,8 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""CLIP model configuration"""
|
"""CLIP model configuration"""
|
||||||
|
|
||||||
import os
|
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
|
from typing import TYPE_CHECKING, Any, Mapping, Optional
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@@ -93,6 +92,7 @@ class CLIPTextConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "clip_text_model"
|
model_type = "clip_text_model"
|
||||||
|
base_config_key = "text_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -130,24 +130,6 @@ class CLIPTextConfig(PretrainedConfig):
|
|||||||
self.initializer_factor = initializer_factor
|
self.initializer_factor = initializer_factor
|
||||||
self.attention_dropout = attention_dropout
|
self.attention_dropout = attention_dropout
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the text config dict if we are loading from CLIPConfig
|
|
||||||
if config_dict.get("model_type") == "clip":
|
|
||||||
config_dict = config_dict["text_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class CLIPVisionConfig(PretrainedConfig):
|
class CLIPVisionConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -205,6 +187,7 @@ class CLIPVisionConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "clip_vision_model"
|
model_type = "clip_vision_model"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -239,24 +222,6 @@ class CLIPVisionConfig(PretrainedConfig):
|
|||||||
self.layer_norm_eps = layer_norm_eps
|
self.layer_norm_eps = layer_norm_eps
|
||||||
self.hidden_act = hidden_act
|
self.hidden_act = hidden_act
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the vision config dict if we are loading from CLIPConfig
|
|
||||||
if config_dict.get("model_type") == "clip":
|
|
||||||
config_dict = config_dict["vision_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class CLIPConfig(PretrainedConfig):
|
class CLIPConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -305,6 +270,7 @@ class CLIPConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "clip"
|
model_type = "clip"
|
||||||
|
sub_configs = {"text_config": CLIPTextConfig, "vision_config": CLIPVisionConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
|
self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
|
||||||
|
|||||||
@@ -14,9 +14,6 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""CLIPSeg model configuration"""
|
"""CLIPSeg model configuration"""
|
||||||
|
|
||||||
import os
|
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
|
|
||||||
@@ -84,6 +81,7 @@ class CLIPSegTextConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "clipseg_text_model"
|
model_type = "clipseg_text_model"
|
||||||
|
base_config_key = "text_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -117,24 +115,6 @@ class CLIPSegTextConfig(PretrainedConfig):
|
|||||||
self.initializer_factor = initializer_factor
|
self.initializer_factor = initializer_factor
|
||||||
self.attention_dropout = attention_dropout
|
self.attention_dropout = attention_dropout
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the text config dict if we are loading from CLIPSegConfig
|
|
||||||
if config_dict.get("model_type") == "clipseg":
|
|
||||||
config_dict = config_dict["text_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class CLIPSegVisionConfig(PretrainedConfig):
|
class CLIPSegVisionConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -190,6 +170,7 @@ class CLIPSegVisionConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "clipseg_vision_model"
|
model_type = "clipseg_vision_model"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -222,24 +203,6 @@ class CLIPSegVisionConfig(PretrainedConfig):
|
|||||||
self.layer_norm_eps = layer_norm_eps
|
self.layer_norm_eps = layer_norm_eps
|
||||||
self.hidden_act = hidden_act
|
self.hidden_act = hidden_act
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the vision config dict if we are loading from CLIPSegConfig
|
|
||||||
if config_dict.get("model_type") == "clipseg":
|
|
||||||
config_dict = config_dict["vision_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class CLIPSegConfig(PretrainedConfig):
|
class CLIPSegConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -306,6 +269,7 @@ class CLIPSegConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "clipseg"
|
model_type = "clipseg"
|
||||||
|
sub_configs = {"text_config": CLIPSegTextConfig, "vision_config": CLIPSegVisionConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -91,6 +91,7 @@ class ClvpEncoderConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "clvp_encoder"
|
model_type = "clvp_encoder"
|
||||||
|
base_config_key = ["text_config", "speech_config"]
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -141,7 +142,7 @@ class ClvpEncoderConfig(PretrainedConfig):
|
|||||||
|
|
||||||
# make sure to have the config_type be either "text_config" or "speech_config"
|
# make sure to have the config_type be either "text_config" or "speech_config"
|
||||||
# this is to make sure that we can load only text or speech configs from the nested ClvpConfig.
|
# this is to make sure that we can load only text or speech configs from the nested ClvpConfig.
|
||||||
if config_type not in ["text_config", "speech_config"]:
|
if config_type not in cls.base_config_key:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"We can only load either 'text_config' or 'speech_config' but you are trying to load" f"{config_type}"
|
f"We can only load either 'text_config' or 'speech_config' but you are trying to load" f"{config_type}"
|
||||||
)
|
)
|
||||||
@@ -253,6 +254,7 @@ class ClvpDecoderConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "clvp_decoder"
|
model_type = "clvp_decoder"
|
||||||
|
base_config_key = "decoder_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -314,24 +316,6 @@ class ClvpDecoderConfig(PretrainedConfig):
|
|||||||
|
|
||||||
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
|
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the speech config dict if we are loading from ClvpConfig
|
|
||||||
if config_dict.get("model_type") == "clvp":
|
|
||||||
config_dict = config_dict["decoder_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class ClvpConfig(PretrainedConfig):
|
class ClvpConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -386,7 +370,11 @@ class ClvpConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "clvp"
|
model_type = "clvp"
|
||||||
is_composition = True
|
sub_configs = {
|
||||||
|
"text_config": ClvpEncoderConfig,
|
||||||
|
"speech_config": ClvpEncoderConfig,
|
||||||
|
"decoder_config": ClvpDecoderConfig,
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -41,6 +41,8 @@ class DbrxAttentionConfig(PretrainedConfig):
|
|||||||
rope_theta (`float`, *optional*, defaults to 10000.0): The base frequency for rope.
|
rope_theta (`float`, *optional*, defaults to 10000.0): The base frequency for rope.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
base_config_key = "attn_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
attn_pdrop: float = 0.0,
|
attn_pdrop: float = 0.0,
|
||||||
@@ -55,29 +57,12 @@ class DbrxAttentionConfig(PretrainedConfig):
|
|||||||
self.kv_n_heads = kv_n_heads
|
self.kv_n_heads = kv_n_heads
|
||||||
self.rope_theta = rope_theta
|
self.rope_theta = rope_theta
|
||||||
|
|
||||||
for k in ["model_type"]:
|
for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash"]:
|
||||||
if k in kwargs:
|
if k in kwargs:
|
||||||
kwargs.pop(k)
|
kwargs.pop(k)
|
||||||
if len(kwargs) != 0:
|
if len(kwargs) != 0:
|
||||||
raise ValueError(f"Found unknown {kwargs=}")
|
raise ValueError(f"Found unknown {kwargs=}")
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
if config_dict.get("model_type") == "dbrx":
|
|
||||||
config_dict = config_dict["attn_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class DbrxFFNConfig(PretrainedConfig):
|
class DbrxFFNConfig(PretrainedConfig):
|
||||||
"""Configuration class for Dbrx FFN.
|
"""Configuration class for Dbrx FFN.
|
||||||
@@ -100,6 +85,8 @@ class DbrxFFNConfig(PretrainedConfig):
|
|||||||
moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights.
|
moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
base_config_key = "ffn_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
ffn_act_fn: dict = None,
|
ffn_act_fn: dict = None,
|
||||||
@@ -122,29 +109,12 @@ class DbrxFFNConfig(PretrainedConfig):
|
|||||||
self.moe_loss_weight = moe_loss_weight
|
self.moe_loss_weight = moe_loss_weight
|
||||||
self.moe_normalize_expert_weights = moe_normalize_expert_weights
|
self.moe_normalize_expert_weights = moe_normalize_expert_weights
|
||||||
|
|
||||||
for k in ["model_type"]:
|
for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash"]:
|
||||||
if k in kwargs:
|
if k in kwargs:
|
||||||
kwargs.pop(k)
|
kwargs.pop(k)
|
||||||
if len(kwargs) != 0:
|
if len(kwargs) != 0:
|
||||||
raise ValueError(f"Found unknown {kwargs=}")
|
raise ValueError(f"Found unknown {kwargs=}")
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
if config_dict.get("model_type") == "dbrx":
|
|
||||||
config_dict = config_dict["ffn_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class DbrxConfig(PretrainedConfig):
|
class DbrxConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -202,6 +172,7 @@ class DbrxConfig(PretrainedConfig):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
model_type = "dbrx"
|
model_type = "dbrx"
|
||||||
|
sub_configs = {"attn_config": DbrxAttentionConfig, "ffn_config": DbrxFFNConfig}
|
||||||
attribute_map = {
|
attribute_map = {
|
||||||
"num_attention_heads": "n_heads",
|
"num_attention_heads": "n_heads",
|
||||||
"hidden_size": "d_model",
|
"hidden_size": "d_model",
|
||||||
|
|||||||
@@ -17,6 +17,7 @@
|
|||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
|
from ..auto import AutoConfig
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
@@ -70,6 +71,7 @@ class EncoderDecoderConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "encoder-decoder"
|
model_type = "encoder-decoder"
|
||||||
|
sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig}
|
||||||
is_composition = True
|
is_composition = True
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
@@ -84,8 +86,6 @@ class EncoderDecoderConfig(PretrainedConfig):
|
|||||||
decoder_config = kwargs.pop("decoder")
|
decoder_config = kwargs.pop("decoder")
|
||||||
decoder_model_type = decoder_config.pop("model_type")
|
decoder_model_type = decoder_config.pop("model_type")
|
||||||
|
|
||||||
from ..auto.configuration_auto import AutoConfig
|
|
||||||
|
|
||||||
self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config)
|
self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config)
|
||||||
self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config)
|
self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config)
|
||||||
self.is_encoder_decoder = True
|
self.is_encoder_decoder = True
|
||||||
|
|||||||
@@ -164,6 +164,7 @@ class FastSpeech2ConformerConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "fastspeech2_conformer"
|
model_type = "fastspeech2_conformer"
|
||||||
|
base_config_key = "model_config"
|
||||||
attribute_map = {"num_hidden_layers": "encoder_layers", "num_attention_heads": "encoder_num_attention_heads"}
|
attribute_map = {"num_hidden_layers": "encoder_layers", "num_attention_heads": "encoder_num_attention_heads"}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -377,6 +378,7 @@ class FastSpeech2ConformerHifiGanConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "hifigan"
|
model_type = "hifigan"
|
||||||
|
base_config_key = "vocoder_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -453,7 +455,7 @@ class FastSpeech2ConformerWithHifiGanConfig(PretrainedConfig):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
model_type = "fastspeech2_conformer_with_hifigan"
|
model_type = "fastspeech2_conformer_with_hifigan"
|
||||||
is_composition = True
|
sub_configs = {"model_config": FastSpeech2ConformerConfig, "vocoder_config": FastSpeech2ConformerHifiGanConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -14,8 +14,7 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""FLAVA model configurations"""
|
"""FLAVA model configurations"""
|
||||||
|
|
||||||
import os
|
from typing import Any, Dict
|
||||||
from typing import Any, Dict, Union
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
@@ -86,6 +85,7 @@ class FlavaImageConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "flava_image_model"
|
model_type = "flava_image_model"
|
||||||
|
base_config_key = "image_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -124,24 +124,6 @@ class FlavaImageConfig(PretrainedConfig):
|
|||||||
self.mask_token = mask_token
|
self.mask_token = mask_token
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the image config dict if we are loading from FlavaConfig
|
|
||||||
if config_dict.get("model_type") == "flava":
|
|
||||||
config_dict = config_dict["image_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class FlavaTextConfig(PretrainedConfig):
|
class FlavaTextConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -216,6 +198,7 @@ class FlavaTextConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "flava_text_model"
|
model_type = "flava_text_model"
|
||||||
|
base_config_key = "text_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -254,24 +237,6 @@ class FlavaTextConfig(PretrainedConfig):
|
|||||||
self.qkv_bias = qkv_bias
|
self.qkv_bias = qkv_bias
|
||||||
self.pad_token_id = pad_token_id
|
self.pad_token_id = pad_token_id
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the text config dict if we are loading from FlavaConfig
|
|
||||||
if config_dict.get("model_type") == "flava":
|
|
||||||
config_dict = config_dict["text_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class FlavaMultimodalConfig(PretrainedConfig):
|
class FlavaMultimodalConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -327,6 +292,7 @@ class FlavaMultimodalConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "flava_multimodal_model"
|
model_type = "flava_multimodal_model"
|
||||||
|
base_config_key = "multimodal_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -357,27 +323,10 @@ class FlavaMultimodalConfig(PretrainedConfig):
|
|||||||
self.qkv_bias = qkv_bias
|
self.qkv_bias = qkv_bias
|
||||||
self.use_cls_token = use_cls_token
|
self.use_cls_token = use_cls_token
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the multimodal config dict if we are loading from FlavaConfig
|
|
||||||
if config_dict.get("model_type") == "flava":
|
|
||||||
config_dict = config_dict["multimodal_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class FlavaImageCodebookConfig(PretrainedConfig):
|
class FlavaImageCodebookConfig(PretrainedConfig):
|
||||||
model_type = "flava_image_codebook"
|
model_type = "flava_image_codebook"
|
||||||
|
base_config_key = "image_codebook_config"
|
||||||
|
|
||||||
r"""
|
r"""
|
||||||
[`FlavaImageCodebookConfig`] is the configuration class to store the configuration of a [`FlavaImageCodebook`]. It
|
[`FlavaImageCodebookConfig`] is the configuration class to store the configuration of a [`FlavaImageCodebook`]. It
|
||||||
@@ -442,24 +391,6 @@ class FlavaImageCodebookConfig(PretrainedConfig):
|
|||||||
self.freeze = freeze
|
self.freeze = freeze
|
||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the image codebook config dict if we are loading from FlavaConfig
|
|
||||||
if config_dict.get("model_type") == "flava":
|
|
||||||
config_dict = config_dict["image_codebook_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class FlavaConfig(PretrainedConfig):
|
class FlavaConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -532,6 +463,12 @@ class FlavaConfig(PretrainedConfig):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
model_type = "flava"
|
model_type = "flava"
|
||||||
|
sub_configs = {
|
||||||
|
"text_config": FlavaTextConfig,
|
||||||
|
"image_config": FlavaImageConfig,
|
||||||
|
"multimodal_config": FlavaMultimodalConfig,
|
||||||
|
"image_codebook_config": FlavaImageCodebookConfig,
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -13,8 +13,6 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import os
|
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
@@ -72,6 +70,7 @@ class GitVisionConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "git_vision_model"
|
model_type = "git_vision_model"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -102,24 +101,6 @@ class GitVisionConfig(PretrainedConfig):
|
|||||||
self.layer_norm_eps = layer_norm_eps
|
self.layer_norm_eps = layer_norm_eps
|
||||||
self.hidden_act = hidden_act
|
self.hidden_act = hidden_act
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the vision config dict if we are loading from GITConfig
|
|
||||||
if config_dict.get("model_type") == "git":
|
|
||||||
config_dict = config_dict["vision_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class GitConfig(PretrainedConfig):
|
class GitConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -186,6 +167,7 @@ class GitConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "git"
|
model_type = "git"
|
||||||
|
sub_configs = {"vision_config": GitVisionConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -14,9 +14,8 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""GroupViT model configuration"""
|
"""GroupViT model configuration"""
|
||||||
|
|
||||||
import os
|
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
|
from typing import TYPE_CHECKING, Any, Mapping, Optional
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...onnx import OnnxConfig
|
from ...onnx import OnnxConfig
|
||||||
@@ -86,6 +85,7 @@ class GroupViTTextConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "groupvit_text_model"
|
model_type = "groupvit_text_model"
|
||||||
|
base_config_key = "text_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -121,24 +121,6 @@ class GroupViTTextConfig(PretrainedConfig):
|
|||||||
self.initializer_factor = initializer_factor
|
self.initializer_factor = initializer_factor
|
||||||
self.attention_dropout = attention_dropout
|
self.attention_dropout = attention_dropout
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the text config dict if we are loading from GroupViTConfig
|
|
||||||
if config_dict.get("model_type") == "groupvit":
|
|
||||||
config_dict = config_dict["text_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class GroupViTVisionConfig(PretrainedConfig):
|
class GroupViTVisionConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -197,6 +179,7 @@ class GroupViTVisionConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "groupvit_vision_model"
|
model_type = "groupvit_vision_model"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -246,24 +229,6 @@ class GroupViTVisionConfig(PretrainedConfig):
|
|||||||
self.assign_eps = assign_eps
|
self.assign_eps = assign_eps
|
||||||
self.assign_mlp_ratio = assign_mlp_ratio
|
self.assign_mlp_ratio = assign_mlp_ratio
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the vision config dict if we are loading from GroupViTConfig
|
|
||||||
if config_dict.get("model_type") == "groupvit":
|
|
||||||
config_dict = config_dict["vision_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class GroupViTConfig(PretrainedConfig):
|
class GroupViTConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -292,6 +257,7 @@ class GroupViTConfig(PretrainedConfig):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
model_type = "groupvit"
|
model_type = "groupvit"
|
||||||
|
sub_configs = {"text_config": GroupViTTextConfig, "vision_config": GroupViTVisionConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ class IdeficsVisionConfig(PretrainedConfig):
|
|||||||
documentation from [`PretrainedConfig`] for more information.
|
documentation from [`PretrainedConfig`] for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
hidden_size (`int`, *optional*, defaults to 768):
|
embed_dim (`int`, *optional*, defaults to 768):
|
||||||
Dimensionality of the encoder layers and the pooler layer. (elsewhere referred to as `hidden_size`)
|
Dimensionality of the encoder layers and the pooler layer. (elsewhere referred to as `hidden_size`)
|
||||||
image_size (`int`, *optional*, defaults to 224):
|
image_size (`int`, *optional*, defaults to 224):
|
||||||
The size (resolution) of each image.
|
The size (resolution) of each image.
|
||||||
@@ -50,12 +50,12 @@ class IdeficsVisionConfig(PretrainedConfig):
|
|||||||
Number of hidden layers in the Transformer encoder.
|
Number of hidden layers in the Transformer encoder.
|
||||||
num_attention_heads (`int`, *optional*, defaults to 16):
|
num_attention_heads (`int`, *optional*, defaults to 16):
|
||||||
Number of attention heads for each attention layer in the Transformer encoder.
|
Number of attention heads for each attention layer in the Transformer encoder.
|
||||||
image_num_channels (`int`, *optional*, defaults to `3`):
|
num_channels (`int`, *optional*, defaults to 3):
|
||||||
Number of image channels.
|
Number of image channels.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
|
||||||
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||||
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
|
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
|
||||||
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
|
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||||
The epsilon used by the layer normalization layers.
|
The epsilon used by the layer normalization layers.
|
||||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
@@ -64,11 +64,9 @@ class IdeficsVisionConfig(PretrainedConfig):
|
|||||||
initializer_factor (`float`, *optional*, defaults to 1.0):
|
initializer_factor (`float`, *optional*, defaults to 1.0):
|
||||||
A factor for initializing all weight matrices (should be kept to 1.0, used internally for initialization
|
A factor for initializing all weight matrices (should be kept to 1.0, used internally for initialization
|
||||||
testing).
|
testing).
|
||||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
|
||||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
model_type = "idefics"
|
model_type = "idefics_vision"
|
||||||
attribute_map = {
|
attribute_map = {
|
||||||
"hidden_size": "embed_dim",
|
"hidden_size": "embed_dim",
|
||||||
}
|
}
|
||||||
@@ -119,7 +117,7 @@ class IdeficsPerceiverConfig(PretrainedConfig):
|
|||||||
Args:
|
Args:
|
||||||
use_resampler (`bool`, *optional*, defaults to `False`):
|
use_resampler (`bool`, *optional*, defaults to `False`):
|
||||||
Whether or not to use the resampler
|
Whether or not to use the resampler
|
||||||
resampler_n_latents (`int`, *optional*, defaults to ):
|
resampler_n_latents (`int`, *optional*, defaults to 64):
|
||||||
Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
|
Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
|
||||||
resampler_depth (`int`, *optional*, defaults to 6):
|
resampler_depth (`int`, *optional*, defaults to 6):
|
||||||
Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
|
Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
|
||||||
@@ -131,7 +129,7 @@ class IdeficsPerceiverConfig(PretrainedConfig):
|
|||||||
Whether or not to use qk layer norms in perceiver
|
Whether or not to use qk layer norms in perceiver
|
||||||
"""
|
"""
|
||||||
|
|
||||||
model_type = "idefics"
|
model_type = "idefics_perciever"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -235,7 +233,7 @@ class IdeficsConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "idefics"
|
model_type = "idefics"
|
||||||
is_composition = False
|
sub_configs = {"perceiver_config": IdeficsPerceiverConfig, "vision_config": IdeficsVisionConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -13,12 +13,9 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""Idefics2 model configuration"""
|
"""Idefics2 model configuration"""
|
||||||
|
|
||||||
import os
|
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
from ..auto import CONFIG_MAPPING
|
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
@@ -76,7 +73,8 @@ class Idefics2VisionConfig(PretrainedConfig):
|
|||||||
>>> configuration = model.config
|
>>> configuration = model.config
|
||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "idefics2"
|
model_type = "idefics2_vision"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -107,24 +105,6 @@ class Idefics2VisionConfig(PretrainedConfig):
|
|||||||
self.hidden_act = hidden_act
|
self.hidden_act = hidden_act
|
||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the vision config dict if we are loading from Idefics2Config
|
|
||||||
if config_dict.get("model_type") == "idefics2":
|
|
||||||
config_dict = config_dict["vision_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class Idefics2PerceiverConfig(PretrainedConfig):
|
class Idefics2PerceiverConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -152,7 +132,7 @@ class Idefics2PerceiverConfig(PretrainedConfig):
|
|||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
model_type = "idefics2"
|
model_type = "idefics2_perceiver"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -220,7 +200,11 @@ class Idefics2Config(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "idefics2"
|
model_type = "idefics2"
|
||||||
is_composition = True
|
sub_configs = {
|
||||||
|
"text_config": AutoConfig,
|
||||||
|
"perceiver_config": Idefics2PerceiverConfig,
|
||||||
|
"vision_config": Idefics2VisionConfig,
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -13,12 +13,9 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""Idefics3 model configuration"""
|
"""Idefics3 model configuration"""
|
||||||
|
|
||||||
import os
|
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
from ..auto import CONFIG_MAPPING
|
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
@@ -57,8 +54,7 @@ class Idefics3VisionConfig(PretrainedConfig):
|
|||||||
The epsilon used by the layer normalization layers.
|
The epsilon used by the layer normalization layers.
|
||||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
intializer_range (`float`, *optional*, defaults to 0.02):
|
initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation for initializing all weight matrices in the model.
|
||||||
The standard deviation for initializing all weight matrices in the model.
|
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
@@ -76,7 +72,8 @@ class Idefics3VisionConfig(PretrainedConfig):
|
|||||||
>>> configuration = model.config
|
>>> configuration = model.config
|
||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "idefics3"
|
model_type = "idefics3_vision"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -107,24 +104,6 @@ class Idefics3VisionConfig(PretrainedConfig):
|
|||||||
self.hidden_act = hidden_act
|
self.hidden_act = hidden_act
|
||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the vision config dict if we are loading from Idefics3Config
|
|
||||||
if config_dict.get("model_type") == "idefics3":
|
|
||||||
config_dict = config_dict["vision_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class Idefics3Config(PretrainedConfig):
|
class Idefics3Config(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -165,7 +144,7 @@ class Idefics3Config(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "idefics3"
|
model_type = "idefics3"
|
||||||
is_composition = True
|
sub_configs = {"text_config": AutoConfig, "vision_config": Idefics3VisionConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -204,4 +183,4 @@ class Idefics3Config(PretrainedConfig):
|
|||||||
self.text_config = text_config
|
self.text_config = text_config
|
||||||
self.scale_factor = scale_factor
|
self.scale_factor = scale_factor
|
||||||
|
|
||||||
super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
|
super().__init__(**kwargs, pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings)
|
||||||
|
|||||||
@@ -14,13 +14,10 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""InstructBLIP model configuration"""
|
"""InstructBLIP model configuration"""
|
||||||
|
|
||||||
import os
|
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
|
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
from ..auto import CONFIG_MAPPING
|
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
@@ -78,6 +75,7 @@ class InstructBlipVisionConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "instructblip_vision_model"
|
model_type = "instructblip_vision_model"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -108,24 +106,6 @@ class InstructBlipVisionConfig(PretrainedConfig):
|
|||||||
self.hidden_act = hidden_act
|
self.hidden_act = hidden_act
|
||||||
self.qkv_bias = qkv_bias
|
self.qkv_bias = qkv_bias
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the vision config dict if we are loading from InstructBlipConfig
|
|
||||||
if config_dict.get("model_type") == "instructblip":
|
|
||||||
config_dict = config_dict["vision_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class InstructBlipQFormerConfig(PretrainedConfig):
|
class InstructBlipQFormerConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -192,6 +172,7 @@ class InstructBlipQFormerConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "instructblip_qformer"
|
model_type = "instructblip_qformer"
|
||||||
|
base_config_key = "qformer_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -229,24 +210,6 @@ class InstructBlipQFormerConfig(PretrainedConfig):
|
|||||||
self.cross_attention_frequency = cross_attention_frequency
|
self.cross_attention_frequency = cross_attention_frequency
|
||||||
self.encoder_hidden_size = encoder_hidden_size
|
self.encoder_hidden_size = encoder_hidden_size
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the qformer config dict if we are loading from InstructBlipConfig
|
|
||||||
if config_dict.get("model_type") == "instructblip":
|
|
||||||
config_dict = config_dict["qformer_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class InstructBlipConfig(PretrainedConfig):
|
class InstructBlipConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -305,6 +268,11 @@ class InstructBlipConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "instructblip"
|
model_type = "instructblip"
|
||||||
|
sub_configs = {
|
||||||
|
"text_config": AutoConfig,
|
||||||
|
"qformer_config": InstructBlipQFormerConfig,
|
||||||
|
"vision_config": InstructBlipVisionConfig,
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -19,13 +19,11 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import os
|
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
|
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
from ..auto import CONFIG_MAPPING
|
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
@@ -83,6 +81,7 @@ class InstructBlipVideoVisionConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "instructblipvideo_vision_model"
|
model_type = "instructblipvideo_vision_model"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -113,24 +112,6 @@ class InstructBlipVideoVisionConfig(PretrainedConfig):
|
|||||||
self.hidden_act = hidden_act
|
self.hidden_act = hidden_act
|
||||||
self.qkv_bias = qkv_bias
|
self.qkv_bias = qkv_bias
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the vision config dict if we are loading from InstructBlipVideoConfig
|
|
||||||
if config_dict.get("model_type") == "instructblipvideo":
|
|
||||||
config_dict = config_dict["vision_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class InstructBlipVideoQFormerConfig(PretrainedConfig):
|
class InstructBlipVideoQFormerConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -197,6 +178,7 @@ class InstructBlipVideoQFormerConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "instructblipvideo_qformer"
|
model_type = "instructblipvideo_qformer"
|
||||||
|
base_config_key = "qformer_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -234,24 +216,6 @@ class InstructBlipVideoQFormerConfig(PretrainedConfig):
|
|||||||
self.cross_attention_frequency = cross_attention_frequency
|
self.cross_attention_frequency = cross_attention_frequency
|
||||||
self.encoder_hidden_size = encoder_hidden_size
|
self.encoder_hidden_size = encoder_hidden_size
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the qformer config dict if we are loading from InstructBlipVideoConfig
|
|
||||||
if config_dict.get("model_type") == "instructblipvideo":
|
|
||||||
config_dict = config_dict["qformer_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class InstructBlipVideoConfig(PretrainedConfig):
|
class InstructBlipVideoConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -310,6 +274,11 @@ class InstructBlipVideoConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "instructblipvideo"
|
model_type = "instructblipvideo"
|
||||||
|
sub_configs = {
|
||||||
|
"text_config": AutoConfig,
|
||||||
|
"qformer_config": InstructBlipVideoQFormerConfig,
|
||||||
|
"vision_config": InstructBlipVideoVisionConfig,
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ from transformers.models.instructblip.modeling_instructblip import (
|
|||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
|
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
from ..auto import CONFIG_MAPPING
|
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
@@ -103,6 +103,11 @@ class InstructBlipVideoConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "instructblipvideo"
|
model_type = "instructblipvideo"
|
||||||
|
sub_configs = {
|
||||||
|
"text_config": AutoConfig,
|
||||||
|
"qformer_config": InstructBlipVideoQFormerConfig,
|
||||||
|
"vision_config": InstructBlipVideoVisionConfig,
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -14,9 +14,6 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""KOSMOS-2 model configuration"""
|
"""KOSMOS-2 model configuration"""
|
||||||
|
|
||||||
import os
|
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
|
|
||||||
@@ -61,7 +58,7 @@ class Kosmos2TextConfig(PretrainedConfig):
|
|||||||
layerdrop (`float`, *optional*, defaults to 0.0):
|
layerdrop (`float`, *optional*, defaults to 0.0):
|
||||||
The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
|
The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
|
||||||
for more details.
|
for more details.
|
||||||
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
|
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||||
The epsilon used by the layer normalization layers.
|
The epsilon used by the layer normalization layers.
|
||||||
init_std (`float`, *optional*, defaults to 0.02):
|
init_std (`float`, *optional*, defaults to 0.02):
|
||||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||||
@@ -69,9 +66,16 @@ class Kosmos2TextConfig(PretrainedConfig):
|
|||||||
Scale embeddings by diving by sqrt(embed_dim).
|
Scale embeddings by diving by sqrt(embed_dim).
|
||||||
use_cache (`bool`, *optional*, defaults to `True`):
|
use_cache (`bool`, *optional*, defaults to `True`):
|
||||||
Whether or not the model should return the last key/values attentions (not used by all models).
|
Whether or not the model should return the last key/values attentions (not used by all models).
|
||||||
|
pad_token_id (`int`, *optional*, defaults to 1):
|
||||||
|
Token id used for padding.
|
||||||
|
bos_token_id (`int`, *optional*, defaults to 0):
|
||||||
|
Token id used for beginning of string.
|
||||||
|
eos_token_id (`int`, *optional*, defaults to 2):
|
||||||
|
Token id used for end of string.
|
||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "kosmos_2_text_model"
|
model_type = "kosmos_2_text_model"
|
||||||
|
base_config_key = "text_config"
|
||||||
keys_to_ignore_at_inference = ["past_key_values"]
|
keys_to_ignore_at_inference = ["past_key_values"]
|
||||||
attribute_map = {
|
attribute_map = {
|
||||||
"num_attention_heads": "attention_heads",
|
"num_attention_heads": "attention_heads",
|
||||||
@@ -124,24 +128,6 @@ class Kosmos2TextConfig(PretrainedConfig):
|
|||||||
self.scale_embedding = scale_embedding
|
self.scale_embedding = scale_embedding
|
||||||
self.use_cache = use_cache
|
self.use_cache = use_cache
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the text config dict if we are loading from Kosmos2Config
|
|
||||||
if config_dict.get("model_type") == "kosmos-2":
|
|
||||||
config_dict = config_dict["text_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class Kosmos2VisionConfig(PretrainedConfig):
|
class Kosmos2VisionConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -171,18 +157,19 @@ class Kosmos2VisionConfig(PretrainedConfig):
|
|||||||
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
|
||||||
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||||
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
|
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
|
||||||
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
|
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||||
The epsilon used by the layer normalization layers.
|
The epsilon used by the layer normalization layers.
|
||||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||||
initializer_factor (`float`, *optional*, defaults to 1):
|
initializer_factor (`float`, *optional*, defaults to 1.0):
|
||||||
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
|
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
|
||||||
testing).
|
testing).
|
||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "kosmos_2_vision_model"
|
model_type = "kosmos_2_vision_model"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -215,24 +202,6 @@ class Kosmos2VisionConfig(PretrainedConfig):
|
|||||||
self.layer_norm_eps = layer_norm_eps
|
self.layer_norm_eps = layer_norm_eps
|
||||||
self.hidden_act = hidden_act
|
self.hidden_act = hidden_act
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the vision config dict if we are loading from Kosmos2Config
|
|
||||||
if config_dict.get("model_type") == "kosmos-2":
|
|
||||||
config_dict = config_dict["vision_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class Kosmos2Config(PretrainedConfig):
|
class Kosmos2Config(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -267,7 +236,7 @@ class Kosmos2Config(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "kosmos-2"
|
model_type = "kosmos-2"
|
||||||
is_composition = True
|
sub_configs = {"text_config": Kosmos2TextConfig, "vision_config": Kosmos2VisionConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -15,7 +15,7 @@
|
|||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
from ..auto import CONFIG_MAPPING
|
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
@@ -73,7 +73,7 @@ class LlavaConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "llava"
|
model_type = "llava"
|
||||||
is_composition = True
|
sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -15,7 +15,7 @@
|
|||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
from ..auto import CONFIG_MAPPING
|
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
@@ -78,7 +78,7 @@ class LlavaNextConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "llava_next"
|
model_type = "llava_next"
|
||||||
is_composition = False
|
sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -21,7 +21,7 @@
|
|||||||
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ..auto import CONFIG_MAPPING
|
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||||
|
|
||||||
|
|
||||||
class LlavaNextVideoConfig(PretrainedConfig):
|
class LlavaNextVideoConfig(PretrainedConfig):
|
||||||
@@ -86,7 +86,7 @@ class LlavaNextVideoConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "llava_next_video"
|
model_type = "llava_next_video"
|
||||||
is_composition = True
|
sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ from ...configuration_utils import PretrainedConfig
|
|||||||
from ...utils import (
|
from ...utils import (
|
||||||
logging,
|
logging,
|
||||||
)
|
)
|
||||||
from ..auto import CONFIG_MAPPING
|
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
@@ -99,7 +99,7 @@ class LlavaNextVideoConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "llava_next_video"
|
model_type = "llava_next_video"
|
||||||
is_composition = True
|
sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ from ...configuration_utils import PretrainedConfig
|
|||||||
from ...utils import (
|
from ...utils import (
|
||||||
logging,
|
logging,
|
||||||
)
|
)
|
||||||
from ..auto import CONFIG_MAPPING
|
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
@@ -81,7 +81,7 @@ class LlavaOnevisionConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "llava_onevision"
|
model_type = "llava_onevision"
|
||||||
is_composition = False
|
sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -13,8 +13,7 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""Mllama model configuration"""
|
"""Mllama model configuration"""
|
||||||
|
|
||||||
import os
|
from typing import Dict, List, Optional
|
||||||
from typing import Dict, List, Optional, Union
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...modeling_rope_utils import rope_config_validation
|
from ...modeling_rope_utils import rope_config_validation
|
||||||
@@ -59,7 +58,7 @@ class MllamaVisionConfig(PretrainedConfig):
|
|||||||
The size (resolution) of each image *tile*.
|
The size (resolution) of each image *tile*.
|
||||||
patch_size (`int`, *optional*, defaults to 14):
|
patch_size (`int`, *optional*, defaults to 14):
|
||||||
The size (resolution) of each patch.
|
The size (resolution) of each patch.
|
||||||
norm_eps (`float`, *optional*, defaults to 1e-5):
|
norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||||
The epsilon used by the layer normalization layers.
|
The epsilon used by the layer normalization layers.
|
||||||
max_num_tiles (`int`, *optional*, defaults to 4):
|
max_num_tiles (`int`, *optional*, defaults to 4):
|
||||||
Maximum number of tiles for image splitting.
|
Maximum number of tiles for image splitting.
|
||||||
@@ -88,6 +87,7 @@ class MllamaVisionConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "mllama_vision_model"
|
model_type = "mllama_vision_model"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -137,23 +137,6 @@ class MllamaVisionConfig(PretrainedConfig):
|
|||||||
def max_aspect_ratio_id(self) -> int:
|
def max_aspect_ratio_id(self) -> int:
|
||||||
return len(self.supported_aspect_ratios)
|
return len(self.supported_aspect_ratios)
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
if config_dict.get("model_type") == "mllama":
|
|
||||||
config_dict = config_dict["vision_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class MllamaTextConfig(PretrainedConfig):
|
class MllamaTextConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -178,12 +161,12 @@ class MllamaTextConfig(PretrainedConfig):
|
|||||||
Number of hidden layers in the Transformer encoder.
|
Number of hidden layers in the Transformer encoder.
|
||||||
num_attention_heads (`int`, *optional*, defaults to 32):
|
num_attention_heads (`int`, *optional*, defaults to 32):
|
||||||
Number of attention heads for each attention layer in the Transformer encoder.
|
Number of attention heads for each attention layer in the Transformer encoder.
|
||||||
num_key_value_heads (`int`, *optional*):
|
num_key_value_heads (`int`, *optional*, defaults to 8):
|
||||||
This is the number of key_value heads that should be used to implement Grouped Query Attention. If not
|
This is the number of key_value heads that should be used to implement Grouped Query Attention. If not
|
||||||
specified, will default to `num_attention_heads`.
|
specified, will default to `num_attention_heads`.
|
||||||
intermediate_size (`int`, *optional*, defaults to 14336):
|
intermediate_size (`int`, *optional*, defaults to 14336):
|
||||||
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
|
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
|
||||||
rope_theta (`float`, *optional*, defaults to 500000.0):
|
rope_theta (`float`, *optional*, defaults to `500000.0`):
|
||||||
The base period of the RoPE embeddings.
|
The base period of the RoPE embeddings.
|
||||||
rope_scaling (`Dict`, *optional*):
|
rope_scaling (`Dict`, *optional*):
|
||||||
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
|
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
|
||||||
@@ -259,6 +242,7 @@ class MllamaTextConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "mllama_text_model"
|
model_type = "mllama_text_model"
|
||||||
|
base_config_key = "text_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -311,23 +295,6 @@ class MllamaTextConfig(PretrainedConfig):
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
if config_dict.get("model_type") == "mllama":
|
|
||||||
config_dict = config_dict["text_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class MllamaConfig(PretrainedConfig):
|
class MllamaConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -370,7 +337,7 @@ class MllamaConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "mllama"
|
model_type = "mllama"
|
||||||
is_composition = True
|
sub_configs = {"text_config": MllamaTextConfig, "vision_config": MllamaVisionConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -235,8 +235,8 @@ class MoshiConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "moshi"
|
model_type = "moshi"
|
||||||
is_composition = True
|
|
||||||
keys_to_ignore_at_inference = ["past_key_values"]
|
keys_to_ignore_at_inference = ["past_key_values"]
|
||||||
|
sub_configs = {"audio_encoder_config": AutoConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -41,22 +41,22 @@ class MptAttentionConfig(PretrainedConfig):
|
|||||||
Args:
|
Args:
|
||||||
attn_type (`str`, *optional*, defaults to `"multihead_attention"`):
|
attn_type (`str`, *optional*, defaults to `"multihead_attention"`):
|
||||||
type of attention to use. Options: `"multihead_attention"`, `"multiquery_attention"`.
|
type of attention to use. Options: `"multihead_attention"`, `"multiquery_attention"`.
|
||||||
attn_pdrop (`float`, *optional*, defaults to 0.0):
|
attn_pdrop (`float`, *optional*, defaults to `0.0`):
|
||||||
The dropout probability for the attention layers.
|
The dropout probability for the attention layers.
|
||||||
attn_impl (`str`, *optional*, defaults to `"torch"`):
|
attn_impl (`str`, *optional*, defaults to `"torch"`):
|
||||||
The attention implementation to use. One of `"torch"`, `"flash"`, or `"triton"`.
|
The attention implementation to use. One of `"torch"`, `"flash"`, or `"triton"`.
|
||||||
clip_qkv (`float`, *optional*):
|
clip_qkv (`float`, *optional*):
|
||||||
If not `None`, clip the queries, keys, and values in the attention layer to this value.
|
If not `None`, clip the queries, keys, and values in the attention layer to this value.
|
||||||
softmax_scale (`float`, *optional*, defaults to `None`):
|
softmax_scale (`float`, *optional*):
|
||||||
If not `None`, scale the softmax in the attention layer by this value. If `None`, will default to
|
If not `None`, scale the softmax in the attention layer by this value. If `None`, will default to
|
||||||
`1/sqrt(hidden_size)`.
|
`1/sqrt(hidden_size)`.
|
||||||
prefix_lm (`bool`, *optional*, defaults to `False`)):
|
prefix_lm (`bool`, *optional*, defaults to `False`):
|
||||||
Whether the model should operate as a Prefix LM. This requires passing an extra `prefix_mask` argument
|
Whether the model should operate as a Prefix LM. This requires passing an extra `prefix_mask` argument
|
||||||
which indicates which tokens belong to the prefix. Tokens in the prefix can attend to one another
|
which indicates which tokens belong to the prefix. Tokens in the prefix can attend to one another
|
||||||
bi-directionally. Tokens outside the prefix use causal attention.
|
bi-directionally. Tokens outside the prefix use causal attention.
|
||||||
qk_ln (`bool`, *optional*, defaults to `False`):
|
qk_ln (`bool`, *optional*, defaults to `False`):
|
||||||
Whether to apply layer normalization to the queries and keys in the attention layer.
|
Whether to apply layer normalization to the queries and keys in the attention layer.
|
||||||
attn_uses_sequence_id (`bool`, *optional*, defaults to `False`)):
|
attn_uses_sequence_id (`bool`, *optional*, defaults to `False`):
|
||||||
Whether to restrict attention to tokens that have the same token_type_ids. When the model is in `train`
|
Whether to restrict attention to tokens that have the same token_type_ids. When the model is in `train`
|
||||||
mode, this requires passing an extra *token_type_ids* argument which indicates which sub-sequence each
|
mode, this requires passing an extra *token_type_ids* argument which indicates which sub-sequence each
|
||||||
token belongs to. Defaults to `False` meaning any provided *token_type_ids* will be ignored.
|
token belongs to. Defaults to `False` meaning any provided *token_type_ids* will be ignored.
|
||||||
@@ -66,6 +66,8 @@ class MptAttentionConfig(PretrainedConfig):
|
|||||||
The maximum value of the alibi bias.
|
The maximum value of the alibi bias.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
base_config_key = "attn_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
attn_type="multihead_attention",
|
attn_type="multihead_attention",
|
||||||
@@ -97,23 +99,6 @@ class MptAttentionConfig(PretrainedConfig):
|
|||||||
f"`attn_type` has to be either `multihead_attention` or `multiquery_attention`. Received: {attn_type}"
|
f"`attn_type` has to be either `multihead_attention` or `multiquery_attention`. Received: {attn_type}"
|
||||||
)
|
)
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
if config_dict.get("model_type") == "mpt":
|
|
||||||
config_dict = config_dict["attn_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class MptConfig(PretrainedConfig):
|
class MptConfig(PretrainedConfig):
|
||||||
"""
|
"""
|
||||||
@@ -188,6 +173,7 @@ class MptConfig(PretrainedConfig):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
model_type = "mpt"
|
model_type = "mpt"
|
||||||
|
sub_configs = {"attn_config": MptAttentionConfig}
|
||||||
attribute_map = {
|
attribute_map = {
|
||||||
"num_attention_heads": "n_heads",
|
"num_attention_heads": "n_heads",
|
||||||
"hidden_size": "d_model",
|
"hidden_size": "d_model",
|
||||||
|
|||||||
@@ -76,6 +76,7 @@ class MusicgenDecoderConfig(PretrainedConfig):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
model_type = "musicgen_decoder"
|
model_type = "musicgen_decoder"
|
||||||
|
base_config_key = "decoder_config"
|
||||||
keys_to_ignore_at_inference = ["past_key_values"]
|
keys_to_ignore_at_inference = ["past_key_values"]
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -189,6 +190,11 @@ class MusicgenConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "musicgen"
|
model_type = "musicgen"
|
||||||
|
sub_configs = {
|
||||||
|
"text_encoder": AutoConfig,
|
||||||
|
"audio_encoder": AutoConfig,
|
||||||
|
"decoder": MusicgenDecoderConfig,
|
||||||
|
}
|
||||||
is_composition = True
|
is_composition = True
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
|
|||||||
@@ -78,6 +78,7 @@ class MusicgenMelodyDecoderConfig(PretrainedConfig):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
model_type = "musicgen_melody_decoder"
|
model_type = "musicgen_melody_decoder"
|
||||||
|
base_config_key = "decoder_config"
|
||||||
keys_to_ignore_at_inference = ["past_key_values"]
|
keys_to_ignore_at_inference = ["past_key_values"]
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -195,6 +196,11 @@ class MusicgenMelodyConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "musicgen_melody"
|
model_type = "musicgen_melody"
|
||||||
|
sub_configs = {
|
||||||
|
"text_encoder": AutoConfig,
|
||||||
|
"audio_encoder": AutoConfig,
|
||||||
|
"decoder": MusicgenMelodyDecoderConfig,
|
||||||
|
}
|
||||||
is_composition = True
|
is_composition = True
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
|||||||
@@ -14,8 +14,7 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""OWLv2 model configuration"""
|
"""OWLv2 model configuration"""
|
||||||
|
|
||||||
import os
|
from typing import TYPE_CHECKING, Dict
|
||||||
from typing import TYPE_CHECKING, Dict, Union
|
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@@ -90,6 +89,7 @@ class Owlv2TextConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "owlv2_text_model"
|
model_type = "owlv2_text_model"
|
||||||
|
base_config_key = "text_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -123,24 +123,6 @@ class Owlv2TextConfig(PretrainedConfig):
|
|||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
self.initializer_factor = initializer_factor
|
self.initializer_factor = initializer_factor
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the text config dict if we are loading from Owlv2Config
|
|
||||||
if config_dict.get("model_type") == "owlv2":
|
|
||||||
config_dict = config_dict["text_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.owlvit.configuration_owlvit.OwlViTVisionConfig with OwlViT->Owlv2, owlvit-base-patch32->owlv2-base-patch16, owlvit->owlv2, OWL-ViT->OWLv2, 32->16
|
# Copied from transformers.models.owlvit.configuration_owlvit.OwlViTVisionConfig with OwlViT->Owlv2, owlvit-base-patch32->owlv2-base-patch16, owlvit->owlv2, OWL-ViT->OWLv2, 32->16
|
||||||
class Owlv2VisionConfig(PretrainedConfig):
|
class Owlv2VisionConfig(PretrainedConfig):
|
||||||
@@ -197,6 +179,7 @@ class Owlv2VisionConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "owlv2_vision_model"
|
model_type = "owlv2_vision_model"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -229,24 +212,6 @@ class Owlv2VisionConfig(PretrainedConfig):
|
|||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
self.initializer_factor = initializer_factor
|
self.initializer_factor = initializer_factor
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the vision config dict if we are loading from Owlv2Config
|
|
||||||
if config_dict.get("model_type") == "owlv2":
|
|
||||||
config_dict = config_dict["vision_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.owlvit.configuration_owlvit.OwlViTConfig with OwlViT->Owlv2, owlvit-base-patch32->owlv2-base-patch16, owlvit->owlv2, OWL-ViT->OWLv2
|
# Copied from transformers.models.owlvit.configuration_owlvit.OwlViTConfig with OwlViT->Owlv2, owlvit-base-patch32->owlv2-base-patch16, owlvit->owlv2, OWL-ViT->OWLv2
|
||||||
class Owlv2Config(PretrainedConfig):
|
class Owlv2Config(PretrainedConfig):
|
||||||
@@ -276,6 +241,7 @@ class Owlv2Config(PretrainedConfig):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
model_type = "owlv2"
|
model_type = "owlv2"
|
||||||
|
sub_configs = {"text_config": Owlv2TextConfig, "vision_config": Owlv2VisionConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -304,20 +270,6 @@ class Owlv2Config(PretrainedConfig):
|
|||||||
self.return_dict = return_dict
|
self.return_dict = return_dict
|
||||||
self.initializer_factor = 1.0
|
self.initializer_factor = 1.0
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_text_vision_configs(cls, text_config: Dict, vision_config: Dict, **kwargs):
|
def from_text_vision_configs(cls, text_config: Dict, vision_config: Dict, **kwargs):
|
||||||
r"""
|
r"""
|
||||||
|
|||||||
@@ -14,9 +14,8 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""OWL-ViT model configuration"""
|
"""OWL-ViT model configuration"""
|
||||||
|
|
||||||
import os
|
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional, Union
|
from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@@ -92,6 +91,7 @@ class OwlViTTextConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "owlvit_text_model"
|
model_type = "owlvit_text_model"
|
||||||
|
base_config_key = "text_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -125,24 +125,6 @@ class OwlViTTextConfig(PretrainedConfig):
|
|||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
self.initializer_factor = initializer_factor
|
self.initializer_factor = initializer_factor
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the text config dict if we are loading from OwlViTConfig
|
|
||||||
if config_dict.get("model_type") == "owlvit":
|
|
||||||
config_dict = config_dict["text_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class OwlViTVisionConfig(PretrainedConfig):
|
class OwlViTVisionConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -198,6 +180,7 @@ class OwlViTVisionConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "owlvit_vision_model"
|
model_type = "owlvit_vision_model"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -230,24 +213,6 @@ class OwlViTVisionConfig(PretrainedConfig):
|
|||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
self.initializer_factor = initializer_factor
|
self.initializer_factor = initializer_factor
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the vision config dict if we are loading from OwlViTConfig
|
|
||||||
if config_dict.get("model_type") == "owlvit":
|
|
||||||
config_dict = config_dict["vision_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class OwlViTConfig(PretrainedConfig):
|
class OwlViTConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -276,6 +241,7 @@ class OwlViTConfig(PretrainedConfig):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
model_type = "owlvit"
|
model_type = "owlvit"
|
||||||
|
sub_configs = {"text_config": OwlViTTextConfig, "vision_config": OwlViTVisionConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -304,20 +270,6 @@ class OwlViTConfig(PretrainedConfig):
|
|||||||
self.return_dict = return_dict
|
self.return_dict = return_dict
|
||||||
self.initializer_factor = 1.0
|
self.initializer_factor = 1.0
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_text_vision_configs(cls, text_config: Dict, vision_config: Dict, **kwargs):
|
def from_text_vision_configs(cls, text_config: Dict, vision_config: Dict, **kwargs):
|
||||||
r"""
|
r"""
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ import warnings
|
|||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
from ..auto import CONFIG_MAPPING
|
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
@@ -73,7 +73,7 @@ class PaliGemmaConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "paligemma"
|
model_type = "paligemma"
|
||||||
is_composition = False
|
sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -15,7 +15,7 @@
|
|||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
from ..auto import CONFIG_MAPPING
|
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
@@ -157,7 +157,7 @@ class Qwen2AudioConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "qwen2_audio"
|
model_type = "qwen2_audio"
|
||||||
is_composition = False
|
sub_configs = {"text_config": AutoConfig, "audio_config": AutoConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -14,9 +14,6 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""Qwen2VL model configuration"""
|
"""Qwen2VL model configuration"""
|
||||||
|
|
||||||
import os
|
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...modeling_rope_utils import rope_config_validation
|
from ...modeling_rope_utils import rope_config_validation
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
@@ -27,6 +24,7 @@ logger = logging.get_logger(__name__)
|
|||||||
|
|
||||||
class Qwen2VLVisionConfig(PretrainedConfig):
|
class Qwen2VLVisionConfig(PretrainedConfig):
|
||||||
model_type = "qwen2_vl"
|
model_type = "qwen2_vl"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -55,23 +53,6 @@ class Qwen2VLVisionConfig(PretrainedConfig):
|
|||||||
self.spatial_merge_size = spatial_merge_size
|
self.spatial_merge_size = spatial_merge_size
|
||||||
self.temporal_patch_size = temporal_patch_size
|
self.temporal_patch_size = temporal_patch_size
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
if config_dict.get("model_type") == "qwen2_vl":
|
|
||||||
config_dict = config_dict["vision_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class Qwen2VLConfig(PretrainedConfig):
|
class Qwen2VLConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -180,6 +161,7 @@ class Qwen2VLConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "qwen2_vl"
|
model_type = "qwen2_vl"
|
||||||
|
sub_configs = {"vision_config": Qwen2VLVisionConfig}
|
||||||
keys_to_ignore_at_inference = ["past_key_values"]
|
keys_to_ignore_at_inference = ["past_key_values"]
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
|||||||
@@ -14,9 +14,6 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""Siglip model configuration"""
|
"""Siglip model configuration"""
|
||||||
|
|
||||||
import os
|
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
|
|
||||||
@@ -79,6 +76,7 @@ class SiglipTextConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "siglip_text_model"
|
model_type = "siglip_text_model"
|
||||||
|
base_config_key = "text_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -110,24 +108,6 @@ class SiglipTextConfig(PretrainedConfig):
|
|||||||
self.hidden_act = hidden_act
|
self.hidden_act = hidden_act
|
||||||
self.attention_dropout = attention_dropout
|
self.attention_dropout = attention_dropout
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the text config dict if we are loading from SiglipConfig
|
|
||||||
if config_dict.get("model_type") == "siglip":
|
|
||||||
config_dict = config_dict["text_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class SiglipVisionConfig(PretrainedConfig):
|
class SiglipVisionConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -178,6 +158,7 @@ class SiglipVisionConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "siglip_vision_model"
|
model_type = "siglip_vision_model"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -206,24 +187,6 @@ class SiglipVisionConfig(PretrainedConfig):
|
|||||||
self.layer_norm_eps = layer_norm_eps
|
self.layer_norm_eps = layer_norm_eps
|
||||||
self.hidden_act = hidden_act
|
self.hidden_act = hidden_act
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the vision config dict if we are loading from SiglipConfig
|
|
||||||
if config_dict.get("model_type") == "siglip":
|
|
||||||
config_dict = config_dict["vision_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class SiglipConfig(PretrainedConfig):
|
class SiglipConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -268,6 +231,7 @@ class SiglipConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "siglip"
|
model_type = "siglip"
|
||||||
|
sub_configs = {"text_config": SiglipTextConfig, "vision_config": SiglipVisionConfig}
|
||||||
|
|
||||||
def __init__(self, text_config=None, vision_config=None, **kwargs):
|
def __init__(self, text_config=None, vision_config=None, **kwargs):
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|||||||
@@ -71,6 +71,7 @@ class SpeechEncoderDecoderConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "speech-encoder-decoder"
|
model_type = "speech-encoder-decoder"
|
||||||
|
sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig}
|
||||||
is_composition = True
|
is_composition = True
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
|
|||||||
@@ -15,7 +15,7 @@
|
|||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
from ..auto import CONFIG_MAPPING
|
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
@@ -78,7 +78,7 @@ class VideoLlavaConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "video_llava"
|
model_type = "video_llava"
|
||||||
is_composition = False
|
sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -15,7 +15,7 @@
|
|||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
from ..auto import CONFIG_MAPPING
|
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
@@ -72,7 +72,7 @@ class VipLlavaConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "vipllava"
|
model_type = "vipllava"
|
||||||
is_composition = False
|
sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -78,6 +78,7 @@ class VisionEncoderDecoderConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "vision-encoder-decoder"
|
model_type = "vision-encoder-decoder"
|
||||||
|
sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig}
|
||||||
is_composition = True
|
is_composition = True
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
|
|||||||
@@ -75,6 +75,7 @@ class VisionTextDualEncoderConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "vision-text-dual-encoder"
|
model_type = "vision-text-dual-encoder"
|
||||||
|
sub_configs = {"vision_config": AutoConfig, "text_config": AutoConfig}
|
||||||
is_composition = True
|
is_composition = True
|
||||||
|
|
||||||
def __init__(self, projection_dim=512, logit_scale_init_value=2.6592, **kwargs):
|
def __init__(self, projection_dim=512, logit_scale_init_value=2.6592, **kwargs):
|
||||||
|
|||||||
@@ -14,9 +14,6 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""X-CLIP model configuration"""
|
"""X-CLIP model configuration"""
|
||||||
|
|
||||||
import os
|
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
|
|
||||||
@@ -79,6 +76,7 @@ class XCLIPTextConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "xclip_text_model"
|
model_type = "xclip_text_model"
|
||||||
|
base_config_key = "text_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -112,24 +110,6 @@ class XCLIPTextConfig(PretrainedConfig):
|
|||||||
self.initializer_factor = initializer_factor
|
self.initializer_factor = initializer_factor
|
||||||
self.attention_dropout = attention_dropout
|
self.attention_dropout = attention_dropout
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the text config dict if we are loading from XCLIPConfig
|
|
||||||
if config_dict.get("model_type") == "xclip":
|
|
||||||
config_dict = config_dict["text_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class XCLIPVisionConfig(PretrainedConfig):
|
class XCLIPVisionConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -195,6 +175,7 @@ class XCLIPVisionConfig(PretrainedConfig):
|
|||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "xclip_vision_model"
|
model_type = "xclip_vision_model"
|
||||||
|
base_config_key = "vision_config"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -239,24 +220,6 @@ class XCLIPVisionConfig(PretrainedConfig):
|
|||||||
self.hidden_act = hidden_act
|
self.hidden_act = hidden_act
|
||||||
self.drop_path_rate = drop_path_rate
|
self.drop_path_rate = drop_path_rate
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
|
||||||
cls._set_token_in_kwargs(kwargs)
|
|
||||||
|
|
||||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# get the vision config dict if we are loading from XCLIPConfig
|
|
||||||
if config_dict.get("model_type") == "xclip":
|
|
||||||
config_dict = config_dict["vision_config"]
|
|
||||||
|
|
||||||
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
|
||||||
logger.warning(
|
|
||||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
|
||||||
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
|
||||||
)
|
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class XCLIPConfig(PretrainedConfig):
|
class XCLIPConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
@@ -295,6 +258,7 @@ class XCLIPConfig(PretrainedConfig):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
model_type = "xclip"
|
model_type = "xclip"
|
||||||
|
sub_configs = {"text_config": XCLIPTextConfig, "vision_config": XCLIPVisionConfig}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -457,11 +457,20 @@ class AlignModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = AlignModelTester(self)
|
self.model_tester = AlignModelTester(self)
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self,
|
||||||
|
config_class=AlignConfig,
|
||||||
|
has_text_modality=False,
|
||||||
|
common_properties=["projection_dim", "temperature_init_value"],
|
||||||
|
)
|
||||||
|
|
||||||
def test_model(self):
|
def test_model(self):
|
||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_model(*config_and_inputs)
|
self.model_tester.create_and_check_model(*config_and_inputs)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
@unittest.skip(reason="Start to fail after using torch `cu118`.")
|
@unittest.skip(reason="Start to fail after using torch `cu118`.")
|
||||||
def test_multi_gpu_data_parallel_forward(self):
|
def test_multi_gpu_data_parallel_forward(self):
|
||||||
super().test_multi_gpu_data_parallel_forward()
|
super().test_multi_gpu_data_parallel_forward()
|
||||||
|
|||||||
@@ -452,11 +452,20 @@ class AltCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = AltCLIPModelTester(self)
|
self.model_tester = AltCLIPModelTester(self)
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self,
|
||||||
|
config_class=AltCLIPConfig,
|
||||||
|
has_text_modality=False,
|
||||||
|
common_properties=["projection_dim", "logit_scale_init_value"],
|
||||||
|
)
|
||||||
|
|
||||||
def test_model(self):
|
def test_model(self):
|
||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_model(*config_and_inputs)
|
self.model_tester.create_and_check_model(*config_and_inputs)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
@unittest.skip(reason="Hidden_states is tested in individual model tests")
|
@unittest.skip(reason="Hidden_states is tested in individual model tests")
|
||||||
def test_hidden_states_output(self):
|
def test_hidden_states_output(self):
|
||||||
pass
|
pass
|
||||||
|
|||||||
@@ -449,11 +449,18 @@ class BlipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = BlipModelTester(self)
|
self.model_tester = BlipModelTester(self)
|
||||||
|
common_properties = ["logit_scale_init_value", "image_text_hidden_size", "projection_dim", "label_smoothing"]
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=BlipConfig, has_text_modality=False, common_properties=common_properties
|
||||||
|
)
|
||||||
|
|
||||||
def test_model(self):
|
def test_model(self):
|
||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_model(*config_and_inputs)
|
self.model_tester.create_and_check_model(*config_and_inputs)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
@unittest.skip(reason="Hidden_states is tested in individual model tests")
|
@unittest.skip(reason="Hidden_states is tested in individual model tests")
|
||||||
def test_hidden_states_output(self):
|
def test_hidden_states_output(self):
|
||||||
pass
|
pass
|
||||||
|
|||||||
@@ -482,6 +482,13 @@ class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationT
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = Blip2ForConditionalGenerationDecoderOnlyModelTester(self)
|
self.model_tester = Blip2ForConditionalGenerationDecoderOnlyModelTester(self)
|
||||||
|
common_properties = ["image_token_index", "num_query_tokens", "image_text_hidden_size"]
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=Blip2Config, has_text_modality=False, common_properties=common_properties
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
def test_for_conditional_generation(self):
|
def test_for_conditional_generation(self):
|
||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
|
|||||||
@@ -515,11 +515,18 @@ class ClapModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = ClapModelTester(self)
|
self.model_tester = ClapModelTester(self)
|
||||||
|
common_properties = ["logit_scale_init_value", "projection_hidden_act", "projection_dim"]
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=ClapConfig, has_text_modality=False, common_properties=common_properties
|
||||||
|
)
|
||||||
|
|
||||||
def test_model(self):
|
def test_model(self):
|
||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_model(*config_and_inputs)
|
self.model_tester.create_and_check_model(*config_and_inputs)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
@unittest.skip(reason="Hidden_states is tested in individual model tests")
|
@unittest.skip(reason="Hidden_states is tested in individual model tests")
|
||||||
def test_hidden_states_output(self):
|
def test_hidden_states_output(self):
|
||||||
pass
|
pass
|
||||||
|
|||||||
@@ -745,11 +745,18 @@ class CLIPModelTest(CLIPModelTesterMixin, PipelineTesterMixin, unittest.TestCase
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = CLIPModelTester(self)
|
self.model_tester = CLIPModelTester(self)
|
||||||
|
common_properties = ["projection_dim", "logit_scale_init_value"]
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=CLIPConfig, has_text_modality=False, common_properties=common_properties
|
||||||
|
)
|
||||||
|
|
||||||
def test_model(self):
|
def test_model(self):
|
||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_model(*config_and_inputs)
|
self.model_tester.create_and_check_model(*config_and_inputs)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
@unittest.skip(reason="Hidden_states is tested in individual model tests")
|
@unittest.skip(reason="Hidden_states is tested in individual model tests")
|
||||||
def test_hidden_states_output(self):
|
def test_hidden_states_output(self):
|
||||||
pass
|
pass
|
||||||
|
|||||||
@@ -472,11 +472,18 @@ class CLIPSegModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = CLIPSegModelTester(self)
|
self.model_tester = CLIPSegModelTester(self)
|
||||||
|
common_properties = ["projection_dim", "logit_scale_init_value"]
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=CLIPSegConfig, has_text_modality=False, common_properties=common_properties
|
||||||
|
)
|
||||||
|
|
||||||
def test_model(self):
|
def test_model(self):
|
||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_model(*config_and_inputs)
|
self.model_tester.create_and_check_model(*config_and_inputs)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
def test_model_for_image_segmentation(self):
|
def test_model_for_image_segmentation(self):
|
||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_model_for_image_segmentation(*config_and_inputs)
|
self.model_tester.create_and_check_model_for_image_segmentation(*config_and_inputs)
|
||||||
|
|||||||
@@ -414,7 +414,13 @@ class ClvpModelForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase)
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = ClvpModelForConditionalGenerationTester(self)
|
self.model_tester = ClvpModelForConditionalGenerationTester(self)
|
||||||
self.clvp_config_tester = ConfigTester(self, config_class=ClvpConfig, hidden_size=32)
|
common_properties = ["projection_dim", "logit_scale_init_value"]
|
||||||
|
self.clvp_config_tester = ConfigTester(
|
||||||
|
self, config_class=ClvpConfig, has_text_modality=False, common_properties=common_properties, hidden_size=32
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.clvp_config_tester.run_common_tests()
|
||||||
|
|
||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
super().tearDown()
|
super().tearDown()
|
||||||
|
|||||||
@@ -931,11 +931,18 @@ class FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = self.class_for_tester(self)
|
self.model_tester = self.class_for_tester(self)
|
||||||
|
common_properties = ["projection_dim", "logit_scale_init_value", "init_codebook"]
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=FlavaConfig, has_text_modality=False, common_properties=common_properties
|
||||||
|
)
|
||||||
|
|
||||||
def test_model(self):
|
def test_model(self):
|
||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
self.model_tester.create_and_check_model(*config_and_inputs)
|
self.model_tester.create_and_check_model(*config_and_inputs)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
@unittest.skip(reason="tested in individual model tests")
|
@unittest.skip(reason="tested in individual model tests")
|
||||||
def test_hidden_states_output(self):
|
def test_hidden_states_output(self):
|
||||||
pass
|
pass
|
||||||
|
|||||||
@@ -559,11 +559,18 @@ class GroupViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = GroupViTModelTester(self)
|
self.model_tester = GroupViTModelTester(self)
|
||||||
|
common_properties = ["projection_dim", "projection_intermediate_dim", "logit_scale_init_value"]
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=GroupViTConfig, has_text_modality=False, common_properties=common_properties
|
||||||
|
)
|
||||||
|
|
||||||
def test_model(self):
|
def test_model(self):
|
||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_model(*config_and_inputs)
|
self.model_tester.create_and_check_model(*config_and_inputs)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
@unittest.skip(reason="hidden_states are tested in individual model tests")
|
@unittest.skip(reason="hidden_states are tested in individual model tests")
|
||||||
def test_hidden_states_output(self):
|
def test_hidden_states_output(self):
|
||||||
pass
|
pass
|
||||||
|
|||||||
@@ -185,7 +185,12 @@ class Idefics2ModelTest(ModelTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = Idefics2VisionText2TextModelTester(self)
|
self.model_tester = Idefics2VisionText2TextModelTester(self)
|
||||||
self.config_tester = ConfigTester(self, config_class=Idefics2Config, has_text_modality=False)
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=Idefics2Config, has_text_modality=False, common_properties=["image_token_id"]
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
@unittest.skip(reason="input_embeds cannot be passed in without input_ids")
|
@unittest.skip(reason="input_embeds cannot be passed in without input_ids")
|
||||||
def test_inputs_embeds():
|
def test_inputs_embeds():
|
||||||
|
|||||||
@@ -168,7 +168,12 @@ class Idefics3ModelTest(ModelTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = Idefics3VisionText2TextModelTester(self)
|
self.model_tester = Idefics3VisionText2TextModelTester(self)
|
||||||
self.config_tester = ConfigTester(self, config_class=Idefics3Config, has_text_modality=False)
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=Idefics3Config, has_text_modality=False, common_properties=["image_token_id"]
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
@unittest.skip(reason="input_embeds cannot be passed in without input_ids")
|
@unittest.skip(reason="input_embeds cannot be passed in without input_ids")
|
||||||
def test_inputs_embeds():
|
def test_inputs_embeds():
|
||||||
|
|||||||
@@ -486,6 +486,15 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = InstructBlipForConditionalGenerationDecoderOnlyModelTester(self)
|
self.model_tester = InstructBlipForConditionalGenerationDecoderOnlyModelTester(self)
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self,
|
||||||
|
config_class=InstructBlipConfig,
|
||||||
|
has_text_modality=False,
|
||||||
|
common_properties=["num_query_tokens", "image_token_index"],
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
def test_for_conditional_generation(self):
|
def test_for_conditional_generation(self):
|
||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
|
|||||||
@@ -510,11 +510,18 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester(self)
|
self.model_tester = InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester(self)
|
||||||
|
common_properties = ["num_query_tokens", "video_token_index"]
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=InstructBlipVideoConfig, has_text_modality=False, common_properties=common_properties
|
||||||
|
)
|
||||||
|
|
||||||
def test_for_conditional_generation(self):
|
def test_for_conditional_generation(self):
|
||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs)
|
self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
@unittest.skip(reason="Hidden_states is tested in individual model tests")
|
@unittest.skip(reason="Hidden_states is tested in individual model tests")
|
||||||
def test_hidden_states_output(self):
|
def test_hidden_states_output(self):
|
||||||
pass
|
pass
|
||||||
|
|||||||
@@ -304,7 +304,12 @@ class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = Kosmos2ModelTester(self)
|
self.model_tester = Kosmos2ModelTester(self)
|
||||||
self.config_tester = ConfigTester(self, config_class=Kosmos2Config, hidden_size=37)
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=Kosmos2Config, has_text_modality=False, common_properties=["latent_query_num"]
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
# overwrite from common to skip `image_to_text_projection.latent_query`
|
# overwrite from common to skip `image_to_text_projection.latent_query`
|
||||||
def test_initialization(self):
|
def test_initialization(self):
|
||||||
|
|||||||
@@ -194,7 +194,13 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = LlavaVisionText2TextModelTester(self)
|
self.model_tester = LlavaVisionText2TextModelTester(self)
|
||||||
self.config_tester = ConfigTester(self, config_class=LlavaConfig, has_text_modality=False)
|
common_properties = ["image_token_index", "vision_feature_layer", "image_seq_length"]
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=LlavaConfig, has_text_modality=False, common_properties=common_properties
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
|
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
|
||||||
def test_inputs_embeds(self):
|
def test_inputs_embeds(self):
|
||||||
|
|||||||
@@ -223,7 +223,13 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = LlavaNextVisionText2TextModelTester(self)
|
self.model_tester = LlavaNextVisionText2TextModelTester(self)
|
||||||
self.config_tester = ConfigTester(self, config_class=LlavaNextConfig, has_text_modality=False)
|
common_properties = ["image_token_index", "vision_feature_layer", "image_seq_length"]
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=LlavaNextConfig, has_text_modality=False, common_properties=common_properties
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
def test_initialization(self):
|
def test_initialization(self):
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|||||||
@@ -240,7 +240,13 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = LlavaNextVideoVisionText2TextModelTester(self)
|
self.model_tester = LlavaNextVideoVisionText2TextModelTester(self)
|
||||||
self.config_tester = ConfigTester(self, config_class=LlavaNextVideoConfig, has_text_modality=False)
|
common_properties = ["image_token_index", "video_token_index", "vision_feature_layer", "image_seq_length"]
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=LlavaNextVideoConfig, has_text_modality=False, common_properties=common_properties
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
def test_initialization(self):
|
def test_initialization(self):
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|||||||
@@ -226,7 +226,13 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = LlavaOnevisionVisionText2TextModelTester(self)
|
self.model_tester = LlavaOnevisionVisionText2TextModelTester(self)
|
||||||
self.config_tester = ConfigTester(self, config_class=LlavaOnevisionConfig, has_text_modality=False)
|
common_properties = ["image_token_index", "video_token_index", "vision_feature_layer"]
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=LlavaOnevisionConfig, has_text_modality=False, common_properties=common_properties
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
def test_initialization(self):
|
def test_initialization(self):
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|||||||
@@ -272,7 +272,12 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = MllamaVisionText2TextModelTester(self)
|
self.model_tester = MllamaVisionText2TextModelTester(self)
|
||||||
self.config_tester = ConfigTester(self, config_class=MllamaConfig, has_text_modality=False)
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=MllamaConfig, has_text_modality=False, common_properties=["image_token_index"]
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
|
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
|
||||||
def test_inputs_embeds(self):
|
def test_inputs_embeds(self):
|
||||||
|
|||||||
@@ -447,6 +447,13 @@ class Owlv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = Owlv2ModelTester(self)
|
self.model_tester = Owlv2ModelTester(self)
|
||||||
|
common_properties = ["projection_dim", "logit_scale_init_value"]
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=Owlv2Config, has_text_modality=False, common_properties=common_properties
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
def test_model(self):
|
def test_model(self):
|
||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
|
|||||||
@@ -442,6 +442,13 @@ class OwlViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = OwlViTModelTester(self)
|
self.model_tester = OwlViTModelTester(self)
|
||||||
|
common_properties = ["projection_dim", "logit_scale_init_value"]
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=OwlViTConfig, has_text_modality=False, common_properties=common_properties
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
def test_model(self):
|
def test_model(self):
|
||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
|
|||||||
@@ -232,6 +232,9 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
|
|||||||
self.model_tester = Qwen2VLVisionText2TextModelTester(self)
|
self.model_tester = Qwen2VLVisionText2TextModelTester(self)
|
||||||
self.config_tester = ConfigTester(self, config_class=Qwen2VLConfig, has_text_modality=False)
|
self.config_tester = ConfigTester(self, config_class=Qwen2VLConfig, has_text_modality=False)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
def test_initialization(self):
|
def test_initialization(self):
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|
||||||
|
|||||||
@@ -667,9 +667,12 @@ class SiglipModelTest(SiglipModelTesterMixin, PipelineTesterMixin, unittest.Test
|
|||||||
test_disk_offload_bin = False
|
test_disk_offload_bin = False
|
||||||
_is_composite = True
|
_is_composite = True
|
||||||
|
|
||||||
# Copied from tests.models.clip.test_modeling_clip.CLIPModelTest.setUp with CLIP->Siglip
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = SiglipModelTester(self)
|
self.model_tester = SiglipModelTester(self)
|
||||||
|
self.config_tester = ConfigTester(self, config_class=SiglipConfig, has_text_modality=False)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
# Copied from tests.models.clip.test_modeling_clip.CLIPModelTest.test_model
|
# Copied from tests.models.clip.test_modeling_clip.CLIPModelTest.test_model
|
||||||
def test_model(self):
|
def test_model(self):
|
||||||
|
|||||||
@@ -217,7 +217,13 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = VideoLlavaVisionText2TextModelTester(self)
|
self.model_tester = VideoLlavaVisionText2TextModelTester(self)
|
||||||
self.config_tester = ConfigTester(self, config_class=VideoLlavaConfig, has_text_modality=False)
|
common_properties = ["image_token_index", "video_token_index", "vision_feature_layer", "image_seq_length"]
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=VideoLlavaConfig, has_text_modality=False, common_properties=common_properties
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
@unittest.skip(
|
@unittest.skip(
|
||||||
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||||
|
|||||||
@@ -179,7 +179,13 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = VipLlavaVisionText2TextModelTester(self)
|
self.model_tester = VipLlavaVisionText2TextModelTester(self)
|
||||||
self.config_tester = ConfigTester(self, config_class=VipLlavaConfig, has_text_modality=False)
|
common_properties = ["image_token_index", "vision_feature_layers", "image_seq_length"]
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=VipLlavaConfig, has_text_modality=False, common_properties=common_properties
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
|
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
|
||||||
def test_inputs_embeds(self):
|
def test_inputs_embeds(self):
|
||||||
|
|||||||
@@ -547,6 +547,13 @@ class XCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = XCLIPModelTester(self)
|
self.model_tester = XCLIPModelTester(self)
|
||||||
|
common_properties = ["projection_dim", "prompt_layers", "prompt_num_attention_heads"]
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=XCLIPConfig, has_text_modality=False, common_properties=common_properties
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
def test_model(self):
|
def test_model(self):
|
||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
|
|||||||
@@ -17,12 +17,17 @@ import copy
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from transformers import is_torch_available
|
from transformers import is_torch_available
|
||||||
|
from transformers.utils import direct_transformers_import
|
||||||
|
|
||||||
from .utils.test_configuration_utils import config_common_kwargs
|
from .utils.test_configuration_utils import config_common_kwargs
|
||||||
|
|
||||||
|
|
||||||
|
transformers_module = direct_transformers_import(Path(__file__).parent)
|
||||||
|
|
||||||
|
|
||||||
class ConfigTester:
|
class ConfigTester:
|
||||||
def __init__(self, parent, config_class=None, has_text_modality=True, common_properties=None, **kwargs):
|
def __init__(self, parent, config_class=None, has_text_modality=True, common_properties=None, **kwargs):
|
||||||
self.parent = parent
|
self.parent = parent
|
||||||
@@ -35,9 +40,10 @@ class ConfigTester:
|
|||||||
config = self.config_class(**self.inputs_dict)
|
config = self.config_class(**self.inputs_dict)
|
||||||
common_properties = (
|
common_properties = (
|
||||||
["hidden_size", "num_attention_heads", "num_hidden_layers"]
|
["hidden_size", "num_attention_heads", "num_hidden_layers"]
|
||||||
if self.common_properties is None
|
if self.common_properties is None and not self.config_class.sub_configs
|
||||||
else self.common_properties
|
else self.common_properties
|
||||||
)
|
)
|
||||||
|
common_properties = [] if common_properties is None else common_properties
|
||||||
|
|
||||||
# Add common fields for text models
|
# Add common fields for text models
|
||||||
if self.has_text_modality:
|
if self.has_text_modality:
|
||||||
@@ -110,6 +116,44 @@ class ConfigTester:
|
|||||||
|
|
||||||
self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
|
self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
|
||||||
|
|
||||||
|
def create_and_test_config_from_and_save_pretrained_composite(self):
|
||||||
|
"""
|
||||||
|
Tests that composite or nested configs can be loaded and saved correctly. In case the config
|
||||||
|
has a sub-config, we should be able to call `sub_config.from_pretrained('general_config_file')`
|
||||||
|
and get a result same as if we loaded the whole config and obtained `config.sub_config` from it.
|
||||||
|
"""
|
||||||
|
config = self.config_class(**self.inputs_dict)
|
||||||
|
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||||
|
config.save_pretrained(tmpdirname)
|
||||||
|
general_config_loaded = self.config_class.from_pretrained(tmpdirname)
|
||||||
|
general_config_dict = config.to_dict()
|
||||||
|
|
||||||
|
# Iterate over all sub_configs if there are any and load them with their own classes
|
||||||
|
sub_configs = self.config_class.sub_configs
|
||||||
|
for sub_config_key, sub_class in sub_configs.items():
|
||||||
|
if sub_class.__name__ == "AutoConfig":
|
||||||
|
sub_class = sub_class.for_model(**general_config_dict[sub_config_key]).__class__
|
||||||
|
sub_config_loaded = sub_class.from_pretrained(tmpdirname)
|
||||||
|
else:
|
||||||
|
sub_config_loaded = sub_class.from_pretrained(tmpdirname)
|
||||||
|
|
||||||
|
# Pop `transformers_version`, it never exists when a config is part of a general composite config
|
||||||
|
# Verify that loading with subconfig class results in same dict as if we loaded with general composite config class
|
||||||
|
sub_config_loaded_dict = sub_config_loaded.to_dict()
|
||||||
|
sub_config_loaded_dict.pop("transformers_version", None)
|
||||||
|
self.parent.assertEqual(sub_config_loaded_dict, general_config_dict[sub_config_key])
|
||||||
|
|
||||||
|
# Verify that the loaded config type is same as in the general config
|
||||||
|
type_from_general_config = type(getattr(general_config_loaded, sub_config_key))
|
||||||
|
self.parent.assertTrue(isinstance(sub_config_loaded, type_from_general_config))
|
||||||
|
|
||||||
|
# Now save only the sub-config and load it back to make sure the whole load-save-load pipeline works
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdirname2:
|
||||||
|
sub_config_loaded.save_pretrained(tmpdirname2)
|
||||||
|
sub_config_loaded_2 = sub_class.from_pretrained(tmpdirname2)
|
||||||
|
self.parent.assertEqual(sub_config_loaded.to_dict(), sub_config_loaded_2.to_dict())
|
||||||
|
|
||||||
def create_and_test_config_with_num_labels(self):
|
def create_and_test_config_with_num_labels(self):
|
||||||
config = self.config_class(**self.inputs_dict, num_labels=5)
|
config = self.config_class(**self.inputs_dict, num_labels=5)
|
||||||
self.parent.assertEqual(len(config.id2label), 5)
|
self.parent.assertEqual(len(config.id2label), 5)
|
||||||
@@ -128,6 +172,9 @@ class ConfigTester:
|
|||||||
self.parent.assertIsNotNone(config)
|
self.parent.assertIsNotNone(config)
|
||||||
|
|
||||||
def check_config_arguments_init(self):
|
def check_config_arguments_init(self):
|
||||||
|
if self.config_class.sub_configs:
|
||||||
|
return # TODO: @raushan composite models are not consistent in how they set general params
|
||||||
|
|
||||||
kwargs = copy.deepcopy(config_common_kwargs)
|
kwargs = copy.deepcopy(config_common_kwargs)
|
||||||
config = self.config_class(**kwargs)
|
config = self.config_class(**kwargs)
|
||||||
wrong_values = []
|
wrong_values = []
|
||||||
@@ -153,6 +200,7 @@ class ConfigTester:
|
|||||||
self.create_and_test_config_to_json_file()
|
self.create_and_test_config_to_json_file()
|
||||||
self.create_and_test_config_from_and_save_pretrained()
|
self.create_and_test_config_from_and_save_pretrained()
|
||||||
self.create_and_test_config_from_and_save_pretrained_subfolder()
|
self.create_and_test_config_from_and_save_pretrained_subfolder()
|
||||||
|
self.create_and_test_config_from_and_save_pretrained_composite()
|
||||||
self.create_and_test_config_with_num_labels()
|
self.create_and_test_config_with_num_labels()
|
||||||
self.check_config_can_be_init_without_params()
|
self.check_config_can_be_init_without_params()
|
||||||
self.check_config_arguments_init()
|
self.check_config_arguments_init()
|
||||||
|
|||||||
@@ -3802,22 +3802,18 @@ class ModelTesterMixin:
|
|||||||
self.skipTest("Model is not a composite model.")
|
self.skipTest("Model is not a composite model.")
|
||||||
|
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
sub_configs = {
|
|
||||||
key: getattr(config, key) for key in config if isinstance(getattr(config, key), PretrainedConfig)
|
|
||||||
}
|
|
||||||
|
|
||||||
# set eager as it will be the one supported in all models
|
# set eager as it will be the one supported in all models
|
||||||
# we just need to test if passing 'attn_implementation' as a dict fails or not
|
# we just need to test if passing 'attn_implementation' as a dict fails or not
|
||||||
attn_implementation_per_subconfig = {}
|
attn_implementation_per_subconfig = {}
|
||||||
for key, sub_config in sub_configs.items():
|
for key in config.sub_configs.keys():
|
||||||
attn_implementation_per_subconfig[key] = "eager"
|
attn_implementation_per_subconfig[key] = "eager"
|
||||||
|
|
||||||
config._attn_implementation = attn_implementation_per_subconfig
|
config._attn_implementation = attn_implementation_per_subconfig
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
for key in model.config:
|
for key in config.sub_configs.keys():
|
||||||
if isinstance(getattr(model.config, key), PretrainedConfig):
|
sub_config = getattr(model.config, key)
|
||||||
sub_config = getattr(model.config, key)
|
self.assertTrue(sub_config._attn_implementation == "eager")
|
||||||
self.assertTrue(sub_config._attn_implementation == "eager")
|
|
||||||
|
|
||||||
for name, submodule in model.named_modules():
|
for name, submodule in model.named_modules():
|
||||||
class_name = submodule.__class__.__name__
|
class_name = submodule.__class__.__name__
|
||||||
@@ -3826,7 +3822,7 @@ class ModelTesterMixin:
|
|||||||
or "SdpaSelfAttention" in class_name
|
or "SdpaSelfAttention" in class_name
|
||||||
or "FlashAttention" in class_name
|
or "FlashAttention" in class_name
|
||||||
):
|
):
|
||||||
raise ValueError("The eager model should not have SDPA/FA2 attention layers")
|
raise ValueError(f"The eager model should not have SDPA/FA2 attention layers but got {class_name}")
|
||||||
|
|
||||||
@require_torch_sdpa
|
@require_torch_sdpa
|
||||||
def test_sdpa_can_dispatch_non_composite_models(self):
|
def test_sdpa_can_dispatch_non_composite_models(self):
|
||||||
|
|||||||
Reference in New Issue
Block a user