[configuration] remove redundant classmethod (#38812)
* remove redundant classmethod
* warning message, add space between words
* fix tests
* fix copies
This commit is contained in:
committed by
GitHub
parent
02ea23cbde
commit
b56d721397
@@ -1199,6 +1199,42 @@ class PretrainedConfig(PushToHubMixin):
|
|||||||
config_to_return = self
|
config_to_return = self
|
||||||
return config_to_return
|
return config_to_return
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_text_vision_configs(cls, text_config, vision_config, **kwargs):
|
||||||
|
r"""
|
||||||
|
Instantiate a model config (or a derived class) from text model configuration and vision model
|
||||||
|
configuration.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
[`PreTrainedConfig`]: An instance of a configuration object
|
||||||
|
"""
|
||||||
|
|
||||||
|
warnings.warn(
|
||||||
|
"The `from_text_vision_configs` method is deprecated and will be removed in v4.60 of Transformers. Please instantiate "
|
||||||
|
"the config class directly with `MyConfig(text_config=text_config, vision_config=vision_config, **kwargs)` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
|
||||||
|
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_text_audio_configs(cls, text_config, audio_config, **kwargs):
|
||||||
|
r"""
|
||||||
|
Instantiate a model config (or a derived class) from text model configuration and audio model
|
||||||
|
configuration.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
[`PreTrainedConfig`]: An instance of a configuration object
|
||||||
|
"""
|
||||||
|
|
||||||
|
warnings.warn(
|
||||||
|
"The `from_text_audio_configs` method is deprecated and will be removed in v4.60 of Transformers. Please instantiate "
|
||||||
|
"the config class directly with `MyConfig(text_config=text_config, audio_config=audio_config, **kwargs)` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
|
||||||
|
return cls(text_config=text_config.to_dict(), audio_config=audio_config.to_dict(), **kwargs)
|
||||||
|
|
||||||
|
|
||||||
def get_configuration_file(configuration_files: list[str]) -> str:
|
def get_configuration_file(configuration_files: list[str]) -> str:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -280,17 +280,5 @@ class Aimv2Config(PretrainedConfig):
|
|||||||
self.logit_scale_init_value = logit_scale_init_value
|
self.logit_scale_init_value = logit_scale_init_value
|
||||||
self.max_logit_scale = 100.0
|
self.max_logit_scale = 100.0
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_text_vision_configs(cls, text_config: Aimv2TextConfig, vision_config: Aimv2VisionConfig, **kwargs):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`Aimv2Config`] (or a derived class) from aimv2 text model configuration and aimv2 vision
|
|
||||||
model configuration.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
[`Aimv2Config`]: An instance of a configuration object
|
|
||||||
"""
|
|
||||||
|
|
||||||
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Aimv2Config", "Aimv2VisionConfig", "Aimv2TextConfig"]
|
__all__ = ["Aimv2Config", "Aimv2VisionConfig", "Aimv2TextConfig"]
|
||||||
|
|||||||
@@ -327,17 +327,5 @@ class AlignConfig(PretrainedConfig):
|
|||||||
self.temperature_init_value = temperature_init_value
|
self.temperature_init_value = temperature_init_value
|
||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_text_vision_configs(cls, text_config: AlignTextConfig, vision_config: AlignVisionConfig, **kwargs):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`AlignConfig`] (or a derived class) from align text model configuration and align vision model
|
|
||||||
configuration.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
[`AlignConfig`]: An instance of a configuration object
|
|
||||||
"""
|
|
||||||
|
|
||||||
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["AlignTextConfig", "AlignVisionConfig", "AlignConfig"]
|
__all__ = ["AlignTextConfig", "AlignVisionConfig", "AlignConfig"]
|
||||||
|
|||||||
@@ -368,17 +368,5 @@ class AltCLIPConfig(PretrainedConfig):
|
|||||||
self.logit_scale_init_value = logit_scale_init_value
|
self.logit_scale_init_value = logit_scale_init_value
|
||||||
self.initializer_factor = 1.0
|
self.initializer_factor = 1.0
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_text_vision_configs(cls, text_config: AltCLIPTextConfig, vision_config: AltCLIPVisionConfig, **kwargs):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`AltCLIPConfig`] (or a derived class) from altclip text model configuration and altclip vision
|
|
||||||
model configuration.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
[`AltCLIPConfig`]: An instance of a configuration object
|
|
||||||
"""
|
|
||||||
|
|
||||||
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["AltCLIPTextConfig", "AltCLIPVisionConfig", "AltCLIPConfig"]
|
__all__ = ["AltCLIPTextConfig", "AltCLIPVisionConfig", "AltCLIPConfig"]
|
||||||
|
|||||||
@@ -313,17 +313,5 @@ class BlipConfig(PretrainedConfig):
|
|||||||
self.image_text_hidden_size = image_text_hidden_size
|
self.image_text_hidden_size = image_text_hidden_size
|
||||||
self.label_smoothing = label_smoothing
|
self.label_smoothing = label_smoothing
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_text_vision_configs(cls, text_config: BlipTextConfig, vision_config: BlipVisionConfig, **kwargs):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`BlipConfig`] (or a derived class) from blip text model configuration and blip vision model
|
|
||||||
configuration.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
[`BlipConfig`]: An instance of a configuration object
|
|
||||||
"""
|
|
||||||
|
|
||||||
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["BlipConfig", "BlipTextConfig", "BlipVisionConfig"]
|
__all__ = ["BlipConfig", "BlipTextConfig", "BlipVisionConfig"]
|
||||||
|
|||||||
@@ -304,16 +304,5 @@ class BridgeTowerConfig(PretrainedConfig):
|
|||||||
self.text_config = BridgeTowerTextConfig(**text_config)
|
self.text_config = BridgeTowerTextConfig(**text_config)
|
||||||
self.vision_config = BridgeTowerVisionConfig(**vision_config)
|
self.vision_config = BridgeTowerVisionConfig(**vision_config)
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_text_vision_configs(
|
|
||||||
cls, text_config: BridgeTowerTextConfig, vision_config: BridgeTowerVisionConfig, **kwargs
|
|
||||||
):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`BridgeTowerConfig`] (or a derived class) from BridgeTower text model configuration. Returns:
|
|
||||||
[`BridgeTowerConfig`]: An instance of a configuration object
|
|
||||||
"""
|
|
||||||
|
|
||||||
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["BridgeTowerConfig", "BridgeTowerTextConfig", "BridgeTowerVisionConfig"]
|
__all__ = ["BridgeTowerConfig", "BridgeTowerTextConfig", "BridgeTowerVisionConfig"]
|
||||||
|
|||||||
@@ -373,18 +373,6 @@ class ChineseCLIPConfig(PretrainedConfig):
|
|||||||
self.initializer_factor = 1.0
|
self.initializer_factor = 1.0
|
||||||
self.initializer_range = 0.02
|
self.initializer_range = 0.02
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_text_vision_configs(
|
|
||||||
cls, text_config: ChineseCLIPTextConfig, vision_config: ChineseCLIPVisionConfig, **kwargs
|
|
||||||
):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`ChineseCLIPConfig`] (or a derived class) from Chinese-CLIP text model configuration and
|
|
||||||
Chinese-CLIP vision model configuration. Returns:
|
|
||||||
[`ChineseCLIPConfig`]: An instance of a configuration object
|
|
||||||
"""
|
|
||||||
|
|
||||||
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class ChineseCLIPOnnxConfig(OnnxConfig):
|
class ChineseCLIPOnnxConfig(OnnxConfig):
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -378,17 +378,5 @@ class ClapConfig(PretrainedConfig):
|
|||||||
self.initializer_factor = initializer_factor
|
self.initializer_factor = initializer_factor
|
||||||
self.num_hidden_layers = self.text_config.num_hidden_layers + len(self.audio_config.depths)
|
self.num_hidden_layers = self.text_config.num_hidden_layers + len(self.audio_config.depths)
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_text_audio_configs(cls, text_config: ClapTextConfig, audio_config: ClapAudioConfig, **kwargs):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`ClapConfig`] (or a derived class) from clap text model configuration and clap audio model
|
|
||||||
configuration.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
[`ClapConfig`]: An instance of a configuration object
|
|
||||||
"""
|
|
||||||
|
|
||||||
return cls(text_config=text_config.to_dict(), audio_config=audio_config.to_dict(), **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["ClapAudioConfig", "ClapConfig", "ClapTextConfig"]
|
__all__ = ["ClapAudioConfig", "ClapConfig", "ClapTextConfig"]
|
||||||
|
|||||||
@@ -361,18 +361,6 @@ class CLIPConfig(PretrainedConfig):
|
|||||||
self.logit_scale_init_value = logit_scale_init_value
|
self.logit_scale_init_value = logit_scale_init_value
|
||||||
self.initializer_factor = 1.0
|
self.initializer_factor = 1.0
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model
|
|
||||||
configuration.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
[`CLIPConfig`]: An instance of a configuration object
|
|
||||||
"""
|
|
||||||
|
|
||||||
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class CLIPOnnxConfig(OnnxConfig):
|
class CLIPOnnxConfig(OnnxConfig):
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -380,17 +380,5 @@ class CLIPSegConfig(PretrainedConfig):
|
|||||||
self.initializer_factor = 1.0
|
self.initializer_factor = 1.0
|
||||||
self.use_complex_transposed_convolution = use_complex_transposed_convolution
|
self.use_complex_transposed_convolution = use_complex_transposed_convolution
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_text_vision_configs(cls, text_config: CLIPSegTextConfig, vision_config: CLIPSegVisionConfig, **kwargs):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`CLIPSegConfig`] (or a derived class) from clipseg text model configuration and clipseg vision
|
|
||||||
model configuration.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
[`CLIPSegConfig`]: An instance of a configuration object
|
|
||||||
"""
|
|
||||||
|
|
||||||
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["CLIPSegConfig", "CLIPSegTextConfig", "CLIPSegVisionConfig"]
|
__all__ = ["CLIPSegConfig", "CLIPSegTextConfig", "CLIPSegVisionConfig"]
|
||||||
|
|||||||
@@ -357,18 +357,6 @@ class GroupViTConfig(PretrainedConfig):
|
|||||||
self.initializer_factor = 1.0
|
self.initializer_factor = 1.0
|
||||||
self.output_segmentation = False
|
self.output_segmentation = False
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_text_vision_configs(cls, text_config: GroupViTTextConfig, vision_config: GroupViTVisionConfig, **kwargs):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`GroupViTConfig`] (or a derived class) from groupvit text model configuration and groupvit
|
|
||||||
vision model configuration.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
[`GroupViTConfig`]: An instance of a configuration object
|
|
||||||
"""
|
|
||||||
|
|
||||||
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class GroupViTOnnxConfig(OnnxConfig):
|
class GroupViTOnnxConfig(OnnxConfig):
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -332,19 +332,5 @@ class Pix2StructConfig(PretrainedConfig):
|
|||||||
|
|
||||||
self.is_vqa = is_vqa
|
self.is_vqa = is_vqa
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_text_vision_configs(
|
|
||||||
cls, text_config: Pix2StructTextConfig, vision_config: Pix2StructVisionConfig, **kwargs
|
|
||||||
):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`Pix2StructConfig`] (or a derived class) from pix2struct text model configuration and pix2struct
|
|
||||||
vision model configuration.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
[`Pix2StructConfig`]: An instance of a configuration object
|
|
||||||
"""
|
|
||||||
|
|
||||||
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Pix2StructConfig", "Pix2StructTextConfig", "Pix2StructVisionConfig"]
|
__all__ = ["Pix2StructConfig", "Pix2StructTextConfig", "Pix2StructVisionConfig"]
|
||||||
|
|||||||
@@ -253,17 +253,5 @@ class SiglipConfig(PretrainedConfig):
|
|||||||
|
|
||||||
self.initializer_factor = 1.0
|
self.initializer_factor = 1.0
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_text_vision_configs(cls, text_config: SiglipTextConfig, vision_config: SiglipVisionConfig, **kwargs):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`SiglipConfig`] (or a derived class) from siglip text model configuration and siglip vision
|
|
||||||
model configuration.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
[`SiglipConfig`]: An instance of a configuration object
|
|
||||||
"""
|
|
||||||
|
|
||||||
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["SiglipConfig", "SiglipTextConfig", "SiglipVisionConfig"]
|
__all__ = ["SiglipConfig", "SiglipTextConfig", "SiglipVisionConfig"]
|
||||||
|
|||||||
@@ -261,17 +261,5 @@ class Siglip2Config(PretrainedConfig):
|
|||||||
|
|
||||||
self.initializer_factor = 1.0
|
self.initializer_factor = 1.0
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_text_vision_configs(cls, text_config: Siglip2TextConfig, vision_config: Siglip2VisionConfig, **kwargs):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`Siglip2Config`] (or a derived class) from siglip2 text model configuration and siglip2 vision
|
|
||||||
model configuration.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
[`Siglip2Config`]: An instance of a configuration object
|
|
||||||
"""
|
|
||||||
|
|
||||||
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Siglip2Config", "Siglip2TextConfig", "Siglip2VisionConfig"]
|
__all__ = ["Siglip2Config", "Siglip2TextConfig", "Siglip2VisionConfig"]
|
||||||
|
|||||||
@@ -365,17 +365,5 @@ class XCLIPConfig(PretrainedConfig):
|
|||||||
self.logit_scale_init_value = logit_scale_init_value
|
self.logit_scale_init_value = logit_scale_init_value
|
||||||
self.initializer_factor = 1.0
|
self.initializer_factor = 1.0
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_text_vision_configs(cls, text_config: XCLIPTextConfig, vision_config: XCLIPVisionConfig, **kwargs):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`XCLIPConfig`] (or a derived class) from xclip text model configuration and xclip vision model
|
|
||||||
configuration.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
[`XCLIPConfig`]: An instance of a configuration object
|
|
||||||
"""
|
|
||||||
|
|
||||||
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["XCLIPConfig", "XCLIPTextConfig", "XCLIPVisionConfig"]
|
__all__ = ["XCLIPConfig", "XCLIPTextConfig", "XCLIPVisionConfig"]
|
||||||
|
|||||||
@@ -408,8 +408,10 @@ class AlignModelTester:
|
|||||||
return config, input_ids, token_type_ids, input_mask, pixel_values
|
return config, input_ids, token_type_ids, input_mask, pixel_values
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return AlignConfig.from_text_vision_configs(
|
return AlignConfig(
|
||||||
self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
|
text_config=self.text_model_tester.get_config().to_dict(),
|
||||||
|
vision_config=self.vision_model_tester.get_config().to_dict(),
|
||||||
|
projection_dim=64,
|
||||||
)
|
)
|
||||||
|
|
||||||
def create_and_check_model(self, config, input_ids, token_type_ids, attention_mask, pixel_values):
|
def create_and_check_model(self, config, input_ids, token_type_ids, attention_mask, pixel_values):
|
||||||
|
|||||||
@@ -376,8 +376,10 @@ class AltCLIPModelTester:
|
|||||||
return config, input_ids, attention_mask, pixel_values
|
return config, input_ids, attention_mask, pixel_values
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return AltCLIPConfig.from_text_vision_configs(
|
return AltCLIPConfig(
|
||||||
self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
|
text_config=self.text_model_tester.get_config().to_dict(),
|
||||||
|
vision_config=self.vision_model_tester.get_config().to_dict(),
|
||||||
|
projection_dim=64,
|
||||||
)
|
)
|
||||||
|
|
||||||
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
||||||
|
|||||||
@@ -381,8 +381,10 @@ class BlipModelTester:
|
|||||||
return config, input_ids, attention_mask, pixel_values
|
return config, input_ids, attention_mask, pixel_values
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return BlipConfig.from_text_vision_configs(
|
return BlipConfig(
|
||||||
self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
|
text_config=self.text_model_tester.get_config().to_dict(),
|
||||||
|
vision_config=self.vision_model_tester.get_config().to_dict(),
|
||||||
|
projection_dim=64,
|
||||||
)
|
)
|
||||||
|
|
||||||
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
||||||
@@ -664,8 +666,10 @@ class BlipTextRetrievalModelTester:
|
|||||||
return config, input_ids, attention_mask, pixel_values
|
return config, input_ids, attention_mask, pixel_values
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return BlipConfig.from_text_vision_configs(
|
return BlipConfig(
|
||||||
self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
|
text_config=self.text_model_tester.get_config().to_dict(),
|
||||||
|
vision_config=self.vision_model_tester.get_config().to_dict(),
|
||||||
|
projection_dim=64,
|
||||||
)
|
)
|
||||||
|
|
||||||
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
||||||
@@ -713,8 +717,10 @@ class BlipTextImageModelsModelTester:
|
|||||||
return config, input_ids, attention_mask, pixel_values
|
return config, input_ids, attention_mask, pixel_values
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return BlipConfig.from_text_vision_configs(
|
return BlipConfig(
|
||||||
self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
|
text_config=self.text_model_tester.get_config().to_dict(),
|
||||||
|
vision_config=self.vision_model_tester.get_config().to_dict(),
|
||||||
|
projection_dim=64,
|
||||||
)
|
)
|
||||||
|
|
||||||
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
||||||
@@ -761,8 +767,10 @@ class BlipVQAModelTester:
|
|||||||
return config, input_ids, attention_mask, pixel_values
|
return config, input_ids, attention_mask, pixel_values
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return BlipConfig.from_text_vision_configs(
|
return BlipConfig(
|
||||||
self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
|
text_config=self.text_model_tester.get_config().to_dict(),
|
||||||
|
vision_config=self.vision_model_tester.get_config().to_dict(),
|
||||||
|
projection_dim=64,
|
||||||
)
|
)
|
||||||
|
|
||||||
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
||||||
|
|||||||
@@ -203,9 +203,9 @@ class BridgeTowerModelTester:
|
|||||||
return (config, input_ids, attention_mask, pixel_values, pixel_mask)
|
return (config, input_ids, attention_mask, pixel_values, pixel_mask)
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return BridgeTowerConfig.from_text_vision_configs(
|
return BridgeTowerConfig(
|
||||||
text_config=self.text_model_tester.get_config(),
|
text_config=self.text_model_tester.get_config().to_dict(),
|
||||||
vision_config=self.vision_model_tester.get_config(),
|
vision_config=self.vision_model_tester.get_config().to_dict(),
|
||||||
share_cross_modal_transformer_layers=self.share_cross_modal_transformer_layers,
|
share_cross_modal_transformer_layers=self.share_cross_modal_transformer_layers,
|
||||||
share_link_tower_layers=self.share_link_tower_layers,
|
share_link_tower_layers=self.share_link_tower_layers,
|
||||||
link_tower_type=self.link_tower_type,
|
link_tower_type=self.link_tower_type,
|
||||||
|
|||||||
@@ -515,8 +515,10 @@ class ChineseCLIPModelTester:
|
|||||||
return config, input_ids, token_type_ids, attention_mask, pixel_values
|
return config, input_ids, token_type_ids, attention_mask, pixel_values
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return ChineseCLIPConfig.from_text_vision_configs(
|
return ChineseCLIPConfig(
|
||||||
self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
|
text_config=self.text_model_tester.get_config().to_dict(),
|
||||||
|
vision_config=self.vision_model_tester.get_config().to_dict(),
|
||||||
|
projection_dim=64,
|
||||||
)
|
)
|
||||||
|
|
||||||
def create_and_check_model(self, config, input_ids, token_type_ids, attention_mask, pixel_values):
|
def create_and_check_model(self, config, input_ids, token_type_ids, attention_mask, pixel_values):
|
||||||
|
|||||||
@@ -459,8 +459,10 @@ class ClapModelTester:
|
|||||||
return config, input_ids, attention_mask, input_features
|
return config, input_ids, attention_mask, input_features
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return ClapConfig.from_text_audio_configs(
|
return ClapConfig(
|
||||||
self.text_model_tester.get_config(), self.audio_model_tester.get_config(), projection_dim=64
|
text_config=self.text_model_tester.get_config().to_dict(),
|
||||||
|
audio_config=self.audio_model_tester.get_config().to_dict(),
|
||||||
|
projection_dim=64,
|
||||||
)
|
)
|
||||||
|
|
||||||
def create_and_check_model(self, config, input_ids, attention_mask, input_features):
|
def create_and_check_model(self, config, input_ids, attention_mask, input_features):
|
||||||
|
|||||||
@@ -502,8 +502,10 @@ class CLIPModelTester:
|
|||||||
return config, input_ids, attention_mask, pixel_values
|
return config, input_ids, attention_mask, pixel_values
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return CLIPConfig.from_text_vision_configs(
|
return CLIPConfig(
|
||||||
self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
|
text_config=self.text_model_tester.get_config().to_dict(),
|
||||||
|
vision_config=self.vision_model_tester.get_config().to_dict(),
|
||||||
|
projection_dim=64,
|
||||||
)
|
)
|
||||||
|
|
||||||
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
||||||
|
|||||||
@@ -374,9 +374,9 @@ class CLIPSegModelTester:
|
|||||||
return config, input_ids, attention_mask, pixel_values
|
return config, input_ids, attention_mask, pixel_values
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return CLIPSegConfig.from_text_vision_configs(
|
return CLIPSegConfig(
|
||||||
self.text_model_tester.get_config(),
|
text_config=self.text_model_tester.get_config().to_dict(),
|
||||||
self.vision_model_tester.get_config(),
|
vision_config=self.vision_model_tester.get_config().to_dict(),
|
||||||
projection_dim=64,
|
projection_dim=64,
|
||||||
reduce_dim=32,
|
reduce_dim=32,
|
||||||
extract_layers=self.extract_layers,
|
extract_layers=self.extract_layers,
|
||||||
|
|||||||
@@ -216,7 +216,7 @@ class EfficientLoFTRModelTest(ModelTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
self.assertListEqual(
|
self.assertListEqual(
|
||||||
list(hidden_states[0].shape[-2:]),
|
list(hidden_states[0].shape[-2:]),
|
||||||
[self.model_tester.image_height // 2, self.model_tester.image_width // 2],
|
[self.model_tester.image_height, self.model_tester.image_width],
|
||||||
)
|
)
|
||||||
|
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|||||||
@@ -497,8 +497,10 @@ class GroupViTModelTester:
|
|||||||
return config, input_ids, attention_mask, pixel_values
|
return config, input_ids, attention_mask, pixel_values
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return GroupViTConfig.from_text_vision_configs(
|
return GroupViTConfig(
|
||||||
self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
|
text_config=self.text_model_tester.get_config().to_dict(),
|
||||||
|
vision_config=self.vision_model_tester.get_config().to_dict(),
|
||||||
|
projection_dim=64,
|
||||||
)
|
)
|
||||||
|
|
||||||
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
||||||
|
|||||||
@@ -375,7 +375,11 @@ class Owlv2ModelTester:
|
|||||||
return config, input_ids, attention_mask, pixel_values
|
return config, input_ids, attention_mask, pixel_values
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return Owlv2Config.from_text_vision_configs(self.text_config, self.vision_config, projection_dim=64)
|
return Owlv2Config(
|
||||||
|
text_config=self.text_config,
|
||||||
|
vision_config=self.vision_config,
|
||||||
|
projection_dim=64,
|
||||||
|
)
|
||||||
|
|
||||||
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
||||||
model = Owlv2Model(config).to(torch_device).eval()
|
model = Owlv2Model(config).to(torch_device).eval()
|
||||||
@@ -589,7 +593,11 @@ class Owlv2ForObjectDetectionTester:
|
|||||||
return config, pixel_values, input_ids, attention_mask
|
return config, pixel_values, input_ids, attention_mask
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return Owlv2Config.from_text_vision_configs(self.text_config, self.vision_config, projection_dim=64)
|
return Owlv2Config(
|
||||||
|
text_config=self.text_config,
|
||||||
|
vision_config=self.vision_config,
|
||||||
|
projection_dim=64,
|
||||||
|
)
|
||||||
|
|
||||||
def create_and_check_model(self, config, pixel_values, input_ids, attention_mask):
|
def create_and_check_model(self, config, pixel_values, input_ids, attention_mask):
|
||||||
model = Owlv2ForObjectDetection(config).to(torch_device).eval()
|
model = Owlv2ForObjectDetection(config).to(torch_device).eval()
|
||||||
|
|||||||
@@ -371,7 +371,11 @@ class OwlViTModelTester:
|
|||||||
return config, input_ids, attention_mask, pixel_values
|
return config, input_ids, attention_mask, pixel_values
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return OwlViTConfig.from_text_vision_configs(self.text_config, self.vision_config, projection_dim=64)
|
return OwlViTConfig(
|
||||||
|
text_config=self.text_config,
|
||||||
|
vision_config=self.vision_config,
|
||||||
|
projection_dim=64,
|
||||||
|
)
|
||||||
|
|
||||||
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
||||||
model = OwlViTModel(config).to(torch_device).eval()
|
model = OwlViTModel(config).to(torch_device).eval()
|
||||||
@@ -583,7 +587,11 @@ class OwlViTForObjectDetectionTester:
|
|||||||
return config, pixel_values, input_ids, attention_mask
|
return config, pixel_values, input_ids, attention_mask
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return OwlViTConfig.from_text_vision_configs(self.text_config, self.vision_config, projection_dim=64)
|
return OwlViTConfig(
|
||||||
|
text_config=self.text_config,
|
||||||
|
vision_config=self.vision_config,
|
||||||
|
projection_dim=64,
|
||||||
|
)
|
||||||
|
|
||||||
def create_and_check_model(self, config, pixel_values, input_ids, attention_mask):
|
def create_and_check_model(self, config, pixel_values, input_ids, attention_mask):
|
||||||
model = OwlViTForObjectDetection(config).to(torch_device).eval()
|
model = OwlViTForObjectDetection(config).to(torch_device).eval()
|
||||||
|
|||||||
@@ -383,7 +383,11 @@ class Pix2StructModelTester:
|
|||||||
return config, input_ids, attention_mask, flattened_patches
|
return config, input_ids, attention_mask, flattened_patches
|
||||||
|
|
||||||
def get_config(self, text_config, vision_config):
|
def get_config(self, text_config, vision_config):
|
||||||
return Pix2StructConfig.from_text_vision_configs(text_config, vision_config, projection_dim=64)
|
return Pix2StructConfig(
|
||||||
|
text_config=self.text_model_tester.get_config().to_dict(),
|
||||||
|
vision_config=self.vision_model_tester.get_config().to_dict(),
|
||||||
|
projection_dim=64,
|
||||||
|
)
|
||||||
|
|
||||||
def prepare_config_and_inputs_for_common(self):
|
def prepare_config_and_inputs_for_common(self):
|
||||||
config_and_inputs = self.prepare_config_and_inputs()
|
config_and_inputs = self.prepare_config_and_inputs()
|
||||||
|
|||||||
@@ -428,9 +428,9 @@ class SiglipModelTester:
|
|||||||
return config, input_ids, attention_mask, pixel_values
|
return config, input_ids, attention_mask, pixel_values
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return SiglipConfig.from_text_vision_configs(
|
return SiglipConfig(
|
||||||
self.text_model_tester.get_config(),
|
text_config=self.text_model_tester.get_config().to_dict(),
|
||||||
self.vision_model_tester.get_config(),
|
vision_config=self.vision_model_tester.get_config().to_dict(),
|
||||||
)
|
)
|
||||||
|
|
||||||
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
||||||
|
|||||||
@@ -514,9 +514,9 @@ class Siglip2ModelTester:
|
|||||||
return config, input_ids, attention_mask, pixel_values, pixel_attention_mask, spatial_shapes
|
return config, input_ids, attention_mask, pixel_values, pixel_attention_mask, spatial_shapes
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return Siglip2Config.from_text_vision_configs(
|
return Siglip2Config(
|
||||||
self.text_model_tester.get_config(),
|
text_config=self.text_model_tester.get_config().to_dict(),
|
||||||
self.vision_model_tester.get_config(),
|
vision_config=self.vision_model_tester.get_config().to_dict(),
|
||||||
)
|
)
|
||||||
|
|
||||||
def create_and_check_model(
|
def create_and_check_model(
|
||||||
|
|||||||
@@ -493,9 +493,9 @@ class XCLIPModelTester:
|
|||||||
return config, input_ids, attention_mask, pixel_values
|
return config, input_ids, attention_mask, pixel_values
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return XCLIPConfig.from_text_vision_configs(
|
return XCLIPConfig(
|
||||||
self.text_model_tester.get_config(),
|
text_config=self.text_model_tester.get_config().to_dict(),
|
||||||
self.vision_model_tester.get_config(),
|
vision_config=self.vision_model_tester.get_config().to_dict(),
|
||||||
projection_dim=self.projection_dim,
|
projection_dim=self.projection_dim,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user