diff --git a/docs/source/en/model_doc/clip.mdx b/docs/source/en/model_doc/clip.mdx index 080c80e709..5c56bf637d 100644 --- a/docs/source/en/model_doc/clip.mdx +++ b/docs/source/en/model_doc/clip.mdx @@ -125,6 +125,17 @@ This model was contributed by [valhalla](https://huggingface.co/valhalla). The o [[autodoc]] CLIPTextModel - forward +## CLIPTextModelWithProjection + +[[autodoc]] CLIPTextModelWithProjection + - forward + +## CLIPVisionModelWithProjection + +[[autodoc]] CLIPVisionModelWithProjection + - forward + + ## CLIPVisionModel [[autodoc]] CLIPVisionModel diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 9fa910fda8..34845dfbda 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1096,7 +1096,9 @@ else: "CLIPModel", "CLIPPreTrainedModel", "CLIPTextModel", + "CLIPTextModelWithProjection", "CLIPVisionModel", + "CLIPVisionModelWithProjection", ] ) _import_structure["models.clipseg"].extend( @@ -4086,7 +4088,9 @@ if TYPE_CHECKING: CLIPModel, CLIPPreTrainedModel, CLIPTextModel, + CLIPTextModelWithProjection, CLIPVisionModel, + CLIPVisionModelWithProjection, ) from .models.clipseg import ( CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/clip/__init__.py b/src/transformers/models/clip/__init__.py index e9bfd63844..4fb4ca8eca 100644 --- a/src/transformers/models/clip/__init__.py +++ b/src/transformers/models/clip/__init__.py @@ -68,7 +68,9 @@ else: "CLIPModel", "CLIPPreTrainedModel", "CLIPTextModel", + "CLIPTextModelWithProjection", "CLIPVisionModel", + "CLIPVisionModelWithProjection", ] try: @@ -140,7 +142,9 @@ if TYPE_CHECKING: CLIPModel, CLIPPreTrainedModel, CLIPTextModel, + CLIPTextModelWithProjection, CLIPVisionModel, + CLIPVisionModelWithProjection, ) try: diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index 3da2a76670..1aeda330e4 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -98,6 +98,7 @@ class CLIPTextConfig(PretrainedConfig): vocab_size=49408, hidden_size=512, intermediate_size=2048, + projection_dim=512, num_hidden_layers=12, num_attention_heads=8, max_position_embeddings=77, @@ -117,6 +118,7 @@ class CLIPTextConfig(PretrainedConfig): self.vocab_size = vocab_size self.hidden_size = hidden_size self.intermediate_size = intermediate_size + self.projection_dim = projection_dim self.dropout = dropout self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads @@ -204,6 +206,7 @@ class CLIPVisionConfig(PretrainedConfig): self, hidden_size=768, intermediate_size=3072, + projection_dim=512, num_hidden_layers=12, num_attention_heads=12, num_channels=3, @@ -221,6 +224,7 @@ class CLIPVisionConfig(PretrainedConfig): self.hidden_size = hidden_size self.intermediate_size = intermediate_size + self.projection_dim = projection_dim self.dropout = dropout self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py old mode 100755 new mode 100644 index de15ff8ca7..c2529c4592 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -72,6 +72,64 @@ def clip_loss(similarity: torch.Tensor) -> torch.Tensor: return (caption_loss + image_loss) / 2.0 +@dataclass +class CLIPVisionModelOutput(ModelOutput): + """ + Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. + + Args: + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class CLIPTextModelOutput(ModelOutput): + """ + Base class for text model's outputs that also contains a pooling of the last hidden states. + + Args: + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The text embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + text_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + @dataclass class CLIPOutput(ModelOutput): """ @@ -386,6 +444,16 @@ class CLIPPreTrainedModel(PreTrainedModel): module.visual_projection.weight, std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, ) + elif isinstance(module, CLIPVisionModelWithProjection): + nn.init.normal_( + module.visual_projection.weight, + std=self.config.hidden_size**-0.5 * self.config.initializer_factor, + ) + elif isinstance(module, CLIPTextModelWithProjection): + nn.init.normal_( + module.text_projection.weight, + std=self.config.hidden_size**-0.5 * self.config.initializer_factor, + ) if isinstance(module, nn.LayerNorm): module.bias.data.zero_() @@ -399,9 +467,13 @@ class CLIPPreTrainedModel(PreTrainedModel): CLIP_START_DOCSTRING = r""" - This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it - as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and - behavior. + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. Parameters: config ([`CLIPConfig`]): Model configuration class with all the parameters of the model. @@ -685,6 +757,10 @@ class CLIPTextTransformer(nn.Module): return mask +@add_start_docstrings( + """The text model from CLIP without any head or projection on top.""", + CLIP_START_DOCSTRING, +) class CLIPTextModel(CLIPPreTrainedModel): config_class = CLIPTextConfig @@ -730,6 +806,8 @@ class CLIPTextModel(CLIPPreTrainedModel): >>> last_hidden_state = outputs.last_hidden_state >>> pooled_output = outputs.pooler_output # pooled (EOS token) states ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return self.text_model( input_ids=input_ids, attention_mask=attention_mask, @@ -798,6 +876,10 @@ class CLIPVisionTransformer(nn.Module): ) +@add_start_docstrings( + """The vision model from CLIP without any head or projection on top.""", + CLIP_START_DOCSTRING, +) class CLIPVisionModel(CLIPPreTrainedModel): config_class = CLIPVisionConfig main_input_name = "pixel_values" @@ -842,6 +924,8 @@ class CLIPVisionModel(CLIPPreTrainedModel): >>> last_hidden_state = outputs.last_hidden_state >>> pooled_output = outputs.pooler_output # pooled CLS states ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, @@ -1074,3 +1158,162 @@ class CLIPModel(CLIPPreTrainedModel): text_model_output=text_outputs, vision_model_output=vision_outputs, ) + + +@add_start_docstrings( + """ + CLIP Text Model with a projection layer on top (a linear layer on top of the pooled output). + """, + CLIP_START_DOCSTRING, +) +class CLIPTextModelWithProjection(CLIPPreTrainedModel): + config_class = CLIPTextConfig + + _no_split_modules = ["CLIPEncoderLayer"] + + def __init__(self, config: CLIPTextConfig): + super().__init__(config) + + self.text_model = CLIPTextTransformer(config) + + self.text_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.text_model.embeddings.token_embedding + + def set_input_embeddings(self, value): + self.text_model.embeddings.token_embedding = value + + @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CLIPTextModelOutput, config_class=CLIPTextConfig) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLIPTextModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import CLIPTokenizer, CLIPTextModelWithProjection + + >>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32") + >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> text_embeds = outputs.text_embeds + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = text_outputs[1] + + text_embeds = self.text_projection(pooled_output) + + if not return_dict: + outputs = (text_embeds, text_outputs[0]) + text_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return CLIPTextModelOutput( + text_embeds=text_embeds, + last_hidden_state=text_outputs.last_hidden_state, + hidden_states=text_outputs.hidden_states, + attentions=text_outputs.attentions, + ) + + +@add_start_docstrings( + """ + CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output). + """, + CLIP_START_DOCSTRING, +) +class CLIPVisionModelWithProjection(CLIPPreTrainedModel): + config_class = CLIPVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: CLIPVisionConfig): + super().__init__(config) + + self.vision_model = CLIPVisionTransformer(config) + + self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CLIPVisionModelOutput, config_class=CLIPVisionConfig) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLIPVisionModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import CLIPProcessor, CLIPVisionModelWithProjection + + >>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> image_embeds = outputs.image_embeds + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = vision_outputs[1] # pooled_output + + image_embeds = self.visual_projection(pooled_output) + + if not return_dict: + outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return CLIPVisionModelOutput( + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 87caf24ed4..da5612fe2c 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -420,7 +420,6 @@ class CLIPSegEncoderLayer(nn.Module): return outputs -# Copied from transformers.models.clip.modeling_clip.CLIPPreTrainedModel with CLIP->CLIPSeg class CLIPSegPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index c0991a8b01..cc93aa76cb 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1207,6 +1207,13 @@ class CLIPTextModel(metaclass=DummyObject): requires_backends(self, ["torch"]) +class CLIPTextModelWithProjection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class CLIPVisionModel(metaclass=DummyObject): _backends = ["torch"] @@ -1214,6 +1221,13 @@ class CLIPVisionModel(metaclass=DummyObject): requires_backends(self, ["torch"]) +class CLIPVisionModelWithProjection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index 37893f18fd..61d0a7b315 100644 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -151,7 +151,9 @@ for item in _REGULAR_SUPPORTED_MODEL_NAMES_AND_TASKS: _SPECIAL_SUPPORTED_MODELS = [ "CLIPTextModel", + "CLIPTextModelWithProjection", "CLIPVisionModel", + "CLIPVisionModelWithProjection", "GPT2DoubleHeadsModel", "Speech2Text2Decoder", "TrOCRDecoder", diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index 524f002ad3..8fa3ce21bb 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -49,7 +49,13 @@ if is_torch_available(): import torch from torch import nn - from transformers import CLIPModel, CLIPTextModel, CLIPVisionModel + from transformers import ( + CLIPModel, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPVisionModel, + CLIPVisionModelWithProjection, + ) from transformers.models.clip.modeling_clip import CLIP_PRETRAINED_MODEL_ARCHIVE_LIST @@ -77,6 +83,7 @@ class CLIPVisionModelTester: num_channels=3, is_training=True, hidden_size=32, + projection_dim=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37, @@ -92,6 +99,7 @@ class CLIPVisionModelTester: self.num_channels = num_channels self.is_training = is_training self.hidden_size = hidden_size + self.projection_dim = projection_dim self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.intermediate_size = intermediate_size @@ -116,6 +124,7 @@ class CLIPVisionModelTester: patch_size=self.patch_size, num_channels=self.num_channels, hidden_size=self.hidden_size, + projection_dim=self.projection_dim, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, intermediate_size=self.intermediate_size, @@ -137,6 +146,19 @@ class CLIPVisionModelTester: self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + def create_and_check_model_with_projection(self, config, pixel_values): + model = CLIPVisionModelWithProjection(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + self.parent.assertEqual(result.image_embeds.shape, (self.batch_size, self.projection_dim)) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() config, pixel_values = config_and_inputs @@ -151,7 +173,7 @@ class CLIPVisionModelTest(ModelTesterMixin, unittest.TestCase): attention_mask and seq_length. """ - all_model_classes = (CLIPVisionModel,) if is_torch_available() else () + all_model_classes = (CLIPVisionModel, CLIPVisionModelWithProjection) if is_torch_available() else () fx_compatible = True test_pruning = False test_resize_embeddings = False @@ -193,6 +215,10 @@ class CLIPVisionModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_model_with_projection(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_projection(*config_and_inputs) + def test_training(self): pass @@ -213,6 +239,13 @@ class CLIPVisionModelTest(ModelTesterMixin, unittest.TestCase): model = CLIPVisionModel.from_pretrained(model_name) self.assertIsNotNone(model) + @slow + def test_model_with_projection_from_pretrained(self): + for model_name in CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = CLIPVisionModelWithProjection.from_pretrained(model_name) + self.assertIsNotNone(model) + self.assertTrue(hasattr(model, "visual_projection")) + class CLIPTextModelTester: def __init__( @@ -225,6 +258,7 @@ class CLIPTextModelTester: use_labels=True, vocab_size=99, hidden_size=32, + projection_dim=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37, @@ -242,6 +276,7 @@ class CLIPTextModelTester: self.use_labels = use_labels self.vocab_size = vocab_size self.hidden_size = hidden_size + self.projection_dim = projection_dim self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.intermediate_size = intermediate_size @@ -273,6 +308,7 @@ class CLIPTextModelTester: return CLIPTextConfig( vocab_size=self.vocab_size, hidden_size=self.hidden_size, + projection_dim=self.projection_dim, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, intermediate_size=self.intermediate_size, @@ -292,6 +328,16 @@ class CLIPTextModelTester: self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + def create_and_check_model_with_projection(self, config, input_ids, input_mask): + model = CLIPTextModelWithProjection(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.text_embeds.shape, (self.batch_size, self.projection_dim)) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() config, input_ids, input_mask = config_and_inputs @@ -302,7 +348,7 @@ class CLIPTextModelTester: @require_torch class CLIPTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (CLIPTextModel,) if is_torch_available() else () + all_model_classes = (CLIPTextModel, CLIPTextModelWithProjection) if is_torch_available() else () fx_compatible = True test_pruning = False test_head_masking = False @@ -318,6 +364,10 @@ class CLIPTextModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_model_with_projection(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_projection(*config_and_inputs) + def test_training(self): pass @@ -342,6 +392,13 @@ class CLIPTextModelTest(ModelTesterMixin, unittest.TestCase): model = CLIPTextModel.from_pretrained(model_name) self.assertIsNotNone(model) + @slow + def test_model_with_projection_from_pretrained(self): + for model_name in CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = CLIPTextModelWithProjection.from_pretrained(model_name) + self.assertIsNotNone(model) + self.assertTrue(hasattr(model, "text_projection")) + class CLIPModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): diff --git a/utils/check_repo.py b/utils/check_repo.py index 2271087fe1..951da58eee 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -177,7 +177,9 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ "PLBartDecoderWrapper", "BeitForMaskedImageModeling", "CLIPTextModel", + "CLIPTextModelWithProjection", "CLIPVisionModel", + "CLIPVisionModelWithProjection", "GroupViTTextModel", "GroupViTVisionModel", "TFCLIPTextModel",