From df8640cedb5dd3266ebf8958731f9f1315507182 Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Thu, 31 Oct 2024 22:15:04 +0100 Subject: [PATCH] [CLIPSeg] Make interpolate_pos_encoding default to True (#34419) * Remove interpolate_pos_encoding * Make fixup * Make interpolate_pos_encoding default to True * Reuse existing interpolation * Add integration test --- .../models/clipseg/modeling_clipseg.py | 22 ++++++++----------- tests/models/clipseg/test_modeling_clipseg.py | 4 ++-- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 8ff7f1cd96..4ead68032b 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -205,7 +205,7 @@ class CLIPSegVisionEmbeddings(nn.Module): return torch.cat((class_pos_embed, patch_pos_embed), dim=1) - def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=True) -> torch.Tensor: batch_size, _, height, width = pixel_values.shape if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): raise ValueError( @@ -535,7 +535,7 @@ CLIPSEG_VISION_INPUTS_DOCSTRING = r""" output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): + interpolate_pos_encoding (`bool`, *optional*, defaults to `True`): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. @@ -574,7 +574,7 @@ CLIPSEG_INPUTS_DOCSTRING = r""" output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): + interpolate_pos_encoding (`bool`, *optional*, defaults to `True`): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. @@ -845,14 +845,13 @@ class CLIPSegVisionTransformer(nn.Module): @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegVisionConfig) - # Copied from transformers.models.clip.modeling_clip.CLIPVisionTransformer.forward def forward( self, - pixel_values: Optional[torch.FloatTensor] = None, + pixel_values: Optional[torch.FloatTensor], output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - interpolate_pos_encoding: Optional[bool] = False, + interpolate_pos_encoding: Optional[bool] = True, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: @@ -864,9 +863,6 @@ class CLIPSegVisionTransformer(nn.Module): ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) @@ -912,7 +908,7 @@ class CLIPSegVisionModel(CLIPSegPreTrainedModel): pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - interpolate_pos_encoding: Optional[bool] = False, + interpolate_pos_encoding: Optional[bool] = True, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" @@ -1035,7 +1031,7 @@ class CLIPSegModel(CLIPSegPreTrainedModel): pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - interpolate_pos_encoding: bool = False, + interpolate_pos_encoding: bool = True, return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" @@ -1091,7 +1087,7 @@ class CLIPSegModel(CLIPSegPreTrainedModel): return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - interpolate_pos_encoding: bool = False, + interpolate_pos_encoding: bool = True, return_dict: Optional[bool] = None, ) -> Union[Tuple, CLIPSegOutput]: r""" @@ -1397,7 +1393,7 @@ class CLIPSegForImageSegmentation(CLIPSegPreTrainedModel): labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - interpolate_pos_encoding: bool = False, + interpolate_pos_encoding: bool = True, return_dict: Optional[bool] = None, ) -> Union[Tuple, CLIPSegOutput]: r""" diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index c5edf7cb75..75ffa7ad23 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -796,7 +796,7 @@ class CLIPSegModelIntegrationTest(unittest.TestCase): # forward pass with torch.no_grad(): - outputs = model(**inputs, interpolate_pos_encoding=True) + outputs = model(**inputs) # verify the predicted masks self.assertEqual( @@ -804,7 +804,7 @@ class CLIPSegModelIntegrationTest(unittest.TestCase): torch.Size((3, 352, 352)), ) expected_masks_slice = torch.tensor( - [[-7.4613, -7.4785, -7.3627], [-7.3268, -7.0898, -7.1333], [-6.9838, -6.7900, -6.8913]] + [[-7.4613, -7.4785, -7.3628], [-7.3268, -7.0899, -7.1333], [-6.9838, -6.7900, -6.8913]] ).to(torch_device) self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_masks_slice, atol=1e-3))