From f42d46ccb406695fb57c0c669526f67fc30d1d84 Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Tue, 11 Feb 2025 11:37:31 +0000 Subject: [PATCH] Add common test for `torch.export` and fix some vision models (#35124) * Add is_torch_greater_or_equal test decorator * Add common test for torch.export * Fix bit * Fix focalnet * Fix imagegpt * Fix seggpt * Fix swin2sr * Enable torch.export test for vision models * Enable test for video models * Remove json * Enable for hiera * Enable for ijepa * Fix detr * Fix conditional_detr * Fix maskformer * Enable test maskformer * Fix test for deformable detr * Fix custom kernels for export in rt-detr and deformable-detr * Enable test for all DPT * Remove custom test for deformable detr * Simplify test to use only kwargs for export * Add comment * Move compile_compatible_method_lru_cache to utils * Fix beit export * Fix deformable detr * Fix copies data2vec<->beit * Fix typos, update test to work with dict * Add seed to the test * Enable test for vit_mae * Fix beit tests * [run-slow] beit, bit, conditional_detr, data2vec, deformable_detr, detr, focalnet, imagegpt, maskformer, rt_detr, seggpt, swin2sr * Add vitpose test * Add textnet test * Add dinov2 with registers * Update tests/test_modeling_common.py * Switch to torch.testing.assert_close * Fix maskformer * Remove save-load from test * Add dab_detr * Add depth_pro * Fix and test RT-DETRv2 * Fix dab_detr --- src/transformers/models/beit/modeling_beit.py | 56 ++++++++------- src/transformers/models/bit/modeling_bit.py | 2 +- .../modeling_conditional_detr.py | 8 ++- .../models/dab_detr/modeling_dab_detr.py | 2 +- .../data2vec/modeling_data2vec_vision.py | 54 +++++++-------- .../modeling_deformable_detr.py | 7 +- src/transformers/models/detr/modeling_detr.py | 2 +- .../models/focalnet/modeling_focalnet.py | 10 +-- .../models/imagegpt/modeling_imagegpt.py | 10 ++- .../models/maskformer/modeling_maskformer.py | 6 +- .../models/rt_detr/modeling_rt_detr.py | 29 ++------ 
.../models/rt_detr_v2/modeling_rt_detr_v2.py | 35 +++------- .../models/rt_detr_v2/modular_rt_detr_v2.py | 10 +-- .../models/seggpt/modeling_seggpt.py | 14 ++-- .../models/swin2sr/modeling_swin2sr.py | 11 +-- src/transformers/pytorch_utils.py | 29 +++++++- src/transformers/testing_utils.py | 16 +++++ tests/models/beit/test_modeling_beit.py | 12 ++-- tests/models/bit/test_modeling_bit.py | 1 + .../test_modeling_conditional_detr.py | 1 + .../models/convnext/test_modeling_convnext.py | 1 + .../convnextv2/test_modeling_convnextv2.py | 1 + tests/models/cvt/test_modeling_cvt.py | 1 + .../models/dab_detr/test_modeling_dab_detr.py | 1 + .../test_modeling_deformable_detr.py | 1 + tests/models/deit/test_modeling_deit.py | 1 + .../test_modeling_depth_anything.py | 1 + .../depth_pro/test_modeling_depth_pro.py | 1 + tests/models/detr/test_modeling_detr.py | 1 + tests/models/dinat/test_modeling_dinat.py | 1 + tests/models/dinov2/test_modeling_dinov2.py | 2 + .../test_modeling_dinov2_with_registers.py | 1 + tests/models/dpt/test_modeling_dpt.py | 1 + .../dpt/test_modeling_dpt_auto_backbone.py | 1 + tests/models/dpt/test_modeling_dpt_hybrid.py | 1 + .../test_modeling_efficientnet.py | 1 + .../models/focalnet/test_modeling_focalnet.py | 1 + tests/models/glpn/test_modeling_glpn.py | 1 + tests/models/hiera/test_modeling_hiera.py | 1 + tests/models/ijepa/test_modeling_ijepa.py | 1 + .../models/imagegpt/test_modeling_imagegpt.py | 1 + .../mask2former/test_modeling_mask2former.py | 1 + .../maskformer/test_modeling_maskformer.py | 1 + .../test_modeling_maskformer_swin.py | 1 + .../test_modeling_mobilenet_v1.py | 1 + .../test_modeling_mobilenet_v2.py | 1 + .../mobilevit/test_modeling_mobilevit.py | 1 + .../mobilevitv2/test_modeling_mobilevitv2.py | 1 + .../poolformer/test_modeling_poolformer.py | 1 + tests/models/pvt/test_modeling_pvt.py | 1 + tests/models/pvt_v2/test_modeling_pvt_v2.py | 1 + tests/models/regnet/test_modeling_regnet.py | 1 + tests/models/resnet/test_modeling_resnet.py | 1 
+ tests/models/rt_detr/test_modeling_rt_detr.py | 1 + .../rt_detr_v2/test_modeling_rt_detr_v2.py | 1 + .../segformer/test_modeling_segformer.py | 1 + tests/models/seggpt/test_modeling_seggpt.py | 2 + .../swiftformer/test_modeling_swiftformer.py | 1 + tests/models/swin/test_modeling_swin.py | 1 + tests/models/swin2sr/test_modeling_swin2sr.py | 1 + tests/models/swinv2/test_modeling_swinv2.py | 1 + .../test_modeling_table_transformer.py | 1 + tests/models/textnet/test_modeling_textnet.py | 1 + .../timesformer/test_modeling_timesformer.py | 1 + tests/models/upernet/test_modeling_upernet.py | 1 + .../models/videomae/test_modeling_videomae.py | 1 + tests/models/vit/test_modeling_vit.py | 1 + tests/models/vit_mae/test_modeling_vit_mae.py | 1 + tests/models/vit_msn/test_modeling_vit_msn.py | 1 + tests/models/vitdet/test_modeling_vitdet.py | 1 + .../models/vitmatte/test_modeling_vitmatte.py | 1 + tests/models/vitpose/test_modeling_vitpose.py | 1 + .../test_modeling_vitpose_backbone.py | 16 ++++- tests/models/vivit/test_modeling_vivit.py | 1 + tests/models/yolos/test_modeling_yolos.py | 1 + .../models/zoedepth/test_modeling_zoedepth.py | 1 + tests/test_modeling_common.py | 68 +++++++++++++++++++ 77 files changed, 305 insertions(+), 151 deletions(-) diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py index 601e2801d6..40b01d34a8 100755 --- a/src/transformers/models/beit/modeling_beit.py +++ b/src/transformers/models/beit/modeling_beit.py @@ -34,7 +34,7 @@ from ...modeling_outputs import ( SemanticSegmenterOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...pytorch_utils import compile_compatible_method_lru_cache, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -297,10 +297,9 @@ class BeitSelfAttention(nn.Module): self.dropout = 
nn.Dropout(config.attention_probs_dropout_prob) - if window_size: + self.has_relative_position_bias = bool(window_size) + if self.has_relative_position_bias: self.relative_position_bias = BeitRelativePositionBias(config, window_size=window_size) - else: - self.relative_position_bias = None def transpose_for_scores(self, x): new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) @@ -312,7 +311,7 @@ class BeitSelfAttention(nn.Module): hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, - relative_position_bias: Optional["BeitRelativePositionBias"] = None, + relative_position_bias: Optional[torch.Tensor] = None, interpolate_pos_encoding: bool = False, resolution: Optional[Tuple[int]] = None, ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: @@ -328,7 +327,7 @@ class BeitSelfAttention(nn.Module): attention_scores = attention_scores / math.sqrt(self.attention_head_size) # Add relative position bias if present. 
- if self.relative_position_bias is not None: + if self.has_relative_position_bias: height, width = resolution window_size = (height // self.config.patch_size, width // self.config.patch_size) attention_scores = attention_scores + self.relative_position_bias( @@ -367,7 +366,7 @@ class BeitSdpaSelfAttention(BeitSelfAttention): hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, - relative_position_bias: Optional["BeitRelativePositionBias"] = None, + relative_position_bias: Optional[torch.Tensor] = None, interpolate_pos_encoding: bool = False, resolution: Optional[Tuple[int]] = None, ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: @@ -393,7 +392,7 @@ class BeitSdpaSelfAttention(BeitSelfAttention): query_layer = self.transpose_for_scores(mixed_query_layer) attn_bias = None - if self.relative_position_bias is not None: + if self.has_relative_position_bias: height, width = resolution window_size = (height // self.config.patch_size, width // self.config.patch_size) attn_bias = self.relative_position_bias( @@ -477,7 +476,7 @@ class BeitAttention(nn.Module): hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, - relative_position_bias: Optional["BeitRelativePositionBias"] = None, + relative_position_bias: Optional[torch.Tensor] = None, interpolate_pos_encoding: bool = False, resolution: Optional[Tuple[int]] = None, ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: @@ -546,7 +545,7 @@ class BeitLayer(nn.Module): hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, - relative_position_bias: Optional["BeitRelativePositionBias"] = None, + relative_position_bias: Optional[torch.Tensor] = None, interpolate_pos_encoding: bool = False, resolution: Optional[Tuple[int]] = None, ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: @@ -595,8 +594,7 @@ class 
BeitRelativePositionBias(nn.Module): ) # 2*Wh-1 * 2*Ww-1, nH # cls to token & token 2 cls & cls to cls - self.relative_position_indices = {} - + @compile_compatible_method_lru_cache(maxsize=10) def generate_relative_position_index(self, window_size: Tuple[int, int]) -> torch.Tensor: """ This method creates the relative position index, modified to support arbitrary window sizes, @@ -648,11 +646,9 @@ class BeitRelativePositionBias(nn.Module): [new_sub_table, old_relative_position_bias_table[old_num_relative_distance - 3 :]] ) - key = window_size - if key not in self.relative_position_indices.keys(): - self.relative_position_indices[key] = self.generate_relative_position_index(window_size) + relative_position_index = self.generate_relative_position_index(window_size) + relative_position_bias = new_relative_position_bias_table[relative_position_index.view(-1)] - relative_position_bias = new_relative_position_bias_table[self.relative_position_indices[key].view(-1)] # patch_size*num_patches_height, patch_size*num_patches_width, num_attention_heads relative_position_bias = relative_position_bias.view( window_size[0] * window_size[1] + 1, window_size[0] * window_size[1] + 1, -1 @@ -675,10 +671,9 @@ class BeitEncoder(nn.Module): def __init__(self, config: BeitConfig, window_size: Optional[tuple] = None) -> None: super().__init__() self.config = config - if config.use_shared_relative_position_bias: + self.has_relative_position_bias = config.use_shared_relative_position_bias + if self.has_relative_position_bias: self.relative_position_bias = BeitRelativePositionBias(config, window_size=window_size) - else: - self.relative_position_bias = None # stochastic depth decay rule dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)] @@ -701,7 +696,7 @@ class BeitEncoder(nn.Module): output_attentions: bool = False, output_hidden_states: bool = False, interpolate_pos_encoding: bool = False, - resolution: Optional[Tuple[int]] = None, + resolution: 
Optional[Tuple[int, int]] = None, return_dict: bool = True, ) -> Union[tuple, BaseModelOutput]: all_hidden_states = () if output_hidden_states else None @@ -711,6 +706,15 @@ class BeitEncoder(nn.Module): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) + if self.has_relative_position_bias: + height, width = resolution + window_size = (height // self.config.patch_size, width // self.config.patch_size) + relative_position_bias = self.relative_position_bias( + window_size, interpolate_pos_encoding=interpolate_pos_encoding, dim_size=hidden_states.shape[1] + ) + else: + relative_position_bias = None + layer_head_mask = head_mask[i] if head_mask is not None else None if self.gradient_checkpointing and self.training: @@ -719,17 +723,11 @@ class BeitEncoder(nn.Module): hidden_states, layer_head_mask, output_attentions, + relative_position_bias, + interpolate_pos_encoding, + resolution, ) else: - height, width = resolution - window_size = (height // self.config.patch_size, width // self.config.patch_size) - relative_position_bias = ( - self.relative_position_bias( - window_size, interpolate_pos_encoding=interpolate_pos_encoding, dim_size=hidden_states.shape[1] - ) - if self.relative_position_bias is not None - else None - ) layer_outputs = layer_module( hidden_states, layer_head_mask, diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 3d834671be..d71e3bf946 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -192,7 +192,7 @@ class DynamicPad2d(nn.Module): self.compute_padding = compute_padding - def __call__(self, input): + def forward(self, input): # Get width and height input_height, input_width = input.size()[-2:] diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index d020b94cff..5a839f9513 100644 --- 
a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1735,7 +1735,11 @@ class ConditionalDetrForObjectDetection(ConditionalDetrPreTrainedModel): # class logits + predicted bounding boxes logits = self.class_labels_classifier(sequence_output) - reference = outputs.reference_points if return_dict else outputs[-1] + # Index [-2] is valid only if `output_attentions` and `output_hidden_states` + # are not specified, otherwise it will be another index which is hard to determine. + # Leave it as is, because it's not a common case to use + # return_dict=False + output_attentions=True / output_hidden_states=True + reference = outputs.reference_points if return_dict else outputs[-2] reference_before_sigmoid = inverse_sigmoid(reference).transpose(0, 1) hs = sequence_output @@ -2105,7 +2109,7 @@ class ConditionalDetrMHAttentionMap(nn.Module): weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head) if mask is not None: - weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), torch.finfo(weights.dtype).min) + weights = weights.masked_fill(mask.unsqueeze(1).unsqueeze(1), torch.finfo(weights.dtype).min) weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size()) weights = self.dropout(weights) return weights diff --git a/src/transformers/models/dab_detr/modeling_dab_detr.py b/src/transformers/models/dab_detr/modeling_dab_detr.py index 09c83147b9..3e3294db07 100644 --- a/src/transformers/models/dab_detr/modeling_dab_detr.py +++ b/src/transformers/models/dab_detr/modeling_dab_detr.py @@ -1537,7 +1537,7 @@ class DabDetrMHAttentionMap(nn.Module): weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head) if mask is not None: - weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), torch.finfo(weights.dtype).min) + weights = weights.masked_fill(mask.unsqueeze(1).unsqueeze(1), 
torch.finfo(weights.dtype).min) weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size()) weights = self.dropout(weights) return weights diff --git a/src/transformers/models/data2vec/modeling_data2vec_vision.py b/src/transformers/models/data2vec/modeling_data2vec_vision.py index d0a3b4dd59..8e4d9c0bb2 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_data2vec_vision.py @@ -32,7 +32,7 @@ from ...modeling_outputs import ( SemanticSegmenterOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...pytorch_utils import compile_compatible_method_lru_cache, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -298,10 +298,9 @@ class Data2VecVisionSelfAttention(nn.Module): self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - if window_size: + self.has_relative_position_bias = bool(window_size) + if self.has_relative_position_bias: self.relative_position_bias = Data2VecVisionRelativePositionBias(config, window_size=window_size) - else: - self.relative_position_bias = None def transpose_for_scores(self, x): new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) @@ -313,7 +312,7 @@ class Data2VecVisionSelfAttention(nn.Module): hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, - relative_position_bias: Optional["Data2VecVisionRelativePositionBias"] = None, + relative_position_bias: Optional[torch.Tensor] = None, interpolate_pos_encoding: bool = False, resolution: Optional[Tuple[int]] = None, ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: @@ -329,7 +328,7 @@ class Data2VecVisionSelfAttention(nn.Module): attention_scores = attention_scores / math.sqrt(self.attention_head_size) # Add relative position bias if 
present. - if self.relative_position_bias is not None: + if self.has_relative_position_bias: height, width = resolution window_size = (height // self.config.patch_size, width // self.config.patch_size) attention_scores = attention_scores + self.relative_position_bias( @@ -369,7 +368,7 @@ class Data2VecVisionSdpaSelfAttention(Data2VecVisionSelfAttention): hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, - relative_position_bias: Optional["Data2VecVisionRelativePositionBias"] = None, + relative_position_bias: Optional[torch.Tensor] = None, interpolate_pos_encoding: bool = False, resolution: Optional[Tuple[int]] = None, ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: @@ -395,7 +394,7 @@ class Data2VecVisionSdpaSelfAttention(Data2VecVisionSelfAttention): query_layer = self.transpose_for_scores(mixed_query_layer) attn_bias = None - if self.relative_position_bias is not None: + if self.has_relative_position_bias: height, width = resolution window_size = (height // self.config.patch_size, width // self.config.patch_size) attn_bias = self.relative_position_bias( @@ -557,7 +556,7 @@ class Data2VecVisionLayer(nn.Module): hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, - relative_position_bias: Optional["Data2VecVisionRelativePositionBias"] = None, + relative_position_bias: Optional[torch.Tensor] = None, interpolate_pos_encoding: bool = False, resolution: Optional[Tuple[int]] = None, ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: @@ -607,8 +606,7 @@ class Data2VecVisionRelativePositionBias(nn.Module): ) # 2*Wh-1 * 2*Ww-1, nH # cls to token & token 2 cls & cls to cls - self.relative_position_indices = {} - + @compile_compatible_method_lru_cache(maxsize=10) def generate_relative_position_index(self, window_size: Tuple[int, int]) -> torch.Tensor: """ This method creates the relative position index, modified to support arbitrary 
window sizes, @@ -660,11 +658,9 @@ class Data2VecVisionRelativePositionBias(nn.Module): [new_sub_table, old_relative_position_bias_table[old_num_relative_distance - 3 :]] ) - key = window_size - if key not in self.relative_position_indices.keys(): - self.relative_position_indices[key] = self.generate_relative_position_index(window_size) + relative_position_index = self.generate_relative_position_index(window_size) + relative_position_bias = new_relative_position_bias_table[relative_position_index.view(-1)] - relative_position_bias = new_relative_position_bias_table[self.relative_position_indices[key].view(-1)] # patch_size*num_patches_height, patch_size*num_patches_width, num_attention_heads relative_position_bias = relative_position_bias.view( window_size[0] * window_size[1] + 1, window_size[0] * window_size[1] + 1, -1 @@ -688,10 +684,9 @@ class Data2VecVisionEncoder(nn.Module): def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None) -> None: super().__init__() self.config = config - if config.use_shared_relative_position_bias: + self.has_relative_position_bias = config.use_shared_relative_position_bias + if self.has_relative_position_bias: self.relative_position_bias = Data2VecVisionRelativePositionBias(config, window_size=window_size) - else: - self.relative_position_bias = None # stochastic depth decay rule dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)] @@ -714,7 +709,7 @@ class Data2VecVisionEncoder(nn.Module): output_attentions: bool = False, output_hidden_states: bool = False, interpolate_pos_encoding: bool = False, - resolution: Optional[Tuple[int]] = None, + resolution: Optional[Tuple[int, int]] = None, return_dict: bool = True, ) -> Union[tuple, BaseModelOutput]: all_hidden_states = () if output_hidden_states else None @@ -724,6 +719,15 @@ class Data2VecVisionEncoder(nn.Module): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) + if 
self.has_relative_position_bias: + height, width = resolution + window_size = (height // self.config.patch_size, width // self.config.patch_size) + relative_position_bias = self.relative_position_bias( + window_size, interpolate_pos_encoding=interpolate_pos_encoding, dim_size=hidden_states.shape[1] + ) + else: + relative_position_bias = None + layer_head_mask = head_mask[i] if head_mask is not None else None if self.gradient_checkpointing and self.training: @@ -732,17 +736,11 @@ class Data2VecVisionEncoder(nn.Module): hidden_states, layer_head_mask, output_attentions, + relative_position_bias, + interpolate_pos_encoding, + resolution, ) else: - height, width = resolution - window_size = (height // self.config.patch_size, width // self.config.patch_size) - relative_position_bias = ( - self.relative_position_bias( - window_size, interpolate_pos_encoding=interpolate_pos_encoding, dim_size=hidden_states.shape[1] - ) - if self.relative_position_bias is not None - else None - ) layer_outputs = layer_module( hidden_states, layer_head_mask, diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index 3fd6914a6c..39d189418f 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -40,6 +40,7 @@ from ...utils import ( is_ninja_available, is_timm_available, is_torch_cuda_available, + is_torchdynamo_compiling, logging, replace_return_docstrings, requires_backends, @@ -705,7 +706,7 @@ class DeformableDetrMultiscaleDeformableAttention(nn.Module): else: raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") - if self.disable_custom_kernels or MultiScaleDeformableAttention is None: + if self.disable_custom_kernels or MultiScaleDeformableAttention is None or is_torchdynamo_compiling(): # PyTorch implementation output = 
multi_scale_deformable_attention( value, spatial_shapes_list, sampling_locations, attention_weights @@ -1606,7 +1607,7 @@ class DeformableDetrModel(DeformableDetrPreTrainedModel): Args: enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder. padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`. - spatial_shapes (Tensor[num_feature_levels, 2]): Spatial shapes of the feature maps. + spatial_shapes (List[Tuple[int, int]]): Spatial shapes of the feature maps. Returns: `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. @@ -1786,7 +1787,7 @@ class DeformableDetrModel(DeformableDetrPreTrainedModel): enc_outputs_coord_logits = None if self.config.two_stage: object_query_embedding, output_proposals = self.gen_encoder_output_proposals( - encoder_outputs[0], ~mask_flatten, spatial_shapes + encoder_outputs[0], ~mask_flatten, spatial_shapes_list ) # hack implementation for two-stage Deformable DETR diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index ac9a78671f..0b006c44ad 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -1801,7 +1801,7 @@ class DetrMHAttentionMap(nn.Module): weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head) if mask is not None: - weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), torch.finfo(weights.dtype).min) + weights = weights.masked_fill(mask.unsqueeze(1).unsqueeze(1), torch.finfo(weights.dtype).min) weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size()) weights = self.dropout(weights) return weights diff --git a/src/transformers/models/focalnet/modeling_focalnet.py b/src/transformers/models/focalnet/modeling_focalnet.py index 43de96087c..687654a22d 100644 --- a/src/transformers/models/focalnet/modeling_focalnet.py +++ b/src/transformers/models/focalnet/modeling_focalnet.py @@ 
-358,23 +358,23 @@ class FocalNetModulation(nn.Module): # pre linear projection x = self.projection_in(hidden_state).permute(0, 3, 1, 2).contiguous() - q, ctx, self.gates = torch.split(x, (num_channels, num_channels, self.focal_level + 1), 1) + q, ctx, gates = torch.split(x, (num_channels, num_channels, self.focal_level + 1), 1) # context aggreation ctx_all = 0 for level in range(self.focal_level): ctx = self.focal_layers[level](ctx) - ctx_all = ctx_all + ctx * self.gates[:, level : level + 1] + ctx_all = ctx_all + ctx * gates[:, level : level + 1] ctx_global = self.activation(ctx.mean(2, keepdim=True).mean(3, keepdim=True)) - ctx_all = ctx_all + ctx_global * self.gates[:, self.focal_level :] + ctx_all = ctx_all + ctx_global * gates[:, self.focal_level :] # normalize context if self.normalize_modulator: ctx_all = ctx_all / (self.focal_level + 1) # focal modulation - self.modulator = self.projection_context(ctx_all) - x_out = q * self.modulator + modulator = self.projection_context(ctx_all) + x_out = q * modulator x_out = x_out.permute(0, 2, 3, 1).contiguous() if self.use_post_layernorm_in_modulation: x_out = self.layernorm(x_out) diff --git a/src/transformers/models/imagegpt/modeling_imagegpt.py b/src/transformers/models/imagegpt/modeling_imagegpt.py index 16b41ef60f..dab0b0188a 100755 --- a/src/transformers/models/imagegpt/modeling_imagegpt.py +++ b/src/transformers/models/imagegpt/modeling_imagegpt.py @@ -164,13 +164,11 @@ class ImageGPTLayerNorm(nn.Module): self.eps = eps self.weight = nn.Parameter(torch.Tensor(hidden_size)) - def forward(self, tensor: torch.Tensor) -> tuple: + def forward(self, tensor: torch.Tensor) -> torch.Tensor: # input is not mean centered - return ( - tensor - / torch.sqrt(torch.mean(torch.square(tensor), axis=-1, keepdim=True) + self.eps) - * self.weight.data[..., :] - ) + tensor = tensor / torch.sqrt(torch.mean(torch.square(tensor), axis=-1, keepdim=True) + self.eps) + tensor = tensor * self.weight + return tensor class 
ImageGPTAttention(nn.Module): diff --git a/src/transformers/models/maskformer/modeling_maskformer.py b/src/transformers/models/maskformer/modeling_maskformer.py index 9f067d3a15..b29672d7de 100644 --- a/src/transformers/models/maskformer/modeling_maskformer.py +++ b/src/transformers/models/maskformer/modeling_maskformer.py @@ -1424,7 +1424,11 @@ class MaskFormerTransformerModule(nn.Module): # repeat the queries "q c -> b q c" batch_size = image_features.shape[0] queries_embeddings = self.queries_embedder.weight.unsqueeze(0).repeat(batch_size, 1, 1) - inputs_embeds = torch.zeros_like(queries_embeddings, requires_grad=True) + inputs_embeds = torch.zeros_like(queries_embeddings, requires_grad=self.training) + + # torch.export.export does not support requires_grad + if self.training: + inputs_embeds.requires_grad_(True) batch_size, num_channels, height, width = image_features.shape # rearrange both image_features and object_queries "b c h w -> b (h w) c" diff --git a/src/transformers/models/rt_detr/modeling_rt_detr.py b/src/transformers/models/rt_detr/modeling_rt_detr.py index 5973c08039..3ff8250349 100644 --- a/src/transformers/models/rt_detr/modeling_rt_detr.py +++ b/src/transformers/models/rt_detr/modeling_rt_detr.py @@ -18,7 +18,7 @@ import math import os import warnings from dataclasses import dataclass -from functools import lru_cache, partial, wraps +from functools import partial from pathlib import Path from typing import Dict, List, Optional, Tuple, Union @@ -32,12 +32,14 @@ from ...activations import ACT2CLS, ACT2FN from ...image_transforms import center_to_corners_format, corners_to_center_format from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import compile_compatible_method_lru_cache from ...utils import ( ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, is_ninja_available, is_torch_cuda_available, + is_torchdynamo_compiling, logging, replace_return_docstrings, ) 
@@ -870,7 +872,7 @@ class RTDetrMultiscaleDeformableAttention(nn.Module): else: raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") - if self.disable_custom_kernels or MultiScaleDeformableAttention is None: + if self.disable_custom_kernels or MultiScaleDeformableAttention is None or is_torchdynamo_compiling(): # PyTorch implementation output = multi_scale_deformable_attention( value, spatial_shapes_list, sampling_locations, attention_weights @@ -1590,27 +1592,6 @@ class RTDetrDecoder(RTDetrPreTrainedModel): ) -def compile_compatible_lru_cache(*lru_args, **lru_kwargs): - def decorator(func): - @wraps(func) - def wrapper(self, *args, **kwargs): - if not torch.compiler.is_compiling(): - # Cache the function only if the model is not being compiled - # check if the function is already cached, otherwise create it - if not hasattr(self, f"_cached_{func.__name__}"): - self.__setattr__( - f"_cached_{func.__name__}", lru_cache(*lru_args, **lru_kwargs)(func.__get__(self)) - ) - return self.__getattribute__(f"_cached_{func.__name__}")(*args, **kwargs) - else: - # Otherwise, just call the original function - return func(self, *args, **kwargs) - - return wrapper - - return decorator - - # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py class RTDetrMLPPredictionHead(nn.Module): """ @@ -1728,7 +1709,7 @@ class RTDetrModel(RTDetrPreTrainedModel): for param in self.backbone.parameters(): param.requires_grad_(True) - @compile_compatible_lru_cache(maxsize=32) + @compile_compatible_method_lru_cache(maxsize=32) def generate_anchors(self, spatial_shapes=None, grid_size=0.05, device="cpu", dtype=torch.float32): if spatial_shapes is None: spatial_shapes = [ diff --git a/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py b/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py index 11530a0800..0f9fddea5c 100644 --- a/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +++ 
b/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py @@ -22,7 +22,7 @@ import math import os import warnings from dataclasses import dataclass -from functools import lru_cache, partial, wraps +from functools import partial from pathlib import Path from typing import Dict, List, Optional, Tuple, Union @@ -34,12 +34,14 @@ from ...activations import ACT2CLS, ACT2FN from ...image_transforms import center_to_corners_format, corners_to_center_format from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import compile_compatible_method_lru_cache from ...utils import ( ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, is_ninja_available, is_torch_cuda_available, + is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -97,7 +99,7 @@ def multi_scale_deformable_attention_v2( value_list = ( value.permute(0, 2, 3, 1) .flatten(0, 1) - .split([height.item() * width.item() for height, width in value_spatial_shapes], dim=-1) + .split([height * width for height, width in value_spatial_shapes], dim=-1) ) # sampling_offsets [8, 480, 8, 12, 2] if method == "default": @@ -226,9 +228,9 @@ class RTDetrV2MultiscaleDeformableAttention(nn.Module): position_embeddings: Optional[torch.Tensor] = None, reference_points=None, spatial_shapes=None, + spatial_shapes_list=None, level_start_index=None, output_attentions: bool = False, - **kwargs, ): # Process inputs up to sampling locations calculation using parent class logic if position_embeddings is not None: @@ -236,7 +238,7 @@ class RTDetrV2MultiscaleDeformableAttention(nn.Module): batch_size, num_queries, _ = hidden_states.shape batch_size, sequence_length, _ = encoder_hidden_states.shape - if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length: + if not is_torchdynamo_compiling() and (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length: raise ValueError( "Make sure to align the spatial shapes with the 
sequence length of the encoder hidden states" ) @@ -272,7 +274,7 @@ class RTDetrV2MultiscaleDeformableAttention(nn.Module): # V2-specific attention implementation choice output = multi_scale_deformable_attention_v2( - value, spatial_shapes, sampling_locations, attention_weights, self.n_points_list, self.method + value, spatial_shapes_list, sampling_locations, attention_weights, self.n_points_list, self.method ) output = self.output_proj(output) @@ -1329,27 +1331,6 @@ RTDetrV2_INPUTS_DOCSTRING = r""" """ -def compile_compatible_lru_cache(*lru_args, **lru_kwargs): - def decorator(func): - @wraps(func) - def wrapper(self, *args, **kwargs): - if not torch.compiler.is_compiling(): - # Cache the function only if the model is not being compiled - # check if the function is already cached, otherwise create it - if not hasattr(self, f"_cached_{func.__name__}"): - self.__setattr__( - f"_cached_{func.__name__}", lru_cache(*lru_args, **lru_kwargs)(func.__get__(self)) - ) - return self.__getattribute__(f"_cached_{func.__name__}")(*args, **kwargs) - else: - # Otherwise, just call the original function - return func(self, *args, **kwargs) - - return wrapper - - return decorator - - def _get_clones(partial_module, N): return nn.ModuleList([partial_module() for i in range(N)]) @@ -1669,7 +1650,7 @@ class RTDetrV2Model(RTDetrV2PreTrainedModel): for param in self.backbone.parameters(): param.requires_grad_(True) - @compile_compatible_lru_cache(maxsize=32) + @compile_compatible_method_lru_cache(maxsize=32) def generate_anchors(self, spatial_shapes=None, grid_size=0.05, device="cpu", dtype=torch.float32): if spatial_shapes is None: spatial_shapes = [ diff --git a/src/transformers/models/rt_detr_v2/modular_rt_detr_v2.py b/src/transformers/models/rt_detr_v2/modular_rt_detr_v2.py index 0faff32514..c6a192bd3e 100644 --- a/src/transformers/models/rt_detr_v2/modular_rt_detr_v2.py +++ b/src/transformers/models/rt_detr_v2/modular_rt_detr_v2.py @@ -20,7 +20,7 @@ import torch.nn.functional as F 
from torch import Tensor, nn from ...configuration_utils import PretrainedConfig -from ...utils import logging +from ...utils import is_torchdynamo_compiling, logging from ...utils.backbone_utils import ( verify_backbone_config_arguments, ) @@ -404,7 +404,7 @@ def multi_scale_deformable_attention_v2( value_list = ( value.permute(0, 2, 3, 1) .flatten(0, 1) - .split([height.item() * width.item() for height, width in value_spatial_shapes], dim=-1) + .split([height * width for height, width in value_spatial_shapes], dim=-1) ) # sampling_offsets [8, 480, 8, 12, 2] if method == "default": @@ -497,9 +497,9 @@ class RTDetrV2MultiscaleDeformableAttention(RTDetrMultiscaleDeformableAttention) position_embeddings: Optional[torch.Tensor] = None, reference_points=None, spatial_shapes=None, + spatial_shapes_list=None, level_start_index=None, output_attentions: bool = False, - **kwargs, ): # Process inputs up to sampling locations calculation using parent class logic if position_embeddings is not None: @@ -507,7 +507,7 @@ class RTDetrV2MultiscaleDeformableAttention(RTDetrMultiscaleDeformableAttention) batch_size, num_queries, _ = hidden_states.shape batch_size, sequence_length, _ = encoder_hidden_states.shape - if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length: + if not is_torchdynamo_compiling() and (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length: raise ValueError( "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" ) @@ -543,7 +543,7 @@ class RTDetrV2MultiscaleDeformableAttention(RTDetrMultiscaleDeformableAttention) # V2-specific attention implementation choice output = multi_scale_deformable_attention_v2( - value, spatial_shapes, sampling_locations, attention_weights, self.n_points_list, self.method + value, spatial_shapes_list, sampling_locations, attention_weights, self.n_points_list, self.method ) output = self.output_proj(output) diff --git 
a/src/transformers/models/seggpt/modeling_seggpt.py b/src/transformers/models/seggpt/modeling_seggpt.py index e2d3c8e78a..d1c3d2eb6d 100644 --- a/src/transformers/models/seggpt/modeling_seggpt.py +++ b/src/transformers/models/seggpt/modeling_seggpt.py @@ -817,8 +817,11 @@ class SegGptModel(SegGptPreTrainedModel): # and reconstructed together (In-Context Painting). if bool_masked_pos is None: num_patches = self.embeddings.patch_embeddings.num_patches - bool_masked_pos = torch.zeros(num_patches, dtype=torch.bool).to(pixel_values.device) - bool_masked_pos[num_patches // 2 :] = 1 + bool_masked_pos_zeros = torch.zeros(num_patches // 2, dtype=torch.bool, device=pixel_values.device) + bool_masked_pos_ones = torch.ones( + num_patches - num_patches // 2, dtype=torch.bool, device=pixel_values.device + ) + bool_masked_pos = torch.cat([bool_masked_pos_zeros, bool_masked_pos_ones]) bool_masked_pos = bool_masked_pos.unsqueeze(0) embedding_output = self.embeddings( @@ -975,8 +978,11 @@ class SegGptForImageSegmentation(SegGptPreTrainedModel): if bool_masked_pos is None: num_patches = self.model.embeddings.patch_embeddings.num_patches - bool_masked_pos = torch.zeros(num_patches, dtype=torch.bool).to(pixel_values.device) - bool_masked_pos[num_patches // 2 :] = 1 + bool_masked_pos_zeros = torch.zeros(num_patches // 2, dtype=torch.bool, device=pixel_values.device) + bool_masked_pos_ones = torch.ones( + num_patches - num_patches // 2, dtype=torch.bool, device=pixel_values.device + ) + bool_masked_pos = torch.cat([bool_masked_pos_zeros, bool_masked_pos_ones]) bool_masked_pos = bool_masked_pos.unsqueeze(0) outputs = self.model( diff --git a/src/transformers/models/swin2sr/modeling_swin2sr.py b/src/transformers/models/swin2sr/modeling_swin2sr.py index 9c41e78793..784367a014 100644 --- a/src/transformers/models/swin2sr/modeling_swin2sr.py +++ b/src/transformers/models/swin2sr/modeling_swin2sr.py @@ -813,10 +813,11 @@ class Swin2SRModel(Swin2SRPreTrainedModel): self.config = config if 
config.num_channels == 3 and config.num_channels_out == 3: - rgb_mean = (0.4488, 0.4371, 0.4040) - self.mean = torch.Tensor(rgb_mean).view(1, 3, 1, 1) + mean = torch.tensor([0.4488, 0.4371, 0.4040]).view(1, 3, 1, 1) else: - self.mean = torch.zeros(1, 1, 1, 1) + mean = torch.zeros(1, 1, 1, 1) + self.register_buffer("mean", mean, persistent=False) + self.img_range = config.img_range self.first_convolution = nn.Conv2d(config.num_channels, config.embed_dim, 3, 1, 1) @@ -851,8 +852,8 @@ class Swin2SRModel(Swin2SRPreTrainedModel): pixel_values = nn.functional.pad(pixel_values, (0, modulo_pad_width, 0, modulo_pad_height), "reflect") # 2. normalize - self.mean = self.mean.type_as(pixel_values) - pixel_values = (pixel_values - self.mean) * self.img_range + mean = self.mean.type_as(pixel_values) + pixel_values = (pixel_values - mean) * self.img_range return pixel_values diff --git a/src/transformers/pytorch_utils.py b/src/transformers/pytorch_utils.py index e058b639f7..c36adffd97 100644 --- a/src/transformers/pytorch_utils.py +++ b/src/transformers/pytorch_utils.py @@ -14,6 +14,7 @@ from __future__ import annotations import inspect +from functools import lru_cache, wraps from typing import Callable, List, Optional, Set, Tuple, Union import torch @@ -21,7 +22,7 @@ from packaging import version from safetensors.torch import storage_ptr, storage_size from torch import nn -from .utils import is_torch_greater_or_equal, is_torch_xla_available, logging +from .utils import is_torch_greater_or_equal, is_torch_xla_available, is_torchdynamo_compiling, logging ALL_LAYERNORM_LAYERS = [nn.LayerNorm] @@ -364,3 +365,29 @@ def translate_to_torch_parallel_style(style: str): return RowwiseParallel(input_layouts=Replicate()) else: raise ValueError(f"Unsupported parallel style value: {style}") + + +def compile_compatible_method_lru_cache(*lru_args, **lru_kwargs): + """ + LRU cache decorator from standard functools library, but with a workaround to disable + caching when torchdynamo is compiling. 
Expected to work with class methods. + """ + + def decorator(func): + @wraps(func) + def wrapper(self, *args, **kwargs): + if not is_torchdynamo_compiling(): + # Cache the function only if the model is not being compiled + # check if the function is already cached, otherwise create it + if not hasattr(self, f"_cached_{func.__name__}"): + self.__setattr__( + f"_cached_{func.__name__}", lru_cache(*lru_args, **lru_kwargs)(func.__get__(self)) + ) + return self.__getattribute__(f"_cached_{func.__name__}")(*args, **kwargs) + else: + # Otherwise, just call the original function + return func(self, *args, **kwargs) + + return wrapper + + return decorator diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 6d1965e29d..bf7c5e8ea3 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -135,6 +135,7 @@ from .utils import ( is_torch_bf16_gpu_available, is_torch_deterministic, is_torch_fp16_available_on_device, + is_torch_greater_or_equal, is_torch_neuroncore_available, is_torch_npu_available, is_torch_sdpa_available, @@ -556,6 +557,21 @@ def require_torch(test_case): return unittest.skipUnless(is_torch_available(), "test requires PyTorch")(test_case) +def require_torch_greater_or_equal(version: str): + """ + Decorator marking a test that requires PyTorch version >= `version`. + + These tests are skipped when PyTorch version is less than `version`. + """ + + def decorator(test_case): + return unittest.skipUnless(is_torch_greater_or_equal(version), f"test requires PyTorch version >= {version}")( + test_case + ) + + return decorator + + def require_flash_attn(test_case): """ Decorator marking a test that requires Flash Attention. 
diff --git a/tests/models/beit/test_modeling_beit.py b/tests/models/beit/test_modeling_beit.py index 6ed9182ad3..89245c7009 100644 --- a/tests/models/beit/test_modeling_beit.py +++ b/tests/models/beit/test_modeling_beit.py @@ -271,6 +271,7 @@ class BeitModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_pruning = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = BeitModelTester(self) @@ -292,6 +293,10 @@ class BeitModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_feed_forward_chunking(self): pass + @unittest.skip(reason="BEiT can't compile dynamic") + def test_sdpa_can_compile_dynamic(self): + pass + def test_model_get_set_embeddings(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -764,13 +769,6 @@ class BeitModelIntegrationTest(unittest.TestCase): inputs = processor(images=image, return_tensors="pt", size={"height": 480, "width": 480}) pixel_values = inputs.pixel_values.to(torch_device) - # with interpolate_pos_encoding being False an exception should be raised with higher resolution - # images than what the model supports. - self.assertFalse(processor.do_center_crop) - with torch.no_grad(): - with self.assertRaises(ValueError, msg="doesn't match model"): - model(pixel_values, interpolate_pos_encoding=False) - # with interpolate_pos_encoding being True the model should process the higher resolution image # successfully and produce the expected output. 
with torch.no_grad(): diff --git a/tests/models/bit/test_modeling_bit.py b/tests/models/bit/test_modeling_bit.py index 8e366f506a..31effc266d 100644 --- a/tests/models/bit/test_modeling_bit.py +++ b/tests/models/bit/test_modeling_bit.py @@ -170,6 +170,7 @@ class BitModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_resize_embeddings = False test_head_masking = False has_attentions = False + test_torch_exportable = True def setUp(self): self.model_tester = BitModelTester(self) diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index 5a4357c40c..80360e8177 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -194,6 +194,7 @@ class ConditionalDetrModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline test_head_masking = False test_missing_keys = False zero_init_hidden_state = True + test_torch_exportable = True # special case for head models def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): diff --git a/tests/models/convnext/test_modeling_convnext.py b/tests/models/convnext/test_modeling_convnext.py index 1965a76fad..694d3931a9 100644 --- a/tests/models/convnext/test_modeling_convnext.py +++ b/tests/models/convnext/test_modeling_convnext.py @@ -180,6 +180,7 @@ class ConvNextModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase test_resize_embeddings = False test_head_masking = False has_attentions = False + test_torch_exportable = True def setUp(self): self.model_tester = ConvNextModelTester(self) diff --git a/tests/models/convnextv2/test_modeling_convnextv2.py b/tests/models/convnextv2/test_modeling_convnextv2.py index 18e7be96fb..4a163ddcd7 100644 --- a/tests/models/convnextv2/test_modeling_convnextv2.py +++ b/tests/models/convnextv2/test_modeling_convnextv2.py @@ -188,6 +188,7 @@ class 
ConvNextV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa test_resize_embeddings = False test_head_masking = False has_attentions = False + test_torch_exportable = True def setUp(self): self.model_tester = ConvNextV2ModelTester(self) diff --git a/tests/models/cvt/test_modeling_cvt.py b/tests/models/cvt/test_modeling_cvt.py index fe02a16656..a63789d887 100644 --- a/tests/models/cvt/test_modeling_cvt.py +++ b/tests/models/cvt/test_modeling_cvt.py @@ -159,6 +159,7 @@ class CvtModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_resize_embeddings = False test_head_masking = False has_attentions = False + test_torch_exportable = True def setUp(self): self.model_tester = CvtModelTester(self) diff --git a/tests/models/dab_detr/test_modeling_dab_detr.py b/tests/models/dab_detr/test_modeling_dab_detr.py index d3d70d67d4..35d43123bf 100644 --- a/tests/models/dab_detr/test_modeling_dab_detr.py +++ b/tests/models/dab_detr/test_modeling_dab_detr.py @@ -197,6 +197,7 @@ class DabDetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi test_head_masking = False test_missing_keys = False zero_init_hidden_state = True + test_torch_exportable = True # special case for head models def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py index 42f6928648..b9404e08a9 100644 --- a/tests/models/deformable_detr/test_modeling_deformable_detr.py +++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -200,6 +200,7 @@ class DeformableDetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT test_pruning = False test_head_masking = False test_missing_keys = False + test_torch_exportable = True # special case for head models def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): diff --git a/tests/models/deit/test_modeling_deit.py 
b/tests/models/deit/test_modeling_deit.py index 1637b22e95..bf58e1cd32 100644 --- a/tests/models/deit/test_modeling_deit.py +++ b/tests/models/deit/test_modeling_deit.py @@ -222,6 +222,7 @@ class DeiTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_pruning = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = DeiTModelTester(self) diff --git a/tests/models/depth_anything/test_modeling_depth_anything.py b/tests/models/depth_anything/test_modeling_depth_anything.py index 91f9589217..95026c1054 100644 --- a/tests/models/depth_anything/test_modeling_depth_anything.py +++ b/tests/models/depth_anything/test_modeling_depth_anything.py @@ -146,6 +146,7 @@ class DepthAnythingModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Tes test_pruning = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = DepthAnythingModelTester(self) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 44529270fd..89db2a57cc 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -212,6 +212,7 @@ class DepthProModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase test_pruning = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = DepthProModelTester(self) diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py index 1451eaeb80..bfeded558b 100644 --- a/tests/models/detr/test_modeling_detr.py +++ b/tests/models/detr/test_modeling_detr.py @@ -194,6 +194,7 @@ class DetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin test_head_masking = False test_missing_keys = False zero_init_hidden_state = True + test_torch_exportable = True # 
special case for head models def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): diff --git a/tests/models/dinat/test_modeling_dinat.py b/tests/models/dinat/test_modeling_dinat.py index 27a3eafddc..7c49827051 100644 --- a/tests/models/dinat/test_modeling_dinat.py +++ b/tests/models/dinat/test_modeling_dinat.py @@ -216,6 +216,7 @@ class DinatModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_pruning = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = DinatModelTester(self) diff --git a/tests/models/dinov2/test_modeling_dinov2.py b/tests/models/dinov2/test_modeling_dinov2.py index 9d84937334..5cbcbe77d9 100644 --- a/tests/models/dinov2/test_modeling_dinov2.py +++ b/tests/models/dinov2/test_modeling_dinov2.py @@ -212,6 +212,8 @@ class Dinov2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): attention_mask and seq_length. """ + test_torch_exportable = True + all_model_classes = ( ( Dinov2Model, diff --git a/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py b/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py index 185492d6d4..a276eedd3a 100644 --- a/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py +++ b/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py @@ -237,6 +237,7 @@ class Dinov2WithRegistersModelTest(ModelTesterMixin, PipelineTesterMixin, unitte test_pruning = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = Dinov2WithRegistersModelTester(self) diff --git a/tests/models/dpt/test_modeling_dpt.py b/tests/models/dpt/test_modeling_dpt.py index 520d87081e..44fb2afb3b 100644 --- a/tests/models/dpt/test_modeling_dpt.py +++ b/tests/models/dpt/test_modeling_dpt.py @@ -172,6 +172,7 @@ class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, 
unittest.TestCase): test_pruning = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = DPTModelTester(self) diff --git a/tests/models/dpt/test_modeling_dpt_auto_backbone.py b/tests/models/dpt/test_modeling_dpt_auto_backbone.py index 6b30ed323d..62240d24bc 100644 --- a/tests/models/dpt/test_modeling_dpt_auto_backbone.py +++ b/tests/models/dpt/test_modeling_dpt_auto_backbone.py @@ -140,6 +140,7 @@ class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_pruning = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = DPTModelTester(self) diff --git a/tests/models/dpt/test_modeling_dpt_hybrid.py b/tests/models/dpt/test_modeling_dpt_hybrid.py index 1229f3e40f..568b05e2d4 100644 --- a/tests/models/dpt/test_modeling_dpt_hybrid.py +++ b/tests/models/dpt/test_modeling_dpt_hybrid.py @@ -186,6 +186,7 @@ class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_pruning = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = DPTModelTester(self) diff --git a/tests/models/efficientnet/test_modeling_efficientnet.py b/tests/models/efficientnet/test_modeling_efficientnet.py index 796c5a149a..68bc175ad4 100644 --- a/tests/models/efficientnet/test_modeling_efficientnet.py +++ b/tests/models/efficientnet/test_modeling_efficientnet.py @@ -139,6 +139,7 @@ class EfficientNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Test test_resize_embeddings = False test_head_masking = False has_attentions = False + test_torch_exportable = True def setUp(self): self.model_tester = EfficientNetModelTester(self) diff --git a/tests/models/focalnet/test_modeling_focalnet.py b/tests/models/focalnet/test_modeling_focalnet.py index 2d3d8b6f3a..2960234c3b 100644 --- a/tests/models/focalnet/test_modeling_focalnet.py 
+++ b/tests/models/focalnet/test_modeling_focalnet.py @@ -247,6 +247,7 @@ class FocalNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase test_resize_embeddings = False test_head_masking = False has_attentions = False + test_torch_exportable = True def setUp(self): self.model_tester = FocalNetModelTester(self) diff --git a/tests/models/glpn/test_modeling_glpn.py b/tests/models/glpn/test_modeling_glpn.py index f6bd7b146c..c4dd380a0d 100644 --- a/tests/models/glpn/test_modeling_glpn.py +++ b/tests/models/glpn/test_modeling_glpn.py @@ -152,6 +152,7 @@ class GLPNModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_head_masking = False test_pruning = False test_resize_embeddings = False + test_torch_exportable = True def setUp(self): self.model_tester = GLPNModelTester(self) diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py index 923bdd1156..bbd35f29ef 100644 --- a/tests/models/hiera/test_modeling_hiera.py +++ b/tests/models/hiera/test_modeling_hiera.py @@ -250,6 +250,7 @@ class HieraModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_pruning = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = HieraModelTester(self) diff --git a/tests/models/ijepa/test_modeling_ijepa.py b/tests/models/ijepa/test_modeling_ijepa.py index 147e576036..4088f355a4 100644 --- a/tests/models/ijepa/test_modeling_ijepa.py +++ b/tests/models/ijepa/test_modeling_ijepa.py @@ -207,6 +207,7 @@ class IJepaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_pruning = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = IJepaModelTester(self) diff --git a/tests/models/imagegpt/test_modeling_imagegpt.py b/tests/models/imagegpt/test_modeling_imagegpt.py index 6d96c444c3..b21054525c 100644 --- 
a/tests/models/imagegpt/test_modeling_imagegpt.py +++ b/tests/models/imagegpt/test_modeling_imagegpt.py @@ -237,6 +237,7 @@ class ImageGPTModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM else {} ) test_missing_keys = False + test_torch_exportable = True # as ImageGPTForImageClassification isn't included in any auto mapping, we add labels here def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): diff --git a/tests/models/mask2former/test_modeling_mask2former.py b/tests/models/mask2former/test_modeling_mask2former.py index af7704b1ef..b1fecd9253 100644 --- a/tests/models/mask2former/test_modeling_mask2former.py +++ b/tests/models/mask2former/test_modeling_mask2former.py @@ -205,6 +205,7 @@ class Mask2FormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC test_pruning = False test_head_masking = False test_missing_keys = False + test_torch_exportable = True def setUp(self): self.model_tester = Mask2FormerModelTester(self) diff --git a/tests/models/maskformer/test_modeling_maskformer.py b/tests/models/maskformer/test_modeling_maskformer.py index 9298fe2d1c..284cdb5f49 100644 --- a/tests/models/maskformer/test_modeling_maskformer.py +++ b/tests/models/maskformer/test_modeling_maskformer.py @@ -209,6 +209,7 @@ class MaskFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa test_head_masking = False test_missing_keys = False zero_init_hidden_state = True + test_torch_exportable = True def setUp(self): self.model_tester = MaskFormerModelTester(self) diff --git a/tests/models/maskformer/test_modeling_maskformer_swin.py b/tests/models/maskformer/test_modeling_maskformer_swin.py index 513ac6f67b..502660b191 100644 --- a/tests/models/maskformer/test_modeling_maskformer_swin.py +++ b/tests/models/maskformer/test_modeling_maskformer_swin.py @@ -181,6 +181,7 @@ class MaskFormerSwinModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Te test_pruning = False test_resize_embeddings = False 
test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = MaskFormerSwinModelTester(self) diff --git a/tests/models/mobilenet_v1/test_modeling_mobilenet_v1.py b/tests/models/mobilenet_v1/test_modeling_mobilenet_v1.py index ab588cf20f..9488700193 100644 --- a/tests/models/mobilenet_v1/test_modeling_mobilenet_v1.py +++ b/tests/models/mobilenet_v1/test_modeling_mobilenet_v1.py @@ -154,6 +154,7 @@ class MobileNetV1ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC test_resize_embeddings = False test_head_masking = False has_attentions = False + test_torch_exportable = True def setUp(self): self.model_tester = MobileNetV1ModelTester(self) diff --git a/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py b/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py index 7e96dea4fe..b5a12edd7b 100644 --- a/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py +++ b/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py @@ -205,6 +205,7 @@ class MobileNetV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC test_resize_embeddings = False test_head_masking = False has_attentions = False + test_torch_exportable = True def setUp(self): self.model_tester = MobileNetV2ModelTester(self) diff --git a/tests/models/mobilevit/test_modeling_mobilevit.py b/tests/models/mobilevit/test_modeling_mobilevit.py index a14a5fb445..b67abce934 100644 --- a/tests/models/mobilevit/test_modeling_mobilevit.py +++ b/tests/models/mobilevit/test_modeling_mobilevit.py @@ -198,6 +198,7 @@ class MobileViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas test_resize_embeddings = False test_head_masking = False has_attentions = False + test_torch_exportable = True def setUp(self): self.model_tester = MobileViTModelTester(self) diff --git a/tests/models/mobilevitv2/test_modeling_mobilevitv2.py b/tests/models/mobilevitv2/test_modeling_mobilevitv2.py index 136bb51312..70fc70124f 100644 --- 
a/tests/models/mobilevitv2/test_modeling_mobilevitv2.py +++ b/tests/models/mobilevitv2/test_modeling_mobilevitv2.py @@ -200,6 +200,7 @@ class MobileViTV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC test_resize_embeddings = False test_head_masking = False has_attentions = False + test_torch_exportable = True def setUp(self): self.model_tester = MobileViTV2ModelTester(self) diff --git a/tests/models/poolformer/test_modeling_poolformer.py b/tests/models/poolformer/test_modeling_poolformer.py index 775df97cde..6f4ead91a2 100644 --- a/tests/models/poolformer/test_modeling_poolformer.py +++ b/tests/models/poolformer/test_modeling_poolformer.py @@ -132,6 +132,7 @@ class PoolFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa test_resize_embeddings = False test_torchscript = False has_attentions = False + test_torch_exportable = True def setUp(self): self.model_tester = PoolFormerModelTester(self) diff --git a/tests/models/pvt/test_modeling_pvt.py b/tests/models/pvt/test_modeling_pvt.py index 3bc5e3892d..df2b3bcc85 100644 --- a/tests/models/pvt/test_modeling_pvt.py +++ b/tests/models/pvt/test_modeling_pvt.py @@ -166,6 +166,7 @@ class PvtModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_resize_embeddings = False test_torchscript = False has_attentions = False + test_torch_exportable = True def setUp(self): self.model_tester = PvtModelTester(self) diff --git a/tests/models/pvt_v2/test_modeling_pvt_v2.py b/tests/models/pvt_v2/test_modeling_pvt_v2.py index 1c69385745..d850da24f3 100644 --- a/tests/models/pvt_v2/test_modeling_pvt_v2.py +++ b/tests/models/pvt_v2/test_modeling_pvt_v2.py @@ -202,6 +202,7 @@ class PvtV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_resize_embeddings = False test_torchscript = False has_attentions = False + test_torch_exportable = True def setUp(self): self.model_tester = PvtV2ModelTester(self) diff --git a/tests/models/regnet/test_modeling_regnet.py 
b/tests/models/regnet/test_modeling_regnet.py index 371e699d23..62faeb58b2 100644 --- a/tests/models/regnet/test_modeling_regnet.py +++ b/tests/models/regnet/test_modeling_regnet.py @@ -133,6 +133,7 @@ class RegNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_resize_embeddings = False test_head_masking = False has_attentions = False + test_torch_exportable = True def setUp(self): self.model_tester = RegNetModelTester(self) diff --git a/tests/models/resnet/test_modeling_resnet.py b/tests/models/resnet/test_modeling_resnet.py index c940521a8d..d8790130df 100644 --- a/tests/models/resnet/test_modeling_resnet.py +++ b/tests/models/resnet/test_modeling_resnet.py @@ -178,6 +178,7 @@ class ResNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_resize_embeddings = False test_head_masking = False has_attentions = False + test_torch_exportable = True def setUp(self): self.model_tester = ResNetModelTester(self) diff --git a/tests/models/rt_detr/test_modeling_rt_detr.py b/tests/models/rt_detr/test_modeling_rt_detr.py index c3ccc89efc..ab465065f1 100644 --- a/tests/models/rt_detr/test_modeling_rt_detr.py +++ b/tests/models/rt_detr/test_modeling_rt_detr.py @@ -261,6 +261,7 @@ class RTDetrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_pruning = False test_head_masking = False test_missing_keys = False + test_torch_exportable = True # special case for head models def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): diff --git a/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py b/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py index 64664d7b94..d5388cf41a 100644 --- a/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py +++ b/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py @@ -259,6 +259,7 @@ class RTDetrV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase test_pruning = False test_head_masking = False test_missing_keys = False + test_torch_exportable = 
True # special case for head models def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): diff --git a/tests/models/segformer/test_modeling_segformer.py b/tests/models/segformer/test_modeling_segformer.py index 5f6493a36c..30ea5c63f4 100644 --- a/tests/models/segformer/test_modeling_segformer.py +++ b/tests/models/segformer/test_modeling_segformer.py @@ -180,6 +180,7 @@ class SegformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas test_head_masking = False test_pruning = False test_resize_embeddings = False + test_torch_exportable = True def setUp(self): self.model_tester = SegformerModelTester(self) diff --git a/tests/models/seggpt/test_modeling_seggpt.py b/tests/models/seggpt/test_modeling_seggpt.py index c8b7362b60..3a31557971 100644 --- a/tests/models/seggpt/test_modeling_seggpt.py +++ b/tests/models/seggpt/test_modeling_seggpt.py @@ -172,6 +172,8 @@ class SegGptModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_resize_embeddings = False test_head_masking = False test_torchscript = False + test_torch_exportable = True + pipeline_model_mapping = ( {"feature-extraction": SegGptModel, "mask-generation": SegGptModel} if is_torch_available() else {} ) diff --git a/tests/models/swiftformer/test_modeling_swiftformer.py b/tests/models/swiftformer/test_modeling_swiftformer.py index 234c8aa15f..0d6f471e96 100644 --- a/tests/models/swiftformer/test_modeling_swiftformer.py +++ b/tests/models/swiftformer/test_modeling_swiftformer.py @@ -147,6 +147,7 @@ class SwiftFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC test_resize_embeddings = False test_head_masking = False has_attentions = False + test_torch_exportable = True def setUp(self): self.model_tester = SwiftFormerModelTester(self) diff --git a/tests/models/swin/test_modeling_swin.py b/tests/models/swin/test_modeling_swin.py index 92c06de971..cf33a3f3df 100644 --- a/tests/models/swin/test_modeling_swin.py +++ 
b/tests/models/swin/test_modeling_swin.py @@ -240,6 +240,7 @@ class SwinModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_pruning = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = SwinModelTester(self) diff --git a/tests/models/swin2sr/test_modeling_swin2sr.py b/tests/models/swin2sr/test_modeling_swin2sr.py index 91d04915d1..3752f6ef30 100644 --- a/tests/models/swin2sr/test_modeling_swin2sr.py +++ b/tests/models/swin2sr/test_modeling_swin2sr.py @@ -172,6 +172,7 @@ class Swin2SRModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) test_resize_embeddings = False test_head_masking = False test_torchscript = False + test_torch_exportable = True def setUp(self): self.model_tester = Swin2SRModelTester(self) diff --git a/tests/models/swinv2/test_modeling_swinv2.py b/tests/models/swinv2/test_modeling_swinv2.py index 4bf309cc6a..a4e93eeb3d 100644 --- a/tests/models/swinv2/test_modeling_swinv2.py +++ b/tests/models/swinv2/test_modeling_swinv2.py @@ -226,6 +226,7 @@ class Swinv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_pruning = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = Swinv2ModelTester(self) diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py index 50165cbe1a..cbed595f66 100644 --- a/tests/models/table_transformer/test_modeling_table_transformer.py +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -209,6 +209,7 @@ class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, Pipelin test_head_masking = False test_missing_keys = False zero_init_hidden_state = True + test_torch_exportable = True # special case for head models def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): diff 
--git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py index 0f02cfcaaf..5d560f919b 100644 --- a/tests/models/textnet/test_modeling_textnet.py +++ b/tests/models/textnet/test_modeling_textnet.py @@ -217,6 +217,7 @@ class TextNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) test_pruning = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True has_attentions = False def setUp(self): diff --git a/tests/models/timesformer/test_modeling_timesformer.py b/tests/models/timesformer/test_modeling_timesformer.py index ec8b34e5e2..2cbe3a6dce 100644 --- a/tests/models/timesformer/test_modeling_timesformer.py +++ b/tests/models/timesformer/test_modeling_timesformer.py @@ -167,6 +167,7 @@ class TimesformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC test_torchscript = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = TimesformerModelTester(self) diff --git a/tests/models/upernet/test_modeling_upernet.py b/tests/models/upernet/test_modeling_upernet.py index 94ddae0ee7..1b337460f8 100644 --- a/tests/models/upernet/test_modeling_upernet.py +++ b/tests/models/upernet/test_modeling_upernet.py @@ -157,6 +157,7 @@ class UperNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) test_head_masking = False test_torchscript = False has_attentions = False + test_torch_exportable = True def setUp(self): self.model_tester = UperNetModelTester(self) diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index 1e470e2d78..f2171f37ad 100644 --- a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -186,6 +186,7 @@ class VideoMAEModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase test_torchscript = False test_resize_embeddings = False test_head_masking = 
False + test_torch_exportable = True def setUp(self): self.model_tester = VideoMAEModelTester(self) diff --git a/tests/models/vit/test_modeling_vit.py b/tests/models/vit/test_modeling_vit.py index aeb38f73f2..929d7fec95 100644 --- a/tests/models/vit/test_modeling_vit.py +++ b/tests/models/vit/test_modeling_vit.py @@ -207,6 +207,7 @@ class ViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_pruning = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = ViTModelTester(self) diff --git a/tests/models/vit_mae/test_modeling_vit_mae.py b/tests/models/vit_mae/test_modeling_vit_mae.py index fb312a17e4..1b83690014 100644 --- a/tests/models/vit_mae/test_modeling_vit_mae.py +++ b/tests/models/vit_mae/test_modeling_vit_mae.py @@ -174,6 +174,7 @@ class ViTMAEModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_torchscript = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = ViTMAEModelTester(self) diff --git a/tests/models/vit_msn/test_modeling_vit_msn.py b/tests/models/vit_msn/test_modeling_vit_msn.py index bfee2d81de..8c94a13771 100644 --- a/tests/models/vit_msn/test_modeling_vit_msn.py +++ b/tests/models/vit_msn/test_modeling_vit_msn.py @@ -162,6 +162,7 @@ class ViTMSNModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_torchscript = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = ViTMSNModelTester(self) diff --git a/tests/models/vitdet/test_modeling_vitdet.py b/tests/models/vitdet/test_modeling_vitdet.py index a9690eee23..2c46b60f7e 100644 --- a/tests/models/vitdet/test_modeling_vitdet.py +++ b/tests/models/vitdet/test_modeling_vitdet.py @@ -169,6 +169,7 @@ class VitDetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_pruning = False 
test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = VitDetModelTester(self) diff --git a/tests/models/vitmatte/test_modeling_vitmatte.py b/tests/models/vitmatte/test_modeling_vitmatte.py index d52cc38f7d..035e1a65b8 100644 --- a/tests/models/vitmatte/test_modeling_vitmatte.py +++ b/tests/models/vitmatte/test_modeling_vitmatte.py @@ -143,6 +143,7 @@ class VitMatteModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase test_pruning = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = VitMatteModelTester(self) diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py index f7acf4ca1d..a47e9ca737 100644 --- a/tests/models/vitpose/test_modeling_vitpose.py +++ b/tests/models/vitpose/test_modeling_vitpose.py @@ -154,6 +154,7 @@ class VitPoseModelTest(ModelTesterMixin, unittest.TestCase): test_pruning = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = VitPoseModelTester(self) diff --git a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py index c32a57e9f2..91c3d37abb 100644 --- a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py +++ b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py @@ -18,7 +18,7 @@ import inspect import unittest from transformers import VitPoseBackboneConfig -from transformers.testing_utils import require_torch +from transformers.testing_utils import require_torch, torch_device from transformers.utils import is_torch_available, is_vision_available from ...test_backbone_common import BackboneTesterMixin @@ -27,6 +27,8 @@ from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor if is_torch_available(): + import torch + from transformers 
import VitPoseBackbone @@ -129,6 +131,7 @@ class VitPoseBackboneModelTest(ModelTesterMixin, unittest.TestCase): test_pruning = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = VitPoseBackboneModelTester(self) @@ -187,6 +190,17 @@ class VitPoseBackboneModelTest(ModelTesterMixin, unittest.TestCase): expected_arg_names = ["pixel_values"] self.assertListEqual(arg_names[:1], expected_arg_names) + def test_torch_export(self): + # Dense architecture + super().test_torch_export() + + # MOE architecture + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_experts = 2 + config.part_features = config.hidden_size // config.num_experts + inputs_dict["dataset_index"] = torch.tensor([0] * self.model_tester.batch_size, device=torch_device) + super().test_torch_export(config=config, inputs_dict=inputs_dict) + @require_torch class VitPoseBackboneTest(unittest.TestCase, BackboneTesterMixin): diff --git a/tests/models/vivit/test_modeling_vivit.py b/tests/models/vivit/test_modeling_vivit.py index 5cab10700b..364998eb65 100644 --- a/tests/models/vivit/test_modeling_vivit.py +++ b/tests/models/vivit/test_modeling_vivit.py @@ -175,6 +175,7 @@ class VivitModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_torchscript = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = VivitModelTester(self) diff --git a/tests/models/yolos/test_modeling_yolos.py b/tests/models/yolos/test_modeling_yolos.py index e5857c8a33..bfc428961c 100644 --- a/tests/models/yolos/test_modeling_yolos.py +++ b/tests/models/yolos/test_modeling_yolos.py @@ -178,6 +178,7 @@ class YolosModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_resize_embeddings = False test_head_masking = False test_torchscript = False + test_torch_exportable = True # special case for head model def 
_prepare_for_class(self, inputs_dict, model_class, return_labels=False): diff --git a/tests/models/zoedepth/test_modeling_zoedepth.py b/tests/models/zoedepth/test_modeling_zoedepth.py index ee1da96880..782b99af43 100644 --- a/tests/models/zoedepth/test_modeling_zoedepth.py +++ b/tests/models/zoedepth/test_modeling_zoedepth.py @@ -147,6 +147,7 @@ class ZoeDepthModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase test_pruning = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): self.model_tester = ZoeDepthModelTester(self) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 964595a0cd..0b437d9356 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -86,6 +86,7 @@ from transformers.testing_utils import ( require_torch, require_torch_accelerator, require_torch_gpu, + require_torch_greater_or_equal, require_torch_multi_accelerator, require_torch_multi_gpu, require_torch_sdpa, @@ -221,6 +222,7 @@ class ModelTesterMixin: test_mismatched_shapes = True test_missing_keys = True test_model_parallel = False + test_torch_exportable = False # Used in `check_training_gradient_checkpointing` to NOT check all params having gradient (e.g. 
for some MOE models) test_all_params_have_gradient = True is_encoder_decoder = False @@ -4865,6 +4867,72 @@ class ModelTesterMixin: # Assert the last tokens are actually the same (except for the natural fluctuation due to order of FP ops) torch.testing.assert_close(all_logits[:, -1:, :], last_token_logits, rtol=1e-5, atol=1e-5) + @slow + @require_torch_greater_or_equal("2.5") + def test_torch_export(self, config=None, inputs_dict=None, tolerance=1e-4): + """ + Test if model can be exported with torch.export.export() + + Args: + config (PretrainedConfig): + Config to use for the model, if None, use default config from model_tester + inputs_dict (dict): + Inputs to use for the model, if None, use default inputs from model_tester + tolerance (float): + `atol` and `rtol` for torch.testing.assert_close(), defined in signature for test overriding + """ + if not self.test_torch_exportable: + self.skipTest(reason="test_torch_exportable=False for this model.") + + def recursively_check(eager_outputs, exported_outputs): + is_tested = False + if isinstance(eager_outputs, torch.Tensor): + torch.testing.assert_close(eager_outputs, exported_outputs, atol=tolerance, rtol=tolerance) + return True + elif isinstance(eager_outputs, (tuple, list)): + for eager_output, exported_output in zip(eager_outputs, exported_outputs): + is_tested = recursively_check(eager_output, exported_output) or is_tested + return is_tested + elif isinstance(eager_outputs, dict): + for key in eager_outputs: + is_tested = recursively_check(eager_outputs[key], exported_outputs[key]) or is_tested + return is_tested + return is_tested + + default_config, default_inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config = config or default_config + inputs_dict = inputs_dict or default_inputs_dict + + for model_class in self.all_model_classes: + if model_class.__name__.endswith("ForPreTraining"): + continue + + with self.subTest(model_class.__name__): + model = model_class(config).eval().to(torch_device) + + #
Export model + exported_model = torch.export.export( + model, + args=(), + kwargs=inputs_dict, + strict=True, + ) + + # Run exported model and eager model + with torch.no_grad(): + # set seed in case anything is not deterministic in model (e.g. vit_mae noise) + torch.manual_seed(1234) + eager_outputs = model(**inputs_dict) + torch.manual_seed(1234) + exported_outputs = exported_model.module().forward(**inputs_dict) + + # Check if outputs are close: + # is_tested is a boolean flag indicating if we compare any outputs, + # e.g. there might be a situation when outputs are an empty list, then is_tested will be False. + # In case outputs are different, the error will be raised in the `recursively_check` function. + is_tested = recursively_check(eager_outputs, exported_outputs) + self.assertTrue(is_tested, msg=f"No outputs were compared for {model_class.__name__}") + @require_torch_gpu def test_flex_attention_with_grads(self): for model_class in self.all_model_classes: