Add tests for batching support (#29297)

* add tests for batching support

* Update src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update tests/test_modeling_common.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update tests/test_modeling_common.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update tests/test_modeling_common.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* fixes and comments

* use cosine distance for conv models

* skip mra model testing

* Update tests/models/vilt/test_modeling_vilt.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* finalize and make style

* check model type by input names

* Update tests/models/vilt/test_modeling_vilt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* fixed batch size for all testers

* Revert "fixed batch size for all testers"

This reverts commit 525f3a0a058f069fbda00352cf202b728d40df99.

* add batch_size for all testers

* dict from model output

* do not skip layoutlm

* bring back some code from git revert

* Update tests/test_modeling_common.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/test_modeling_common.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* clean-up

* where did minus go in tolerance

* make whisper happy

* deal with consequences of losing minus

* deal with consequences of losing minus

* maskformer needs its own test for happiness

* fix more models

* tag flaky CV models from Amy's approval

* make codestyle

---------

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
Raushan Turganbay
2024-03-12 22:46:19 +05:00
committed by GitHub
parent 11163fff58
commit 8e64ba2890
48 changed files with 350 additions and 67 deletions

View File

@@ -1292,7 +1292,7 @@ class CLIPSegDecoder(CLIPSegPreTrainedModel):
batch_size = conditional_embeddings.shape[0] batch_size = conditional_embeddings.shape[0]
output = output.view(batch_size, output.shape[1], size, size) output = output.view(batch_size, output.shape[1], size, size)
logits = self.transposed_convolution(output).squeeze() logits = self.transposed_convolution(output).squeeze(1)
if not return_dict: if not return_dict:
return tuple(v for v in [logits, all_hidden_states, all_attentions] if v is not None) return tuple(v for v in [logits, all_hidden_states, all_attentions] if v is not None)

View File

@@ -51,13 +51,13 @@ ENCODEC_PRETRAINED_MODEL_ARCHIVE_LIST = [
class EncodecOutput(ModelOutput): class EncodecOutput(ModelOutput):
""" """
Args: Args:
audio_codes (`torch.FloatTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*): audio_codes (`torch.LongTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
Discret code embeddings computed using `model.encode`. Discret code embeddings computed using `model.encode`.
audio_values (`torch.FlaotTensor` of shape `(batch_size, sequence_length)`, *optional*) audio_values (`torch.FlaotTensor` of shape `(batch_size, sequence_length)`, *optional*)
Decoded audio values, obtained using the decoder part of Encodec. Decoded audio values, obtained using the decoder part of Encodec.
""" """
audio_codes: torch.FloatTensor = None audio_codes: torch.LongTensor = None
audio_values: torch.FloatTensor = None audio_values: torch.FloatTensor = None
@@ -65,13 +65,13 @@ class EncodecOutput(ModelOutput):
class EncodecEncoderOutput(ModelOutput): class EncodecEncoderOutput(ModelOutput):
""" """
Args: Args:
audio_codes (`torch.FloatTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*): audio_codes (`torch.LongTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
Discret code embeddings computed using `model.encode`. Discret code embeddings computed using `model.encode`.
audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*): audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
Scaling factor for each `audio_codes` input. This is used to unscale each chunk of audio when decoding. Scaling factor for each `audio_codes` input. This is used to unscale each chunk of audio when decoding.
""" """
audio_codes: torch.FloatTensor = None audio_codes: torch.LongTensor = None
audio_scales: torch.FloatTensor = None audio_scales: torch.FloatTensor = None
@@ -514,7 +514,7 @@ ENCODEC_INPUTS_DOCSTRING = r"""
The target bandwidth. Must be one of `config.target_bandwidths`. If `None`, uses the smallest possible The target bandwidth. Must be one of `config.target_bandwidths`. If `None`, uses the smallest possible
bandwidth. bandwidth is represented as a thousandth of what it is, e.g. 6kbps bandwidth is represented as bandwidth. bandwidth is represented as a thousandth of what it is, e.g. 6kbps bandwidth is represented as
`bandwidth == 6.0` `bandwidth == 6.0`
audio_codes (`torch.FloatTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*): audio_codes (`torch.LongTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
Discret code embeddings computed using `model.encode`. Discret code embeddings computed using `model.encode`.
audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*): audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
Scaling factor for each `audio_codes` input. Scaling factor for each `audio_codes` input.
@@ -718,7 +718,7 @@ class EncodecModel(EncodecPreTrainedModel):
trimmed. trimmed.
Args: Args:
audio_codes (`torch.FloatTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*): audio_codes (`torch.LongTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
Discret code embeddings computed using `model.encode`. Discret code embeddings computed using `model.encode`.
audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*): audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
Scaling factor for each `audio_codes` input. Scaling factor for each `audio_codes` input.

View File

@@ -776,7 +776,7 @@ class FunnelDiscriminatorPredictions(nn.Module):
def forward(self, discriminator_hidden_states: torch.Tensor) -> torch.Tensor: def forward(self, discriminator_hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(discriminator_hidden_states) hidden_states = self.dense(discriminator_hidden_states)
hidden_states = ACT2FN[self.config.hidden_act](hidden_states) hidden_states = ACT2FN[self.config.hidden_act](hidden_states)
logits = self.dense_prediction(hidden_states).squeeze() logits = self.dense_prediction(hidden_states).squeeze(-1)
return logits return logits

View File

@@ -679,7 +679,7 @@ class TvpFramePadPrompter(nn.Module):
prompt = torch.cat([self.pad_left, base, self.pad_right], dim=4) prompt = torch.cat([self.pad_left, base, self.pad_right], dim=4)
prompt = torch.cat([self.pad_up, prompt, self.pad_down], dim=3) prompt = torch.cat([self.pad_up, prompt, self.pad_down], dim=3)
prompt = torch.cat(pixel_values.size(0) * [prompt]) prompt = torch.cat(pixel_values.size(0) * [prompt])
pixel_values += prompt.to(pixel_values.dtype) pixel_values = pixel_values + prompt.to(pixel_values.dtype)
return pixel_values return pixel_values

View File

@@ -371,10 +371,12 @@ class YosoSelfAttention(nn.Module):
key_layer = key_layer.reshape(batch_size * num_heads, seq_len, head_dim) key_layer = key_layer.reshape(batch_size * num_heads, seq_len, head_dim)
value_layer = value_layer.reshape(batch_size * num_heads, seq_len, head_dim) value_layer = value_layer.reshape(batch_size * num_heads, seq_len, head_dim)
# revert changes made by get_extended_attention_mask
attention_mask = 1.0 + attention_mask / 10000.0 attention_mask = 1.0 + attention_mask / 10000.0
attention_mask = ( attention_mask = (
attention_mask.squeeze().repeat(1, num_heads, 1).reshape(batch_size * num_heads, seq_len).int() attention_mask.unsqueeze(1)
.repeat_interleave(num_heads, dim=1)
.reshape(batch_size * num_heads, seq_len)
.int()
) )
# The CUDA kernels are most efficient with inputs whose size is a multiple of a GPU's warp size (32). Inputs # The CUDA kernels are most efficient with inputs whose size is a multiple of a GPU's warp size (32). Inputs
@@ -808,10 +810,6 @@ class YosoModel(YosoPreTrainedModel):
else: else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
# ourselves in which case we just need to make it broadcastable to all heads.
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
# Prepare head mask if needed # Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head # 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N # attention_probs has shape bsz x n_heads x N x N
@@ -827,7 +825,7 @@ class YosoModel(YosoPreTrainedModel):
) )
encoder_outputs = self.encoder( encoder_outputs = self.encoder(
embedding_output, embedding_output,
attention_mask=extended_attention_mask, attention_mask=attention_mask,
head_mask=head_mask, head_mask=head_mask,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,

View File

@@ -405,6 +405,7 @@ class AlignModelTester:
self.parent = parent self.parent = parent
self.text_model_tester = AlignTextModelTester(parent, **text_kwargs) self.text_model_tester = AlignTextModelTester(parent, **text_kwargs)
self.vision_model_tester = AlignVisionModelTester(parent, **vision_kwargs) self.vision_model_tester = AlignVisionModelTester(parent, **vision_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.is_training = is_training self.is_training = is_training
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):

View File

@@ -380,6 +380,7 @@ class AltCLIPModelTester:
self.parent = parent self.parent = parent
self.text_model_tester = AltCLIPTextModelTester(parent, **text_kwargs) self.text_model_tester = AltCLIPTextModelTester(parent, **text_kwargs)
self.vision_model_tester = AltCLIPVisionModelTester(parent, **vision_kwargs) self.vision_model_tester = AltCLIPVisionModelTester(parent, **vision_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.is_training = is_training self.is_training = is_training
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):

View File

@@ -107,6 +107,7 @@ class AutoformerModelTester:
cardinality=[self.cardinality], cardinality=[self.cardinality],
embedding_dimension=[self.embedding_dimension], embedding_dimension=[self.embedding_dimension],
moving_average=self.moving_average, moving_average=self.moving_average,
scaling="std", # we need std to get non-zero `loc`
) )
def prepare_autoformer_inputs_dict(self, config): def prepare_autoformer_inputs_dict(self, config):

View File

@@ -67,7 +67,7 @@ class BarkSemanticModelTester:
def __init__( def __init__(
self, self,
parent, parent,
batch_size=2, batch_size=3, # need batch_size != num_hidden_layers
seq_length=4, seq_length=4,
is_training=False, # for now training is not supported is_training=False, # for now training is not supported
use_input_mask=True, use_input_mask=True,
@@ -203,7 +203,7 @@ class BarkCoarseModelTester:
def __init__( def __init__(
self, self,
parent, parent,
batch_size=2, batch_size=3, # need batch_size != num_hidden_layers
seq_length=4, seq_length=4,
is_training=False, # for now training is not supported is_training=False, # for now training is not supported
use_input_mask=True, use_input_mask=True,
@@ -339,7 +339,7 @@ class BarkFineModelTester:
def __init__( def __init__(
self, self,
parent, parent,
batch_size=2, batch_size=3, # need batch_size != num_hidden_layers
seq_length=4, seq_length=4,
is_training=False, # for now training is not supported is_training=False, # for now training is not supported
use_input_mask=True, use_input_mask=True,

View File

@@ -387,6 +387,7 @@ class BlipModelTester:
self.parent = parent self.parent = parent
self.text_model_tester = BlipTextModelTester(parent, **text_kwargs) self.text_model_tester = BlipTextModelTester(parent, **text_kwargs)
self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs) self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.is_training = is_training self.is_training = is_training
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):
@@ -596,6 +597,7 @@ class BlipTextRetrievalModelTester:
self.parent = parent self.parent = parent
self.text_model_tester = BlipTextModelTester(parent, **text_kwargs) self.text_model_tester = BlipTextModelTester(parent, **text_kwargs)
self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs) self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.is_training = is_training self.is_training = is_training
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):
@@ -643,6 +645,7 @@ class BlipTextImageModelsModelTester:
self.parent = parent self.parent = parent
self.text_model_tester = BlipTextModelTester(parent, **text_kwargs) self.text_model_tester = BlipTextModelTester(parent, **text_kwargs)
self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs) self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.is_training = is_training self.is_training = is_training
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):
@@ -691,6 +694,7 @@ class BlipVQAModelTester:
self.parent = parent self.parent = parent
self.text_model_tester = BlipTextModelTester(parent, **text_kwargs) self.text_model_tester = BlipTextModelTester(parent, **text_kwargs)
self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs) self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.is_training = is_training self.is_training = is_training
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):

View File

@@ -390,6 +390,7 @@ class Blip2ForConditionalGenerationDecoderOnlyModelTester:
self.vision_model_tester = Blip2VisionModelTester(parent, **vision_kwargs) self.vision_model_tester = Blip2VisionModelTester(parent, **vision_kwargs)
self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs) self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs)
self.text_model_tester = Blip2TextModelDecoderOnlyTester(parent, **text_kwargs) self.text_model_tester = Blip2TextModelDecoderOnlyTester(parent, **text_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.is_training = is_training self.is_training = is_training
self.num_query_tokens = num_query_tokens self.num_query_tokens = num_query_tokens
@@ -616,6 +617,7 @@ class Blip2ModelTester:
self.vision_model_tester = Blip2VisionModelTester(parent, **vision_kwargs) self.vision_model_tester = Blip2VisionModelTester(parent, **vision_kwargs)
self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs) self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs)
self.text_model_tester = Blip2TextModelTester(parent, **text_kwargs) self.text_model_tester = Blip2TextModelTester(parent, **text_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.is_training = is_training self.is_training = is_training
self.num_query_tokens = num_query_tokens self.num_query_tokens = num_query_tokens

View File

@@ -510,6 +510,7 @@ class ChineseCLIPModelTester:
self.parent = parent self.parent = parent
self.text_model_tester = ChineseCLIPTextModelTester(parent, **text_kwargs) self.text_model_tester = ChineseCLIPTextModelTester(parent, **text_kwargs)
self.vision_model_tester = ChineseCLIPVisionModelTester(parent, **vision_kwargs) self.vision_model_tester = ChineseCLIPVisionModelTester(parent, **vision_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.is_training = is_training self.is_training = is_training
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):

View File

@@ -466,6 +466,7 @@ class ClapModelTester:
self.parent = parent self.parent = parent
self.text_model_tester = ClapTextModelTester(parent, **text_kwargs) self.text_model_tester = ClapTextModelTester(parent, **text_kwargs)
self.audio_model_tester = ClapAudioModelTester(parent, **audio_kwargs) self.audio_model_tester = ClapAudioModelTester(parent, **audio_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.is_training = is_training self.is_training = is_training
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):

View File

@@ -437,6 +437,7 @@ class CLIPModelTester:
self.parent = parent self.parent = parent
self.text_model_tester = CLIPTextModelTester(parent, **text_kwargs) self.text_model_tester = CLIPTextModelTester(parent, **text_kwargs)
self.vision_model_tester = CLIPVisionModelTester(parent, **vision_kwargs) self.vision_model_tester = CLIPVisionModelTester(parent, **vision_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.is_training = is_training self.is_training = is_training
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):

View File

@@ -388,6 +388,7 @@ class CLIPSegModelTester:
self.parent = parent self.parent = parent
self.text_model_tester = CLIPSegTextModelTester(parent, **text_kwargs) self.text_model_tester = CLIPSegTextModelTester(parent, **text_kwargs)
self.vision_model_tester = CLIPSegVisionModelTester(parent, **vision_kwargs) self.vision_model_tester = CLIPSegVisionModelTester(parent, **vision_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.is_training = is_training self.is_training = is_training
self.extract_layers = extract_layers self.extract_layers = extract_layers

View File

@@ -344,6 +344,7 @@ class ClvpModelForConditionalGenerationTester:
self.parent = parent self.parent = parent
self.clvp_encoder_tester = ClvpEncoderTester(parent) self.clvp_encoder_tester = ClvpEncoderTester(parent)
self.is_training = is_training self.is_training = is_training
self.batch_size = self.clvp_encoder_tester.batch_size # need bs for batching_equivalence test
def get_config(self): def get_config(self):
decoder_config = ClvpDecoderConfig( decoder_config = ClvpDecoderConfig(

View File

@@ -194,6 +194,7 @@ class ConditionalDetrModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline
test_pruning = False test_pruning = False
test_head_masking = False test_head_masking = False
test_missing_keys = False test_missing_keys = False
zero_init_hidden_state = True
# special case for head models # special case for head models
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):

View File

@@ -57,7 +57,7 @@ class CpmAntModelTester:
prompt_length=8, prompt_length=8,
prompt_types=8, prompt_types=8,
segment_types=8, segment_types=8,
init_std=1.0, init_std=0.02,
return_dict=True, return_dict=True,
): ):
self.parent = parent self.parent = parent

View File

@@ -194,6 +194,7 @@ class DetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
test_pruning = False test_pruning = False
test_head_masking = False test_head_masking = False
test_missing_keys = False test_missing_keys = False
zero_init_hidden_state = True
# special case for head models # special case for head models
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):

View File

@@ -19,7 +19,7 @@ import unittest
from transformers import DPTConfig from transformers import DPTConfig
from transformers.file_utils import is_torch_available, is_vision_available from transformers.file_utils import is_torch_available, is_vision_available
from transformers.testing_utils import require_torch, require_vision, slow, torch_device from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device
from ...test_configuration_common import ConfigTester from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
@@ -306,6 +306,10 @@ class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
_ = DPTForDepthEstimation(config) _ = DPTForDepthEstimation(config)
@is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
def test_batching_equivalence(self):
    # Runs the inherited batching-equivalence test unchanged; the override exists
    # only so the `is_flaky` decorator (see the linked issue) can be attached.
    super().test_batching_equivalence()
# We will verify our results on an image of cute cats # We will verify our results on an image of cute cats
def prepare_img(): def prepare_img():

View File

@@ -33,11 +33,7 @@ from transformers.testing_utils import (
) )
from ...test_configuration_common import ConfigTester from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ( from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
ModelTesterMixin,
_config_zero_init,
floats_tensor,
)
from ...test_pipeline_mixin import PipelineTesterMixin from ...test_pipeline_mixin import PipelineTesterMixin
@@ -107,6 +103,15 @@ class EncodecModelTester:
config, inputs_dict = self.prepare_config_and_inputs() config, inputs_dict = self.prepare_config_and_inputs()
return config, inputs_dict return config, inputs_dict
def prepare_config_and_inputs_for_model_class(self, model_class):
    # Builds the regular config/inputs pair and adds the extra `audio_codes` and
    # `audio_scales` entries to the inputs dict.
    # `model_class` is accepted but not read anywhere in this method.
    config, inputs_dict = self.prepare_config_and_inputs()
    # int32 tensor of shape [1, batch_size, 1, num_channels]; presumably random ids
    # below `codebook_size` -- confirm against `ids_tensor`.
    inputs_dict["audio_codes"] = ids_tensor([1, self.batch_size, 1, self.num_channels], self.codebook_size).type(
        torch.int32
    )
    # A single-element list holding `None` is passed as the scales.
    inputs_dict["audio_scales"] = [None]

    return config, inputs_dict
def get_config(self): def get_config(self):
return EncodecConfig( return EncodecConfig(
audio_channels=self.num_channels, audio_channels=self.num_channels,

View File

@@ -347,6 +347,13 @@ class FastSpeech2ConformerModelTest(ModelTesterMixin, unittest.TestCase):
def test_model_common_attributes(self): def test_model_common_attributes(self):
pass pass
@unittest.skip(
    # The two literals are joined by implicit concatenation, so the first one must
    # end with its own punctuation and space to keep the reason readable.
    "FastSpeech2Conformer predicts durations in linear domain during inference. "
    "Even small differences on hidden states lead to different durations, due to `torch.round`"
)
def test_batching_equivalence(self):
    # Intentionally empty: the test is skipped for the reason given in the decorator.
    pass
@require_torch @require_torch
@require_g2p_en @require_g2p_en
@@ -762,6 +769,13 @@ class FastSpeech2ConformerWithHifiGanTest(ModelTesterMixin, unittest.TestCase):
def test_model_common_attributes(self): def test_model_common_attributes(self):
pass pass
@unittest.skip(
    # The two literals are joined by implicit concatenation, so the first one must
    # end with its own punctuation and space to keep the reason readable.
    "FastSpeech2Conformer predicts durations in linear domain during inference. "
    "Even small differences on hidden states lead to different durations, due to `torch.round`"
)
def test_batching_equivalence(self):
    # Intentionally empty: the test is skipped for the reason given in the decorator.
    pass
@require_torch @require_torch
@require_g2p_en @require_g2p_en

View File

@@ -836,6 +836,7 @@ class FlavaModelTester:
self.projection_dim = projection_dim self.projection_dim = projection_dim
self.initializer_range = initializer_range self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps self.layer_norm_eps = layer_norm_eps
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
def test_config(self): def test_config(self):
self.config_tester.run_common_tests() self.config_tester.run_common_tests()

View File

@@ -507,6 +507,7 @@ class GroupViTModelTester:
self.parent = parent self.parent = parent
self.text_model_tester = GroupViTTextModelTester(parent, **text_kwargs) self.text_model_tester = GroupViTTextModelTester(parent, **text_kwargs)
self.vision_model_tester = GroupViTVisionModelTester(parent, **vision_kwargs) self.vision_model_tester = GroupViTVisionModelTester(parent, **vision_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.is_training = is_training self.is_training = is_training
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):

View File

@@ -279,6 +279,10 @@ class InformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
def test_determinism(self): def test_determinism(self):
pass pass
@unittest.skip("randomly selects U keys while calculating attentions")
def test_batching_equivalence(self):
    # Intentionally empty: the test is skipped for the reason given in the decorator.
    pass
@unittest.skip( @unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
) )

View File

@@ -397,6 +397,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
self.vision_model_tester = InstructBlipVisionModelTester(parent, **vision_kwargs) self.vision_model_tester = InstructBlipVisionModelTester(parent, **vision_kwargs)
self.qformer_model_tester = InstructBlipQFormerModelTester(parent, **qformer_kwargs) self.qformer_model_tester = InstructBlipQFormerModelTester(parent, **qformer_kwargs)
self.text_model_tester = InstructBlipTextModelDecoderOnlyTester(parent, **text_kwargs) self.text_model_tester = InstructBlipTextModelDecoderOnlyTester(parent, **text_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.is_training = is_training self.is_training = is_training
self.num_query_tokens = num_query_tokens self.num_query_tokens = num_query_tokens

View File

@@ -197,6 +197,7 @@ class Kosmos2ModelTester:
self.parent = parent self.parent = parent
self.text_model_tester = Kosmos2TextModelTester(parent, **text_kwargs) self.text_model_tester = Kosmos2TextModelTester(parent, **text_kwargs)
self.vision_model_tester = Kosmos2VisionModelTester(parent, **vision_kwargs) self.vision_model_tester = Kosmos2VisionModelTester(parent, **vision_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.latent_query_num = latent_query_num self.latent_query_num = latent_query_num
self.is_training = is_training self.is_training = is_training

View File

@@ -27,6 +27,7 @@ from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available(): if is_torch_available():
import torch import torch
import torch.nn.functional as F
from transformers import ( from transformers import (
LayoutLMv2Config, LayoutLMv2Config,
@@ -442,6 +443,64 @@ class LayoutLMv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
msg=f"Parameter {name} of model {model_class} seems not properly initialized", msg=f"Parameter {name} of model {model_class} seems not properly initialized",
) )
def test_batching_equivalence(self):
    """Check that a batched forward pass agrees with a single-row forward pass.

    For every class in `self.all_model_classes`, the model is run once on the full
    batch and once on the leading slice of each input. Each output is then compared
    (first batched row vs. single-row result) using cosine distance, and must be
    free of `nan`/`inf` and within `1e-03`.
    """

    def equivalence(tensor1, tensor2):
        # Cosine distance between the flattened tensors; 0.0 means same direction.
        return 1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, eps=0)

    def recursive_check(batched_object, single_row_object, model_name, key):
        if isinstance(batched_object, (list, tuple)):
            # Nested outputs are compared element by element.
            for batched_object_value, single_row_object_value in zip(batched_object, single_row_object):
                recursive_check(batched_object_value, single_row_object_value, model_name, key)
        elif batched_object is None:
            return
        else:
            batched_row = batched_object[:1]
            self.assertFalse(
                torch.isnan(batched_row).any(), f"Batched output has `nan` in {model_name} for key={key}"
            )
            self.assertFalse(
                torch.isinf(batched_row).any(), f"Batched output has `inf` in {model_name} for key={key}"
            )
            self.assertFalse(
                torch.isnan(single_row_object).any(), f"Single row output has `nan` in {model_name} for key={key}"
            )
            self.assertFalse(
                torch.isinf(single_row_object).any(), f"Single row output has `inf` in {model_name} for key={key}"
            )
            # Compute the distance once and reuse it for the check and the message.
            difference = equivalence(batched_row, single_row_object)
            self.assertTrue(
                difference <= 1e-03,
                msg=(
                    f"Batched and Single row outputs are not equal in {model_name} for key={key}. "
                    f"Difference={difference}."
                ),
            )

    config, batched_input = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_model_classes:
        config.output_hidden_states = True

        model_name = model_class.__name__
        batched_input_prepared = self._prepare_for_class(batched_input, model_class)
        model = model_class(config).to(torch_device).eval()
        batch_size = self.model_tester.batch_size

        single_row_input = {}
        for key, value in batched_input_prepared.items():
            if isinstance(value, torch.Tensor) and value.shape[0] % batch_size == 0:
                single_batch_shape = value.shape[0] // batch_size
                single_row_input[key] = value[:single_batch_shape]
            elif hasattr(value, "tensor"):
                # LayoutLMv2 uses an ImageList instead of pixel values (needed for torchscript).
                # Derive the slice length from this value's own tensor rather than relying on
                # whatever an earlier loop iteration happened to leave in `single_batch_shape`.
                single_batch_shape = value.tensor.shape[0] // batch_size
                single_row_input[key] = value.tensor[:single_batch_shape]

        with torch.no_grad():
            model_batched_output = model(**batched_input_prepared)
            model_row_output = model(**single_row_input)

        for key in model_batched_output:
            recursive_check(model_batched_output[key], model_row_output[key], model_name, key)
def prepare_layoutlmv2_batch_inputs(): def prepare_layoutlmv2_batch_inputs():
# Here we prepare a batch of 2 sequences to test a LayoutLMv2 forward pass on: # Here we prepare a batch of 2 sequences to test a LayoutLMv2 forward pass on:

View File

@@ -388,6 +388,10 @@ class LongformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
# longformer cannot keep gradients in attentions or hidden states # longformer cannot keep gradients in attentions or hidden states
return return
@unittest.skip("LongFormer calculates global attn only when attn_mask has non-zero elements")
def test_batching_equivalence(self):
    # Intentionally empty: the test is skipped for the reason given in the decorator.
    return
@require_torch @require_torch
@require_sentencepiece @require_sentencepiece

View File

@@ -39,6 +39,7 @@ from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available(): if is_torch_available():
import torch import torch
import torch.nn.functional as F
from transformers import MaskFormerForInstanceSegmentation, MaskFormerModel from transformers import MaskFormerForInstanceSegmentation, MaskFormerModel
@@ -206,6 +207,7 @@ class MaskFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
test_pruning = False test_pruning = False
test_head_masking = False test_head_masking = False
test_missing_keys = False test_missing_keys = False
zero_init_hidden_state = True
def setUp(self): def setUp(self):
self.model_tester = MaskFormerModelTester(self) self.model_tester = MaskFormerModelTester(self)
@@ -381,6 +383,67 @@ class MaskFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
self.assertIsNotNone(outputs.auxiliary_logits) self.assertIsNotNone(outputs.auxiliary_logits)
self.assertEqual(len(outputs.auxiliary_logits), self.model_tester.num_channels - 1) self.assertEqual(len(outputs.auxiliary_logits), self.model_tester.num_channels - 1)
def test_batching_equivalence(self):
    """
    MaskFormer-specific batching check.

    For every class in `self.all_model_classes`, the model is run once on the full batch and once on the
    leading `1 / batch_size` slice of each input. Every tensor in the outputs is checked for `nan`/`inf`
    and the first batched row is compared with the single-row output using cosine distance
    (tolerance 1e-03).
    """

    def equivalence(tensor1, tensor2):
        # cosine distance between the flattened tensors; the similarity of two 1-D tensors is already a
        # scalar, so no further reduction is needed
        return 1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, eps=0)

    def recursive_check(batched_object, single_row_object, model_name, key):
        if isinstance(batched_object, (list, tuple)):
            for batched_object_value, single_row_object_value in zip(batched_object, single_row_object):
                recursive_check(batched_object_value, single_row_object_value, model_name, key)
        elif batched_object is None:
            return
        else:
            batched_row = batched_object[:1]
            self.assertFalse(
                torch.isnan(batched_row).any(), f"Batched output has `nan` in {model_name} for key={key}"
            )
            self.assertFalse(
                torch.isinf(batched_row).any(), f"Batched output has `inf` in {model_name} for key={key}"
            )
            self.assertFalse(
                torch.isnan(single_row_object).any(), f"Single row output has `nan` in {model_name} for key={key}"
            )
            self.assertFalse(
                torch.isinf(single_row_object).any(), f"Single row output has `inf` in {model_name} for key={key}"
            )
            # compute the distance once: the failure message is built eagerly, so calling `equivalence`
            # inside the f-string would repeat the work for every tensor, even when the check passes
            difference = equivalence(batched_row, single_row_object)
            self.assertTrue(
                difference <= 1e-03,
                msg=(
                    f"Batched and Single row outputs are not equal in {model_name} for key={key}. "
                    f"Difference={difference}."
                ),
            )

    config, batched_input = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_model_classes:
        config.output_hidden_states = True

        model_name = model_class.__name__
        batched_input_prepared = self._prepare_for_class(batched_input, model_class)
        model = model_class(config).to(torch_device).eval()

        batch_size = self.model_tester.batch_size
        single_row_input = {}
        for key, value in batched_input_prepared.items():
            # keep the leading `1 / batch_size` part of every input
            single_batch_shape = value.shape[0] // batch_size
            single_row_input[key] = value[:single_batch_shape]

        with torch.no_grad():
            model_batched_output = model(**batched_input_prepared)
            model_row_output = model(**single_row_input)

        for key in model_batched_output:
            # remove the first zero-init queries to decoder, otherwise cos_similarity = `nan`
            # no need to check all hidden_states, already checked separately each one
            if key == "transformer_decoder_hidden_states":
                model_batched_output[key] = model_batched_output[key][1:]
                model_row_output[key] = model_row_output[key][1:]
            elif key == "hidden_states":
                continue
            recursive_check(model_batched_output[key], model_row_output[key], model_name, key)
TOLERANCE = 1e-4 TOLERANCE = 1e-4

View File

@@ -18,7 +18,7 @@
import unittest import unittest
from transformers import MobileNetV2Config from transformers import MobileNetV2Config
from transformers.testing_utils import require_torch, require_vision, slow, torch_device from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device
from transformers.utils import cached_property, is_torch_available, is_vision_available from transformers.utils import cached_property, is_torch_available, is_vision_available
from ...test_configuration_common import ConfigTester from ...test_configuration_common import ConfigTester
@@ -271,6 +271,10 @@ class MobileNetV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
model = MobileNetV2Model.from_pretrained(model_name) model = MobileNetV2Model.from_pretrained(model_name)
self.assertIsNotNone(model) self.assertIsNotNone(model)
# Delegates to the inherited `test_batching_equivalence`, wrapped in the project's `is_flaky` decorator.
# NOTE(review): `is_flaky` presumably retries on failure - confirm in `transformers.testing_utils`.
@is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
def test_batching_equivalence(self):
super().test_batching_equivalence()
# We will verify our results on an image of cute cats # We will verify our results on an image of cute cats
def prepare_img(): def prepare_img():

View File

@@ -378,6 +378,10 @@ class MraModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
def test_training_gradient_checkpointing_use_reentrant_false(self): def test_training_gradient_checkpointing_use_reentrant_false(self):
pass pass
# Override of the `ModelTesterMixin.test_batching_equivalence` check for MRA.
# Disabled via `unittest.skip`; the linked issue in the reason string tracks the `nan` hidden states.
@unittest.skip("Model has `nan` in hidden_states, see https://github.com/huggingface/transformers/issues/29373.")
def test_batching_equivalence(self):
pass
@require_torch @require_torch
class MraModelIntegrationTest(unittest.TestCase): class MraModelIntegrationTest(unittest.TestCase):

View File

@@ -103,7 +103,7 @@ class MusicgenDecoderTester:
def __init__( def __init__(
self, self,
parent, parent,
batch_size=2, batch_size=3, # need batch_size != num_hidden_layers
seq_length=7, seq_length=7,
is_training=False, is_training=False,
use_labels=False, use_labels=False,
@@ -441,7 +441,7 @@ class MusicgenTester:
def __init__( def __init__(
self, self,
parent, parent,
batch_size=2, batch_size=3, # need batch_size != num_hidden_layers
seq_length=7, seq_length=7,
is_training=False, is_training=False,
use_labels=False, use_labels=False,

View File

@@ -385,6 +385,7 @@ class Owlv2ModelTester:
self.is_training = is_training self.is_training = is_training
self.text_config = self.text_model_tester.get_config().to_dict() self.text_config = self.text_model_tester.get_config().to_dict()
self.vision_config = self.vision_model_tester.get_config().to_dict() self.vision_config = self.vision_model_tester.get_config().to_dict()
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):
text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
@@ -591,6 +592,7 @@ class Owlv2ForObjectDetectionTester:
self.is_training = is_training self.is_training = is_training
self.text_config = self.text_model_tester.get_config().to_dict() self.text_config = self.text_model_tester.get_config().to_dict()
self.vision_config = self.vision_model_tester.get_config().to_dict() self.vision_config = self.vision_model_tester.get_config().to_dict()
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):
text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()

View File

@@ -381,6 +381,7 @@ class OwlViTModelTester:
self.is_training = is_training self.is_training = is_training
self.text_config = self.text_model_tester.get_config().to_dict() self.text_config = self.text_model_tester.get_config().to_dict()
self.vision_config = self.vision_model_tester.get_config().to_dict() self.vision_config = self.vision_model_tester.get_config().to_dict()
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):
text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
@@ -585,6 +586,7 @@ class OwlViTForObjectDetectionTester:
self.is_training = is_training self.is_training = is_training
self.text_config = self.text_model_tester.get_config().to_dict() self.text_config = self.text_model_tester.get_config().to_dict()
self.vision_config = self.vision_model_tester.get_config().to_dict() self.vision_config = self.vision_model_tester.get_config().to_dict()
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):
text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()

View File

@@ -386,6 +386,7 @@ class Pix2StructModelTester:
self.parent = parent self.parent = parent
self.text_model_tester = Pix2StructTextModelTester(parent, **text_kwargs) self.text_model_tester = Pix2StructTextModelTester(parent, **text_kwargs)
self.vision_model_tester = Pix2StructVisionModelTester(parent, **vision_kwargs) self.vision_model_tester = Pix2StructVisionModelTester(parent, **vision_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.is_training = is_training self.is_training = is_training
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):

View File

@@ -389,6 +389,7 @@ class SiglipModelTester:
self.parent = parent self.parent = parent
self.text_model_tester = SiglipTextModelTester(parent, **text_kwargs) self.text_model_tester = SiglipTextModelTester(parent, **text_kwargs)
self.vision_model_tester = SiglipVisionModelTester(parent, **vision_kwargs) self.vision_model_tester = SiglipVisionModelTester(parent, **vision_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.is_training = is_training self.is_training = is_training
# Copied from tests.models.clip.test_modeling_clip.CLIPModelTester.prepare_config_and_inputs # Copied from tests.models.clip.test_modeling_clip.CLIPModelTester.prepare_config_and_inputs

View File

@@ -916,6 +916,10 @@ class SpeechT5ForTextToSpeechTest(ModelTesterMixin, unittest.TestCase):
def test_determinism(self): def test_determinism(self):
pass pass
# Override of the `ModelTesterMixin.test_batching_equivalence` check, disabled via `unittest.skip`.
# The reason string cites the dropout that is always active in the decoder prenet.
@unittest.skip("skipped because there is always dropout in SpeechT5SpeechDecoderPrenet")
def test_batching_equivalence(self):
pass
def test_forward_signature(self): def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common() config, _ = self.model_tester.prepare_config_and_inputs_for_common()
@@ -1438,6 +1442,10 @@ class SpeechT5ForSpeechToSpeechTest(ModelTesterMixin, unittest.TestCase):
def test_determinism(self): def test_determinism(self):
pass pass
# Override of the `ModelTesterMixin.test_batching_equivalence` check, disabled via `unittest.skip`.
# Same reason as given in the decorator: dropout is always applied in the decoder prenet.
@unittest.skip("skipped because there is always dropout in SpeechT5SpeechDecoderPrenet")
def test_batching_equivalence(self):
pass
def test_attention_outputs(self): def test_attention_outputs(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True config.return_dict = True

View File

@@ -209,6 +209,7 @@ class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, Pipelin
test_pruning = False test_pruning = False
test_head_masking = False test_head_masking = False
test_missing_keys = False test_missing_keys = False
zero_init_hidden_state = True
# special case for head models # special case for head models
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):

View File

@@ -104,6 +104,7 @@ class TimeSeriesTransformerModelTester:
num_static_categorical_features=1, num_static_categorical_features=1,
cardinality=[self.cardinality], cardinality=[self.cardinality],
embedding_dimension=[self.embedding_dimension], embedding_dimension=[self.embedding_dimension],
scaling="std", # we need std to get non-zero `loc`
) )
def prepare_time_series_transformer_inputs_dict(self, config): def prepare_time_series_transformer_inputs_dict(self, config):

View File

@@ -66,13 +66,13 @@ class UnivNetModelTester:
def prepare_noise_sequence(self): def prepare_noise_sequence(self):
generator = torch.manual_seed(self.seed) generator = torch.manual_seed(self.seed)
noise_shape = (self.seq_length, self.in_channels) noise_shape = (self.batch_size, self.seq_length, self.in_channels)
# Create noise on CPU for reproducibility # Create noise on CPU for reproducibility
noise_sequence = torch.randn(noise_shape, generator=generator, dtype=torch.float) noise_sequence = torch.randn(noise_shape, generator=generator, dtype=torch.float)
return noise_sequence return noise_sequence
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):
spectrogram = floats_tensor([self.seq_length, self.num_mel_bins], scale=1.0) spectrogram = floats_tensor([self.batch_size, self.seq_length, self.num_mel_bins], scale=1.0)
noise_sequence = self.prepare_noise_sequence() noise_sequence = self.prepare_noise_sequence()
noise_sequence = noise_sequence.to(spectrogram.device) noise_sequence = noise_sequence.to(spectrogram.device)
config = self.get_config() config = self.get_config()
@@ -89,7 +89,7 @@ class UnivNetModelTester:
def create_and_check_model(self, config, spectrogram, noise_sequence): def create_and_check_model(self, config, spectrogram, noise_sequence):
model = UnivNetModel(config=config).to(torch_device).eval() model = UnivNetModel(config=config).to(torch_device).eval()
result = model(spectrogram, noise_sequence)[0] result = model(spectrogram, noise_sequence)[0]
self.parent.assertEqual(result.shape, (1, self.seq_length * 256)) self.parent.assertEqual(result.shape, (self.batch_size, self.seq_length * 256))
def prepare_config_and_inputs_for_common(self): def prepare_config_and_inputs_for_common(self):
config, spectrogram, noise_sequence = self.prepare_config_and_inputs() config, spectrogram, noise_sequence = self.prepare_config_and_inputs()
@@ -182,8 +182,8 @@ class UnivNetModelTest(ModelTesterMixin, unittest.TestCase):
model.to(torch_device) model.to(torch_device)
model.eval() model.eval()
batched_spectrogram = inputs["input_features"].unsqueeze(0).repeat(2, 1, 1) batched_spectrogram = inputs["input_features"]
batched_noise_sequence = inputs["noise_sequence"].unsqueeze(0).repeat(2, 1, 1) batched_noise_sequence = inputs["noise_sequence"]
with torch.no_grad(): with torch.no_grad():
batched_outputs = model( batched_outputs = model(
batched_spectrogram.to(torch_device), batched_spectrogram.to(torch_device),
@@ -205,37 +205,11 @@ class UnivNetModelTest(ModelTesterMixin, unittest.TestCase):
model.eval() model.eval()
with torch.no_grad(): with torch.no_grad():
outputs = model(inputs["input_features"].to(torch_device), inputs["noise_sequence"].to(torch_device))[ outputs = model(
0 inputs["input_features"][:1].to(torch_device), inputs["noise_sequence"][:1].to(torch_device)
] )[0]
self.assertTrue(outputs.shape[0] == 1, msg="Unbatched input should create batched output with bsz = 1") self.assertTrue(outputs.shape[0] == 1, msg="Unbatched input should create batched output with bsz = 1")
def test_unbatched_batched_outputs_consistency(self):
    """
    Run every model class on the prepared inputs as-is and on the same inputs with an extra leading
    axis added via `unsqueeze(0)`, then require the first element of both outputs to be close.
    """
    config, inputs = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_model_classes:
        model = model_class(config)
        model.to(torch_device)
        model.eval()

        # detached copies of the inputs are fed without an added leading axis
        plain_spectrogram = inputs["input_features"].detach().clone()
        plain_noise = inputs["noise_sequence"].detach().clone()

        # the same inputs with one extra leading axis
        expanded_spectrogram = inputs["input_features"].unsqueeze(0)
        expanded_noise = inputs["noise_sequence"].unsqueeze(0)

        with torch.no_grad():
            unbatched_outputs = model(plain_spectrogram.to(torch_device), plain_noise.to(torch_device))[0]
            batched_outputs = model(expanded_spectrogram.to(torch_device), expanded_noise.to(torch_device))[0]

        torch.testing.assert_close(unbatched_outputs, batched_outputs)
@require_torch_gpu @require_torch_gpu
@slow @slow

View File

@@ -345,6 +345,12 @@ class ViltModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
def test_determinism(self): def test_determinism(self):
pass pass
# Override of the `ModelTesterMixin.test_batching_equivalence` check for ViLT, disabled via `unittest.skip`.
# The reason string explains that the multinomial sampling of image tokens makes hidden states non-deterministic.
@unittest.skip(
"VilT samples image tokens from a multinomial distribution, resulting in not deterministic hidden states"
)
def test_batching_equivalence(self):
pass
@unittest.skip( @unittest.skip(
reason="""VilT samples image tokens from a multinomial distribution, resulting in not deterministic reason="""VilT samples image tokens from a multinomial distribution, resulting in not deterministic
hidden states""" hidden states"""

View File

@@ -18,7 +18,7 @@
import unittest import unittest
from transformers import ViTHybridConfig from transformers import ViTHybridConfig
from transformers.testing_utils import require_accelerate, require_torch, require_vision, slow, torch_device from transformers.testing_utils import is_flaky, require_accelerate, require_torch, require_vision, slow, torch_device
from transformers.utils import cached_property, is_torch_available, is_vision_available from transformers.utils import cached_property, is_torch_available, is_vision_available
from ...test_configuration_common import ConfigTester from ...test_configuration_common import ConfigTester
@@ -221,6 +221,10 @@ class ViTHybridModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
model = ViTHybridModel.from_pretrained(model_name) model = ViTHybridModel.from_pretrained(model_name)
self.assertIsNotNone(model) self.assertIsNotNone(model)
# Delegates to the inherited `test_batching_equivalence`, wrapped in the project's `is_flaky` decorator.
# NOTE(review): `is_flaky` presumably retries on failure - confirm in `transformers.testing_utils`.
@is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
def test_batching_equivalence(self):
super().test_batching_equivalence()
# We will verify our results on an image of cute cats # We will verify our results on an image of cute cats
def prepare_img(): def prepare_img():

View File

@@ -270,6 +270,10 @@ class ViTMAEModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
def test_model_outputs_equivalence(self): def test_model_outputs_equivalence(self):
pass pass
# Override of the `ModelTesterMixin.test_batching_equivalence` check for ViTMAE, disabled via `unittest.skip`.
# Per the reason string, each forward pass returns a random mask and `ids_restore`.
@unittest.skip(reason="ViTMAE returns a random mask + ids_restore in each forward pass")
def test_batching_equivalence(self):
pass
@slow @slow
def test_model_from_pretrained(self): def test_model_from_pretrained(self):
for model_name in VIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: for model_name in VIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:

View File

@@ -216,6 +216,10 @@ class VitsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
def test_determinism(self): def test_determinism(self):
pass pass
# Override of the `ModelTesterMixin.test_batching_equivalence` check for VITS, disabled via `unittest.skip`
# because, per the reason string, the model is not deterministic.
@unittest.skip("VITS is not deterministic")
def test_batching_equivalence(self):
pass
@is_flaky( @is_flaky(
max_attempts=3, max_attempts=3,
description="Weight initialisation for the VITS conv layers sometimes exceeds the kaiming normal range", description="Weight initialisation for the VITS conv layers sometimes exceeds the kaiming normal range",

View File

@@ -190,7 +190,7 @@ class WhisperModelTester:
def __init__( def __init__(
self, self,
parent, parent,
batch_size=2, batch_size=3, # need batch_size != num_hidden_layers
seq_length=60, seq_length=60,
is_training=True, is_training=True,
use_labels=False, use_labels=False,
@@ -1446,6 +1446,7 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
model = WhisperForConditionalGeneration(config).eval().to(torch_device) model = WhisperForConditionalGeneration(config).eval().to(torch_device)
input_features = input_dict["input_features"].to(torch_device) input_features = input_dict["input_features"].to(torch_device)
input_features = input_features[:2]
# len = 250 with num_input_frames = 60 # len = 250 with num_input_frames = 60
long_input_features = torch.cat([input_features.repeat(1, 1, 4), input_features[:, :, :10]], dim=-1) long_input_features = torch.cat([input_features.repeat(1, 1, 4), input_features[:, :, :10]], dim=-1)
@@ -2626,7 +2627,7 @@ class WhisperEncoderModelTester:
def __init__( def __init__(
self, self,
parent, parent,
batch_size=2, batch_size=3, # need batch_size != num_hidden_layers
seq_length=60, seq_length=60,
is_training=True, is_training=True,
use_labels=True, use_labels=True,
@@ -2997,7 +2998,7 @@ class WhisperStandaloneDecoderModelTester:
def __init__( def __init__(
self, self,
parent, parent,
batch_size=2, batch_size=3, # need batch_size != num_hidden_layers
is_training=True, is_training=True,
use_labels=False, use_labels=False,
vocab_size=200, vocab_size=200,

View File

@@ -479,6 +479,7 @@ class XCLIPModelTester:
self.mit_hidden_size = mit_hidden_size self.mit_hidden_size = mit_hidden_size
self.text_model_tester = XCLIPTextModelTester(parent, **text_kwargs) self.text_model_tester = XCLIPTextModelTester(parent, **text_kwargs)
self.vision_model_tester = XCLIPVisionModelTester(parent, **vision_kwargs) self.vision_model_tester = XCLIPVisionModelTester(parent, **vision_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.is_training = is_training self.is_training = is_training
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):

View File

@@ -99,6 +99,7 @@ if is_accelerate_available():
if is_torch_available(): if is_torch_available():
import torch import torch
import torch.nn.functional as F
from safetensors.torch import load_file as safe_load_file from safetensors.torch import load_file as safe_load_file
from safetensors.torch import save_file as safe_save_file from safetensors.torch import save_file as safe_save_file
from torch import nn from torch import nn
@@ -693,6 +694,99 @@ class ModelTesterMixin:
expected_arg_names = [model.main_input_name] expected_arg_names = [model.main_input_name]
self.assertListEqual(arg_names[:1], expected_arg_names) self.assertListEqual(arg_names[:1], expected_arg_names)
def test_batching_equivalence(self):
    """
    Tests that the model supports batching and that the output is nearly the same for the same input in
    different batch sizes.
    (Why "nearly the same" not "exactly the same"? Batching uses different matmul shapes, which often leads to
    different results: https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535)

    For every class in `self.all_model_classes`, the model is run once on the full batch and once on the
    leading `1 / batch_size` slice of each tensor input. Every tensor found in the outputs is checked for
    `nan`/`inf` and compared against its single-row counterpart with a tolerance of 1e-03.
    """

    def get_tensor_equivalence_function(batched_input):
        # models operating on continuous spaces have higher abs difference than LMs
        # instead, we can rely on cos distance for image/speech models, similar to `diffusers`
        if "input_ids" not in batched_input:
            return lambda tensor1, tensor2: (
                1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, eps=1e-38)
            )
        return lambda tensor1, tensor2: torch.max(torch.abs(tensor1 - tensor2))

    def recursive_check(batched_object, single_row_object, model_name, key):
        if isinstance(batched_object, (list, tuple)):
            for batched_object_value, single_row_object_value in zip(batched_object, single_row_object):
                recursive_check(batched_object_value, single_row_object_value, model_name, key)
        elif isinstance(batched_object, dict):
            for batched_object_value, single_row_object_value in zip(
                batched_object.values(), single_row_object.values()
            ):
                recursive_check(batched_object_value, single_row_object_value, model_name, key)
        # do not compare returned loss (0-dim tensor) or codebook ids (int)
        elif batched_object is None or isinstance(batched_object, int):
            return
        elif batched_object.dim() == 0:
            return
        else:
            # indexing the first element does not always work
            # e.g. models that output similarity scores of size (N, M) would need to index [0, 0]
            # the index must be a tuple: a list of slices is deprecated non-tuple multidimensional indexing
            slice_ids = tuple(slice(0, index) for index in single_row_object.shape)
            batched_row = batched_object[slice_ids]
            self.assertFalse(
                torch.isnan(batched_row).any(), f"Batched output has `nan` in {model_name} for key={key}"
            )
            self.assertFalse(
                torch.isinf(batched_row).any(), f"Batched output has `inf` in {model_name} for key={key}"
            )
            self.assertFalse(
                torch.isnan(single_row_object).any(), f"Single row output has `nan` in {model_name} for key={key}"
            )
            self.assertFalse(
                torch.isinf(single_row_object).any(), f"Single row output has `inf` in {model_name} for key={key}"
            )
            # compute the distance once: the failure message is built eagerly, so calling `equivalence`
            # inside the f-string would repeat the work for every tensor, even when the check passes
            difference = equivalence(batched_row, single_row_object)
            self.assertTrue(
                difference <= 1e-03,
                msg=(
                    f"Batched and Single row outputs are not equal in {model_name} for key={key}. "
                    f"Difference={difference}."
                ),
            )

    config, batched_input = self.model_tester.prepare_config_and_inputs_for_common()
    equivalence = get_tensor_equivalence_function(batched_input)

    # read the flag's value rather than its mere presence, so `zero_init_hidden_state = False` opts out
    skip_first_decoder_state = getattr(self, "zero_init_hidden_state", False)

    for model_class in self.all_model_classes:
        config.output_hidden_states = True

        model_name = model_class.__name__
        if hasattr(self.model_tester, "prepare_config_and_inputs_for_model_class"):
            # NOTE(review): this replaces the `config` on which `output_hidden_states` was just set;
            # confirm whether the per-class config is expected to enable it as well
            config, batched_input = self.model_tester.prepare_config_and_inputs_for_model_class(model_class)
        batched_input_prepared = self._prepare_for_class(batched_input, model_class)
        model = model_class(config).to(torch_device).eval()

        batch_size = self.model_tester.batch_size
        single_row_input = {}
        for key, value in batched_input_prepared.items():
            if isinstance(value, torch.Tensor) and value.shape[0] % batch_size == 0:
                # e.g. musicgen has inputs of size (bs*codebooks). in most cases value.shape[0] == batch_size
                single_batch_shape = value.shape[0] // batch_size
                single_row_input[key] = value[:single_batch_shape]
            else:
                # non-tensor inputs, or tensors whose first dim is not a multiple of batch_size, pass through
                single_row_input[key] = value

        with torch.no_grad():
            model_batched_output = model(**batched_input_prepared)
            model_row_output = model(**single_row_input)

        # a bare tensor output is wrapped so the key-based loop below can handle it
        if isinstance(model_batched_output, torch.Tensor):
            model_batched_output = {"model_output": model_batched_output}
            model_row_output = {"model_output": model_row_output}

        for key in model_batched_output:
            # DETR starts from zero-init queries to decoder, leading to cos_similarity = `nan`
            if skip_first_decoder_state and "decoder_hidden_states" in key:
                model_batched_output[key] = model_batched_output[key][1:]
                model_row_output[key] = model_row_output[key][1:]
            recursive_check(model_batched_output[key], model_row_output[key], model_name, key)
def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None):
if not self.model_tester.is_training: if not self.model_tester.is_training:
return return