Add tests for batching support (#29297)
* add tests for batching support * Update src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update tests/test_modeling_common.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update tests/test_modeling_common.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update tests/test_modeling_common.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * fixes and comments * use cosine distance for conv models * skip mra model testing * Update tests/models/vilt/test_modeling_vilt.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * finzalize and make style * check model type by input names * Update tests/models/vilt/test_modeling_vilt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * fixed batch size for all testers * Revert "fixed batch size for all testers" This reverts commit 525f3a0a058f069fbda00352cf202b728d40df99. * add batch_size for all testers * dict from model output * do not skip layoutlm * bring back some code from git revert * Update tests/test_modeling_common.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/test_modeling_common.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * clean-up * where did minus go in tolerance * make whisper happy * deal with consequences of losing minus * deal with consequences of losing minus * maskformer needs its own test for happiness * fix more models * tag flaky CV models from Amy's approval * make codestyle --------- Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
11163fff58
commit
8e64ba2890
@@ -1292,7 +1292,7 @@ class CLIPSegDecoder(CLIPSegPreTrainedModel):
|
|||||||
batch_size = conditional_embeddings.shape[0]
|
batch_size = conditional_embeddings.shape[0]
|
||||||
output = output.view(batch_size, output.shape[1], size, size)
|
output = output.view(batch_size, output.shape[1], size, size)
|
||||||
|
|
||||||
logits = self.transposed_convolution(output).squeeze()
|
logits = self.transposed_convolution(output).squeeze(1)
|
||||||
|
|
||||||
if not return_dict:
|
if not return_dict:
|
||||||
return tuple(v for v in [logits, all_hidden_states, all_attentions] if v is not None)
|
return tuple(v for v in [logits, all_hidden_states, all_attentions] if v is not None)
|
||||||
|
|||||||
@@ -51,13 +51,13 @@ ENCODEC_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
|||||||
class EncodecOutput(ModelOutput):
|
class EncodecOutput(ModelOutput):
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
audio_codes (`torch.FloatTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
|
audio_codes (`torch.LongTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
|
||||||
Discret code embeddings computed using `model.encode`.
|
Discret code embeddings computed using `model.encode`.
|
||||||
audio_values (`torch.FlaotTensor` of shape `(batch_size, sequence_length)`, *optional*)
|
audio_values (`torch.FlaotTensor` of shape `(batch_size, sequence_length)`, *optional*)
|
||||||
Decoded audio values, obtained using the decoder part of Encodec.
|
Decoded audio values, obtained using the decoder part of Encodec.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
audio_codes: torch.FloatTensor = None
|
audio_codes: torch.LongTensor = None
|
||||||
audio_values: torch.FloatTensor = None
|
audio_values: torch.FloatTensor = None
|
||||||
|
|
||||||
|
|
||||||
@@ -65,13 +65,13 @@ class EncodecOutput(ModelOutput):
|
|||||||
class EncodecEncoderOutput(ModelOutput):
|
class EncodecEncoderOutput(ModelOutput):
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
audio_codes (`torch.FloatTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
|
audio_codes (`torch.LongTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
|
||||||
Discret code embeddings computed using `model.encode`.
|
Discret code embeddings computed using `model.encode`.
|
||||||
audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
|
audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
|
||||||
Scaling factor for each `audio_codes` input. This is used to unscale each chunk of audio when decoding.
|
Scaling factor for each `audio_codes` input. This is used to unscale each chunk of audio when decoding.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
audio_codes: torch.FloatTensor = None
|
audio_codes: torch.LongTensor = None
|
||||||
audio_scales: torch.FloatTensor = None
|
audio_scales: torch.FloatTensor = None
|
||||||
|
|
||||||
|
|
||||||
@@ -514,7 +514,7 @@ ENCODEC_INPUTS_DOCSTRING = r"""
|
|||||||
The target bandwidth. Must be one of `config.target_bandwidths`. If `None`, uses the smallest possible
|
The target bandwidth. Must be one of `config.target_bandwidths`. If `None`, uses the smallest possible
|
||||||
bandwidth. bandwidth is represented as a thousandth of what it is, e.g. 6kbps bandwidth is represented as
|
bandwidth. bandwidth is represented as a thousandth of what it is, e.g. 6kbps bandwidth is represented as
|
||||||
`bandwidth == 6.0`
|
`bandwidth == 6.0`
|
||||||
audio_codes (`torch.FloatTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
|
audio_codes (`torch.LongTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
|
||||||
Discret code embeddings computed using `model.encode`.
|
Discret code embeddings computed using `model.encode`.
|
||||||
audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
|
audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
|
||||||
Scaling factor for each `audio_codes` input.
|
Scaling factor for each `audio_codes` input.
|
||||||
@@ -718,7 +718,7 @@ class EncodecModel(EncodecPreTrainedModel):
|
|||||||
trimmed.
|
trimmed.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
audio_codes (`torch.FloatTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
|
audio_codes (`torch.LongTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
|
||||||
Discret code embeddings computed using `model.encode`.
|
Discret code embeddings computed using `model.encode`.
|
||||||
audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
|
audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
|
||||||
Scaling factor for each `audio_codes` input.
|
Scaling factor for each `audio_codes` input.
|
||||||
|
|||||||
@@ -776,7 +776,7 @@ class FunnelDiscriminatorPredictions(nn.Module):
|
|||||||
def forward(self, discriminator_hidden_states: torch.Tensor) -> torch.Tensor:
|
def forward(self, discriminator_hidden_states: torch.Tensor) -> torch.Tensor:
|
||||||
hidden_states = self.dense(discriminator_hidden_states)
|
hidden_states = self.dense(discriminator_hidden_states)
|
||||||
hidden_states = ACT2FN[self.config.hidden_act](hidden_states)
|
hidden_states = ACT2FN[self.config.hidden_act](hidden_states)
|
||||||
logits = self.dense_prediction(hidden_states).squeeze()
|
logits = self.dense_prediction(hidden_states).squeeze(-1)
|
||||||
return logits
|
return logits
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -679,7 +679,7 @@ class TvpFramePadPrompter(nn.Module):
|
|||||||
prompt = torch.cat([self.pad_left, base, self.pad_right], dim=4)
|
prompt = torch.cat([self.pad_left, base, self.pad_right], dim=4)
|
||||||
prompt = torch.cat([self.pad_up, prompt, self.pad_down], dim=3)
|
prompt = torch.cat([self.pad_up, prompt, self.pad_down], dim=3)
|
||||||
prompt = torch.cat(pixel_values.size(0) * [prompt])
|
prompt = torch.cat(pixel_values.size(0) * [prompt])
|
||||||
pixel_values += prompt.to(pixel_values.dtype)
|
pixel_values = pixel_values + prompt.to(pixel_values.dtype)
|
||||||
return pixel_values
|
return pixel_values
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -371,10 +371,12 @@ class YosoSelfAttention(nn.Module):
|
|||||||
key_layer = key_layer.reshape(batch_size * num_heads, seq_len, head_dim)
|
key_layer = key_layer.reshape(batch_size * num_heads, seq_len, head_dim)
|
||||||
value_layer = value_layer.reshape(batch_size * num_heads, seq_len, head_dim)
|
value_layer = value_layer.reshape(batch_size * num_heads, seq_len, head_dim)
|
||||||
|
|
||||||
# revert changes made by get_extended_attention_mask
|
|
||||||
attention_mask = 1.0 + attention_mask / 10000.0
|
attention_mask = 1.0 + attention_mask / 10000.0
|
||||||
attention_mask = (
|
attention_mask = (
|
||||||
attention_mask.squeeze().repeat(1, num_heads, 1).reshape(batch_size * num_heads, seq_len).int()
|
attention_mask.unsqueeze(1)
|
||||||
|
.repeat_interleave(num_heads, dim=1)
|
||||||
|
.reshape(batch_size * num_heads, seq_len)
|
||||||
|
.int()
|
||||||
)
|
)
|
||||||
|
|
||||||
# The CUDA kernels are most efficient with inputs whose size is a multiple of a GPU's warp size (32). Inputs
|
# The CUDA kernels are most efficient with inputs whose size is a multiple of a GPU's warp size (32). Inputs
|
||||||
@@ -808,10 +810,6 @@ class YosoModel(YosoPreTrainedModel):
|
|||||||
else:
|
else:
|
||||||
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
|
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
|
||||||
|
|
||||||
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
|
|
||||||
# ourselves in which case we just need to make it broadcastable to all heads.
|
|
||||||
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
|
|
||||||
|
|
||||||
# Prepare head mask if needed
|
# Prepare head mask if needed
|
||||||
# 1.0 in head_mask indicate we keep the head
|
# 1.0 in head_mask indicate we keep the head
|
||||||
# attention_probs has shape bsz x n_heads x N x N
|
# attention_probs has shape bsz x n_heads x N x N
|
||||||
@@ -827,7 +825,7 @@ class YosoModel(YosoPreTrainedModel):
|
|||||||
)
|
)
|
||||||
encoder_outputs = self.encoder(
|
encoder_outputs = self.encoder(
|
||||||
embedding_output,
|
embedding_output,
|
||||||
attention_mask=extended_attention_mask,
|
attention_mask=attention_mask,
|
||||||
head_mask=head_mask,
|
head_mask=head_mask,
|
||||||
output_attentions=output_attentions,
|
output_attentions=output_attentions,
|
||||||
output_hidden_states=output_hidden_states,
|
output_hidden_states=output_hidden_states,
|
||||||
|
|||||||
@@ -405,6 +405,7 @@ class AlignModelTester:
|
|||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.text_model_tester = AlignTextModelTester(parent, **text_kwargs)
|
self.text_model_tester = AlignTextModelTester(parent, **text_kwargs)
|
||||||
self.vision_model_tester = AlignVisionModelTester(parent, **vision_kwargs)
|
self.vision_model_tester = AlignVisionModelTester(parent, **vision_kwargs)
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
|
|||||||
@@ -380,6 +380,7 @@ class AltCLIPModelTester:
|
|||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.text_model_tester = AltCLIPTextModelTester(parent, **text_kwargs)
|
self.text_model_tester = AltCLIPTextModelTester(parent, **text_kwargs)
|
||||||
self.vision_model_tester = AltCLIPVisionModelTester(parent, **vision_kwargs)
|
self.vision_model_tester = AltCLIPVisionModelTester(parent, **vision_kwargs)
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
|
|||||||
@@ -107,6 +107,7 @@ class AutoformerModelTester:
|
|||||||
cardinality=[self.cardinality],
|
cardinality=[self.cardinality],
|
||||||
embedding_dimension=[self.embedding_dimension],
|
embedding_dimension=[self.embedding_dimension],
|
||||||
moving_average=self.moving_average,
|
moving_average=self.moving_average,
|
||||||
|
scaling="std", # we need std to get non-zero `loc`
|
||||||
)
|
)
|
||||||
|
|
||||||
def prepare_autoformer_inputs_dict(self, config):
|
def prepare_autoformer_inputs_dict(self, config):
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ class BarkSemanticModelTester:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
parent,
|
parent,
|
||||||
batch_size=2,
|
batch_size=3, # need batch_size != num_hidden_layers
|
||||||
seq_length=4,
|
seq_length=4,
|
||||||
is_training=False, # for now training is not supported
|
is_training=False, # for now training is not supported
|
||||||
use_input_mask=True,
|
use_input_mask=True,
|
||||||
@@ -203,7 +203,7 @@ class BarkCoarseModelTester:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
parent,
|
parent,
|
||||||
batch_size=2,
|
batch_size=3, # need batch_size != num_hidden_layers
|
||||||
seq_length=4,
|
seq_length=4,
|
||||||
is_training=False, # for now training is not supported
|
is_training=False, # for now training is not supported
|
||||||
use_input_mask=True,
|
use_input_mask=True,
|
||||||
@@ -339,7 +339,7 @@ class BarkFineModelTester:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
parent,
|
parent,
|
||||||
batch_size=2,
|
batch_size=3, # need batch_size != num_hidden_layers
|
||||||
seq_length=4,
|
seq_length=4,
|
||||||
is_training=False, # for now training is not supported
|
is_training=False, # for now training is not supported
|
||||||
use_input_mask=True,
|
use_input_mask=True,
|
||||||
|
|||||||
@@ -387,6 +387,7 @@ class BlipModelTester:
|
|||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.text_model_tester = BlipTextModelTester(parent, **text_kwargs)
|
self.text_model_tester = BlipTextModelTester(parent, **text_kwargs)
|
||||||
self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs)
|
self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs)
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
@@ -596,6 +597,7 @@ class BlipTextRetrievalModelTester:
|
|||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.text_model_tester = BlipTextModelTester(parent, **text_kwargs)
|
self.text_model_tester = BlipTextModelTester(parent, **text_kwargs)
|
||||||
self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs)
|
self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs)
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
@@ -643,6 +645,7 @@ class BlipTextImageModelsModelTester:
|
|||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.text_model_tester = BlipTextModelTester(parent, **text_kwargs)
|
self.text_model_tester = BlipTextModelTester(parent, **text_kwargs)
|
||||||
self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs)
|
self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs)
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
@@ -691,6 +694,7 @@ class BlipVQAModelTester:
|
|||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.text_model_tester = BlipTextModelTester(parent, **text_kwargs)
|
self.text_model_tester = BlipTextModelTester(parent, **text_kwargs)
|
||||||
self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs)
|
self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs)
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
|
|||||||
@@ -390,6 +390,7 @@ class Blip2ForConditionalGenerationDecoderOnlyModelTester:
|
|||||||
self.vision_model_tester = Blip2VisionModelTester(parent, **vision_kwargs)
|
self.vision_model_tester = Blip2VisionModelTester(parent, **vision_kwargs)
|
||||||
self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs)
|
self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs)
|
||||||
self.text_model_tester = Blip2TextModelDecoderOnlyTester(parent, **text_kwargs)
|
self.text_model_tester = Blip2TextModelDecoderOnlyTester(parent, **text_kwargs)
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
self.num_query_tokens = num_query_tokens
|
self.num_query_tokens = num_query_tokens
|
||||||
|
|
||||||
@@ -616,6 +617,7 @@ class Blip2ModelTester:
|
|||||||
self.vision_model_tester = Blip2VisionModelTester(parent, **vision_kwargs)
|
self.vision_model_tester = Blip2VisionModelTester(parent, **vision_kwargs)
|
||||||
self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs)
|
self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs)
|
||||||
self.text_model_tester = Blip2TextModelTester(parent, **text_kwargs)
|
self.text_model_tester = Blip2TextModelTester(parent, **text_kwargs)
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
self.num_query_tokens = num_query_tokens
|
self.num_query_tokens = num_query_tokens
|
||||||
|
|
||||||
|
|||||||
@@ -510,6 +510,7 @@ class ChineseCLIPModelTester:
|
|||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.text_model_tester = ChineseCLIPTextModelTester(parent, **text_kwargs)
|
self.text_model_tester = ChineseCLIPTextModelTester(parent, **text_kwargs)
|
||||||
self.vision_model_tester = ChineseCLIPVisionModelTester(parent, **vision_kwargs)
|
self.vision_model_tester = ChineseCLIPVisionModelTester(parent, **vision_kwargs)
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
|
|||||||
@@ -466,6 +466,7 @@ class ClapModelTester:
|
|||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.text_model_tester = ClapTextModelTester(parent, **text_kwargs)
|
self.text_model_tester = ClapTextModelTester(parent, **text_kwargs)
|
||||||
self.audio_model_tester = ClapAudioModelTester(parent, **audio_kwargs)
|
self.audio_model_tester = ClapAudioModelTester(parent, **audio_kwargs)
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
|
|||||||
@@ -437,6 +437,7 @@ class CLIPModelTester:
|
|||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.text_model_tester = CLIPTextModelTester(parent, **text_kwargs)
|
self.text_model_tester = CLIPTextModelTester(parent, **text_kwargs)
|
||||||
self.vision_model_tester = CLIPVisionModelTester(parent, **vision_kwargs)
|
self.vision_model_tester = CLIPVisionModelTester(parent, **vision_kwargs)
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
|
|||||||
@@ -388,6 +388,7 @@ class CLIPSegModelTester:
|
|||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.text_model_tester = CLIPSegTextModelTester(parent, **text_kwargs)
|
self.text_model_tester = CLIPSegTextModelTester(parent, **text_kwargs)
|
||||||
self.vision_model_tester = CLIPSegVisionModelTester(parent, **vision_kwargs)
|
self.vision_model_tester = CLIPSegVisionModelTester(parent, **vision_kwargs)
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
self.extract_layers = extract_layers
|
self.extract_layers = extract_layers
|
||||||
|
|
||||||
|
|||||||
@@ -344,6 +344,7 @@ class ClvpModelForConditionalGenerationTester:
|
|||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.clvp_encoder_tester = ClvpEncoderTester(parent)
|
self.clvp_encoder_tester = ClvpEncoderTester(parent)
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
|
self.batch_size = self.clvp_encoder_tester.batch_size # need bs for batching_equivalence test
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
decoder_config = ClvpDecoderConfig(
|
decoder_config = ClvpDecoderConfig(
|
||||||
|
|||||||
@@ -194,6 +194,7 @@ class ConditionalDetrModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline
|
|||||||
test_pruning = False
|
test_pruning = False
|
||||||
test_head_masking = False
|
test_head_masking = False
|
||||||
test_missing_keys = False
|
test_missing_keys = False
|
||||||
|
zero_init_hidden_state = True
|
||||||
|
|
||||||
# special case for head models
|
# special case for head models
|
||||||
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
|
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ class CpmAntModelTester:
|
|||||||
prompt_length=8,
|
prompt_length=8,
|
||||||
prompt_types=8,
|
prompt_types=8,
|
||||||
segment_types=8,
|
segment_types=8,
|
||||||
init_std=1.0,
|
init_std=0.02,
|
||||||
return_dict=True,
|
return_dict=True,
|
||||||
):
|
):
|
||||||
self.parent = parent
|
self.parent = parent
|
||||||
|
|||||||
@@ -194,6 +194,7 @@ class DetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
|
|||||||
test_pruning = False
|
test_pruning = False
|
||||||
test_head_masking = False
|
test_head_masking = False
|
||||||
test_missing_keys = False
|
test_missing_keys = False
|
||||||
|
zero_init_hidden_state = True
|
||||||
|
|
||||||
# special case for head models
|
# special case for head models
|
||||||
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
|
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ import unittest
|
|||||||
|
|
||||||
from transformers import DPTConfig
|
from transformers import DPTConfig
|
||||||
from transformers.file_utils import is_torch_available, is_vision_available
|
from transformers.file_utils import is_torch_available, is_vision_available
|
||||||
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
|
from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device
|
||||||
|
|
||||||
from ...test_configuration_common import ConfigTester
|
from ...test_configuration_common import ConfigTester
|
||||||
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
|
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
|
||||||
@@ -306,6 +306,10 @@ class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
|||||||
with self.assertRaises(ValueError):
|
with self.assertRaises(ValueError):
|
||||||
_ = DPTForDepthEstimation(config)
|
_ = DPTForDepthEstimation(config)
|
||||||
|
|
||||||
|
@is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
|
||||||
|
def test_batching_equivalence(self):
|
||||||
|
super().test_batching_equivalence()
|
||||||
|
|
||||||
|
|
||||||
# We will verify our results on an image of cute cats
|
# We will verify our results on an image of cute cats
|
||||||
def prepare_img():
|
def prepare_img():
|
||||||
|
|||||||
@@ -33,11 +33,7 @@ from transformers.testing_utils import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
from ...test_configuration_common import ConfigTester
|
from ...test_configuration_common import ConfigTester
|
||||||
from ...test_modeling_common import (
|
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
|
||||||
ModelTesterMixin,
|
|
||||||
_config_zero_init,
|
|
||||||
floats_tensor,
|
|
||||||
)
|
|
||||||
from ...test_pipeline_mixin import PipelineTesterMixin
|
from ...test_pipeline_mixin import PipelineTesterMixin
|
||||||
|
|
||||||
|
|
||||||
@@ -107,6 +103,15 @@ class EncodecModelTester:
|
|||||||
config, inputs_dict = self.prepare_config_and_inputs()
|
config, inputs_dict = self.prepare_config_and_inputs()
|
||||||
return config, inputs_dict
|
return config, inputs_dict
|
||||||
|
|
||||||
|
def prepare_config_and_inputs_for_model_class(self, model_class):
|
||||||
|
config, inputs_dict = self.prepare_config_and_inputs()
|
||||||
|
inputs_dict["audio_codes"] = ids_tensor([1, self.batch_size, 1, self.num_channels], self.codebook_size).type(
|
||||||
|
torch.int32
|
||||||
|
)
|
||||||
|
inputs_dict["audio_scales"] = [None]
|
||||||
|
|
||||||
|
return config, inputs_dict
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return EncodecConfig(
|
return EncodecConfig(
|
||||||
audio_channels=self.num_channels,
|
audio_channels=self.num_channels,
|
||||||
|
|||||||
@@ -347,6 +347,13 @@ class FastSpeech2ConformerModelTest(ModelTesterMixin, unittest.TestCase):
|
|||||||
def test_model_common_attributes(self):
|
def test_model_common_attributes(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@unittest.skip(
|
||||||
|
"FastSpeech2Conformer predicts durations in linear domain during inference"
|
||||||
|
"Even small differences on hidden states lead to different durations, due to `torch.round`"
|
||||||
|
)
|
||||||
|
def test_batching_equivalence(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
@require_g2p_en
|
@require_g2p_en
|
||||||
@@ -762,6 +769,13 @@ class FastSpeech2ConformerWithHifiGanTest(ModelTesterMixin, unittest.TestCase):
|
|||||||
def test_model_common_attributes(self):
|
def test_model_common_attributes(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@unittest.skip(
|
||||||
|
"FastSpeech2Conformer predicts durations in linear domain during inference"
|
||||||
|
"Even small differences on hidden states lead to different durations, due to `torch.round`"
|
||||||
|
)
|
||||||
|
def test_batching_equivalence(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
@require_g2p_en
|
@require_g2p_en
|
||||||
|
|||||||
@@ -836,6 +836,7 @@ class FlavaModelTester:
|
|||||||
self.projection_dim = projection_dim
|
self.projection_dim = projection_dim
|
||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
self.layer_norm_eps = layer_norm_eps
|
self.layer_norm_eps = layer_norm_eps
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
|
|
||||||
def test_config(self):
|
def test_config(self):
|
||||||
self.config_tester.run_common_tests()
|
self.config_tester.run_common_tests()
|
||||||
|
|||||||
@@ -507,6 +507,7 @@ class GroupViTModelTester:
|
|||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.text_model_tester = GroupViTTextModelTester(parent, **text_kwargs)
|
self.text_model_tester = GroupViTTextModelTester(parent, **text_kwargs)
|
||||||
self.vision_model_tester = GroupViTVisionModelTester(parent, **vision_kwargs)
|
self.vision_model_tester = GroupViTVisionModelTester(parent, **vision_kwargs)
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
|
|||||||
@@ -279,6 +279,10 @@ class InformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
|
|||||||
def test_determinism(self):
|
def test_determinism(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@unittest.skip("randomly selects U keys while calculating attentions")
|
||||||
|
def test_batching_equivalence(self):
|
||||||
|
pass
|
||||||
|
|
||||||
@unittest.skip(
|
@unittest.skip(
|
||||||
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -397,6 +397,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
|
|||||||
self.vision_model_tester = InstructBlipVisionModelTester(parent, **vision_kwargs)
|
self.vision_model_tester = InstructBlipVisionModelTester(parent, **vision_kwargs)
|
||||||
self.qformer_model_tester = InstructBlipQFormerModelTester(parent, **qformer_kwargs)
|
self.qformer_model_tester = InstructBlipQFormerModelTester(parent, **qformer_kwargs)
|
||||||
self.text_model_tester = InstructBlipTextModelDecoderOnlyTester(parent, **text_kwargs)
|
self.text_model_tester = InstructBlipTextModelDecoderOnlyTester(parent, **text_kwargs)
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
self.num_query_tokens = num_query_tokens
|
self.num_query_tokens = num_query_tokens
|
||||||
|
|
||||||
|
|||||||
@@ -197,6 +197,7 @@ class Kosmos2ModelTester:
|
|||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.text_model_tester = Kosmos2TextModelTester(parent, **text_kwargs)
|
self.text_model_tester = Kosmos2TextModelTester(parent, **text_kwargs)
|
||||||
self.vision_model_tester = Kosmos2VisionModelTester(parent, **vision_kwargs)
|
self.vision_model_tester = Kosmos2VisionModelTester(parent, **vision_kwargs)
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
self.latent_query_num = latent_query_num
|
self.latent_query_num = latent_query_num
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
|
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ from ...test_pipeline_mixin import PipelineTesterMixin
|
|||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
from transformers import (
|
from transformers import (
|
||||||
LayoutLMv2Config,
|
LayoutLMv2Config,
|
||||||
@@ -442,6 +443,64 @@ class LayoutLMv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
|
|||||||
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
|
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_batching_equivalence(self):
|
||||||
|
def equivalence(tensor1, tensor2):
|
||||||
|
return 1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, eps=0)
|
||||||
|
|
||||||
|
def recursive_check(batched_object, single_row_object, model_name, key):
|
||||||
|
if isinstance(batched_object, (list, tuple)):
|
||||||
|
for batched_object_value, single_row_object_value in zip(batched_object, single_row_object):
|
||||||
|
recursive_check(batched_object_value, single_row_object_value, model_name, key)
|
||||||
|
elif batched_object is None:
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
batched_row = batched_object[:1]
|
||||||
|
self.assertFalse(
|
||||||
|
torch.isnan(batched_row).any(), f"Batched output has `nan` in {model_name} for key={key}"
|
||||||
|
)
|
||||||
|
self.assertFalse(
|
||||||
|
torch.isinf(batched_row).any(), f"Batched output has `inf` in {model_name} for key={key}"
|
||||||
|
)
|
||||||
|
self.assertFalse(
|
||||||
|
torch.isnan(single_row_object).any(), f"Single row output has `nan` in {model_name} for key={key}"
|
||||||
|
)
|
||||||
|
self.assertFalse(
|
||||||
|
torch.isinf(single_row_object).any(), f"Single row output has `inf` in {model_name} for key={key}"
|
||||||
|
)
|
||||||
|
self.assertTrue(
|
||||||
|
(equivalence(batched_row, single_row_object)) <= 1e-03,
|
||||||
|
msg=(
|
||||||
|
f"Batched and Single row outputs are not equal in {model_name} for key={key}. "
|
||||||
|
f"Difference={equivalence(batched_row, single_row_object)}."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
config, batched_input = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|
||||||
|
for model_class in self.all_model_classes:
|
||||||
|
config.output_hidden_states = True
|
||||||
|
|
||||||
|
model_name = model_class.__name__
|
||||||
|
batched_input_prepared = self._prepare_for_class(batched_input, model_class)
|
||||||
|
model = model_class(config).to(torch_device).eval()
|
||||||
|
batch_size = self.model_tester.batch_size
|
||||||
|
|
||||||
|
single_row_input = {}
|
||||||
|
for key, value in batched_input_prepared.items():
|
||||||
|
if isinstance(value, torch.Tensor) and value.shape[0] % batch_size == 0:
|
||||||
|
single_batch_shape = value.shape[0] // batch_size
|
||||||
|
single_row_input[key] = value[:single_batch_shape]
|
||||||
|
elif hasattr(value, "tensor"):
|
||||||
|
# layoutlmv2 uses ImageList instead of pixel values (needed for torchscript)
|
||||||
|
single_row_input[key] = value.tensor[:single_batch_shape]
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
model_batched_output = model(**batched_input_prepared)
|
||||||
|
model_row_output = model(**single_row_input)
|
||||||
|
|
||||||
|
for key in model_batched_output:
|
||||||
|
recursive_check(model_batched_output[key], model_row_output[key], model_name, key)
|
||||||
|
|
||||||
|
|
||||||
def prepare_layoutlmv2_batch_inputs():
|
def prepare_layoutlmv2_batch_inputs():
|
||||||
# Here we prepare a batch of 2 sequences to test a LayoutLMv2 forward pass on:
|
# Here we prepare a batch of 2 sequences to test a LayoutLMv2 forward pass on:
|
||||||
|
|||||||
@@ -388,6 +388,10 @@ class LongformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
|
|||||||
# longformer cannot keep gradients in attentions or hidden states
|
# longformer cannot keep gradients in attentions or hidden states
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@unittest.skip("LongFormer calculates global attn only when attn_mask has non-zero elements")
|
||||||
|
def test_batching_equivalence(self):
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
@require_sentencepiece
|
@require_sentencepiece
|
||||||
|
|||||||
@@ -39,6 +39,7 @@ from ...test_pipeline_mixin import PipelineTesterMixin
|
|||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
from transformers import MaskFormerForInstanceSegmentation, MaskFormerModel
|
from transformers import MaskFormerForInstanceSegmentation, MaskFormerModel
|
||||||
|
|
||||||
@@ -206,6 +207,7 @@ class MaskFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
|
|||||||
test_pruning = False
|
test_pruning = False
|
||||||
test_head_masking = False
|
test_head_masking = False
|
||||||
test_missing_keys = False
|
test_missing_keys = False
|
||||||
|
zero_init_hidden_state = True
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = MaskFormerModelTester(self)
|
self.model_tester = MaskFormerModelTester(self)
|
||||||
@@ -381,6 +383,67 @@ class MaskFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
|
|||||||
self.assertIsNotNone(outputs.auxiliary_logits)
|
self.assertIsNotNone(outputs.auxiliary_logits)
|
||||||
self.assertEqual(len(outputs.auxiliary_logits), self.model_tester.num_channels - 1)
|
self.assertEqual(len(outputs.auxiliary_logits), self.model_tester.num_channels - 1)
|
||||||
|
|
||||||
|
def test_batching_equivalence(self):
|
||||||
|
def equivalence(tensor1, tensor2):
|
||||||
|
return 1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, eps=0).max()
|
||||||
|
|
||||||
|
def recursive_check(batched_object, single_row_object, model_name, key):
|
||||||
|
if isinstance(batched_object, (list, tuple)):
|
||||||
|
for batched_object_value, single_row_object_value in zip(batched_object, single_row_object):
|
||||||
|
recursive_check(batched_object_value, single_row_object_value, model_name, key)
|
||||||
|
elif batched_object is None:
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
batched_row = batched_object[:1]
|
||||||
|
self.assertFalse(
|
||||||
|
torch.isnan(batched_row).any(), f"Batched output has `nan` in {model_name} for key={key}"
|
||||||
|
)
|
||||||
|
self.assertFalse(
|
||||||
|
torch.isinf(batched_row).any(), f"Batched output has `inf` in {model_name} for key={key}"
|
||||||
|
)
|
||||||
|
self.assertFalse(
|
||||||
|
torch.isnan(single_row_object).any(), f"Single row output has `nan` in {model_name} for key={key}"
|
||||||
|
)
|
||||||
|
self.assertFalse(
|
||||||
|
torch.isinf(single_row_object).any(), f"Single row output has `inf` in {model_name} for key={key}"
|
||||||
|
)
|
||||||
|
self.assertTrue(
|
||||||
|
(equivalence(batched_row, single_row_object)) <= 1e-03,
|
||||||
|
msg=(
|
||||||
|
f"Batched and Single row outputs are not equal in {model_name} for key={key}. "
|
||||||
|
f"Difference={equivalence(batched_row, single_row_object)}."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
config, batched_input = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|
||||||
|
for model_class in self.all_model_classes:
|
||||||
|
config.output_hidden_states = True
|
||||||
|
|
||||||
|
model_name = model_class.__name__
|
||||||
|
batched_input_prepared = self._prepare_for_class(batched_input, model_class)
|
||||||
|
model = model_class(config).to(torch_device).eval()
|
||||||
|
batch_size = self.model_tester.batch_size
|
||||||
|
|
||||||
|
single_row_input = {}
|
||||||
|
for key, value in batched_input_prepared.items():
|
||||||
|
single_batch_shape = value.shape[0] // batch_size
|
||||||
|
single_row_input[key] = value[:single_batch_shape]
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
model_batched_output = model(**batched_input_prepared)
|
||||||
|
model_row_output = model(**single_row_input)
|
||||||
|
|
||||||
|
for key in model_batched_output:
|
||||||
|
# remove the first zero-init queries to decoder, otherwise cos_similarity = `nan`
|
||||||
|
# no need to check all hidden_states, each one is already checked separately
|
||||||
|
if key == "transformer_decoder_hidden_states":
|
||||||
|
model_batched_output[key] = model_batched_output[key][1:]
|
||||||
|
model_row_output[key] = model_row_output[key][1:]
|
||||||
|
elif key == "hidden_states":
|
||||||
|
continue
|
||||||
|
recursive_check(model_batched_output[key], model_row_output[key], model_name, key)
|
||||||
|
|
||||||
|
|
||||||
TOLERANCE = 1e-4
|
TOLERANCE = 1e-4
|
||||||
|
|
||||||
|
|||||||
@@ -18,7 +18,7 @@
|
|||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from transformers import MobileNetV2Config
|
from transformers import MobileNetV2Config
|
||||||
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
|
from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device
|
||||||
from transformers.utils import cached_property, is_torch_available, is_vision_available
|
from transformers.utils import cached_property, is_torch_available, is_vision_available
|
||||||
|
|
||||||
from ...test_configuration_common import ConfigTester
|
from ...test_configuration_common import ConfigTester
|
||||||
@@ -271,6 +271,10 @@ class MobileNetV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
|
|||||||
model = MobileNetV2Model.from_pretrained(model_name)
|
model = MobileNetV2Model.from_pretrained(model_name)
|
||||||
self.assertIsNotNone(model)
|
self.assertIsNotNone(model)
|
||||||
|
|
||||||
|
@is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
|
||||||
|
def test_batching_equivalence(self):
|
||||||
|
super().test_batching_equivalence()
|
||||||
|
|
||||||
|
|
||||||
# We will verify our results on an image of cute cats
|
# We will verify our results on an image of cute cats
|
||||||
def prepare_img():
|
def prepare_img():
|
||||||
|
|||||||
@@ -378,6 +378,10 @@ class MraModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
|||||||
def test_training_gradient_checkpointing_use_reentrant_false(self):
|
def test_training_gradient_checkpointing_use_reentrant_false(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@unittest.skip("Model has `nan` in hidden_states, see https://github.com/huggingface/transformers/issues/29373.")
|
||||||
|
def test_batching_equivalence(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
class MraModelIntegrationTest(unittest.TestCase):
|
class MraModelIntegrationTest(unittest.TestCase):
|
||||||
|
|||||||
@@ -103,7 +103,7 @@ class MusicgenDecoderTester:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
parent,
|
parent,
|
||||||
batch_size=2,
|
batch_size=3, # need batch_size != num_hidden_layers
|
||||||
seq_length=7,
|
seq_length=7,
|
||||||
is_training=False,
|
is_training=False,
|
||||||
use_labels=False,
|
use_labels=False,
|
||||||
@@ -441,7 +441,7 @@ class MusicgenTester:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
parent,
|
parent,
|
||||||
batch_size=2,
|
batch_size=3, # need batch_size != num_hidden_layers
|
||||||
seq_length=7,
|
seq_length=7,
|
||||||
is_training=False,
|
is_training=False,
|
||||||
use_labels=False,
|
use_labels=False,
|
||||||
|
|||||||
@@ -385,6 +385,7 @@ class Owlv2ModelTester:
|
|||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
self.text_config = self.text_model_tester.get_config().to_dict()
|
self.text_config = self.text_model_tester.get_config().to_dict()
|
||||||
self.vision_config = self.vision_model_tester.get_config().to_dict()
|
self.vision_config = self.vision_model_tester.get_config().to_dict()
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
|
text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
|
||||||
@@ -591,6 +592,7 @@ class Owlv2ForObjectDetectionTester:
|
|||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
self.text_config = self.text_model_tester.get_config().to_dict()
|
self.text_config = self.text_model_tester.get_config().to_dict()
|
||||||
self.vision_config = self.vision_model_tester.get_config().to_dict()
|
self.vision_config = self.vision_model_tester.get_config().to_dict()
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
|
text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
|
||||||
|
|||||||
@@ -381,6 +381,7 @@ class OwlViTModelTester:
|
|||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
self.text_config = self.text_model_tester.get_config().to_dict()
|
self.text_config = self.text_model_tester.get_config().to_dict()
|
||||||
self.vision_config = self.vision_model_tester.get_config().to_dict()
|
self.vision_config = self.vision_model_tester.get_config().to_dict()
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
|
text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
|
||||||
@@ -585,6 +586,7 @@ class OwlViTForObjectDetectionTester:
|
|||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
self.text_config = self.text_model_tester.get_config().to_dict()
|
self.text_config = self.text_model_tester.get_config().to_dict()
|
||||||
self.vision_config = self.vision_model_tester.get_config().to_dict()
|
self.vision_config = self.vision_model_tester.get_config().to_dict()
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
|
text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
|
||||||
|
|||||||
@@ -386,6 +386,7 @@ class Pix2StructModelTester:
|
|||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.text_model_tester = Pix2StructTextModelTester(parent, **text_kwargs)
|
self.text_model_tester = Pix2StructTextModelTester(parent, **text_kwargs)
|
||||||
self.vision_model_tester = Pix2StructVisionModelTester(parent, **vision_kwargs)
|
self.vision_model_tester = Pix2StructVisionModelTester(parent, **vision_kwargs)
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
|
|||||||
@@ -389,6 +389,7 @@ class SiglipModelTester:
|
|||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.text_model_tester = SiglipTextModelTester(parent, **text_kwargs)
|
self.text_model_tester = SiglipTextModelTester(parent, **text_kwargs)
|
||||||
self.vision_model_tester = SiglipVisionModelTester(parent, **vision_kwargs)
|
self.vision_model_tester = SiglipVisionModelTester(parent, **vision_kwargs)
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
|
|
||||||
# Copied from tests.models.clip.test_modeling_clip.CLIPModelTester.prepare_config_and_inputs
|
# Copied from tests.models.clip.test_modeling_clip.CLIPModelTester.prepare_config_and_inputs
|
||||||
|
|||||||
@@ -916,6 +916,10 @@ class SpeechT5ForTextToSpeechTest(ModelTesterMixin, unittest.TestCase):
|
|||||||
def test_determinism(self):
|
def test_determinism(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@unittest.skip("skipped because there is always dropout in SpeechT5SpeechDecoderPrenet")
|
||||||
|
def test_batching_equivalence(self):
|
||||||
|
pass
|
||||||
|
|
||||||
def test_forward_signature(self):
|
def test_forward_signature(self):
|
||||||
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
|
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|
||||||
@@ -1438,6 +1442,10 @@ class SpeechT5ForSpeechToSpeechTest(ModelTesterMixin, unittest.TestCase):
|
|||||||
def test_determinism(self):
|
def test_determinism(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@unittest.skip("skipped because there is always dropout in SpeechT5SpeechDecoderPrenet")
|
||||||
|
def test_batching_equivalence(self):
|
||||||
|
pass
|
||||||
|
|
||||||
def test_attention_outputs(self):
|
def test_attention_outputs(self):
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
config.return_dict = True
|
config.return_dict = True
|
||||||
|
|||||||
@@ -209,6 +209,7 @@ class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, Pipelin
|
|||||||
test_pruning = False
|
test_pruning = False
|
||||||
test_head_masking = False
|
test_head_masking = False
|
||||||
test_missing_keys = False
|
test_missing_keys = False
|
||||||
|
zero_init_hidden_state = True
|
||||||
|
|
||||||
# special case for head models
|
# special case for head models
|
||||||
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
|
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
|
||||||
|
|||||||
@@ -104,6 +104,7 @@ class TimeSeriesTransformerModelTester:
|
|||||||
num_static_categorical_features=1,
|
num_static_categorical_features=1,
|
||||||
cardinality=[self.cardinality],
|
cardinality=[self.cardinality],
|
||||||
embedding_dimension=[self.embedding_dimension],
|
embedding_dimension=[self.embedding_dimension],
|
||||||
|
scaling="std", # we need std to get non-zero `loc`
|
||||||
)
|
)
|
||||||
|
|
||||||
def prepare_time_series_transformer_inputs_dict(self, config):
|
def prepare_time_series_transformer_inputs_dict(self, config):
|
||||||
|
|||||||
@@ -66,13 +66,13 @@ class UnivNetModelTester:
|
|||||||
|
|
||||||
def prepare_noise_sequence(self):
|
def prepare_noise_sequence(self):
|
||||||
generator = torch.manual_seed(self.seed)
|
generator = torch.manual_seed(self.seed)
|
||||||
noise_shape = (self.seq_length, self.in_channels)
|
noise_shape = (self.batch_size, self.seq_length, self.in_channels)
|
||||||
# Create noise on CPU for reproducibility
|
# Create noise on CPU for reproducibility
|
||||||
noise_sequence = torch.randn(noise_shape, generator=generator, dtype=torch.float)
|
noise_sequence = torch.randn(noise_shape, generator=generator, dtype=torch.float)
|
||||||
return noise_sequence
|
return noise_sequence
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
spectrogram = floats_tensor([self.seq_length, self.num_mel_bins], scale=1.0)
|
spectrogram = floats_tensor([self.batch_size, self.seq_length, self.num_mel_bins], scale=1.0)
|
||||||
noise_sequence = self.prepare_noise_sequence()
|
noise_sequence = self.prepare_noise_sequence()
|
||||||
noise_sequence = noise_sequence.to(spectrogram.device)
|
noise_sequence = noise_sequence.to(spectrogram.device)
|
||||||
config = self.get_config()
|
config = self.get_config()
|
||||||
@@ -89,7 +89,7 @@ class UnivNetModelTester:
|
|||||||
def create_and_check_model(self, config, spectrogram, noise_sequence):
|
def create_and_check_model(self, config, spectrogram, noise_sequence):
|
||||||
model = UnivNetModel(config=config).to(torch_device).eval()
|
model = UnivNetModel(config=config).to(torch_device).eval()
|
||||||
result = model(spectrogram, noise_sequence)[0]
|
result = model(spectrogram, noise_sequence)[0]
|
||||||
self.parent.assertEqual(result.shape, (1, self.seq_length * 256))
|
self.parent.assertEqual(result.shape, (self.batch_size, self.seq_length * 256))
|
||||||
|
|
||||||
def prepare_config_and_inputs_for_common(self):
|
def prepare_config_and_inputs_for_common(self):
|
||||||
config, spectrogram, noise_sequence = self.prepare_config_and_inputs()
|
config, spectrogram, noise_sequence = self.prepare_config_and_inputs()
|
||||||
@@ -182,8 +182,8 @@ class UnivNetModelTest(ModelTesterMixin, unittest.TestCase):
|
|||||||
model.to(torch_device)
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
batched_spectrogram = inputs["input_features"].unsqueeze(0).repeat(2, 1, 1)
|
batched_spectrogram = inputs["input_features"]
|
||||||
batched_noise_sequence = inputs["noise_sequence"].unsqueeze(0).repeat(2, 1, 1)
|
batched_noise_sequence = inputs["noise_sequence"]
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
batched_outputs = model(
|
batched_outputs = model(
|
||||||
batched_spectrogram.to(torch_device),
|
batched_spectrogram.to(torch_device),
|
||||||
@@ -205,37 +205,11 @@ class UnivNetModelTest(ModelTesterMixin, unittest.TestCase):
|
|||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
outputs = model(inputs["input_features"].to(torch_device), inputs["noise_sequence"].to(torch_device))[
|
outputs = model(
|
||||||
0
|
inputs["input_features"][:1].to(torch_device), inputs["noise_sequence"][:1].to(torch_device)
|
||||||
]
|
)[0]
|
||||||
self.assertTrue(outputs.shape[0] == 1, msg="Unbatched input should create batched output with bsz = 1")
|
self.assertTrue(outputs.shape[0] == 1, msg="Unbatched input should create batched output with bsz = 1")
|
||||||
|
|
||||||
def test_unbatched_batched_outputs_consistency(self):
|
|
||||||
config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
|
|
||||||
|
|
||||||
for model_class in self.all_model_classes:
|
|
||||||
model = model_class(config)
|
|
||||||
model.to(torch_device)
|
|
||||||
model.eval()
|
|
||||||
|
|
||||||
unbatched_spectrogram = inputs["input_features"].detach().clone()
|
|
||||||
unbatched_noise_sequence = inputs["noise_sequence"].detach().clone()
|
|
||||||
batched_spectrogram = inputs["input_features"].unsqueeze(0)
|
|
||||||
batched_noise_sequence = inputs["noise_sequence"].unsqueeze(0)
|
|
||||||
|
|
||||||
with torch.no_grad():
|
|
||||||
unbatched_outputs = model(
|
|
||||||
unbatched_spectrogram.to(torch_device),
|
|
||||||
unbatched_noise_sequence.to(torch_device),
|
|
||||||
)[0]
|
|
||||||
|
|
||||||
batched_outputs = model(
|
|
||||||
batched_spectrogram.to(torch_device),
|
|
||||||
batched_noise_sequence.to(torch_device),
|
|
||||||
)[0]
|
|
||||||
|
|
||||||
torch.testing.assert_close(unbatched_outputs, batched_outputs)
|
|
||||||
|
|
||||||
|
|
||||||
@require_torch_gpu
|
@require_torch_gpu
|
||||||
@slow
|
@slow
|
||||||
|
|||||||
@@ -345,6 +345,12 @@ class ViltModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
|||||||
def test_determinism(self):
|
def test_determinism(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@unittest.skip(
|
||||||
|
"VilT samples image tokens from a multinomial distribution, resulting in not deterministic hidden states"
|
||||||
|
)
|
||||||
|
def test_batching_equivalence(self):
|
||||||
|
pass
|
||||||
|
|
||||||
@unittest.skip(
|
@unittest.skip(
|
||||||
reason="""VilT samples image tokens from a multinomial distribution, resulting in not deterministic
|
reason="""VilT samples image tokens from a multinomial distribution, resulting in not deterministic
|
||||||
hidden states"""
|
hidden states"""
|
||||||
|
|||||||
@@ -18,7 +18,7 @@
|
|||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from transformers import ViTHybridConfig
|
from transformers import ViTHybridConfig
|
||||||
from transformers.testing_utils import require_accelerate, require_torch, require_vision, slow, torch_device
|
from transformers.testing_utils import is_flaky, require_accelerate, require_torch, require_vision, slow, torch_device
|
||||||
from transformers.utils import cached_property, is_torch_available, is_vision_available
|
from transformers.utils import cached_property, is_torch_available, is_vision_available
|
||||||
|
|
||||||
from ...test_configuration_common import ConfigTester
|
from ...test_configuration_common import ConfigTester
|
||||||
@@ -221,6 +221,10 @@ class ViTHybridModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
|
|||||||
model = ViTHybridModel.from_pretrained(model_name)
|
model = ViTHybridModel.from_pretrained(model_name)
|
||||||
self.assertIsNotNone(model)
|
self.assertIsNotNone(model)
|
||||||
|
|
||||||
|
@is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
|
||||||
|
def test_batching_equivalence(self):
|
||||||
|
super().test_batching_equivalence()
|
||||||
|
|
||||||
|
|
||||||
# We will verify our results on an image of cute cats
|
# We will verify our results on an image of cute cats
|
||||||
def prepare_img():
|
def prepare_img():
|
||||||
|
|||||||
@@ -270,6 +270,10 @@ class ViTMAEModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
|||||||
def test_model_outputs_equivalence(self):
|
def test_model_outputs_equivalence(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@unittest.skip(reason="ViTMAE returns a random mask + ids_restore in each forward pass")
|
||||||
|
def test_batching_equivalence(self):
|
||||||
|
pass
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
for model_name in VIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
|
for model_name in VIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
|
||||||
|
|||||||
@@ -216,6 +216,10 @@ class VitsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
|||||||
def test_determinism(self):
|
def test_determinism(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@unittest.skip("VITS is not deterministic")
|
||||||
|
def test_batching_equivalence(self):
|
||||||
|
pass
|
||||||
|
|
||||||
@is_flaky(
|
@is_flaky(
|
||||||
max_attempts=3,
|
max_attempts=3,
|
||||||
description="Weight initialisation for the VITS conv layers sometimes exceeds the kaiming normal range",
|
description="Weight initialisation for the VITS conv layers sometimes exceeds the kaiming normal range",
|
||||||
|
|||||||
@@ -190,7 +190,7 @@ class WhisperModelTester:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
parent,
|
parent,
|
||||||
batch_size=2,
|
batch_size=3, # need batch_size != num_hidden_layers
|
||||||
seq_length=60,
|
seq_length=60,
|
||||||
is_training=True,
|
is_training=True,
|
||||||
use_labels=False,
|
use_labels=False,
|
||||||
@@ -1446,6 +1446,7 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
|
|||||||
|
|
||||||
model = WhisperForConditionalGeneration(config).eval().to(torch_device)
|
model = WhisperForConditionalGeneration(config).eval().to(torch_device)
|
||||||
input_features = input_dict["input_features"].to(torch_device)
|
input_features = input_dict["input_features"].to(torch_device)
|
||||||
|
input_features = input_features[:2]
|
||||||
|
|
||||||
# len = 250 with num_input_frames = 60
|
# len = 250 with num_input_frames = 60
|
||||||
long_input_features = torch.cat([input_features.repeat(1, 1, 4), input_features[:, :, :10]], dim=-1)
|
long_input_features = torch.cat([input_features.repeat(1, 1, 4), input_features[:, :, :10]], dim=-1)
|
||||||
@@ -2626,7 +2627,7 @@ class WhisperEncoderModelTester:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
parent,
|
parent,
|
||||||
batch_size=2,
|
batch_size=3, # need batch_size != num_hidden_layers
|
||||||
seq_length=60,
|
seq_length=60,
|
||||||
is_training=True,
|
is_training=True,
|
||||||
use_labels=True,
|
use_labels=True,
|
||||||
@@ -2997,7 +2998,7 @@ class WhisperStandaloneDecoderModelTester:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
parent,
|
parent,
|
||||||
batch_size=2,
|
batch_size=3, # need batch_size != num_hidden_layers
|
||||||
is_training=True,
|
is_training=True,
|
||||||
use_labels=False,
|
use_labels=False,
|
||||||
vocab_size=200,
|
vocab_size=200,
|
||||||
|
|||||||
@@ -479,6 +479,7 @@ class XCLIPModelTester:
|
|||||||
self.mit_hidden_size = mit_hidden_size
|
self.mit_hidden_size = mit_hidden_size
|
||||||
self.text_model_tester = XCLIPTextModelTester(parent, **text_kwargs)
|
self.text_model_tester = XCLIPTextModelTester(parent, **text_kwargs)
|
||||||
self.vision_model_tester = XCLIPVisionModelTester(parent, **vision_kwargs)
|
self.vision_model_tester = XCLIPVisionModelTester(parent, **vision_kwargs)
|
||||||
|
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
|
|||||||
@@ -99,6 +99,7 @@ if is_accelerate_available():
|
|||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
from safetensors.torch import load_file as safe_load_file
|
from safetensors.torch import load_file as safe_load_file
|
||||||
from safetensors.torch import save_file as safe_save_file
|
from safetensors.torch import save_file as safe_save_file
|
||||||
from torch import nn
|
from torch import nn
|
||||||
@@ -693,6 +694,99 @@ class ModelTesterMixin:
|
|||||||
expected_arg_names = [model.main_input_name]
|
expected_arg_names = [model.main_input_name]
|
||||||
self.assertListEqual(arg_names[:1], expected_arg_names)
|
self.assertListEqual(arg_names[:1], expected_arg_names)
|
||||||
|
|
||||||
|
def test_batching_equivalence(self):
|
||||||
|
"""
|
||||||
|
Tests that the model supports batching and that the output is nearly the same for the same input in
|
||||||
|
different batch sizes.
|
||||||
|
(Why "nearly the same" not "exactly the same"? Batching uses different matmul shapes, which often leads to
|
||||||
|
different results: https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def get_tensor_equivalence_function(batched_input):
|
||||||
|
# models operating on continuous spaces have higher abs difference than LMs
|
||||||
|
# instead, we can rely on cos distance for image/speech models, similar to `diffusers`
|
||||||
|
if "input_ids" not in batched_input:
|
||||||
|
return lambda tensor1, tensor2: (
|
||||||
|
1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, eps=1e-38)
|
||||||
|
)
|
||||||
|
return lambda tensor1, tensor2: torch.max(torch.abs(tensor1 - tensor2))
|
||||||
|
|
||||||
|
def recursive_check(batched_object, single_row_object, model_name, key):
|
||||||
|
if isinstance(batched_object, (list, tuple)):
|
||||||
|
for batched_object_value, single_row_object_value in zip(batched_object, single_row_object):
|
||||||
|
recursive_check(batched_object_value, single_row_object_value, model_name, key)
|
||||||
|
elif isinstance(batched_object, dict):
|
||||||
|
for batched_object_value, single_row_object_value in zip(
|
||||||
|
batched_object.values(), single_row_object.values()
|
||||||
|
):
|
||||||
|
recursive_check(batched_object_value, single_row_object_value, model_name, key)
|
||||||
|
# do not compare returned loss (0-dim tensor) or codebook ids (int)
|
||||||
|
elif batched_object is None or isinstance(batched_object, int):
|
||||||
|
return
|
||||||
|
elif batched_object.dim() == 0:
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
# indexing the first element does not always work
|
||||||
|
# e.g. models that output similarity scores of size (N, M) would need to index [0, 0]
|
||||||
|
slice_ids = [slice(0, index) for index in single_row_object.shape]
|
||||||
|
batched_row = batched_object[slice_ids]
|
||||||
|
self.assertFalse(
|
||||||
|
torch.isnan(batched_row).any(), f"Batched output has `nan` in {model_name} for key={key}"
|
||||||
|
)
|
||||||
|
self.assertFalse(
|
||||||
|
torch.isinf(batched_row).any(), f"Batched output has `inf` in {model_name} for key={key}"
|
||||||
|
)
|
||||||
|
self.assertFalse(
|
||||||
|
torch.isnan(single_row_object).any(), f"Single row output has `nan` in {model_name} for key={key}"
|
||||||
|
)
|
||||||
|
self.assertFalse(
|
||||||
|
torch.isinf(single_row_object).any(), f"Single row output has `inf` in {model_name} for key={key}"
|
||||||
|
)
|
||||||
|
self.assertTrue(
|
||||||
|
(equivalence(batched_row, single_row_object)) <= 1e-03,
|
||||||
|
msg=(
|
||||||
|
f"Batched and Single row outputs are not equal in {model_name} for key={key}. "
|
||||||
|
f"Difference={equivalence(batched_row, single_row_object)}."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
config, batched_input = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
equivalence = get_tensor_equivalence_function(batched_input)
|
||||||
|
|
||||||
|
for model_class in self.all_model_classes:
|
||||||
|
config.output_hidden_states = True
|
||||||
|
|
||||||
|
model_name = model_class.__name__
|
||||||
|
if hasattr(self.model_tester, "prepare_config_and_inputs_for_model_class"):
|
||||||
|
config, batched_input = self.model_tester.prepare_config_and_inputs_for_model_class(model_class)
|
||||||
|
batched_input_prepared = self._prepare_for_class(batched_input, model_class)
|
||||||
|
model = model_class(config).to(torch_device).eval()
|
||||||
|
|
||||||
|
batch_size = self.model_tester.batch_size
|
||||||
|
single_row_input = {}
|
||||||
|
for key, value in batched_input_prepared.items():
|
||||||
|
if isinstance(value, torch.Tensor) and value.shape[0] % batch_size == 0:
|
||||||
|
# e.g. musicgen has inputs of size (bs*codebooks). in most cases value.shape[0] == batch_size
|
||||||
|
single_batch_shape = value.shape[0] // batch_size
|
||||||
|
single_row_input[key] = value[:single_batch_shape]
|
||||||
|
else:
|
||||||
|
single_row_input[key] = value
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
model_batched_output = model(**batched_input_prepared)
|
||||||
|
model_row_output = model(**single_row_input)
|
||||||
|
|
||||||
|
if isinstance(model_batched_output, torch.Tensor):
|
||||||
|
model_batched_output = {"model_output": model_batched_output}
|
||||||
|
model_row_output = {"model_output": model_row_output}
|
||||||
|
|
||||||
|
for key in model_batched_output:
|
||||||
|
# DETR starts from zero-init queries to decoder, leading to cos_similarity = `nan`
|
||||||
|
if hasattr(self, "zero_init_hidden_state") and "decoder_hidden_states" in key:
|
||||||
|
model_batched_output[key] = model_batched_output[key][1:]
|
||||||
|
model_row_output[key] = model_row_output[key][1:]
|
||||||
|
recursive_check(model_batched_output[key], model_row_output[key], model_name, key)
|
||||||
|
|
||||||
def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None):
|
def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None):
|
||||||
if not self.model_tester.is_training:
|
if not self.model_tester.is_training:
|
||||||
return
|
return
|
||||||
|
|||||||
Reference in New Issue
Block a user