Fix last models for common tests that are too big. (#25058)
* Fix last models for common tests that are too big. * Remove print statement
This commit is contained in:
@@ -97,6 +97,8 @@ class PerceiverConfig(PretrainedConfig):
|
|||||||
Number of audio samples per frame for the multimodal autoencoding model.
|
Number of audio samples per frame for the multimodal autoencoding model.
|
||||||
samples_per_patch (`int`, *optional*, defaults to 16):
|
samples_per_patch (`int`, *optional*, defaults to 16):
|
||||||
Number of audio samples per patch when preprocessing the audio for the multimodal autoencoding model.
|
Number of audio samples per patch when preprocessing the audio for the multimodal autoencoding model.
|
||||||
|
output_num_channels (`int`, *optional*, defaults to 512):
|
||||||
|
Number of output channels for each modality decoder.
|
||||||
output_shape (`List[int]`, *optional*, defaults to `[1, 16, 224, 224]`):
|
output_shape (`List[int]`, *optional*, defaults to `[1, 16, 224, 224]`):
|
||||||
Shape of the output (batch_size, num_frames, height, width) for the video decoder queries of the multimodal
|
Shape of the output (batch_size, num_frames, height, width) for the video decoder queries of the multimodal
|
||||||
autoencoding model. This excludes the channel dimension.
|
autoencoding model. This excludes the channel dimension.
|
||||||
@@ -144,6 +146,8 @@ class PerceiverConfig(PretrainedConfig):
|
|||||||
audio_samples_per_frame=1920,
|
audio_samples_per_frame=1920,
|
||||||
samples_per_patch=16,
|
samples_per_patch=16,
|
||||||
output_shape=[1, 16, 224, 224],
|
output_shape=[1, 16, 224, 224],
|
||||||
|
output_num_channels=512,
|
||||||
|
_label_trainable_num_channels=1024,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
@@ -177,6 +181,8 @@ class PerceiverConfig(PretrainedConfig):
|
|||||||
self.audio_samples_per_frame = audio_samples_per_frame
|
self.audio_samples_per_frame = audio_samples_per_frame
|
||||||
self.samples_per_patch = samples_per_patch
|
self.samples_per_patch = samples_per_patch
|
||||||
self.output_shape = output_shape
|
self.output_shape = output_shape
|
||||||
|
self.output_num_channels = output_num_channels
|
||||||
|
self._label_trainable_num_channels = _label_trainable_num_channels
|
||||||
|
|
||||||
|
|
||||||
class PerceiverOnnxConfig(OnnxConfig):
|
class PerceiverOnnxConfig(OnnxConfig):
|
||||||
|
|||||||
@@ -1830,7 +1830,7 @@ class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel):
|
|||||||
# Autoencoding, don't pass inputs to the queries.
|
# Autoencoding, don't pass inputs to the queries.
|
||||||
concat_preprocessed_input=False,
|
concat_preprocessed_input=False,
|
||||||
output_shape=config.output_shape,
|
output_shape=config.output_shape,
|
||||||
output_num_channels=512,
|
output_num_channels=config.output_num_channels,
|
||||||
use_query_residual=False,
|
use_query_residual=False,
|
||||||
position_encoding_only=True,
|
position_encoding_only=True,
|
||||||
position_encoding_type="fourier",
|
position_encoding_type="fourier",
|
||||||
@@ -1854,7 +1854,7 @@ class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel):
|
|||||||
# Autoencoding, don't pass inputs to the queries.
|
# Autoencoding, don't pass inputs to the queries.
|
||||||
concat_preprocessed_input=False,
|
concat_preprocessed_input=False,
|
||||||
output_index_dims=(n_audio_samples // config.samples_per_patch,),
|
output_index_dims=(n_audio_samples // config.samples_per_patch,),
|
||||||
output_num_channels=512,
|
output_num_channels=config.output_num_channels,
|
||||||
use_query_residual=False,
|
use_query_residual=False,
|
||||||
position_encoding_only=True,
|
position_encoding_only=True,
|
||||||
position_encoding_type="fourier",
|
position_encoding_type="fourier",
|
||||||
@@ -1874,21 +1874,21 @@ class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel):
|
|||||||
position_encoding_only=True,
|
position_encoding_only=True,
|
||||||
position_encoding_type="trainable",
|
position_encoding_type="trainable",
|
||||||
trainable_position_encoding_kwargs={
|
trainable_position_encoding_kwargs={
|
||||||
"num_channels": 1024,
|
"num_channels": config._label_trainable_num_channels,
|
||||||
"index_dims": 1,
|
"index_dims": 1,
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
},
|
},
|
||||||
num_outputs=None,
|
num_outputs=None,
|
||||||
output_num_channels=512,
|
output_num_channels=config.output_num_channels,
|
||||||
use_query_residual=False,
|
use_query_residual=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
output_postprocessor = PerceiverMultimodalPostprocessor(
|
output_postprocessor = PerceiverMultimodalPostprocessor(
|
||||||
modalities={
|
modalities={
|
||||||
"audio": PerceiverAudioPostprocessor(config, in_channels=512),
|
"audio": PerceiverAudioPostprocessor(config, in_channels=config.output_num_channels),
|
||||||
"image": PerceiverProjectionPostprocessor(in_channels=512, out_channels=3),
|
"image": PerceiverProjectionPostprocessor(in_channels=config.output_num_channels, out_channels=3),
|
||||||
"label": PerceiverClassificationPostprocessor(config, in_channels=512),
|
"label": PerceiverClassificationPostprocessor(config, in_channels=config.output_num_channels),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -13,9 +13,9 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
""" Table Transformer model configuration"""
|
""" Table Transformer model configuration"""
|
||||||
|
import copy
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from typing import Mapping
|
from typing import Dict, Mapping
|
||||||
|
|
||||||
from packaging import version
|
from packaging import version
|
||||||
|
|
||||||
@@ -237,6 +237,17 @@ class TableTransformerConfig(PretrainedConfig):
|
|||||||
def hidden_size(self) -> int:
|
def hidden_size(self) -> int:
|
||||||
return self.d_model
|
return self.d_model
|
||||||
|
|
||||||
|
def to_dict(self) -> Dict[str, any]:
|
||||||
|
"""
|
||||||
|
Serializes this instance to a Python dictionary. Overrides the default [`~PretrainedConfig.to_dict`]. Returns:
|
||||||
|
`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
|
||||||
|
"""
|
||||||
|
output = copy.deepcopy(self.__dict__)
|
||||||
|
if output["backbone_config"] is not None:
|
||||||
|
output["backbone_config"] = self.backbone_config.to_dict()
|
||||||
|
output["model_type"] = self.__class__.model_type
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.configuration_detr.DetrOnnxConfig
|
# Copied from transformers.models.detr.configuration_detr.DetrOnnxConfig
|
||||||
class TableTransformerOnnxConfig(OnnxConfig):
|
class TableTransformerOnnxConfig(OnnxConfig):
|
||||||
|
|||||||
@@ -279,10 +279,6 @@ class LayoutLMModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
|
|||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
|
self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
|
||||||
|
|
||||||
@unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
|
|
||||||
def test_model_is_small(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def prepare_layoutlm_batch_inputs():
|
def prepare_layoutlm_batch_inputs():
|
||||||
# Here we prepare a batch of 2 sequences to test a LayoutLM forward pass on:
|
# Here we prepare a batch of 2 sequences to test a LayoutLM forward pass on:
|
||||||
|
|||||||
@@ -415,7 +415,7 @@ class LayoutLMv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
|
|||||||
|
|
||||||
check_hidden_states_output(inputs_dict, config, model_class)
|
check_hidden_states_output(inputs_dict, config, model_class)
|
||||||
|
|
||||||
@unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
|
@unittest.skip("We cannot configure detectron2 to output a smaller backbone")
|
||||||
def test_model_is_small(self):
|
def test_model_is_small(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|||||||
@@ -112,16 +112,20 @@ class OneFormerModelTester:
|
|||||||
config = OneFormerConfig(
|
config = OneFormerConfig(
|
||||||
text_encoder_vocab_size=self.vocab_size,
|
text_encoder_vocab_size=self.vocab_size,
|
||||||
hidden_size=self.hidden_dim,
|
hidden_size=self.hidden_dim,
|
||||||
|
num_queries=self.num_queries,
|
||||||
|
num_labels=self.num_labels,
|
||||||
|
encoder_feedforward_dim=32,
|
||||||
|
dim_feedforward=64,
|
||||||
|
encoder_layers=2,
|
||||||
|
decoder_layers=2,
|
||||||
)
|
)
|
||||||
|
|
||||||
config.num_queries = self.num_queries
|
config.backbone_config.embed_dim = 16
|
||||||
config.num_labels = self.num_labels
|
|
||||||
|
|
||||||
config.backbone_config.depths = [1, 1, 1, 1]
|
config.backbone_config.depths = [1, 1, 1, 1]
|
||||||
|
config.backbone_config.hidden_size = 16
|
||||||
config.backbone_config.num_channels = self.num_channels
|
config.backbone_config.num_channels = self.num_channels
|
||||||
|
config.backbone_config.num_heads = [1, 1, 2, 2]
|
||||||
|
|
||||||
config.encoder_feedforward_dim = 64
|
|
||||||
config.dim_feedforward = 128
|
|
||||||
config.hidden_dim = self.hidden_dim
|
config.hidden_dim = self.hidden_dim
|
||||||
config.mask_dim = self.hidden_dim
|
config.mask_dim = self.hidden_dim
|
||||||
config.conv_dim = self.hidden_dim
|
config.conv_dim = self.hidden_dim
|
||||||
@@ -309,10 +313,6 @@ class OneFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
|
|||||||
expected_arg_names = ["pixel_values", "task_inputs"]
|
expected_arg_names = ["pixel_values", "task_inputs"]
|
||||||
self.assertListEqual(arg_names[:2], expected_arg_names)
|
self.assertListEqual(arg_names[:2], expected_arg_names)
|
||||||
|
|
||||||
@unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
|
|
||||||
def test_model_is_small(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
for model_name in ["shi-labs/oneformer_ade20k_swin_tiny"]:
|
for model_name in ["shi-labs/oneformer_ade20k_swin_tiny"]:
|
||||||
|
|||||||
@@ -79,6 +79,7 @@ class PerceiverModelTester:
|
|||||||
nchunks=20,
|
nchunks=20,
|
||||||
num_latents=10,
|
num_latents=10,
|
||||||
d_latents=20,
|
d_latents=20,
|
||||||
|
d_model=64,
|
||||||
num_blocks=1,
|
num_blocks=1,
|
||||||
num_self_attends_per_block=2,
|
num_self_attends_per_block=2,
|
||||||
num_self_attention_heads=1,
|
num_self_attention_heads=1,
|
||||||
@@ -108,6 +109,7 @@ class PerceiverModelTester:
|
|||||||
self.nchunks = nchunks
|
self.nchunks = nchunks
|
||||||
self.num_latents = num_latents
|
self.num_latents = num_latents
|
||||||
self.d_latents = d_latents
|
self.d_latents = d_latents
|
||||||
|
self.d_model = d_model
|
||||||
self.num_blocks = num_blocks
|
self.num_blocks = num_blocks
|
||||||
self.num_self_attends_per_block = num_self_attends_per_block
|
self.num_self_attends_per_block = num_self_attends_per_block
|
||||||
self.num_self_attention_heads = num_self_attention_heads
|
self.num_self_attention_heads = num_self_attention_heads
|
||||||
@@ -181,6 +183,7 @@ class PerceiverModelTester:
|
|||||||
return PerceiverConfig(
|
return PerceiverConfig(
|
||||||
num_latents=self.num_latents,
|
num_latents=self.num_latents,
|
||||||
d_latents=self.d_latents,
|
d_latents=self.d_latents,
|
||||||
|
d_model=self.d_model,
|
||||||
qk_channels=self.d_latents,
|
qk_channels=self.d_latents,
|
||||||
v_channels=self.d_latents,
|
v_channels=self.d_latents,
|
||||||
num_blocks=self.num_blocks,
|
num_blocks=self.num_blocks,
|
||||||
@@ -200,6 +203,8 @@ class PerceiverModelTester:
|
|||||||
audio_samples_per_frame=self.audio_samples_per_frame,
|
audio_samples_per_frame=self.audio_samples_per_frame,
|
||||||
samples_per_patch=self.samples_per_patch,
|
samples_per_patch=self.samples_per_patch,
|
||||||
num_labels=self.num_labels,
|
num_labels=self.num_labels,
|
||||||
|
output_num_channels=32,
|
||||||
|
_label_trainable_num_channels=16,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_pipeline_config(self):
|
def get_pipeline_config(self):
|
||||||
@@ -784,10 +789,6 @@ class PerceiverModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
|
|||||||
|
|
||||||
loss.backward()
|
loss.backward()
|
||||||
|
|
||||||
@unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
|
|
||||||
def test_model_is_small(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
@require_torch_multi_gpu
|
@require_torch_multi_gpu
|
||||||
@unittest.skip(
|
@unittest.skip(
|
||||||
reason=(
|
reason=(
|
||||||
|
|||||||
@@ -61,11 +61,11 @@ class SegformerModelTester:
|
|||||||
image_size=64,
|
image_size=64,
|
||||||
num_channels=3,
|
num_channels=3,
|
||||||
num_encoder_blocks=4,
|
num_encoder_blocks=4,
|
||||||
depths=[2, 2, 2, 2],
|
depths=[1, 1, 1, 1],
|
||||||
sr_ratios=[8, 4, 2, 1],
|
sr_ratios=[8, 4, 2, 1],
|
||||||
hidden_sizes=[16, 32, 64, 128],
|
hidden_sizes=[8, 8, 16, 16],
|
||||||
downsampling_rates=[1, 4, 8, 16],
|
downsampling_rates=[1, 4, 8, 16],
|
||||||
num_attention_heads=[1, 2, 4, 8],
|
num_attention_heads=[1, 1, 2, 2],
|
||||||
is_training=True,
|
is_training=True,
|
||||||
use_labels=True,
|
use_labels=True,
|
||||||
hidden_act="gelu",
|
hidden_act="gelu",
|
||||||
@@ -347,10 +347,6 @@ class SegformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
|
|||||||
loss = model(**inputs).loss
|
loss = model(**inputs).loss
|
||||||
loss.backward()
|
loss.backward()
|
||||||
|
|
||||||
@unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
|
|
||||||
def test_model_is_small(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
for model_name in SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
|
for model_name in SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
|
||||||
|
|||||||
@@ -58,11 +58,11 @@ class TFSegformerModelTester:
|
|||||||
image_size=64,
|
image_size=64,
|
||||||
num_channels=3,
|
num_channels=3,
|
||||||
num_encoder_blocks=4,
|
num_encoder_blocks=4,
|
||||||
depths=[2, 2, 2, 2],
|
depths=[1, 1, 1, 1],
|
||||||
sr_ratios=[8, 4, 2, 1],
|
sr_ratios=[8, 4, 2, 1],
|
||||||
hidden_sizes=[16, 32, 64, 128],
|
hidden_sizes=[8, 8, 16, 16],
|
||||||
downsampling_rates=[1, 4, 8, 16],
|
downsampling_rates=[1, 4, 8, 16],
|
||||||
num_attention_heads=[1, 2, 4, 8],
|
num_attention_heads=[1, 1, 2, 2],
|
||||||
is_training=True,
|
is_training=True,
|
||||||
use_labels=True,
|
use_labels=True,
|
||||||
hidden_act="gelu",
|
hidden_act="gelu",
|
||||||
|
|||||||
@@ -238,10 +238,6 @@ class SpeechT5ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
|
|||||||
# disabled because this model doesn't have decoder_input_ids
|
# disabled because this model doesn't have decoder_input_ids
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
|
|
||||||
def test_model_is_small(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
class SpeechT5ForSpeechToTextTester:
|
class SpeechT5ForSpeechToTextTester:
|
||||||
@@ -705,10 +701,6 @@ class SpeechT5ForSpeechToTextTest(ModelTesterMixin, unittest.TestCase):
|
|||||||
def test_training_gradient_checkpointing(self):
|
def test_training_gradient_checkpointing(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
|
|
||||||
def test_model_is_small(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
# overwrite from test_modeling_common
|
# overwrite from test_modeling_common
|
||||||
def _mock_init_weights(self, module):
|
def _mock_init_weights(self, module):
|
||||||
if hasattr(module, "weight") and module.weight is not None:
|
if hasattr(module, "weight") and module.weight is not None:
|
||||||
@@ -800,6 +792,9 @@ class SpeechT5ForTextToSpeechTester:
|
|||||||
vocab_size=81,
|
vocab_size=81,
|
||||||
num_mel_bins=20,
|
num_mel_bins=20,
|
||||||
reduction_factor=2,
|
reduction_factor=2,
|
||||||
|
speech_decoder_postnet_layers=2,
|
||||||
|
speech_decoder_postnet_units=32,
|
||||||
|
speech_decoder_prenet_units=32,
|
||||||
):
|
):
|
||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
@@ -813,6 +808,9 @@ class SpeechT5ForTextToSpeechTester:
|
|||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.num_mel_bins = num_mel_bins
|
self.num_mel_bins = num_mel_bins
|
||||||
self.reduction_factor = reduction_factor
|
self.reduction_factor = reduction_factor
|
||||||
|
self.speech_decoder_postnet_layers = speech_decoder_postnet_layers
|
||||||
|
self.speech_decoder_postnet_units = speech_decoder_postnet_units
|
||||||
|
self.speech_decoder_prenet_units = speech_decoder_prenet_units
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size).clamp(2)
|
input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size).clamp(2)
|
||||||
@@ -847,6 +845,9 @@ class SpeechT5ForTextToSpeechTester:
|
|||||||
vocab_size=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
num_mel_bins=self.num_mel_bins,
|
num_mel_bins=self.num_mel_bins,
|
||||||
reduction_factor=self.reduction_factor,
|
reduction_factor=self.reduction_factor,
|
||||||
|
speech_decoder_postnet_layers=self.speech_decoder_postnet_layers,
|
||||||
|
speech_decoder_postnet_units=self.speech_decoder_postnet_units,
|
||||||
|
speech_decoder_prenet_units=self.speech_decoder_prenet_units,
|
||||||
)
|
)
|
||||||
|
|
||||||
def create_and_check_model_forward(self, config, inputs_dict):
|
def create_and_check_model_forward(self, config, inputs_dict):
|
||||||
@@ -996,10 +997,6 @@ class SpeechT5ForTextToSpeechTest(ModelTesterMixin, unittest.TestCase):
|
|||||||
if hasattr(module, "bias") and module.bias is not None:
|
if hasattr(module, "bias") and module.bias is not None:
|
||||||
module.bias.data.fill_(3)
|
module.bias.data.fill_(3)
|
||||||
|
|
||||||
@unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
|
|
||||||
def test_model_is_small(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
@require_sentencepiece
|
@require_sentencepiece
|
||||||
@@ -1046,6 +1043,9 @@ class SpeechT5ForSpeechToSpeechTester:
|
|||||||
vocab_size=81,
|
vocab_size=81,
|
||||||
num_mel_bins=20,
|
num_mel_bins=20,
|
||||||
reduction_factor=2,
|
reduction_factor=2,
|
||||||
|
speech_decoder_postnet_layers=2,
|
||||||
|
speech_decoder_postnet_units=32,
|
||||||
|
speech_decoder_prenet_units=32,
|
||||||
):
|
):
|
||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
@@ -1065,6 +1065,9 @@ class SpeechT5ForSpeechToSpeechTester:
|
|||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.num_mel_bins = num_mel_bins
|
self.num_mel_bins = num_mel_bins
|
||||||
self.reduction_factor = reduction_factor
|
self.reduction_factor = reduction_factor
|
||||||
|
self.speech_decoder_postnet_layers = speech_decoder_postnet_layers
|
||||||
|
self.speech_decoder_postnet_units = speech_decoder_postnet_units
|
||||||
|
self.speech_decoder_prenet_units = speech_decoder_prenet_units
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
input_values = floats_tensor([self.batch_size, self.encoder_seq_length], scale=1.0)
|
input_values = floats_tensor([self.batch_size, self.encoder_seq_length], scale=1.0)
|
||||||
@@ -1105,6 +1108,9 @@ class SpeechT5ForSpeechToSpeechTester:
|
|||||||
vocab_size=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
num_mel_bins=self.num_mel_bins,
|
num_mel_bins=self.num_mel_bins,
|
||||||
reduction_factor=self.reduction_factor,
|
reduction_factor=self.reduction_factor,
|
||||||
|
speech_decoder_postnet_layers=self.speech_decoder_postnet_layers,
|
||||||
|
speech_decoder_postnet_units=self.speech_decoder_postnet_units,
|
||||||
|
speech_decoder_prenet_units=self.speech_decoder_prenet_units,
|
||||||
)
|
)
|
||||||
|
|
||||||
def create_and_check_model_forward(self, config, inputs_dict):
|
def create_and_check_model_forward(self, config, inputs_dict):
|
||||||
@@ -1416,10 +1422,6 @@ class SpeechT5ForSpeechToSpeechTest(ModelTesterMixin, unittest.TestCase):
|
|||||||
if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None:
|
if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None:
|
||||||
module.masked_spec_embed.data.fill_(3)
|
module.masked_spec_embed.data.fill_(3)
|
||||||
|
|
||||||
@unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
|
|
||||||
def test_model_is_small(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
@require_sentencepiece
|
@require_sentencepiece
|
||||||
@@ -1478,6 +1480,7 @@ class SpeechT5HifiGanTester:
|
|||||||
def get_config(self):
|
def get_config(self):
|
||||||
return SpeechT5HifiGanConfig(
|
return SpeechT5HifiGanConfig(
|
||||||
model_in_dim=self.num_mel_bins,
|
model_in_dim=self.num_mel_bins,
|
||||||
|
upsample_initial_channel=32,
|
||||||
)
|
)
|
||||||
|
|
||||||
def create_and_check_model(self, config, input_values):
|
def create_and_check_model(self, config, input_values):
|
||||||
@@ -1562,10 +1565,6 @@ class SpeechT5HifiGanTest(ModelTesterMixin, unittest.TestCase):
|
|||||||
def test_retain_grad_hidden_states_attentions(self):
|
def test_retain_grad_hidden_states_attentions(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
|
|
||||||
def test_model_is_small(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
# skip because it fails on automapping of SpeechT5HifiGanConfig
|
# skip because it fails on automapping of SpeechT5HifiGanConfig
|
||||||
def test_save_load_fast_init_from_base(self):
|
def test_save_load_fast_init_from_base(self):
|
||||||
pass
|
pass
|
||||||
|
|||||||
@@ -58,9 +58,9 @@ class SwiftFormerModelTester:
|
|||||||
hidden_dropout_prob=0.1,
|
hidden_dropout_prob=0.1,
|
||||||
attention_probs_dropout_prob=0.1,
|
attention_probs_dropout_prob=0.1,
|
||||||
image_size=224,
|
image_size=224,
|
||||||
num_labels=1000,
|
num_labels=3,
|
||||||
layer_depths=[3, 3, 6, 4],
|
layer_depths=[1, 1, 1, 1],
|
||||||
embed_dims=[48, 56, 112, 220],
|
embed_dims=[16, 16, 32, 32],
|
||||||
):
|
):
|
||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
@@ -272,10 +272,6 @@ class SwiftFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
|
|||||||
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
|
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
|
||||||
)
|
)
|
||||||
|
|
||||||
@unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
|
|
||||||
def test_model_is_small(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
# We will verify our results on an image of cute cats
|
# We will verify our results on an image of cute cats
|
||||||
def prepare_img():
|
def prepare_img():
|
||||||
|
|||||||
@@ -21,8 +21,8 @@ import unittest
|
|||||||
|
|
||||||
from huggingface_hub import hf_hub_download
|
from huggingface_hub import hf_hub_download
|
||||||
|
|
||||||
from transformers import TableTransformerConfig, is_timm_available, is_vision_available
|
from transformers import ResNetConfig, TableTransformerConfig, is_torch_available, is_vision_available
|
||||||
from transformers.testing_utils import require_timm, require_vision, slow, torch_device
|
from transformers.testing_utils import require_timm, require_torch, require_vision, slow, torch_device
|
||||||
|
|
||||||
from ...generation.test_utils import GenerationTesterMixin
|
from ...generation.test_utils import GenerationTesterMixin
|
||||||
from ...test_configuration_common import ConfigTester
|
from ...test_configuration_common import ConfigTester
|
||||||
@@ -30,10 +30,10 @@ from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_
|
|||||||
from ...test_pipeline_mixin import PipelineTesterMixin
|
from ...test_pipeline_mixin import PipelineTesterMixin
|
||||||
|
|
||||||
|
|
||||||
if is_timm_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from transformers import ResNetConfig, TableTransformerForObjectDetection, TableTransformerModel
|
from transformers import TableTransformerForObjectDetection, TableTransformerModel
|
||||||
|
|
||||||
|
|
||||||
if is_vision_available():
|
if is_vision_available():
|
||||||
@@ -49,7 +49,7 @@ class TableTransformerModelTester:
|
|||||||
batch_size=8,
|
batch_size=8,
|
||||||
is_training=True,
|
is_training=True,
|
||||||
use_labels=True,
|
use_labels=True,
|
||||||
hidden_size=256,
|
hidden_size=32,
|
||||||
num_hidden_layers=2,
|
num_hidden_layers=2,
|
||||||
num_attention_heads=8,
|
num_attention_heads=8,
|
||||||
intermediate_size=4,
|
intermediate_size=4,
|
||||||
@@ -61,7 +61,7 @@ class TableTransformerModelTester:
|
|||||||
min_size=200,
|
min_size=200,
|
||||||
max_size=200,
|
max_size=200,
|
||||||
n_targets=8,
|
n_targets=8,
|
||||||
num_labels=91,
|
num_labels=3,
|
||||||
):
|
):
|
||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
@@ -107,6 +107,16 @@ class TableTransformerModelTester:
|
|||||||
return config, pixel_values, pixel_mask, labels
|
return config, pixel_values, pixel_mask, labels
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
|
resnet_config = ResNetConfig(
|
||||||
|
num_channels=3,
|
||||||
|
embeddings_size=10,
|
||||||
|
hidden_sizes=[10, 20, 30, 40],
|
||||||
|
depths=[1, 1, 2, 1],
|
||||||
|
hidden_act="relu",
|
||||||
|
num_labels=3,
|
||||||
|
out_features=["stage2", "stage3", "stage4"],
|
||||||
|
out_indices=[2, 3, 4],
|
||||||
|
)
|
||||||
return TableTransformerConfig(
|
return TableTransformerConfig(
|
||||||
d_model=self.hidden_size,
|
d_model=self.hidden_size,
|
||||||
encoder_layers=self.num_hidden_layers,
|
encoder_layers=self.num_hidden_layers,
|
||||||
@@ -119,6 +129,8 @@ class TableTransformerModelTester:
|
|||||||
attention_dropout=self.attention_probs_dropout_prob,
|
attention_dropout=self.attention_probs_dropout_prob,
|
||||||
num_queries=self.num_queries,
|
num_queries=self.num_queries,
|
||||||
num_labels=self.num_labels,
|
num_labels=self.num_labels,
|
||||||
|
use_timm_backbone=False,
|
||||||
|
backbone_config=resnet_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
def prepare_config_and_inputs_for_common(self):
|
def prepare_config_and_inputs_for_common(self):
|
||||||
@@ -175,19 +187,19 @@ class TableTransformerModelTester:
|
|||||||
self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4))
|
self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4))
|
||||||
|
|
||||||
|
|
||||||
@require_timm
|
@require_torch
|
||||||
class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||||
all_model_classes = (
|
all_model_classes = (
|
||||||
(
|
(
|
||||||
TableTransformerModel,
|
TableTransformerModel,
|
||||||
TableTransformerForObjectDetection,
|
TableTransformerForObjectDetection,
|
||||||
)
|
)
|
||||||
if is_timm_available()
|
if is_torch_available()
|
||||||
else ()
|
else ()
|
||||||
)
|
)
|
||||||
pipeline_model_mapping = (
|
pipeline_model_mapping = (
|
||||||
{"feature-extraction": TableTransformerModel, "object-detection": TableTransformerForObjectDetection}
|
{"feature-extraction": TableTransformerModel, "object-detection": TableTransformerForObjectDetection}
|
||||||
if is_timm_available()
|
if is_torch_available()
|
||||||
else {}
|
else {}
|
||||||
)
|
)
|
||||||
is_encoder_decoder = True
|
is_encoder_decoder = True
|
||||||
@@ -453,6 +465,7 @@ class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, Pipelin
|
|||||||
|
|
||||||
# let's set num_channels to 1
|
# let's set num_channels to 1
|
||||||
config.num_channels = 1
|
config.num_channels = 1
|
||||||
|
config.backbone_config.num_channels = 1
|
||||||
|
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
@@ -486,10 +499,6 @@ class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, Pipelin
|
|||||||
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
|
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
|
||||||
)
|
)
|
||||||
|
|
||||||
@unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
|
|
||||||
def test_model_is_small(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
TOLERANCE = 1e-4
|
TOLERANCE = 1e-4
|
||||||
|
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ class TimmBackboneModelTester:
|
|||||||
out_indices=None,
|
out_indices=None,
|
||||||
out_features=None,
|
out_features=None,
|
||||||
stage_names=None,
|
stage_names=None,
|
||||||
backbone="resnet50",
|
backbone="resnet18",
|
||||||
batch_size=3,
|
batch_size=3,
|
||||||
image_size=32,
|
image_size=32,
|
||||||
num_channels=3,
|
num_channels=3,
|
||||||
@@ -196,7 +196,7 @@ class TimmBackboneModelTest(ModelTesterMixin, BackboneTesterMixin, PipelineTeste
|
|||||||
def test_can_use_safetensors(self):
|
def test_can_use_safetensors(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
|
@unittest.skip("Need to use a timm backbone and there is no tiny model available.")
|
||||||
def test_model_is_small(self):
|
def test_model_is_small(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|||||||
@@ -67,8 +67,8 @@ class TvltModelTester:
|
|||||||
num_image_channels=3,
|
num_image_channels=3,
|
||||||
num_audio_channels=1,
|
num_audio_channels=1,
|
||||||
num_frames=2,
|
num_frames=2,
|
||||||
hidden_size=128,
|
hidden_size=32,
|
||||||
num_hidden_layers=12,
|
num_hidden_layers=3,
|
||||||
num_attention_heads=4,
|
num_attention_heads=4,
|
||||||
intermediate_size=128,
|
intermediate_size=128,
|
||||||
hidden_act="gelu",
|
hidden_act="gelu",
|
||||||
@@ -79,7 +79,7 @@ class TvltModelTester:
|
|||||||
qkv_bias=True,
|
qkv_bias=True,
|
||||||
use_mean_pooling=True,
|
use_mean_pooling=True,
|
||||||
decoder_num_attention_heads=4,
|
decoder_num_attention_heads=4,
|
||||||
decoder_hidden_size=64,
|
decoder_hidden_size=32,
|
||||||
decoder_num_hidden_layers=2,
|
decoder_num_hidden_layers=2,
|
||||||
decoder_intermediate_size=128,
|
decoder_intermediate_size=128,
|
||||||
image_mask_ratio=0.75,
|
image_mask_ratio=0.75,
|
||||||
@@ -542,10 +542,6 @@ class TvltModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
check_hidden_states_output(inputs_dict, config, model_class)
|
check_hidden_states_output(inputs_dict, config, model_class)
|
||||||
|
|
||||||
@unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
|
|
||||||
def test_model_is_small(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
# We will verify our results on a video of eating spaghetti
|
# We will verify our results on a video of eating spaghetti
|
||||||
# Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
|
# Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ class UperNetModelTester:
|
|||||||
num_channels=3,
|
num_channels=3,
|
||||||
num_stages=4,
|
num_stages=4,
|
||||||
hidden_sizes=[10, 20, 30, 40],
|
hidden_sizes=[10, 20, 30, 40],
|
||||||
depths=[2, 2, 3, 2],
|
depths=[1, 1, 1, 1],
|
||||||
is_training=True,
|
is_training=True,
|
||||||
use_labels=True,
|
use_labels=True,
|
||||||
intermediate_size=37,
|
intermediate_size=37,
|
||||||
@@ -106,12 +106,12 @@ class UperNetModelTester:
|
|||||||
def get_config(self):
|
def get_config(self):
|
||||||
return UperNetConfig(
|
return UperNetConfig(
|
||||||
backbone_config=self.get_backbone_config(),
|
backbone_config=self.get_backbone_config(),
|
||||||
hidden_size=512,
|
hidden_size=64,
|
||||||
pool_scales=[1, 2, 3, 6],
|
pool_scales=[1, 2, 3, 6],
|
||||||
use_auxiliary_head=True,
|
use_auxiliary_head=True,
|
||||||
auxiliary_loss_weight=0.4,
|
auxiliary_loss_weight=0.4,
|
||||||
auxiliary_in_channels=40,
|
auxiliary_in_channels=40,
|
||||||
auxiliary_channels=256,
|
auxiliary_channels=32,
|
||||||
auxiliary_num_convs=1,
|
auxiliary_num_convs=1,
|
||||||
auxiliary_concat_input=False,
|
auxiliary_concat_input=False,
|
||||||
loss_ignore_index=255,
|
loss_ignore_index=255,
|
||||||
@@ -207,10 +207,6 @@ class UperNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
|
|||||||
def test_multi_gpu_data_parallel_forward(self):
|
def test_multi_gpu_data_parallel_forward(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
|
|
||||||
def test_model_is_small(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def test_hidden_states_output(self):
|
def test_hidden_states_output(self):
|
||||||
def check_hidden_states_output(inputs_dict, config, model_class):
|
def check_hidden_states_output(inputs_dict, config, model_class):
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
|
|||||||
@@ -130,6 +130,10 @@ class VideoMAEModelTester:
|
|||||||
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
|
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
|
||||||
is_decoder=False,
|
is_decoder=False,
|
||||||
initializer_range=self.initializer_range,
|
initializer_range=self.initializer_range,
|
||||||
|
decoder_hidden_size=self.hidden_size,
|
||||||
|
decoder_intermediate_size=self.intermediate_size,
|
||||||
|
decoder_num_attention_heads=self.num_attention_heads,
|
||||||
|
decoder_num_hidden_layers=self.num_hidden_layers,
|
||||||
)
|
)
|
||||||
|
|
||||||
def create_and_check_model(self, config, pixel_values, labels):
|
def create_and_check_model(self, config, pixel_values, labels):
|
||||||
@@ -344,10 +348,6 @@ class VideoMAEModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
|
|||||||
|
|
||||||
check_hidden_states_output(inputs_dict, config, model_class)
|
check_hidden_states_output(inputs_dict, config, model_class)
|
||||||
|
|
||||||
@unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
|
|
||||||
def test_model_is_small(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
# We will verify our results on a video of eating spaghetti
|
# We will verify our results on a video of eating spaghetti
|
||||||
# Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
|
# Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
|
||||||
|
|||||||
@@ -118,6 +118,10 @@ class ViTMAEModelTester:
|
|||||||
is_decoder=False,
|
is_decoder=False,
|
||||||
initializer_range=self.initializer_range,
|
initializer_range=self.initializer_range,
|
||||||
mask_ratio=self.mask_ratio,
|
mask_ratio=self.mask_ratio,
|
||||||
|
decoder_hidden_size=self.hidden_size,
|
||||||
|
decoder_intermediate_size=self.intermediate_size,
|
||||||
|
decoder_num_attention_heads=self.num_attention_heads,
|
||||||
|
decoder_num_hidden_layers=self.num_hidden_layers,
|
||||||
)
|
)
|
||||||
|
|
||||||
def create_and_check_model(self, config, pixel_values, labels):
|
def create_and_check_model(self, config, pixel_values, labels):
|
||||||
@@ -279,10 +283,6 @@ class ViTMAEModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
|||||||
def test_model_outputs_equivalence(self):
|
def test_model_outputs_equivalence(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
|
|
||||||
def test_model_is_small(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
for model_name in VIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
|
for model_name in VIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
|
||||||
|
|||||||
@@ -55,8 +55,8 @@ class VivitModelTester:
|
|||||||
num_frames=8, # decreased, because default 32 takes too much RAM at inference
|
num_frames=8, # decreased, because default 32 takes too much RAM at inference
|
||||||
tubelet_size=[2, 4, 4],
|
tubelet_size=[2, 4, 4],
|
||||||
num_channels=3,
|
num_channels=3,
|
||||||
hidden_size=768,
|
hidden_size=32,
|
||||||
num_hidden_layers=5,
|
num_hidden_layers=2,
|
||||||
num_attention_heads=4,
|
num_attention_heads=4,
|
||||||
intermediate_size=37,
|
intermediate_size=37,
|
||||||
hidden_act="gelu_fast",
|
hidden_act="gelu_fast",
|
||||||
@@ -310,10 +310,6 @@ class VivitModelTest(ModelTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
check_hidden_states_output(inputs_dict, config, model_class)
|
check_hidden_states_output(inputs_dict, config, model_class)
|
||||||
|
|
||||||
@unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
|
|
||||||
def test_model_is_small(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
# We will verify our results on a video of eating spaghetti
|
# We will verify our results on a video of eating spaghetti
|
||||||
# Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
|
# Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
|
||||||
|
|||||||
@@ -2708,7 +2708,6 @@ class ModelTesterMixin:
|
|||||||
def test_model_is_small(self):
|
def test_model_is_small(self):
|
||||||
# Just a consistency check to make sure we are not running tests on 80M parameter models.
|
# Just a consistency check to make sure we are not running tests on 80M parameter models.
|
||||||
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
|
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
# print(config)
|
|
||||||
|
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
|
|||||||
Reference in New Issue
Block a user