From b338414e614a30af5f940269484ef15bf716d078 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Tue, 7 Mar 2023 22:31:14 +0100 Subject: [PATCH] Update tiny model creation script and some others files (#22006) * Update 1 * Update 2 * Update 3 * Update 4 * Update 5 * Update 6 * Update 7 * Update 8 * Update 9 * Update 10 --------- Co-authored-by: ydshieh --- .../models/auto/feature_extraction_auto.py | 1 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/processing_auto.py | 1 + .../configuration_gptsan_japanese.py | 3 +-- .../timesformer/configuration_timesformer.py | 3 ++- .../models/tvlt/configuration_tvlt.py | 2 +- .../models/xmod/configuration_xmod.py | 4 ++-- .../models/oneformer/test_modeling_oneformer.py | 17 +++++++++++++---- tests/models/speecht5/test_modeling_speecht5.py | 3 +++ utils/check_config_docstrings.py | 10 ++++++---- utils/create_dummy_models.py | 5 +++++ 11 files changed, 36 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index caf27f2176..adeadf17e3 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -82,6 +82,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict( ("swinv2", "ViTFeatureExtractor"), ("table-transformer", "DetrFeatureExtractor"), ("timesformer", "VideoMAEFeatureExtractor"), + ("tvlt", "TvltFeatureExtractor"), ("unispeech", "Wav2Vec2FeatureExtractor"), ("unispeech-sat", "Wav2Vec2FeatureExtractor"), ("van", "ConvNextFeatureExtractor"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index fd2331131e..8b45c4d651 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -87,6 +87,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict( ("swinv2", "ViTImageProcessor"), ("table-transformer", "DetrImageProcessor"), ("timesformer", "VideoMAEImageProcessor"), + ("tvlt", "TvltImageProcessor"), ("upernet", "SegformerImageProcessor"), ("van", "ConvNextImageProcessor"), ("videomae", "VideoMAEImageProcessor"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 5e40992188..197cfe8e79 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -65,6 +65,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict( ("speech_to_text_2", "Speech2Text2Processor"), ("speecht5", "SpeechT5Processor"), ("trocr", "TrOCRProcessor"), + ("tvlt", "TvltProcessor"), ("unispeech", "Wav2Vec2Processor"), ("unispeech-sat", "Wav2Vec2Processor"), ("vilt", "ViltProcessor"), diff --git a/src/transformers/models/gptsan_japanese/configuration_gptsan_japanese.py b/src/transformers/models/gptsan_japanese/configuration_gptsan_japanese.py index 196d7e2d93..d20b79daac 100644 --- a/src/transformers/models/gptsan_japanese/configuration_gptsan_japanese.py +++ b/src/transformers/models/gptsan_japanese/configuration_gptsan_japanese.py @@ -31,8 +31,7 @@ class GPTSanJapaneseConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`GPTSanJapaneseModel`]. It is used to instantiate a GPTSANJapanese model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the GPTSANJapanese - [tanreinama/GPTSAN-2.8B-spout_is_uniform](https://huggingface.co/tanreinama/GPTSAN-2.8B-spout_is_uniform) - architecture. + [Tanrei/GPTSAN-japanese](https://huggingface.co/Tanrei/GPTSAN-japanese) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. diff --git a/src/transformers/models/timesformer/configuration_timesformer.py b/src/transformers/models/timesformer/configuration_timesformer.py index 34e538e56e..77f2aa008c 100644 --- a/src/transformers/models/timesformer/configuration_timesformer.py +++ b/src/transformers/models/timesformer/configuration_timesformer.py @@ -30,7 +30,8 @@ class TimesformerConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`TimesformerModel`]. It is used to instantiate a TimeSformer model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the TimeSformer - [facebook/timesformer](https://huggingface.co/facebook/timesformer-base-finetuned-k600) architecture. + [facebook/timesformer-base-finetuned-k600](https://huggingface.co/facebook/timesformer-base-finetuned-k600) + architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. diff --git a/src/transformers/models/tvlt/configuration_tvlt.py b/src/transformers/models/tvlt/configuration_tvlt.py index f92d15a84e..a475fe89ed 100644 --- a/src/transformers/models/tvlt/configuration_tvlt.py +++ b/src/transformers/models/tvlt/configuration_tvlt.py @@ -30,7 +30,7 @@ class TvltConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`TvltModel`]. It is used to instantiate a TVLT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the TVLT - [TVLT/tvlt-base](https://huggingface.co/ZinengTang/tvlt-base) architecture. + [ZinengTang/tvlt-base](https://huggingface.co/ZinengTang/tvlt-base) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. diff --git a/src/transformers/models/xmod/configuration_xmod.py b/src/transformers/models/xmod/configuration_xmod.py index b2ca65cfe6..012b7446c4 100644 --- a/src/transformers/models/xmod/configuration_xmod.py +++ b/src/transformers/models/xmod/configuration_xmod.py @@ -41,8 +41,8 @@ class XmodConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`XmodModel`]. It is used to instantiate an X-MOD model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the [xmod-base](https://huggingface.co/facebook/xmod-base) - architecture. + defaults will yield a similar configuration to that of the + [facebook/xmod-base](https://huggingface.co/facebook/xmod-base) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. diff --git a/tests/models/oneformer/test_modeling_oneformer.py b/tests/models/oneformer/test_modeling_oneformer.py index 99ab909a4d..f16c165380 100644 --- a/tests/models/oneformer/test_modeling_oneformer.py +++ b/tests/models/oneformer/test_modeling_oneformer.py @@ -56,6 +56,7 @@ class OneFormerModelTester: parent, batch_size=2, is_training=True, + vocab_size=99, use_auxiliary_loss=False, num_queries=10, num_channels=3, @@ -69,6 +70,7 @@ class OneFormerModelTester: self.parent = parent self.batch_size = batch_size self.is_training = is_training + self.vocab_size = vocab_size self.use_auxiliary_loss = use_auxiliary_loss self.num_queries = num_queries self.num_channels = num_channels @@ -84,12 +86,16 @@ class OneFormerModelTester: torch_device ) - task_inputs = torch.randint(high=49408, size=(self.batch_size, self.sequence_length)).to(torch_device).long() + task_inputs = ( + torch.randint(high=self.vocab_size, size=(self.batch_size, self.sequence_length)).to(torch_device).long() + ) pixel_mask = torch.ones([self.batch_size, self.min_size, self.max_size], device=torch_device) text_inputs = ( - torch.randint(high=49408, size=(self.batch_size, self.num_queries - self.n_ctx, self.sequence_length)) + torch.randint( + high=self.vocab_size, size=(self.batch_size, self.num_queries - self.n_ctx, self.sequence_length) + ) .to(torch_device) .long() ) @@ -104,6 +110,7 @@ class OneFormerModelTester: def get_config(self): config = OneFormerConfig( + text_encoder_vocab_size=self.vocab_size, hidden_size=self.hidden_dim, ) @@ -303,8 +310,10 @@ class OneFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas size = (self.model_tester.min_size,) * 2 inputs = { "pixel_values": torch.randn((2, 3, *size), device=torch_device), - "task_inputs": torch.randint(high=49408, size=(2, 77), device=torch_device).long(), - "text_inputs": torch.randint(high=49408, size=(2, 134, 77), device=torch_device).long(), + "task_inputs": torch.randint(high=self.model_tester.vocab_size, size=(2, 77), device=torch_device).long(), + "text_inputs": torch.randint( + high=self.model_tester.vocab_size, size=(2, 134, 77), device=torch_device + ).long(), "mask_labels": torch.randn((2, 150, *size), device=torch_device), "class_labels": torch.zeros(2, 150, device=torch_device).long(), } diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py index e18886c14d..028a4c50df 100644 --- a/tests/models/speecht5/test_modeling_speecht5.py +++ b/tests/models/speecht5/test_modeling_speecht5.py @@ -103,6 +103,7 @@ class SpeechT5ModelTester: batch_size=13, seq_length=7, is_training=False, + vocab_size=81, hidden_size=24, num_hidden_layers=4, num_attention_heads=2, @@ -112,6 +113,7 @@ class SpeechT5ModelTester: self.batch_size = batch_size self.seq_length = seq_length self.is_training = is_training + self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads @@ -140,6 +142,7 @@ class SpeechT5ModelTester: def get_config(self): return SpeechT5Config( + vocab_size=self.vocab_size, hidden_size=self.hidden_size, encoder_layers=self.num_hidden_layers, decoder_layers=self.num_hidden_layers, diff --git a/utils/check_config_docstrings.py b/utils/check_config_docstrings.py index 8c00574806..94066ff764 100644 --- a/utils/check_config_docstrings.py +++ b/utils/check_config_docstrings.py @@ -51,10 +51,12 @@ def get_checkpoint_from_config_class(config_class): config_source = inspect.getsource(config_class) checkpoints = _re_checkpoint.findall(config_source) - for checkpoint in checkpoints: - # Each `checkpoint` is a tuple of a checkpoint name and a checkpoint link. - # For example, `('bert-base-uncased', 'https://huggingface.co/bert-base-uncased')` - ckpt_name, ckpt_link = checkpoint + # Each `checkpoint` is a tuple of a checkpoint name and a checkpoint link. + # For example, `('bert-base-uncased', 'https://huggingface.co/bert-base-uncased')` + for ckpt_name, ckpt_link in checkpoints: + # allow the link to end with `/` + if ckpt_link.endswith("/"): + ckpt_link = ckpt_link[:-1] # verify the checkpoint name corresponds to the checkpoint link ckpt_link_from_name = f"https://huggingface.co/{ckpt_name}" diff --git a/utils/create_dummy_models.py b/utils/create_dummy_models.py index 9d8c93a762..60e055d670 100644 --- a/utils/create_dummy_models.py +++ b/utils/create_dummy_models.py @@ -782,6 +782,11 @@ def get_config_overrides(config_class, processors): # CLIP-like models have `text_model_tester` and `vision_model_tester`, and we need to pass `vocab_size` to # `text_model_tester` via `text_kwargs`. The same trick is also necessary for `Flava`. if config_class.__name__ in [ + "AlignConfig", + "AltCLIPConfig", + "ChineseCLIPConfig", + "CLIPSegConfig", + "ClapConfig", "CLIPConfig", "GroupViTConfig", "OwlViTConfig",