Update tiny model creation script and some other files (#22006)
* Update 1
* Update 2
* Update 3
* Update 4
* Update 5
* Update 6
* Update 7
* Update 8
* Update 9
* Update 10

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
@@ -82,6 +82,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
|
|||||||
("swinv2", "ViTFeatureExtractor"),
|
("swinv2", "ViTFeatureExtractor"),
|
||||||
("table-transformer", "DetrFeatureExtractor"),
|
("table-transformer", "DetrFeatureExtractor"),
|
||||||
("timesformer", "VideoMAEFeatureExtractor"),
|
("timesformer", "VideoMAEFeatureExtractor"),
|
||||||
|
("tvlt", "TvltFeatureExtractor"),
|
||||||
("unispeech", "Wav2Vec2FeatureExtractor"),
|
("unispeech", "Wav2Vec2FeatureExtractor"),
|
||||||
("unispeech-sat", "Wav2Vec2FeatureExtractor"),
|
("unispeech-sat", "Wav2Vec2FeatureExtractor"),
|
||||||
("van", "ConvNextFeatureExtractor"),
|
("van", "ConvNextFeatureExtractor"),
|
||||||
|
|||||||
@@ -87,6 +87,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
|
|||||||
("swinv2", "ViTImageProcessor"),
|
("swinv2", "ViTImageProcessor"),
|
||||||
("table-transformer", "DetrImageProcessor"),
|
("table-transformer", "DetrImageProcessor"),
|
||||||
("timesformer", "VideoMAEImageProcessor"),
|
("timesformer", "VideoMAEImageProcessor"),
|
||||||
|
("tvlt", "TvltImageProcessor"),
|
||||||
("upernet", "SegformerImageProcessor"),
|
("upernet", "SegformerImageProcessor"),
|
||||||
("van", "ConvNextImageProcessor"),
|
("van", "ConvNextImageProcessor"),
|
||||||
("videomae", "VideoMAEImageProcessor"),
|
("videomae", "VideoMAEImageProcessor"),
|
||||||
|
|||||||
@@ -65,6 +65,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
|
|||||||
("speech_to_text_2", "Speech2Text2Processor"),
|
("speech_to_text_2", "Speech2Text2Processor"),
|
||||||
("speecht5", "SpeechT5Processor"),
|
("speecht5", "SpeechT5Processor"),
|
||||||
("trocr", "TrOCRProcessor"),
|
("trocr", "TrOCRProcessor"),
|
||||||
|
("tvlt", "TvltProcessor"),
|
||||||
("unispeech", "Wav2Vec2Processor"),
|
("unispeech", "Wav2Vec2Processor"),
|
||||||
("unispeech-sat", "Wav2Vec2Processor"),
|
("unispeech-sat", "Wav2Vec2Processor"),
|
||||||
("vilt", "ViltProcessor"),
|
("vilt", "ViltProcessor"),
|
||||||
|
|||||||
@@ -31,8 +31,7 @@ class GPTSanJapaneseConfig(PretrainedConfig):
|
|||||||
This is the configuration class to store the configuration of a [`GPTSanJapaneseModel`]. It is used to instantiate
|
This is the configuration class to store the configuration of a [`GPTSanJapaneseModel`]. It is used to instantiate
|
||||||
a GPTSANJapanese model according to the specified arguments, defining the model architecture. Instantiating a
|
a GPTSANJapanese model according to the specified arguments, defining the model architecture. Instantiating a
|
||||||
configuration with the defaults will yield a similar configuration to that of the GPTSANJapanese
|
configuration with the defaults will yield a similar configuration to that of the GPTSANJapanese
|
||||||
[tanreinama/GPTSAN-2.8B-spout_is_uniform](https://huggingface.co/tanreinama/GPTSAN-2.8B-spout_is_uniform)
|
[Tanrei/GPTSAN-japanese](https://huggingface.co/Tanrei/GPTSAN-japanese) architecture.
|
||||||
architecture.
|
|
||||||
|
|
||||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||||
documentation from [`PretrainedConfig`] for more information.
|
documentation from [`PretrainedConfig`] for more information.
|
||||||
|
|||||||
@@ -30,7 +30,8 @@ class TimesformerConfig(PretrainedConfig):
|
|||||||
This is the configuration class to store the configuration of a [`TimesformerModel`]. It is used to instantiate a
|
This is the configuration class to store the configuration of a [`TimesformerModel`]. It is used to instantiate a
|
||||||
TimeSformer model according to the specified arguments, defining the model architecture. Instantiating a
|
TimeSformer model according to the specified arguments, defining the model architecture. Instantiating a
|
||||||
configuration with the defaults will yield a similar configuration to that of the TimeSformer
|
configuration with the defaults will yield a similar configuration to that of the TimeSformer
|
||||||
[facebook/timesformer](https://huggingface.co/facebook/timesformer-base-finetuned-k600) architecture.
|
[facebook/timesformer-base-finetuned-k600](https://huggingface.co/facebook/timesformer-base-finetuned-k600)
|
||||||
|
architecture.
|
||||||
|
|
||||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||||
documentation from [`PretrainedConfig`] for more information.
|
documentation from [`PretrainedConfig`] for more information.
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ class TvltConfig(PretrainedConfig):
|
|||||||
This is the configuration class to store the configuration of a [`TvltModel`]. It is used to instantiate a TVLT
|
This is the configuration class to store the configuration of a [`TvltModel`]. It is used to instantiate a TVLT
|
||||||
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
|
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
|
||||||
defaults will yield a similar configuration to that of the TVLT
|
defaults will yield a similar configuration to that of the TVLT
|
||||||
[TVLT/tvlt-base](https://huggingface.co/ZinengTang/tvlt-base) architecture.
|
[ZinengTang/tvlt-base](https://huggingface.co/ZinengTang/tvlt-base) architecture.
|
||||||
|
|
||||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||||
documentation from [`PretrainedConfig`] for more information.
|
documentation from [`PretrainedConfig`] for more information.
|
||||||
|
|||||||
@@ -41,8 +41,8 @@ class XmodConfig(PretrainedConfig):
|
|||||||
r"""
|
r"""
|
||||||
This is the configuration class to store the configuration of a [`XmodModel`]. It is used to instantiate an X-MOD
|
This is the configuration class to store the configuration of a [`XmodModel`]. It is used to instantiate an X-MOD
|
||||||
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
|
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
|
||||||
defaults will yield a similar configuration to that of the [xmod-base](https://huggingface.co/facebook/xmod-base)
|
defaults will yield a similar configuration to that of the
|
||||||
architecture.
|
[facebook/xmod-base](https://huggingface.co/facebook/xmod-base) architecture.
|
||||||
|
|
||||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||||
documentation from [`PretrainedConfig`] for more information.
|
documentation from [`PretrainedConfig`] for more information.
|
||||||
|
|||||||
@@ -56,6 +56,7 @@ class OneFormerModelTester:
|
|||||||
parent,
|
parent,
|
||||||
batch_size=2,
|
batch_size=2,
|
||||||
is_training=True,
|
is_training=True,
|
||||||
|
vocab_size=99,
|
||||||
use_auxiliary_loss=False,
|
use_auxiliary_loss=False,
|
||||||
num_queries=10,
|
num_queries=10,
|
||||||
num_channels=3,
|
num_channels=3,
|
||||||
@@ -69,6 +70,7 @@ class OneFormerModelTester:
|
|||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
|
self.vocab_size = vocab_size
|
||||||
self.use_auxiliary_loss = use_auxiliary_loss
|
self.use_auxiliary_loss = use_auxiliary_loss
|
||||||
self.num_queries = num_queries
|
self.num_queries = num_queries
|
||||||
self.num_channels = num_channels
|
self.num_channels = num_channels
|
||||||
@@ -84,12 +86,16 @@ class OneFormerModelTester:
|
|||||||
torch_device
|
torch_device
|
||||||
)
|
)
|
||||||
|
|
||||||
task_inputs = torch.randint(high=49408, size=(self.batch_size, self.sequence_length)).to(torch_device).long()
|
task_inputs = (
|
||||||
|
torch.randint(high=self.vocab_size, size=(self.batch_size, self.sequence_length)).to(torch_device).long()
|
||||||
|
)
|
||||||
|
|
||||||
pixel_mask = torch.ones([self.batch_size, self.min_size, self.max_size], device=torch_device)
|
pixel_mask = torch.ones([self.batch_size, self.min_size, self.max_size], device=torch_device)
|
||||||
|
|
||||||
text_inputs = (
|
text_inputs = (
|
||||||
torch.randint(high=49408, size=(self.batch_size, self.num_queries - self.n_ctx, self.sequence_length))
|
torch.randint(
|
||||||
|
high=self.vocab_size, size=(self.batch_size, self.num_queries - self.n_ctx, self.sequence_length)
|
||||||
|
)
|
||||||
.to(torch_device)
|
.to(torch_device)
|
||||||
.long()
|
.long()
|
||||||
)
|
)
|
||||||
@@ -104,6 +110,7 @@ class OneFormerModelTester:
|
|||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
config = OneFormerConfig(
|
config = OneFormerConfig(
|
||||||
|
text_encoder_vocab_size=self.vocab_size,
|
||||||
hidden_size=self.hidden_dim,
|
hidden_size=self.hidden_dim,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -303,8 +310,10 @@ class OneFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
|
|||||||
size = (self.model_tester.min_size,) * 2
|
size = (self.model_tester.min_size,) * 2
|
||||||
inputs = {
|
inputs = {
|
||||||
"pixel_values": torch.randn((2, 3, *size), device=torch_device),
|
"pixel_values": torch.randn((2, 3, *size), device=torch_device),
|
||||||
"task_inputs": torch.randint(high=49408, size=(2, 77), device=torch_device).long(),
|
"task_inputs": torch.randint(high=self.model_tester.vocab_size, size=(2, 77), device=torch_device).long(),
|
||||||
"text_inputs": torch.randint(high=49408, size=(2, 134, 77), device=torch_device).long(),
|
"text_inputs": torch.randint(
|
||||||
|
high=self.model_tester.vocab_size, size=(2, 134, 77), device=torch_device
|
||||||
|
).long(),
|
||||||
"mask_labels": torch.randn((2, 150, *size), device=torch_device),
|
"mask_labels": torch.randn((2, 150, *size), device=torch_device),
|
||||||
"class_labels": torch.zeros(2, 150, device=torch_device).long(),
|
"class_labels": torch.zeros(2, 150, device=torch_device).long(),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -103,6 +103,7 @@ class SpeechT5ModelTester:
|
|||||||
batch_size=13,
|
batch_size=13,
|
||||||
seq_length=7,
|
seq_length=7,
|
||||||
is_training=False,
|
is_training=False,
|
||||||
|
vocab_size=81,
|
||||||
hidden_size=24,
|
hidden_size=24,
|
||||||
num_hidden_layers=4,
|
num_hidden_layers=4,
|
||||||
num_attention_heads=2,
|
num_attention_heads=2,
|
||||||
@@ -112,6 +113,7 @@ class SpeechT5ModelTester:
|
|||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
self.seq_length = seq_length
|
self.seq_length = seq_length
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
|
self.vocab_size = vocab_size
|
||||||
self.hidden_size = hidden_size
|
self.hidden_size = hidden_size
|
||||||
self.num_hidden_layers = num_hidden_layers
|
self.num_hidden_layers = num_hidden_layers
|
||||||
self.num_attention_heads = num_attention_heads
|
self.num_attention_heads = num_attention_heads
|
||||||
@@ -140,6 +142,7 @@ class SpeechT5ModelTester:
|
|||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return SpeechT5Config(
|
return SpeechT5Config(
|
||||||
|
vocab_size=self.vocab_size,
|
||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
encoder_layers=self.num_hidden_layers,
|
encoder_layers=self.num_hidden_layers,
|
||||||
decoder_layers=self.num_hidden_layers,
|
decoder_layers=self.num_hidden_layers,
|
||||||
|
|||||||
@@ -51,10 +51,12 @@ def get_checkpoint_from_config_class(config_class):
|
|||||||
config_source = inspect.getsource(config_class)
|
config_source = inspect.getsource(config_class)
|
||||||
checkpoints = _re_checkpoint.findall(config_source)
|
checkpoints = _re_checkpoint.findall(config_source)
|
||||||
|
|
||||||
for checkpoint in checkpoints:
|
|
||||||
# Each `checkpoint` is a tuple of a checkpoint name and a checkpoint link.
|
# Each `checkpoint` is a tuple of a checkpoint name and a checkpoint link.
|
||||||
# For example, `('bert-base-uncased', 'https://huggingface.co/bert-base-uncased')`
|
# For example, `('bert-base-uncased', 'https://huggingface.co/bert-base-uncased')`
|
||||||
ckpt_name, ckpt_link = checkpoint
|
for ckpt_name, ckpt_link in checkpoints:
|
||||||
|
# allow the link to end with `/`
|
||||||
|
if ckpt_link.endswith("/"):
|
||||||
|
ckpt_link = ckpt_link[:-1]
|
||||||
|
|
||||||
# verify the checkpoint name corresponds to the checkpoint link
|
# verify the checkpoint name corresponds to the checkpoint link
|
||||||
ckpt_link_from_name = f"https://huggingface.co/{ckpt_name}"
|
ckpt_link_from_name = f"https://huggingface.co/{ckpt_name}"
|
||||||
|
|||||||
@@ -782,6 +782,11 @@ def get_config_overrides(config_class, processors):
|
|||||||
# CLIP-like models have `text_model_tester` and `vision_model_tester`, and we need to pass `vocab_size` to
|
# CLIP-like models have `text_model_tester` and `vision_model_tester`, and we need to pass `vocab_size` to
|
||||||
# `text_model_tester` via `text_kwargs`. The same trick is also necessary for `Flava`.
|
# `text_model_tester` via `text_kwargs`. The same trick is also necessary for `Flava`.
|
||||||
if config_class.__name__ in [
|
if config_class.__name__ in [
|
||||||
|
"AlignConfig",
|
||||||
|
"AltCLIPConfig",
|
||||||
|
"ChineseCLIPConfig",
|
||||||
|
"CLIPSegConfig",
|
||||||
|
"ClapConfig",
|
||||||
"CLIPConfig",
|
"CLIPConfig",
|
||||||
"GroupViTConfig",
|
"GroupViTConfig",
|
||||||
"OwlViTConfig",
|
"OwlViTConfig",
|
||||||
|
|||||||
Reference in New Issue
Block a user