From c8be8a9adb218ecc593c687020e952554a5a55b5 Mon Sep 17 00:00:00 2001 From: Nils Reimers Date: Mon, 6 Sep 2021 16:30:13 +0200 Subject: [PATCH] Update model configs - Allow setters for common properties (#13026) * refactor GPT Config to allow dyn. properties * make attribute_map a class attribute * remove old code * update unit test to test config: Add test for common properties setter * update unit test to test config: Add test for common properties passed as parameters to __init__ * update to black code format * Allow that setters are not defined for certain config classes * update config classes to implement attribute_map * bugfix lxmert config - id2labels was not defined when num_labels was set * update broken configs - add attribute_maps * update bart config * update black codestyle * update documentation on common config attributes * update GPTJ config to new attribute map * update docs on common attributes * gptj config: add max_position_embeddings * gptj config: format with black * update speech to text 2 config * format doc file to max_len 119 * update config template --- docs/source/main_classes/configuration.rst | 5 +++ src/transformers/configuration_utils.py | 15 ++++++- .../models/bart/configuration_bart.py | 31 ++++++------- .../configuration_bigbird_pegasus.py | 33 ++++++-------- .../blenderbot/configuration_blenderbot.py | 29 +++++-------- .../configuration_blenderbot_small.py | 27 +++++------- .../models/ctrl/configuration_ctrl.py | 23 +++------- .../models/detr/configuration_detr.py | 7 ++- .../distilbert/configuration_distilbert.py | 19 +++----- .../models/flaubert/configuration_flaubert.py | 2 +- .../models/fsmt/configuration_fsmt.py | 33 ++++++-------- .../models/funnel/configuration_funnel.py | 24 +++++++---- .../models/gpt2/configuration_gpt2.py | 24 +++-------- .../models/gpt_neo/configuration_gpt_neo.py | 13 ++---- .../models/gptj/configuration_gptj.py | 24 +++-------- .../models/led/configuration_led.py | 38 ++++++---------- .../models/lxmert/configuration_lxmert.py | 7 ++- .../models/m2m_100/configuration_m2m_100.py | 25 ++++------- .../models/marian/configuration_marian.py | 26 ++++------- .../models/mbart/configuration_mbart.py | 26 ++++------- .../models/openai/configuration_openai.py | 25 +++-------- .../models/pegasus/configuration_pegasus.py | 18 ++++---- .../prophetnet/configuration_prophetnet.py | 30 +++++++------ .../models/reformer/configuration_reformer.py | 16 +++---- .../configuration_speech_to_text.py | 25 ++++------- .../configuration_speech_to_text_2.py | 23 ++++------ .../models/t5/configuration_t5.py | 25 +++-------- .../transfo_xl/configuration_transfo_xl.py | 33 ++++++-------- .../models/xlm/configuration_xlm.py | 27 +++--------- .../models/xlnet/configuration_xlnet.py | 34 ++++++--------- ...on_{{cookiecutter.lowercase_modelname}}.py | 43 +++++++++---------- tests/test_configuration_common.py | 36 ++++++++++++++-- 32 files changed, 326 insertions(+), 440 deletions(-) diff --git a/docs/source/main_classes/configuration.rst b/docs/source/main_classes/configuration.rst index 464160a9c6..bcd665a19a 100644 --- a/docs/source/main_classes/configuration.rst +++ b/docs/source/main_classes/configuration.rst @@ -17,6 +17,11 @@ The base class :class:`~transformers.PretrainedConfig` implements the common met either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository). +Each derived config class implements model specific attributes. Common attributes present in all config classes are: +:obj:`hidden_size`, :obj:`num_attention_heads`, and :obj:`num_hidden_layers`. Text models further implement: +:obj:`vocab_size`. + + PretrainedConfig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 35a90a8a59..45683ac801 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -57,6 +57,8 @@ class PretrainedConfig(PushToHubMixin): :class:`~RagConfig`. - **keys_to_ignore_at_inference** (:obj:`List[str]`) -- A list of keys to ignore by default when looking at dictionary outputs of the model during inference. + - **attribute_map** (:obj:`Dict[str, str]`) -- A dict that maps model specific attribute names to the + standardized naming of attributes. Common attributes (present in all subclasses) @@ -218,6 +220,17 @@ class PretrainedConfig(PushToHubMixin): """ model_type: str = "" is_composition: bool = False + attribute_map: Dict[str, str] = {} + + def __setattr__(self, key, value): + if key in super().__getattribute__("attribute_map"): + key = super().__getattribute__("attribute_map")[key] + super().__setattr__(key, value) + + def __getattribute__(self, key): + if key != "attribute_map" and key in super().__getattribute__("attribute_map"): + key = super().__getattribute__("attribute_map")[key] + return super().__getattribute__(key) def __init__(self, **kwargs): # Attributes with defaults @@ -350,7 +363,7 @@ class PretrainedConfig(PushToHubMixin): @num_labels.setter def num_labels(self, num_labels: int): - if self.id2label is None or len(self.id2label) != num_labels: + if not hasattr(self, "id2label") or self.id2label is None or len(self.id2label) != num_labels: self.id2label = {i: f"LABEL_{i}" for i in range(num_labels)} self.label2id = dict(zip(self.id2label.values(), self.id2label.keys())) diff --git a/src/transformers/models/bart/configuration_bart.py b/src/transformers/models/bart/configuration_bart.py index 3890a9c803..e26afb2ab4 100644 --- a/src/transformers/models/bart/configuration_bart.py +++ b/src/transformers/models/bart/configuration_bart.py @@ -109,6 +109,7 @@ class BartConfig(PretrainedConfig): """ model_type = "bart" keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} def __init__( self, @@ -141,17 +142,6 @@ class BartConfig(PretrainedConfig): forced_eos_token_id=2, **kwargs ): - super().__init__( - num_labels=num_labels, - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - is_encoder_decoder=is_encoder_decoder, - decoder_start_token_id=decoder_start_token_id, - forced_eos_token_id=forced_eos_token_id, - **kwargs, - ) - self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.d_model = d_model @@ -174,6 +164,17 @@ class BartConfig(PretrainedConfig): self.gradient_checkpointing = gradient_checkpointing self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + super().__init__( + num_labels=num_labels, + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + forced_eos_token_id=forced_eos_token_id, + **kwargs, + ) + # ensure backward compatibility for BART CNN models if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False): self.forced_bos_token_id = self.bos_token_id @@ -182,14 +183,6 @@ class BartConfig(PretrainedConfig): "The config can simply be saved and uploaded again to be fixed." ) - @property - def num_attention_heads(self) -> int: - return self.encoder_attention_heads - - @property - def hidden_size(self) -> int: - return self.d_model - class BartOnnxConfig(OnnxConfigWithPast): @property diff --git a/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py index 7b3ce6f79b..28211c9b16 100644 --- a/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py @@ -112,6 +112,11 @@ class BigBirdPegasusConfig(PretrainedConfig): """ model_type = "bigbird_pegasus" keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "num_attention_heads": "encoder_attention_heads", + "hidden_size": "d_model", + "attention_probs_dropout_prob": "attention_dropout", + } def __init__( self, @@ -146,15 +151,6 @@ class BigBirdPegasusConfig(PretrainedConfig): use_bias=False, **kwargs ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - is_encoder_decoder=is_encoder_decoder, - decoder_start_token_id=decoder_start_token_id, - **kwargs, - ) - self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.d_model = d_model @@ -183,14 +179,11 @@ class BigBirdPegasusConfig(PretrainedConfig): self.num_random_blocks = num_random_blocks self.use_bias = use_bias - @property - def num_attention_heads(self) -> int: - return self.encoder_attention_heads - - @property - def hidden_size(self) -> int: - return self.d_model - - @property - def attention_probs_dropout_prob(self) -> float: - return self.attention_dropout + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + **kwargs, + ) diff --git a/src/transformers/models/blenderbot/configuration_blenderbot.py b/src/transformers/models/blenderbot/configuration_blenderbot.py index 83c234155a..c2b272af03 100644 --- a/src/transformers/models/blenderbot/configuration_blenderbot.py +++ b/src/transformers/models/blenderbot/configuration_blenderbot.py @@ -103,6 +103,7 @@ class BlenderbotConfig(PretrainedConfig): """ model_type = "blenderbot" keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} def __init__( self, @@ -135,17 +136,6 @@ class BlenderbotConfig(PretrainedConfig): forced_eos_token_id=2, **kwargs ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - is_encoder_decoder=is_encoder_decoder, - decoder_start_token_id=decoder_start_token_id, - encoder_no_repeat_ngram_size=encoder_no_repeat_ngram_size, - forced_eos_token_id=forced_eos_token_id, - **kwargs, - ) - self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.d_model = d_model @@ -168,10 +158,13 @@ class BlenderbotConfig(PretrainedConfig): self.gradient_checkpointing = gradient_checkpointing self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - @property - def num_attention_heads(self) -> int: - return self.encoder_attention_heads - - @property - def hidden_size(self) -> int: - return self.d_model + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + encoder_no_repeat_ngram_size=encoder_no_repeat_ngram_size, + forced_eos_token_id=forced_eos_token_id, + **kwargs, + ) diff --git a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py index 9961980124..de8927a4ff 100644 --- a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py @@ -103,6 +103,7 @@ class BlenderbotSmallConfig(PretrainedConfig): """ model_type = "blenderbot-small" keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} def __init__( self, @@ -134,16 +135,6 @@ class BlenderbotSmallConfig(PretrainedConfig): forced_eos_token_id=2, **kwargs ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - is_encoder_decoder=is_encoder_decoder, - decoder_start_token_id=decoder_start_token_id, - forced_eos_token_id=forced_eos_token_id, - **kwargs, - ) - self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.d_model = d_model @@ -166,10 +157,12 @@ class BlenderbotSmallConfig(PretrainedConfig): self.gradient_checkpointing = gradient_checkpointing self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - @property - def num_attention_heads(self) -> int: - return self.encoder_attention_heads - - @property - def hidden_size(self) -> int: - return self.d_model + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + forced_eos_token_id=forced_eos_token_id, + **kwargs, + ) diff --git a/src/transformers/models/ctrl/configuration_ctrl.py b/src/transformers/models/ctrl/configuration_ctrl.py index ea6bedb706..bd045586b7 100644 --- a/src/transformers/models/ctrl/configuration_ctrl.py +++ b/src/transformers/models/ctrl/configuration_ctrl.py @@ -81,6 +81,12 @@ class CTRLConfig(PretrainedConfig): model_type = "ctrl" keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "max_position_embeddings": "n_positions", + "hidden_size": "n_embd", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } def __init__( self, @@ -104,7 +110,6 @@ class CTRLConfig(PretrainedConfig): use_cache=True, **kwargs ): - super().__init__(**kwargs) self.vocab_size = vocab_size self.n_ctx = n_ctx self.n_positions = n_positions @@ -125,18 +130,4 @@ class CTRLConfig(PretrainedConfig): self.summary_proj_to_labels = summary_proj_to_labels self.use_cache = use_cache - @property - def max_position_embeddings(self): - return self.n_positions - - @property - def hidden_size(self): - return self.n_embd - - @property - def num_attention_heads(self): - return self.n_head - - @property - def num_hidden_layers(self): - return self.n_layer + super().__init__(**kwargs) diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index a8d9b4d6a2..2f6392a3c6 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -117,6 +117,10 @@ class DetrConfig(PretrainedConfig): """ model_type = "detr" keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } def __init__( self, @@ -154,8 +158,6 @@ class DetrConfig(PretrainedConfig): eos_coefficient=0.1, **kwargs ): - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) - self.num_queries = num_queries self.max_position_embeddings = max_position_embeddings self.d_model = d_model @@ -189,6 +191,7 @@ class DetrConfig(PretrainedConfig): self.bbox_loss_coefficient = bbox_loss_coefficient self.giou_loss_coefficient = giou_loss_coefficient self.eos_coefficient = eos_coefficient + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property def num_attention_heads(self) -> int: diff --git a/src/transformers/models/distilbert/configuration_distilbert.py b/src/transformers/models/distilbert/configuration_distilbert.py index a171ea1dca..733714e721 100644 --- a/src/transformers/models/distilbert/configuration_distilbert.py +++ b/src/transformers/models/distilbert/configuration_distilbert.py @@ -93,6 +93,11 @@ class DistilBertConfig(PretrainedConfig): >>> configuration = model.config """ model_type = "distilbert" + attribute_map = { + "hidden_size": "dim", + "num_attention_heads": "n_heads", + "num_hidden_layers": "n_layers", + } def __init__( self, @@ -112,7 +117,6 @@ class DistilBertConfig(PretrainedConfig): pad_token_id=0, **kwargs ): - super().__init__(**kwargs, pad_token_id=pad_token_id) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.sinusoidal_pos_embds = sinusoidal_pos_embds @@ -126,18 +130,7 @@ class DistilBertConfig(PretrainedConfig): self.initializer_range = initializer_range self.qa_dropout = qa_dropout self.seq_classif_dropout = seq_classif_dropout - - @property - def hidden_size(self): - return self.dim - - @property - def num_attention_heads(self): - return self.n_heads - - @property - def num_hidden_layers(self): - return self.n_layers + super().__init__(**kwargs, pad_token_id=pad_token_id) class DistilBertOnnxConfig(OnnxConfig): diff --git a/src/transformers/models/flaubert/configuration_flaubert.py b/src/transformers/models/flaubert/configuration_flaubert.py index 436e1a8871..a372ff47ce 100644 --- a/src/transformers/models/flaubert/configuration_flaubert.py +++ b/src/transformers/models/flaubert/configuration_flaubert.py @@ -136,6 +136,6 @@ class FlaubertConfig(XLMConfig): def __init__(self, layerdrop=0.0, pre_norm=False, pad_token_id=2, bos_token_id=0, **kwargs): """Constructs FlaubertConfig.""" - super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs) self.layerdrop = layerdrop self.pre_norm = pre_norm + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs) diff --git a/src/transformers/models/fsmt/configuration_fsmt.py b/src/transformers/models/fsmt/configuration_fsmt.py index d7a79298c7..2591b3d05b 100644 --- a/src/transformers/models/fsmt/configuration_fsmt.py +++ b/src/transformers/models/fsmt/configuration_fsmt.py @@ -124,6 +124,7 @@ class FSMTConfig(PretrainedConfig): """ model_type = "fsmt" + attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} # update the defaults from config file def __init__( @@ -161,18 +162,6 @@ class FSMTConfig(PretrainedConfig): forced_eos_token_id=2, **common_kwargs ): - if "hidden_size" in common_kwargs: - raise ValueError("hidden size is called d_model") - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - decoder_start_token_id=decoder_start_token_id, - is_encoder_decoder=is_encoder_decoder, - tie_word_embeddings=tie_word_embeddings, - forced_eos_token_id=forced_eos_token_id, - **common_kwargs, - ) self.langs = langs self.src_vocab_size = src_vocab_size self.tgt_vocab_size = tgt_vocab_size @@ -196,6 +185,8 @@ class FSMTConfig(PretrainedConfig): self.early_stopping = early_stopping self.decoder = DecoderConfig(vocab_size=tgt_vocab_size, bos_token_id=eos_token_id) + if "decoder" in common_kwargs: + del common_kwargs["decoder"] self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True @@ -205,14 +196,16 @@ class FSMTConfig(PretrainedConfig): self.dropout = dropout self.use_cache = use_cache - - @property - def num_attention_heads(self) -> int: - return self.encoder_attention_heads - - @property - def hidden_size(self) -> int: - return self.d_model + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + decoder_start_token_id=decoder_start_token_id, + is_encoder_decoder=is_encoder_decoder, + tie_word_embeddings=tie_word_embeddings, + forced_eos_token_id=forced_eos_token_id, + **common_kwargs, + ) def to_dict(self): """ diff --git a/src/transformers/models/funnel/configuration_funnel.py b/src/transformers/models/funnel/configuration_funnel.py index aeb836e9e9..1d25e765c4 100644 --- a/src/transformers/models/funnel/configuration_funnel.py +++ b/src/transformers/models/funnel/configuration_funnel.py @@ -102,6 +102,10 @@ class FunnelConfig(PretrainedConfig): Whether or not to apply the pooling only to the query or to query, key and values for the attention layers. """ model_type = "funnel" + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "n_head", + } def __init__( self, @@ -129,8 +133,6 @@ class FunnelConfig(PretrainedConfig): pool_q_only=True, **kwargs ): - super().__init__(**kwargs) - self.vocab_size = vocab_size self.block_sizes = block_sizes self.block_repeats = [1] * len(block_sizes) if block_repeats is None else block_repeats @@ -165,18 +167,22 @@ class FunnelConfig(PretrainedConfig): self.truncate_seq = truncate_seq self.pool_q_only = pool_q_only - @property - def hidden_size(self): - return self.d_model - - @property - def num_attention_heads(self): - return self.n_head + super().__init__(**kwargs) @property def num_hidden_layers(self): return sum(self.block_sizes) + @num_hidden_layers.setter + def num_hidden_layers(self, value): + raise NotImplementedError( + "This model does not support the setting of `num_hidden_layers`. Please set `block_sizes`." + ) + @property def num_blocks(self): return len(self.block_sizes) + + @num_blocks.setter + def num_blocks(self, value): + raise NotImplementedError("This model does not support the setting of `num_blocks`. Please set `block_sizes`.") diff --git a/src/transformers/models/gpt2/configuration_gpt2.py b/src/transformers/models/gpt2/configuration_gpt2.py index 503199e4cf..f003023ca8 100644 --- a/src/transformers/models/gpt2/configuration_gpt2.py +++ b/src/transformers/models/gpt2/configuration_gpt2.py @@ -130,6 +130,12 @@ class GPT2Config(PretrainedConfig): model_type = "gpt2" keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "hidden_size": "n_embd", + "max_position_embeddings": "n_positions", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } def __init__( self, @@ -158,8 +164,6 @@ class GPT2Config(PretrainedConfig): eos_token_id=50256, **kwargs ): - super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) - self.vocab_size = vocab_size self.n_ctx = n_ctx self.n_positions = n_positions @@ -185,21 +189,7 @@ class GPT2Config(PretrainedConfig): self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id - @property - def max_position_embeddings(self): - return self.n_positions - - @property - def hidden_size(self): - return self.n_embd - - @property - def num_attention_heads(self): - return self.n_head - - @property - def num_hidden_layers(self): - return self.n_layer + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) class GPT2OnnxConfig(OnnxConfigWithPast): diff --git a/src/transformers/models/gpt_neo/configuration_gpt_neo.py b/src/transformers/models/gpt_neo/configuration_gpt_neo.py index c009056cd6..e788a7cffa 100644 --- a/src/transformers/models/gpt_neo/configuration_gpt_neo.py +++ b/src/transformers/models/gpt_neo/configuration_gpt_neo.py @@ -96,6 +96,7 @@ class GPTNeoConfig(PretrainedConfig): >>> configuration = model.config """ model_type = "gpt_neo" + attribute_map = {"num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"} def __init__( self, @@ -124,8 +125,6 @@ class GPTNeoConfig(PretrainedConfig): eos_token_id=50256, **kwargs ): - super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) - self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -163,6 +162,8 @@ class GPTNeoConfig(PretrainedConfig): "Please verify the value of `config.attention_types` argument." ) + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + @staticmethod def expand_attention_types_params(attention_types): attentions = [] @@ -171,14 +172,6 @@ class GPTNeoConfig(PretrainedConfig): attentions.extend(item[0]) return attentions - @property - def num_attention_heads(self): - return self.num_heads - - @property - def num_hidden_layers(self): - return self.num_layers - def custom_unfold(input, dimension, size, step): """Custom torch.Tensor.unfold implementation to enable the export to ONNX.""" diff --git a/src/transformers/models/gptj/configuration_gptj.py b/src/transformers/models/gptj/configuration_gptj.py index 227a73ab9c..93018fdcb6 100644 --- a/src/transformers/models/gptj/configuration_gptj.py +++ b/src/transformers/models/gptj/configuration_gptj.py @@ -87,6 +87,12 @@ class GPTJConfig(PretrainedConfig): >>> configuration = model.config """ model_type = "gptj" + attribute_map = { + "max_position_embeddings": "n_positions", + "hidden_size": "n_embd", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } def __init__( self, @@ -111,8 +117,6 @@ class GPTJConfig(PretrainedConfig): eos_token_id=50256, **kwargs ): - super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) - self.vocab_size = vocab_size self.n_ctx = n_ctx self.n_positions = n_positions @@ -134,18 +138,4 @@ class GPTJConfig(PretrainedConfig): self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id - @property - def max_position_embeddings(self): - return self.n_positions - - @property - def hidden_size(self): - return self.n_embd - - @property - def num_attention_heads(self): - return self.n_head - - @property - def num_hidden_layers(self): - return self.n_layer + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/led/configuration_led.py b/src/transformers/models/led/configuration_led.py index d18aec9b36..5992d275ed 100644 --- a/src/transformers/models/led/configuration_led.py +++ b/src/transformers/models/led/configuration_led.py @@ -99,6 +99,12 @@ class LEDConfig(PretrainedConfig): >>> configuration = model.config """ model_type = "led" + attribute_map = { + "num_attention_heads": "encoder_attention_heads", + "hidden_size": "d_model", + "attention_probs_dropout_prob": "attention_dropout", + "initializer_range": "init_std", + } def __init__( self, @@ -130,15 +136,6 @@ class LEDConfig(PretrainedConfig): attention_window: Union[List[int], int] = 512, **kwargs ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - is_encoder_decoder=is_encoder_decoder, - decoder_start_token_id=decoder_start_token_id, - **kwargs, - ) - self.vocab_size = vocab_size self.max_encoder_position_embeddings = max_encoder_position_embeddings self.max_decoder_position_embeddings = max_decoder_position_embeddings @@ -162,18 +159,11 @@ class LEDConfig(PretrainedConfig): self.attention_window = attention_window self.gradient_checkpointing = gradient_checkpointing - @property - def num_attention_heads(self) -> int: - return self.encoder_attention_heads - - @property - def hidden_size(self) -> int: - return self.d_model - - @property - def attention_probs_dropout_prob(self) -> float: - return self.attention_dropout - - @property - def initializer_range(self) -> float: - return self.init_std + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + **kwargs, + ) diff --git a/src/transformers/models/lxmert/configuration_lxmert.py b/src/transformers/models/lxmert/configuration_lxmert.py index 85f191de6b..19741285e3 100644 --- a/src/transformers/models/lxmert/configuration_lxmert.py +++ b/src/transformers/models/lxmert/configuration_lxmert.py @@ -113,13 +113,13 @@ class LxmertConfig(PretrainedConfig): """ model_type = "lxmert" + attribute_map = {} def __init__( self, vocab_size=30522, hidden_size=768, num_attention_heads=12, - num_labels=2, num_qa_labels=9500, num_object_labels=1600, num_attr_labels=400, @@ -149,11 +149,9 @@ class LxmertConfig(PretrainedConfig): output_hidden_states=False, **kwargs, ): - super().__init__(**kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_attention_heads = num_attention_heads - self.num_labels = num_labels self.hidden_act = hidden_act self.intermediate_size = intermediate_size self.hidden_dropout_prob = hidden_dropout_prob @@ -179,5 +177,6 @@ class LxmertConfig(PretrainedConfig): self.visual_attr_loss = visual_attr_loss self.visual_feat_loss = visual_feat_loss self.output_hidden_states = output_hidden_states - self.output_attentions = self.output_attentions + self.output_attentions = output_attentions self.num_hidden_layers = {"vision": r_layers, "cross_encoder": x_layers, "language": l_layers} + super().__init__(**kwargs) diff --git a/src/transformers/models/m2m_100/configuration_m2m_100.py b/src/transformers/models/m2m_100/configuration_m2m_100.py index 725be8f796..765bcb4cd1 100644 --- a/src/transformers/models/m2m_100/configuration_m2m_100.py +++ b/src/transformers/models/m2m_100/configuration_m2m_100.py @@ -97,6 +97,7 @@ class M2M100Config(PretrainedConfig): """ model_type = "m2m_100" keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} def __init__( self, @@ -126,15 +127,6 @@ class M2M100Config(PretrainedConfig): eos_token_id=2, **kwargs ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - is_encoder_decoder=is_encoder_decoder, - decoder_start_token_id=decoder_start_token_id, - **kwargs, - ) - self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.d_model = d_model @@ -156,10 +148,11 @@ class M2M100Config(PretrainedConfig): self.gradient_checkpointing = gradient_checkpointing self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - @property - def num_attention_heads(self) -> int: - return self.encoder_attention_heads - - @property - def hidden_size(self) -> int: - return self.d_model + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + **kwargs, + ) diff --git a/src/transformers/models/marian/configuration_marian.py b/src/transformers/models/marian/configuration_marian.py index 15893eef30..1b974badfa 100644 --- a/src/transformers/models/marian/configuration_marian.py +++ b/src/transformers/models/marian/configuration_marian.py @@ -103,6 +103,7 @@ class MarianConfig(PretrainedConfig): """ model_type = "marian" keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} def __init__( self, @@ -133,15 +134,6 @@ class MarianConfig(PretrainedConfig): forced_eos_token_id=0, **kwargs ): - super().__init__( - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - is_encoder_decoder=is_encoder_decoder, - decoder_start_token_id=decoder_start_token_id, - forced_eos_token_id=forced_eos_token_id, - **kwargs, - ) - self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.d_model = d_model @@ -163,11 +155,11 @@ class MarianConfig(PretrainedConfig): self.num_hidden_layers = encoder_layers self.gradient_checkpointing = gradient_checkpointing self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - - @property - def num_attention_heads(self) -> int: - return self.encoder_attention_heads - - @property - def hidden_size(self) -> int: - return self.d_model + super().__init__( + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + forced_eos_token_id=forced_eos_token_id, + **kwargs, + ) diff --git a/src/transformers/models/mbart/configuration_mbart.py b/src/transformers/models/mbart/configuration_mbart.py index 610ebf46cb..05857241b4 100644 --- a/src/transformers/models/mbart/configuration_mbart.py +++ b/src/transformers/models/mbart/configuration_mbart.py @@ -107,6 +107,7 @@ class MBartConfig(PretrainedConfig): """ model_type = "mbart" keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} def __init__( self, @@ -137,15 +138,6 @@ class MBartConfig(PretrainedConfig): forced_eos_token_id=2, **kwargs ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - is_encoder_decoder=is_encoder_decoder, - forced_eos_token_id=forced_eos_token_id, - **kwargs, - ) - self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.d_model = d_model @@ -167,14 +159,14 @@ class MBartConfig(PretrainedConfig): self.num_hidden_layers = encoder_layers self.gradient_checkpointing = gradient_checkpointing self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - - @property - def num_attention_heads(self) -> int: - return self.encoder_attention_heads - - @property - def hidden_size(self) -> int: - return self.d_model + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + forced_eos_token_id=forced_eos_token_id, + **kwargs, + ) class MBartOnnxConfig(OnnxConfigWithPast): diff --git a/src/transformers/models/openai/configuration_openai.py b/src/transformers/models/openai/configuration_openai.py index 1e7bf8ec8c..f23fe74201 100644 --- a/src/transformers/models/openai/configuration_openai.py +++ b/src/transformers/models/openai/configuration_openai.py @@ -115,6 +115,12 @@ class OpenAIGPTConfig(PretrainedConfig): """ model_type = "openai-gpt" + attribute_map = { + "max_position_embeddings": "n_positions", + "hidden_size": "n_embd", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } def __init__( self, @@ -138,8 +144,6 @@ class OpenAIGPTConfig(PretrainedConfig): summary_first_dropout=0.1, **kwargs ): - super().__init__(**kwargs) - self.vocab_size = vocab_size self.n_ctx = n_ctx self.n_positions = n_positions @@ -158,19 +162,4 @@ class OpenAIGPTConfig(PretrainedConfig): self.summary_activation = summary_activation self.summary_first_dropout = summary_first_dropout self.summary_proj_to_labels = summary_proj_to_labels - - @property - def max_position_embeddings(self): - return self.n_positions - - @property - def hidden_size(self): - return self.n_embd - - @property - def num_attention_heads(self): - return self.n_head - - @property - def num_hidden_layers(self): - return self.n_layer + super().__init__(**kwargs) diff --git a/src/transformers/models/pegasus/configuration_pegasus.py b/src/transformers/models/pegasus/configuration_pegasus.py index 424458590c..2e815c2e48 100644 --- a/src/transformers/models/pegasus/configuration_pegasus.py +++ b/src/transformers/models/pegasus/configuration_pegasus.py @@ -103,6 +103,7 @@ class PegasusConfig(PretrainedConfig): """ model_type = "pegasus" keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} def __init__( self, @@ -133,15 +134,6 @@ class PegasusConfig(PretrainedConfig): forced_eos_token_id=1, **kwargs ): - super().__init__( - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - is_encoder_decoder=is_encoder_decoder, - decoder_start_token_id=decoder_start_token_id, - forced_eos_token_id=forced_eos_token_id, - **kwargs, - ) - self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.d_model = d_model @@ -163,6 +155,14 @@ class PegasusConfig(PretrainedConfig): self.num_hidden_layers = encoder_layers self.gradient_checkpointing = gradient_checkpointing self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + super().__init__( + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + forced_eos_token_id=forced_eos_token_id, + **kwargs, + ) @property def num_attention_heads(self) -> int: diff --git a/src/transformers/models/prophetnet/configuration_prophetnet.py b/src/transformers/models/prophetnet/configuration_prophetnet.py index 31097d9c01..c19e4a106f 100644 --- a/src/transformers/models/prophetnet/configuration_prophetnet.py +++ b/src/transformers/models/prophetnet/configuration_prophetnet.py @@ -97,6 +97,9 @@ class ProphetNetConfig(PretrainedConfig): """ model_type = "prophetnet" keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "num_attention_heads": "num_encoder_attention_heads", + } def __init__( self, @@ -129,15 +132,6 @@ class ProphetNetConfig(PretrainedConfig): eos_token_id=2, **kwargs ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - is_encoder_decoder=is_encoder_decoder, - add_cross_attention=add_cross_attention, - decoder_start_token_id=decoder_start_token_id, - **kwargs, - ) self.vocab_size = vocab_size self.hidden_size = hidden_size self.encoder_ffn_dim = encoder_ffn_dim @@ -167,10 +161,22 @@ class ProphetNetConfig(PretrainedConfig): # 4 Training Args (should be removed soon) self.gradient_checkpointing = gradient_checkpointing - @property - def num_attention_heads(self) -> int: - return self.num_encoder_attention_heads + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + add_cross_attention=add_cross_attention, + decoder_start_token_id=decoder_start_token_id, + **kwargs, + ) @property def num_hidden_layers(self) -> int: return self.num_encoder_layers + self.num_decoder_layers + + @num_hidden_layers.setter + def num_hidden_layers(self, value): + raise NotImplementedError( + "This model does not support the setting of `num_hidden_layers`. Please set `num_encoder_layers` and `num_decoder_layers`." + ) diff --git a/src/transformers/models/reformer/configuration_reformer.py b/src/transformers/models/reformer/configuration_reformer.py index b48fadfb32..406163a7ff 100755 --- a/src/transformers/models/reformer/configuration_reformer.py +++ b/src/transformers/models/reformer/configuration_reformer.py @@ -158,6 +158,7 @@ class ReformerConfig(PretrainedConfig): """ model_type = "reformer" keys_to_ignore_at_inference = ["past_buckets_states"] + attribute_map = {} def __init__( self, @@ -196,14 +197,6 @@ class ReformerConfig(PretrainedConfig): classifier_dropout=None, **kwargs ): - super().__init__( - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - is_decoder=is_decoder, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - self.hash_seed = hash_seed self.vocab_size = vocab_size self.attention_head_size = attention_head_size @@ -234,3 +227,10 @@ class ReformerConfig(PretrainedConfig): self.attn_layers = attn_layers self.use_cache = use_cache self.classifier_dropout = classifier_dropout + super().__init__( + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + is_decoder=is_decoder, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/src/transformers/models/speech_to_text/configuration_speech_to_text.py b/src/transformers/models/speech_to_text/configuration_speech_to_text.py index 760255d8aa..ff16601030 100644 --- a/src/transformers/models/speech_to_text/configuration_speech_to_text.py +++ b/src/transformers/models/speech_to_text/configuration_speech_to_text.py @@ -110,6 +110,7 @@ class Speech2TextConfig(PretrainedConfig): """ model_type = "speech_to_text" keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} def __init__( self, @@ -146,15 +147,6 @@ class Speech2TextConfig(PretrainedConfig): input_channels=1, **kwargs ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - is_encoder_decoder=is_encoder_decoder, - decoder_start_token_id=decoder_start_token_id, - **kwargs, - ) - self.vocab_size = vocab_size self.d_model = d_model self.encoder_ffn_dim = encoder_ffn_dim @@ -191,10 +183,11 @@ class Speech2TextConfig(PretrainedConfig): f"`config.num_conv_layers = {self.num_conv_layers}`." ) - @property - def num_attention_heads(self) -> int: - return self.encoder_attention_heads - - @property - def hidden_size(self) -> int: - return self.d_model + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + **kwargs, + ) diff --git a/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py index a26d1643c6..f1f9505990 100644 --- a/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py @@ -89,6 +89,7 @@ class Speech2Text2Config(PretrainedConfig): """ model_type = "speech_to_text_2" keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_attention_heads": "decoder_attention_heads", "hidden_size": "d_model"} def __init__( self, @@ -115,14 +116,6 @@ class Speech2Text2Config(PretrainedConfig): max_target_positions=1024, **kwargs ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - decoder_start_token_id=decoder_start_token_id, - **kwargs, - ) - self.vocab_size = vocab_size self.d_model = d_model self.decoder_ffn_dim = decoder_ffn_dim @@ -142,10 +135,10 @@ class Speech2Text2Config(PretrainedConfig): self.max_source_positions = max_source_positions self.max_target_positions = max_target_positions - @property - def num_attention_heads(self) -> int: - return self.decoder_attention_heads - - @property - def hidden_size(self) -> int: - return self.d_model + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + decoder_start_token_id=decoder_start_token_id, + **kwargs, + ) diff --git a/src/transformers/models/t5/configuration_t5.py b/src/transformers/models/t5/configuration_t5.py index 1a3c112503..9a40659127 100644 --- a/src/transformers/models/t5/configuration_t5.py +++ b/src/transformers/models/t5/configuration_t5.py @@ -82,6 +82,7 @@ class T5Config(PretrainedConfig): """ model_type = "t5" keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"hidden_size": "d_model", "num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"} def __init__( self, @@ -104,12 +105,6 @@ class T5Config(PretrainedConfig): gradient_checkpointing=False, **kwargs ): - super().__init__( - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - is_encoder_decoder=is_encoder_decoder, - **kwargs, - ) self.vocab_size = vocab_size self.d_model = d_model self.d_kv = d_kv @@ -126,18 +121,12 @@ class T5Config(PretrainedConfig): self.feed_forward_proj = feed_forward_proj self.use_cache = use_cache self.gradient_checkpointing = gradient_checkpointing - - @property - def hidden_size(self): - return self.d_model - - @property - def num_attention_heads(self): - return self.num_heads - - @property - def num_hidden_layers(self): - return self.num_layers + super().__init__( + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + **kwargs, + ) class T5OnnxConfig(OnnxConfigWithPast): diff --git a/src/transformers/models/transfo_xl/configuration_transfo_xl.py b/src/transformers/models/transfo_xl/configuration_transfo_xl.py index 1008f3488a..6787f0d022 100644 --- a/src/transformers/models/transfo_xl/configuration_transfo_xl.py +++ b/src/transformers/models/transfo_xl/configuration_transfo_xl.py @@ -106,6 +106,12 @@ class TransfoXLConfig(PretrainedConfig): model_type = "transfo-xl" keys_to_ignore_at_inference = ["mems"] + attribute_map = { + "n_token": "vocab_size", + "hidden_size": "d_model", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } def __init__( self, @@ -137,7 +143,6 @@ class TransfoXLConfig(PretrainedConfig): eos_token_id=0, **kwargs ): - super().__init__(eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.cutoffs = [] self.cutoffs.extend(cutoffs) @@ -167,6 +172,7 @@ class TransfoXLConfig(PretrainedConfig): self.proj_init_std = proj_init_std self.init_std = init_std self.layer_norm_epsilon = layer_norm_epsilon + super().__init__(eos_token_id=eos_token_id, **kwargs) @property def max_position_embeddings(self): @@ -174,22 +180,9 @@ class TransfoXLConfig(PretrainedConfig): logger.info(f"The model {self.model_type} is one of the few models that has no sequence length limit.") return -1 - @property - def n_token(self): # Backward compatibility - return self.vocab_size - - @n_token.setter - def n_token(self, value): # Backward compatibility - self.vocab_size = value - - @property - def hidden_size(self): - return self.d_model - - @property - def num_attention_heads(self): - return self.n_head - - @property - def num_hidden_layers(self): - return self.n_layer + @max_position_embeddings.setter + def max_position_embeddings(self, value): + # Message copied from Transformer-XL documentation + raise NotImplementedError( + f"The model {self.model_type} is one of the few models that has no sequence length limit." + ) diff --git a/src/transformers/models/xlm/configuration_xlm.py b/src/transformers/models/xlm/configuration_xlm.py index 839e4337ff..858bea96d3 100644 --- a/src/transformers/models/xlm/configuration_xlm.py +++ b/src/transformers/models/xlm/configuration_xlm.py @@ -146,6 +146,12 @@ class XLMConfig(PretrainedConfig): """ model_type = "xlm" + attribute_map = { + "hidden_size": "emb_dim", + "num_attention_heads": "n_heads", + "num_hidden_layers": "n_layers", + "n_words": "vocab_size", # For backward compatibility + } def __init__( self, @@ -185,7 +191,6 @@ class XLMConfig(PretrainedConfig): **kwargs ): """Constructs XLMConfig.""" - super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs) self.vocab_size = vocab_size self.emb_dim = emb_dim self.n_layers = n_layers @@ -221,22 +226,4 @@ class XLMConfig(PretrainedConfig): if "n_words" in kwargs: self.n_words = kwargs["n_words"] - @property - def n_words(self): # For backward compatibility - return self.vocab_size - - @n_words.setter - def n_words(self, value): # For backward compatibility - self.vocab_size = value - - @property - def hidden_size(self): - return self.emb_dim - - @property - def num_attention_heads(self): - return self.n_heads - - @property - def num_hidden_layers(self): - return self.n_layers + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs) diff --git a/src/transformers/models/xlnet/configuration_xlnet.py b/src/transformers/models/xlnet/configuration_xlnet.py index 5d06fb3e0f..1a87bcd9f4 100644 --- a/src/transformers/models/xlnet/configuration_xlnet.py +++ b/src/transformers/models/xlnet/configuration_xlnet.py @@ -137,6 +137,12 @@ class XLNetConfig(PretrainedConfig): model_type = "xlnet" keys_to_ignore_at_inference = ["mems"] + attribute_map = { + "n_token": "vocab_size", # Backward compatibility + "hidden_size": "d_model", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } def __init__( self, @@ -170,7 +176,6 @@ class XLNetConfig(PretrainedConfig): **kwargs ): """Constructs XLNetConfig.""" - super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.d_model = d_model self.n_layer = n_layer @@ -216,27 +221,16 @@ class XLNetConfig(PretrainedConfig): self.use_mems_eval = use_mems_eval self.use_mems_train = use_mems_train + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @property def max_position_embeddings(self): + logger.info(f"The model {self.model_type} is one of the few models that has no sequence length limit.") return -1 - @property - def n_token(self): # Backward compatibility - return self.vocab_size - - @n_token.setter - def n_token(self, value): # Backward compatibility - self.vocab_size = value - - @property - def hidden_size(self): - return self.d_model - - @property - def num_attention_heads(self): - return self.n_head - - @property - def num_hidden_layers(self): - return self.n_layer + @max_position_embeddings.setter + def max_position_embeddings(self, value): + # Message copied from Transformer-XL documentation + raise NotImplementedError( + f"The model {self.model_type} is one of the few models that has no sequence length limit." + ) diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py index 3b2a47894f..93da35a5d9 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py @@ -137,6 +137,15 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig): {% else -%} keys_to_ignore_at_inference = ["past_key_values"] {% endif -%} + + {% if cookiecutter.is_encoder_decoder_model == "False" %} + {%- else %} + attribute_map = { + "num_attention_heads": "encoder_attention_heads", + "hidden_size": "d_model" + } + + {%- endif %} def __init__( self, @@ -184,18 +193,6 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig): eos_token_id=2, **kwargs ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - {% if cookiecutter.is_encoder_decoder_model == "False" -%} - {% else -%} - is_encoder_decoder=is_encoder_decoder, - decoder_start_token_id=decoder_start_token_id, - {% endif -%} - **kwargs - ) - self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings {% if cookiecutter.is_encoder_decoder_model == "False" -%} @@ -232,14 +229,16 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig): self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True {% endif -%} + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + {% if cookiecutter.is_encoder_decoder_model == "False" -%} + {% else -%} + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + {% endif -%} + **kwargs + ) - {% if cookiecutter.is_encoder_decoder_model == "False" %} - {%- else %} - @property - def num_attention_heads(self) -> int: - return self.encoder_attention_heads - - @property - def hidden_size(self) -> int: - return self.d_model - {%- endif %} + \ No newline at end of file diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py index 8b98e5a0b2..97cdfe0455 100644 --- a/tests/test_configuration_common.py +++ b/tests/test_configuration_common.py @@ -34,11 +34,39 @@ class ConfigTester(object): def create_and_test_config_common_properties(self): config = self.config_class(**self.inputs_dict) + common_properties = ["hidden_size", "num_attention_heads", "num_hidden_layers"] + + # Add common fields for text models if self.has_text_modality: - self.parent.assertTrue(hasattr(config, "vocab_size")) - self.parent.assertTrue(hasattr(config, "hidden_size")) - self.parent.assertTrue(hasattr(config, "num_attention_heads")) - self.parent.assertTrue(hasattr(config, "num_hidden_layers")) + common_properties.extend(["vocab_size"]) + + # Test that config has the common properties as getters + for prop in common_properties: + self.parent.assertTrue(hasattr(config, prop), msg=f"`{prop}` does not exist") + + # Test that config has the common properties as setter + for idx, name in enumerate(common_properties): + try: + setattr(config, name, idx) + self.parent.assertEqual( + getattr(config, name), idx, msg=f"`{name} value {idx} expected, but was {getattr(config, name)}" + ) + except NotImplementedError: + # Some models might not be able to implement setters for common_properties + # In that case, a NotImplementedError is raised + pass + + # Test if config class can be called with Config(prop_name=..) + for idx, name in enumerate(common_properties): + try: + config = self.config_class(**{name: idx}) + self.parent.assertEqual( + getattr(config, name), idx, msg=f"`{name} value {idx} expected, but was {getattr(config, name)}" + ) + except NotImplementedError: + # Some models might not be able to implement setters for common_properties + # In that case, a NotImplementedError is raised + pass def create_and_test_config_to_json_string(self): config = self.config_class(**self.inputs_dict)