Update model configs - Allow setters for common properties (#13026)

* refactor GPT Config to allow dynamic properties

* make attribute_map a class attribute

* remove old code

* update unit test to test config: Add test for common properties setter

* update unit test to test config: Add test for common properties passed as parameters to __init__

* update to black code format

* Allow setters to be left undefined for certain config classes

* update config classes to implement attribute_map

* bugfix lxmert config - id2label was not defined when num_labels was set

* update broken configs - add attribute_maps

* update bart config

* update black codestyle

* update documentation on common config attributes

* update GPTJ config to new attribute map

* update docs on common attributes

* gptj config: add max_position_embeddings

* gptj config: format with black

* update speech to text 2 config

* format doc file to max_len 119

* update config template
This commit is contained in:
Nils Reimers
2021-09-06 16:30:13 +02:00
committed by GitHub
parent cf4eb8b3f9
commit c8be8a9adb
32 changed files with 326 additions and 440 deletions

View File

@@ -17,6 +17,11 @@ The base class :class:`~transformers.PretrainedConfig` implements the common met
either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded
from HuggingFace's AWS S3 repository). from HuggingFace's AWS S3 repository).
Each derived config class implements model-specific attributes. Common attributes that are present in all config
classes are: :obj:`hidden_size`, :obj:`num_attention_heads`, and :obj:`num_hidden_layers`. Text models further
implement: :obj:`vocab_size`.
PretrainedConfig PretrainedConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -57,6 +57,8 @@ class PretrainedConfig(PushToHubMixin):
:class:`~RagConfig`. :class:`~RagConfig`.
- **keys_to_ignore_at_inference** (:obj:`List[str]`) -- A list of keys to ignore by default when looking at - **keys_to_ignore_at_inference** (:obj:`List[str]`) -- A list of keys to ignore by default when looking at
dictionary outputs of the model during inference. dictionary outputs of the model during inference.
- **attribute_map** (:obj:`Dict[str, str]`) -- A dict that maps model-specific attribute names to the
standardized attribute names.
Common attributes (present in all subclasses) Common attributes (present in all subclasses)
@@ -218,6 +220,17 @@ class PretrainedConfig(PushToHubMixin):
""" """
model_type: str = "" model_type: str = ""
is_composition: bool = False is_composition: bool = False
attribute_map: Dict[str, str] = {}
def __setattr__(self, key, value):
if key in super().__getattribute__("attribute_map"):
key = super().__getattribute__("attribute_map")[key]
super().__setattr__(key, value)
def __getattribute__(self, key):
if key != "attribute_map" and key in super().__getattribute__("attribute_map"):
key = super().__getattribute__("attribute_map")[key]
return super().__getattribute__(key)
def __init__(self, **kwargs): def __init__(self, **kwargs):
# Attributes with defaults # Attributes with defaults
@@ -350,7 +363,7 @@ class PretrainedConfig(PushToHubMixin):
@num_labels.setter @num_labels.setter
def num_labels(self, num_labels: int): def num_labels(self, num_labels: int):
if self.id2label is None or len(self.id2label) != num_labels: if not hasattr(self, "id2label") or self.id2label is None or len(self.id2label) != num_labels:
self.id2label = {i: f"LABEL_{i}" for i in range(num_labels)} self.id2label = {i: f"LABEL_{i}" for i in range(num_labels)}
self.label2id = dict(zip(self.id2label.values(), self.id2label.keys())) self.label2id = dict(zip(self.id2label.values(), self.id2label.keys()))

View File

@@ -109,6 +109,7 @@ class BartConfig(PretrainedConfig):
""" """
model_type = "bart" model_type = "bart"
keys_to_ignore_at_inference = ["past_key_values"] keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__( def __init__(
self, self,
@@ -141,17 +142,6 @@ class BartConfig(PretrainedConfig):
forced_eos_token_id=2, forced_eos_token_id=2,
**kwargs **kwargs
): ):
super().__init__(
num_labels=num_labels,
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
forced_eos_token_id=forced_eos_token_id,
**kwargs,
)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.d_model = d_model self.d_model = d_model
@@ -174,6 +164,17 @@ class BartConfig(PretrainedConfig):
self.gradient_checkpointing = gradient_checkpointing self.gradient_checkpointing = gradient_checkpointing
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
super().__init__(
num_labels=num_labels,
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
forced_eos_token_id=forced_eos_token_id,
**kwargs,
)
# ensure backward compatibility for BART CNN models # ensure backward compatibility for BART CNN models
if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False): if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
self.forced_bos_token_id = self.bos_token_id self.forced_bos_token_id = self.bos_token_id
@@ -182,14 +183,6 @@ class BartConfig(PretrainedConfig):
"The config can simply be saved and uploaded again to be fixed." "The config can simply be saved and uploaded again to be fixed."
) )
@property
def num_attention_heads(self) -> int:
return self.encoder_attention_heads
@property
def hidden_size(self) -> int:
return self.d_model
class BartOnnxConfig(OnnxConfigWithPast): class BartOnnxConfig(OnnxConfigWithPast):
@property @property

View File

@@ -112,6 +112,11 @@ class BigBirdPegasusConfig(PretrainedConfig):
""" """
model_type = "bigbird_pegasus" model_type = "bigbird_pegasus"
keys_to_ignore_at_inference = ["past_key_values"] keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_attention_heads": "encoder_attention_heads",
"hidden_size": "d_model",
"attention_probs_dropout_prob": "attention_dropout",
}
def __init__( def __init__(
self, self,
@@ -146,15 +151,6 @@ class BigBirdPegasusConfig(PretrainedConfig):
use_bias=False, use_bias=False,
**kwargs **kwargs
): ):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.d_model = d_model self.d_model = d_model
@@ -183,14 +179,11 @@ class BigBirdPegasusConfig(PretrainedConfig):
self.num_random_blocks = num_random_blocks self.num_random_blocks = num_random_blocks
self.use_bias = use_bias self.use_bias = use_bias
@property super().__init__(
def num_attention_heads(self) -> int: pad_token_id=pad_token_id,
return self.encoder_attention_heads bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
@property is_encoder_decoder=is_encoder_decoder,
def hidden_size(self) -> int: decoder_start_token_id=decoder_start_token_id,
return self.d_model **kwargs,
)
@property
def attention_probs_dropout_prob(self) -> float:
return self.attention_dropout

View File

@@ -103,6 +103,7 @@ class BlenderbotConfig(PretrainedConfig):
""" """
model_type = "blenderbot" model_type = "blenderbot"
keys_to_ignore_at_inference = ["past_key_values"] keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__( def __init__(
self, self,
@@ -135,17 +136,6 @@ class BlenderbotConfig(PretrainedConfig):
forced_eos_token_id=2, forced_eos_token_id=2,
**kwargs **kwargs
): ):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
encoder_no_repeat_ngram_size=encoder_no_repeat_ngram_size,
forced_eos_token_id=forced_eos_token_id,
**kwargs,
)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.d_model = d_model self.d_model = d_model
@@ -168,10 +158,13 @@ class BlenderbotConfig(PretrainedConfig):
self.gradient_checkpointing = gradient_checkpointing self.gradient_checkpointing = gradient_checkpointing
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
@property super().__init__(
def num_attention_heads(self) -> int: pad_token_id=pad_token_id,
return self.encoder_attention_heads bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
@property is_encoder_decoder=is_encoder_decoder,
def hidden_size(self) -> int: decoder_start_token_id=decoder_start_token_id,
return self.d_model encoder_no_repeat_ngram_size=encoder_no_repeat_ngram_size,
forced_eos_token_id=forced_eos_token_id,
**kwargs,
)

View File

@@ -103,6 +103,7 @@ class BlenderbotSmallConfig(PretrainedConfig):
""" """
model_type = "blenderbot-small" model_type = "blenderbot-small"
keys_to_ignore_at_inference = ["past_key_values"] keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__( def __init__(
self, self,
@@ -134,16 +135,6 @@ class BlenderbotSmallConfig(PretrainedConfig):
forced_eos_token_id=2, forced_eos_token_id=2,
**kwargs **kwargs
): ):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
forced_eos_token_id=forced_eos_token_id,
**kwargs,
)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.d_model = d_model self.d_model = d_model
@@ -166,10 +157,12 @@ class BlenderbotSmallConfig(PretrainedConfig):
self.gradient_checkpointing = gradient_checkpointing self.gradient_checkpointing = gradient_checkpointing
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
@property super().__init__(
def num_attention_heads(self) -> int: pad_token_id=pad_token_id,
return self.encoder_attention_heads bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
@property is_encoder_decoder=is_encoder_decoder,
def hidden_size(self) -> int: decoder_start_token_id=decoder_start_token_id,
return self.d_model forced_eos_token_id=forced_eos_token_id,
**kwargs,
)

View File

@@ -81,6 +81,12 @@ class CTRLConfig(PretrainedConfig):
model_type = "ctrl" model_type = "ctrl"
keys_to_ignore_at_inference = ["past_key_values"] keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"max_position_embeddings": "n_positions",
"hidden_size": "n_embd",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__( def __init__(
self, self,
@@ -104,7 +110,6 @@ class CTRLConfig(PretrainedConfig):
use_cache=True, use_cache=True,
**kwargs **kwargs
): ):
super().__init__(**kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.n_ctx = n_ctx self.n_ctx = n_ctx
self.n_positions = n_positions self.n_positions = n_positions
@@ -125,18 +130,4 @@ class CTRLConfig(PretrainedConfig):
self.summary_proj_to_labels = summary_proj_to_labels self.summary_proj_to_labels = summary_proj_to_labels
self.use_cache = use_cache self.use_cache = use_cache
@property super().__init__(**kwargs)
def max_position_embeddings(self):
return self.n_positions
@property
def hidden_size(self):
return self.n_embd
@property
def num_attention_heads(self):
return self.n_head
@property
def num_hidden_layers(self):
return self.n_layer

View File

@@ -117,6 +117,10 @@ class DetrConfig(PretrainedConfig):
""" """
model_type = "detr" model_type = "detr"
keys_to_ignore_at_inference = ["past_key_values"] keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"hidden_size": "d_model",
"num_attention_heads": "encoder_attention_heads",
}
def __init__( def __init__(
self, self,
@@ -154,8 +158,6 @@ class DetrConfig(PretrainedConfig):
eos_coefficient=0.1, eos_coefficient=0.1,
**kwargs **kwargs
): ):
super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
self.num_queries = num_queries self.num_queries = num_queries
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.d_model = d_model self.d_model = d_model
@@ -189,6 +191,7 @@ class DetrConfig(PretrainedConfig):
self.bbox_loss_coefficient = bbox_loss_coefficient self.bbox_loss_coefficient = bbox_loss_coefficient
self.giou_loss_coefficient = giou_loss_coefficient self.giou_loss_coefficient = giou_loss_coefficient
self.eos_coefficient = eos_coefficient self.eos_coefficient = eos_coefficient
super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
@property @property
def num_attention_heads(self) -> int: def num_attention_heads(self) -> int:

View File

@@ -93,6 +93,11 @@ class DistilBertConfig(PretrainedConfig):
>>> configuration = model.config >>> configuration = model.config
""" """
model_type = "distilbert" model_type = "distilbert"
attribute_map = {
"hidden_size": "dim",
"num_attention_heads": "n_heads",
"num_hidden_layers": "n_layers",
}
def __init__( def __init__(
self, self,
@@ -112,7 +117,6 @@ class DistilBertConfig(PretrainedConfig):
pad_token_id=0, pad_token_id=0,
**kwargs **kwargs
): ):
super().__init__(**kwargs, pad_token_id=pad_token_id)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.sinusoidal_pos_embds = sinusoidal_pos_embds self.sinusoidal_pos_embds = sinusoidal_pos_embds
@@ -126,18 +130,7 @@ class DistilBertConfig(PretrainedConfig):
self.initializer_range = initializer_range self.initializer_range = initializer_range
self.qa_dropout = qa_dropout self.qa_dropout = qa_dropout
self.seq_classif_dropout = seq_classif_dropout self.seq_classif_dropout = seq_classif_dropout
super().__init__(**kwargs, pad_token_id=pad_token_id)
@property
def hidden_size(self):
return self.dim
@property
def num_attention_heads(self):
return self.n_heads
@property
def num_hidden_layers(self):
return self.n_layers
class DistilBertOnnxConfig(OnnxConfig): class DistilBertOnnxConfig(OnnxConfig):

View File

@@ -136,6 +136,6 @@ class FlaubertConfig(XLMConfig):
def __init__(self, layerdrop=0.0, pre_norm=False, pad_token_id=2, bos_token_id=0, **kwargs): def __init__(self, layerdrop=0.0, pre_norm=False, pad_token_id=2, bos_token_id=0, **kwargs):
"""Constructs FlaubertConfig.""" """Constructs FlaubertConfig."""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
self.layerdrop = layerdrop self.layerdrop = layerdrop
self.pre_norm = pre_norm self.pre_norm = pre_norm
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)

View File

@@ -124,6 +124,7 @@ class FSMTConfig(PretrainedConfig):
""" """
model_type = "fsmt" model_type = "fsmt"
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
# update the defaults from config file # update the defaults from config file
def __init__( def __init__(
@@ -161,18 +162,6 @@ class FSMTConfig(PretrainedConfig):
forced_eos_token_id=2, forced_eos_token_id=2,
**common_kwargs **common_kwargs
): ):
if "hidden_size" in common_kwargs:
raise ValueError("hidden size is called d_model")
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
decoder_start_token_id=decoder_start_token_id,
is_encoder_decoder=is_encoder_decoder,
tie_word_embeddings=tie_word_embeddings,
forced_eos_token_id=forced_eos_token_id,
**common_kwargs,
)
self.langs = langs self.langs = langs
self.src_vocab_size = src_vocab_size self.src_vocab_size = src_vocab_size
self.tgt_vocab_size = tgt_vocab_size self.tgt_vocab_size = tgt_vocab_size
@@ -196,6 +185,8 @@ class FSMTConfig(PretrainedConfig):
self.early_stopping = early_stopping self.early_stopping = early_stopping
self.decoder = DecoderConfig(vocab_size=tgt_vocab_size, bos_token_id=eos_token_id) self.decoder = DecoderConfig(vocab_size=tgt_vocab_size, bos_token_id=eos_token_id)
if "decoder" in common_kwargs:
del common_kwargs["decoder"]
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
@@ -205,14 +196,16 @@ class FSMTConfig(PretrainedConfig):
self.dropout = dropout self.dropout = dropout
self.use_cache = use_cache self.use_cache = use_cache
super().__init__(
@property pad_token_id=pad_token_id,
def num_attention_heads(self) -> int: bos_token_id=bos_token_id,
return self.encoder_attention_heads eos_token_id=eos_token_id,
decoder_start_token_id=decoder_start_token_id,
@property is_encoder_decoder=is_encoder_decoder,
def hidden_size(self) -> int: tie_word_embeddings=tie_word_embeddings,
return self.d_model forced_eos_token_id=forced_eos_token_id,
**common_kwargs,
)
def to_dict(self): def to_dict(self):
""" """

View File

@@ -102,6 +102,10 @@ class FunnelConfig(PretrainedConfig):
Whether or not to apply the pooling only to the query or to query, key and values for the attention layers. Whether or not to apply the pooling only to the query or to query, key and values for the attention layers.
""" """
model_type = "funnel" model_type = "funnel"
attribute_map = {
"hidden_size": "d_model",
"num_attention_heads": "n_head",
}
def __init__( def __init__(
self, self,
@@ -129,8 +133,6 @@ class FunnelConfig(PretrainedConfig):
pool_q_only=True, pool_q_only=True,
**kwargs **kwargs
): ):
super().__init__(**kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.block_sizes = block_sizes self.block_sizes = block_sizes
self.block_repeats = [1] * len(block_sizes) if block_repeats is None else block_repeats self.block_repeats = [1] * len(block_sizes) if block_repeats is None else block_repeats
@@ -165,18 +167,22 @@ class FunnelConfig(PretrainedConfig):
self.truncate_seq = truncate_seq self.truncate_seq = truncate_seq
self.pool_q_only = pool_q_only self.pool_q_only = pool_q_only
@property super().__init__(**kwargs)
def hidden_size(self):
return self.d_model
@property
def num_attention_heads(self):
return self.n_head
@property @property
def num_hidden_layers(self): def num_hidden_layers(self):
return sum(self.block_sizes) return sum(self.block_sizes)
@num_hidden_layers.setter
def num_hidden_layers(self, value):
raise NotImplementedError(
"This model does not support the setting of `num_hidden_layers`. Please set `block_sizes`."
)
@property @property
def num_blocks(self): def num_blocks(self):
return len(self.block_sizes) return len(self.block_sizes)
@num_blocks.setter
def num_blocks(self, value):
raise NotImplementedError("This model does not support the setting of `num_blocks`. Please set `block_sizes`.")

View File

@@ -130,6 +130,12 @@ class GPT2Config(PretrainedConfig):
model_type = "gpt2" model_type = "gpt2"
keys_to_ignore_at_inference = ["past_key_values"] keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"hidden_size": "n_embd",
"max_position_embeddings": "n_positions",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__( def __init__(
self, self,
@@ -158,8 +164,6 @@ class GPT2Config(PretrainedConfig):
eos_token_id=50256, eos_token_id=50256,
**kwargs **kwargs
): ):
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.n_ctx = n_ctx self.n_ctx = n_ctx
self.n_positions = n_positions self.n_positions = n_positions
@@ -185,21 +189,7 @@ class GPT2Config(PretrainedConfig):
self.bos_token_id = bos_token_id self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id self.eos_token_id = eos_token_id
@property super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
def max_position_embeddings(self):
return self.n_positions
@property
def hidden_size(self):
return self.n_embd
@property
def num_attention_heads(self):
return self.n_head
@property
def num_hidden_layers(self):
return self.n_layer
class GPT2OnnxConfig(OnnxConfigWithPast): class GPT2OnnxConfig(OnnxConfigWithPast):

View File

@@ -96,6 +96,7 @@ class GPTNeoConfig(PretrainedConfig):
>>> configuration = model.config >>> configuration = model.config
""" """
model_type = "gpt_neo" model_type = "gpt_neo"
attribute_map = {"num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}
def __init__( def __init__(
self, self,
@@ -124,8 +125,6 @@ class GPTNeoConfig(PretrainedConfig):
eos_token_id=50256, eos_token_id=50256,
**kwargs **kwargs
): ):
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size self.hidden_size = hidden_size
@@ -163,6 +162,8 @@ class GPTNeoConfig(PretrainedConfig):
"Please verify the value of `config.attention_types` argument." "Please verify the value of `config.attention_types` argument."
) )
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@staticmethod @staticmethod
def expand_attention_types_params(attention_types): def expand_attention_types_params(attention_types):
attentions = [] attentions = []
@@ -171,14 +172,6 @@ class GPTNeoConfig(PretrainedConfig):
attentions.extend(item[0]) attentions.extend(item[0])
return attentions return attentions
@property
def num_attention_heads(self):
return self.num_heads
@property
def num_hidden_layers(self):
return self.num_layers
def custom_unfold(input, dimension, size, step): def custom_unfold(input, dimension, size, step):
"""Custom torch.Tensor.unfold implementation to enable the export to ONNX.""" """Custom torch.Tensor.unfold implementation to enable the export to ONNX."""

View File

@@ -87,6 +87,12 @@ class GPTJConfig(PretrainedConfig):
>>> configuration = model.config >>> configuration = model.config
""" """
model_type = "gptj" model_type = "gptj"
attribute_map = {
"max_position_embeddings": "n_positions",
"hidden_size": "n_embd",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__( def __init__(
self, self,
@@ -111,8 +117,6 @@ class GPTJConfig(PretrainedConfig):
eos_token_id=50256, eos_token_id=50256,
**kwargs **kwargs
): ):
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.n_ctx = n_ctx self.n_ctx = n_ctx
self.n_positions = n_positions self.n_positions = n_positions
@@ -134,18 +138,4 @@ class GPTJConfig(PretrainedConfig):
self.bos_token_id = bos_token_id self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id self.eos_token_id = eos_token_id
@property super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
def max_position_embeddings(self):
return self.n_positions
@property
def hidden_size(self):
return self.n_embd
@property
def num_attention_heads(self):
return self.n_head
@property
def num_hidden_layers(self):
return self.n_layer

View File

@@ -99,6 +99,12 @@ class LEDConfig(PretrainedConfig):
>>> configuration = model.config >>> configuration = model.config
""" """
model_type = "led" model_type = "led"
attribute_map = {
"num_attention_heads": "encoder_attention_heads",
"hidden_size": "d_model",
"attention_probs_dropout_prob": "attention_dropout",
"initializer_range": "init_std",
}
def __init__( def __init__(
self, self,
@@ -130,15 +136,6 @@ class LEDConfig(PretrainedConfig):
attention_window: Union[List[int], int] = 512, attention_window: Union[List[int], int] = 512,
**kwargs **kwargs
): ):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.max_encoder_position_embeddings = max_encoder_position_embeddings self.max_encoder_position_embeddings = max_encoder_position_embeddings
self.max_decoder_position_embeddings = max_decoder_position_embeddings self.max_decoder_position_embeddings = max_decoder_position_embeddings
@@ -162,18 +159,11 @@ class LEDConfig(PretrainedConfig):
self.attention_window = attention_window self.attention_window = attention_window
self.gradient_checkpointing = gradient_checkpointing self.gradient_checkpointing = gradient_checkpointing
@property super().__init__(
def num_attention_heads(self) -> int: pad_token_id=pad_token_id,
return self.encoder_attention_heads bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
@property is_encoder_decoder=is_encoder_decoder,
def hidden_size(self) -> int: decoder_start_token_id=decoder_start_token_id,
return self.d_model **kwargs,
)
@property
def attention_probs_dropout_prob(self) -> float:
return self.attention_dropout
@property
def initializer_range(self) -> float:
return self.init_std

View File

@@ -113,13 +113,13 @@ class LxmertConfig(PretrainedConfig):
""" """
model_type = "lxmert" model_type = "lxmert"
attribute_map = {}
def __init__( def __init__(
self, self,
vocab_size=30522, vocab_size=30522,
hidden_size=768, hidden_size=768,
num_attention_heads=12, num_attention_heads=12,
num_labels=2,
num_qa_labels=9500, num_qa_labels=9500,
num_object_labels=1600, num_object_labels=1600,
num_attr_labels=400, num_attr_labels=400,
@@ -149,11 +149,9 @@ class LxmertConfig(PretrainedConfig):
output_hidden_states=False, output_hidden_states=False,
**kwargs, **kwargs,
): ):
super().__init__(**kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads self.num_attention_heads = num_attention_heads
self.num_labels = num_labels
self.hidden_act = hidden_act self.hidden_act = hidden_act
self.intermediate_size = intermediate_size self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob self.hidden_dropout_prob = hidden_dropout_prob
@@ -179,5 +177,6 @@ class LxmertConfig(PretrainedConfig):
self.visual_attr_loss = visual_attr_loss self.visual_attr_loss = visual_attr_loss
self.visual_feat_loss = visual_feat_loss self.visual_feat_loss = visual_feat_loss
self.output_hidden_states = output_hidden_states self.output_hidden_states = output_hidden_states
self.output_attentions = self.output_attentions self.output_attentions = output_attentions
self.num_hidden_layers = {"vision": r_layers, "cross_encoder": x_layers, "language": l_layers} self.num_hidden_layers = {"vision": r_layers, "cross_encoder": x_layers, "language": l_layers}
super().__init__(**kwargs)

View File

@@ -97,6 +97,7 @@ class M2M100Config(PretrainedConfig):
""" """
model_type = "m2m_100" model_type = "m2m_100"
keys_to_ignore_at_inference = ["past_key_values"] keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__( def __init__(
self, self,
@@ -126,15 +127,6 @@ class M2M100Config(PretrainedConfig):
eos_token_id=2, eos_token_id=2,
**kwargs **kwargs
): ):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.d_model = d_model self.d_model = d_model
@@ -156,10 +148,11 @@ class M2M100Config(PretrainedConfig):
self.gradient_checkpointing = gradient_checkpointing self.gradient_checkpointing = gradient_checkpointing
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
@property super().__init__(
def num_attention_heads(self) -> int: pad_token_id=pad_token_id,
return self.encoder_attention_heads bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
@property is_encoder_decoder=is_encoder_decoder,
def hidden_size(self) -> int: decoder_start_token_id=decoder_start_token_id,
return self.d_model **kwargs,
)

View File

@@ -103,6 +103,7 @@ class MarianConfig(PretrainedConfig):
""" """
model_type = "marian" model_type = "marian"
keys_to_ignore_at_inference = ["past_key_values"] keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__( def __init__(
self, self,
@@ -133,15 +134,6 @@ class MarianConfig(PretrainedConfig):
forced_eos_token_id=0, forced_eos_token_id=0,
**kwargs **kwargs
): ):
super().__init__(
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
forced_eos_token_id=forced_eos_token_id,
**kwargs,
)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.d_model = d_model self.d_model = d_model
@@ -163,11 +155,11 @@ class MarianConfig(PretrainedConfig):
self.num_hidden_layers = encoder_layers self.num_hidden_layers = encoder_layers
self.gradient_checkpointing = gradient_checkpointing self.gradient_checkpointing = gradient_checkpointing
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
super().__init__(
@property pad_token_id=pad_token_id,
def num_attention_heads(self) -> int: eos_token_id=eos_token_id,
return self.encoder_attention_heads is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
@property forced_eos_token_id=forced_eos_token_id,
def hidden_size(self) -> int: **kwargs,
return self.d_model )

View File

@@ -107,6 +107,7 @@ class MBartConfig(PretrainedConfig):
""" """
model_type = "mbart" model_type = "mbart"
keys_to_ignore_at_inference = ["past_key_values"] keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__( def __init__(
self, self,
@@ -137,15 +138,6 @@ class MBartConfig(PretrainedConfig):
forced_eos_token_id=2, forced_eos_token_id=2,
**kwargs **kwargs
): ):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
forced_eos_token_id=forced_eos_token_id,
**kwargs,
)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.d_model = d_model self.d_model = d_model
@@ -167,14 +159,14 @@ class MBartConfig(PretrainedConfig):
self.num_hidden_layers = encoder_layers self.num_hidden_layers = encoder_layers
self.gradient_checkpointing = gradient_checkpointing self.gradient_checkpointing = gradient_checkpointing
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
super().__init__(
@property pad_token_id=pad_token_id,
def num_attention_heads(self) -> int: bos_token_id=bos_token_id,
return self.encoder_attention_heads eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
@property forced_eos_token_id=forced_eos_token_id,
def hidden_size(self) -> int: **kwargs,
return self.d_model )
class MBartOnnxConfig(OnnxConfigWithPast): class MBartOnnxConfig(OnnxConfigWithPast):

View File

@@ -115,6 +115,12 @@ class OpenAIGPTConfig(PretrainedConfig):
""" """
model_type = "openai-gpt" model_type = "openai-gpt"
attribute_map = {
"max_position_embeddings": "n_positions",
"hidden_size": "n_embd",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__( def __init__(
self, self,
@@ -138,8 +144,6 @@ class OpenAIGPTConfig(PretrainedConfig):
summary_first_dropout=0.1, summary_first_dropout=0.1,
**kwargs **kwargs
): ):
super().__init__(**kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.n_ctx = n_ctx self.n_ctx = n_ctx
self.n_positions = n_positions self.n_positions = n_positions
@@ -158,19 +162,4 @@ class OpenAIGPTConfig(PretrainedConfig):
self.summary_activation = summary_activation self.summary_activation = summary_activation
self.summary_first_dropout = summary_first_dropout self.summary_first_dropout = summary_first_dropout
self.summary_proj_to_labels = summary_proj_to_labels self.summary_proj_to_labels = summary_proj_to_labels
super().__init__(**kwargs)
@property
def max_position_embeddings(self):
return self.n_positions
@property
def hidden_size(self):
return self.n_embd
@property
def num_attention_heads(self):
return self.n_head
@property
def num_hidden_layers(self):
return self.n_layer

View File

@@ -103,6 +103,7 @@ class PegasusConfig(PretrainedConfig):
""" """
model_type = "pegasus" model_type = "pegasus"
keys_to_ignore_at_inference = ["past_key_values"] keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__( def __init__(
self, self,
@@ -133,15 +134,6 @@ class PegasusConfig(PretrainedConfig):
forced_eos_token_id=1, forced_eos_token_id=1,
**kwargs **kwargs
): ):
super().__init__(
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
forced_eos_token_id=forced_eos_token_id,
**kwargs,
)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.d_model = d_model self.d_model = d_model
@@ -163,6 +155,14 @@ class PegasusConfig(PretrainedConfig):
self.num_hidden_layers = encoder_layers self.num_hidden_layers = encoder_layers
self.gradient_checkpointing = gradient_checkpointing self.gradient_checkpointing = gradient_checkpointing
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
super().__init__(
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
forced_eos_token_id=forced_eos_token_id,
**kwargs,
)
@property @property
def num_attention_heads(self) -> int: def num_attention_heads(self) -> int:

View File

@@ -97,6 +97,9 @@ class ProphetNetConfig(PretrainedConfig):
""" """
model_type = "prophetnet" model_type = "prophetnet"
keys_to_ignore_at_inference = ["past_key_values"] keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_attention_heads": "num_encoder_attention_heads",
}
def __init__( def __init__(
self, self,
@@ -129,15 +132,6 @@ class ProphetNetConfig(PretrainedConfig):
eos_token_id=2, eos_token_id=2,
**kwargs **kwargs
): ):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
add_cross_attention=add_cross_attention,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.encoder_ffn_dim = encoder_ffn_dim self.encoder_ffn_dim = encoder_ffn_dim
@@ -167,10 +161,22 @@ class ProphetNetConfig(PretrainedConfig):
# 4 Training Args (should be removed soon) # 4 Training Args (should be removed soon)
self.gradient_checkpointing = gradient_checkpointing self.gradient_checkpointing = gradient_checkpointing
@property super().__init__(
def num_attention_heads(self) -> int: pad_token_id=pad_token_id,
return self.num_encoder_attention_heads bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
add_cross_attention=add_cross_attention,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
@property @property
def num_hidden_layers(self) -> int: def num_hidden_layers(self) -> int:
return self.num_encoder_layers + self.num_decoder_layers return self.num_encoder_layers + self.num_decoder_layers
@num_hidden_layers.setter
def num_hidden_layers(self, value):
raise NotImplementedError(
"This model does not support the setting of `num_hidden_layers`. Please set `num_encoder_layers` and `num_decoder_layers`."
)

View File

@@ -158,6 +158,7 @@ class ReformerConfig(PretrainedConfig):
""" """
model_type = "reformer" model_type = "reformer"
keys_to_ignore_at_inference = ["past_buckets_states"] keys_to_ignore_at_inference = ["past_buckets_states"]
attribute_map = {}
def __init__( def __init__(
self, self,
@@ -196,14 +197,6 @@ class ReformerConfig(PretrainedConfig):
classifier_dropout=None, classifier_dropout=None,
**kwargs **kwargs
): ):
super().__init__(
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
is_decoder=is_decoder,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
self.hash_seed = hash_seed self.hash_seed = hash_seed
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.attention_head_size = attention_head_size self.attention_head_size = attention_head_size
@@ -234,3 +227,10 @@ class ReformerConfig(PretrainedConfig):
self.attn_layers = attn_layers self.attn_layers = attn_layers
self.use_cache = use_cache self.use_cache = use_cache
self.classifier_dropout = classifier_dropout self.classifier_dropout = classifier_dropout
super().__init__(
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
is_decoder=is_decoder,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)

View File

@@ -110,6 +110,7 @@ class Speech2TextConfig(PretrainedConfig):
""" """
model_type = "speech_to_text" model_type = "speech_to_text"
keys_to_ignore_at_inference = ["past_key_values"] keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__( def __init__(
self, self,
@@ -146,15 +147,6 @@ class Speech2TextConfig(PretrainedConfig):
input_channels=1, input_channels=1,
**kwargs **kwargs
): ):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.d_model = d_model self.d_model = d_model
self.encoder_ffn_dim = encoder_ffn_dim self.encoder_ffn_dim = encoder_ffn_dim
@@ -191,10 +183,11 @@ class Speech2TextConfig(PretrainedConfig):
f"`config.num_conv_layers = {self.num_conv_layers}`." f"`config.num_conv_layers = {self.num_conv_layers}`."
) )
@property super().__init__(
def num_attention_heads(self) -> int: pad_token_id=pad_token_id,
return self.encoder_attention_heads bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
@property is_encoder_decoder=is_encoder_decoder,
def hidden_size(self) -> int: decoder_start_token_id=decoder_start_token_id,
return self.d_model **kwargs,
)

View File

@@ -89,6 +89,7 @@ class Speech2Text2Config(PretrainedConfig):
""" """
model_type = "speech_to_text_2" model_type = "speech_to_text_2"
keys_to_ignore_at_inference = ["past_key_values"] keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "decoder_attention_heads", "hidden_size": "d_model"}
def __init__( def __init__(
self, self,
@@ -115,14 +116,6 @@ class Speech2Text2Config(PretrainedConfig):
max_target_positions=1024, max_target_positions=1024,
**kwargs **kwargs
): ):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.d_model = d_model self.d_model = d_model
self.decoder_ffn_dim = decoder_ffn_dim self.decoder_ffn_dim = decoder_ffn_dim
@@ -142,10 +135,10 @@ class Speech2Text2Config(PretrainedConfig):
self.max_source_positions = max_source_positions self.max_source_positions = max_source_positions
self.max_target_positions = max_target_positions self.max_target_positions = max_target_positions
@property super().__init__(
def num_attention_heads(self) -> int: pad_token_id=pad_token_id,
return self.decoder_attention_heads bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
@property decoder_start_token_id=decoder_start_token_id,
def hidden_size(self) -> int: **kwargs,
return self.d_model )

View File

@@ -82,6 +82,7 @@ class T5Config(PretrainedConfig):
""" """
model_type = "t5" model_type = "t5"
keys_to_ignore_at_inference = ["past_key_values"] keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"hidden_size": "d_model", "num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}
def __init__( def __init__(
self, self,
@@ -104,12 +105,6 @@ class T5Config(PretrainedConfig):
gradient_checkpointing=False, gradient_checkpointing=False,
**kwargs **kwargs
): ):
super().__init__(
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
**kwargs,
)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.d_model = d_model self.d_model = d_model
self.d_kv = d_kv self.d_kv = d_kv
@@ -126,18 +121,12 @@ class T5Config(PretrainedConfig):
self.feed_forward_proj = feed_forward_proj self.feed_forward_proj = feed_forward_proj
self.use_cache = use_cache self.use_cache = use_cache
self.gradient_checkpointing = gradient_checkpointing self.gradient_checkpointing = gradient_checkpointing
super().__init__(
@property pad_token_id=pad_token_id,
def hidden_size(self): eos_token_id=eos_token_id,
return self.d_model is_encoder_decoder=is_encoder_decoder,
**kwargs,
@property )
def num_attention_heads(self):
return self.num_heads
@property
def num_hidden_layers(self):
return self.num_layers
class T5OnnxConfig(OnnxConfigWithPast): class T5OnnxConfig(OnnxConfigWithPast):

View File

@@ -106,6 +106,12 @@ class TransfoXLConfig(PretrainedConfig):
model_type = "transfo-xl" model_type = "transfo-xl"
keys_to_ignore_at_inference = ["mems"] keys_to_ignore_at_inference = ["mems"]
attribute_map = {
"n_token": "vocab_size",
"hidden_size": "d_model",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__( def __init__(
self, self,
@@ -137,7 +143,6 @@ class TransfoXLConfig(PretrainedConfig):
eos_token_id=0, eos_token_id=0,
**kwargs **kwargs
): ):
super().__init__(eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.cutoffs = [] self.cutoffs = []
self.cutoffs.extend(cutoffs) self.cutoffs.extend(cutoffs)
@@ -167,6 +172,7 @@ class TransfoXLConfig(PretrainedConfig):
self.proj_init_std = proj_init_std self.proj_init_std = proj_init_std
self.init_std = init_std self.init_std = init_std
self.layer_norm_epsilon = layer_norm_epsilon self.layer_norm_epsilon = layer_norm_epsilon
super().__init__(eos_token_id=eos_token_id, **kwargs)
@property @property
def max_position_embeddings(self): def max_position_embeddings(self):
@@ -174,22 +180,9 @@ class TransfoXLConfig(PretrainedConfig):
logger.info(f"The model {self.model_type} is one of the few models that has no sequence length limit.") logger.info(f"The model {self.model_type} is one of the few models that has no sequence length limit.")
return -1 return -1
@property @max_position_embeddings.setter
def n_token(self): # Backward compatibility def max_position_embeddings(self, value):
return self.vocab_size # Message copied from Transformer-XL documentation
raise NotImplementedError(
@n_token.setter f"The model {self.model_type} is one of the few models that has no sequence length limit."
def n_token(self, value): # Backward compatibility )
self.vocab_size = value
@property
def hidden_size(self):
return self.d_model
@property
def num_attention_heads(self):
return self.n_head
@property
def num_hidden_layers(self):
return self.n_layer

View File

@@ -146,6 +146,12 @@ class XLMConfig(PretrainedConfig):
""" """
model_type = "xlm" model_type = "xlm"
attribute_map = {
"hidden_size": "emb_dim",
"num_attention_heads": "n_heads",
"num_hidden_layers": "n_layers",
"n_words": "vocab_size", # For backward compatibility
}
def __init__( def __init__(
self, self,
@@ -185,7 +191,6 @@ class XLMConfig(PretrainedConfig):
**kwargs **kwargs
): ):
"""Constructs XLMConfig.""" """Constructs XLMConfig."""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.emb_dim = emb_dim self.emb_dim = emb_dim
self.n_layers = n_layers self.n_layers = n_layers
@@ -221,22 +226,4 @@ class XLMConfig(PretrainedConfig):
if "n_words" in kwargs: if "n_words" in kwargs:
self.n_words = kwargs["n_words"] self.n_words = kwargs["n_words"]
@property super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
def n_words(self): # For backward compatibility
return self.vocab_size
@n_words.setter
def n_words(self, value): # For backward compatibility
self.vocab_size = value
@property
def hidden_size(self):
return self.emb_dim
@property
def num_attention_heads(self):
return self.n_heads
@property
def num_hidden_layers(self):
return self.n_layers

View File

@@ -137,6 +137,12 @@ class XLNetConfig(PretrainedConfig):
model_type = "xlnet" model_type = "xlnet"
keys_to_ignore_at_inference = ["mems"] keys_to_ignore_at_inference = ["mems"]
attribute_map = {
"n_token": "vocab_size", # Backward compatibility
"hidden_size": "d_model",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__( def __init__(
self, self,
@@ -170,7 +176,6 @@ class XLNetConfig(PretrainedConfig):
**kwargs **kwargs
): ):
"""Constructs XLNetConfig.""" """Constructs XLNetConfig."""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.d_model = d_model self.d_model = d_model
self.n_layer = n_layer self.n_layer = n_layer
@@ -216,27 +221,16 @@ class XLNetConfig(PretrainedConfig):
self.use_mems_eval = use_mems_eval self.use_mems_eval = use_mems_eval
self.use_mems_train = use_mems_train self.use_mems_train = use_mems_train
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@property @property
def max_position_embeddings(self): def max_position_embeddings(self):
logger.info(f"The model {self.model_type} is one of the few models that has no sequence length limit.")
return -1 return -1
@property @max_position_embeddings.setter
def n_token(self): # Backward compatibility def max_position_embeddings(self, value):
return self.vocab_size # Message copied from Transformer-XL documentation
raise NotImplementedError(
@n_token.setter f"The model {self.model_type} is one of the few models that has no sequence length limit."
def n_token(self, value): # Backward compatibility )
self.vocab_size = value
@property
def hidden_size(self):
return self.d_model
@property
def num_attention_heads(self):
return self.n_head
@property
def num_hidden_layers(self):
return self.n_layer

View File

@@ -138,6 +138,15 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):
keys_to_ignore_at_inference = ["past_key_values"] keys_to_ignore_at_inference = ["past_key_values"]
{% endif -%} {% endif -%}
{% if cookiecutter.is_encoder_decoder_model == "False" %}
{%- else %}
attribute_map = {
"num_attention_heads": "encoder_attention_heads",
"hidden_size": "d_model"
}
{%- endif %}
def __init__( def __init__(
self, self,
{% if cookiecutter.is_encoder_decoder_model == "False" -%} {% if cookiecutter.is_encoder_decoder_model == "False" -%}
@@ -184,18 +193,6 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):
eos_token_id=2, eos_token_id=2,
**kwargs **kwargs
): ):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
{% if cookiecutter.is_encoder_decoder_model == "False" -%}
{% else -%}
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
{% endif -%}
**kwargs
)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
{% if cookiecutter.is_encoder_decoder_model == "False" -%} {% if cookiecutter.is_encoder_decoder_model == "False" -%}
@@ -232,14 +229,16 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
{% endif -%} {% endif -%}
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
{% if cookiecutter.is_encoder_decoder_model == "False" -%}
{% else -%}
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
{% endif -%}
**kwargs
)
{% if cookiecutter.is_encoder_decoder_model == "False" %}
{%- else %}
@property
def num_attention_heads(self) -> int:
return self.encoder_attention_heads
@property
def hidden_size(self) -> int:
return self.d_model
{%- endif %}

View File

@@ -34,11 +34,39 @@ class ConfigTester(object):
def create_and_test_config_common_properties(self): def create_and_test_config_common_properties(self):
config = self.config_class(**self.inputs_dict) config = self.config_class(**self.inputs_dict)
common_properties = ["hidden_size", "num_attention_heads", "num_hidden_layers"]
# Add common fields for text models
if self.has_text_modality: if self.has_text_modality:
self.parent.assertTrue(hasattr(config, "vocab_size")) common_properties.extend(["vocab_size"])
self.parent.assertTrue(hasattr(config, "hidden_size"))
self.parent.assertTrue(hasattr(config, "num_attention_heads")) # Test that config has the common properties as getters
self.parent.assertTrue(hasattr(config, "num_hidden_layers")) for prop in common_properties:
self.parent.assertTrue(hasattr(config, prop), msg=f"`{prop}` does not exist")
# Test that config has the common properties as setter
for idx, name in enumerate(common_properties):
try:
setattr(config, name, idx)
self.parent.assertEqual(
getattr(config, name), idx, msg=f"`{name} value {idx} expected, but was {getattr(config, name)}"
)
except NotImplementedError:
# Some models might not be able to implement setters for common_properties
# In that case, a NotImplementedError is raised
pass
# Test if config class can be called with Config(prop_name=..)
for idx, name in enumerate(common_properties):
try:
config = self.config_class(**{name: idx})
self.parent.assertEqual(
getattr(config, name), idx, msg=f"`{name} value {idx} expected, but was {getattr(config, name)}"
)
except NotImplementedError:
# Some models might not be able to implement setters for common_properties
# In that case, a NotImplementedError is raised
pass
def create_and_test_config_to_json_string(self): def create_and_test_config_to_json_string(self):
config = self.config_class(**self.inputs_dict) config = self.config_class(**self.inputs_dict)