Update model configs - Allow setters for common properties (#13026)
* refactor GPT Config to allow dyn. properties
* make attribute_map a class attribute
* remove old code
* update unit test to test config: Add test for common properties setter
* update unit test to test config: Add test for common properties passed as parameters to __init__
* update to black code format
* Allow that setters are not defined for certain config classes
* update config classes to implement attribute_map
* bugfix lxmert config - id2labels was not defined when num_labels was set
* update broken configs - add attribute_maps
* update bart config
* update black codestyle
* update documentation on common config attributes
* update GPTJ config to new attribute map
* update docs on common attributes
* gptj config: add max_position_embeddings
* gptj config: format with black
* update speech to text 2 config
* format doc file to max_len 119
* update config template
This commit is contained in:
@@ -17,6 +17,11 @@ The base class :class:`~transformers.PretrainedConfig` implements the common met
|
||||
either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded
|
||||
from HuggingFace's AWS S3 repository).
|
||||
|
||||
Each derived config class implements model-specific attributes. Common attributes present in all config classes are:
|
||||
:obj:`hidden_size`, :obj:`num_attention_heads`, and :obj:`num_hidden_layers`. Text models further implement:
|
||||
:obj:`vocab_size`.
|
||||
|
||||
|
||||
|
||||
PretrainedConfig
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
@@ -57,6 +57,8 @@ class PretrainedConfig(PushToHubMixin):
|
||||
:class:`~RagConfig`.
|
||||
- **keys_to_ignore_at_inference** (:obj:`List[str]`) -- A list of keys to ignore by default when looking at
|
||||
dictionary outputs of the model during inference.
|
||||
- **attribute_map** (:obj:`Dict[str, str]`) -- A dict that maps model-specific attribute names to the
|
||||
standardized naming of attributes.
|
||||
|
||||
Common attributes (present in all subclasses)
|
||||
|
||||
@@ -218,6 +220,17 @@ class PretrainedConfig(PushToHubMixin):
|
||||
"""
|
||||
model_type: str = ""
|
||||
is_composition: bool = False
|
||||
attribute_map: Dict[str, str] = {}
|
||||
|
||||
def __setattr__(self, key, value):
    """
    Assign ``value`` to ``key``, translating ``key`` through ``attribute_map`` first.

    When ``key`` is listed in ``attribute_map``, the value is stored under the mapped name instead; any other
    name is stored unchanged.
    """
    # The map is fetched through the parent class's ``__getattribute__``.
    aliases = super().__getattribute__("attribute_map")
    target = aliases[key] if key in aliases else key
    super().__setattr__(target, value)
|
||||
|
||||
def __getattribute__(self, key):
    """
    Return the attribute ``key``, translating ``key`` through ``attribute_map`` first.

    ``attribute_map`` itself is always returned directly and is never looked up as an alias.
    """
    if key == "attribute_map":
        return super().__getattribute__(key)

    aliases = super().__getattribute__("attribute_map")
    if key in aliases:
        # Redirect the standardized name to the name this config actually stores.
        key = aliases[key]
    return super().__getattribute__(key)
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
# Attributes with defaults
|
||||
@@ -350,7 +363,7 @@ class PretrainedConfig(PushToHubMixin):
|
||||
|
||||
@num_labels.setter
def num_labels(self, num_labels: int):
    """
    Set the number of labels, regenerating the default label mappings when they do not match.

    ``id2label`` is rebuilt as ``{i: "LABEL_i"}`` for ``i`` in ``range(num_labels)``, and ``label2id`` as its
    inverse, whenever ``id2label`` is missing, is ``None``, or has a length different from ``num_labels``.
    Otherwise the existing mappings are left untouched.

    Args:
        num_labels (:obj:`int`): The number of labels the mappings should cover.
    """
    # ``getattr`` with a default tolerates ``id2label`` not being defined yet; reading ``self.id2label``
    # directly would raise ``AttributeError`` in that case.
    current = getattr(self, "id2label", None)
    if current is None or len(current) != num_labels:
        self.id2label = {i: f"LABEL_{i}" for i in range(num_labels)}
        self.label2id = dict(zip(self.id2label.values(), self.id2label.keys()))
|
||||
|
||||
|
||||
@@ -109,6 +109,7 @@ class BartConfig(PretrainedConfig):
|
||||
"""
|
||||
model_type = "bart"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -141,17 +142,6 @@ class BartConfig(PretrainedConfig):
|
||||
forced_eos_token_id=2,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(
|
||||
num_labels=num_labels,
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
forced_eos_token_id=forced_eos_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.d_model = d_model
|
||||
@@ -174,6 +164,17 @@ class BartConfig(PretrainedConfig):
|
||||
self.gradient_checkpointing = gradient_checkpointing
|
||||
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
|
||||
|
||||
super().__init__(
|
||||
num_labels=num_labels,
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
forced_eos_token_id=forced_eos_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# ensure backward compatibility for BART CNN models
|
||||
if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
|
||||
self.forced_bos_token_id = self.bos_token_id
|
||||
@@ -182,14 +183,6 @@ class BartConfig(PretrainedConfig):
|
||||
"The config can simply be saved and uploaded again to be fixed."
|
||||
)
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
return self.encoder_attention_heads
|
||||
|
||||
@property
|
||||
def hidden_size(self) -> int:
|
||||
return self.d_model
|
||||
|
||||
|
||||
class BartOnnxConfig(OnnxConfigWithPast):
|
||||
@property
|
||||
|
||||
@@ -112,6 +112,11 @@ class BigBirdPegasusConfig(PretrainedConfig):
|
||||
"""
|
||||
model_type = "bigbird_pegasus"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {
|
||||
"num_attention_heads": "encoder_attention_heads",
|
||||
"hidden_size": "d_model",
|
||||
"attention_probs_dropout_prob": "attention_dropout",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -146,15 +151,6 @@ class BigBirdPegasusConfig(PretrainedConfig):
|
||||
use_bias=False,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.d_model = d_model
|
||||
@@ -183,14 +179,11 @@ class BigBirdPegasusConfig(PretrainedConfig):
|
||||
self.num_random_blocks = num_random_blocks
|
||||
self.use_bias = use_bias
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
return self.encoder_attention_heads
|
||||
|
||||
@property
|
||||
def hidden_size(self) -> int:
|
||||
return self.d_model
|
||||
|
||||
@property
|
||||
def attention_probs_dropout_prob(self) -> float:
|
||||
return self.attention_dropout
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@@ -103,6 +103,7 @@ class BlenderbotConfig(PretrainedConfig):
|
||||
"""
|
||||
model_type = "blenderbot"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -135,17 +136,6 @@ class BlenderbotConfig(PretrainedConfig):
|
||||
forced_eos_token_id=2,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
encoder_no_repeat_ngram_size=encoder_no_repeat_ngram_size,
|
||||
forced_eos_token_id=forced_eos_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.d_model = d_model
|
||||
@@ -168,10 +158,13 @@ class BlenderbotConfig(PretrainedConfig):
|
||||
self.gradient_checkpointing = gradient_checkpointing
|
||||
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
return self.encoder_attention_heads
|
||||
|
||||
@property
|
||||
def hidden_size(self) -> int:
|
||||
return self.d_model
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
encoder_no_repeat_ngram_size=encoder_no_repeat_ngram_size,
|
||||
forced_eos_token_id=forced_eos_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@@ -103,6 +103,7 @@ class BlenderbotSmallConfig(PretrainedConfig):
|
||||
"""
|
||||
model_type = "blenderbot-small"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -134,16 +135,6 @@ class BlenderbotSmallConfig(PretrainedConfig):
|
||||
forced_eos_token_id=2,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
forced_eos_token_id=forced_eos_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.d_model = d_model
|
||||
@@ -166,10 +157,12 @@ class BlenderbotSmallConfig(PretrainedConfig):
|
||||
self.gradient_checkpointing = gradient_checkpointing
|
||||
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
return self.encoder_attention_heads
|
||||
|
||||
@property
|
||||
def hidden_size(self) -> int:
|
||||
return self.d_model
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
forced_eos_token_id=forced_eos_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@@ -81,6 +81,12 @@ class CTRLConfig(PretrainedConfig):
|
||||
|
||||
model_type = "ctrl"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {
|
||||
"max_position_embeddings": "n_positions",
|
||||
"hidden_size": "n_embd",
|
||||
"num_attention_heads": "n_head",
|
||||
"num_hidden_layers": "n_layer",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -104,7 +110,6 @@ class CTRLConfig(PretrainedConfig):
|
||||
use_cache=True,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.vocab_size = vocab_size
|
||||
self.n_ctx = n_ctx
|
||||
self.n_positions = n_positions
|
||||
@@ -125,18 +130,4 @@ class CTRLConfig(PretrainedConfig):
|
||||
self.summary_proj_to_labels = summary_proj_to_labels
|
||||
self.use_cache = use_cache
|
||||
|
||||
@property
|
||||
def max_position_embeddings(self):
|
||||
return self.n_positions
|
||||
|
||||
@property
|
||||
def hidden_size(self):
|
||||
return self.n_embd
|
||||
|
||||
@property
|
||||
def num_attention_heads(self):
|
||||
return self.n_head
|
||||
|
||||
@property
|
||||
def num_hidden_layers(self):
|
||||
return self.n_layer
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@@ -117,6 +117,10 @@ class DetrConfig(PretrainedConfig):
|
||||
"""
|
||||
model_type = "detr"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {
|
||||
"hidden_size": "d_model",
|
||||
"num_attention_heads": "encoder_attention_heads",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -154,8 +158,6 @@ class DetrConfig(PretrainedConfig):
|
||||
eos_coefficient=0.1,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
|
||||
|
||||
self.num_queries = num_queries
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.d_model = d_model
|
||||
@@ -189,6 +191,7 @@ class DetrConfig(PretrainedConfig):
|
||||
self.bbox_loss_coefficient = bbox_loss_coefficient
|
||||
self.giou_loss_coefficient = giou_loss_coefficient
|
||||
self.eos_coefficient = eos_coefficient
|
||||
super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
|
||||
@@ -93,6 +93,11 @@ class DistilBertConfig(PretrainedConfig):
|
||||
>>> configuration = model.config
|
||||
"""
|
||||
model_type = "distilbert"
|
||||
attribute_map = {
|
||||
"hidden_size": "dim",
|
||||
"num_attention_heads": "n_heads",
|
||||
"num_hidden_layers": "n_layers",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -112,7 +117,6 @@ class DistilBertConfig(PretrainedConfig):
|
||||
pad_token_id=0,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(**kwargs, pad_token_id=pad_token_id)
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.sinusoidal_pos_embds = sinusoidal_pos_embds
|
||||
@@ -126,18 +130,7 @@ class DistilBertConfig(PretrainedConfig):
|
||||
self.initializer_range = initializer_range
|
||||
self.qa_dropout = qa_dropout
|
||||
self.seq_classif_dropout = seq_classif_dropout
|
||||
|
||||
@property
|
||||
def hidden_size(self):
|
||||
return self.dim
|
||||
|
||||
@property
|
||||
def num_attention_heads(self):
|
||||
return self.n_heads
|
||||
|
||||
@property
|
||||
def num_hidden_layers(self):
|
||||
return self.n_layers
|
||||
super().__init__(**kwargs, pad_token_id=pad_token_id)
|
||||
|
||||
|
||||
class DistilBertOnnxConfig(OnnxConfig):
|
||||
|
||||
@@ -136,6 +136,6 @@ class FlaubertConfig(XLMConfig):
|
||||
|
||||
def __init__(self, layerdrop=0.0, pre_norm=False, pad_token_id=2, bos_token_id=0, **kwargs):
|
||||
"""Constructs FlaubertConfig."""
|
||||
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
|
||||
self.layerdrop = layerdrop
|
||||
self.pre_norm = pre_norm
|
||||
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
|
||||
|
||||
@@ -124,6 +124,7 @@ class FSMTConfig(PretrainedConfig):
|
||||
|
||||
"""
|
||||
model_type = "fsmt"
|
||||
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
|
||||
|
||||
# update the defaults from config file
|
||||
def __init__(
|
||||
@@ -161,18 +162,6 @@ class FSMTConfig(PretrainedConfig):
|
||||
forced_eos_token_id=2,
|
||||
**common_kwargs
|
||||
):
|
||||
if "hidden_size" in common_kwargs:
|
||||
raise ValueError("hidden size is called d_model")
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
forced_eos_token_id=forced_eos_token_id,
|
||||
**common_kwargs,
|
||||
)
|
||||
self.langs = langs
|
||||
self.src_vocab_size = src_vocab_size
|
||||
self.tgt_vocab_size = tgt_vocab_size
|
||||
@@ -196,6 +185,8 @@ class FSMTConfig(PretrainedConfig):
|
||||
self.early_stopping = early_stopping
|
||||
|
||||
self.decoder = DecoderConfig(vocab_size=tgt_vocab_size, bos_token_id=eos_token_id)
|
||||
if "decoder" in common_kwargs:
|
||||
del common_kwargs["decoder"]
|
||||
|
||||
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
|
||||
|
||||
@@ -205,14 +196,16 @@ class FSMTConfig(PretrainedConfig):
|
||||
self.dropout = dropout
|
||||
|
||||
self.use_cache = use_cache
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
return self.encoder_attention_heads
|
||||
|
||||
@property
|
||||
def hidden_size(self) -> int:
|
||||
return self.d_model
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
forced_eos_token_id=forced_eos_token_id,
|
||||
**common_kwargs,
|
||||
)
|
||||
|
||||
def to_dict(self):
|
||||
"""
|
||||
|
||||
@@ -102,6 +102,10 @@ class FunnelConfig(PretrainedConfig):
|
||||
Whether or not to apply the pooling only to the query or to query, key and values for the attention layers.
|
||||
"""
|
||||
model_type = "funnel"
|
||||
attribute_map = {
|
||||
"hidden_size": "d_model",
|
||||
"num_attention_heads": "n_head",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -129,8 +133,6 @@ class FunnelConfig(PretrainedConfig):
|
||||
pool_q_only=True,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.block_sizes = block_sizes
|
||||
self.block_repeats = [1] * len(block_sizes) if block_repeats is None else block_repeats
|
||||
@@ -165,18 +167,22 @@ class FunnelConfig(PretrainedConfig):
|
||||
self.truncate_seq = truncate_seq
|
||||
self.pool_q_only = pool_q_only
|
||||
|
||||
@property
|
||||
def hidden_size(self):
|
||||
return self.d_model
|
||||
|
||||
@property
|
||||
def num_attention_heads(self):
|
||||
return self.n_head
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@property
def num_hidden_layers(self):
    """Total number of layers, computed as the sum of the entries of ``block_sizes``."""
    sizes = self.block_sizes
    return sum(sizes)


@num_hidden_layers.setter
def num_hidden_layers(self, value):
    """Reject assignment: the value is derived from ``block_sizes`` and cannot be set directly."""
    message = "This model does not support the setting of `num_hidden_layers`. Please set `block_sizes`."
    raise NotImplementedError(message)
|
||||
|
||||
@property
def num_blocks(self):
    """Number of blocks, i.e. the number of entries in ``block_sizes``."""
    sizes = self.block_sizes
    return len(sizes)


@num_blocks.setter
def num_blocks(self, value):
    """Reject assignment: the value is derived from ``block_sizes`` and cannot be set directly."""
    message = "This model does not support the setting of `num_blocks`. Please set `block_sizes`."
    raise NotImplementedError(message)
|
||||
|
||||
@@ -130,6 +130,12 @@ class GPT2Config(PretrainedConfig):
|
||||
|
||||
model_type = "gpt2"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {
|
||||
"hidden_size": "n_embd",
|
||||
"max_position_embeddings": "n_positions",
|
||||
"num_attention_heads": "n_head",
|
||||
"num_hidden_layers": "n_layer",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -158,8 +164,6 @@ class GPT2Config(PretrainedConfig):
|
||||
eos_token_id=50256,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.n_ctx = n_ctx
|
||||
self.n_positions = n_positions
|
||||
@@ -185,21 +189,7 @@ class GPT2Config(PretrainedConfig):
|
||||
self.bos_token_id = bos_token_id
|
||||
self.eos_token_id = eos_token_id
|
||||
|
||||
@property
|
||||
def max_position_embeddings(self):
|
||||
return self.n_positions
|
||||
|
||||
@property
|
||||
def hidden_size(self):
|
||||
return self.n_embd
|
||||
|
||||
@property
|
||||
def num_attention_heads(self):
|
||||
return self.n_head
|
||||
|
||||
@property
|
||||
def num_hidden_layers(self):
|
||||
return self.n_layer
|
||||
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
|
||||
|
||||
|
||||
class GPT2OnnxConfig(OnnxConfigWithPast):
|
||||
|
||||
@@ -96,6 +96,7 @@ class GPTNeoConfig(PretrainedConfig):
|
||||
>>> configuration = model.config
|
||||
"""
|
||||
model_type = "gpt_neo"
|
||||
attribute_map = {"num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -124,8 +125,6 @@ class GPTNeoConfig(PretrainedConfig):
|
||||
eos_token_id=50256,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
@@ -163,6 +162,8 @@ class GPTNeoConfig(PretrainedConfig):
|
||||
"Please verify the value of `config.attention_types` argument."
|
||||
)
|
||||
|
||||
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
|
||||
|
||||
@staticmethod
|
||||
def expand_attention_types_params(attention_types):
|
||||
attentions = []
|
||||
@@ -171,14 +172,6 @@ class GPTNeoConfig(PretrainedConfig):
|
||||
attentions.extend(item[0])
|
||||
return attentions
|
||||
|
||||
@property
|
||||
def num_attention_heads(self):
|
||||
return self.num_heads
|
||||
|
||||
@property
|
||||
def num_hidden_layers(self):
|
||||
return self.num_layers
|
||||
|
||||
|
||||
def custom_unfold(input, dimension, size, step):
|
||||
"""Custom torch.Tensor.unfold implementation to enable the export to ONNX."""
|
||||
|
||||
@@ -87,6 +87,12 @@ class GPTJConfig(PretrainedConfig):
|
||||
>>> configuration = model.config
|
||||
"""
|
||||
model_type = "gptj"
|
||||
attribute_map = {
|
||||
"max_position_embeddings": "n_positions",
|
||||
"hidden_size": "n_embd",
|
||||
"num_attention_heads": "n_head",
|
||||
"num_hidden_layers": "n_layer",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -111,8 +117,6 @@ class GPTJConfig(PretrainedConfig):
|
||||
eos_token_id=50256,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.n_ctx = n_ctx
|
||||
self.n_positions = n_positions
|
||||
@@ -134,18 +138,4 @@ class GPTJConfig(PretrainedConfig):
|
||||
self.bos_token_id = bos_token_id
|
||||
self.eos_token_id = eos_token_id
|
||||
|
||||
@property
|
||||
def max_position_embeddings(self):
|
||||
return self.n_positions
|
||||
|
||||
@property
|
||||
def hidden_size(self):
|
||||
return self.n_embd
|
||||
|
||||
@property
|
||||
def num_attention_heads(self):
|
||||
return self.n_head
|
||||
|
||||
@property
|
||||
def num_hidden_layers(self):
|
||||
return self.n_layer
|
||||
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
|
||||
|
||||
@@ -99,6 +99,12 @@ class LEDConfig(PretrainedConfig):
|
||||
>>> configuration = model.config
|
||||
"""
|
||||
model_type = "led"
|
||||
attribute_map = {
|
||||
"num_attention_heads": "encoder_attention_heads",
|
||||
"hidden_size": "d_model",
|
||||
"attention_probs_dropout_prob": "attention_dropout",
|
||||
"initializer_range": "init_std",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -130,15 +136,6 @@ class LEDConfig(PretrainedConfig):
|
||||
attention_window: Union[List[int], int] = 512,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.max_encoder_position_embeddings = max_encoder_position_embeddings
|
||||
self.max_decoder_position_embeddings = max_decoder_position_embeddings
|
||||
@@ -162,18 +159,11 @@ class LEDConfig(PretrainedConfig):
|
||||
self.attention_window = attention_window
|
||||
self.gradient_checkpointing = gradient_checkpointing
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
return self.encoder_attention_heads
|
||||
|
||||
@property
|
||||
def hidden_size(self) -> int:
|
||||
return self.d_model
|
||||
|
||||
@property
|
||||
def attention_probs_dropout_prob(self) -> float:
|
||||
return self.attention_dropout
|
||||
|
||||
@property
|
||||
def initializer_range(self) -> float:
|
||||
return self.init_std
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@@ -113,13 +113,13 @@ class LxmertConfig(PretrainedConfig):
|
||||
"""
|
||||
|
||||
model_type = "lxmert"
|
||||
attribute_map = {}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=30522,
|
||||
hidden_size=768,
|
||||
num_attention_heads=12,
|
||||
num_labels=2,
|
||||
num_qa_labels=9500,
|
||||
num_object_labels=1600,
|
||||
num_attr_labels=400,
|
||||
@@ -149,11 +149,9 @@ class LxmertConfig(PretrainedConfig):
|
||||
output_hidden_states=False,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.vocab_size = vocab_size
|
||||
self.hidden_size = hidden_size
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_labels = num_labels
|
||||
self.hidden_act = hidden_act
|
||||
self.intermediate_size = intermediate_size
|
||||
self.hidden_dropout_prob = hidden_dropout_prob
|
||||
@@ -179,5 +177,6 @@ class LxmertConfig(PretrainedConfig):
|
||||
self.visual_attr_loss = visual_attr_loss
|
||||
self.visual_feat_loss = visual_feat_loss
|
||||
self.output_hidden_states = output_hidden_states
|
||||
self.output_attentions = self.output_attentions
|
||||
self.output_attentions = output_attentions
|
||||
self.num_hidden_layers = {"vision": r_layers, "cross_encoder": x_layers, "language": l_layers}
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@@ -97,6 +97,7 @@ class M2M100Config(PretrainedConfig):
|
||||
"""
|
||||
model_type = "m2m_100"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -126,15 +127,6 @@ class M2M100Config(PretrainedConfig):
|
||||
eos_token_id=2,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.d_model = d_model
|
||||
@@ -156,10 +148,11 @@ class M2M100Config(PretrainedConfig):
|
||||
self.gradient_checkpointing = gradient_checkpointing
|
||||
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
return self.encoder_attention_heads
|
||||
|
||||
@property
|
||||
def hidden_size(self) -> int:
|
||||
return self.d_model
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@@ -103,6 +103,7 @@ class MarianConfig(PretrainedConfig):
|
||||
"""
|
||||
model_type = "marian"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -133,15 +134,6 @@ class MarianConfig(PretrainedConfig):
|
||||
forced_eos_token_id=0,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
forced_eos_token_id=forced_eos_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.d_model = d_model
|
||||
@@ -163,11 +155,11 @@ class MarianConfig(PretrainedConfig):
|
||||
self.num_hidden_layers = encoder_layers
|
||||
self.gradient_checkpointing = gradient_checkpointing
|
||||
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
return self.encoder_attention_heads
|
||||
|
||||
@property
|
||||
def hidden_size(self) -> int:
|
||||
return self.d_model
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
forced_eos_token_id=forced_eos_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@@ -107,6 +107,7 @@ class MBartConfig(PretrainedConfig):
|
||||
"""
|
||||
model_type = "mbart"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -137,15 +138,6 @@ class MBartConfig(PretrainedConfig):
|
||||
forced_eos_token_id=2,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
forced_eos_token_id=forced_eos_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.d_model = d_model
|
||||
@@ -167,14 +159,14 @@ class MBartConfig(PretrainedConfig):
|
||||
self.num_hidden_layers = encoder_layers
|
||||
self.gradient_checkpointing = gradient_checkpointing
|
||||
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
return self.encoder_attention_heads
|
||||
|
||||
@property
|
||||
def hidden_size(self) -> int:
|
||||
return self.d_model
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
forced_eos_token_id=forced_eos_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
class MBartOnnxConfig(OnnxConfigWithPast):
|
||||
|
||||
@@ -115,6 +115,12 @@ class OpenAIGPTConfig(PretrainedConfig):
|
||||
"""
|
||||
|
||||
model_type = "openai-gpt"
|
||||
attribute_map = {
|
||||
"max_position_embeddings": "n_positions",
|
||||
"hidden_size": "n_embd",
|
||||
"num_attention_heads": "n_head",
|
||||
"num_hidden_layers": "n_layer",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -138,8 +144,6 @@ class OpenAIGPTConfig(PretrainedConfig):
|
||||
summary_first_dropout=0.1,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.n_ctx = n_ctx
|
||||
self.n_positions = n_positions
|
||||
@@ -158,19 +162,4 @@ class OpenAIGPTConfig(PretrainedConfig):
|
||||
self.summary_activation = summary_activation
|
||||
self.summary_first_dropout = summary_first_dropout
|
||||
self.summary_proj_to_labels = summary_proj_to_labels
|
||||
|
||||
@property
|
||||
def max_position_embeddings(self):
|
||||
return self.n_positions
|
||||
|
||||
@property
|
||||
def hidden_size(self):
|
||||
return self.n_embd
|
||||
|
||||
@property
|
||||
def num_attention_heads(self):
|
||||
return self.n_head
|
||||
|
||||
@property
|
||||
def num_hidden_layers(self):
|
||||
return self.n_layer
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@@ -103,6 +103,7 @@ class PegasusConfig(PretrainedConfig):
|
||||
"""
|
||||
model_type = "pegasus"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -133,15 +134,6 @@ class PegasusConfig(PretrainedConfig):
|
||||
forced_eos_token_id=1,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
forced_eos_token_id=forced_eos_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.d_model = d_model
|
||||
@@ -163,6 +155,14 @@ class PegasusConfig(PretrainedConfig):
|
||||
self.num_hidden_layers = encoder_layers
|
||||
self.gradient_checkpointing = gradient_checkpointing
|
||||
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
forced_eos_token_id=forced_eos_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
|
||||
@@ -97,6 +97,9 @@ class ProphetNetConfig(PretrainedConfig):
|
||||
"""
|
||||
model_type = "prophetnet"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {
|
||||
"num_attention_heads": "num_encoder_attention_heads",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -129,15 +132,6 @@ class ProphetNetConfig(PretrainedConfig):
|
||||
eos_token_id=2,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
add_cross_attention=add_cross_attention,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
self.vocab_size = vocab_size
|
||||
self.hidden_size = hidden_size
|
||||
self.encoder_ffn_dim = encoder_ffn_dim
|
||||
@@ -167,10 +161,22 @@ class ProphetNetConfig(PretrainedConfig):
|
||||
# 4 Training Args (should be removed soon)
|
||||
self.gradient_checkpointing = gradient_checkpointing
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
return self.num_encoder_attention_heads
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
add_cross_attention=add_cross_attention,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def num_hidden_layers(self) -> int:
|
||||
return self.num_encoder_layers + self.num_decoder_layers
|
||||
|
||||
@num_hidden_layers.setter
|
||||
def num_hidden_layers(self, value):
|
||||
raise NotImplementedError(
|
||||
"This model does not support the setting of `num_hidden_layers`. Please set `num_encoder_layers` and `num_decoder_layers`."
|
||||
)
|
||||
|
||||
@@ -158,6 +158,7 @@ class ReformerConfig(PretrainedConfig):
|
||||
"""
|
||||
model_type = "reformer"
|
||||
keys_to_ignore_at_inference = ["past_buckets_states"]
|
||||
attribute_map = {}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -196,14 +197,6 @@ class ReformerConfig(PretrainedConfig):
|
||||
classifier_dropout=None,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_decoder=is_decoder,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.hash_seed = hash_seed
|
||||
self.vocab_size = vocab_size
|
||||
self.attention_head_size = attention_head_size
|
||||
@@ -234,3 +227,10 @@ class ReformerConfig(PretrainedConfig):
|
||||
self.attn_layers = attn_layers
|
||||
self.use_cache = use_cache
|
||||
self.classifier_dropout = classifier_dropout
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_decoder=is_decoder,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@@ -110,6 +110,7 @@ class Speech2TextConfig(PretrainedConfig):
|
||||
"""
|
||||
model_type = "speech_to_text"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -146,15 +147,6 @@ class Speech2TextConfig(PretrainedConfig):
|
||||
input_channels=1,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.d_model = d_model
|
||||
self.encoder_ffn_dim = encoder_ffn_dim
|
||||
@@ -191,10 +183,11 @@ class Speech2TextConfig(PretrainedConfig):
|
||||
f"`config.num_conv_layers = {self.num_conv_layers}`."
|
||||
)
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
return self.encoder_attention_heads
|
||||
|
||||
@property
|
||||
def hidden_size(self) -> int:
|
||||
return self.d_model
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@@ -89,6 +89,7 @@ class Speech2Text2Config(PretrainedConfig):
|
||||
"""
|
||||
model_type = "speech_to_text_2"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {"num_attention_heads": "decoder_attention_heads", "hidden_size": "d_model"}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -115,14 +116,6 @@ class Speech2Text2Config(PretrainedConfig):
|
||||
max_target_positions=1024,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.d_model = d_model
|
||||
self.decoder_ffn_dim = decoder_ffn_dim
|
||||
@@ -142,10 +135,10 @@ class Speech2Text2Config(PretrainedConfig):
|
||||
self.max_source_positions = max_source_positions
|
||||
self.max_target_positions = max_target_positions
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
return self.decoder_attention_heads
|
||||
|
||||
@property
|
||||
def hidden_size(self) -> int:
|
||||
return self.d_model
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@@ -82,6 +82,7 @@ class T5Config(PretrainedConfig):
|
||||
"""
|
||||
model_type = "t5"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {"hidden_size": "d_model", "num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -104,12 +105,6 @@ class T5Config(PretrainedConfig):
|
||||
gradient_checkpointing=False,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
**kwargs,
|
||||
)
|
||||
self.vocab_size = vocab_size
|
||||
self.d_model = d_model
|
||||
self.d_kv = d_kv
|
||||
@@ -126,18 +121,12 @@ class T5Config(PretrainedConfig):
|
||||
self.feed_forward_proj = feed_forward_proj
|
||||
self.use_cache = use_cache
|
||||
self.gradient_checkpointing = gradient_checkpointing
|
||||
|
||||
@property
|
||||
def hidden_size(self):
|
||||
return self.d_model
|
||||
|
||||
@property
|
||||
def num_attention_heads(self):
|
||||
return self.num_heads
|
||||
|
||||
@property
|
||||
def num_hidden_layers(self):
|
||||
return self.num_layers
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
class T5OnnxConfig(OnnxConfigWithPast):
|
||||
|
||||
@@ -106,6 +106,12 @@ class TransfoXLConfig(PretrainedConfig):
|
||||
|
||||
model_type = "transfo-xl"
|
||||
keys_to_ignore_at_inference = ["mems"]
|
||||
attribute_map = {
|
||||
"n_token": "vocab_size",
|
||||
"hidden_size": "d_model",
|
||||
"num_attention_heads": "n_head",
|
||||
"num_hidden_layers": "n_layer",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -137,7 +143,6 @@ class TransfoXLConfig(PretrainedConfig):
|
||||
eos_token_id=0,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(eos_token_id=eos_token_id, **kwargs)
|
||||
self.vocab_size = vocab_size
|
||||
self.cutoffs = []
|
||||
self.cutoffs.extend(cutoffs)
|
||||
@@ -167,6 +172,7 @@ class TransfoXLConfig(PretrainedConfig):
|
||||
self.proj_init_std = proj_init_std
|
||||
self.init_std = init_std
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
super().__init__(eos_token_id=eos_token_id, **kwargs)
|
||||
|
||||
@property
|
||||
def max_position_embeddings(self):
|
||||
@@ -174,22 +180,9 @@ class TransfoXLConfig(PretrainedConfig):
|
||||
logger.info(f"The model {self.model_type} is one of the few models that has no sequence length limit.")
|
||||
return -1
|
||||
|
||||
@property
|
||||
def n_token(self): # Backward compatibility
|
||||
return self.vocab_size
|
||||
|
||||
@n_token.setter
|
||||
def n_token(self, value): # Backward compatibility
|
||||
self.vocab_size = value
|
||||
|
||||
@property
|
||||
def hidden_size(self):
|
||||
return self.d_model
|
||||
|
||||
@property
|
||||
def num_attention_heads(self):
|
||||
return self.n_head
|
||||
|
||||
@property
|
||||
def num_hidden_layers(self):
|
||||
return self.n_layer
|
||||
@max_position_embeddings.setter
|
||||
def max_position_embeddings(self, value):
|
||||
# Message copied from Transformer-XL documentation
|
||||
raise NotImplementedError(
|
||||
f"The model {self.model_type} is one of the few models that has no sequence length limit."
|
||||
)
|
||||
|
||||
@@ -146,6 +146,12 @@ class XLMConfig(PretrainedConfig):
|
||||
"""
|
||||
|
||||
model_type = "xlm"
|
||||
attribute_map = {
|
||||
"hidden_size": "emb_dim",
|
||||
"num_attention_heads": "n_heads",
|
||||
"num_hidden_layers": "n_layers",
|
||||
"n_words": "vocab_size", # For backward compatibility
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -185,7 +191,6 @@ class XLMConfig(PretrainedConfig):
|
||||
**kwargs
|
||||
):
|
||||
"""Constructs XLMConfig."""
|
||||
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
|
||||
self.vocab_size = vocab_size
|
||||
self.emb_dim = emb_dim
|
||||
self.n_layers = n_layers
|
||||
@@ -221,22 +226,4 @@ class XLMConfig(PretrainedConfig):
|
||||
if "n_words" in kwargs:
|
||||
self.n_words = kwargs["n_words"]
|
||||
|
||||
@property
|
||||
def n_words(self): # For backward compatibility
|
||||
return self.vocab_size
|
||||
|
||||
@n_words.setter
|
||||
def n_words(self, value): # For backward compatibility
|
||||
self.vocab_size = value
|
||||
|
||||
@property
|
||||
def hidden_size(self):
|
||||
return self.emb_dim
|
||||
|
||||
@property
|
||||
def num_attention_heads(self):
|
||||
return self.n_heads
|
||||
|
||||
@property
|
||||
def num_hidden_layers(self):
|
||||
return self.n_layers
|
||||
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
|
||||
|
||||
@@ -137,6 +137,12 @@ class XLNetConfig(PretrainedConfig):
|
||||
|
||||
model_type = "xlnet"
|
||||
keys_to_ignore_at_inference = ["mems"]
|
||||
attribute_map = {
|
||||
"n_token": "vocab_size", # Backward compatibility
|
||||
"hidden_size": "d_model",
|
||||
"num_attention_heads": "n_head",
|
||||
"num_hidden_layers": "n_layer",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -170,7 +176,6 @@ class XLNetConfig(PretrainedConfig):
|
||||
**kwargs
|
||||
):
|
||||
"""Constructs XLNetConfig."""
|
||||
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
|
||||
self.vocab_size = vocab_size
|
||||
self.d_model = d_model
|
||||
self.n_layer = n_layer
|
||||
@@ -216,27 +221,16 @@ class XLNetConfig(PretrainedConfig):
|
||||
|
||||
self.use_mems_eval = use_mems_eval
|
||||
self.use_mems_train = use_mems_train
|
||||
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
|
||||
|
||||
@property
|
||||
def max_position_embeddings(self):
|
||||
logger.info(f"The model {self.model_type} is one of the few models that has no sequence length limit.")
|
||||
return -1
|
||||
|
||||
@property
|
||||
def n_token(self): # Backward compatibility
|
||||
return self.vocab_size
|
||||
|
||||
@n_token.setter
|
||||
def n_token(self, value): # Backward compatibility
|
||||
self.vocab_size = value
|
||||
|
||||
@property
|
||||
def hidden_size(self):
|
||||
return self.d_model
|
||||
|
||||
@property
|
||||
def num_attention_heads(self):
|
||||
return self.n_head
|
||||
|
||||
@property
|
||||
def num_hidden_layers(self):
|
||||
return self.n_layer
|
||||
@max_position_embeddings.setter
|
||||
def max_position_embeddings(self, value):
|
||||
# Message copied from Transformer-XL documentation
|
||||
raise NotImplementedError(
|
||||
f"The model {self.model_type} is one of the few models that has no sequence length limit."
|
||||
)
|
||||
|
||||
@@ -137,6 +137,15 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):
|
||||
{% else -%}
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
{% endif -%}
|
||||
|
||||
{% if cookiecutter.is_encoder_decoder_model == "False" %}
|
||||
{%- else %}
|
||||
attribute_map = {
|
||||
"num_attention_heads": "encoder_attention_heads",
|
||||
"hidden_size": "d_model"
|
||||
}
|
||||
|
||||
{%- endif %}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -184,18 +193,6 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):
|
||||
eos_token_id=2,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
{% if cookiecutter.is_encoder_decoder_model == "False" -%}
|
||||
{% else -%}
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
{% endif -%}
|
||||
**kwargs
|
||||
)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
{% if cookiecutter.is_encoder_decoder_model == "False" -%}
|
||||
@@ -232,14 +229,16 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):
|
||||
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
|
||||
|
||||
{% endif -%}
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
{% if cookiecutter.is_encoder_decoder_model == "False" -%}
|
||||
{% else -%}
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
{% endif -%}
|
||||
**kwargs
|
||||
)
|
||||
|
||||
{% if cookiecutter.is_encoder_decoder_model == "False" %}
|
||||
{%- else %}
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
return self.encoder_attention_heads
|
||||
|
||||
@property
|
||||
def hidden_size(self) -> int:
|
||||
return self.d_model
|
||||
{%- endif %}
|
||||
|
||||
@@ -34,11 +34,39 @@ class ConfigTester(object):
|
||||
|
||||
def create_and_test_config_common_properties(self):
|
||||
config = self.config_class(**self.inputs_dict)
|
||||
common_properties = ["hidden_size", "num_attention_heads", "num_hidden_layers"]
|
||||
|
||||
# Add common fields for text models
|
||||
if self.has_text_modality:
|
||||
self.parent.assertTrue(hasattr(config, "vocab_size"))
|
||||
self.parent.assertTrue(hasattr(config, "hidden_size"))
|
||||
self.parent.assertTrue(hasattr(config, "num_attention_heads"))
|
||||
self.parent.assertTrue(hasattr(config, "num_hidden_layers"))
|
||||
common_properties.extend(["vocab_size"])
|
||||
|
||||
# Test that config has the common properties as getters
|
||||
for prop in common_properties:
|
||||
self.parent.assertTrue(hasattr(config, prop), msg=f"`{prop}` does not exist")
|
||||
|
||||
# Test that config has the common properties as setter
|
||||
for idx, name in enumerate(common_properties):
|
||||
try:
|
||||
setattr(config, name, idx)
|
||||
self.parent.assertEqual(
|
||||
getattr(config, name), idx, msg=f"`{name} value {idx} expected, but was {getattr(config, name)}"
|
||||
)
|
||||
except NotImplementedError:
|
||||
# Some models might not be able to implement setters for common_properties
|
||||
# In that case, a NotImplementedError is raised
|
||||
pass
|
||||
|
||||
# Test if config class can be called with Config(prop_name=..)
|
||||
for idx, name in enumerate(common_properties):
|
||||
try:
|
||||
config = self.config_class(**{name: idx})
|
||||
self.parent.assertEqual(
|
||||
getattr(config, name), idx, msg=f"`{name} value {idx} expected, but was {getattr(config, name)}"
|
||||
)
|
||||
except NotImplementedError:
|
||||
# Some models might not be able to implement setters for common_properties
|
||||
# In that case, a NotImplementedError is raised
|
||||
pass
|
||||
|
||||
def create_and_test_config_to_json_string(self):
|
||||
config = self.config_class(**self.inputs_dict)
|
||||
|
||||
Reference in New Issue
Block a user