Updated Configurations

This commit is contained in:
Lysandre Debut
2020-01-12 21:53:19 +01:00
committed by Lysandre Debut
parent 2b566c182e
commit 632682726f
14 changed files with 528 additions and 319 deletions

View File

@@ -1,7 +1,7 @@
ALBERT ALBERT
---------------------------------------------------- ----------------------------------------------------
``AlbrtConfig`` ``AlbertConfig``
~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertConfig .. autoclass:: transformers.AlbertConfig

View File

@@ -31,9 +31,73 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class AlbertConfig(PretrainedConfig): class AlbertConfig(PretrainedConfig):
"""Configuration for `AlbertModel`. r"""
This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`.
It is used to instantiate an ALBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the ALBERT xxlarge architecture.
The default settings match the configuration of model `albert_xxlarge`. Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30000):
Vocabulary size of the ALBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
embedding_size (:obj:`int`, optional, defaults to 128):
Size of vocabulary embeddings.
hidden_size (:obj:`int`, optional, defaults to 4096):
Size of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_hidden_groups (:obj:`int`, optional, defaults to 1):
Number of groups for the hidden layers, parameters in the same group are shared.
num_attention_heads (:obj:`int`, optional, defaults to 64):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 16384):
The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
inner_group_num (:obj:`int`, optional, defaults to 1):
The number of inner repetition of attention and ffn.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something
large (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
Example::
# Initializing an ALBERT-xxlarge style configuration
albert_xxlarge_configuration = AlbertConfig()
# Initializing an ALBERT-base style configuration
albert_base_configuration = AlbertConfig(
hidden_size=768,
num_attention_heads=12,
intermediate_size=3072,
)
# Initializing a model from the ALBERT-base style configuration
model = AlbertModel(albert_base_configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -57,35 +121,6 @@ class AlbertConfig(PretrainedConfig):
layer_norm_eps=1e-12, layer_norm_eps=1e-12,
**kwargs **kwargs
): ):
"""Constructs AlbertConfig.
Args:
vocab_size: Vocabulary size of `inputs_ids` in `AlbertModel`.
embedding_size: size of voc embeddings.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_hidden_groups: Number of group for the hidden layers, parameters in
the same group are shared.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
inner_group_num: int, number of inner repetition of attention and ffn.
down_scale_factor: float, the scale to apply
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler.
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`AlbertModel`.
initializer_range: The stdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
super(AlbertConfig, self).__init__(**kwargs) super(AlbertConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size

View File

@@ -57,29 +57,13 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
class AutoConfig(object): class AutoConfig(object):
r""":class:`~transformers.AutoConfig` is a generic configuration class r"""
:class:`~transformers.AutoConfig` is a generic configuration class
that will be instantiated as one of the configuration classes of the library that will be instantiated as one of the configuration classes of the library
when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.
class method.
The `from_pretrained()` method take care of returning the correct model class instance The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance
using pattern matching on the `pretrained_model_name_or_path` string. using pattern matching on the `pretrained_model_name_or_path` string argument.
The base model class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `distilbert`: DistilBertConfig (DistilBERT model)
- contains `albert`: AlbertConfig (ALBERT model)
- contains `camembert`: CamembertConfig (CamemBERT model)
- contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model)
- contains `roberta`: RobertaConfig (RoBERTa model)
- contains `bert`: BertConfig (Bert model)
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
- contains `xlnet`: XLNetConfig (XLNet model)
- contains `xlm`: XLMConfig (XLM model)
- contains `ctrl` : CTRLConfig (CTRL model)
This class cannot be instantiated using `__init__()` (throw an error).
""" """
def __init__(self): def __init__(self):
@@ -94,6 +78,8 @@ class AutoConfig(object):
return DistilBertConfig(*args, **kwargs) return DistilBertConfig(*args, **kwargs)
elif "roberta" in model_type: elif "roberta" in model_type:
return RobertaConfig(*args, **kwargs) return RobertaConfig(*args, **kwargs)
elif "albert" in model_type:
return AlbertConfig(*args, **kwargs)
elif "bert" in model_type: elif "bert" in model_type:
return BertConfig(*args, **kwargs) return BertConfig(*args, **kwargs)
elif "openai-gpt" in model_type: elif "openai-gpt" in model_type:
@@ -108,8 +94,6 @@ class AutoConfig(object):
return XLMConfig(*args, **kwargs) return XLMConfig(*args, **kwargs)
elif "ctrl" in model_type: elif "ctrl" in model_type:
return CTRLConfig(*args, **kwargs) return CTRLConfig(*args, **kwargs)
elif "albert" in model_type:
return AlbertConfig(*args, **kwargs)
elif "camembert" in model_type: elif "camembert" in model_type:
return CamembertConfig(*args, **kwargs) return CamembertConfig(*args, **kwargs)
raise ValueError( raise ValueError(
@@ -120,59 +104,60 @@ class AutoConfig(object):
@classmethod @classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r""" Instantiate a one of the configuration classes of the library r""" Instantiates one of the configuration classes of the library
from a pre-trained model configuration. from a pre-trained model configuration.
The configuration class to instantiate is selected as the first pattern matching The configuration class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order): in the `pretrained_model_name_or_path` string (in the following order):
- contains `t5`: T5Config (T5 model) - contains `t5`: :class:`~transformers.T5Config` (T5 model)
- contains `distilbert`: DistilBertConfig (DistilBERT model) - contains `distilbert`: :class:`~transformers.DistilBertConfig` (DistilBERT model)
- contains `albert`: AlbertConfig (ALBERT model) - contains `albert`: :class:`~transformers.AlbertConfig` (ALBERT model)
- contains `camembert`: CamembertConfig (CamemBERT model) - contains `camembert`: :class:`~transformers.CamembertConfig` (CamemBERT model)
- contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model) - contains `xlm-roberta`: :class:`~transformers.XLMRobertaConfig` (XLM-RoBERTa model)
- contains `roberta`: RobertaConfig (RoBERTa model) - contains `roberta`: :class:`~transformers.RobertaConfig` (RoBERTa model)
- contains `bert`: BertConfig (Bert model) - contains `bert`: :class:`~transformers.BertConfig` (Bert model)
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) - contains `openai-gpt`: :class:`~transformers.OpenAIGPTConfig` (OpenAI GPT model)
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model) - contains `gpt2`: :class:`~transformers.GPT2Config` (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) - contains `transfo-xl`: :class:`~transformers.TransfoXLConfig` (Transformer-XL model)
- contains `xlnet`: XLNetConfig (XLNet model) - contains `xlnet`: :class:`~transformers.XLNetConfig` (XLNet model)
- contains `xlm`: XLMConfig (XLM model) - contains `xlm`: :class:`~transformers.XLMConfig` (XLM model)
- contains `ctrl` : CTRLConfig (CTRL model) - contains `ctrl` : :class:`~transformers.CTRLConfig` (CTRL model)
Params:
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
cache_dir: (`optional`) string: Args:
pretrained_model_name_or_path (:obj:`string`):
Is either: \
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
cache_dir (:obj:`string`, optional, defaults to `None`):
Path to a directory in which a downloaded pre-trained model Path to a directory in which a downloaded pre-trained model
configuration should be cached if the standard cache should not be used. configuration should be cached if the standard cache should not be used.
kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. force_download (:obj:`boolean`, optional, defaults to `False`):
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
- The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. resume_download (:obj:`boolean`, optional, defaults to `False`):
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. Do not delete incompletely received file. Attempt to resume the download if such a file exists.
force_download: (`optional`) boolean, default False: proxies (:obj:`Dict[str, str]`, optional, defaults to `None`):
Force to (re-)download the model weights and configuration files and override the cached versions if they exists. A dictionary of proxy servers to use by protocol or endpoint, e.g.: :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`.
The proxies are used on each request. See `the requests documentation <https://requests.readthedocs.io/en/master/user/advanced/#proxies>`__ for usage.
resume_download: (`optional`) boolean, default False:
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
return_unused_kwargs: (`optional`) bool:
return_unused_kwargs (:obj:`boolean`, optional, defaults to `False`):
- If False, then this function returns just the final configuration object. - If False, then this function returns just the final configuration object.
- If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): key/value pairs with which to update the configuration object after loading.
- The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
Examples:: Examples::
config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json') config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)

View File

@@ -50,32 +50,44 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BertConfig(PretrainedConfig): class BertConfig(PretrainedConfig):
r""" r"""
:class:`~transformers.BertConfig` is the configuration class to store the configuration of a This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
`BertModel`. It is used to instantiate a BERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT bert-base-uncased architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Arguments: Args:
vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. vocab_size (:obj:`int`, optional, defaults to 30522):
hidden_size: Size of the encoder layers and the pooler layer. Vocabulary size of the BERT model. Defines the different tokens that
num_hidden_layers: Number of hidden layers in the Transformer encoder. can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
num_attention_heads: Number of attention heads for each attention layer in hidden_size (:obj:`int`, optional, defaults to 768):
the Transformer encoder. Size of the encoder layers and the pooler layer.
intermediate_size: The size of the "intermediate" (i.e., feed-forward) num_hidden_layers (:obj:`int`, optional, defaults to 12):
layer in the Transformer encoder. Number of hidden layers in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the num_attention_heads (:obj:`int`, optional, defaults to 12):
encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. Number of attention heads for each attention layer in the Transformer encoder.
hidden_dropout_prob: The dropout probabilitiy for all fully connected intermediate_size (:obj:`int`, optional, defaults to 3072):
layers in the embeddings, encoder, and pooler. The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
attention_probs_dropout_prob: The dropout ratio for the attention hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
probabilities. The non-linear activation function (function or string) in the encoder and pooler.
max_position_embeddings: The maximum sequence length that this model might If string, "gelu", "relu", "swish" and "gelu_new" are supported.
ever be used with. Typically set this to something large just in case hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
(e.g., 512 or 1024 or 2048). The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
type_vocab_size: The vocabulary size of the `token_type_ids` passed into attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
`BertModel`. The dropout ratio for the attention probabilities.
initializer_range: The sttdev of the truncated_normal_initializer for max_position_embeddings (:obj:`int`, optional, defaults to 512):
initializing all weight matrices. The maximum sequence length that this model might ever be used with.
layer_norm_eps: The epsilon used by LayerNorm. Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
""" """
pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -96,6 +108,7 @@ class BertConfig(PretrainedConfig):
**kwargs **kwargs
): ):
super(BertConfig, self).__init__(**kwargs) super(BertConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers self.num_hidden_layers = num_hidden_layers

View File

@@ -29,4 +29,17 @@ CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class CamembertConfig(RobertaConfig): class CamembertConfig(RobertaConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.CamembertModel`.
It is used to instantiate a CamemBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT bert-base-uncased architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
The :class:`~transformers.CamembertConfig` class directly inherits :class:`~transformers.RobertaConfig`.
It reuses the same defaults. Please check the parent class for more information.
"""
pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP

View File

@@ -26,25 +26,43 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf
class CTRLConfig(PretrainedConfig): class CTRLConfig(PretrainedConfig):
"""Configuration class to store the configuration of a `CTRLModel`. """
This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel`.
It is used to instantiate a CTRL model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the CTRL architecture from SalesForce.
Args: Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file. to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
n_positions: Number of positional embeddings. for more information.
n_ctx: Size of the causal mask (usually same as n_positions).
dff: Size of the inner dimension of the FFN. Args:
n_embd: Dimensionality of the embeddings and hidden states. vocab_size (:obj:`int`, optional, defaults to 246534):
n_layer: Number of hidden layers in the Transformer encoder. Vocabulary size of the CTRL model. Defines the different tokens that
n_head: Number of attention heads for each attention layer in can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
the Transformer encoder. n_positions (:obj:`int`, optional, defaults to 256):
layer_norm_epsilon: epsilon to use in the layer norm layers The maximum sequence length that this model might ever be used with.
resid_pdrop: The dropout probabilitiy for all fully connected Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
layers in the embeddings, encoder, and pooler. n_ctx (:obj:`int`, optional, defaults to 256):
attn_pdrop: The dropout ratio for the attention Size of the causal mask (usually same as n_positions).
probabilities. n_embd (:obj:`int`, optional, defaults to 1280):
embd_pdrop: The dropout ratio for the embeddings. Dimensionality of the embeddings and hidden states.
initializer_range: The sttdev of the truncated_normal_initializer for dff (:obj:`int`, optional, defaults to 8192):
initializing all weight matrices. Size of the inner dimension of the FFN.
n_layer (:obj:`int`, optional, defaults to 48):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
""" """
pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -70,26 +88,6 @@ class CTRLConfig(PretrainedConfig):
summary_first_dropout=0.1, summary_first_dropout=0.1,
**kwargs **kwargs
): ):
"""Constructs CTRLConfig.
Args:
vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions).
dff: Size of the inner dimension of the FFN.
n_embd: Dimensionality of the embeddings and hidden states.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
layer_norm_epsilon: epsilon to use in the layer norm layers
resid_pdrop: The dropout probabilitiy for all fully connected
layers in the embeddings, encoder, and pooler.
attn_pdrop: The dropout ratio for the attention
probabilities.
embd_pdrop: The dropout ratio for the embeddings.
initializer_range: The sttdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
super(CTRLConfig, self).__init__(**kwargs) super(CTRLConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.n_ctx = n_ctx self.n_ctx = n_ctx

View File

@@ -31,6 +31,50 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DistilBertConfig(PretrainedConfig): class DistilBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the DistilBERT distilbert-base-uncased architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the DistilBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.DistilBertModel`.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings.
n_layers (:obj:`int`, optional, defaults to 6):
Number of hidden layers in the Transformer encoder.
n_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
dim (:obj:`int`, optional, defaults to 768):
Size of the encoder layers and the pooler layer.
intermediate_size (:obj:`int`, optional, defaults to 3072):
The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
qa_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probabilities used in the question answering model
:class:`~transformers.DistilBertForQuestionAnswering`.
seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
The dropout probabilities used in the sequence classification model
:class:`~transformers.DistilBertForSequenceClassification`.
"""
pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__( def __init__(
@@ -46,7 +90,6 @@ class DistilBertConfig(PretrainedConfig):
attention_dropout=0.1, attention_dropout=0.1,
activation="gelu", activation="gelu",
initializer_range=0.02, initializer_range=0.02,
tie_weights_=True,
qa_dropout=0.1, qa_dropout=0.1,
seq_classif_dropout=0.2, seq_classif_dropout=0.2,
**kwargs **kwargs
@@ -63,7 +106,6 @@ class DistilBertConfig(PretrainedConfig):
self.attention_dropout = attention_dropout self.attention_dropout = attention_dropout
self.activation = activation self.activation = activation
self.initializer_range = initializer_range self.initializer_range = initializer_range
self.tie_weights_ = tie_weights_
self.qa_dropout = qa_dropout self.qa_dropout = qa_dropout
self.seq_classif_dropout = seq_classif_dropout self.seq_classif_dropout = seq_classif_dropout

View File

@@ -33,24 +33,42 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class GPT2Config(PretrainedConfig): class GPT2Config(PretrainedConfig):
"""Configuration class to store the configuration of a `GPT2Model`. """
This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
It is used to instantiate a GPT-2 model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the GPT-2 small architecture.
Args: Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
n_positions: Number of positional embeddings. for more information.
n_ctx: Size of the causal mask (usually same as n_positions).
n_embd: Dimensionality of the embeddings and hidden states.
n_layer: Number of hidden layers in the Transformer encoder. Args:
n_head: Number of attention heads for each attention layer in vocab_size (:obj:`int`, optional, defaults to 50257):
the Transformer encoder. Vocabulary size of the GPT-2 model. Defines the different tokens that
layer_norm_epsilon: epsilon to use in the layer norm layers can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
resid_pdrop: The dropout probabilitiy for all fully connected n_positions (:obj:`int`, optional, defaults to 1024):
layers in the embeddings, encoder, and pooler. The maximum sequence length that this model might ever be used with.
attn_pdrop: The dropout ratio for the attention Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
probabilities. n_ctx (:obj:`int`, optional, defaults to 1024):
embd_pdrop: The dropout ratio for the embeddings. Size of the causal mask (usually same as n_positions).
initializer_range: The sttdev of the truncated_normal_initializer for n_embd (:obj:`int`, optional, defaults to 768):
initializing all weight matrices. Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
""" """
pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -75,26 +93,8 @@ class GPT2Config(PretrainedConfig):
summary_first_dropout=0.1, summary_first_dropout=0.1,
**kwargs **kwargs
): ):
"""Constructs GPT2Config.
Args:
vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions).
n_embd: Dimensionality of the embeddings and hidden states.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
layer_norm_epsilon: epsilon to use in the layer norm layers
resid_pdrop: The dropout probabilitiy for all fully connected
layers in the embeddings, encoder, and pooler.
attn_pdrop: The dropout ratio for the attention
probabilities.
embd_pdrop: The dropout ratio for the embeddings.
initializer_range: The sttdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
super(GPT2Config, self).__init__(**kwargs) super(GPT2Config, self).__init__(**kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.n_ctx = n_ctx self.n_ctx = n_ctx
self.n_positions = n_positions self.n_positions = n_positions

View File

@@ -26,9 +26,13 @@ class MMBTConfig(object):
"""Configuration class to store the configuration of a `MMBT Model`. """Configuration class to store the configuration of a `MMBT Model`.
Args: Args:
config: config of the underlying Transformer models. It's values are copied over to use a single config. config (:obj:`~transformers.PretrainedConfig`):
num_labels: Size of final Linear layer for classification. Config of the underlying Transformer models. Its values are
modal_hidden_size: Embedding dimension of the non-text modality encoder. copied over to use a single config.
num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`):
Size of final Linear layer for classification.
modal_hidden_size (:obj:`int`, optional, defaults to 2048):
Embedding dimension of the non-text modality encoder.
""" """
def __init__(self, config, num_labels=None, modal_hidden_size=2048): def __init__(self, config, num_labels=None, modal_hidden_size=2048):

View File

@@ -30,27 +30,45 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class OpenAIGPTConfig(PretrainedConfig): class OpenAIGPTConfig(PretrainedConfig):
""" """
Configuration class to store the configuration of a `OpenAIGPTModel`. This is the configuration class to store the configuration of an :class:`~transformers.OpenAIGPTModel`.
It is used to instantiate a GPT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the GPT architecture from OpenAI.
Args: Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
vocab_size: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file. to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
n_positions: Number of positional embeddings. for more information.
n_ctx: Size of the causal mask (usually same as n_positions).
n_embd: Dimensionality of the embeddings and hidden states. Args:
n_layer: Number of hidden layers in the Transformer encoder. vocab_size (:obj:`int`, optional, defaults to 40478):
n_head: Number of attention heads for each attention layer in Vocabulary size of the GPT model. Defines the different tokens that
the Transformer encoder. can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.OpenAIGPTModel`.
afn: The non-linear activation function (function or string) in the n_positions (:obj:`int`, optional, defaults to 512):
encoder and pooler. If string, "gelu", "relu" and "swish" are supported. The maximum sequence length that this model might ever be used with.
resid_pdrop: The dropout probabilitiy for all fully connected Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
layers in the embeddings, encoder, and pooler. n_ctx (:obj:`int`, optional, defaults to 512):
attn_pdrop: The dropout ratio for the attention Size of the causal mask (usually same as n_positions).
probabilities. n_embd (:obj:`int`, optional, defaults to 768):
embd_pdrop: The dropout ratio for the embeddings. Dimensionality of the embeddings and hidden states.
layer_norm_epsilon: epsilon to use in the layer norm layers n_layer (:obj:`int`, optional, defaults to 12):
initializer_range: The sttdev of the truncated_normal_initializer for Number of hidden layers in the Transformer encoder.
initializing all weight matrices. n_head (:obj:`int`, optional, defaults to 12):
predict_special_tokens: should we predict special tokens (when the model has a LM head) Number of attention heads for each attention layer in the Transformer encoder.
afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether special tokens should be predicted when the model has a language modeling head.
""" """
pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -77,9 +95,8 @@ class OpenAIGPTConfig(PretrainedConfig):
summary_first_dropout=0.1, summary_first_dropout=0.1,
**kwargs **kwargs
): ):
"""Constructs OpenAIGPTConfig.
"""
super(OpenAIGPTConfig, self).__init__(**kwargs) super(OpenAIGPTConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.n_ctx = n_ctx self.n_ctx = n_ctx
self.n_positions = n_positions self.n_positions = n_positions

View File

@@ -34,4 +34,17 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class RobertaConfig(BertConfig): class RobertaConfig(BertConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel`.
It is used to instantiate a RoBERTa model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT bert-base-uncased architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`.
It reuses the same defaults. Please check the parent class for more information.
"""
pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP

View File

@@ -29,39 +29,74 @@ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class TransfoXLConfig(PretrainedConfig): class TransfoXLConfig(PretrainedConfig):
"""Configuration class to store the configuration of a `TransfoXLModel`. """
This is the configuration class to store the configuration of a :class:`~transformers.TransfoXLModel`.
It is used to instantiate a Transformer XL model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the Transformer XL architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args: Args:
vocab_size: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file. vocab_size (:obj:`int`, optional, defaults to 267735):
cutoffs: cutoffs for the adaptive softmax Vocabulary size of the Transformer XL model. Defines the different tokens that
d_model: Dimensionality of the model's hidden states. can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`.
d_embed: Dimensionality of the embeddings cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`):
d_head: Dimensionality of the model's heads. Cutoffs for the adaptive softmax
div_val: divident value for adapative input and softmax d_model (:obj:`int`, optional, defaults to 1024):
pre_lnorm: apply LayerNorm to the input instead of the output Dimensionality of the model's hidden states.
d_inner: Inner dimension in FF d_embed (:obj:`int`, optional, defaults to 1024):
n_layer: Number of hidden layers in the Transformer encoder. Dimensionality of the embeddings
n_head: Number of attention heads for each attention layer in n_head (:obj:`int`, optional, defaults to 16):
the Transformer encoder. Number of attention heads for each attention layer in the Transformer encoder.
tgt_len: number of tokens to predict d_head (:obj:`int`, optional, defaults to 64):
ext_len: length of the extended context Dimensionality of the model's heads.
mem_len: length of the retained previous heads d_inner (:obj:`int`, optional, defaults to 4096):
same_length: use the same attn length for all tokens Inner dimension in FF
proj_share_all_but_first: True to share all but first projs, False not to share. div_val (:obj:`int`, optional, defaults to 4):
attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. Divisor value for adaptive input and softmax
clamp_len: use the same pos embeddings after clamp_len pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`):
sample_softmax: number of samples in sampled softmax Apply LayerNorm to the input instead of the output
adaptive: use adaptive softmax n_layer (:obj:`int`, optional, defaults to 18):
tie_weight: tie the word embedding and softmax weights Number of hidden layers in the Transformer encoder.
dropout: The dropout probabilitiy for all fully connected tgt_len (:obj:`int`, optional, defaults to 128):
layers in the embeddings, encoder, and pooler. Number of tokens to predict
dropatt: The dropout ratio for the attention probabilities. ext_len (:obj:`int`, optional, defaults to 0):
untie_r: untie relative position biases Length of the extended context
embd_pdrop: The dropout ratio for the embeddings. mem_len (:obj:`int`, optional, defaults to 1600):
init: parameter initializer to use Length of the retained previous heads
init_range: parameters initialized by U(-init_range, init_range). clamp_len (:obj:`int`, optional, defaults to 1000):
proj_init_std: parameters initialized by N(0, init_std) use the same pos embeddings after clamp_len
init_std: parameters initialized by N(0, init_std) same_length (:obj:`boolean`, optional, defaults to :obj:`True`):
Use the same attn length for all tokens
proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`):
True to share all but first projs, False not to share.
attn_type (:obj:`int`, optional, defaults to 0):
Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
sample_softmax (:obj:`int`, optional, defaults to -1):
number of samples in sampled softmax
adaptive (:obj:`boolean`, optional, defaults to :obj:`True`):
use adaptive softmax
tie_weight (:obj:`boolean`, optional, defaults to :obj:`True`):
tie the word embedding and softmax weights
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
dropatt (:obj:`float`, optional, defaults to 0):
The dropout ratio for the attention probabilities.
untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
Untie relative position biases
init (:obj:`string`, optional, defaults to `normal`):
Parameter initializer to use
init_range (:obj:`float`, optional, defaults to 0.01):
Parameters initialized by U(-init_range, init_range).
proj_init_std (:obj:`float`, optional, defaults to 0.01):
Parameters initialized by N(0, init_std)
init_std (:obj:`float`, optional, defaults to 0.02):
Parameters initialized by N(0, init_std)
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
""" """
pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -98,9 +133,8 @@ class TransfoXLConfig(PretrainedConfig):
layer_norm_epsilon=1e-5, layer_norm_epsilon=1e-5,
**kwargs **kwargs
): ):
"""Constructs TransfoXLConfig.
"""
super(TransfoXLConfig, self).__init__(**kwargs) super(TransfoXLConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.cutoffs = [] self.cutoffs = []
self.cutoffs.extend(cutoffs) self.cutoffs.extend(cutoffs)

View File

@@ -37,44 +37,81 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class XLMConfig(PretrainedConfig): class XLMConfig(PretrainedConfig):
"""Configuration class to store the configuration of a `XLMModel`. """
This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
It is used to instantiate an XLM model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
Args: Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
vocab_size: Vocabulary size of `inputs_ids` in `XLMModel`. to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
d_model: Size of the encoder layers and the pooler layer. for more information.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
d_inner: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
ff_activation: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
untie_r: untie relative position biases
attn_type: 'bi' for XLM, 'uni' for Transformer-XL
dropout: The dropout probabilitiy for all fully connected Args:
layers in the embeddings, encoder, and pooler. vocab_size (:obj:`int`, optional, defaults to 30145):
max_position_embeddings: The maximum sequence length that this model might Vocabulary size of the XLM model. Defines the different tokens that
ever be used with. Typically set this to something large just in case can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
(e.g., 512 or 1024 or 2048). emb_dim (:obj:`int`, optional, defaults to 2048):
initializer_range: The sttdev of the truncated_normal_initializer for Dimensionality of the encoder layers and the pooler layer.
initializing all weight matrices. n_layer (:obj:`int`, optional, defaults to 12):
layer_norm_eps: The epsilon used by LayerNorm. Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
dropout: float, dropout rate. Number of attention heads for each attention layer in the Transformer encoder.
init: str, the initialization scheme, either "normal" or "uniform". dropout (:obj:`float`, optional, defaults to 0.1):
init_range: float, initialize the parameters with a uniform distribution The dropout probability for all fully connected
in [-init_range, init_range]. Only effective when init="uniform". layers in the embeddings, encoder, and pooler.
init_std: float, initialize the parameters with a normal distribution attention_dropout (:obj:`float`, optional, defaults to 0.1):
with mean 0 and stddev init_std. Only effective when init="normal". The dropout probability for the attention mechanism
mem_len: int, the number of tokens to cache. gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
reuse_len: int, the number of tokens in the currect batch to be cached The non-linear activation function (function or string) in the
and reused in the future. encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
bi_data: bool, whether to use bidirectional input pipeline. sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
Usually set to True during pretraining and False during finetuning. Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
clamp_len: int, clamp all relative distances larger than clamp_len. causal (:obj:`boolean`, optional, defaults to :obj:`False`):
-1 means no clamping. Set this to `True` for the model to behave in a causal manner.
same_length: bool, whether to use the same attention length for each token. Causal models use a triangular attention mask in order to only attend to the left-side context instead
of a bidirectional context.
asm (:obj:`boolean`, optional, defaults to :obj:`False`):
TODO
n_langs (:obj:`int`, optional, defaults to 1):
The number of languages the model handles. Set to 1 for monolingual models.
use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use language embeddings. Some models use additional language embeddings, see
`the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
for information on how to use them.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
The standard deviation of the truncated_normal_initializer for
initializing the embedding matrices.
init_std (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices except the embedding matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
bos_index (:obj:`int`, optional, defaults to 0):
The index of the beginning of sentence token in the vocabulary.
eos_index (:obj:`int`, optional, defaults to 1):
The index of the end of sentence token in the vocabulary.
pad_index (:obj:`int`, optional, defaults to 2):
The index of the padding token in the vocabulary.
unk_index (:obj:`int`, optional, defaults to 3):
The index of the unknown token in the vocabulary.
mask_index (:obj:`int`, optional, defaults to 5):
The index of the masking token in the vocabulary.
is_encoder (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
start_n_top (:obj:`int`, optional, defaults to 5):
TODO
end_n_top (:obj:`int`, optional, defaults to 5):
TODO
mask_token_id (:obj:`int`, optional, defaults to 0):
Model agnostic parameter to identify masked tokens when generating text in an MLM context.
lang_id (:obj:`int`, optional, defaults to 1):
The ID of the language used by the model. This parameter is used when generating
text in a given language.
""" """
pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP

View File

@@ -30,42 +30,60 @@ XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class XLNetConfig(PretrainedConfig): class XLNetConfig(PretrainedConfig):
"""Configuration class to store the configuration of a ``XLNetModel``. """
This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel`.
It is used to instantiate an XLNet model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlnet-large-cased <https://huggingface.co/xlnet-large-cased>`__ architecture.
Args: Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
vocab_size: Vocabulary size of ``inputs_ids`` in ``XLNetModel``. to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
d_model: Size of the encoder layers and the pooler layer. for more information.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
d_inner: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
ff_activation: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
untie_r: untie relative position biases
attn_type: 'bi' for XLNet, 'uni' for Transformer-XL
dropout: The dropout probabilitiy for all fully connected Args:
layers in the embeddings, encoder, and pooler. vocab_size (:obj:`int`, optional, defaults to 32000):
initializer_range: The sttdev of the truncated_normal_initializer for Vocabulary size of the XLNet model. Defines the different tokens that
initializing all weight matrices. can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLNetModel`.
layer_norm_eps: The epsilon used by LayerNorm. d_model (:obj:`int`, optional, defaults to 1024):
Size of the encoder layers and the pooler layer.
dropout: float, dropout rate. n_layer (:obj:`int`, optional, defaults to 24):
init: str, the initialization scheme, either "normal" or "uniform". Number of hidden layers in the Transformer encoder.
init_range: float, initialize the parameters with a uniform distribution n_head (:obj:`int`, optional, defaults to 16):
in [-init_range, init_range]. Only effective when init="uniform". Number of attention heads for each attention layer in the Transformer encoder.
init_std: float, initialize the parameters with a normal distribution d_inner (:obj:`int`, optional, defaults to 4096):
with mean 0 and stddev init_std. Only effective when init="normal". The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
mem_len: int, the number of tokens to cache. ff_activation (:obj:`string`, optional, defaults to "gelu"):
reuse_len: int, the number of tokens in the currect batch to be cached The non-linear activation function (function or string) in the
and reused in the future. encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
bi_data: bool, whether to use bidirectional input pipeline. untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
Usually set to True during pretraining and False during finetuning. Untie relative position biases
clamp_len: int, clamp all relative distances larger than clamp_len. attn_type (:obj:`string`, optional, defaults to "bi"):
-1 means no clamping. The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL.
same_length: bool, whether to use the same attention length for each token. initializer_range (:obj:`float`, optional, defaults to 0.02):
finetuning_task: name of the glue task on which the model was fine-tuned if any The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
The number of tokens to cache. The key/value pairs that have already been pre-computed
in a previous forward pass won't be re-computed. See the
`quickstart <https://huggingface.co/transformers/quickstart.html#using-the-past>`__
for more information.
reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
The number of tokens in the current batch to be cached and reused in the future.
bi_data (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use bidirectional input pipeline. Usually set to `True` during
pretraining and `False` during finetuning.
clamp_len (:obj:`int`, optional, defaults to -1):
Clamp all relative distances larger than clamp_len.
Setting this attribute to -1 means no clamping.
same_length (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use the same attention length for each token.
start_n_top (:obj:`int`, optional, defaults to 5):
TODO
end_n_top (:obj:`int`, optional, defaults to 5):
TODO
""" """
pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP