From 632682726f64f83014e4259dc42195da6d817695 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Sun, 12 Jan 2020 21:53:19 +0100 Subject: [PATCH] Updated Configurations --- docs/source/model_doc/albert.rst | 2 +- src/transformers/configuration_albert.py | 97 +++++++++++------ src/transformers/configuration_auto.py | 103 ++++++++---------- src/transformers/configuration_bert.py | 61 +++++++---- src/transformers/configuration_camembert.py | 13 +++ src/transformers/configuration_ctrl.py | 74 ++++++------- src/transformers/configuration_distilbert.py | 46 +++++++- src/transformers/configuration_gpt2.py | 72 ++++++------ src/transformers/configuration_mmbt.py | 10 +- src/transformers/configuration_openai.py | 61 +++++++---- src/transformers/configuration_roberta.py | 13 +++ src/transformers/configuration_transfo_xl.py | 100 +++++++++++------ src/transformers/configuration_xlm.py | 109 +++++++++++++------ src/transformers/configuration_xlnet.py | 86 +++++++++------ 14 files changed, 528 insertions(+), 319 deletions(-) diff --git a/docs/source/model_doc/albert.rst b/docs/source/model_doc/albert.rst index 92970c9328..5cf3f5ee7b 100644 --- a/docs/source/model_doc/albert.rst +++ b/docs/source/model_doc/albert.rst @@ -1,7 +1,7 @@ ALBERT ---------------------------------------------------- -``AlbrtConfig`` +``AlbertConfig`` ~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.AlbertConfig diff --git a/src/transformers/configuration_albert.py b/src/transformers/configuration_albert.py index 1d6adfa7e9..bcf6f7f361 100644 --- a/src/transformers/configuration_albert.py +++ b/src/transformers/configuration_albert.py @@ -31,9 +31,73 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { class AlbertConfig(PretrainedConfig): - """Configuration for `AlbertModel`. + r""" + This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`. + It is used to instantiate an ALBERT model according to the specified arguments, defining the model + architecture. 
Instantiating a configuration with the defaults will yield a similar configuration to that of + the ALBERT xxlarge architecture. - The default settings match the configuration of model `albert_xxlarge`. + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. + + + Args: + vocab_size (:obj:`int`, optional, defaults to 30000): + Vocabulary size of the ALBERT model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`. + embedding_size (:obj:`int`, optional, defaults to 128): + Size of vocabulary embeddings. + hidden_size (:obj:`int`, optional, defaults to 4096): + Size of the encoder layers and the pooler layer. + num_hidden_layers (:obj:`int`, optional, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_hidden_groups (:obj:`int`, optional, defaults to 1): + Number of groups for the hidden layers, parameters in the same group are shared. + num_attention_heads (:obj:`int`, optional, defaults to 64): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (:obj:`int`, optional, defaults to 16384): + The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + inner_group_num (:obj:`int`, optional, defaults to 1): + The number of inner repetition of attention and ffn. + hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"): + The non-linear activation function (function or string) in the encoder and pooler. + If string, "gelu", "relu", "swish" and "gelu_new" are supported. + hidden_dropout_prob (:obj:`float`, optional, defaults to 0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. 
+ attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0): + The dropout ratio for the attention probabilities. + max_position_embeddings (:obj:`int`, optional, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something + large (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, optional, defaults to 2): + The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`. + initializer_range (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): + The epsilon used by the layer normalization layers. + + Example:: + + # Initializing an ALBERT-xxlarge style configuration + albert_xxlarge_configuration = AlbertConfig() + + # Initializing an ALBERT-base style configuration + albert_base_configuration = AlbertConfig( + hidden_size=768, + num_attention_heads=12, + intermediate_size=3072, + ) + + # Initializing a model from the ALBERT-base style configuration + model = AlbertModel(albert_base_configuration) + + # Accessing the model configuration + configuration = model.config + + Attributes: + pretrained_config_archive_map (Dict[str, str]): + A dictionary containing all the available pre-trained checkpoints. """ pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP @@ -57,35 +121,6 @@ class AlbertConfig(PretrainedConfig): layer_norm_eps=1e-12, **kwargs ): - """Constructs AlbertConfig. - - Args: - vocab_size: Vocabulary size of `inputs_ids` in `AlbertModel`. - embedding_size: size of voc embeddings. - hidden_size: Size of the encoder layers and the pooler layer. - num_hidden_layers: Number of hidden layers in the Transformer encoder. - num_hidden_groups: Number of group for the hidden layers, parameters in - the same group are shared. 
- num_attention_heads: Number of attention heads for each attention layer in - the Transformer encoder. - intermediate_size: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - inner_group_num: int, number of inner repetition of attention and ffn. - down_scale_factor: float, the scale to apply - hidden_act: The non-linear activation function (function or string) in the - encoder and pooler. - hidden_dropout_prob: The dropout probability for all fully connected - layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob: The dropout ratio for the attention - probabilities. - max_position_embeddings: The maximum sequence length that this model might - ever be used with. Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). - type_vocab_size: The vocabulary size of the `token_type_ids` passed into - `AlbertModel`. - initializer_range: The stdev of the truncated_normal_initializer for - initializing all weight matrices. - """ super(AlbertConfig, self).__init__(**kwargs) self.vocab_size = vocab_size diff --git a/src/transformers/configuration_auto.py b/src/transformers/configuration_auto.py index 32a0385eca..8ba9515435 100644 --- a/src/transformers/configuration_auto.py +++ b/src/transformers/configuration_auto.py @@ -57,29 +57,13 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict( class AutoConfig(object): - r""":class:`~transformers.AutoConfig` is a generic configuration class + r""" + :class:`~transformers.AutoConfig` is a generic configuration class that will be instantiated as one of the configuration classes of the library - when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` - class method. + when created with the :func:`~transformers.AutoConfig.from_pretrained` class method. - The `from_pretrained()` method take care of returning the correct model class instance - using pattern matching on the `pretrained_model_name_or_path` string. 
- - The base model class to instantiate is selected as the first pattern matching - in the `pretrained_model_name_or_path` string (in the following order): - - contains `distilbert`: DistilBertConfig (DistilBERT model) - - contains `albert`: AlbertConfig (ALBERT model) - - contains `camembert`: CamembertConfig (CamemBERT model) - - contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model) - - contains `roberta`: RobertaConfig (RoBERTa model) - - contains `bert`: BertConfig (Bert model) - - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) - - contains `gpt2`: GPT2Config (OpenAI GPT-2 model) - - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) - - contains `xlnet`: XLNetConfig (XLNet model) - - contains `xlm`: XLMConfig (XLM model) - - contains `ctrl` : CTRLConfig (CTRL model) - This class cannot be instantiated using `__init__()` (throw an error). + The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string argument. 
""" def __init__(self): @@ -94,6 +78,8 @@ class AutoConfig(object): return DistilBertConfig(*args, **kwargs) elif "roberta" in model_type: return RobertaConfig(*args, **kwargs) + elif "albert" in model_type: + return AlbertConfig(*args, **kwargs) elif "bert" in model_type: return BertConfig(*args, **kwargs) elif "openai-gpt" in model_type: @@ -108,8 +94,6 @@ class AutoConfig(object): return XLMConfig(*args, **kwargs) elif "ctrl" in model_type: return CTRLConfig(*args, **kwargs) - elif "albert" in model_type: - return AlbertConfig(*args, **kwargs) elif "camembert" in model_type: return CamembertConfig(*args, **kwargs) raise ValueError( @@ -120,59 +104,60 @@ class AutoConfig(object): @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - r""" Instantiate a one of the configuration classes of the library + r""" Instantiates one of the configuration classes of the library from a pre-trained model configuration. The configuration class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): - - contains `t5`: T5Config (T5 model) - - contains `distilbert`: DistilBertConfig (DistilBERT model) - - contains `albert`: AlbertConfig (ALBERT model) - - contains `camembert`: CamembertConfig (CamemBERT model) - - contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model) - - contains `roberta`: RobertaConfig (RoBERTa model) - - contains `bert`: BertConfig (Bert model) - - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) - - contains `gpt2`: GPT2Config (OpenAI GPT-2 model) - - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) - - contains `xlnet`: XLNetConfig (XLNet model) - - contains `xlm`: XLMConfig (XLM model) - - contains `ctrl` : CTRLConfig (CTRL model) - Params: - pretrained_model_name_or_path: either: + - contains `t5`: :class:`~transformers.T5Config` (T5 model) + - contains `distilbert`: :class:`~transformers.DistilBertConfig` (DistilBERT model) + - 
contains `albert`: :class:`~transformers.AlbertConfig` (ALBERT model) + - contains `camembert`: :class:`~transformers.CamembertConfig` (CamemBERT model) + - contains `xlm-roberta`: :class:`~transformers.XLMRobertaConfig` (XLM-RoBERTa model) + - contains `roberta`: :class:`~transformers.RobertaConfig` (RoBERTa model) + - contains `bert`: :class:`~transformers.BertConfig` (Bert model) + - contains `openai-gpt`: :class:`~transformers.OpenAIGPTConfig` (OpenAI GPT model) + - contains `gpt2`: :class:`~transformers.GPT2Config` (OpenAI GPT-2 model) + - contains `transfo-xl`: :class:`~transformers.TransfoXLConfig` (Transformer-XL model) + - contains `xlnet`: :class:`~transformers.XLNetConfig` (XLNet model) + - contains `xlm`: :class:`~transformers.XLMConfig` (XLM model) + - contains `ctrl` : :class:`~transformers.CTRLConfig` (CTRL model) - - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. - - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. - cache_dir: (`optional`) string: + Args: + pretrained_model_name_or_path (:obj:`string`): + Is either: \ + - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. 
+ - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. + + cache_dir (:obj:`string`, optional, defaults to `None`): Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. - kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. + force_download (:obj:`boolean`, optional, defaults to `False`): + Force to (re-)download the model weights and configuration files and override the cached versions if they exist. - - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. - - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. + resume_download (:obj:`boolean`, optional, defaults to `False`): + Do not delete incompletely received file. Attempt to resume the download if such a file exists. - force_download: (`optional`) boolean, default False: - Force to (re-)download the model weights and configuration files and override the cached versions if they exists. - - resume_download: (`optional`) boolean, default False: - Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. - - proxies: (`optional`) dict, default None: - A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. - The proxies are used on each request. - - return_unused_kwargs: (`optional`) bool: + proxies (:obj:`Dict[str, str]`, optional, defaults to `None`): + A dictionary of proxy servers to use by protocol or endpoint, e.g.: :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. + The proxies are used on each request. See `the requests documentation <https://requests.readthedocs.io/en/master/user/advanced/#proxies>`__ for usage. 
+ return_unused_kwargs (:obj:`boolean`, optional, defaults to `False`): - If False, then this function returns just the final configuration object. - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): key/value pairs with which to update the configuration object after loading. + - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. + - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. + + Examples:: - config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json') config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) diff --git a/src/transformers/configuration_bert.py b/src/transformers/configuration_bert.py index 32fa50a504..867ba61397 100644 --- a/src/transformers/configuration_bert.py +++ b/src/transformers/configuration_bert.py @@ -50,32 +50,44 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { class BertConfig(PretrainedConfig): r""" - :class:`~transformers.BertConfig` is the configuration class to store the configuration of a - `BertModel`. + This is the configuration class to store the configuration of a :class:`~transformers.BertModel`. 
+ It is used to instantiate a BERT model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the BERT bert-base-uncased architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. - Arguments: - vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. - hidden_size: Size of the encoder layers and the pooler layer. - num_hidden_layers: Number of hidden layers in the Transformer encoder. - num_attention_heads: Number of attention heads for each attention layer in - the Transformer encoder. - intermediate_size: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - hidden_act: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. - hidden_dropout_prob: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob: The dropout ratio for the attention - probabilities. - max_position_embeddings: The maximum sequence length that this model might - ever be used with. Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). - type_vocab_size: The vocabulary size of the `token_type_ids` passed into - `BertModel`. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - layer_norm_eps: The epsilon used by LayerNorm. + Args: + vocab_size (:obj:`int`, optional, defaults to 30522): + Vocabulary size of the BERT model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`. 
+ hidden_size (:obj:`int`, optional, defaults to 768): + Size of the encoder layers and the pooler layer. + num_hidden_layers (:obj:`int`, optional, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (:obj:`int`, optional, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (:obj:`int`, optional, defaults to 3072): + The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): + The non-linear activation function (function or string) in the encoder and pooler. + If string, "gelu", "relu", "swish" and "gelu_new" are supported. + hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (:obj:`int`, optional, defaults to 512): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, optional, defaults to 2): + The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`. + initializer_range (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): + The epsilon used by the layer normalization layers. 
""" pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP @@ -96,6 +108,7 @@ class BertConfig(PretrainedConfig): **kwargs ): super(BertConfig, self).__init__(**kwargs) + self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers diff --git a/src/transformers/configuration_camembert.py b/src/transformers/configuration_camembert.py index 8ecdf714b1..618169b8fc 100644 --- a/src/transformers/configuration_camembert.py +++ b/src/transformers/configuration_camembert.py @@ -29,4 +29,17 @@ CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { class CamembertConfig(RobertaConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.CamembertModel`. + It is used to instantiate a Camembert model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the BERT bert-base-uncased architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. + + The :class:`~transformers.CamembertConfig` class directly inherits :class:`~transformers.RobertaConfig`. + It reuses the same defaults. Please check the parent class for more information. + """ pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP diff --git a/src/transformers/configuration_ctrl.py b/src/transformers/configuration_ctrl.py index e23bf7a376..46113ac293 100644 --- a/src/transformers/configuration_ctrl.py +++ b/src/transformers/configuration_ctrl.py @@ -26,25 +26,43 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf class CTRLConfig(PretrainedConfig): - """Configuration class to store the configuration of a `CTRLModel`. 
+ """ + This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel`. + It is used to instantiate a CTRL model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the CTRL architecture from SalesForce. - Args: - vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file. - n_positions: Number of positional embeddings. - n_ctx: Size of the causal mask (usually same as n_positions). - dff: Size of the inner dimension of the FFN. - n_embd: Dimensionality of the embeddings and hidden states. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - layer_norm_epsilon: epsilon to use in the layer norm layers - resid_pdrop: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attn_pdrop: The dropout ratio for the attention - probabilities. - embd_pdrop: The dropout ratio for the embeddings. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. + + Args: + vocab_size (:obj:`int`, optional, defaults to 246534): + Vocabulary size of the CTRL model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`. + n_positions (:obj:`int`, optional, defaults to 256): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or 1024 or 2048). 
+ n_ctx (:obj:`int`, optional, defaults to 256): + Size of the causal mask (usually same as n_positions). + n_embd (:obj:`int`, optional, defaults to 1280): + Dimensionality of the embeddings and hidden states. + dff (:obj:`int`, optional, defaults to 8192): + Size of the inner dimension of the FFN. + n_layer (:obj:`int`, optional, defaults to 48): + Number of hidden layers in the Transformer encoder. + n_head (:obj:`int`, optional, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + resid_pdrop (:obj:`float`, optional, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (:obj:`float`, optional, defaults to 0.1): + The dropout ratio for the embeddings. + attn_pdrop (:obj:`float`, optional, defaults to 0.1): + The dropout ratio for the attention. + layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6): + The epsilon to use in the layer normalization layers + initializer_range (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. """ pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP @@ -70,26 +88,6 @@ class CTRLConfig(PretrainedConfig): summary_first_dropout=0.1, **kwargs ): - """Constructs CTRLConfig. - - Args: - vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file. - n_positions: Number of positional embeddings. - n_ctx: Size of the causal mask (usually same as n_positions). - dff: Size of the inner dimension of the FFN. - n_embd: Dimensionality of the embeddings and hidden states. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. 
- layer_norm_epsilon: epsilon to use in the layer norm layers - resid_pdrop: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attn_pdrop: The dropout ratio for the attention - probabilities. - embd_pdrop: The dropout ratio for the embeddings. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - """ super(CTRLConfig, self).__init__(**kwargs) self.vocab_size = vocab_size self.n_ctx = n_ctx diff --git a/src/transformers/configuration_distilbert.py b/src/transformers/configuration_distilbert.py index 1dd4a11912..df0b73ed9c 100644 --- a/src/transformers/configuration_distilbert.py +++ b/src/transformers/configuration_distilbert.py @@ -31,6 +31,50 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { class DistilBertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`. + It is used to instantiate a DistilBERT model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the DistilBERT distilbert-base-uncased architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. + + + Args: + vocab_size (:obj:`int`, optional, defaults to 30522): + Vocabulary size of the DistilBERT model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.DistilBertModel`. + max_position_embeddings (:obj:`int`, optional, defaults to 512): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or 1024 or 2048). 
+ sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`): + Whether to use sinusoidal positional embeddings. + n_layers (:obj:`int`, optional, defaults to 6): + Number of hidden layers in the Transformer encoder. + n_heads (:obj:`int`, optional, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + dim (:obj:`int`, optional, defaults to 768): + Size of the encoder layers and the pooler layer. + intermediate_size (:obj:`int`, optional, defaults to 3072): + The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + dropout (:obj:`float`, optional, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (:obj:`float`, optional, defaults to 0.1): + The dropout ratio for the attention probabilities. + activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): + The non-linear activation function (function or string) in the encoder and pooler. + If string, "gelu", "relu", "swish" and "gelu_new" are supported. + initializer_range (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + qa_dropout (:obj:`float`, optional, defaults to 0.1): + The dropout probabilities used in the question answering model + :class:`~transformers.DistilBertForQuestionAnswering`. + seq_classif_dropout (:obj:`float`, optional, defaults to 0.2): + The dropout probabilities used in the sequence classification model + :class:`~transformers.DistilBertForSequenceClassification`. 
+ """ pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP def __init__( @@ -46,7 +90,6 @@ class DistilBertConfig(PretrainedConfig): attention_dropout=0.1, activation="gelu", initializer_range=0.02, - tie_weights_=True, qa_dropout=0.1, seq_classif_dropout=0.2, **kwargs @@ -63,7 +106,6 @@ class DistilBertConfig(PretrainedConfig): self.attention_dropout = attention_dropout self.activation = activation self.initializer_range = initializer_range - self.tie_weights_ = tie_weights_ self.qa_dropout = qa_dropout self.seq_classif_dropout = seq_classif_dropout diff --git a/src/transformers/configuration_gpt2.py b/src/transformers/configuration_gpt2.py index 8da1800747..e421a7b10f 100644 --- a/src/transformers/configuration_gpt2.py +++ b/src/transformers/configuration_gpt2.py @@ -33,24 +33,42 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = { class GPT2Config(PretrainedConfig): - """Configuration class to store the configuration of a `GPT2Model`. + """ + This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`. + It is used to instantiate a GPT-2 model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the GPT-2 small architecture. - Args: - vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. - n_positions: Number of positional embeddings. - n_ctx: Size of the causal mask (usually same as n_positions). - n_embd: Dimensionality of the embeddings and hidden states. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - layer_norm_epsilon: epsilon to use in the layer norm layers - resid_pdrop: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attn_pdrop: The dropout ratio for the attention - probabilities. 
- embd_pdrop: The dropout ratio for the embeddings. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. + + + Args: + vocab_size (:obj:`int`, optional, defaults to 50257): + Vocabulary size of the GPT-2 model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.GPT2Model`. + n_positions (:obj:`int`, optional, defaults to 1024): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or 1024 or 2048). + n_ctx (:obj:`int`, optional, defaults to 1024): + Size of the causal mask (usually same as n_positions). + n_embd (:obj:`int`, optional, defaults to 768): + Dimensionality of the embeddings and hidden states. + n_layer (:obj:`int`, optional, defaults to 12): + Number of hidden layers in the Transformer encoder. + n_head (:obj:`int`, optional, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + resid_pdrop (:obj:`float`, optional, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (:obj:`float`, optional, defaults to 0.1): + The dropout ratio for the embeddings. + attn_pdrop (:obj:`float`, optional, defaults to 0.1): + The dropout ratio for the attention. + layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5): + The epsilon to use in the layer normalization layers + initializer_range (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
""" pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP @@ -75,26 +93,8 @@ class GPT2Config(PretrainedConfig): summary_first_dropout=0.1, **kwargs ): - """Constructs GPT2Config. - - Args: - vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. - n_positions: Number of positional embeddings. - n_ctx: Size of the causal mask (usually same as n_positions). - n_embd: Dimensionality of the embeddings and hidden states. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - layer_norm_epsilon: epsilon to use in the layer norm layers - resid_pdrop: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attn_pdrop: The dropout ratio for the attention - probabilities. - embd_pdrop: The dropout ratio for the embeddings. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - """ super(GPT2Config, self).__init__(**kwargs) + self.vocab_size = vocab_size self.n_ctx = n_ctx self.n_positions = n_positions diff --git a/src/transformers/configuration_mmbt.py b/src/transformers/configuration_mmbt.py index b072468e7f..56a35e237c 100644 --- a/src/transformers/configuration_mmbt.py +++ b/src/transformers/configuration_mmbt.py @@ -26,9 +26,13 @@ class MMBTConfig(object): """Configuration class to store the configuration of a `MMBT Model`. Args: - config: config of the underlying Transformer models. It's values are copied over to use a single config. - num_labels: Size of final Linear layer for classification. - modal_hidden_size: Embedding dimension of the non-text modality encoder. + config (:obj:`~transformers.PretrainedConfig`): + Config of the underlying Transformer models. Its values are + copied over to use a single config. 
+ num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`): + Size of final Linear layer for classification. + modal_hidden_size (:obj:`int`, optional, defaults to 2048): + Embedding dimension of the non-text modality encoder. """ def __init__(self, config, num_labels=None, modal_hidden_size=2048): diff --git a/src/transformers/configuration_openai.py b/src/transformers/configuration_openai.py index d7e88bda92..28c501b77e 100644 --- a/src/transformers/configuration_openai.py +++ b/src/transformers/configuration_openai.py @@ -30,27 +30,45 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { class OpenAIGPTConfig(PretrainedConfig): """ - Configuration class to store the configuration of a `OpenAIGPTModel`. + This is the configuration class to store the configuration of an :class:`~transformers.OpenAIGPTModel`. + It is used to instantiate a GPT model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the GPT architecture from OpenAI. - Args: - vocab_size: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file. - n_positions: Number of positional embeddings. - n_ctx: Size of the causal mask (usually same as n_positions). - n_embd: Dimensionality of the embeddings and hidden states. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - afn: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - resid_pdrop: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attn_pdrop: The dropout ratio for the attention - probabilities. - embd_pdrop: The dropout ratio for the embeddings.
- layer_norm_epsilon: epsilon to use in the layer norm layers - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - predict_special_tokens: should we predict special tokens (when the model has a LM head) + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. + + Args: + vocab_size (:obj:`int`, optional, defaults to 40478): + Vocabulary size of the GPT model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.OpenAIGPTModel`. + n_positions (:obj:`int`, optional, defaults to 512): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or 1024 or 2048). + n_ctx (:obj:`int`, optional, defaults to 512): + Size of the causal mask (usually same as n_positions). + n_embd (:obj:`int`, optional, defaults to 768): + Dimensionality of the embeddings and hidden states. + n_layer (:obj:`int`, optional, defaults to 12): + Number of hidden layers in the Transformer encoder. + n_head (:obj:`int`, optional, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): + The non-linear activation function (function or string) in the encoder and pooler. + If string, "gelu", "relu", "swish" and "gelu_new" are supported. + resid_pdrop (:obj:`float`, optional, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (:obj:`float`, optional, defaults to 0.1): + The dropout ratio for the embeddings. + attn_pdrop (:obj:`float`, optional, defaults to 0.1): + The dropout ratio for the attention.
+ layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5): + The epsilon to use in the layer normalization layers + initializer_range (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`): + Whether special tokens should be predicted when the model has a language modeling head. """ pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP @@ -77,9 +95,8 @@ class OpenAIGPTConfig(PretrainedConfig): summary_first_dropout=0.1, **kwargs ): - """Constructs OpenAIGPTConfig. - """ super(OpenAIGPTConfig, self).__init__(**kwargs) + self.vocab_size = vocab_size self.n_ctx = n_ctx self.n_positions = n_positions diff --git a/src/transformers/configuration_roberta.py b/src/transformers/configuration_roberta.py index f505a699b1..5dc9776942 100644 --- a/src/transformers/configuration_roberta.py +++ b/src/transformers/configuration_roberta.py @@ -34,4 +34,17 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { class RobertaConfig(BertConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel`. + It is used to instantiate a RoBERTa model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the BERT bert-base-uncased architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. + + The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. + It reuses the same defaults. Please check the parent class for more information.
+ """ pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP diff --git a/src/transformers/configuration_transfo_xl.py b/src/transformers/configuration_transfo_xl.py index 7b285ca3ed..789f6c03a4 100644 --- a/src/transformers/configuration_transfo_xl.py +++ b/src/transformers/configuration_transfo_xl.py @@ -29,39 +29,74 @@ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = { class TransfoXLConfig(PretrainedConfig): - """Configuration class to store the configuration of a `TransfoXLModel`. + """ + This is the configuration class to store the configuration of an :class:`~transformers.TransfoXLModel`. + It is used to instantiate a Transformer XL model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the Transformer XL architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. Args: - vocab_size: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file. - cutoffs: cutoffs for the adaptive softmax - d_model: Dimensionality of the model's hidden states. - d_embed: Dimensionality of the embeddings - d_head: Dimensionality of the model's heads. - div_val: divident value for adapative input and softmax - pre_lnorm: apply LayerNorm to the input instead of the output - d_inner: Inner dimension in FF - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - tgt_len: number of tokens to predict - ext_len: length of the extended context - mem_len: length of the retained previous heads - same_length: use the same attn length for all tokens - proj_share_all_but_first: True to share all but first projs, False not to share. - attn_type: attention type. 
0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. - clamp_len: use the same pos embeddings after clamp_len - sample_softmax: number of samples in sampled softmax - adaptive: use adaptive softmax - tie_weight: tie the word embedding and softmax weights - dropout: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - dropatt: The dropout ratio for the attention probabilities. - untie_r: untie relative position biases - embd_pdrop: The dropout ratio for the embeddings. - init: parameter initializer to use - init_range: parameters initialized by U(-init_range, init_range). - proj_init_std: parameters initialized by N(0, init_std) - init_std: parameters initialized by N(0, init_std) + vocab_size (:obj:`int`, optional, defaults to 267735): + Vocabulary size of the Transformer XL model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`. + cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`): + Cutoffs for the adaptive softmax + d_model (:obj:`int`, optional, defaults to 1024): + Dimensionality of the model's hidden states. + d_embed (:obj:`int`, optional, defaults to 1024): + Dimensionality of the embeddings + n_head (:obj:`int`, optional, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + d_head (:obj:`int`, optional, defaults to 64): + Dimensionality of the model's heads. + d_inner (:obj:`int`, optional, defaults to 4096): + Inner dimension in FF + div_val (:obj:`int`, optional, defaults to 4): + Divisor value for adaptive input and softmax + pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`): + Apply LayerNorm to the input instead of the output + n_layer (:obj:`int`, optional, defaults to 18): + Number of hidden layers in the Transformer encoder.
+ tgt_len (:obj:`int`, optional, defaults to 128): + Number of tokens to predict + ext_len (:obj:`int`, optional, defaults to 0): + Length of the extended context + mem_len (:obj:`int`, optional, defaults to 1600): + Length of the retained previous heads + clamp_len (:obj:`int`, optional, defaults to 1000): + Use the same pos embeddings after clamp_len + same_length (:obj:`boolean`, optional, defaults to :obj:`True`): + Use the same attn length for all tokens + proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`): + True to share all but first projs, False not to share. + attn_type (:obj:`int`, optional, defaults to 0): + Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. + sample_softmax (:obj:`int`, optional, defaults to -1): + Number of samples in sampled softmax + adaptive (:obj:`boolean`, optional, defaults to :obj:`True`): + Use adaptive softmax + tie_weight (:obj:`boolean`, optional, defaults to :obj:`True`): + Tie the word embedding and softmax weights + dropout (:obj:`float`, optional, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + dropatt (:obj:`float`, optional, defaults to 0): + The dropout ratio for the attention probabilities. + untie_r (:obj:`boolean`, optional, defaults to :obj:`True`): + Untie relative position biases + init (:obj:`string`, optional, defaults to `normal`): + Parameter initializer to use + init_range (:obj:`float`, optional, defaults to 0.01): + Parameters initialized by U(-init_range, init_range).
+ proj_init_std (:obj:`float`, optional, defaults to 0.01): + Parameters initialized by N(0, init_std) + init_std (:obj:`float`, optional, defaults to 0.02): + Parameters initialized by N(0, init_std) + layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5): + The epsilon to use in the layer normalization layers """ pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP @@ -98,9 +133,8 @@ class TransfoXLConfig(PretrainedConfig): layer_norm_epsilon=1e-5, **kwargs ): - """Constructs TransfoXLConfig. - """ super(TransfoXLConfig, self).__init__(**kwargs) + self.vocab_size = vocab_size self.cutoffs = [] self.cutoffs.extend(cutoffs) diff --git a/src/transformers/configuration_xlm.py b/src/transformers/configuration_xlm.py index b56182413b..0aa449ae7b 100644 --- a/src/transformers/configuration_xlm.py +++ b/src/transformers/configuration_xlm.py @@ -37,44 +37,81 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = { class XLMConfig(PretrainedConfig): - """Configuration class to store the configuration of a `XLMModel`. + """ + This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`. + It is used to instantiate an XLM model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the `xlm-mlm-en-2048 `__ architecture. - Args: - vocab_size: Vocabulary size of `inputs_ids` in `XLMModel`. - d_model: Size of the encoder layers and the pooler layer. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - d_inner: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - ff_activation: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 
- untie_r: untie relative position biases - attn_type: 'bi' for XLM, 'uni' for Transformer-XL + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. - dropout: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - max_position_embeddings: The maximum sequence length that this model might - ever be used with. Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - layer_norm_eps: The epsilon used by LayerNorm. - - dropout: float, dropout rate. - init: str, the initialization scheme, either "normal" or "uniform". - init_range: float, initialize the parameters with a uniform distribution - in [-init_range, init_range]. Only effective when init="uniform". - init_std: float, initialize the parameters with a normal distribution - with mean 0 and stddev init_std. Only effective when init="normal". - mem_len: int, the number of tokens to cache. - reuse_len: int, the number of tokens in the currect batch to be cached - and reused in the future. - bi_data: bool, whether to use bidirectional input pipeline. - Usually set to True during pretraining and False during finetuning. - clamp_len: int, clamp all relative distances larger than clamp_len. - -1 means no clamping. - same_length: bool, whether to use the same attention length for each token. + Args: + vocab_size (:obj:`int`, optional, defaults to 30145): + Vocabulary size of the XLM model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`. + emb_dim (:obj:`int`, optional, defaults to 2048): + Dimensionality of the encoder layers and the pooler layer. 
+ n_layer (:obj:`int`, optional, defaults to 12): + Number of hidden layers in the Transformer encoder. + n_head (:obj:`int`, optional, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + dropout (:obj:`float`, optional, defaults to 0.1): + The dropout probability for all fully connected + layers in the embeddings, encoder, and pooler. + attention_dropout (:obj:`float`, optional, defaults to 0.1): + The dropout probability for the attention mechanism + gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`): + The non-linear activation function (function or string) in the + encoder and pooler. If set to `True`, "gelu" will be used instead of "relu". + sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`): + Whether to use sinusoidal positional embeddings instead of absolute positional embeddings. + causal (:obj:`boolean`, optional, defaults to :obj:`False`): + Set this to `True` for the model to behave in a causal manner. + Causal models use a triangular attention mask in order to only attend to the left-side context instead + of a bidirectional context. + asm (:obj:`boolean`, optional, defaults to :obj:`False`): + TODO + n_langs (:obj:`int`, optional, defaults to 1): + The number of languages the model handles. Set to 1 for monolingual models. + use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`): + Whether to use language embeddings. Some models use additional language embeddings, see + `the multilingual models page `__ + for information on how to use them. + max_position_embeddings (:obj:`int`, optional, defaults to 512): + The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5): + The standard deviation of the truncated_normal_initializer for + initializing the embedding matrices.
+ init_std (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for + initializing all weight matrices except the embedding matrices. + layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): + The epsilon used by the layer normalization layers. + bos_index (:obj:`int`, optional, defaults to 0): + The index of the beginning of sentence token in the vocabulary. + eos_index (:obj:`int`, optional, defaults to 1): + The index of the end of sentence token in the vocabulary. + pad_index (:obj:`int`, optional, defaults to 2): + The index of the padding token in the vocabulary. + unk_index (:obj:`int`, optional, defaults to 3): + The index of the unknown token in the vocabulary. + mask_index (:obj:`int`, optional, defaults to 5): + The index of the masking token in the vocabulary. + is_encoder (:obj:`boolean`, optional, defaults to :obj:`True`): + Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al. + start_n_top (:obj:`int`, optional, defaults to 5): + TODO + end_n_top (:obj:`int`, optional, defaults to 5): + TODO + mask_token_id (:obj:`int`, optional, defaults to 0): + Model agnostic parameter to identify masked tokens when generating text in an MLM context. + lang_id (:obj:`int`, optional, defaults to 1): + The ID of the language used by the model. This parameter is used when generating + text in a given language. """ pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP diff --git a/src/transformers/configuration_xlnet.py b/src/transformers/configuration_xlnet.py index 38d00d7604..15337d3920 100644 --- a/src/transformers/configuration_xlnet.py +++ b/src/transformers/configuration_xlnet.py @@ -30,42 +30,60 @@ XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { class XLNetConfig(PretrainedConfig): - """Configuration class to store the configuration of a ``XLNetModel``.
+ """ + This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel`. + It is used to instantiate an XLNet model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the `xlnet-large-cased `__ architecture. - Args: - vocab_size: Vocabulary size of ``inputs_ids`` in ``XLNetModel``. - d_model: Size of the encoder layers and the pooler layer. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - d_inner: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - ff_activation: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - untie_r: untie relative position biases - attn_type: 'bi' for XLNet, 'uni' for Transformer-XL + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. - dropout: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - layer_norm_eps: The epsilon used by LayerNorm. - - dropout: float, dropout rate. - init: str, the initialization scheme, either "normal" or "uniform". - init_range: float, initialize the parameters with a uniform distribution - in [-init_range, init_range]. Only effective when init="uniform". - init_std: float, initialize the parameters with a normal distribution - with mean 0 and stddev init_std. Only effective when init="normal". - mem_len: int, the number of tokens to cache. 
- reuse_len: int, the number of tokens in the currect batch to be cached - and reused in the future. - bi_data: bool, whether to use bidirectional input pipeline. - Usually set to True during pretraining and False during finetuning. - clamp_len: int, clamp all relative distances larger than clamp_len. - -1 means no clamping. - same_length: bool, whether to use the same attention length for each token. - finetuning_task: name of the glue task on which the model was fine-tuned if any + Args: + vocab_size (:obj:`int`, optional, defaults to 32000): + Vocabulary size of the XLNet model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLNetModel`. + d_model (:obj:`int`, optional, defaults to 1024): + Size of the encoder layers and the pooler layer. + n_layer (:obj:`int`, optional, defaults to 24): + Number of hidden layers in the Transformer encoder. + n_head (:obj:`int`, optional, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + d_inner (:obj:`int`, optional, defaults to 4096): + The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + ff_activation (:obj:`string`, optional, defaults to "gelu"): + The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + untie_r (:obj:`boolean`, optional, defaults to :obj:`True`): + Untie relative position biases + attn_type (:obj:`string`, optional, defaults to "bi"): + The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL. + initializer_range (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): + The epsilon used by the layer normalization layers. 
+ dropout (:obj:`float`, optional, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`): + The number of tokens to cache. The key/value pairs that have already been pre-computed + in a previous forward pass won't be re-computed. See the + `quickstart `__ + for more information. + reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`): + The number of tokens in the current batch to be cached and reused in the future. + bi_data (:obj:`boolean`, optional, defaults to :obj:`False`): + Whether to use bidirectional input pipeline. Usually set to `True` during + pretraining and `False` during finetuning. + clamp_len (:obj:`int`, optional, defaults to -1): + Clamp all relative distances larger than clamp_len. + Setting this attribute to -1 means no clamping. + same_length (:obj:`boolean`, optional, defaults to :obj:`False`): + Whether to use the same attention length for each token. + start_n_top (:obj:`int`, optional, defaults to 5): + TODO + end_n_top (:obj:`int`, optional, defaults to 5): + TODO """ pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP