diff --git a/docs/source/model_doc/albert.rst b/docs/source/model_doc/albert.rst index 92970c9328..5cf3f5ee7b 100644 --- a/docs/source/model_doc/albert.rst +++ b/docs/source/model_doc/albert.rst @@ -1,7 +1,7 @@ ALBERT ---------------------------------------------------- -``AlbrtConfig`` +``AlbertConfig`` ~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.AlbertConfig diff --git a/docs/source/model_doc/xlm.rst b/docs/source/model_doc/xlm.rst index f7034bb9d8..0c2231be42 100644 --- a/docs/source/model_doc/xlm.rst +++ b/docs/source/model_doc/xlm.rst @@ -34,6 +34,13 @@ XLM :members: +``XLMForQuestionAnsweringSimple`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.XLMForQuestionAnsweringSimple + :members: + + ``XLMForQuestionAnswering`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/xlnet.rst b/docs/source/model_doc/xlnet.rst index 4005ce3a0a..0317fa0d78 100644 --- a/docs/source/model_doc/xlnet.rst +++ b/docs/source/model_doc/xlnet.rst @@ -36,6 +36,27 @@ XLNet :members: +``XLNetForTokenClassification`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.XLNetForTokenClassification + :members: + + +``XLNetForMultipleChoice`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.XLNetForMultipleChoice + :members: + + +``XLNetForQuestionAnsweringSimple`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
autoclass:: transformers.XLNetForQuestionAnsweringSimple + :members: + + ``XLNetForQuestionAnswering`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/examples/distillation/lm_seqs_dataset.py b/examples/distillation/lm_seqs_dataset.py index 691e010cf2..8f444f4e0e 100644 --- a/examples/distillation/lm_seqs_dataset.py +++ b/examples/distillation/lm_seqs_dataset.py @@ -42,6 +42,7 @@ class LmSeqsDataset(Dataset): self.check() self.remove_long_sequences() self.remove_empty_sequences() + self.remove_unknown_sequences() self.check() self.print_statistics() @@ -109,6 +110,22 @@ class LmSeqsDataset(Dataset): new_size = len(self) logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.") + def remove_unknown_sequences(self): + """ + Remove sequences with a (too) high level of unknown tokens. + """ + if "unk_token" not in self.params.special_tok_ids: + return + else: + unk_token_id = self.params.special_tok_ids["unk_token"] + init_size = len(self) + unk_occs = np.array([np.count_nonzero(a == unk_token_id) for a in self.token_ids]) + indices = (unk_occs / self.lengths) < 0.5 + self.token_ids = self.token_ids[indices] + self.lengths = self.lengths[indices] + new_size = len(self) + logger.info(f"Remove {init_size - new_size} sequences with a high level of unknown tokens (50%).") + def print_statistics(self): """ Print some statistics on the corpus. Only the master process. 
diff --git a/examples/distillation/training_configs/distilbert-base-multilingual-cased.json b/examples/distillation/training_configs/distilbert-base-multilingual-cased.json new file mode 100644 index 0000000000..f76e7febcb --- /dev/null +++ b/examples/distillation/training_configs/distilbert-base-multilingual-cased.json @@ -0,0 +1,15 @@ +{ + "activation": "gelu", + "attention_dropout": 0.1, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "n_heads": 12, + "n_layers": 6, + "sinusoidal_pos_embds": true, + "tie_weights_": true, + "vocab_size": 119547 + } + \ No newline at end of file diff --git a/examples/distillation/training_configs/distilroberta-base.json b/examples/distillation/training_configs/distilroberta-base.json new file mode 100644 index 0000000000..2d90ef6380 --- /dev/null +++ b/examples/distillation/training_configs/distilroberta-base.json @@ -0,0 +1,14 @@ +{ + "vocab_size": 50265, + "hidden_size": 768, + "num_hidden_layers": 6, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 514, + "type_vocab_size": 1, + "initializer_range": 0.02, + "layer_norm_eps": 0.00001 +} \ No newline at end of file diff --git a/examples/pplm/run_pplm.py b/examples/pplm/run_pplm.py index 8c405b56ad..b334a0098c 100644 --- a/examples/pplm/run_pplm.py +++ b/examples/pplm/run_pplm.py @@ -344,6 +344,7 @@ def full_text_generation( gamma=1.5, gm_scale=0.9, kl_scale=0.01, + repetition_penalty=1.0, **kwargs ): classifier, class_id = get_classifier(discrim, class_label, device) @@ -368,7 +369,14 @@ def full_text_generation( raise Exception("Specify either a bag of words or a discriminator") unpert_gen_tok_text, _, _ = generate_text_pplm( - model=model, tokenizer=tokenizer, context=context, device=device, length=length, sample=sample, perturb=False + model=model, + tokenizer=tokenizer, + 
context=context, + device=device, + length=length, + sample=sample, + perturb=False, + repetition_penalty=repetition_penalty, ) if device == "cuda": torch.cuda.empty_cache() @@ -401,6 +409,7 @@ def full_text_generation( gamma=gamma, gm_scale=gm_scale, kl_scale=kl_scale, + repetition_penalty=repetition_penalty, ) pert_gen_tok_texts.append(pert_gen_tok_text) if classifier is not None: @@ -437,6 +446,7 @@ def generate_text_pplm( gamma=1.5, gm_scale=0.9, kl_scale=0.01, + repetition_penalty=1.0, ): output_so_far = None if context: @@ -508,6 +518,13 @@ def generate_text_pplm( pert_logits, past, pert_all_hidden = model(last, past=pert_past) pert_logits = pert_logits[:, -1, :] / temperature # + SMALL_CONST + + for token_idx in set(output_so_far[0].tolist()): + if pert_logits[0, token_idx] < 0: + pert_logits[0, token_idx] *= repetition_penalty + else: + pert_logits[0, token_idx] /= repetition_penalty + pert_probs = F.softmax(pert_logits, dim=-1) if classifier is not None: @@ -588,6 +605,7 @@ def run_pplm_example( seed=0, no_cuda=False, colorama=False, + repetition_penalty=1.0, ): # set Random seed torch.manual_seed(seed) @@ -655,6 +673,7 @@ def run_pplm_example( gamma=gamma, gm_scale=gm_scale, kl_scale=kl_scale, + repetition_penalty=repetition_penalty, ) # untokenize unperturbed text @@ -767,6 +786,9 @@ if __name__ == "__main__": parser.add_argument("--seed", type=int, default=0) parser.add_argument("--no_cuda", action="store_true", help="no cuda") parser.add_argument("--colorama", action="store_true", help="colors keywords") + parser.add_argument( + "--repetition_penalty", type=float, default=1.0, help="Penalize repetition. 
More than 1.0 -> less repetition", + ) args = parser.parse_args() run_pplm_example(**vars(args)) diff --git a/src/transformers/configuration_albert.py b/src/transformers/configuration_albert.py index 2ac969538b..b210960f14 100644 --- a/src/transformers/configuration_albert.py +++ b/src/transformers/configuration_albert.py @@ -31,9 +31,73 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { class AlbertConfig(PretrainedConfig): - """Configuration for `AlbertModel`. + r""" + This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`. + It is used to instantiate an ALBERT model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the ALBERT `xxlarge `__ architecture. - The default settings match the configuration of model `albert_xxlarge`. + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. + + + Args: + vocab_size (:obj:`int`, optional, defaults to 30000): + Vocabulary size of the ALBERT model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`. + embedding_size (:obj:`int`, optional, defaults to 128): + Dimensionality of vocabulary embeddings. + hidden_size (:obj:`int`, optional, defaults to 4096): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (:obj:`int`, optional, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_hidden_groups (:obj:`int`, optional, defaults to 1): + Number of groups for the hidden layers, parameters in the same group are shared. + num_attention_heads (:obj:`int`, optional, defaults to 64): + Number of attention heads for each attention layer in the Transformer encoder. 
+ intermediate_size (:obj:`int`, optional, defaults to 16384): + The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + inner_group_num (:obj:`int`, optional, defaults to 1): + The number of inner repetition of attention and ffn. + hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"): + The non-linear activation function (function or string) in the encoder and pooler. + If string, "gelu", "relu", "swish" and "gelu_new" are supported. + hidden_dropout_prob (:obj:`float`, optional, defaults to 0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0): + The dropout ratio for the attention probabilities. + max_position_embeddings (:obj:`int`, optional, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something + large (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, optional, defaults to 2): + The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`. + initializer_range (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): + The epsilon used by the layer normalization layers. 
+ + Example:: + + # Initializing an ALBERT-xxlarge style configuration + albert_xxlarge_configuration = AlbertConfig() + + # Initializing an ALBERT-base style configuration + albert_base_configuration = AlbertConfig( + hidden_size=768, + num_attention_heads=12, + intermediate_size=3072, + ) + + # Initializing a model from the ALBERT-base style configuration + model = AlbertModel(albert_base_configuration) + + # Accessing the model configuration + configuration = model.config + + Attributes: + pretrained_config_archive_map (Dict[str, str]): + A dictionary containing all the available pre-trained checkpoints. """ pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP @@ -58,35 +122,6 @@ class AlbertConfig(PretrainedConfig): layer_norm_eps=1e-12, **kwargs ): - """Constructs AlbertConfig. - - Args: - vocab_size: Vocabulary size of `inputs_ids` in `AlbertModel`. - embedding_size: size of voc embeddings. - hidden_size: Size of the encoder layers and the pooler layer. - num_hidden_layers: Number of hidden layers in the Transformer encoder. - num_hidden_groups: Number of group for the hidden layers, parameters in - the same group are shared. - num_attention_heads: Number of attention heads for each attention layer in - the Transformer encoder. - intermediate_size: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - inner_group_num: int, number of inner repetition of attention and ffn. - down_scale_factor: float, the scale to apply - hidden_act: The non-linear activation function (function or string) in the - encoder and pooler. - hidden_dropout_prob: The dropout probability for all fully connected - layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob: The dropout ratio for the attention - probabilities. - max_position_embeddings: The maximum sequence length that this model might - ever be used with. Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). 
- type_vocab_size: The vocabulary size of the `token_type_ids` passed into - `AlbertModel`. - initializer_range: The stdev of the truncated_normal_initializer for - initializing all weight matrices. - """ super(AlbertConfig, self).__init__(**kwargs) self.vocab_size = vocab_size diff --git a/src/transformers/configuration_auto.py b/src/transformers/configuration_auto.py index d8e67ca6d9..10cfe96929 100644 --- a/src/transformers/configuration_auto.py +++ b/src/transformers/configuration_auto.py @@ -77,32 +77,15 @@ CONFIG_MAPPING = OrderedDict( ) -class AutoConfig: - r""":class:`~transformers.AutoConfig` is a generic configuration class +class AutoConfig(object): + r""" + :class:`~transformers.AutoConfig` is a generic configuration class that will be instantiated as one of the configuration classes of the library - when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` - class method. + when created with the :func:`~transformers.AutoConfig.from_pretrained` class method. - The `from_pretrained()` method take care of returning the correct model class instance + The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string. 
- - When using string matching, the configuration class is matched on - the `pretrained_model_name_or_path` string in the following order: - - contains `t5`: T5Config (T5 model) - - contains `distilbert`: DistilBertConfig (DistilBERT model) - - contains `albert`: AlbertConfig (ALBERT model) - - contains `camembert`: CamembertConfig (CamemBERT model) - - contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model) - - contains `roberta`: RobertaConfig (RoBERTa model) - - contains `bert`: BertConfig (Bert model) - - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) - - contains `gpt2`: GPT2Config (OpenAI GPT-2 model) - - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) - - contains `xlnet`: XLNetConfig (XLNet model) - - contains `xlm`: XLMConfig (XLM model) - - contains `ctrl` : CTRLConfig (CTRL model) - This class cannot be instantiated using `__init__()` (throw an error). """ def __init__(self): @@ -124,60 +107,61 @@ class AutoConfig: @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - r""" Instantiate one of the configuration classes of the library + r""" Instantiates one of the configuration classes of the library from a pre-trained model configuration. The configuration class to instantiate is selected based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string. 
- - contains `t5`: T5Config (T5 model) - - contains `distilbert`: DistilBertConfig (DistilBERT model) - - contains `albert`: AlbertConfig (ALBERT model) - - contains `camembert`: CamembertConfig (CamemBERT model) - - contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model) - - contains `roberta`: RobertaConfig (RoBERTa model) - - contains `bert`: BertConfig (Bert model) - - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) - - contains `gpt2`: GPT2Config (OpenAI GPT-2 model) - - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) - - contains `xlnet`: XLNetConfig (XLNet model) - - contains `xlm`: XLMConfig (XLM model) - - contains `ctrl` : CTRLConfig (CTRL model) - Params: - pretrained_model_name_or_path: either: + - contains `t5`: :class:`~transformers.T5Config` (T5 model) + - contains `distilbert`: :class:`~transformers.DistilBertConfig` (DistilBERT model) + - contains `albert`: :class:`~transformers.AlbertConfig` (ALBERT model) + - contains `camembert`: :class:`~transformers.CamembertConfig` (CamemBERT model) + - contains `xlm-roberta`: :class:`~transformers.XLMRobertaConfig` (XLM-RoBERTa model) + - contains `roberta`: :class:`~transformers.RobertaConfig` (RoBERTa model) + - contains `bert`: :class:`~transformers.BertConfig` (Bert model) + - contains `openai-gpt`: :class:`~transformers.OpenAIGPTConfig` (OpenAI GPT model) + - contains `gpt2`: :class:`~transformers.GPT2Config` (OpenAI GPT-2 model) + - contains `transfo-xl`: :class:`~transformers.TransfoXLConfig` (Transformer-XL model) + - contains `xlnet`: :class:`~transformers.XLNetConfig` (XLNet model) + - contains `xlm`: :class:`~transformers.XLMConfig` (XLM model) + - contains `ctrl` : :class:`~transformers.CTRLConfig` (CTRL model) - - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. 
- - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. - cache_dir: (`optional`) string: + Args: + pretrained_model_name_or_path (:obj:`string`): + Is either: \ + - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. + - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. + + cache_dir (:obj:`string`, optional, defaults to `None`): Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. - kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. + force_download (:obj:`boolean`, optional, defaults to `False`): + Force to (re-)download the model weights and configuration files and override the cached versions if they exist. - - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. - - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. + resume_download (:obj:`boolean`, optional, defaults to `False`): + Do not delete incompletely received file. 
Attempt to resume the download if such a file exists. - force_download: (`optional`) boolean, default False: - Force to (re-)download the model weights and configuration files and override the cached versions if they exists. - - resume_download: (`optional`) boolean, default False: - Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. - - proxies: (`optional`) dict, default None: - A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. - The proxies are used on each request. - - return_unused_kwargs: (`optional`) bool: + proxies (:obj:`Dict[str, str]`, optional, defaults to `None`): + A dictionary of proxy servers to use by protocol or endpoint, e.g.: :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. + The proxies are used on each request. See `the requests documentation `__ for usage. + return_unused_kwargs (:obj:`boolean`, optional, defaults to `False`): - If False, then this function returns just the final configuration object. - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): key/value pairs with which to update the configuration object after loading. + - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. + - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. + + Examples:: - config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. 
config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json') config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) diff --git a/src/transformers/configuration_bert.py b/src/transformers/configuration_bert.py index 762a8da9fd..b568f7e47d 100644 --- a/src/transformers/configuration_bert.py +++ b/src/transformers/configuration_bert.py @@ -50,32 +50,61 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { class BertConfig(PretrainedConfig): r""" - :class:`~transformers.BertConfig` is the configuration class to store the configuration of a - `BertModel`. + This is the configuration class to store the configuration of a :class:`~transformers.BertModel`. + It is used to instantiate an BERT model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the BERT `bert-base-uncased `__ architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. - Arguments: - vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. - hidden_size: Size of the encoder layers and the pooler layer. - num_hidden_layers: Number of hidden layers in the Transformer encoder. - num_attention_heads: Number of attention heads for each attention layer in - the Transformer encoder. - intermediate_size: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - hidden_act: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. 
- hidden_dropout_prob: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob: The dropout ratio for the attention - probabilities. - max_position_embeddings: The maximum sequence length that this model might - ever be used with. Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). - type_vocab_size: The vocabulary size of the `token_type_ids` passed into - `BertModel`. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - layer_norm_eps: The epsilon used by LayerNorm. + Args: + vocab_size (:obj:`int`, optional, defaults to 30522): + Vocabulary size of the BERT model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`. + hidden_size (:obj:`int`, optional, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (:obj:`int`, optional, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (:obj:`int`, optional, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (:obj:`int`, optional, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): + The non-linear activation function (function or string) in the encoder and pooler. + If string, "gelu", "relu", "swish" and "gelu_new" are supported. + hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1): + The dropout ratio for the attention probabilities. 
+ max_position_embeddings (:obj:`int`, optional, defaults to 512): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, optional, defaults to 2): + The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`. + initializer_range (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): + The epsilon used by the layer normalization layers. + + Example:: + + from transformers import BertModel, BertConfig + + # Initializing a BERT bert-base-uncased style configuration + configuration = BertConfig() + + # Initializing a model from the bert-base-uncased style configuration + model = BertModel(configuration) + + # Accessing the model configuration + configuration = model.config + + Attributes: + pretrained_config_archive_map (Dict[str, str]): + A dictionary containing all the available pre-trained checkpoints. """ pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP model_type = "bert" @@ -97,6 +126,7 @@ class BertConfig(PretrainedConfig): **kwargs ): super(BertConfig, self).__init__(**kwargs) + self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers diff --git a/src/transformers/configuration_camembert.py b/src/transformers/configuration_camembert.py index a4263556aa..ff55908584 100644 --- a/src/transformers/configuration_camembert.py +++ b/src/transformers/configuration_camembert.py @@ -29,5 +29,35 @@ CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { class CamembertConfig(RobertaConfig): + r""" + This is the configuration class to store the configuration of an :class:`~transformers.CamembertModel`. 
+ It is used to instantiate a CamemBERT model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the BERT `bert-base-uncased `__ architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. + + The :class:`~transformers.CamembertConfig` class directly inherits :class:`~transformers.RobertaConfig`. + It reuses the same defaults. Please check the parent class for more information. + + Example:: + + from transformers import CamembertModel, CamembertConfig + + # Initializing a CamemBERT configuration + configuration = CamembertConfig() + + # Initializing a model from the configuration + model = CamembertModel(configuration) + + # Accessing the model configuration + configuration = model.config + + Attributes: + pretrained_config_archive_map (Dict[str, str]): + A dictionary containing all the available pre-trained checkpoints. + """ pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP model_type = "camembert" diff --git a/src/transformers/configuration_ctrl.py b/src/transformers/configuration_ctrl.py index 9becc78754..ea5be7eccd 100644 --- a/src/transformers/configuration_ctrl.py +++ b/src/transformers/configuration_ctrl.py @@ -26,25 +26,60 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf class CTRLConfig(PretrainedConfig): - """Configuration class to store the configuration of a `CTRLModel`. + """ + This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel`. + It is used to instantiate a CTRL model according to the specified arguments, defining the model + architecture. 
Instantiating a configuration with the defaults will yield a similar configuration to that of + the `ctrl `__ architecture from SalesForce. - Args: - vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file. - n_positions: Number of positional embeddings. - n_ctx: Size of the causal mask (usually same as n_positions). - dff: Size of the inner dimension of the FFN. - n_embd: Dimensionality of the embeddings and hidden states. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - layer_norm_epsilon: epsilon to use in the layer norm layers - resid_pdrop: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attn_pdrop: The dropout ratio for the attention - probabilities. - embd_pdrop: The dropout ratio for the embeddings. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. + + Args: + vocab_size (:obj:`int`, optional, defaults to 246534): + Vocabulary size of the CTRL model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`. + n_positions (:obj:`int`, optional, defaults to 256): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or 1024 or 2048). + n_ctx (:obj:`int`, optional, defaults to 256): + Dimensionality of the causal mask (usually same as n_positions). + n_embd (:obj:`int`, optional, defaults to 1280): + Dimensionality of the embeddings and hidden states. 
+ dff (:obj:`int`, optional, defaults to 8192): + Dimensionality of the inner dimension of the FFN. + n_layer (:obj:`int`, optional, defaults to 48): + Number of hidden layers in the Transformer encoder. + n_head (:obj:`int`, optional, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + resid_pdrop (:obj:`float`, optional, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (:obj:`float`, optional, defaults to 0.1): + The dropout ratio for the embeddings. + attn_pdrop (:obj:`float`, optional, defaults to 0.1): + The dropout ratio for the attention. + layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6): + The epsilon to use in the layer normalization layers. + initializer_range (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + + Example:: + + from transformers import CTRLModel, CTRLConfig + + # Initializing a CTRL configuration + configuration = CTRLConfig() + + # Initializing a model from the configuration + model = CTRLModel(configuration) + + # Accessing the model configuration + configuration = model.config + + Attributes: + pretrained_config_archive_map (Dict[str, str]): + A dictionary containing all the available pre-trained checkpoints. """ pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP @@ -71,26 +106,6 @@ class CTRLConfig(PretrainedConfig): summary_first_dropout=0.1, **kwargs ): - """Constructs CTRLConfig. - - Args: - vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file. - n_positions: Number of positional embeddings. - n_ctx: Size of the causal mask (usually same as n_positions). - dff: Size of the inner dimension of the FFN. - n_embd: Dimensionality of the embeddings and hidden states. - n_layer: Number of hidden layers in the Transformer encoder. 
- n_head: Number of attention heads for each attention layer in - the Transformer encoder. - layer_norm_epsilon: epsilon to use in the layer norm layers - resid_pdrop: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attn_pdrop: The dropout ratio for the attention - probabilities. - embd_pdrop: The dropout ratio for the embeddings. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - """ super(CTRLConfig, self).__init__(**kwargs) self.vocab_size = vocab_size self.n_ctx = n_ctx diff --git a/src/transformers/configuration_distilbert.py b/src/transformers/configuration_distilbert.py index b86a9f7fa8..239e2de08f 100644 --- a/src/transformers/configuration_distilbert.py +++ b/src/transformers/configuration_distilbert.py @@ -31,6 +31,67 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { class DistilBertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`. + It is used to instantiate a DistilBERT model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the DistilBERT `distilbert-base-uncased `__ architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. + + + Args: + vocab_size (:obj:`int`, optional, defaults to 30522): + Vocabulary size of the DistilBERT model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.DistilBertModel`. + max_position_embeddings (:obj:`int`, optional, defaults to 512): + The maximum sequence length that this model might ever be used with.
+ Typically set this to something large just in case (e.g., 512 or 1024 or 2048). + sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`): + Whether to use sinusoidal positional embeddings. + n_layers (:obj:`int`, optional, defaults to 6): + Number of hidden layers in the Transformer encoder. + n_heads (:obj:`int`, optional, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + dim (:obj:`int`, optional, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (:obj:`int`, optional, defaults to 3072): + The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + dropout (:obj:`float`, optional, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (:obj:`float`, optional, defaults to 0.1): + The dropout ratio for the attention probabilities. + activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): + The non-linear activation function (function or string) in the encoder and pooler. + If string, "gelu", "relu", "swish" and "gelu_new" are supported. + initializer_range (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + qa_dropout (:obj:`float`, optional, defaults to 0.1): + The dropout probabilities used in the question answering model + :class:`~transformers.DistilBertForQuestionAnswering`. + seq_classif_dropout (:obj:`float`, optional, defaults to 0.2): + The dropout probabilities used in the sequence classification model + :class:`~transformers.DistilBertForSequenceClassification`.
+ + Example:: + + from transformers import DistilBertModel, DistilBertConfig + + # Initializing a DistilBERT configuration + configuration = DistilBertConfig() + + # Initializing a model from the configuration + model = DistilBertModel(configuration) + + # Accessing the model configuration + configuration = model.config + + Attributes: + pretrained_config_archive_map (Dict[str, str]): + A dictionary containing all the available pre-trained checkpoints. + """ pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP model_type = "distilbert" @@ -47,7 +108,6 @@ class DistilBertConfig(PretrainedConfig): attention_dropout=0.1, activation="gelu", initializer_range=0.02, - tie_weights_=True, qa_dropout=0.1, seq_classif_dropout=0.2, **kwargs @@ -64,7 +124,6 @@ class DistilBertConfig(PretrainedConfig): self.attention_dropout = attention_dropout self.activation = activation self.initializer_range = initializer_range - self.tie_weights_ = tie_weights_ self.qa_dropout = qa_dropout self.seq_classif_dropout = seq_classif_dropout diff --git a/src/transformers/configuration_gpt2.py b/src/transformers/configuration_gpt2.py index c91d01b139..1275e56299 100644 --- a/src/transformers/configuration_gpt2.py +++ b/src/transformers/configuration_gpt2.py @@ -33,24 +33,84 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = { class GPT2Config(PretrainedConfig): - """Configuration class to store the configuration of a `GPT2Model`. + """ + This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`. + It is used to instantiate an GPT-2 model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the GPT-2 `small `__ architecture. - Args: - vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. - n_positions: Number of positional embeddings. - n_ctx: Size of the causal mask (usually same as n_positions). 
- n_embd: Dimensionality of the embeddings and hidden states. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - layer_norm_epsilon: epsilon to use in the layer norm layers - resid_pdrop: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attn_pdrop: The dropout ratio for the attention - probabilities. - embd_pdrop: The dropout ratio for the embeddings. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. + + + Args: + vocab_size (:obj:`int`, optional, defaults to 50257): + Vocabulary size of the GPT-2 model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.GPT2Model`. + n_positions (:obj:`int`, optional, defaults to 1024): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or 1024 or 2048). + n_ctx (:obj:`int`, optional, defaults to 1024): + Dimensionality of the causal mask (usually same as n_positions). + n_embd (:obj:`int`, optional, defaults to 768): + Dimensionality of the embeddings and hidden states. + n_layer (:obj:`int`, optional, defaults to 12): + Number of hidden layers in the Transformer encoder. + n_head (:obj:`int`, optional, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + resid_pdrop (:obj:`float`, optional, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (:obj:`float`, optional, defaults to 0.1): + The dropout ratio for the embeddings.
+ attn_pdrop (:obj:`float`, optional, defaults to 0.1): + The dropout ratio for the attention. + layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5): + The epsilon to use in the layer normalization layers + initializer_range (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + summary_type (:obj:`string`, optional, defaults to "cls_index"): + Argument used when doing sequence summary. Used for the multiple choice head in + :class:`~transformers.GPT2DoubleHeadsModel`. + Is one of the following options: + - 'last' => take the last token hidden state (like XLNet) + - 'first' => take the first token hidden state (like Bert) + - 'mean' => take the mean of all tokens hidden states + - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) + - 'attn' => Not implemented now, use multi-head attention + summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): + Argument used when doing sequence summary. Used for the multiple choice head in + :class:`~transformers.GPT2DoubleHeadsModel`. + Add a projection after the vector extraction + summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`): + Argument used when doing sequence summary. Used for the multiple choice head in + :class:`~transformers.GPT2DoubleHeadsModel`. + 'tanh' => add a tanh activation to the output, Other => no activation. + summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`): + Argument used when doing sequence summary. Used for the multiple choice head in + :class:`~transformers.GPT2DoubleHeadsModel`. + If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). + summary_first_dropout (:obj:`float`, optional, defaults to 0.1): + Argument used when doing sequence summary. Used for the multiple choice head in + :class:`~transformers.GPT2DoubleHeadsModel`.
+ Add a dropout before the projection and activation + + Example:: + + from transformers import GPT2Model, GPT2Config + + # Initializing a GPT2 configuration + configuration = GPT2Config() + + # Initializing a model from the configuration + model = GPT2Model(configuration) + + # Accessing the model configuration + configuration = model.config + + Attributes: + pretrained_config_archive_map (Dict[str, str]): + A dictionary containing all the available pre-trained checkpoints. """ pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP @@ -76,26 +136,8 @@ class GPT2Config(PretrainedConfig): summary_first_dropout=0.1, **kwargs ): - """Constructs GPT2Config. - - Args: - vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. - n_positions: Number of positional embeddings. - n_ctx: Size of the causal mask (usually same as n_positions). - n_embd: Dimensionality of the embeddings and hidden states. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - layer_norm_epsilon: epsilon to use in the layer norm layers - resid_pdrop: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attn_pdrop: The dropout ratio for the attention - probabilities. - embd_pdrop: The dropout ratio for the embeddings. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - """ super(GPT2Config, self).__init__(**kwargs) + self.vocab_size = vocab_size self.n_ctx = n_ctx self.n_positions = n_positions diff --git a/src/transformers/configuration_mmbt.py b/src/transformers/configuration_mmbt.py index b072468e7f..56a35e237c 100644 --- a/src/transformers/configuration_mmbt.py +++ b/src/transformers/configuration_mmbt.py @@ -26,9 +26,13 @@ class MMBTConfig(object): """Configuration class to store the configuration of a `MMBT Model`. 
Args: - config: config of the underlying Transformer models. It's values are copied over to use a single config. - num_labels: Size of final Linear layer for classification. - modal_hidden_size: Embedding dimension of the non-text modality encoder. + config (:obj:`~transformers.PretrainedConfig`): + Config of the underlying Transformer models. Its values are + copied over to use a single config. + num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`): + Size of final Linear layer for classification. + modal_hidden_size (:obj:`int`, optional, defaults to 2048): + Embedding dimension of the non-text modality encoder. """ def __init__(self, config, num_labels=None, modal_hidden_size=2048): diff --git a/src/transformers/configuration_openai.py b/src/transformers/configuration_openai.py index b2ad81eb02..0ba91689e8 100644 --- a/src/transformers/configuration_openai.py +++ b/src/transformers/configuration_openai.py @@ -30,27 +30,87 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { class OpenAIGPTConfig(PretrainedConfig): """ - Configuration class to store the configuration of a `OpenAIGPTModel`. + This is the configuration class to store the configuration of an :class:`~transformers.OpenAIGPTModel`. + It is used to instantiate a GPT model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the `GPT `__ architecture from OpenAI. - Args: - vocab_size: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file. - n_positions: Number of positional embeddings. - n_ctx: Size of the causal mask (usually same as n_positions). - n_embd: Dimensionality of the embeddings and hidden states. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - afn: The non-linear activation function (function or string) in the - encoder and pooler.
If string, "gelu", "relu" and "swish" are supported. - resid_pdrop: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attn_pdrop: The dropout ratio for the attention - probabilities. - embd_pdrop: The dropout ratio for the embeddings. - layer_norm_epsilon: epsilon to use in the layer norm layers - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - predict_special_tokens: should we predict special tokens (when the model has a LM head) + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. + + Args: + vocab_size (:obj:`int`, optional, defaults to 40478): + Vocabulary size of the GPT model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.OpenAIGPTModel`. + n_positions (:obj:`int`, optional, defaults to 512): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or 1024 or 2048). + n_ctx (:obj:`int`, optional, defaults to 512): + Dimensionality of the causal mask (usually same as n_positions). + n_embd (:obj:`int`, optional, defaults to 768): + Dimensionality of the embeddings and hidden states. + n_layer (:obj:`int`, optional, defaults to 12): + Number of hidden layers in the Transformer encoder. + n_head (:obj:`int`, optional, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): + The non-linear activation function (function or string) in the encoder and pooler. + If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+ resid_pdrop (:obj:`float`, optional, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (:obj:`float`, optional, defaults to 0.1): + The dropout ratio for the embeddings. + attn_pdrop (:obj:`float`, optional, defaults to 0.1): + The dropout ratio for the attention. + layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5): + The epsilon to use in the layer normalization layers. + initializer_range (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`): + Whether special tokens should be predicted when the model has a language modeling head. + summary_type (:obj:`string`, optional, defaults to "cls_index"): + Argument used when doing sequence summary. Used for the multiple choice head in + :class:`~transformers.OpenAIGPTDoubleHeadsModel`. + Is one of the following options: + - 'last' => take the last token hidden state (like XLNet) + - 'first' => take the first token hidden state (like Bert) + - 'mean' => take the mean of all tokens hidden states + - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) + - 'attn' => Not implemented now, use multi-head attention + summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): + Argument used when doing sequence summary. Used for the multiple choice head in + :class:`~transformers.OpenAIGPTDoubleHeadsModel`. + Add a projection after the vector extraction + summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`): + Argument used when doing sequence summary. Used for the multiple choice head in + :class:`~transformers.OpenAIGPTDoubleHeadsModel`. + 'tanh' => add a tanh activation to the output, Other => no activation.
+ summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`): + Argument used when doing sequence summary. Used for the multiple choice head in + :class:`~transformers.OpenAIGPTDoubleHeadsModel`. + If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). + summary_first_dropout (:obj:`float`, optional, defaults to 0.1): + Argument used when doing sequence summary. Used for the multiple choice head in + :class:`~transformers.OpenAIGPTDoubleHeadsModel`. + Add a dropout before the projection and activation + + Example:: + + from transformers import OpenAIGPTConfig, OpenAIGPTModel + + # Initializing a GPT configuration + configuration = OpenAIGPTConfig() + + # Initializing a model from the configuration + model = OpenAIGPTModel(configuration) + + # Accessing the model configuration + configuration = model.config + + Attributes: + pretrained_config_archive_map (Dict[str, str]): + A dictionary containing all the available pre-trained checkpoints. """ pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP @@ -78,9 +138,8 @@ class OpenAIGPTConfig(PretrainedConfig): summary_first_dropout=0.1, **kwargs ): - """Constructs OpenAIGPTConfig. - """ super(OpenAIGPTConfig, self).__init__(**kwargs) + self.vocab_size = vocab_size self.n_ctx = n_ctx self.n_positions = n_positions diff --git a/src/transformers/configuration_roberta.py b/src/transformers/configuration_roberta.py index 564b4d47b4..655fe03b71 100644 --- a/src/transformers/configuration_roberta.py +++ b/src/transformers/configuration_roberta.py @@ -34,5 +34,35 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { class RobertaConfig(BertConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel`. + It is used to instantiate a RoBERTa model according to the specified arguments, defining the model + architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of + the BERT `bert-base-uncased `__ architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. + + The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. + It reuses the same defaults. Please check the parent class for more information. + + Example:: + + from transformers import RobertaConfig, RobertaModel + + # Initializing a RoBERTa configuration + configuration = RobertaConfig() + + # Initializing a model from the configuration + model = RobertaModel(configuration) + + # Accessing the model configuration + configuration = model.config + + Attributes: + pretrained_config_archive_map (Dict[str, str]): + A dictionary containing all the available pre-trained checkpoints. + """ pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP model_type = "roberta" diff --git a/src/transformers/configuration_transfo_xl.py b/src/transformers/configuration_transfo_xl.py index 9e332fa8c3..db210e5a10 100644 --- a/src/transformers/configuration_transfo_xl.py +++ b/src/transformers/configuration_transfo_xl.py @@ -29,39 +29,91 @@ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = { class TransfoXLConfig(PretrainedConfig): - """Configuration class to store the configuration of a `TransfoXLModel`. + """ + This is the configuration class to store the configuration of an :class:`~transformers.TransfoXLModel`. + It is used to instantiate a Transformer XL model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the `Transformer XL `__ architecture. 
+ + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. Args: - vocab_size: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file. - cutoffs: cutoffs for the adaptive softmax - d_model: Dimensionality of the model's hidden states. - d_embed: Dimensionality of the embeddings - d_head: Dimensionality of the model's heads. - div_val: divident value for adapative input and softmax - pre_lnorm: apply LayerNorm to the input instead of the output - d_inner: Inner dimension in FF - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - tgt_len: number of tokens to predict - ext_len: length of the extended context - mem_len: length of the retained previous heads - same_length: use the same attn length for all tokens - proj_share_all_but_first: True to share all but first projs, False not to share. - attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. - clamp_len: use the same pos embeddings after clamp_len - sample_softmax: number of samples in sampled softmax - adaptive: use adaptive softmax - tie_weight: tie the word embedding and softmax weights - dropout: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - dropatt: The dropout ratio for the attention probabilities. - untie_r: untie relative position biases - embd_pdrop: The dropout ratio for the embeddings. - init: parameter initializer to use - init_range: parameters initialized by U(-init_range, init_range). - proj_init_std: parameters initialized by N(0, init_std) - init_std: parameters initialized by N(0, init_std) + vocab_size (:obj:`int`, optional, defaults to 267735): + Vocabulary size of the Transformer XL model. 
Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`. + cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`): + Cutoffs for the adaptive softmax + d_model (:obj:`int`, optional, defaults to 1024): + Dimensionality of the model's hidden states. + d_embed (:obj:`int`, optional, defaults to 1024): + Dimensionality of the embeddings + n_head (:obj:`int`, optional, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + d_head (:obj:`int`, optional, defaults to 64): + Dimensionality of the model's heads. + d_inner (:obj:`int`, optional, defaults to 4096): + Inner dimension in FF + div_val (:obj:`int`, optional, defaults to 4): + Divisor value for adaptive input and softmax + pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`): + Apply LayerNorm to the input instead of the output + n_layer (:obj:`int`, optional, defaults to 18): + Number of hidden layers in the Transformer encoder. + tgt_len (:obj:`int`, optional, defaults to 128): + Number of tokens to predict + ext_len (:obj:`int`, optional, defaults to 0): + Length of the extended context + mem_len (:obj:`int`, optional, defaults to 1600): + Length of the retained previous heads + clamp_len (:obj:`int`, optional, defaults to 1000): + use the same pos embeddings after clamp_len + same_length (:obj:`boolean`, optional, defaults to :obj:`True`): + Use the same attn length for all tokens + proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`): + True to share all but first projs, False not to share. + attn_type (:obj:`int`, optional, defaults to 0): + Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
+ sample_softmax (:obj:`int`, optional, defaults to -1): + number of samples in sampled softmax + adaptive (:obj:`boolean`, optional, defaults to :obj:`True`): + use adaptive softmax + tie_weight (:obj:`boolean`, optional, defaults to :obj:`True`): + tie the word embedding and softmax weights + dropout (:obj:`float`, optional, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + dropatt (:obj:`float`, optional, defaults to 0): + The dropout ratio for the attention probabilities. + untie_r (:obj:`boolean`, optional, defaults to :obj:`True`): + Untie relative position biases + init (:obj:`string`, optional, defaults to `normal`): + Parameter initializer to use + init_range (:obj:`float`, optional, defaults to 0.01): + Parameters initialized by U(-init_range, init_range). + proj_init_std (:obj:`float`, optional, defaults to 0.01): + Parameters initialized by N(0, init_std) + init_std (:obj:`float`, optional, defaults to 0.02): + Parameters initialized by N(0, init_std) + layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5): + The epsilon to use in the layer normalization layers + + Example:: + + from transformers import TransfoXLConfig, TransfoXLModel + + # Initializing a Transformer XL configuration + configuration = TransfoXLConfig() + + # Initializing a model from the configuration + model = TransfoXLModel(configuration) + + # Accessing the model configuration + configuration = model.config + + Attributes: + pretrained_config_archive_map (Dict[str, str]): + A dictionary containing all the available pre-trained checkpoints. """ pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP @@ -99,9 +151,8 @@ class TransfoXLConfig(PretrainedConfig): layer_norm_epsilon=1e-5, **kwargs ): - """Constructs TransfoXLConfig.
- """ super(TransfoXLConfig, self).__init__(**kwargs) + self.vocab_size = vocab_size self.cutoffs = [] self.cutoffs.extend(cutoffs) diff --git a/src/transformers/configuration_xlm.py b/src/transformers/configuration_xlm.py index c3bf64c724..f1afc96489 100644 --- a/src/transformers/configuration_xlm.py +++ b/src/transformers/configuration_xlm.py @@ -37,44 +37,124 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = { class XLMConfig(PretrainedConfig): - """Configuration class to store the configuration of a `XLMModel`. + """ + This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`. + It is used to instantiate an XLM model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the `xlm-mlm-en-2048 `__ architecture. - Args: - vocab_size: Vocabulary size of `inputs_ids` in `XLMModel`. - d_model: Size of the encoder layers and the pooler layer. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - d_inner: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - ff_activation: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - untie_r: untie relative position biases - attn_type: 'bi' for XLM, 'uni' for Transformer-XL + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. - dropout: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - max_position_embeddings: The maximum sequence length that this model might - ever be used with. 
Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - layer_norm_eps: The epsilon used by LayerNorm. + Args: + vocab_size (:obj:`int`, optional, defaults to 30145): + Vocabulary size of the XLM model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`. + emb_dim (:obj:`int`, optional, defaults to 2048): + Dimensionality of the encoder layers and the pooler layer. + n_layer (:obj:`int`, optional, defaults to 12): + Number of hidden layers in the Transformer encoder. + n_head (:obj:`int`, optional, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + dropout (:obj:`float`, optional, defaults to 0.1): + The dropout probability for all fully connected + layers in the embeddings, encoder, and pooler. + attention_dropout (:obj:`float`, optional, defaults to 0.1): + The dropout probability for the attention mechanism + gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`): + The non-linear activation function (function or string) in the + encoder and pooler. If set to `True`, "gelu" will be used instead of "relu". + sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`): + Whether to use sinusoidal positional embeddings instead of absolute positional embeddings. + causal (:obj:`boolean`, optional, defaults to :obj:`False`): + Set this to `True` for the model to behave in a causal manner. + Causal models use a triangular attention mask in order to only attend to the left-side context instead + if a bidirectional context. + asm (:obj:`boolean`, optional, defaults to :obj:`False`): + Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction + layer. 
+ n_langs (:obj:`int`, optional, defaults to 1): + The number of languages the model handles. Set to 1 for monolingual models. + use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`): + Whether to use language embeddings. Some models use additional language embeddings, see + `the multilingual models page `__ + for information on how to use them. + max_position_embeddings (:obj:`int`, optional, defaults to 512): + The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5): + The standard deviation of the truncated_normal_initializer for + initializing the embedding matrices. + init_std (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for + initializing all weight matrices except the embedding matrices. + layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): + The epsilon used by the layer normalization layers. + bos_index (:obj:`int`, optional, defaults to 0): + The index of the beginning of sentence token in the vocabulary. + eos_index (:obj:`int`, optional, defaults to 1): + The index of the end of sentence token in the vocabulary. + pad_index (:obj:`int`, optional, defaults to 2): + The index of the padding token in the vocabulary. + unk_index (:obj:`int`, optional, defaults to 3): + The index of the unknown token in the vocabulary. + mask_index (:obj:`int`, optional, defaults to 5): + The index of the masking token in the vocabulary. + is_encoder (:obj:`boolean`, optional, defaults to :obj:`True`): + Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al. + summary_type (:obj:`string`, optional, defaults to "first"): + Argument used when doing sequence summary. Used for the multiple choice head in + :class:`~transformers.XLMForSequenceClassification`.
+ Is one of the following options: + - 'last' => take the last token hidden state (like XLNet) + - 'first' => take the first token hidden state (like Bert) + - 'mean' => take the mean of all tokens hidden states + - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) + - 'attn' => Not implemented now, use multi-head attention + summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): + Argument used when doing sequence summary. Used for the sequence classification head in + :class:`~transformers.XLMForSequenceClassification`. + Add a projection after the vector extraction. + summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`): + Argument used when doing sequence summary. Used for the sequence classification head in + :class:`~transformers.XLMForSequenceClassification`. + 'tanh' => add a tanh activation to the output, Other => no activation. + summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`): + Argument used when doing sequence summary. Used for the sequence classification head in + :class:`~transformers.XLMForSequenceClassification`. + If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). + summary_first_dropout (:obj:`float`, optional, defaults to 0.1): + Argument used when doing sequence summary. Used for the sequence classification head in + :class:`~transformers.XLMForSequenceClassification`. + Add a dropout before the projection and activation. + start_n_top (:obj:`int`, optional, defaults to 5): + Used in the SQuAD evaluation script for XLM and XLNet. + end_n_top (:obj:`int`, optional, defaults to 5): + Used in the SQuAD evaluation script for XLM and XLNet. + mask_token_id (:obj:`int`, optional, defaults to 0): + Model agnostic parameter to identify masked tokens when generating text in an MLM context. + lang_id (:obj:`int`, optional, defaults to 1): + The ID of the language used by the model.
This parameter is used when generating + text in a given language. - dropout: float, dropout rate. - init: str, the initialization scheme, either "normal" or "uniform". - init_range: float, initialize the parameters with a uniform distribution - in [-init_range, init_range]. Only effective when init="uniform". - init_std: float, initialize the parameters with a normal distribution - with mean 0 and stddev init_std. Only effective when init="normal". - mem_len: int, the number of tokens to cache. - reuse_len: int, the number of tokens in the currect batch to be cached - and reused in the future. - bi_data: bool, whether to use bidirectional input pipeline. - Usually set to True during pretraining and False during finetuning. - clamp_len: int, clamp all relative distances larger than clamp_len. - -1 means no clamping. - same_length: bool, whether to use the same attention length for each token. + Example:: + + from transformers import XLMConfig, XLMModel + + # Initializing a XLM configuration + configuration = XLMConfig() + + # Initializing a model from the configuration + model = XLMModel(configuration) + + # Accessing the model configuration + configuration = model.config + + Attributes: + pretrained_config_archive_map (Dict[str, str]): + A dictionary containing all the available pre-trained checkpoints. """ pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP diff --git a/src/transformers/configuration_xlnet.py b/src/transformers/configuration_xlnet.py index 2e4536fa0d..397db1bfbb 100644 --- a/src/transformers/configuration_xlnet.py +++ b/src/transformers/configuration_xlnet.py @@ -30,42 +30,102 @@ XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { class XLNetConfig(PretrainedConfig): - """Configuration class to store the configuration of a ``XLNetModel``. + """ + This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel`. 
+ It is used to instantiate an XLNet model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the `xlnet-large-cased `__ architecture. - Args: - vocab_size: Vocabulary size of ``inputs_ids`` in ``XLNetModel``. - d_model: Size of the encoder layers and the pooler layer. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - d_inner: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - ff_activation: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - untie_r: untie relative position biases - attn_type: 'bi' for XLNet, 'uni' for Transformer-XL + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. - dropout: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - layer_norm_eps: The epsilon used by LayerNorm. + Args: + vocab_size (:obj:`int`, optional, defaults to 32000): + Vocabulary size of the XLNet model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLNetModel`. + d_model (:obj:`int`, optional, defaults to 1024): + Dimensionality of the encoder layers and the pooler layer. + n_layer (:obj:`int`, optional, defaults to 24): + Number of hidden layers in the Transformer encoder. + n_head (:obj:`int`, optional, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. 
+ d_inner (:obj:`int`, optional, defaults to 4096): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + ff_activation (:obj:`string`, optional, defaults to "gelu"): + The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + untie_r (:obj:`boolean`, optional, defaults to :obj:`True`): + Untie relative position biases + attn_type (:obj:`string`, optional, defaults to "bi"): + The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL. + initializer_range (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): + The epsilon used by the layer normalization layers. + dropout (:obj:`float`, optional, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`): + The number of tokens to cache. The key/value pairs that have already been pre-computed + in a previous forward pass won't be re-computed. See the + `quickstart `__ + for more information. + reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`): + The number of tokens in the current batch to be cached and reused in the future. + bi_data (:obj:`boolean`, optional, defaults to :obj:`False`): + Whether to use bidirectional input pipeline. Usually set to `True` during + pretraining and `False` during finetuning. + clamp_len (:obj:`int`, optional, defaults to -1): + Clamp all relative distances larger than clamp_len. + Setting this attribute to -1 means no clamping. + same_length (:obj:`boolean`, optional, defaults to :obj:`False`): + Whether to use the same attention length for each token. 
+ summary_type (:obj:`string`, optional, defaults to "last"): + Argument used when doing sequence summary. Used for the sequence classification and multiple choice heads in + :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`. + Is one of the following options: + - 'last' => take the last token hidden state (like XLNet) + - 'first' => take the first token hidden state (like Bert) + - 'mean' => take the mean of all tokens hidden states + - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) + - 'attn' => Not implemented now, use multi-head attention + summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): + Argument used when doing sequence summary. Used for the sequence classification and multiple choice heads in + :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`. + Add a projection after the vector extraction. + summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`): + Argument used when doing sequence summary. Used for the sequence classification and multiple choice heads in + :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`. + 'tanh' => add a tanh activation to the output, Other => no activation. + summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`): + Argument used when doing sequence summary. Used for the sequence classification and multiple choice heads in + :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`. + If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). + summary_last_dropout (:obj:`float`, optional, defaults to 0.1): + Argument used when doing sequence summary. Used for the sequence classification and multiple choice heads in + :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
+ Add a dropout after the projection and activation + start_n_top (:obj:`int`, optional, defaults to 5): + Used in the SQuAD evaluation script for XLM and XLNet. + end_n_top (:obj:`int`, optional, defaults to 5): + Used in the SQuAD evaluation script for XLM and XLNet. - dropout: float, dropout rate. - init: str, the initialization scheme, either "normal" or "uniform". - init_range: float, initialize the parameters with a uniform distribution - in [-init_range, init_range]. Only effective when init="uniform". - init_std: float, initialize the parameters with a normal distribution - with mean 0 and stddev init_std. Only effective when init="normal". - mem_len: int, the number of tokens to cache. - reuse_len: int, the number of tokens in the currect batch to be cached - and reused in the future. - bi_data: bool, whether to use bidirectional input pipeline. - Usually set to True during pretraining and False during finetuning. - clamp_len: int, clamp all relative distances larger than clamp_len. - -1 means no clamping. - same_length: bool, whether to use the same attention length for each token. - finetuning_task: name of the glue task on which the model was fine-tuned if any + Example:: + + from transformers import XLNetConfig, XLNetModel + + # Initializing a XLNet configuration + configuration = XLNetConfig() + + # Initializing a model from the configuration + model = XLNetModel(configuration) + + # Accessing the model configuration + configuration = model.config + + Attributes: + pretrained_config_archive_map (Dict[str, str]): + A dictionary containing all the available pre-trained checkpoints. 
""" pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 866f9f238a..259c1c5643 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -384,16 +384,13 @@ def get_from_cache( else: http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent) - # we are copying the file before closing it, so flush to avoid truncation - temp_file.flush() + logger.info("storing %s in cache at %s", url, cache_path) + os.rename(temp_file.name, cache_path) - logger.info("storing %s in cache at %s", url, cache_path) - os.rename(temp_file.name, cache_path) - - logger.info("creating metadata file for %s", cache_path) - meta = {"url": url, "etag": etag} - meta_path = cache_path + ".json" - with open(meta_path, "w") as meta_file: - json.dump(meta, meta_file) + logger.info("creating metadata file for %s", cache_path) + meta = {"url": url, "etag": etag} + meta_path = cache_path + ".json" + with open(meta_path, "w") as meta_file: + json.dump(meta, meta_file) return cache_path diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py index c1540bda5f..4fae225212 100644 --- a/src/transformers/modeling_albert.py +++ b/src/transformers/modeling_albert.py @@ -579,6 +579,9 @@ class AlbertMLMHead(nn.Module): self.decoder = nn.Linear(config.embedding_size, config.vocab_size) self.activation = ACT2FN[config.hidden_act] + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.activation(hidden_states) diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index cdc46b9662..48ada95c75 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -481,6 +481,9 @@ class 
BertLMPredictionHead(nn.Module): self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) + self.bias diff --git a/src/transformers/modeling_roberta.py b/src/transformers/modeling_roberta.py index 56e983e01c..fc066cc7b8 100644 --- a/src/transformers/modeling_roberta.py +++ b/src/transformers/modeling_roberta.py @@ -306,6 +306,9 @@ class RobertaLMHead(nn.Module): self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + def forward(self, features, **kwargs): x = self.dense(features) x = gelu(x) diff --git a/src/transformers/modeling_t5.py b/src/transformers/modeling_t5.py index 81906e86ea..fb37f6fa4e 100644 --- a/src/transformers/modeling_t5.py +++ b/src/transformers/modeling_t5.py @@ -286,6 +286,7 @@ class T5Attention(nn.Module): bidirectional=not self.is_decoder, num_buckets=self.relative_attention_num_buckets, ) + rp_bucket = rp_bucket.to(self.relative_attention_bias.weight.device) values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen) return values diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index 886d452375..71f851b2e1 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -705,55 +705,71 @@ class QuestionAnsweringPipeline(Pipeline): # Convert inputs to features examples = self._args_parser(*texts, **kwargs) - features = squad_convert_examples_to_features( - examples, self.tokenizer, kwargs["max_seq_len"], 
kwargs["doc_stride"], kwargs["max_question_len"], False - ) - fw_args = self.inputs_for_model([f.__dict__ for f in features]) + features_list = [ + squad_convert_examples_to_features( + [example], + self.tokenizer, + kwargs["max_seq_len"], + kwargs["doc_stride"], + kwargs["max_question_len"], + False, + ) + for example in examples + ] + all_answers = [] + for features, example in zip(features_list, examples): + fw_args = self.inputs_for_model([f.__dict__ for f in features]) - # Manage tensor allocation on correct device - with self.device_placement(): - if self.framework == "tf": - fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()} - start, end = self.model(fw_args) - start, end = start.numpy(), end.numpy() - else: - with torch.no_grad(): - # Retrieve the score for the context tokens only (removing question tokens) - fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()} - start, end = self.model(**fw_args) - start, end = start.cpu().numpy(), end.cpu().numpy() + # Manage tensor allocation on correct device + with self.device_placement(): + if self.framework == "tf": + fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()} + start, end = self.model(fw_args) + start, end = start.numpy(), end.numpy() + else: + with torch.no_grad(): + # Retrieve the score for the context tokens only (removing question tokens) + fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()} + start, end = self.model(**fw_args) + start, end = start.cpu().numpy(), end.cpu().numpy() - answers = [] - for (example, feature, start_, end_) in zip(examples, features, start, end): - # Normalize logits and spans to retrieve the answer - start_ = np.exp(start_) / np.sum(np.exp(start_)) - end_ = np.exp(end_) / np.sum(np.exp(end_)) + answers = [] + for (feature, start_, end_) in zip(features, start, end): + # Normalize logits and spans to retrieve the answer + start_ = np.exp(start_) / np.sum(np.exp(start_)) + end_ = np.exp(end_) / 
np.sum(np.exp(end_)) - # Mask padding and question - start_, end_ = start_ * np.abs(np.array(feature.p_mask) - 1), end_ * np.abs(np.array(feature.p_mask) - 1) + # Mask padding and question + start_, end_ = ( + start_ * np.abs(np.array(feature.p_mask) - 1), + end_ * np.abs(np.array(feature.p_mask) - 1), + ) - # TODO : What happens if not possible - # Mask CLS - start_[0] = end_[0] = 0 + # TODO : What happens if not possible + # Mask CLS + start_[0] = end_[0] = 0 - starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"]) - char_to_word = np.array(example.char_to_word_offset) + starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"]) + char_to_word = np.array(example.char_to_word_offset) - # Convert the answer (tokens) back to the original text - answers += [ - { - "score": score.item(), - "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(), - "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), - "answer": " ".join( - example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1] - ), - } - for s, e, score in zip(starts, ends, scores) - ] - if len(answers) == 1: - return answers[0] - return answers + # Convert the answer (tokens) back to the original text + answers += [ + { + "score": score.item(), + "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(), + "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), + "answer": " ".join( + example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1] + ), + } + for s, e, score in zip(starts, ends, scores) + ] + answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: kwargs["topk"]] + all_answers += answers + + if len(all_answers) == 1: + return all_answers[0] + return all_answers def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple: """ diff --git 
a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 3064087ef6..420ee6564e 100644 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -485,6 +485,8 @@ class ModelTesterMixin: self.assertEqual(model.config.vocab_size, model_vocab_size + 10) # Check that it actually resizes the embeddings matrix self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**inputs_dict) # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size model_embed = model.resize_token_embeddings(model_vocab_size - 15) @@ -492,6 +494,11 @@ class ModelTesterMixin: # Check that it actually resizes the embeddings matrix self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + # Input ids should be clamped to the maximum size of the vocabulary + inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1) + model(**inputs_dict) + # Check that adding and removing tokens has not modified the first part of the embedding matrix. models_equal = True for p1, p2 in zip(cloned_embeddings, model_embed.weight):