Update doc of the model page (#5985)

This commit is contained in:
Sylvain Gugger
2020-07-22 18:14:57 -04:00
committed by GitHub
parent c3206eef44
commit 33d7506ea1
3 changed files with 320 additions and 211 deletions

View File

@@ -1,9 +1,11 @@
Models Models
---------------------------------------------------- ----------------------------------------------------
The base class ``PreTrainedModel`` implements the common methods for loading/saving a model either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository). The base class :class:`~transformers.PreTrainedModel` implements the common methods for loading/saving a model either
from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from
HuggingFace's AWS S3 repository).
``PreTrainedModel`` also implements a few methods which are common among all the models to: :class:`~transformers.PreTrainedModel` also implements a few methods which are common among all the models to:
- resize the input token embeddings when new tokens are added to the vocabulary - resize the input token embeddings when new tokens are added to the vocabulary
- prune the attention heads of the model. - prune the attention heads of the model.
@@ -19,7 +21,6 @@ The base class ``PreTrainedModel`` implements the common methods for loading/sav
.. autofunction:: transformers.apply_chunking_to_forward .. autofunction:: transformers.apply_chunking_to_forward
``TFPreTrainedModel`` ``TFPreTrainedModel``
~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~

View File

@@ -18,6 +18,7 @@ import functools
import logging import logging
import os import os
import warnings import warnings
from typing import Dict
import h5py import h5py
import numpy as np import numpy as np
@@ -167,30 +168,31 @@ TFMaskedLanguageModelingLoss = TFCausalLanguageModelingLoss
class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin): class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
r""" Base class for all TF models. r"""
Base class for all TF models.
:class:`~transformers.TFPreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models :class:`~transformers.TFPreTrainedModel` takes care of storing the configuration of the models and handles methods
as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads. for loading, downloading and saving models as well as a few methods common to all models to:
Class attributes (overridden by derived classes): * resize the input embeddings,
- ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture. * prune heads in the self-attention heads.
- ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments:
- ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`, Class attributes (overridden by derived classes):
- ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`, - **config_class** (:class:`~transformers.PretrainedConfig`) -- A subclass of
- ``path``: a path (string) to the TensorFlow checkpoint. :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
- **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated to the base model in
- ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model. derived classes of the same architecture adding modules on top of the base model.
""" """
config_class = None config_class = None
base_model_prefix = "" base_model_prefix = ""
@property @property
def dummy_inputs(self): def dummy_inputs(self) -> Dict[str, tf.Tensor]:
""" Dummy inputs to build the network. """
Dummy inputs to build the network.
Returns: Returns:
tf.Tensor with dummy inputs :obj:`Dict[str, tf.Tensor]`: The dummy inputs.
""" """
return {"input_ids": tf.constant(DUMMY_INPUTS)} return {"input_ids": tf.constant(DUMMY_INPUTS)}
@@ -207,13 +209,12 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
# Save config in model # Save config in model
self.config = config self.config = config
def get_input_embeddings(self): def get_input_embeddings(self) -> tf.keras.layers.Layer:
""" """
Returns the model's input embeddings. Returns the model's input embeddings.
Returns: Returns:
:obj:`tf.keras.layers.Layer`: :obj:`tf.keras.layers.Layer`: A torch module mapping vocabulary to hidden states.
A torch module mapping vocabulary to hidden states.
""" """
base_model = getattr(self, self.base_model_prefix, self) base_model = getattr(self, self.base_model_prefix, self)
if base_model is not self: if base_model is not self:
@@ -223,7 +224,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
def set_input_embeddings(self, value): def set_input_embeddings(self, value):
""" """
Set model's input embeddings Set model's input embeddings.
Args: Args:
value (:obj:`tf.keras.layers.Layer`): value (:obj:`tf.keras.layers.Layer`):
@@ -235,28 +236,30 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
else: else:
raise NotImplementedError raise NotImplementedError
def get_output_embeddings(self): def get_output_embeddings(self) -> tf.keras.layers.Layer:
""" """
Returns the model's output embeddings. Returns the model's output embeddings.
Returns: Returns:
:obj:`tf.keras.layers.Layer`: :obj:`tf.keras.layers.Layer`: A torch module mapping hidden states to vocabulary.
A torch module mapping hidden states to vocabulary.
""" """
return None # Overwrite for models with output embeddings return None # Overwrite for models with output embeddings
def resize_token_embeddings(self, new_num_tokens=None): def resize_token_embeddings(self, new_num_tokens=None) -> tf.Variable:
""" Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. """
Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. Resizes input token embeddings matrix of the model if :obj:`new_num_tokens != config.vocab_size`.
Takes care of tying weights embeddings afterwards if the model class has a :obj:`tie_weights()` method.
Arguments: Arguments:
new_num_tokens (:obj:`int`, `optional`):
The number of new tokens in the embedding matrix. Increasing the size will add newly initialized
vectors at the end. Reducing the size will remove vectors from the end. If not provided or :obj:`None`,
just returns a pointer to the input tokens :obj:`tf.Variable` module of the model wihtout doing
anything.
new_num_tokens: (`optional`) int: Return:
New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. :obj:`tf.Variable`: Pointer to the input tokens Embeddings Module of the model.
If not provided or None: does nothing and just returns a pointer to the input tokens ``tf.Variable`` Module of the model.
Return: ``tf.Variable``
Pointer to the input tokens Embeddings Module of the model
""" """
model_embeds = self._resize_token_embeddings(new_num_tokens) model_embeds = self._resize_token_embeddings(new_num_tokens)
if new_num_tokens is None: if new_num_tokens is None:
@@ -285,19 +288,24 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
else: else:
raise ValueError("word embedding is not defined.") raise ValueError("word embedding is not defined.")
def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None): def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None) -> tf.Variable:
""" Build a resized Embedding Variable from a provided token Embedding Module. """
Increasing the size will add newly initialized vectors at the end Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly
Reducing the size will remove vectors from the end. initialized vectors at the end. Reducing the size will remove vectors from the end
Args: Args:
new_num_tokens: (`optional`) int old_embeddings (:obj:`tf.Variable`):
Old embeddings to be resized.
new_num_tokens (:obj:`int`, `optional`):
New number of tokens in the embedding matrix. New number of tokens in the embedding matrix.
Increasing the size will add newly initialized vectors at the end
Reducing the size will remove vectors from the end Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
If not provided or None: return the provided token Embedding Module. vectors from the end. If not provided or :obj:`None`, just returns a pointer to the input tokens
Return: ``tf.Variable`` :obj:`tf.Variable`` module of the model wihtout doing anything.
Pointer to the resized word Embedding Module or the old Embedding Module if new_num_tokens is None
Return:
:obj:`tf.Variable`: Pointer to the resized Embedding Module or the old Embedding Module if
:obj:`new_num_tokens` is :obj:`None`
""" """
word_embeddings = self._get_word_embeddings(old_embeddings) word_embeddings = self._get_word_embeddings(old_embeddings)
if new_num_tokens is None: if new_num_tokens is None:
@@ -325,17 +333,25 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
return new_embeddings return new_embeddings
def prune_heads(self, heads_to_prune): def prune_heads(self, heads_to_prune):
""" Prunes heads of the base model. """
Prunes heads of the base model.
Arguments: Arguments:
heads_to_prune (:obj:`Dict[int, List[int]]`):
heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`). Dictionary with keys being selected layer indices (:obj:`int`) and associated values being the list
of heads to prune in said layer (list of :obj:`int`). For instance {1: [0, 2], 2: [2, 3]} will
prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
""" """
raise NotImplementedError raise NotImplementedError
def save_pretrained(self, save_directory): def save_pretrained(self, save_directory):
""" Save a model and its configuration file to a directory, so that it """
can be re-loaded using the :func:`~transformers.PreTrainedModel.from_pretrained` class method. Save a model and its configuration file to a directory, so that it can be re-loaded using the
`:func:`~transformers.TFPreTrainedModel.from_pretrained`` class method.
Arguments:
save_directory (:obj:`str`):
Directory to which to save. Will be created if it doesn't exist.
""" """
if os.path.isfile(save_directory): if os.path.isfile(save_directory):
logger.error("Provided path ({}) should be a directory, not a file".format(save_directory)) logger.error("Provided path ({}) should be a directory, not a file".format(save_directory))
@@ -352,68 +368,101 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
@classmethod @classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r"""Instantiate a pretrained TF 2.0 model from a pre-trained model configuration. r"""
Instantiate a pretrained TF 2.0 model from a pre-trained model configuration.
The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model. The warning `Weights from XXX not initialized from pretrained model` means that the weights of XXX do not come
It is up to you to train those weights with a downstream fine-tuning task. pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
task.
The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded. The warning `Weights from XXX not used in YYY` means that the layer XXX is not used by YYY, therefore those
weights are discarded.
Parameters: Parameters:
pretrained_model_name_or_path: either: pretrained_model_name_or_path (:obj:`str`, `optional`):
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. Can be either:
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
- a path or url to a `PyTorch state_dict save file` (e.g. `./pt_model/pytorch_model.bin`). In this case, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the PyTorch checkpoint in a TensorFlow model using the provided conversion scripts and loading the TensorFlow model afterwards.
model_args: (`optional`) Sequence of positional arguments: - A string with the `shortcut name` of a pretrained model to load from cache or download, e.g.,
All remaning positional arguments will be passed to the underlying model's ``__init__`` method ``bert-base-uncased``.
- A string with the `identifier name` of a pretrained model that was user-uploaded to our S3, e.g.,
``dbmdz/bert-base-german-cased``.
- A path to a `directory` containing model weights saved using
:func:`~transformersTF.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
- A path or url to a `PyTorch state_dict save file` (e.g, `./pt_model/pytorch_model.bin`). In
this case, ``from_pt`` should be set to :obj:`True` and a configuration object should be provided
as ``config`` argument. This loading path is slower than converting the PyTorch model in a
TensorFlow model using the provided conversion scripts and loading the TensorFlow model
afterwards.
- :obj:`None` if you are both providing the configuration and state dictionary (resp. with keyword
arguments ``config`` and ``state_dict``).
model_args (sequence of positional arguments, `optional`):
All remaning positional arguments will be passed to the underlying model's ``__init__`` method.
config (:obj:`Union[PretrainedConfig, str]`, `optional`):
Can be either:
config: (`optional`) one of: - an instance of a class derived from :class:`~transformers.PretrainedConfig`,
- an instance of a class derived from :class:`~transformers.PretrainedConfig`, or - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained`.
- a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained()`
Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: Configuration for the model to use instead of an automatically loaded configuation. Configuration can
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or be automatically loaded when:
- the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
- the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
from_pt: (`optional`) boolean, default False: - The model is a model provided by the library (loaded with the `shortcut name` string of a
Load the model weights from a PyTorch state_dict save file (see docstring of pretrained_model_name_or_path argument). pretrained model).
- The model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded
by suppling the save directory.
- The model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a
configuration JSON file named `config.json` is found in the directory.
from_pt: (:obj:`bool`, `optional`, defaults to :obj:`False`):
Load the model weights from a PyTorch state_dict save file (see docstring of
``pretrained_model_name_or_path`` argument).
cache_dir (:obj:`str`, `optional`):
Path to a directory in which a downloaded pretrained model configuration should be cached if the
standard cache should not be used.
force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists.
proxies: (:obj:`Dict[str, str], `optional`):
A dictionary of proxy servers to use by protocol or endpoint, e.g.,
:obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each
request.
output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether ot not to also return a dictionnary containing missing keys, unexpected keys and error
messages.
local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to only look at local files (e.g., not try doanloading the model).
use_cdn(:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to use Cloudfront (a Content Delivery Network, or CDN) when searching for the model on
our S3 (faster).
kwargs (remaining dictionary of keyword arguments, `optional`):
Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
:obj:`output_attention=True`). Behaves differently depending on whether a ``config`` is provided or
automatically loaded:
cache_dir: (`optional`) string: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the
Path to a directory in which a downloaded pre-trained model underlying model's ``__init__`` method (we assume all relevant updates to the configuration have
configuration should be cached if the standard cache should not be used. already been done)
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
force_download: (`optional`) boolean, default False: initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
Force to (re-)download the model weights and configuration files and override the cached versions if they exists. ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
resume_download: (`optional`) boolean, default False: attribute will be passed to the underlying model's ``__init__`` function.
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
output_loading_info: (`optional`) boolean:
Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
kwargs: (`optional`) Remaining dictionary of keyword arguments:
Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
Examples:: Examples::
# For example purposes. Not runnable. from transformers import BertConfig, TFBertModel
model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. # Download model and configuration from S3 and cache.
model = BertModel.from_pretrained('./test/saved_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = TFBertModel.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable).
model = TFBertModel.from_pretrained('./test/saved_model/')
# Update configuration during loading.
model = TFBertModel.from_pretrained('bert-base-uncased', output_attention=True)
assert model.config.output_attention == True assert model.config.output_attention == True
# Loading from a TF checkpoint file instead of a PyTorch model (slower) # Loading from a Pytorch model file instead of a TensorFlow checkpoint (slower, for example purposes, not runnable).
config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json') config = BertConfig.from_json_file('./pt_model/my_pt_model_config.json')
model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_pt=True, config=config) model = TFBertModel.from_pretrained('./pt_model/my_pytorch_model.bin', from_pt=True, config=config)
""" """
config = kwargs.pop("config", None) config = kwargs.pop("config", None)

View File

@@ -266,34 +266,43 @@ class ModuleUtilsMixin:
class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
r""" Base class for all models. r"""
Base class for all models.
:class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods
as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads. for loading, downloading and saving models as well as a few methods common to all models to:
Class attributes (overridden by derived classes): * resize the input embeddings,
- ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture. * prune heads in the self-attention heads.
- ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments:
- ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`, Class attributes (overridden by derived classes):
- ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`, - **config_class** (:class:`~transformers.PretrainedConfig`) -- A subclass of
- ``path``: a path (string) to the TensorFlow checkpoint. :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
- **load_tf_weights** (:obj:`Callable`) -- A python `method` for loading a TensorFlow checkpoint in a
PyTorch model, taking as arguments:
- ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model. - **model** (:class:`~transformers.PreTrainedModel`) -- An instance of the model on which to load the
TensorFlow checkpoint.
- **config** (:class:`~transformers.PreTrainedConfig`) -- An instance of the configuration associated
to the model.
- **path** (:obj:`str`) -- A path to the TensorFlow checkpoint.
- **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated to the base model in
derived classes of the same architecture adding modules on top of the base model.
""" """
config_class = None config_class = None
base_model_prefix = "" base_model_prefix = ""
@property @property
def dummy_inputs(self): def dummy_inputs(self) -> Dict[str, torch.Tensor]:
""" Dummy inputs to do a forward pass in the network. """ Dummy inputs to do a forward pass in the network.
Returns: Returns:
torch.Tensor with dummy inputs :obj:`Dict[str, torch.Tensor]`: The dummy inputs.
""" """
return {"input_ids": torch.tensor(DUMMY_INPUTS)} return {"input_ids": torch.tensor(DUMMY_INPUTS)}
def __init__(self, config, *inputs, **kwargs): def __init__(self, config: PretrainedConfig, *inputs, **kwargs):
super().__init__() super().__init__()
if not isinstance(config, PretrainedConfig): if not isinstance(config, PretrainedConfig):
raise ValueError( raise ValueError(
@@ -310,13 +319,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
def base_model(self): def base_model(self):
return getattr(self, self.base_model_prefix, self) return getattr(self, self.base_model_prefix, self)
def get_input_embeddings(self): def get_input_embeddings(self) -> nn.Module:
""" """
Returns the model's input embeddings. Returns the model's input embeddings.
Returns: Returns:
:obj:`nn.Module`: :obj:`nn.Module`: A torch module mapping vocabulary to hidden states.
A torch module mapping vocabulary to hidden states.
""" """
base_model = getattr(self, self.base_model_prefix, self) base_model = getattr(self, self.base_model_prefix, self)
if base_model is not self: if base_model is not self:
@@ -329,8 +337,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
Set model's input embeddings Set model's input embeddings
Args: Args:
value (:obj:`nn.Module`): value (:obj:`nn.Module`): A module mapping vocabulary to hidden states.
A module mapping vocabulary to hidden states.
""" """
base_model = getattr(self, self.base_model_prefix, self) base_model = getattr(self, self.base_model_prefix, self)
if base_model is not self: if base_model is not self:
@@ -338,20 +345,20 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
else: else:
raise NotImplementedError raise NotImplementedError
def get_output_embeddings(self): def get_output_embeddings(self) -> nn.Module:
""" """
Returns the model's output embeddings. Returns the model's output embeddings.
Returns: Returns:
:obj:`nn.Module`: :obj:`nn.Module`: A torch module mapping hidden states to vocabulary.
A torch module mapping hidden states to vocabulary.
""" """
return None # Overwrite for models with output embeddings return None # Overwrite for models with output embeddings
def tie_weights(self): def tie_weights(self):
""" """
Tie the weights between the input embeddings and the output embeddings. Tie the weights between the input embeddings and the output embeddings.
If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning
If the :obj:`torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning
the weights instead. the weights instead.
""" """
output_embeddings = self.get_output_embeddings() output_embeddings = self.get_output_embeddings()
@@ -376,18 +383,21 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"): if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
output_embeddings.out_features = input_embeddings.num_embeddings output_embeddings.out_features = input_embeddings.num_embeddings
def resize_token_embeddings(self, new_num_tokens: Optional[int] = None): def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> torch.nn.Embedding:
""" Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. """
Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. Resizes input token embeddings matrix of the model if :obj:`new_num_tokens != config.vocab_size`.
Takes care of tying weights embeddings afterwards if the model class has a :obj:`tie_weights()` method.
Arguments: Arguments:
new_num_tokens (:obj:`int`, `optional`):
The number of new tokens in the embedding matrix. Increasing the size will add newly initialized
vectors at the end. Reducing the size will remove vectors from the end. If not provided or :obj:`None`,
just returns a pointer to the input tokens :obj:`torch.nn.Embedding` module of the model wihtout doing
anything.
new_num_tokens: (`optional`) int: Return:
New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. :obj:`torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embeddings`` Module of the model.
Return: ``torch.nn.Embeddings``
Pointer to the input tokens Embeddings Module of the model
""" """
base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed
model_embeds = base_model._resize_token_embeddings(new_num_tokens) model_embeds = base_model._resize_token_embeddings(new_num_tokens)
@@ -412,20 +422,23 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
def _get_resized_embeddings( def _get_resized_embeddings(
self, old_embeddings: torch.nn.Embedding, new_num_tokens: Optional[int] = None self, old_embeddings: torch.nn.Embedding, new_num_tokens: Optional[int] = None
) -> torch.nn.Embedding: ) -> torch.nn.Embedding:
""" Build a resized Embedding Module from a provided token Embedding Module. """
Increasing the size will add newly initialized vectors at the end Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly
Reducing the size will remove vectors from the end initialized vectors at the end. Reducing the size will remove vectors from the end
Args: Args:
old_embeddings: ``torch.nn.Embedding`` old_embeddings (:obj:`torch.nn.Embedding`):
Old embeddings to be resized. Old embeddings to be resized.
new_num_tokens: (`optional`) int new_num_tokens (:obj:`int`, `optional`):
New number of tokens in the embedding matrix. New number of tokens in the embedding matrix.
Increasing the size will add newly initialized vectors at the end
Reducing the size will remove vectors from the end Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
If not provided or None: return the provided token Embedding Module. vectors from the end. If not provided or :obj:`None`, just returns a pointer to the input tokens
Return: ``torch.nn.Embedding`` :obj:`torch.nn.Embedding`` module of the model wihtout doing anything.
Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
Return:
:obj:`torch.nn.Embedding`: Pointer to the resized Embedding Module or the old Embedding Module if
:obj:`new_num_tokens` is :obj:`None`
""" """
if new_num_tokens is None: if new_num_tokens is None:
return old_embeddings return old_embeddings
@@ -448,7 +461,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
return new_embeddings return new_embeddings
def init_weights(self): def init_weights(self):
""" Initialize and prunes weights if needed. """ """
Initializes and prunes weights if needed.
"""
# Initialize weights # Initialize weights
self.apply(self._init_weights) self.apply(self._init_weights)
@@ -459,13 +474,15 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
# Tie weights if needed # Tie weights if needed
self.tie_weights() self.tie_weights()
def prune_heads(self, heads_to_prune: Dict): def prune_heads(self, heads_to_prune: Dict[int, List[int]]):
""" Prunes heads of the base model. """
Prunes heads of the base model.
Arguments: Arguments:
heads_to_prune (:obj:`Dict[int, List[int]]`):
heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`). Dictionary with keys being selected layer indices (:obj:`int`) and associated values being the list
E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2. of heads to prune in said layer (list of :obj:`int`). For instance {1: [0, 2], 2: [2, 3]} will
prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
""" """
# save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads
for layer, heads in heads_to_prune.items(): for layer, heads in heads_to_prune.items():
@@ -475,11 +492,13 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
self.base_model._prune_heads(heads_to_prune) self.base_model._prune_heads(heads_to_prune)
def save_pretrained(self, save_directory): def save_pretrained(self, save_directory):
""" Save a model and its configuration file to a directory, so that it """
can be re-loaded using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method. Save a model and its configuration file to a directory, so that it can be re-loaded using the
`:func:`~transformers.PreTrainedModel.from_pretrained`` class method.
Arguments: Arguments:
save_directory: directory to which to save. save_directory (:obj:`str`):
Directory to which to save. Will be created if it doesn't exist.
""" """
if os.path.isfile(save_directory): if os.path.isfile(save_directory):
logger.error("Provided path ({}) should be a directory, not a file".format(save_directory)) logger.error("Provided path ({}) should be a directory, not a file".format(save_directory))
@@ -511,75 +530,110 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
@classmethod @classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r"""Instantiate a pretrained pytorch model from a pre-trained model configuration. r"""
Instantiate a pretrained pytorch model from a pre-trained model configuration.
The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated) The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated).
To train the model, you should first set it back in training mode with ``model.train()`` To train the model, you should first set it back in training mode with ``model.train()``.
The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model. The warning `Weights from XXX not initialized from pretrained model` means that the weights of XXX do not come
It is up to you to train those weights with a downstream fine-tuning task. pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
task.
The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded. The warning `Weights from XXX not used in YYY` means that the layer XXX is not used by YYY, therefore those
weights are discarded.
Parameters: Parameters:
pretrained_model_name_or_path: either: pretrained_model_name_or_path (:obj:`str`, `optional`):
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. Can be either:
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
- None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)
model_args: (`optional`) Sequence of positional arguments: - A string with the `shortcut name` of a pretrained model to load from cache or download, e.g.,
All remaning positional arguments will be passed to the underlying model's ``__init__`` method ``bert-base-uncased``.
- A string with the `identifier name` of a pretrained model that was user-uploaded to our S3, e.g.,
``dbmdz/bert-base-german-cased``.
- A path to a `directory` containing model weights saved using
:func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
- A path or url to a `tensorflow index checkpoint file` (e.g, `./tf_model/model.ckpt.index`). In
this case, ``from_tf`` should be set to :obj:`True` and a configuration object should be provided
as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in
a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
- :obj:`None` if you are both providing the configuration and state dictionary (resp. with keyword
arguments ``config`` and ``state_dict``).
model_args (sequence of positional arguments, `optional`):
All remaning positional arguments will be passed to the underlying model's ``__init__`` method.
config (:obj:`Union[PretrainedConfig, str]`, `optional`):
Can be either:
config: (`optional`) one of: - an instance of a class derived from :class:`~transformers.PretrainedConfig`,
- an instance of a class derived from :class:`~transformers.PretrainedConfig`, or - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained`.
- a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained()`
Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: Configuration for the model to use instead of an automatically loaded configuation. Configuration can
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or be automatically loaded when:
- the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
- the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
state_dict: (`optional`) dict: - The model is a model provided by the library (loaded with the `shortcut name` string of a
an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. pretrained model).
This option can be used if you want to create a model from a pretrained configuration but load your own weights. - The model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded
In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. by suppling the save directory.
- The model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a
configuration JSON file named `config.json` is found in the directory.
state_dict (:obj:`Dict[str, torch.Tensor]`, `optional`):
A state dictionary to use instead of a state dictionary loaded from saved weights file.
cache_dir: (`optional`) string: This option can be used if you want to create a model from a pretrained configuration but load your own
Path to a directory in which a downloaded pre-trained model weights. In this case though, you should check if using
configuration should be cached if the standard cache should not be used. :func:`~transformers.PreTrainedModel.save_pretrained` and
:func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
cache_dir (:obj:`str`, `optional`):
Path to a directory in which a downloaded pretrained model configuration should be cached if the
standard cache should not be used.
from_tf (:obj:`bool`, `optional`, defaults to :obj:`False`):
Load the model weights from a TensorFlow checkpoint save file (see docstring of
``pretrained_model_name_or_path`` argument).
force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists.
proxies: (:obj:`Dict[str, str], `optional`):
A dictionary of proxy servers to use by protocol or endpoint, e.g.,
:obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each
request.
output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether ot not to also return a dictionnary containing missing keys, unexpected keys and error
messages.
local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to only look at local files (e.g., not try doanloading the model).
use_cdn(:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to use Cloudfront (a Content Delivery Network, or CDN) when searching for the model on
our S3 (faster).
kwargs (remaining dictionary of keyword arguments, `optional`):
Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
:obj:`output_attention=True`). Behaves differently depending on whether a ``config`` is provided or
automatically loaded:
force_download: (`optional`) boolean, default False: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the
Force to (re-)download the model weights and configuration files and override the cached versions if they exists. underlying model's ``__init__`` method (we assume all relevant updates to the configuration have
already been done)
resume_download: (`optional`) boolean, default False: - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
proxies: (`optional`) dict, default None: with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. attribute will be passed to the underlying model's ``__init__`` function.
The proxies are used on each request.
output_loading_info: (`optional`) boolean:
Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
kwargs: (`optional`) Remaining dictionary of keyword arguments:
Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
Examples:: Examples::
# For example purposes. Not runnable. from transformers import BertConfig, BertModel
model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. # Download model and configuration from S3 and cache.
model = BertModel.from_pretrained('./test/saved_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = BertModel.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable).
model = BertModel.from_pretrained('./test/saved_model/')
# Update configuration during loading.
model = BertModel.from_pretrained('bert-base-uncased', output_attention=True)
assert model.config.output_attention == True assert model.config.output_attention == True
# Loading from a TF checkpoint file instead of a PyTorch model (slower) # Loading from a TF checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable).
config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json') config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config) model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
""" """
config = kwargs.pop("config", None) config = kwargs.pop("config", None)
state_dict = kwargs.pop("state_dict", None) state_dict = kwargs.pop("state_dict", None)
@@ -1242,18 +1296,23 @@ def apply_chunking_to_forward(
chunk_size: int, chunk_dim: int, forward_fn: Callable[..., torch.Tensor], *input_tensors chunk_size: int, chunk_dim: int, forward_fn: Callable[..., torch.Tensor], *input_tensors
) -> torch.Tensor: ) -> torch.Tensor:
""" """
This function chunks the `input_tensors` into smaller input tensor parts of size `chunk_size` over the dimension `chunk_dim`. This function chunks the :obj:`input_tensors` into smaller input tensor parts of size :obj:`chunk_size` over the
It then applies a layer `forward_fn` to each chunk independently to save memory. dimension :obj:`chunk_dim`. It then applies a layer :obj:`forward_fn` to each chunk independently to save memory.
If the `forward_fn` is independent across the `chunk_dim` this function will yield the
same result as not applying it. If the :obj:`forward_fn` is independent across the :obj:`chunk_dim` this function will yield the same result as
directly applying :obj:`forward_fn` to :obj:`input_tensors`.
Args: Args:
chunk_size: int - the chunk size of a chunked tensor. `num_chunks` = `len(input_tensors[0]) / chunk_size` chunk_size (:obj:`int`):
chunk_dim: int - the dimension over which the input_tensors should be chunked The chunk size of a chunked tensor: :obj:`num_chunks = len(input_tensors[0]) / chunk_size`.
forward_fn: fn - the forward fn of the model chunk_dim (:obj:`int`):
input_tensors: tuple(torch.Tensor) - the input tensors of `forward_fn` which are chunked The dimension over which the :obj:`input_tensors` should be chunked.
forward_fn (:obj:`Callable[..., torch.Tensor]`):
The forward function of the model.
input_tensors (:obj:`Tuple[torch.Tensor]`):
The input tensors of ``forward_fn`` which will be chunked.
Returns: Returns:
a Tensor with the same shape the foward_fn would have given if applied :obj:`torch.Tensor`: A tensor with the same shape as the :obj:`foward_fn` would have given if applied`.
Examples:: Examples::