updating docs - adding few tests to tokenizers

This commit is contained in:
thomwolf
2019-08-04 22:42:55 +02:00
parent 009273dbdd
commit 00132b7a7a
10 changed files with 390 additions and 521 deletions

View File

@@ -54,20 +54,22 @@ else:
class PretrainedConfig(object):
""" Base class for all configuration classes.
Handle a few common attributes and methods for loading/downloading/saving configurations.
r""" Base class for all configuration classes.
Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations.
Class attributes (overridden by derived classes):
- ``pretrained_config_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values.
Parameters:
``finetuning_task``: string, default `None`. Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint.
``num_labels``: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens)
``output_attentions``: boolean, default `False`. Should the model returns attentions weights.
``output_hidden_states``: string, default `False`. Should the model returns all hidden-states.
``torchscript``: string, default `False`. Is the model used with Torchscript.
"""
pretrained_config_archive_map = {}
def __init__(self, **kwargs):
r""" The initialization of :class:`~pytorch_transformers.PretrainedConfig` extracts
a few configuration attributes from `**kwargs` which are common to all models:
- `finetuning_task`: string, default `None`. Name of the task used to fine-tune the model (used to convert from original checkpoint)
- `num_labels`: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens)
- `output_attentions`: boolean, default `False`. Should the model returns attentions weights.
- `output_hidden_states`: string, default `False`. Should the model returns all hidden-states.
- `torchscript`: string, default `False`. Is the model used with Torchscript.
"""
self.finetuning_task = kwargs.pop('finetuning_task', None)
self.num_labels = kwargs.pop('num_labels', 2)
self.output_attentions = kwargs.pop('output_attentions', False)
@@ -76,7 +78,7 @@ class PretrainedConfig(object):
def save_pretrained(self, save_directory):
""" Save a configuration object to the directory `save_directory`, so that it
can be re-loaded using the `from_pretrained(save_directory)` class method.
can be re-loaded using the :func:`~pytorch_transformers.PretrainedConfig.from_pretrained` class method.
"""
assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
@@ -87,32 +89,29 @@ class PretrainedConfig(object):
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r""" Instantiate a PretrainedConfig from a pre-trained model configuration.
r""" Instantiate a :class:`~pytorch_transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
Parameters:
**pretrained_model_name_or_path**: either:
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
- a path to a `directory` containing a configuration file saved using the `save_pretrained(save_directory)` method, e.g.: ``./my_model_directory/``.
- a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
**cache_dir**: (`optional`) string:
cache_dir: (`optional`) string:
Path to a directory in which a downloaded pre-trained model
configuration should be cached if the standard cache should not be used.
**return_unused_kwargs**: (`optional`) bool:
kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading.
- The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
return_unused_kwargs: (`optional`) bool:
- If False, then this function returns just the final configuration object.
- If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
**kwargs**: (`optional`) dict:
Dictionary of key/value pairs with which to update the configuration object after loading.
- The values in kwargs of any keys which are configuration attributes will be used
to override the loaded values.
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
by the `return_unused_kwargs` keyword parameter.
Examples::
# We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a
@@ -215,14 +214,26 @@ class PretrainedConfig(object):
class PreTrainedModel(nn.Module):
""" Base class for all models. Handle loading/storing model config and
a simple interface for dowloading and loading pretrained models.
r""" Base class for all models.
:class:`~pytorch_transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
Class attributes (overridden by derived classes):
- ``config_class``: a class derived from :class:`~pytorch_transformers.PretrainedConfig` to use as configuration class for this model architecture.
- ``pretrained_model_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values.
- ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments:
- ``model``: an instance of the relevant subclass of :class:`~pytorch_transformers.PreTrainedModel`,
- ``config``: an instance of the relevant subclass of :class:`~pytorch_transformers.PretrainedConfig`,
- ``path``: a path (string) to the TensorFlow checkpoint.
- ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model.
"""
config_class = PretrainedConfig
config_class = None
pretrained_model_archive_map = {}
load_tf_weights = lambda model, config, path: None
base_model_prefix = ""
input_embeddings = None
def __init__(self, config, *inputs, **kwargs):
super(PreTrainedModel, self).__init__()
@@ -280,17 +291,16 @@ class PreTrainedModel(nn.Module):
def resize_token_embeddings(self, new_num_tokens=None):
""" Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
Args:
new_num_tokens: (`optional`) int
New number of tokens in the embedding matrix.
Increasing the size will add newly initialized vectors at the end
Reducing the size will remove vectors from the end
If not provided or None: does nothing and just returns a pointer to the input tokens Embedding Module of the model.
Arguments:
new_num_tokens: (`optional`) int:
New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end.
If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embeddings`` Module of the model.
Return: ``torch.nn.Embeddings``
Pointer to the input tokens Embedding Module of the model
Pointer to the input tokens Embeddings Module of the model
"""
base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed
model_embeds = base_model._resize_token_embeddings(new_num_tokens)
@@ -309,15 +319,17 @@ class PreTrainedModel(nn.Module):
def prune_heads(self, heads_to_prune):
""" Prunes heads of the base model.
Args:
heads_to_prune: dict of {layer_num (int): list of heads to prune in this layer (list of int)}
Arguments:
heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
"""
base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed
base_model._prune_heads(heads_to_prune)
def save_pretrained(self, save_directory):
""" Save a model with its configuration file to a directory, so that it
can be re-loaded using the `from_pretrained(save_directory)` class method.
""" Save a model and its configuration file to a directory, so that it
can be re-loaded using the `:func:`~pytorch_transformers.PreTrainedModel.from_pretrained`` class method.
"""
assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
@@ -336,50 +348,45 @@ class PreTrainedModel(nn.Module):
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r"""Instantiate a pretrained pytorch model from a pre-trained model configuration.
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
To train the model, you should first set it back in training mode with `model.train()`
The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated)
To train the model, you should first set it back in training mode with ``model.train()``
Params:
**pretrained_model_name_or_path**: either:
- a string with the `shortcut name` of a pre-trained model to load from cache
or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
- a path to a `directory` containing a configuration file saved
using the `save_pretrained(save_directory)` method.
- a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`).
In this case, ``from_tf`` should be set to True and a configuration object should be
provided as `config` argument. This loading option is slower than converting the TensorFlow
checkpoint in a PyTorch model using the provided conversion scripts and loading
the PyTorch model afterwards.
**model_args**: (`optional`) Sequence:
All remaning positional arguments will be passed to the underlying model's __init__ function
**config**: an optional configuration for the model to use instead of an automatically loaded configuation.
Configuration can be automatically loaded when:
- the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or
- the model was saved using the `save_pretrained(save_directory)` (loaded by suppling the save directory).
**state_dict**: an optional state dictionnary for the model to use instead of a state dictionary loaded
from saved weights file.
Parameters:
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
- a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
model_args: (`optional`) Sequence of positional arguments:
All remaning positional arguments will be passed to the underlying model's ``__init__`` method
config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
- the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
- the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
state_dict: (`optional`) dict:
an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not
a simpler option.
**cache_dir**: (`optional`) string:
In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
cache_dir: (`optional`) string:
Path to a directory in which a downloaded pre-trained model
configuration should be cached if the standard cache should not be used.
**output_loading_info**: (`optional`) boolean:
output_loading_info: (`optional`) boolean:
Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
**kwargs**: (`optional`) dict:
Dictionary of key, values to update the configuration object after loading.
Can be used to override selected configuration parameters. E.g. ``output_attention=True``.
- If a configuration is providedictionaryfig`, **kwargs will be directly passed
to the underlying model's __init__ method.
- If a configuration is not provided, **kwargs will be first passed to the pretrained
model configuration class loading function (`PretrainedConfig.from_pretrained`).
Each key of **kwargs that corresponds to a configuration attribute
will be used to override said attribute with the supplied **kwargs value.
Remaining keys that do not correspond to any configuration attribute will
be passed to the underlying model's __init__ function.
kwargs: (`optional`) Remaining dictionary of keyword arguments:
Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
Examples::dictionary
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
Examples::
model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache.
model = BertModel.from_pretrained('./test/saved_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`