From 009273dbddd0964c378d7131445bfc0ae63bc29c Mon Sep 17 00:00:00 2001 From: thomwolf Date: Sun, 4 Aug 2019 12:14:57 +0200 Subject: [PATCH] big doc update [WIP] --- README.md | 1 + docs/source/converting_tensorflow_models.rst | 43 +++++++++++------ docs/source/index.rst | 9 ++++ docs/source/installation.rst | 16 +++---- docs/source/main_classes/configuration.rst | 10 ++++ docs/source/main_classes/model.rst | 8 ++++ .../main_classes/optimizer_schedules.rst | 26 ++++++++++ docs/source/main_classes/tokenizer.rst | 8 ++++ docs/source/migration.md | 9 ++-- docs/source/model_doc/bert.rst | 6 --- docs/source/quickstart.md | 47 +++++++++++++++++-- docs/source/serialization.rst | 3 ++ pytorch_transformers/__init__.py | 2 +- pytorch_transformers/modeling_utils.py | 38 ++++++++++----- pytorch_transformers/tokenization_bert.py | 2 +- pytorch_transformers/tokenization_gpt2.py | 2 +- .../tokenization_transfo_xl.py | 2 +- pytorch_transformers/tokenization_utils.py | 15 +++--- pytorch_transformers/tokenization_xlnet.py | 2 +- 19 files changed, 189 insertions(+), 60 deletions(-) create mode 100644 docs/source/main_classes/configuration.rst create mode 100644 docs/source/main_classes/model.rst create mode 100644 docs/source/main_classes/optimizer_schedules.rst create mode 100644 docs/source/main_classes/tokenizer.rst diff --git a/README.md b/README.md index 8e2074f727..c31bbd24b7 100644 --- a/README.md +++ b/README.md @@ -119,6 +119,7 @@ tokenizer = tokenizer_class.from_pretrained(pretrained_weights) ``` ## Quick tour of the fine-tuning/usage scripts + The library comprises several example scripts with SOTA performances for NLU and NLG tasks: - `run_glue.py`: an example fine-tuning Bert, XLNet and XLM on nine different GLUE tasks (*sequence-level classification*) diff --git a/docs/source/converting_tensorflow_models.rst b/docs/source/converting_tensorflow_models.rst index 36c1e4050f..8441c9b1f7 100644 --- a/docs/source/converting_tensorflow_models.rst +++ 
b/docs/source/converting_tensorflow_models.rst @@ -1,7 +1,7 @@ Converting Tensorflow Checkpoints ================================================ -A command-line interface is provided to convert a TensorFlow checkpoint in a PyTorch dump of the ``BertForPreTraining`` class (for BERT) or NumPy checkpoint in a PyTorch dump of the ``OpenAIGPTModel`` class (for OpenAI GPT). +A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints into models that can be loaded using the ``from_pretrained`` methods of the library. BERT ^^^^ @@ -41,6 +41,20 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT model, $PYTORCH_DUMP_OUTPUT \ [OPENAI_GPT_CONFIG] +OpenAI GPT-2 +^^^^^^^^^^^^ + +Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see `here `__\ ) + +.. code-block:: shell + + export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights + + pytorch_transformers gpt2 \ + $OPENAI_GPT2_CHECKPOINT_PATH \ + $PYTORCH_DUMP_OUTPUT \ + [OPENAI_GPT2_CONFIG] + Transformer-XL ^^^^^^^^^^^^^^ @@ -55,19 +69,6 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo $PYTORCH_DUMP_OUTPUT \ [TRANSFO_XL_CONFIG] -GPT-2 -^^^^^ - -Here is an example of the conversion process for a pre-trained OpenAI's GPT-2 model. - -.. code-block:: shell - - export GPT2_DIR=/path/to/gpt2/checkpoint - - pytorch_transformers gpt2 \ - $GPT2_DIR/model.ckpt \ - $PYTORCH_DUMP_OUTPUT \ - [GPT2_CONFIG] XLNet ^^^^^ @@ -84,3 +85,17 @@ Here is an example of the conversion process for a pre-trained XLNet model, fine $TRANSFO_XL_CONFIG_PATH \ $PYTORCH_DUMP_OUTPUT \ STS-B \ + + +XLM +^^^ + +Here is an example of the conversion process for a pre-trained XLM model: + +..
code-block:: shell + + export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint + + pytorch_transformers xlm \ + $XLM_CHECKPOINT_PATH \ + $PYTORCH_DUMP_OUTPUT \ diff --git a/docs/source/index.rst b/docs/source/index.rst index be8cfc2a39..c403a0ad4f 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -21,11 +21,20 @@ The library currently contains PyTorch implementations, pre-trained model weight pretrained_models examples notebooks + serialization converting_tensorflow_models migration bertology torchscript +.. toctree:: + :maxdepth: 2 + :caption: Main classes + + main_classes/configuration + main_classes/model + main_classes/tokenizer + main_classes/optimizer_schedules .. toctree:: :maxdepth: 2 diff --git a/docs/source/installation.rst b/docs/source/installation.rst index f8beb9f1c8..9e6269da94 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -1,12 +1,12 @@ Installation ================================================ -This repo was tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 0.4.1/1.0.0 +PyTorch-Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0 With pip ^^^^^^^^ -PyTorch pretrained bert can be installed with pip as follows: +PyTorch Transformers can be installed using pip as follows: .. code-block:: bash @@ -15,7 +15,7 @@ PyTorch pretrained bert can be installed with pip as follows: From source ^^^^^^^^^^^ -Clone the repository and instal locally: +To install from source, clone the repository and install with: .. code-block:: bash @@ -27,11 +27,11 @@ Clone the repository and instal locally: Tests ^^^^^ -An extensive test suite is included for the library and the example scripts. Library tests can be found in the `tests folder `_ and examples tests in the `examples folder `_. +An extensive test suite is included to test the library behavior and several examples. 
Library tests can be found in the `tests folder `_ and examples tests in the `examples folder `_. -These tests can be run using `pytest` (install pytest if needed with `pip install pytest`). +Tests can be run using `pytest` (install pytest if needed with `pip install pytest`). -You can run the tests from the root of the cloned repository with the commands: +Run all the tests from the root of the cloned repository with the commands: .. code-block:: bash @@ -42,11 +42,11 @@ You can run the tests from the root of the cloned repository with the commands: OpenAI GPT original tokenization workflow ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -If you want to reproduce the original tokenization process of the ``OpenAI GPT`` paper, you will need to install ``ftfy`` (limit to version 4.4.3 if you are using Python 2) and ``SpaCy`` : +If you want to reproduce the original tokenization process of the ``OpenAI GPT`` paper, you will need to install ``ftfy`` (use version 4.4.3 if you are using Python 2) and ``SpaCy`` : .. code-block:: bash pip install spacy ftfy==4.4.3 python -m spacy download en -If you don't install ``ftfy`` and ``SpaCy``\ , the ``OpenAI GPT`` tokenizer will default to tokenize using BERT's ``BasicTokenizer`` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry). +If you don't install ``ftfy`` and ``SpaCy``\ , the ``OpenAI GPT`` tokenizer defaults to tokenize using BERT's ``BasicTokenizer`` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry). 
diff --git a/docs/source/main_classes/configuration.rst b/docs/source/main_classes/configuration.rst new file mode 100644 index 0000000000..5e069629b8 --- /dev/null +++ b/docs/source/main_classes/configuration.rst @@ -0,0 +1,10 @@ +Configuration +---------------------------------------------------- + +We provide a base class, ``PretrainedConfig``, which can load a pretrained instance either from a local file or directory or from a pretrained model configuration provided by the library (downloaded from HuggingFace AWS S3 repository). + +``PretrainedConfig`` +~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: pytorch_transformers.PretrainedConfig + :members: diff --git a/docs/source/main_classes/model.rst b/docs/source/main_classes/model.rst new file mode 100644 index 0000000000..dd4c9d87dd --- /dev/null +++ b/docs/source/main_classes/model.rst @@ -0,0 +1,8 @@ +Models +---------------------------------------------------- + +``PreTrainedModel`` +~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: pytorch_transformers.PreTrainedModel + :members: diff --git a/docs/source/main_classes/optimizer_schedules.rst b/docs/source/main_classes/optimizer_schedules.rst new file mode 100644 index 0000000000..2d91d495a4 --- /dev/null +++ b/docs/source/main_classes/optimizer_schedules.rst @@ -0,0 +1,26 @@ +Optimizer +---------------------------------------------------- + +``AdamW`` +~~~~~~~~~~~~~~~~ + +.. autoclass:: pytorch_transformers.AdamW + :members: + +Schedules +---------------------------------------------------- + +.. autoclass:: pytorch_transformers.ConstantLRSchedule + :members: + +.. autoclass:: pytorch_transformers.WarmupConstantSchedule + :members: + +.. autoclass:: pytorch_transformers.WarmupCosineSchedule + :members: + +.. autoclass:: pytorch_transformers.WarmupCosineWithHardRestartsSchedule + :members: + +.. 
autoclass:: pytorch_transformers.WarmupLinearSchedule + :members: diff --git a/docs/source/main_classes/tokenizer.rst b/docs/source/main_classes/tokenizer.rst new file mode 100644 index 0000000000..cd6b4786bb --- /dev/null +++ b/docs/source/main_classes/tokenizer.rst @@ -0,0 +1,8 @@ +Tokenizer +---------------------------------------------------- + +``PreTrainedTokenizer`` +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: pytorch_transformers.PreTrainedTokenizer + :members: diff --git a/docs/source/migration.md b/docs/source/migration.md index fff4807d5c..ba09253472 100644 --- a/docs/source/migration.md +++ b/docs/source/migration.md @@ -35,10 +35,13 @@ loss, logits, attentions = outputs ### Serialization -Breaking change: Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. -To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules. +Breaking change in the `from_pretrained()`method: -Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other seralization method before. +1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules. + +2. The additional `*inputs` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute first which can break derived model classes build based on the previous `BertForSequenceClassification` examples. 
More precisely, the positional arguments `*inputs` provided to `from_pretrained()` are directly forwarded to the model `__init__()` method while the keyword arguments `**kwargs` (i) which match configuration class attributes are used to update said attributes (ii) which don't match any configuration class attributes are forwarded to the model `__init__()` method. + +Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before. Here is an example: diff --git a/docs/source/model_doc/bert.rst b/docs/source/model_doc/bert.rst index 8c786aa24f..cbce74e73b 100644 --- a/docs/source/model_doc/bert.rst +++ b/docs/source/model_doc/bert.rst @@ -15,12 +15,6 @@ BERT :members: -``AdamW`` -~~~~~~~~~~~~~~~~ - -.. autoclass:: pytorch_transformers.AdamW - :members: - ``BertModel`` ~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/quickstart.md b/docs/source/quickstart.md index 7414ef48c1..814021038a 100644 --- a/docs/source/quickstart.md +++ b/docs/source/quickstart.md @@ -1,17 +1,58 @@ # Quickstart +## Philosophy + +PyTorch-Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformer models. + +The library was designed with two strong goals in mind: + +- be as easy and fast to use as possible: + + - we strongly limited the number of abstractions to learn, in fact there are almost no abstractions, just three standard classes for each model: configuration, model and tokenizer, + - each pretrained model configuration, weights and vocabulary can be downloaded, cached and loaded in the related class in a simple way by using a common `from_pretrained()` instantiation method.
+ - this library is NOT a modular toolbox of building blocks for neural nets, to extend/build-upon the library, just use your regular Python/PyTorch modules and inherit from the base classes of the library to reuse functionalities like model loading/saving. + +- provide state-of-the-art models with performances as close as possible to the original models: + + - we provide at least one example for each model which reproduces a result provided by the official authors of said model, + - the code is usually as close to the original code base as possible which means some PyTorch code may be not as *pytorchic* as it could be as a result of being converted TensorFlow code. + +A few other goals: + +- expose the models' internals as consistently as possible: + + - we give access, using a single API, to the full hidden-states and attention weights, + - tokenizer and base model's API are standardized to easily switch between models. + +- incorporate a subjective selection of promising tools for fine-tuning/investigating these models: + + - a simple/consistent way to add new tokens to the vocabulary and embeddings for fine-tuning, + - simple ways to mask and prune transformer heads. + ## Main concepts +The library is built around three types of classes for each model: + +- **model classes** which are PyTorch models (`torch.nn.Modules`) of the 6 model architectures currently provided in the library, e.g. `BertModel` +- **configuration classes** which store all the parameters required to build a model, e.g. `BertConfig` +- **tokenizer classes** which store the vocabulary for each model and provide methods for encoding strings into lists of token embedding indices to be fed to a model, e.g.
`BertTokenizer` + +All these classes can be instantiated from pretrained instances and saved locally using two methods: + +- `from_pretrained()` let you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/pytorch-transformers/pretrained_models.html)) or stored locally (or on a server) by the user, +- `save_pretrained()` let you save a model/configuration/tokenizer locally so that it can be reloaded using `from_pretrained()`. + +Let's go through a few simple quick-start examples to see how we can instantiate and use these classes. ## Quick tour: Usage -Here are two quick-start examples showcasing a few `Bert` and `GPT2` classes and pre-trained models. +Here are two examples showcasing a few `Bert` and `GPT2` classes and pre-trained models. -See package reference for examples for each model classe. +See full API reference for examples for each model classe. ### BERT example -First let's prepare a tokenized input from a text string using `BertTokenizer` +Let's start by preparing a tokenized input (a list of token embeddings indices to be fed to Bert) from a text string using `BertTokenizer` ```python import torch diff --git a/docs/source/serialization.rst b/docs/source/serialization.rst index be5197135d..c0de1324cf 100644 --- a/docs/source/serialization.rst +++ b/docs/source/serialization.rst @@ -1,3 +1,6 @@ +Serialization +---------------------------------------------------- + ### Loading Google AI or OpenAI pre-trained weights or PyTorch dump ### `from_pretrained()` method diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py index f875e4ab18..c9b0aeebb7 100644 --- a/pytorch_transformers/__init__.py +++ b/pytorch_transformers/__init__.py @@ -5,7 +5,7 @@ from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) from .tokenization_gpt2 import GPT2Tokenizer from .tokenization_xlnet import 
XLNetTokenizer, SPIECE_UNDERLINE from .tokenization_xlm import XLMTokenizer -from .tokenization_utils import (PreTrainedTokenizer, clean_up_tokenization) +from .tokenization_utils import (PreTrainedTokenizer) from .modeling_bert import (BertConfig, BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction, diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index e458c5ef74..f21927e18c 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -55,11 +55,19 @@ else: class PretrainedConfig(object): """ Base class for all configuration classes. - Handle a few common parameters and methods for loading/downloading/saving configurations. + Handle a few common attributes and methods for loading/downloading/saving configurations. """ pretrained_config_archive_map = {} def __init__(self, **kwargs): + r""" The initialization of :class:`~pytorch_transformers.PretrainedConfig` extracts + a few configuration attributes from `**kwargs` which are common to all models: + - `finetuning_task`: string, default `None`. Name of the task used to fine-tune the model (used to convert from original checkpoint) + - `num_labels`: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens) + - `output_attentions`: boolean, default `False`. Should the model return attention weights. + - `output_hidden_states`: boolean, default `False`. Should the model return all hidden-states. + - `torchscript`: boolean, default `False`. Is the model used with Torchscript.
+ """ self.finetuning_task = kwargs.pop('finetuning_task', None) self.num_labels = kwargs.pop('num_labels', 2) self.output_attentions = kwargs.pop('output_attentions', False) @@ -67,7 +75,7 @@ class PretrainedConfig(object): self.torchscript = kwargs.pop('torchscript', False) def save_pretrained(self, save_directory): - """ Save a configuration object to a directory, so that it + """ Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the `from_pretrained(save_directory)` class method. """ assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" @@ -81,30 +89,34 @@ class PretrainedConfig(object): def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): r""" Instantiate a PretrainedConfig from a pre-trained model configuration. - Params: + Parameters: **pretrained_model_name_or_path**: either: - - a string with the `shortcut name` of a pre-trained model configuration to load from cache - or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). - - a path to a `directory` containing a configuration file saved - using the `save_pretrained(save_directory)` method. - - a path or url to a saved configuration `file`. + + - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing a configuration file saved using the `save_pretrained(save_directory)` method, e.g.: ``./my_model_directory/``. + - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. + **cache_dir**: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. + **return_unused_kwargs**: (`optional`) bool: + - If False, then this function returns just the final configuration object. 
- - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` - is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: - ie the part of kwargs which has not been used to update `config` and is otherwise ignored. + - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. + **kwargs**: (`optional`) dict: Dictionary of key/value pairs with which to update the configuration object after loading. + - The values in kwargs of any keys which are configuration attributes will be used - to override the loaded values. + to override the loaded values. - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled - by the `return_unused_kwargs` keyword parameter. + by the `return_unused_kwargs` keyword parameter. Examples:: + # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a + # derived class: BertConfig config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. config = BertConfig.from_pretrained('./test/saved_model/') # E.g. 
config (or model) was saved using `save_pretrained('./test/saved_model/')` config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py index d9cd881dfd..d7aeff7c39 100644 --- a/pytorch_transformers/tokenization_bert.py +++ b/pytorch_transformers/tokenization_bert.py @@ -22,7 +22,7 @@ import os import unicodedata from io import open -from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization +from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py index 29a9ae7660..0aee856180 100644 --- a/pytorch_transformers/tokenization_gpt2.py +++ b/pytorch_transformers/tokenization_gpt2.py @@ -31,7 +31,7 @@ except ImportError: def lru_cache(): return lambda func: func -from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization +from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) diff --git a/pytorch_transformers/tokenization_transfo_xl.py b/pytorch_transformers/tokenization_transfo_xl.py index 237f8ea387..992dff80d5 100644 --- a/pytorch_transformers/tokenization_transfo_xl.py +++ b/pytorch_transformers/tokenization_transfo_xl.py @@ -30,7 +30,7 @@ import torch import numpy as np from .file_utils import cached_path -from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization +from .tokenization_utils import PreTrainedTokenizer if sys.version_info[0] == 2: import cPickle as pickle diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index eaef2fed1e..556f094f6d 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -444,7 +444,7 @@ class PreTrainedTokenizer(object): filtered_tokens = self.convert_ids_to_tokens(token_ids, 
skip_special_tokens=skip_special_tokens) text = self.convert_tokens_to_string(filtered_tokens) if clean_up_tokenization_spaces: - text = clean_up_tokenization(text) + text = self.clean_up_tokenization(text) return text @property @@ -480,10 +480,9 @@ class PreTrainedTokenizer(object): all_ids = list(self.convert_tokens_to_ids(t) for t in all_toks) return all_ids - - -def clean_up_tokenization(out_string): - out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',' - ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't" - ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re") - return out_string + @staticmethod + def clean_up_tokenization(out_string): + out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',' + ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't" + ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re") + return out_string diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py index a4f3fdfde2..919ac97bce 100644 --- a/pytorch_transformers/tokenization_xlnet.py +++ b/pytorch_transformers/tokenization_xlnet.py @@ -23,7 +23,7 @@ from shutil import copyfile import unicodedata import six -from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization +from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__)