From 009273dbddd0964c378d7131445bfc0ae63bc29c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sun, 4 Aug 2019 12:14:57 +0200
Subject: [PATCH] big doc update [WIP]

---
 README.md                                     |  1 +
 docs/source/converting_tensorflow_models.rst  | 43 +++++++++++------
 docs/source/index.rst                         |  9 ++++
 docs/source/installation.rst                  | 16 +++----
 docs/source/main_classes/configuration.rst    | 10 ++++
 docs/source/main_classes/model.rst            |  8 ++++
 .../main_classes/optimizer_schedules.rst      | 26 ++++++++++
 docs/source/main_classes/tokenizer.rst        |  8 ++++
 docs/source/migration.md                      |  9 ++--
 docs/source/model_doc/bert.rst                |  6 ---
 docs/source/quickstart.md                     | 47 +++++++++++++++++--
 docs/source/serialization.rst                 |  3 ++
 pytorch_transformers/__init__.py              |  2 +-
 pytorch_transformers/modeling_utils.py        | 38 ++++++++++-----
 pytorch_transformers/tokenization_bert.py     |  2 +-
 pytorch_transformers/tokenization_gpt2.py     |  2 +-
 .../tokenization_transfo_xl.py                |  2 +-
 pytorch_transformers/tokenization_utils.py    | 15 +++---
 pytorch_transformers/tokenization_xlnet.py    |  2 +-
 19 files changed, 189 insertions(+), 60 deletions(-)
 create mode 100644 docs/source/main_classes/configuration.rst
 create mode 100644 docs/source/main_classes/model.rst
 create mode 100644 docs/source/main_classes/optimizer_schedules.rst
 create mode 100644 docs/source/main_classes/tokenizer.rst

diff --git a/README.md b/README.md
index 8e2074f727..c31bbd24b7 100644
--- a/README.md
+++ b/README.md
@@ -119,6 +119,7 @@ tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
 ```
 
 ## Quick tour of the fine-tuning/usage scripts
+
 The library comprises several example scripts with SOTA performances for NLU and NLG tasks:
 
 - `run_glue.py`: an example fine-tuning Bert, XLNet and XLM on nine different GLUE tasks (*sequence-level classification*)
diff --git a/docs/source/converting_tensorflow_models.rst b/docs/source/converting_tensorflow_models.rst
index 36c1e4050f..8441c9b1f7 100644
--- a/docs/source/converting_tensorflow_models.rst
+++ b/docs/source/converting_tensorflow_models.rst
@@ -1,7 +1,7 @@
 Converting Tensorflow Checkpoints
 ================================================
 
-A command-line interface is provided to convert a TensorFlow checkpoint in a PyTorch dump of the ``BertForPreTraining`` class  (for BERT) or NumPy checkpoint in a PyTorch dump of the ``OpenAIGPTModel`` class  (for OpenAI GPT).
+A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints into models that can be loaded using the ``from_pretrained`` methods of the library.
 
 BERT
 ^^^^
@@ -41,6 +41,20 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT model,
      $PYTORCH_DUMP_OUTPUT \
      [OPENAI_GPT_CONFIG]
 
+OpenAI GPT-2
+^^^^^^^^^^^^
+
+Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see `here <https://github.com/openai/gpt-2>`__\ )
+
+.. code-block:: shell
+
+   export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
+
+   pytorch_transformers gpt2 \
+     $OPENAI_GPT2_CHECKPOINT_PATH \
+     $PYTORCH_DUMP_OUTPUT \
+     [OPENAI_GPT2_CONFIG]
+
 Transformer-XL
 ^^^^^^^^^^^^^^
 
@@ -55,19 +69,6 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo
      $PYTORCH_DUMP_OUTPUT \
      [TRANSFO_XL_CONFIG]
 
-GPT-2
-^^^^^
-
-Here is an example of the conversion process for a pre-trained OpenAI's GPT-2 model.
-
-.. code-block:: shell
-
-   export GPT2_DIR=/path/to/gpt2/checkpoint
-
-   pytorch_transformers gpt2 \
-     $GPT2_DIR/model.ckpt \
-     $PYTORCH_DUMP_OUTPUT \
-     [GPT2_CONFIG]
 
 XLNet
 ^^^^^
@@ -84,3 +85,17 @@ Here is an example of the conversion process for a pre-trained XLNet model, fine
      $TRANSFO_XL_CONFIG_PATH \
      $PYTORCH_DUMP_OUTPUT \
      STS-B \
+
+
+XLM
+^^^
+
+Here is an example of the conversion process for a pre-trained XLM model:
+
+.. code-block:: shell
+
+   export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
+
+   pytorch_transformers xlm \
+     $XLM_CHECKPOINT_PATH \
+     $PYTORCH_DUMP_OUTPUT \
diff --git a/docs/source/index.rst b/docs/source/index.rst
index be8cfc2a39..c403a0ad4f 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -21,11 +21,20 @@ The library currently contains PyTorch implementations, pre-trained model weight
     pretrained_models
     examples
     notebooks
+    serialization
     converting_tensorflow_models
     migration
     bertology
     torchscript
 
+.. toctree::
+    :maxdepth: 2
+    :caption: Main classes
+
+    main_classes/configuration
+    main_classes/model
+    main_classes/tokenizer
+    main_classes/optimizer_schedules
 
 .. toctree::
     :maxdepth: 2
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index f8beb9f1c8..9e6269da94 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -1,12 +1,12 @@
 Installation
 ================================================
 
-This repo was tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 0.4.1/1.0.0
+PyTorch-Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on Python 3.5+) and PyTorch 1.1.0.
 
 With pip
 ^^^^^^^^
 
-PyTorch pretrained bert can be installed with pip as follows:
+PyTorch-Transformers can be installed using pip as follows:
 
 .. code-block:: bash
 
@@ -15,7 +15,7 @@ PyTorch pretrained bert can be installed with pip as follows:
 From source
 ^^^^^^^^^^^
 
-Clone the repository and instal locally:
+To install from source, clone the repository and install with:
 
 .. code-block:: bash
 
@@ -27,11 +27,11 @@ Clone the repository and instal locally:
 Tests
 ^^^^^
 
-An extensive test suite is included for the library and the example scripts. Library tests can be found in the `tests folder <https://github.com/huggingface/pytorch-transformers/tree/master/pytorch_transformers/tests>`_ and examples tests in the `examples folder <https://github.com/huggingface/pytorch-transformers/tree/master/examples>`_.
+An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the `tests folder <https://github.com/huggingface/pytorch-transformers/tree/master/pytorch_transformers/tests>`_ and examples tests in the `examples folder <https://github.com/huggingface/pytorch-transformers/tree/master/examples>`_.
 
-These tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
+Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
 
-You can run the tests from the root of the cloned repository with the commands:
+Run all the tests from the root of the cloned repository with the commands:
 
 .. code-block:: bash
 
@@ -42,11 +42,11 @@ You can run the tests from the root of the cloned repository with the commands:
 OpenAI GPT original tokenization workflow
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-If you want to reproduce the original tokenization process of the ``OpenAI GPT`` paper, you will need to install ``ftfy`` (limit to version 4.4.3 if you are using Python 2) and ``SpaCy`` :
+If you want to reproduce the original tokenization process of the ``OpenAI GPT`` paper, you will need to install ``ftfy`` (use version 4.4.3 if you are using Python 2) and ``SpaCy`` :
 
 .. code-block:: bash
 
    pip install spacy ftfy==4.4.3
    python -m spacy download en
 
-If you don't install ``ftfy`` and ``SpaCy``\ , the ``OpenAI GPT`` tokenizer will default to tokenize using BERT's ``BasicTokenizer`` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
+If you don't install ``ftfy`` and ``SpaCy``\ , the ``OpenAI GPT`` tokenizer defaults to tokenize using BERT's ``BasicTokenizer`` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
diff --git a/docs/source/main_classes/configuration.rst b/docs/source/main_classes/configuration.rst
new file mode 100644
index 0000000000..5e069629b8
--- /dev/null
+++ b/docs/source/main_classes/configuration.rst
@@ -0,0 +1,10 @@
+Configuration
+----------------------------------------------------
+
+We provide a base class, ``PretrainedConfig``, which can load a pretrained instance either from a local file or directory or from a pretrained model configuration provided by the library (downloaded from HuggingFace AWS S3 repository).
+
+``PretrainedConfig``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.PretrainedConfig
+    :members:
diff --git a/docs/source/main_classes/model.rst b/docs/source/main_classes/model.rst
new file mode 100644
index 0000000000..dd4c9d87dd
--- /dev/null
+++ b/docs/source/main_classes/model.rst
@@ -0,0 +1,8 @@
+Models
+----------------------------------------------------
+
+``PreTrainedModel``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.PreTrainedModel
+    :members:
diff --git a/docs/source/main_classes/optimizer_schedules.rst b/docs/source/main_classes/optimizer_schedules.rst
new file mode 100644
index 0000000000..2d91d495a4
--- /dev/null
+++ b/docs/source/main_classes/optimizer_schedules.rst
@@ -0,0 +1,26 @@
+Optimizer
+----------------------------------------------------
+
+``AdamW``
+~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.AdamW
+    :members:
+
+Schedules
+----------------------------------------------------
+
+.. autoclass:: pytorch_transformers.ConstantLRSchedule
+    :members:
+
+.. autoclass:: pytorch_transformers.WarmupConstantSchedule
+    :members:
+
+.. autoclass:: pytorch_transformers.WarmupCosineSchedule
+    :members:
+
+.. autoclass:: pytorch_transformers.WarmupCosineWithHardRestartsSchedule
+    :members:
+
+.. autoclass:: pytorch_transformers.WarmupLinearSchedule
+    :members:
diff --git a/docs/source/main_classes/tokenizer.rst b/docs/source/main_classes/tokenizer.rst
new file mode 100644
index 0000000000..cd6b4786bb
--- /dev/null
+++ b/docs/source/main_classes/tokenizer.rst
@@ -0,0 +1,8 @@
+Tokenizer
+----------------------------------------------------
+
+``PreTrainedTokenizer``
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.PreTrainedTokenizer
+    :members:
diff --git a/docs/source/migration.md b/docs/source/migration.md
index fff4807d5c..ba09253472 100644
--- a/docs/source/migration.md
+++ b/docs/source/migration.md
@@ -35,10 +35,13 @@ loss, logits, attentions = outputs
 
 ### Serialization
 
-Breaking change: Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method.
-To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
+Breaking changes in the `from_pretrained()` method:
 
-Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other seralization method before.
+1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
+
+2. The additional `*inputs` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute first, which can break derived model classes built on the previous `BertForSequenceClassification` examples. More precisely, the positional arguments `*inputs` provided to `from_pretrained()` are directly forwarded to the model `__init__()` method, while the keyword arguments `**kwargs` (i) which match configuration class attributes are used to update said attributes, and (ii) which don't match any configuration class attributes are forwarded to the model `__init__()` method.
+
+Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
 
 Here is an example:
 
diff --git a/docs/source/model_doc/bert.rst b/docs/source/model_doc/bert.rst
index 8c786aa24f..cbce74e73b 100644
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -15,12 +15,6 @@ BERT
     :members:
 
 
-``AdamW``
-~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_transformers.AdamW
-    :members:
-
 ``BertModel``
 ~~~~~~~~~~~~~~~~~~~~
 
diff --git a/docs/source/quickstart.md b/docs/source/quickstart.md
index 7414ef48c1..814021038a 100644
--- a/docs/source/quickstart.md
+++ b/docs/source/quickstart.md
@@ -1,17 +1,58 @@
 # Quickstart
 
+## Philosophy
+
+PyTorch-Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformers models.
+
+The library was designed with two strong goals in mind:
+
+- be as easy and fast to use as possible:
+
+  - we strongly limited the number of abstractions to learn, in fact there are almost no abstractions, just three standard classes for each model: configuration, models and tokenizer,
+  - each pretrained model configuration, weights and vocabulary can be downloaded, cached and loaded in the related class in a simple way by using a common `from_pretrained()` instantiation method.
+  - this library is NOT a modular toolbox of building blocks for neural nets, to extend/build-upon the library, just use your regular Python/PyTorch modules and inherit from the base classes of the library to reuse functionalities like model loading/saving.
+
+- provide state-of-the-art models with performances as close as possible to the original models:
+
+  - we provide at least one example for each model which reproduces a result provided by the official authors of said model,
+  - the code is usually as close to the original code base as possible which means some PyTorch code may be not as *pytorchic* as it could be as a result of being converted TensorFlow code.
+
+A few other goals:
+
+- expose the models' internals as consistently as possible:
+
+  - we give access, using a single API, to the full hidden-states and attention weights,
+  - tokenizer and base model's API are standardized to easily switch between models.
+
+- incorporate a subjective selection of promising tools for fine-tuning/investigating these models:
+
+  - a simple/consistent way to add new tokens to the vocabulary and embeddings for fine-tuning,
+  - simple ways to mask and prune transformer heads.
+
 ## Main concepts
 
+The library is built around three types of classes for each model:
+
+- **model classes** which are PyTorch models (`torch.nn.Module`) of the 6 model architectures currently provided in the library, e.g. `BertModel`
+- **configuration classes** which store all the parameters required to build a model, e.g. `BertConfig`
+- **tokenizer classes** which store the vocabulary for each model and provide methods for encoding strings into lists of token embedding indices to be fed to a model, e.g. `BertTokenizer`
+
+All these classes can be instantiated from pretrained instances and saved locally using two methods:
+
+- `from_pretrained()` lets you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/pytorch-transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
+- `save_pretrained()` lets you save a model/configuration/tokenizer locally so that it can be reloaded using `from_pretrained()`.
+
+Let's go through a few simple quick-start examples to see how we can instantiate and use these classes.
 
 ## Quick tour: Usage
 
-Here are two quick-start examples showcasing a few `Bert` and `GPT2` classes and pre-trained models.
+Here are two examples showcasing a few `Bert` and `GPT2` classes and pre-trained models.
 
-See package reference for examples for each model classe.
+See the full API reference for examples of each model class.
 
 ### BERT example
 
-First let's prepare a tokenized input from a text string using `BertTokenizer`
+Let's start by preparing a tokenized input (a list of token embeddings indices to be fed to Bert) from a text string using `BertTokenizer`
 
 ```python
 import torch
diff --git a/docs/source/serialization.rst b/docs/source/serialization.rst
index be5197135d..c0de1324cf 100644
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -1,3 +1,6 @@
+Serialization
+----------------------------------------------------
+
 ### Loading Google AI or OpenAI pre-trained weights or PyTorch dump
 
 ### `from_pretrained()` method
diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py
index f875e4ab18..c9b0aeebb7 100644
--- a/pytorch_transformers/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -5,7 +5,7 @@ from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
-from .tokenization_utils import (PreTrainedTokenizer, clean_up_tokenization)
+from .tokenization_utils import (PreTrainedTokenizer)
 
 from .modeling_bert import (BertConfig, BertModel, BertForPreTraining,
                        BertForMaskedLM, BertForNextSentencePrediction,
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index e458c5ef74..f21927e18c 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -55,11 +55,19 @@ else:
 
 class PretrainedConfig(object):
     """ Base class for all configuration classes.
-        Handle a few common parameters and methods for loading/downloading/saving configurations.
+        Handle a few common attributes and methods for loading/downloading/saving configurations.
     """
     pretrained_config_archive_map = {}
 
     def __init__(self, **kwargs):
+        r""" The initialization of :class:`~pytorch_transformers.PretrainedConfig` extracts
+            a few configuration attributes from `**kwargs` which are common to all models:
+                - `finetuning_task`: string, default `None`. Name of the task used to fine-tune the model (used to convert from original checkpoint)
+                - `num_labels`: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens)
+                - `output_attentions`: boolean, default `False`. Whether the model should return attention weights.
+                - `output_hidden_states`: boolean, default `False`. Whether the model should return all hidden-states.
+                - `torchscript`: boolean, default `False`. Whether the model is used with TorchScript.
+        """
         self.finetuning_task = kwargs.pop('finetuning_task', None)
         self.num_labels = kwargs.pop('num_labels', 2)
         self.output_attentions = kwargs.pop('output_attentions', False)
@@ -67,7 +75,7 @@ class PretrainedConfig(object):
         self.torchscript = kwargs.pop('torchscript', False)
 
     def save_pretrained(self, save_directory):
-        """ Save a configuration object to a directory, so that it
+        """ Save a configuration object to the directory `save_directory`, so that it
             can be re-loaded using the `from_pretrained(save_directory)` class method.
         """
         assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
@@ -81,30 +89,34 @@ class PretrainedConfig(object):
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         r""" Instantiate a PretrainedConfig from a pre-trained model configuration.
 
-        Params:
+        Parameters:
             **pretrained_model_name_or_path**: either:
-                - a string with the `shortcut name` of a pre-trained model configuration to load from cache
-                    or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
-                - a path to a `directory` containing a configuration file saved
-                    using the `save_pretrained(save_directory)` method.
-                - a path or url to a saved configuration `file`.
+
+                - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing a configuration file saved using the `save_pretrained(save_directory)` method, e.g.: ``./my_model_directory/``.
+                - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
+
             **cache_dir**: (`optional`) string:
                 Path to a directory in which a downloaded pre-trained model
                 configuration should be cached if the standard cache should not be used.
+
             **return_unused_kwargs**: (`optional`) bool:
+
                 - If False, then this function returns just the final configuration object.
-                - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs`
-                is a dictionary consisting of the key/value pairs whose keys are not configuration attributes:
-                ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
+                - If True, then this function returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes, i.e. the part of kwargs which has not been used to update `config` and is otherwise ignored.
+
             **kwargs**: (`optional`) dict:
                 Dictionary of key/value pairs with which to update the configuration object after loading.
+
                 - The values in kwargs of any keys which are configuration attributes will be used
-                to override the loaded values.
+                    to override the loaded values.
                 - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
-                by the `return_unused_kwargs` keyword parameter.
+                    by the `return_unused_kwargs` keyword parameter.
 
         Examples::
 
+            # We can't directly instantiate the base class `PretrainedConfig`, so let's show the examples on a
+            # derived class: BertConfig
             config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
             config = BertConfig.from_pretrained('./test/saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
             config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index d9cd881dfd..d7aeff7c39 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -22,7 +22,7 @@ import os
 import unicodedata
 from io import open
 
-from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer
 
 logger = logging.getLogger(__name__)
 
diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py
index 29a9ae7660..0aee856180 100644
--- a/pytorch_transformers/tokenization_gpt2.py
+++ b/pytorch_transformers/tokenization_gpt2.py
@@ -31,7 +31,7 @@ except ImportError:
     def lru_cache():
         return lambda func: func
 
-from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer
 
 logger = logging.getLogger(__name__)
 
diff --git a/pytorch_transformers/tokenization_transfo_xl.py b/pytorch_transformers/tokenization_transfo_xl.py
index 237f8ea387..992dff80d5 100644
--- a/pytorch_transformers/tokenization_transfo_xl.py
+++ b/pytorch_transformers/tokenization_transfo_xl.py
@@ -30,7 +30,7 @@ import torch
 import numpy as np
 
 from .file_utils import cached_path
-from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index eaef2fed1e..556f094f6d 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -444,7 +444,7 @@ class PreTrainedTokenizer(object):
         filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
         text = self.convert_tokens_to_string(filtered_tokens)
         if clean_up_tokenization_spaces:
-            text = clean_up_tokenization(text)
+            text = self.clean_up_tokenization(text)
         return text
 
     @property
@@ -480,10 +480,9 @@ class PreTrainedTokenizer(object):
         all_ids = list(self.convert_tokens_to_ids(t) for t in all_toks)
         return all_ids
 
-
-
-def clean_up_tokenization(out_string):
-    out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
-                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
-                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
-    return out_string
+    @staticmethod
+    def clean_up_tokenization(out_string):
+        out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
+                        ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
+                        ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
+        return out_string
diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
index a4f3fdfde2..919ac97bce 100644
--- a/pytorch_transformers/tokenization_xlnet.py
+++ b/pytorch_transformers/tokenization_xlnet.py
@@ -23,7 +23,7 @@ from shutil import copyfile
 import unicodedata
 import six
 
-from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer
 
 logger = logging.getLogger(__name__)