Reformer (#3351)

* first copy & past commit from Bert and morgans LSH code * add easy way to compare to trax original code * translate most of function * make trax lsh self attention deterministic with numpy seed + copy paste code * add same config * add same config * make layer init work * implemented hash_vectors function for lsh attention * continue reformer translation * hf LSHSelfAttentionLayer gives same output as trax layer * refactor code * refactor code * refactor code * refactor * refactor + add reformer config * delete bogus file * split reformer attention layer into two layers * save intermediate step * save intermediate step * make test work * add complete reformer block layer * finish reformer layer * implement causal and self mask * clean reformer test and refactor code * fix merge conflicts * fix merge conflicts * update init * fix device for GPU * fix chunk length init for tests * include morgans optimization * improve memory a bit * improve comment * factorize num_buckets * better testing parameters * make whole model work * make lm model work * add t5 copy paste tokenizer * add chunking feed forward * clean config * add improved assert statements * make tokenizer work * improve test * correct typo * extend config * add complexer test * add new axial position embeddings * add local block attention layer * clean tests * refactor * better testing * save intermediate progress * clean test file * make shorter input length work for model * allow variable input length * refactor * make forward pass for pretrained model work * add generation possibility * finish dropout and init * make style * refactor * add first version of RevNet Layers * make forward pass work and add convert file * make uploaded model forward pass work * make uploaded model forward pass work * refactor code * add namedtuples and cache buckets * correct head masks * refactor * made reformer more flexible * make style * remove set max length * add attention masks * fix up tests * fix lsh attention mask 
* make random seed optional for the moment * improve memory in reformer * add tests * make style * make sure masks work correctly * detach gradients * save intermediate * correct backprob through gather * make style * change back num hashes * rename to labels * fix rotation shape * fix detach * update * fix trainer * fix backward dropout * make reformer more flexible * fix conflict * fix * fix * add tests for fixed seed in reformer layer * fix trainer typo * fix typo in activations * add fp16 tests * add fp16 training * support fp16 * correct gradient bug in reformer * add fast gelu * re-add dropout for embedding dropout * better naming * better naming * renaming * finalize test branch * finalize tests * add more tests * finish tests * fix * fix type trainer * fix fp16 tests * fix tests * fix tests * fix tests * fix issue with dropout * fix dropout seeds * correct random seed on gpu * finalize random seed for dropout * finalize random seed for dropout * remove duplicate line * correct half precision bug * make style * refactor * refactor * docstring * remove sinusoidal position encodings for reformer * move chunking to modeling_utils * make style * clean config * make style * fix tests * fix auto tests * pretrained models * fix docstring * update conversion file * Update pretrained_models.rst * fix rst * fix rst * update copyright * fix test path * fix test path * fix small issue in test * include reformer in generation tests * add docs for axial position encoding * finish docs * Update convert_reformer_trax_checkpoint_to_pytorch.py * remove isort * include sams comments * remove wrong comment in utils * correct typos * fix typo * Update reformer.rst * applied morgans optimization * make style * make gpu compatible * remove bogus file * big test refactor * add example for chunking * fix typo * add to README
2020-05-07 10:17:01 +02:00
parent 877fc56410
commit dca34695d0
19 changed files with 3608 additions and 23 deletions
--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@@ -47,6 +47,7 @@ from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
 from .configuration_marian import MarianConfig
 from .configuration_mmbt import MMBTConfig
 from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig
+from .configuration_reformer import REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ReformerConfig
 from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
 from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
 from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig
@@ -138,6 +139,7 @@ from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast
 from .tokenization_flaubert import FlaubertTokenizer
 from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
 from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
+from .tokenization_reformer import ReformerTokenizer
 from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
 from .tokenization_t5 import T5Tokenizer
 from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer, TransfoXLTokenizerFast
@@ -159,7 +161,7 @@ if is_sklearn_available():

 # Modeling
 if is_torch_available():
-    from .modeling_utils import PreTrainedModel, prune_layer, Conv1D, top_k_top_p_filtering
+    from .modeling_utils import PreTrainedModel, prune_layer, Conv1D, top_k_top_p_filtering, apply_chunking_to_forward
    from .modeling_auto import (
        AutoModel,
        AutoModelForPreTraining,
@@ -190,6 +192,7 @@ if is_torch_available():
        BertForQuestionAnswering,
        load_tf_weights_in_bert,
        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        BertLayer,
    )
    from .modeling_openai import (
        OpenAIGPTPreTrainedModel,
@@ -320,6 +323,14 @@ if is_torch_available():
        ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP,
    )

+    from .modeling_reformer import (
+        ReformerAttention,
+        ReformerLayer,
+        ReformerModel,
+        ReformerModelWithLMHead,
+        REFORMER_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+
    # Optimization
    from .optimization import (
        AdamW,
--- a/src/transformers/activations.py
+++ b/src/transformers/activations.py
@@ -34,12 +34,18 @@ if torch.__version__ < "1.4.0":
 else:
    gelu = F.gelu

+
+def gelu_fast(x):
+    """Tanh-based approximation of the GELU activation, applied element-wise to ``x``."""
+    # 0.7978845608 ~= sqrt(2 / pi); the inner term equals sqrt(2 / pi) * (x + 0.044715 * x^3).
+    tanh_arg = x * 0.7978845608 * (1 + 0.044715 * x * x)
+    return 0.5 * x * (1 + torch.tanh(tanh_arg))
+
+
 ACT2FN = {
    "relu": F.relu,
    "swish": swish,
    "gelu": gelu,
    "tanh": torch.tanh,
    "gelu_new": gelu_new,
+    "gelu_fast": gelu_fast,
 }


--- a/src/transformers/configuration_auto.py
+++ b/src/transformers/configuration_auto.py
@@ -29,6 +29,7 @@ from .configuration_encoder_decoder import EncoderDecoderConfig
 from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig
 from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
 from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig
+from .configuration_reformer import ReformerConfig
 from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
 from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
 from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig
@@ -73,6 +74,7 @@ CONFIG_MAPPING = OrderedDict(
        ("camembert", CamembertConfig,),
        ("xlm-roberta", XLMRobertaConfig,),
        ("bart", BartConfig,),
+        ("reformer", ReformerConfig,),
        ("roberta", RobertaConfig,),
        ("flaubert", FlaubertConfig,),
        ("bert", BertConfig,),
@@ -130,6 +132,7 @@ class AutoConfig:
            - contains `camembert`: :class:`~transformers.CamembertConfig` (CamemBERT model)
            - contains `xlm-roberta`: :class:`~transformers.XLMRobertaConfig` (XLM-RoBERTa model)
            - contains `roberta`: :class:`~transformers.RobertaConfig` (RoBERTa model)
+            - contains `reformer`: :class:`~transformers.ReformerConfig` (Reformer model)
            - contains `bert`: :class:`~transformers.BertConfig` (Bert model)
            - contains `openai-gpt`: :class:`~transformers.OpenAIGPTConfig` (OpenAI GPT model)
            - contains `gpt2`: :class:`~transformers.GPT2Config` (OpenAI GPT-2 model)
--- a/src/transformers/configuration_reformer.py
+++ b/src/transformers/configuration_reformer.py
@@ -0,0 +1,210 @@
+# coding=utf-8
+# Copyright 2020 The Trax Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Reformer model configuration """
+
+
+import logging
+
+from .configuration_utils import PretrainedConfig
+
+
+logger = logging.getLogger(__name__)
+
+REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "google/reformer-crime-and-punishment": "https://cdn.huggingface.co/google/reformer-crime-and-punishment/config.json"
+}
+
+
+class ReformerConfig(PretrainedConfig):
+    r"""
+        This is the configuration class to store the configuration of a :class:`~transformers.ReformerModel`.
+        It is used to instantiate a Reformer model according to the specified arguments, defining the model
+        architecture.
+
+        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+        for more information.
+
+        Args:
+            attention_head_size (:obj:`int`, optional, defaults to 64):
+                Dimensionality of the projected key, query and value vectors
+            attn_layers (:obj:`list(str)`, optional, defaults to ["local", "lsh", "local", "lsh", "local", "lsh"]):
+                List of attention layer types in ascending order. It can be chosen between a
+                LSHSelfAttention layer ("lsh") and a LocalSelfAttention layer ("local").
+                The number of hidden layers is the length of this list.
+                For more information on LSHSelfAttention layer, see `LSH Self Attention <reformer.html#lsh-self-attention>`__ .
+                For more information on LocalSelfAttention layer, see `Local Self Attention <reformer.html#local-sensitive-hashing-self-attention>`__ .
+            axial_pos_embds (:obj:`bool`, optional, defaults to True):
+                If `True` use axial position embeddings. For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__
+            axial_norm_std (:obj:`float`, optional, defaults to 1.0):
+                The standard deviation of the normal_initializer for initializing the weight matrices of the axial positional encodings.
+            axial_pos_shape (:obj:`list(int)`, optional, defaults to `[64, 64]`):
+                The position dims of the axial position encodings.
+                During training the product of the position dims has to equal the sequence length.
+                For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__ .
+            axial_pos_embds_dim (:obj:`list(int)`, optional, defaults to `[64, 192]`):
+                The embedding dims of the axial position encodings.
+                The sum of the embedding dims has to equal the hidden size.
+                For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__ .
+            chunk_size_lm_head (:obj:`int`, optional, defaults to 0):
+                The chunk size of the final language model feed forward head layer.
+                A chunk size of 0 means that the feed forward layer is not chunked.
+                A chunk size of n means that the feed forward layer processes n < sequence_length embeddings at a time.
+                For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ .
+            chunk_size_feed_forward (:obj:`int`, optional, defaults to 0):
+                The chunk size of all feed forward layers in the residual attention blocks.
+                A chunk size of 0 means that the feed forward layer is not chunked.
+                A chunk size of n means that the feed forward layer processes n < sequence_length embeddings at a time.
+                For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ .
+            eos_token_id (:obj:`int`, optional, defaults to 2):
+                The token id for the <EOS> token.
+            feed_forward_size (:obj:`int`, optional, defaults to 512):
+                Dimensionality of the "feed_forward" (i.e., feed-forward) layer in the residual attention block.
+            hash_seed (:obj:`int`, optional, defaults to `None`):
+                Seed that can be used to make local sensitive hashing in LSHSelfAttention deterministic. This should only be set for testing purposes. For evaluation and training purposes `hash_seed` should be set to `None` to ensure fully random rotations in local sensitive hashing scheme.
+            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
+                The non-linear activation function (function or string) in the feed forward layer in the residual attention block.
+                If string, "gelu", "relu", "swish", "gelu_new" and "gelu_fast" are supported.
+            hidden_dropout_prob (:obj:`float`, optional, defaults to 0.05):
+                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+            hidden_size (:obj:`int`, optional, defaults to 256):
+                Dimensionality of the output hidden states of the residual attention blocks.
+            initializer_range (:obj:`float`, optional, defaults to 0.02):
+                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+            is_decoder (:obj:`bool`, optional, defaults to False):
+                If `is_decoder` is True, a causal mask is used in addition to `attention_mask`.
+                When using the Reformer for causal language modeling, `is_decoder` is set to `True`.
+            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+                The epsilon used by the layer normalization layers.
+            local_attn_chunk_length (:obj:`int`, optional, defaults to 64):
+                Length of chunk which attends to itself in LocalSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).
+            local_num_chunks_before (:obj:`int`, optional, defaults to 1):
+                Number of previous neighbouring chunks to attend to in LocalSelfAttention layer in addition to itself.
+            local_num_chunks_after (:obj:`int`, optional, defaults to 0):
+                Number of following neighbouring chunks to attend to in LocalSelfAttention layer in addition to itself.
+            local_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.05):
+                The dropout ratio for the attention probabilities in LocalSelfAttention.
+            lsh_attn_chunk_length (:obj:`int`, optional, defaults to 64):
+                Length of chunk which attends to itself in LSHSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).
+            lsh_num_chunks_before (:obj:`int`, optional, defaults to 1):
+                Number of previous neighbouring chunks to attend to in LSHSelfAttention layer in addition to itself.
+            lsh_num_chunks_after (:obj:`int`, optional, defaults to 0):
+                Number of following neighbouring chunks to attend to in LSHSelfAttention layer in addition to itself.
+            lsh_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.0):
+                The dropout ratio for the attention probabilities in LSHSelfAttention.
+            max_position_embeddings (:obj:`int`, optional, defaults to 4096):
+                The maximum sequence length that this model might ever be used with.
+                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+            num_attention_heads (:obj:`int`, optional, defaults to 2):
+                Number of attention heads for each attention layer in the Transformer encoder.
+            num_buckets (:obj:`int` or :obj:`list(int)`, optional, defaults to `32`):
+                Number of buckets, the key query vectors can be "hashed into" using the locality sensitive hashing scheme. Each query key vector is hashed into a hash in `1, ..., num_buckets`.
+                The number of buckets can also be factorized into a list for improved memory complexity. In this case, each query key vector is hashed into a hash in `1-1, 1-2, ..., num_buckets[0]-1, ..., num_buckets[0]-num_buckets[1]` if `num_buckets` is factorized into two factors.
+                The number of buckets (or the product the factors) should approximately equal sequence length / lsh_attn_chunk_length.
+            num_hashes (:obj:`int`, optional, defaults to 1):
+                Number of hashing rounds (e.g. number of random rotations) in Local Sensitive Hashing scheme.
+                The higher `num_hashes`, the more accurate the `LSHSelfAttention` becomes, but also the more memory and time intensive the hashing becomes.
+            pad_token_id (:obj:`int`, optional, defaults to 0):
+                The token id for the <PAD> token.
+            vocab_size (:obj:`int`, optional, defaults to 320):
+                Vocabulary size of the Reformer model. Defines the different tokens that
+                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ReformerModel`.
+
+        Example::
+
+            from transformers import ReformerModel, ReformerConfig
+
+            # Initializing a Reformer configuration
+            configuration = ReformerConfig()
+
+            # Initializing a Reformer model
+            model = ReformerModel(configuration)
+
+            # Accessing the model configuration
+            configuration = model.config
+
+        Attributes:
+            pretrained_config_archive_map (Dict[str, str]):
+                A dictionary containing all the available pre-trained checkpoints.
+    """
+    pretrained_config_archive_map = REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP
+    model_type = "reformer"
+
+    def __init__(
+        self,
+        attention_head_size=64,
+        attn_layers=["local", "lsh", "local", "lsh", "local", "lsh"],
+        axial_norm_std=1.0,
+        axial_pos_embds=True,
+        axial_pos_shape=[64, 64],
+        axial_pos_embds_dim=[64, 192],
+        chunk_size_lm_head=0,
+        chunk_size_feed_forward=0,
+        eos_token_id=2,
+        feed_forward_size=512,
+        hash_seed=None,
+        hidden_act="relu",
+        hidden_dropout_prob=0.05,
+        hidden_size=256,
+        initializer_range=0.02,
+        is_decoder=False,
+        layer_norm_eps=1e-12,
+        local_num_chunks_before=1,
+        local_num_chunks_after=0,
+        local_attention_probs_dropout_prob=0.05,
+        local_attn_chunk_length=64,
+        lsh_attn_chunk_length=64,
+        lsh_attention_probs_dropout_prob=0.0,
+        lsh_num_chunks_before=1,
+        lsh_num_chunks_after=0,
+        max_position_embeddings=4096,
+        num_attention_heads=2,
+        num_buckets=32,
+        num_hashes=1,
+        pad_token_id=0,
+        vocab_size=320,
+        **kwargs
+    ):
+        # NOTE: the list-valued defaults above are shared between calls; they are only ever
+        # read here and copied (``list(...)`` / ``tuple(...)``) before being stored.
+        super().__init__(pad_token_id=pad_token_id, eos_token_id=eos_token_id, is_decoder=is_decoder, **kwargs)
+
+        self.hash_seed = hash_seed
+        self.vocab_size = vocab_size
+        self.attention_head_size = attention_head_size
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+        self.num_hashes = num_hashes
+        # One hidden layer per entry of ``attn_layers``.
+        self.num_hidden_layers = len(attn_layers)
+        self.num_buckets = tuple(num_buckets) if isinstance(num_buckets, list) else num_buckets
+        self.lsh_attn_chunk_length = lsh_attn_chunk_length
+        self.local_attn_chunk_length = local_attn_chunk_length
+        self.lsh_num_chunks_after = lsh_num_chunks_after
+        self.lsh_num_chunks_before = lsh_num_chunks_before
+        self.local_num_chunks_after = local_num_chunks_after
+        self.local_num_chunks_before = local_num_chunks_before
+        self.hidden_act = hidden_act
+        self.feed_forward_size = feed_forward_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.lsh_attention_probs_dropout_prob = lsh_attention_probs_dropout_prob
+        self.local_attention_probs_dropout_prob = local_attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.axial_pos_embds = axial_pos_embds
+        self.axial_pos_shape = tuple(axial_pos_shape)
+        self.axial_pos_embds_dim = tuple(axial_pos_embds_dim)
+        self.axial_norm_std = axial_norm_std
+        self.chunk_size_lm_head = chunk_size_lm_head
+        self.chunk_size_feed_forward = chunk_size_feed_forward
+        # Store a copy: keeping a reference would alias the shared default list (or the
+        # caller's list), so mutating ``config.attn_layers`` would leak into other configs.
+        self.attn_layers = list(attn_layers)
--- a/src/transformers/convert_reformer_trax_checkpoint_to_pytorch.py
+++ b/src/transformers/convert_reformer_trax_checkpoint_to_pytorch.py
@@ -0,0 +1,211 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert Reformer checkpoint."""
+
+
+import argparse
+import logging
+import pickle
+
+import numpy as np
+import torch
+
+from transformers import ReformerConfig, ReformerModelWithLMHead
+
+
+logging.basicConfig(level=logging.INFO)
+
+
+def set_param(torch_layer, weight, bias=None):
+    """Overwrite ``torch_layer.weight`` (and ``torch_layer.bias`` when ``bias`` is given).
+
+    Each tensor must have the same shape as the parameter it replaces, otherwise an
+    ``AssertionError`` is raised. The tensors are wrapped in fresh ``torch.nn.Parameter`` objects.
+    """
+    weight_shape_ok = torch_layer.weight.shape == weight.shape
+    assert weight_shape_ok, "{} layer.weight does not match".format(torch_layer)
+    torch_layer.weight = torch.nn.Parameter(weight)
+
+    # nothing more to do for layers that are converted without a bias
+    if bias is None:
+        return
+
+    bias_shape_ok = torch_layer.bias.shape == bias.shape
+    assert bias_shape_ok, "{} layer.bias does not match".format(torch_layer)
+    torch_layer.bias = torch.nn.Parameter(bias)
+
+
+def set_layer_weights_in_torch_lsh(weights, torch_layer, hidden_size):
+    """Copy the (query_key, value, output dense) weights of an LSH attention layer into ``torch_layer``.
+
+    ``weights[0]``, ``weights[1]`` and ``weights[2]`` are converted to numpy arrays, reshaped so that
+    their last dimension is ``hidden_size`` and assigned through ``set_param``.
+    """
+    # convert all arrays up front, before any parameter is touched
+    query_key_arr = np.asarray(weights[0])
+    value_arr = np.asarray(weights[1])
+    dense_arr = np.asarray(weights[2])
+
+    def to_projection(arr):
+        # swap dims 1 and 2, then flatten everything but the hidden dimension
+        return torch.tensor(arr).transpose(1, 2).contiguous().view(-1, hidden_size)
+
+    set_param(torch_layer.self_attention.query_key, to_projection(query_key_arr))
+    set_param(torch_layer.self_attention.value, to_projection(value_arr))
+    # the output projection is flattened first and transposed afterwards
+    set_param(
+        torch_layer.output.dense, torch.tensor(dense_arr).view(-1, hidden_size).contiguous().transpose(0, 1),
+    )
+
+
+def set_layer_weights_in_torch_local(weights, torch_layer, hidden_size):
+    """Copy the (query, key, value, output dense) weights of a local attention layer into ``torch_layer``.
+
+    ``weights[0]`` .. ``weights[3]`` are converted to numpy arrays, reshaped so that their last
+    dimension is ``hidden_size`` and assigned through ``set_param``.
+    """
+    # convert all arrays up front, before any parameter is touched
+    query_arr = np.asarray(weights[0])
+    key_arr = np.asarray(weights[1])
+    value_arr = np.asarray(weights[2])
+    dense_arr = np.asarray(weights[3])
+
+    def to_projection(arr):
+        # swap dims 1 and 2, then flatten everything but the hidden dimension
+        return torch.tensor(arr).transpose(1, 2).contiguous().view(-1, hidden_size)
+
+    set_param(torch_layer.self_attention.query, to_projection(query_arr))
+    set_param(torch_layer.self_attention.key, to_projection(key_arr))
+    set_param(torch_layer.self_attention.value, to_projection(value_arr))
+    # the output projection is flattened first and transposed afterwards
+    set_param(
+        torch_layer.output.dense, torch.tensor(dense_arr).view(-1, hidden_size).contiguous().transpose(0, 1),
+    )
+
+
+def set_block_weights_in_torch(weights, torch_block, hidden_size):
+    """Copy the trax weights of one block into ``torch_block``.
+
+    Sets, in order: the attention layer norm, the attention projections, the feed forward
+    layer norm, the intermediate dense layer and the feed forward output dense layer.
+
+    Args:
+        weights: nested sequence holding the trax weights of a single block (the caller in this
+            file passes a slice of four consecutive entries of the trax layer weights).
+        torch_block: block whose ``attention`` and ``feed_forward`` sub-modules are overwritten.
+        hidden_size: forwarded to the attention weight setters, which use it for reshaping.
+    """
+    # layernorm 1
+    layer_norm_1 = weights[0][0][0]
+    layer_norm_1_weight = np.asarray(layer_norm_1[0])
+    layer_norm_1_bias = np.asarray(layer_norm_1[1])
+    set_param(
+        torch_block.attention.layer_norm, torch.tensor(layer_norm_1_weight), torch.tensor(layer_norm_1_bias),
+    )
+
+    # attention weights + output: fewer than 4 arrays is handled as LSH attention
+    # (query_key, value, dense), otherwise as local attention (query, key, value, dense)
+    attn_weights = weights[0][1]
+    if len(attn_weights) < 4:
+        set_layer_weights_in_torch_lsh(attn_weights, torch_block.attention, hidden_size)
+    else:
+        set_layer_weights_in_torch_local(attn_weights, torch_block.attention, hidden_size)
+
+    # intermediate weights
+    intermediate_weights = weights[2][0][2][2]
+
+    # Chunked Feed Forward
+    # NOTE(review): presumably a chunked feed forward nests the weights one level deeper,
+    # which is why a length of 4 is unwrapped here - confirm against the trax layer layout.
+    if len(intermediate_weights) == 4:
+        intermediate_weights = intermediate_weights[2]
+
+    # layernorm 2
+    layer_norm_2_weight = np.asarray(intermediate_weights[0][0])
+    layer_norm_2_bias = np.asarray(intermediate_weights[0][1])
+    set_param(
+        torch_block.feed_forward.layer_norm, torch.tensor(layer_norm_2_weight), torch.tensor(layer_norm_2_bias),
+    )
+
+    # intermediate dense (weight is transposed before being assigned)
+    inter_dense_weight = np.asarray(intermediate_weights[1][0])
+    inter_dense_bias = np.asarray(intermediate_weights[1][1])
+    set_param(
+        torch_block.feed_forward.dense.dense,
+        torch.tensor(inter_dense_weight).transpose(0, 1).contiguous(),
+        torch.tensor(inter_dense_bias),
+    )
+
+    # intermediate out (weight is transposed before being assigned)
+    out_dense_weight = np.asarray(intermediate_weights[4][0])
+    out_dense_bias = np.asarray(intermediate_weights[4][1])
+    set_param(
+        torch_block.feed_forward.output.dense,
+        torch.tensor(out_dense_weight).transpose(0, 1).contiguous(),
+        torch.tensor(out_dense_bias),
+    )
+
+
+def set_model_weights_in_torch(weights, torch_model, hidden_size):
+    """Copy a full set of trax weights into ``torch_model``.
+
+    Reads the word embeddings from ``weights[1]``, the position embeddings from ``weights[3]``
+    (only when that entry is a tuple), the per-block weights from ``weights[5]`` (four entries per
+    block plus one extra entry) and the final layer norm / output embedding from ``weights[6]``.
+
+    Args:
+        weights: nested sequence of trax weights.
+        torch_model: model exposing ``reformer`` (embeddings + encoder) and ``lm_head.decoder``.
+        hidden_size: forwarded to ``set_block_weights_in_torch`` for reshaping attention weights.
+
+    Raises:
+        AssertionError: if a shape or the number of layers does not match.
+    """
+    # reformer model
+    torch_model_reformer = torch_model.reformer
+
+    # word embeds
+    word_embeddings = np.asarray(weights[1])
+    set_param(
+        torch_model_reformer.embeddings.word_embeddings, torch.tensor(word_embeddings),
+    )
+
+    if isinstance(weights[3], tuple):
+        position_embeddings = torch_model_reformer.embeddings.position_embeddings
+        for emb_idx in range(len(position_embeddings.weights)):
+            emb_weights = np.asarray(weights[3][emb_idx][0])
+            # Report the same parameter whose shape is checked (``.weights[emb_idx]``); the
+            # embedding module itself is only ever accessed through ``.weights`` here.
+            assert position_embeddings.weights[emb_idx].shape == emb_weights.shape, "{} emb does not match".format(
+                position_embeddings.weights[emb_idx]
+            )
+            position_embeddings.weights[emb_idx] = torch.nn.Parameter(torch.tensor(emb_weights))
+
+    trax_layer_weights = weights[5]
+    # every torch layer consumes four consecutive trax entries; one extra entry remains
+    assert len(torch_model_reformer.encoder.layers) * 4 + 1 == len(
+        trax_layer_weights
+    ), "HF and trax model do not have the same number of layers"
+    for layer_idx, layer in enumerate(torch_model_reformer.encoder.layers):
+        block_weights = trax_layer_weights[4 * layer_idx : 4 * (layer_idx + 1)]
+        set_block_weights_in_torch(block_weights, layer, hidden_size)
+
+    # output weights
+    out_weights = weights[6]
+
+    # output layer norm
+    layer_norm_out_weight = np.asarray(out_weights[0][0])
+    layer_norm_out_bias = np.asarray(out_weights[0][1])
+    set_param(
+        torch_model_reformer.encoder.layer_norm,
+        torch.tensor(layer_norm_out_weight),
+        torch.tensor(layer_norm_out_bias),
+    )
+
+    # output embeddings (weight is transposed before being assigned)
+    output_embed_weights = np.asarray(out_weights[2][0])
+    output_embed_bias = np.asarray(out_weights[2][1])
+    set_param(
+        torch_model.lm_head.decoder,
+        torch.tensor(output_embed_weights).transpose(0, 1).contiguous(),
+        torch.tensor(output_embed_bias),
+    )
+
+
+def convert_trax_checkpoint_to_pytorch(trax_model_pkl_path, config_file, pytorch_dump_path):
+    """Build a ``ReformerModelWithLMHead`` from ``config_file``, load the pickled trax weights
+    from ``trax_model_pkl_path`` into it and save the resulting state dict to ``pytorch_dump_path``.
+    """
+    # Initialise PyTorch model
+    config = ReformerConfig.from_json_file(config_file)
+    print("Building PyTorch model from configuration: {}".format(str(config)))
+    model = ReformerModelWithLMHead(config)
+
+    # SECURITY: unpickling executes arbitrary code - only convert checkpoints from a trusted source.
+    with open(trax_model_pkl_path, "rb") as checkpoint_file:
+        checkpoint = pickle.load(checkpoint_file)
+        model_weights = checkpoint["weights"]
+
+    set_model_weights_in_torch(model_weights, model, config.hidden_size)
+
+    # Save pytorch-model
+    print("Save PyTorch model to {}".format(pytorch_dump_path))
+    state_dict = model.state_dict()
+    torch.save(state_dict, pytorch_dump_path)
+
+
+if __name__ == "__main__":
+    # Command line entry point: all three paths are mandatory.
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument(
+        "--trax_model_pkl_path",
+        default=None,
+        type=str,
+        required=True,
+        # the file is read with ``pickle.load`` and must contain a "weights" entry
+        help="Path to the pickled trax model checkpoint.",
+    )
+    parser.add_argument(
+        "--config_file",
+        default=None,
+        type=str,
+        required=True,
+        help="The config json file corresponding to the pre-trained Reformer model. \n"
+        "This specifies the model architecture.",
+    )
+    parser.add_argument(
+        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
+    )
+    args = parser.parse_args()
+    convert_trax_checkpoint_to_pytorch(args.trax_model_pkl_path, args.config_file, args.pytorch_dump_path)
--- a/src/transformers/modeling_auto.py
+++ b/src/transformers/modeling_auto.py
@@ -31,6 +31,7 @@ from .configuration_auto import (
    FlaubertConfig,
    GPT2Config,
    OpenAIGPTConfig,
+    ReformerConfig,
    RobertaConfig,
    T5Config,
    TransfoXLConfig,
@@ -97,6 +98,7 @@ from .modeling_flaubert import (
 )
 from .modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2LMHeadModel, GPT2Model
 from .modeling_openai import OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OpenAIGPTLMHeadModel, OpenAIGPTModel
+from .modeling_reformer import ReformerModel, ReformerModelWithLMHead
 from .modeling_roberta import (
    ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
    RobertaForMaskedLM,
@@ -179,6 +181,7 @@ MODEL_MAPPING = OrderedDict(
        (XLMConfig, XLMModel),
        (CTRLConfig, CTRLModel),
        (ElectraConfig, ElectraModel),
+        (ReformerConfig, ReformerModel),
    ]
 )

@@ -222,6 +225,7 @@ MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
        (CTRLConfig, CTRLLMHeadModel),
        (ElectraConfig, ElectraForMaskedLM),
        (EncoderDecoderConfig, EncoderDecoderModel),
+        (ReformerConfig, ReformerModelWithLMHead),
    ]
 )

--- a/src/transformers/modeling_reformer.py
+++ b/src/transformers/modeling_reformer.py
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -13,8 +13,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""PyTorch BERT model."""

+import inspect
 import logging
 import os
 from typing import Callable, Tuple
@@ -175,7 +175,7 @@ class ModuleUtilsMixin:
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        return extended_attention_mask

-    def get_head_mask(self, head_mask, num_hidden_layers):
+    def get_head_mask(self, head_mask, num_hidden_layers, is_attention_chunked=False):
        """
        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
@@ -189,6 +189,8 @@ class ModuleUtilsMixin:
        """
        if head_mask is not None:
            head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers)
+            if is_attention_chunked is True:
+                head_mask = head_mask.unsqueeze(-1)
        else:
            head_mask = [None] * num_hidden_layers

@@ -786,6 +788,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
        attention_mask=None,
        decoder_start_token_id=None,
        use_cache=None,
+        **model_specific_kwargs
    ):
        r""" Generates sequences for models with a LM head. The method currently supports greedy decoding, beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling.

@@ -863,6 +866,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
            use_cache: (`optional`) bool
                If `use_cache` is True, past key values are used to speed up decoding if applicable to model. Defaults to `True`.

+            model_specific_kwargs: (`optional`) dict
+                Additional model specific kwargs will be forwarded to the `forward` function of the model.
+
        Return:

            output: `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`
@@ -1116,6 +1122,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                encoder_outputs=encoder_outputs,
                attention_mask=attention_mask,
                use_cache=use_cache,
+                model_specific_kwargs=model_specific_kwargs,
            )
        else:
            output = self._generate_no_beam_search(
@@ -1138,6 +1145,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                encoder_outputs=encoder_outputs,
                attention_mask=attention_mask,
                use_cache=use_cache,
+                model_specific_kwargs=model_specific_kwargs,
            )

        return output
@@ -1163,6 +1171,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
        encoder_outputs,
        attention_mask,
        use_cache,
+        model_specific_kwargs,
    ):
        """ Generate sequences for each example without beam search (num_beams == 1).
            All returned sequence are generated independantly.
@@ -1175,7 +1184,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):

        while cur_len < max_length:
            model_inputs = self.prepare_inputs_for_generation(
-                input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache
+                input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_specific_kwargs
            )

            outputs = self(**model_inputs)
@@ -1288,6 +1297,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
        encoder_outputs,
        attention_mask,
        use_cache,
+        model_specific_kwargs,
    ):
        """ Generate sequences for each example with beam search.
        """
@@ -1314,7 +1324,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):

        while cur_len < max_length:
            model_inputs = self.prepare_inputs_for_generation(
-                input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache
+                input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_specific_kwargs
            )
            outputs = self(**model_inputs)  # (batch_size * num_beams, cur_len, vocab_size)
            next_token_logits = outputs[0][:, -1, :]  # (batch_size * num_beams, vocab_size)
@@ -2087,3 +2097,66 @@ def prune_layer(layer, index, dim=None):
        return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim)
    else:
        raise ValueError("Can't prune layer of class {}".format(layer.__class__))
+
+
+def apply_chunking_to_forward(
+    chunk_size: int, chunk_dim: int, forward_fn: Callable[..., torch.Tensor], *input_tensors
+) -> torch.Tensor:
+    """
+    This function chunks the `input_tensors` into smaller input tensor parts of size `chunk_size` over the dimension `chunk_dim`.
+    It then applies a layer `forward_fn` to each chunk independently to save memory and concatenates the outputs
+    over `chunk_dim`. If the `forward_fn` is independent across the `chunk_dim` this function will yield the
+    same result as not applying it.
+
+    Args:
+        chunk_size: int - size of every chunk along `chunk_dim`; if it is not positive, `forward_fn` is applied unchunked
+        chunk_dim: int - the dimension over which the input_tensors should be chunked
+        forward_fn: fn - the forward fn of the model; its signature must have exactly `len(input_tensors)` parameters
+        input_tensors: tuple(torch.Tensor) - the input tensors of `forward_fn` which are chunked, all of the same shape
+    Returns:
+        a Tensor with the same shape the forward_fn would have given if applied to the unchunked `input_tensors`
+    Raises:
+        AssertionError: on no inputs, mismatched shapes or arity, or a `chunk_dim` size not divisible by `chunk_size`
+
+    Examples::
+
+        # rename the usual forward() fn to forward_chunk()
+        def forward_chunk(self, hidden_states):
+            hidden_states = self.decoder(hidden_states)
+            return hidden_states
+
+        # implement a chunked forward function
+        def forward(self, hidden_states):
+            return apply_chunking_to_forward(self.chunk_size_lm_head, self.seq_len_dim, self.forward_chunk, hidden_states)
+    """
+    assert len(input_tensors) > 0, "{} has to be a tuple/list of tensors".format(input_tensors)
+    tensor_shape = input_tensors[0].shape
+    assert all(
+        input_tensor.shape == tensor_shape for input_tensor in input_tensors
+    ), "All input tensors have to be of the same shape"
+
+    # inspect.signature exists since python 3.5 and is a python method -> no problem with backward compatibility
+    num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters)
+    assert num_args_in_forward_chunk_fn == len(
+        input_tensors
+    ), "forward_chunk_fn expects {} arguments, but {} input tensors are given".format(
+        num_args_in_forward_chunk_fn, len(input_tensors)
+    )
+
+    if chunk_size > 0:
+        assert (
+            tensor_shape[chunk_dim] % chunk_size == 0
+        ), "The dimension to be chunked {} has to be a multiple of the chunk size {}".format(
+            tensor_shape[chunk_dim], chunk_size
+        )
+
+        num_chunks = tensor_shape[chunk_dim] // chunk_size
+
+        # chunk input tensor into tuples
+        input_tensors_chunks = tuple(input_tensor.chunk(num_chunks, dim=chunk_dim) for input_tensor in input_tensors)
+        # apply forward fn to every tuple
+        output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks))
+        # concatenate output at same dimension
+        return torch.cat(output_chunks, dim=chunk_dim)
+
+    return forward_fn(*input_tensors)
--- a/src/transformers/tokenization_auto.py
+++ b/src/transformers/tokenization_auto.py
@@ -30,6 +30,7 @@ from .configuration_auto import (
    FlaubertConfig,
    GPT2Config,
    OpenAIGPTConfig,
+    ReformerConfig,
    RobertaConfig,
    T5Config,
    TransfoXLConfig,
@@ -49,6 +50,7 @@ from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast
 from .tokenization_flaubert import FlaubertTokenizer
 from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
 from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
+from .tokenization_reformer import ReformerTokenizer
 from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
 from .tokenization_t5 import T5Tokenizer
 from .tokenization_transfo_xl import TransfoXLTokenizer, TransfoXLTokenizerFast
@@ -69,6 +71,7 @@ TOKENIZER_MAPPING = OrderedDict(
        (XLMRobertaConfig, (XLMRobertaTokenizer, None)),
        (BartConfig, (BartTokenizer, None)),
        (RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)),
+        (ReformerConfig, (ReformerTokenizer, None)),
        (ElectraConfig, (ElectraTokenizer, ElectraTokenizerFast)),
        (BertConfig, (BertTokenizer, BertTokenizerFast)),
        (OpenAIGPTConfig, (OpenAIGPTTokenizer, OpenAIGPTTokenizerFast)),
--- a/src/transformers/tokenization_reformer.py
+++ b/src/transformers/tokenization_reformer.py
@@ -0,0 +1,179 @@
+# coding=utf-8
+# Copyright 2020 The Trax Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Tokenization class for model Reformer."""
+
+
+import logging
+import os
+from shutil import copyfile
+
+from .tokenization_utils import PreTrainedTokenizer
+
+
+logger = logging.getLogger(__name__)
+
+SPIECE_UNDERLINE = "▁"  # presumably SentencePiece's word-boundary marker; not referenced elsewhere in this file
+
+
+####################################################
+# Mapping from the keyword argument names of the Tokenizer `__init__`
+# to the file names used when serializing Tokenizer instances
+####################################################
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
+
+####################################################
+# Mapping from the keyword argument names of the Tokenizer `__init__`
+# to the pretrained vocabulary URL of every model shortcut name.
+####################################################
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "google/reformer-crime-and-punishment": "https://cdn.huggingface.co/google/reformer-crime-and-punishment/spiece.model"
+    }
+}
+
+####################################################
+# Mapping from model shortcut names to the maximum input length (exposed as `max_model_input_sizes`)
+####################################################
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "google/reformer-crime-and-punishment": 524288,
+}
+
+
+class ReformerTokenizer(PreTrainedTokenizer):
+    """
+        Constructs a Reformer tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__ .
+
+        This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+        should refer to the superclass for more information regarding methods.
+
+        Args:
+            vocab_file (:obj:`string`):
+                `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+                contains the vocabulary necessary to instantiate a tokenizer.
+            eos_token (:obj:`string`, `optional`, defaults to "</s>"):
+                The end of sequence token.
+
+                .. note::
+
+                    When building a sequence using special tokens, this is not the token that is used for the end
+                    of sequence. The token used is the :obj:`sep_token`.
+            unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
+                The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+                token instead.
+            pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
+                The token used for padding, for example when batching sequences of different lengths.
+            additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
+                Additional special tokens used by the tokenizer. :obj:`None` is treated as an empty list.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(
+        self,
+        vocab_file,
+        eos_token="</s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        additional_special_tokens=None,
+        **kwargs
+    ):
+        super().__init__(
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            # build a fresh list per instance instead of sharing one mutable default between all tokenizers
+            additional_special_tokens=[] if additional_special_tokens is None else additional_special_tokens,
+            **kwargs,
+        )
+
+        self.vocab_file = vocab_file
+        self._load_sp_model()
+
+    def _load_sp_model(self):
+        """ Import SentencePiece lazily and load `self.vocab_file` into a new processor stored as `self.sp_model`. """
+        try:
+            import sentencepiece as spm
+        except ImportError:
+            logger.warning(
+                "You need to install SentencePiece to use ReformerTokenizer: "
+                "https://github.com/google/sentencepiece "
+                "pip install sentencepiece"
+            )
+            raise
+
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(self.vocab_file)
+
+    @property
+    def vocab_size(self):
+        """ Number of pieces in the SentencePiece model (entries of `added_tokens_encoder` are not counted). """
+        return self.sp_model.get_piece_size()
+
+    def get_vocab(self):
+        """ Return a dict mapping every token, including the added tokens, to its id. """
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def __getstate__(self):
+        """ Return a copy of the instance state for pickling, with the SentencePiece processor set to None. """
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d):
+        """ Restore the pickled state and reload the SentencePiece model from `self.vocab_file`. """
+        self.__dict__ = d
+        self._load_sp_model()
+
+    def _tokenize(self, text, sample=False):
+        """ Take as input a string and return a list of strings (tokens) for words/sub-words.
+            If `sample` is True the pieces come from `SampleEncodeAsPieces(text, 64, 0.1)` instead.
+        """
+        if not sample:
+            pieces = self.sp_model.EncodeAsPieces(text)
+        else:
+            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
+        return pieces
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str) in an id using the vocab. """
+        return self.sp_model.piece_to_id(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab; raises IndexError if index >= vocab size."""
+        if index >= self.sp_model.get_piece_size():
+            raise IndexError("Token id {} is outside the SentencePiece vocabulary".format(index))
+        return self.sp_model.IdToPiece(index)
+
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) in a single string. """
+        return self.sp_model.decode_pieces(tokens)
+
+    def save_vocabulary(self, save_directory):
+        """ Copy the SentencePiece vocabulary file into `save_directory` (no copy if source and target coincide).
+            Returns `(path,)`, or None after logging an error when `save_directory` is not a directory.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            return
+        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)