Funnel transformer (#6908)

* Initial model * Fix upsampling * Add special cls token id and test * Formatting * Test and fist FunnelTokenizerFast * Common tests * Fix the check_repo script and document Funnel * Doc fixes * Add all models * Write doc * Fix test * Initial model * Fix upsampling * Add special cls token id and test * Formatting * Test and fist FunnelTokenizerFast * Common tests * Fix the check_repo script and document Funnel * Doc fixes * Add all models * Write doc * Fix test * Fix copyright * Forgot some layers can be repeated * Apply suggestions from code review Co-authored-by: Lysandre Debut <lysandre@huggingface.co> Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> * Update src/transformers/modeling_funnel.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Address review comments * Update src/transformers/modeling_funnel.py Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> * Address review comments * Update src/transformers/modeling_funnel.py Co-authored-by: Sam Shleifer <sshleifer@gmail.com> * Slow integration test * Make small integration test * Formatting * Add checkpoint and separate classification head * Formatting * Expand list, fix link and add in pretrained models * Styling * Add the model in all summaries * Typo fixes Co-authored-by: Lysandre Debut <lysandre@huggingface.co> Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-09-08 08:08:08 -04:00
parent 25afb4ea50
commit d155b38d6e
18 changed files with 3208 additions and 405 deletions
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -29,6 +29,7 @@ from .configuration_dpr import DPR_PRETRAINED_CONFIG_ARCHIVE_MAP, DPRConfig
 from .configuration_electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig
 from .configuration_encoder_decoder import EncoderDecoderConfig
 from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig
+from .configuration_funnel import FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP, FunnelConfig
 from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
 from .configuration_longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig
 from .configuration_lxmert import LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP, LxmertConfig
@@ -155,6 +156,7 @@ from .tokenization_dpr import (
 )
 from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast
 from .tokenization_flaubert import FlaubertTokenizer
+from .tokenization_funnel import FunnelTokenizer, FunnelTokenizerFast
 from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
 from .tokenization_longformer import LongformerTokenizer, LongformerTokenizerFast
 from .tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast
@@ -327,6 +329,18 @@ if is_torch_available():
        FlaubertModel,
        FlaubertWithLMHeadModel,
    )
+    from .modeling_funnel import (
+        FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST,
+        FunnelBaseModel,
+        FunnelForMaskedLM,
+        FunnelForMultipleChoice,
+        FunnelForPreTraining,
+        FunnelForQuestionAnswering,
+        FunnelForSequenceClassification,
+        FunnelForTokenClassification,
+        FunnelModel,
+        load_tf_weights_in_funnel,
+    )
    from .modeling_gpt2 import (
        GPT2_PRETRAINED_MODEL_ARCHIVE_LIST,
        GPT2DoubleHeadsModel,
--- a/src/transformers/commands/convert.py
+++ b/src/transformers/commands/convert.py
@@ -15,6 +15,12 @@ def convert_command_factory(args: Namespace):
    )


+IMPORT_ERROR_MESSAGE = """transformers can only be used from the commandline to convert TensorFlow models in PyTorch,
+In that case, it requires TensorFlow to be installed. Please see
+https://www.tensorflow.org/install/ for installation instructions.
+"""
+
+
 class ConvertCommand(BaseTransformersCLICommand):
    @staticmethod
    def register_subcommand(parser: ArgumentParser):
@@ -69,12 +75,7 @@ class ConvertCommand(BaseTransformersCLICommand):
                    convert_tf_checkpoint_to_pytorch,
                )
            except ImportError:
-                msg = (
-                    "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
-                    "In that case, it requires TensorFlow to be installed. Please see "
-                    "https://www.tensorflow.org/install/ for installation instructions."
-                )
-                raise ImportError(msg)
+                raise ImportError(IMPORT_ERROR_MESSAGE)

            convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
        elif self._model_type == "bert":
@@ -83,12 +84,16 @@ class ConvertCommand(BaseTransformersCLICommand):
                    convert_tf_checkpoint_to_pytorch,
                )
            except ImportError:
-                msg = (
-                    "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
-                    "In that case, it requires TensorFlow to be installed. Please see "
-                    "https://www.tensorflow.org/install/ for installation instructions."
+                raise ImportError(IMPORT_ERROR_MESSAGE)
+
+            convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
+        elif self._model_type == "funnel":
+            try:
+                from transformers.convert_funnel_original_tf_checkpoint_to_pytorch import (
+                    convert_tf_checkpoint_to_pytorch,
                )
-                raise ImportError(msg)
+            except ImportError:
+                raise ImportError(IMPORT_ERROR_MESSAGE)

            convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
        elif self._model_type == "gpt":
@@ -103,12 +108,7 @@ class ConvertCommand(BaseTransformersCLICommand):
                    convert_transfo_xl_checkpoint_to_pytorch,
                )
            except ImportError:
-                msg = (
-                    "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
-                    "In that case, it requires TensorFlow to be installed. Please see "
-                    "https://www.tensorflow.org/install/ for installation instructions."
-                )
-                raise ImportError(msg)
+                raise ImportError(IMPORT_ERROR_MESSAGE)

            if "ckpt" in self._tf_checkpoint.lower():
                TF_CHECKPOINT = self._tf_checkpoint
@@ -125,12 +125,7 @@ class ConvertCommand(BaseTransformersCLICommand):
                    convert_gpt2_checkpoint_to_pytorch,
                )
            except ImportError:
-                msg = (
-                    "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
-                    "In that case, it requires TensorFlow to be installed. Please see "
-                    "https://www.tensorflow.org/install/ for installation instructions."
-                )
-                raise ImportError(msg)
+                raise ImportError(IMPORT_ERROR_MESSAGE)

            convert_gpt2_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
        elif self._model_type == "xlnet":
@@ -139,12 +134,7 @@ class ConvertCommand(BaseTransformersCLICommand):
                    convert_xlnet_checkpoint_to_pytorch,
                )
            except ImportError:
-                msg = (
-                    "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
-                    "In that case, it requires TensorFlow to be installed. Please see "
-                    "https://www.tensorflow.org/install/ for installation instructions."
-                )
-                raise ImportError(msg)
+                raise ImportError(IMPORT_ERROR_MESSAGE)

            convert_xlnet_checkpoint_to_pytorch(
                self._tf_checkpoint, self._config, self._pytorch_dump_output, self._finetuning_task_name
--- a/src/transformers/configuration_auto.py
+++ b/src/transformers/configuration_auto.py
@@ -26,6 +26,7 @@ from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
 from .configuration_electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig
 from .configuration_encoder_decoder import EncoderDecoderConfig
 from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig
+from .configuration_funnel import FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP, FunnelConfig
 from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
 from .configuration_longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig
 from .configuration_lxmert import LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP, LxmertConfig
@@ -67,6 +68,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
        ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP,
        LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
        RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP,
        LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
    ]
    for key, value, in pretrained_map.items()
@@ -168,6 +170,10 @@ CONFIG_MAPPING = OrderedDict(
            "encoder-decoder",
            EncoderDecoderConfig,
        ),
+        (
+            "funnel",
+            FunnelConfig,
+        ),
        (
            "lxmert",
            LxmertConfig,
@@ -230,6 +236,7 @@ class AutoConfig:
            - `ctrl` : :class:`~transformers.CTRLConfig` (CTRL model)
            - `flaubert` : :class:`~transformers.FlaubertConfig` (Flaubert model)
            - `electra` : :class:`~transformers.ElectraConfig` (ELECTRA model)
+            - `funnel`: :class:`~transformers.FunnelConfig` (Funnel Transformer model)

        Args:
            pretrained_model_name_or_path (:obj:`string`):
--- a/src/transformers/configuration_funnel.py
+++ b/src/transformers/configuration_funnel.py
@@ -0,0 +1,183 @@
+# coding=utf-8
+# Copyright 2020, Hugging Face
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Funnel Transformer model configuration """
+
+from .configuration_utils import PretrainedConfig
+from .utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "funnel-transformer/small": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/small/config.json",
+    "funnel-transformer/small-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/small-base/config.json",
+    "funnel-transformer/medium": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/medium/config.json",
+    "funnel-transformer/medium-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/medium-base/config.json",
+    "funnel-transformer/intermediate": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/intermediate/config.json",
+    "funnel-transformer/intermediate-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/intermediate-base/config.json",
+    "funnel-transformer/large": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/large/config.json",
+    "funnel-transformer/large-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/large-base/config.json",
+    "funnel-transformer/xlarge": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/xlarge/config.json",
+    "funnel-transformer/xlarge-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/xlarge-base/config.json",
+}
+
+
+class FunnelConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a :class:`~transformers.FunnelModel`.
+    It is used to instantiate a Funnel Transformer model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the Funnel Transformer `funnel-transformer/small <https://huggingface.co/funnel-transformer/small>`__ architecture.
+
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
+
+
+    Args:
+        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+            Vocabulary size of the Funnel transformer. Defines the different tokens that
+            can be represented by the `input_ids` passed to the forward method of :class:`~transformers.FunnelModel`.
+        block_sizes (:obj:`List[int]`, `optional`, defaults to :obj:`[4, 4, 4]`):
+            The sizes of the blocks used in the model.
+        block_repeats (:obj:`List[int]`, `optional`):
+            If passed along, each layer of each block is repeated the number of times indicated.
+        num_decoder_layers (:obj:`int`, `optional`, defaults to 2):
+            The number of layers in the decoder (when not using the base model).
+        d_model (:obj:`int`, `optional`, defaults to 768):
+            Dimensionality of the model's hidden states.
+        n_head (:obj:`int`, `optional`, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        d_head (:obj:`int`, `optional`, defaults to 64):
+            Dimensionality of the model's heads.
+        d_inner (:obj:`int`, `optional`, defaults to 3072):
+            Inner dimension in the feed-forward blocks.
+        hidden_act (:obj:`str` or :obj:`callable`, `optional`, defaults to :obj:`"gelu_new"`):
+            The non-linear activation function (function or string) in the encoder and pooler.
+            If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+        hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout probability for the attention probabilities.
+        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout probability used between the two layers of the feed-forward blocks.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, `optional`, defaults to 3):
+            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.FunnelModel`.
+        initializer_range (:obj:`float`, `optional`, defaults to 0.1):
+            The standard deviation of the `uniform initializer` for initializing all weight matrices in attention
+            layers.
+        initializer_std (:obj:`float`, `optional`):
+            The standard deviation of the `normal initializer` for initializing the embedding matrix and the weight of
+            linear layers. Will default to 1 for the embedding matrix and the value given by Xavier initialization for
+            linear layers.
+        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-9):
+            The epsilon used by the layer normalization layers.
+        pooling_type (:obj:`str`, `optional`, defaults to :obj:`"mean"`):
+            Possible values are ``"mean"`` or ``"max"``. The way pooling is performed at the beginning of each
+            block.
+        attention_type (:obj:`str`, `optional`, defaults to :obj:`"relative_shift"`):
+            Possible values are ``"relative_shift"`` or ``"factorized"``. The former is faster on CPU/GPU while
+            the latter is faster on TPU.
+        separate_cls (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not to separate the cls token when applying pooling.
+        truncate_seq (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            When using ``separate_cls``, whether or not to truncate the last token when pooling, to avoid getting
+            a sequence length that is not a multiple of 2.
+        pool_q_only (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not to apply the pooling only to the query or to query, key and values for the attention
+            layers.
+    """
+    model_type = "funnel"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        block_sizes=[4, 4, 4],
+        block_repeats=None,
+        num_decoder_layers=2,
+        d_model=768,
+        n_head=12,
+        d_head=64,
+        d_inner=3072,
+        hidden_act="gelu_new",
+        hidden_dropout=0.1,
+        attention_dropout=0.1,
+        activation_dropout=0.0,
+        max_position_embeddings=512,
+        type_vocab_size=3,
+        initializer_range=0.1,
+        initializer_std=None,
+        layer_norm_eps=1e-9,
+        pooling_type="mean",
+        attention_type="relative_shift",
+        separate_cls=True,
+        truncate_seq=True,
+        pool_q_only=True,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+
+        self.vocab_size = vocab_size
+        self.block_sizes = block_sizes
+        self.block_repeats = [1] * len(block_sizes) if block_repeats is None else block_repeats
+        assert len(block_sizes) == len(
+            self.block_repeats
+        ), "`block_sizes` and `block_repeats` should have the same length."
+        self.num_decoder_layers = num_decoder_layers
+        self.d_model = d_model
+        self.n_head = n_head
+        self.d_head = d_head
+        self.d_inner = d_inner
+        self.hidden_act = hidden_act
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.initializer_std = initializer_std
+        self.layer_norm_eps = layer_norm_eps
+        assert pooling_type in [
+            "mean",
+            "max",
+        ], f"Got {pooling_type} for `pooling_type` but only 'mean' and 'max' are supported."
+        self.pooling_type = pooling_type
+        assert attention_type in [
+            "relative_shift",
+            "factorized",
+        ], f"Got {attention_type} for `attention_type` but only 'relative_shift' and 'factorized' are supported."
+        self.attention_type = attention_type
+        self.separate_cls = separate_cls
+        self.truncate_seq = truncate_seq
+        self.pool_q_only = pool_q_only
+
+    @property
+    def hidden_size(self):
+        return self.d_model
+
+    @property
+    def num_attention_heads(self):
+        return self.n_head
+
+    @property
+    def num_hidden_layers(self):
+        return sum(self.block_sizes)
+
+    @property
+    def num_blocks(self):
+        return len(self.block_sizes)
--- a/src/transformers/convert_funnel_original_tf_checkpoint_to_pytorch.py
+++ b/src/transformers/convert_funnel_original_tf_checkpoint_to_pytorch.py
@@ -0,0 +1,61 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert Funnel checkpoint."""
+
+
+import argparse
+import logging
+
+import torch
+
+from transformers import FunnelConfig, FunnelForPreTraining, load_tf_weights_in_funnel
+
+
+logging.basicConfig(level=logging.INFO)
+
+
+def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
+    # Initialise PyTorch model
+    config = FunnelConfig.from_json_file(config_file)
+    print("Building PyTorch model from configuration: {}".format(str(config)))
+    model = FunnelForPreTraining(config)
+
+    # Load weights from tf checkpoint
+    load_tf_weights_in_funnel(model, config, tf_checkpoint_path)
+
+    # Save pytorch-model
+    print("Save PyTorch model to {}".format(pytorch_dump_path))
+    torch.save(model.state_dict(), pytorch_dump_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument(
+        "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
+    )
+    parser.add_argument(
+        "--config_file",
+        default=None,
+        type=str,
+        required=True,
+        help="The config json file corresponding to the pre-trained model. \n"
+        "This specifies the model architecture.",
+    )
+    parser.add_argument(
+        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
+    )
+    args = parser.parse_args()
+    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)
--- a/src/transformers/modeling_auto.py
+++ b/src/transformers/modeling_auto.py
@@ -29,6 +29,7 @@ from .configuration_auto import (
    ElectraConfig,
    EncoderDecoderConfig,
    FlaubertConfig,
+    FunnelConfig,
    GPT2Config,
    LongformerConfig,
    LxmertConfig,
@@ -108,6 +109,14 @@ from .modeling_flaubert import (
    FlaubertModel,
    FlaubertWithLMHeadModel,
 )
+from .modeling_funnel import (
+    FunnelForMaskedLM,
+    FunnelForMultipleChoice,
+    FunnelForQuestionAnswering,
+    FunnelForSequenceClassification,
+    FunnelForTokenClassification,
+    FunnelModel,
+)
 from .modeling_gpt2 import GPT2LMHeadModel, GPT2Model
 from .modeling_longformer import (
    LongformerForMaskedLM,
@@ -202,6 +211,7 @@ MODEL_MAPPING = OrderedDict(
        (CTRLConfig, CTRLModel),
        (ElectraConfig, ElectraModel),
        (ReformerConfig, ReformerModel),
+        (FunnelConfig, FunnelModel),
        (LxmertConfig, LxmertModel),
    ]
 )
@@ -254,6 +264,7 @@ MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
        (ElectraConfig, ElectraForMaskedLM),
        (EncoderDecoderConfig, EncoderDecoderModel),
        (ReformerConfig, ReformerModelWithLMHead),
+        (FunnelConfig, FunnelForMaskedLM),
    ]
 )

@@ -291,6 +302,7 @@ MODEL_FOR_MASKED_LM_MAPPING = OrderedDict(
        (XLMConfig, XLMWithLMHeadModel),
        (ElectraConfig, ElectraForMaskedLM),
        (ReformerConfig, ReformerForMaskedLM),
+        (FunnelConfig, FunnelForMaskedLM),
    ]
 )

@@ -320,6 +332,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
        (FlaubertConfig, FlaubertForSequenceClassification),
        (XLMConfig, XLMForSequenceClassification),
        (ElectraConfig, ElectraForSequenceClassification),
+        (FunnelConfig, FunnelForSequenceClassification),
    ]
 )

@@ -339,6 +352,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
        (XLMConfig, XLMForQuestionAnsweringSimple),
        (ElectraConfig, ElectraForQuestionAnswering),
        (ReformerConfig, ReformerForQuestionAnswering),
+        (FunnelConfig, FunnelForQuestionAnswering),
    ]
 )

@@ -357,6 +371,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
        (AlbertConfig, AlbertForTokenClassification),
        (ElectraConfig, ElectraForTokenClassification),
        (FlaubertConfig, FlaubertForTokenClassification),
+        (FunnelConfig, FunnelForTokenClassification),
    ]
 )

@@ -374,6 +389,7 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
        (AlbertConfig, AlbertForMultipleChoice),
        (XLMConfig, XLMForMultipleChoice),
        (FlaubertConfig, FlaubertForMultipleChoice),
+        (FunnelConfig, FunnelForMultipleChoice),
    ]
 )

@@ -421,6 +437,7 @@ class AutoModel:
                - isInstance of `xlm` configuration class: :class:`~transformers.XLMModel` (XLM model)
                - isInstance of `flaubert` configuration class: :class:`~transformers.FlaubertModel` (Flaubert model)
                - isInstance of `electra` configuration class: :class:`~transformers.ElectraModel` (Electra model)
+                - isInstance of `funnel` configuration class: :class:`~transformers.FunnelModel` (Funnel Transformer model)

        Examples::

@@ -462,6 +479,7 @@ class AutoModel:
            - `ctrl`: :class:`~transformers.CTRLModel` (Salesforce CTRL  model)
            - `flaubert`: :class:`~transformers.FlaubertModel` (Flaubert  model)
            - `electra`: :class:`~transformers.ElectraModel` (Electra  model)
+            - `funnel`: :class:`~transformers.FunnelModel` (Funnel Transformer model)

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
        To train the model, you should first set it back in training mode with `model.train()`
@@ -729,6 +747,7 @@ class AutoModelWithLMHead:
                - isInstance of `xlm` configuration class: :class:`~transformers.XLMWithLMHeadModel` (XLM model)
                - isInstance of `flaubert` configuration class: :class:`~transformers.FlaubertWithLMHeadModel` (Flaubert model)
                - isInstance of `electra` configuration class: :class:`~transformers.ElectraForMaskedLM` (Electra model)
+                - isInstance of `funnel` configuration class: :class:`~transformers.FunnelForMaskedLM` (Funnel Transformer model)

        Examples::

@@ -774,6 +793,7 @@ class AutoModelWithLMHead:
            - `ctrl`: :class:`~transformers.CTRLLMHeadModel` (Salesforce CTRL model)
            - `flaubert`: :class:`~transformers.FlaubertWithLMHeadModel` (Flaubert model)
            - `electra`: :class:`~transformers.ElectraForMaskedLM` (Electra model)
+            - `funnel`: :class:`~transformers.FunnelForMaskedLM` (Funnel Transformer model)

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
        To train the model, you should first set it back in training mode with `model.train()`
@@ -1024,6 +1044,7 @@ class AutoModelForMaskedLM:
                - isInstance of `electra` configuration class: :class:`~transformers.ElectraForMaskedLM` (Electra model)
                - isInstance of `camembert` configuration class: :class:`~transformers.CamembertForMaskedLM` (Camembert model)
                - isInstance of `albert` configuration class: :class:`~transformers.AlbertForMaskedLM` (Albert model)
+                - isInstance of `funnel` configuration class: :class:`~transformers.FunnelForMaskedLM` (Funnel Transformer model)


        Examples::
@@ -1060,6 +1081,7 @@ class AutoModelForMaskedLM:
            - `flaubert`: :class:`~transformers.FlaubertWithLMHeadModel` (Flaubert model)
            - `electra`: :class:`~transformers.ElectraForMaskedLM` (Electra model)
            - `bert`: :class:`~transformers.BertLMHeadModel` (Bert model)
+            - `funnel`: :class:`~transformers.FunnelForMaskedLM` (Funnel Transformer model)

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
        To train the model, you should first set it back in training mode with `model.train()`
@@ -1304,7 +1326,7 @@ class AutoModelForSequenceClassification:
                - isInstance of `xlnet` configuration class: :class:`~transformers.XLNetForSequenceClassification` (XLNet model)
                - isInstance of `xlm` configuration class: :class:`~transformers.XLMForSequenceClassification` (XLM model)
                - isInstance of `flaubert` configuration class: :class:`~transformers.FlaubertForSequenceClassification` (Flaubert model)
-
+                - isInstance of `funnel` configuration class: :class:`~transformers.FunnelForSequenceClassification` (Funnel Transformer model)

        Examples::

@@ -1340,6 +1362,7 @@ class AutoModelForSequenceClassification:
            - `bert`: :class:`~transformers.BertForSequenceClassification` (Bert model)
            - `xlnet`: :class:`~transformers.XLNetForSequenceClassification` (XLNet model)
            - `flaubert`: :class:`~transformers.FlaubertForSequenceClassification` (Flaubert model)
+            - `funnel`: :class:`~transformers.FunnelForSequenceClassification` (Funnel Transformer model)

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
        To train the model, you should first set it back in training mode with `model.train()`
@@ -1454,6 +1477,7 @@ class AutoModelForQuestionAnswering:
                - isInstance of `xlnet` configuration class: :class:`~transformers.XLNetForQuestionAnswering` (XLNet model)
                - isInstance of `xlm` configuration class: :class:`~transformers.XLMForQuestionAnswering` (XLM model)
                - isInstance of `flaubert` configuration class: :class:`~transformers.FlaubertForQuestionAnswering` (XLM model)
+                - isInstance of `funnel` configuration class: :class:`~transformers.FunnelForQuestionAnswering` (Funnel Transformer model)

        Examples::

@@ -1488,6 +1512,7 @@ class AutoModelForQuestionAnswering:
            - `xlnet`: :class:`~transformers.XLNetForQuestionAnswering` (XLNet model)
            - `xlm`: :class:`~transformers.XLMForQuestionAnswering` (XLM model)
            - `flaubert`: :class:`~transformers.FlaubertForQuestionAnswering` (XLM model)
+            - `funnel`: :class:`~transformers.FunnelForQuestionAnswering` (Funnel Transformer model)

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
        To train the model, you should first set it back in training mode with `model.train()`
@@ -1604,6 +1629,7 @@ class AutoModelForTokenClassification:
                - isInstance of `camembert` configuration class: :class:`~transformers.CamembertModelForTokenClassification` (Camembert model)
                - isInstance of `roberta` configuration class: :class:`~transformers.RobertaModelForTokenClassification` (Roberta model)
                - isInstance of `electra` configuration class: :class:`~transformers.ElectraForTokenClassification` (Electra model)
+                - isInstance of `funnel` configuration class: :class:`~transformers.FunnelForTokenClassification` (Funnel Transformer model)

        Examples::

@@ -1641,6 +1667,7 @@ class AutoModelForTokenClassification:
            - `flaubert`: :class:`~transformers.FlaubertForTokenClassification` (Flaubert model)
            - `roberta`: :class:`~transformers.RobertaForTokenClassification` (Roberta model)
            - `electra`: :class:`~transformers.ElectraForTokenClassification` (Electra model)
+            - `funnel`: :class:`~transformers.FunnelForTokenClassification` (Funnel Transformer model)

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
        To train the model, you should first set it back in training mode with `model.train()`
--- a/src/transformers/modeling_funnel.py
+++ b/src/transformers/modeling_funnel.py
--- a/src/transformers/tokenization_auto.py
+++ b/src/transformers/tokenization_auto.py
@@ -27,6 +27,7 @@ from .configuration_auto import (
    DistilBertConfig,
    ElectraConfig,
    FlaubertConfig,
+    FunnelConfig,
    GPT2Config,
    LongformerConfig,
    LxmertConfig,
@@ -54,6 +55,7 @@ from .tokenization_ctrl import CTRLTokenizer
 from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast
 from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast
 from .tokenization_flaubert import FlaubertTokenizer
+from .tokenization_funnel import FunnelTokenizer, FunnelTokenizerFast
 from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
 from .tokenization_longformer import LongformerTokenizer, LongformerTokenizerFast
 from .tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast
@@ -93,6 +95,7 @@ TOKENIZER_MAPPING = OrderedDict(
        (RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)),
        (ReformerConfig, (ReformerTokenizer, None)),
        (ElectraConfig, (ElectraTokenizer, ElectraTokenizerFast)),
+        (FunnelConfig, (FunnelTokenizer, FunnelTokenizerFast)),
        (LxmertConfig, (LxmertTokenizer, LxmertTokenizerFast)),
        (BertConfig, (BertTokenizer, BertTokenizerFast)),
        (OpenAIGPTConfig, (OpenAIGPTTokenizer, OpenAIGPTTokenizerFast)),
@@ -131,6 +134,7 @@ class AutoTokenizer:
        - `xlm`: XLMTokenizer (XLM model)
        - `ctrl`: CTRLTokenizer (Salesforce CTRL model)
        - `electra`: ElectraTokenizer (Google ELECTRA model)
+        - `funnel`: FunnelTokenizer (Funnel Transformer model)
        - `lxmert`: LxmertTokenizer (Lxmert model)

    This class cannot be instantiated using `__init__()` (throw an error).
@@ -167,6 +171,7 @@ class AutoTokenizer:
            - `xlm`: XLMTokenizer (XLM model)
            - `ctrl`: CTRLTokenizer (Salesforce CTRL model)
            - `electra`: ElectraTokenizer (Google ELECTRA model)
+            - `funnel`: FunnelTokenizer (Funnel Transformer model)
            - `lxmert`: LxmertTokenizer (Lxmert model)

        Params:
--- a/src/transformers/tokenization_funnel.py
+++ b/src/transformers/tokenization_funnel.py
@@ -0,0 +1,232 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Tokenization class for Funnel Transformer."""
+
+from typing import List, Optional
+
+from .tokenization_bert import BertTokenizer, BertTokenizerFast
+from .utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
+
+_model_names = [
+    "small",
+    "small-base",
+    "medium",
+    "medium-base",
+    "intermediate",
+    "intermediate-base",
+    "large",
+    "large-base",
+    "xlarge",
+    "xlarge-base",
+]
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "funnel-transformer/small": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/small/vocab.txt",
+        "funnel-transformer/small-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/small-base/vocab.txt",
+        "funnel-transformer/medium": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/medium/vocab.txt",
+        "funnel-transformer/medium-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/medium-base/vocab.txt",
+        "funnel-transformer/intermediate": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/intermediate/vocab.txt",
+        "funnel-transformer/intermediate-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/intermediate-base/vocab.txt",
+        "funnel-transformer/large": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/large/vocab.txt",
+        "funnel-transformer/large-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/large-base/vocab.txt",
+        "funnel-transformer/xlarge": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/xlarge/vocab.txt",
+        "funnel-transformer/xlarge-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/xlarge-base/vocab.txt",
+    }
+}
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {f"funnel-transformer/{name}": 512 for name in _model_names}
+PRETRAINED_INIT_CONFIGURATION = {f"funnel-transformer/{name}": {"do_lower_case": True} for name in _model_names}
+
+
+class FunnelTokenizer(BertTokenizer):
+    r"""
+    Tokenizer for the Funnel Transformer models.
+
+    :class:`~transformers.FunnelTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+    tokenization: punctuation splitting + wordpiece.
+
+    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+    parameters.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    cls_token_type_id: int = 2
+
+    def __init__(
+        self,
+        vocab_file,
+        do_lower_case=True,
+        do_basic_tokenize=True,
+        never_split=None,
+        unk_token="<unk>",
+        sep_token="<sep>",
+        pad_token="<pad>",
+        cls_token="<cls>",
+        mask_token="<mask>",
+        bos_token="<s>",
+        eos_token="</s>",
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        **kwargs
+    ):
+        super().__init__(
+            vocab_file,
+            do_lower_case=do_lower_case,
+            do_basic_tokenize=do_basic_tokenize,
+            never_split=never_split,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            tokenize_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            **kwargs,
+        )
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        Funnel Transformer expects a sequence pair mask that has the following format:
+
+        ::
+
+            2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+            | first sequence    | second sequence |
+
+        If token_ids_1 is None, only the first portion of the mask (the leading 2, then 0's) is returned.
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            sequence(s).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0]
+        return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+
+class FunnelTokenizerFast(BertTokenizerFast):
+    r"""
+    "Fast" tokenizer for the Funnel Transformer models (backed by HuggingFace's :obj:`tokenizers` library).
+
+    :class:`~transformers.FunnelTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
+    end-to-end tokenization: punctuation splitting + wordpiece.
+
+    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+    parameters.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    cls_token_type_id: int = 2
+
+    def __init__(
+        self,
+        vocab_file,
+        do_lower_case=True,
+        unk_token="<unk>",
+        sep_token="<sep>",
+        pad_token="<pad>",
+        cls_token="<cls>",
+        mask_token="<mask>",
+        bos_token="<s>",
+        eos_token="</s>",
+        clean_text=True,
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        wordpieces_prefix="##",
+        **kwargs
+    ):
+        super().__init__(
+            vocab_file,
+            do_lower_case=do_lower_case,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            clean_text=clean_text,
+            tokenize_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            wordpieces_prefix=wordpieces_prefix,
+            **kwargs,
+        )
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        Funnel Transformer expects a sequence pair mask that has the following format:
+
+        ::
+
+            2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+            | first sequence    | second sequence |
+
+        If token_ids_1 is None, only the first portion of the mask (the leading 2, then 0's) is returned.
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            sequence(s).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0]
+        return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+    def _convert_encoding(self, encoding, **kwargs):
+        # The fast tokenizer doesn't use the function above so we fix the cls token type id when decoding the fast
+        # tokenizer output.
+        encoding_dict = super()._convert_encoding(encoding, **kwargs)
+        if "token_type_ids" in encoding_dict:
+            # Note: we can't assume the <cls> token is in first position because left padding is a thing, hence the
+            # double list comprehension.
+            encoding_dict["token_type_ids"] = [
+                [self.cls_token_type_id if i == self.cls_token_id else t for i, t in zip(input_ids, type_ids)]
+                for input_ids, type_ids in zip(encoding_dict["input_ids"], encoding_dict["token_type_ids"])
+            ]
+        return encoding_dict