Integrate DeBERTa v2(the 1.5B model surpassed human performance on Su… (#10018)

* Integrate DeBERTa v2(the 1.5B model surpassed human performance on SuperGLUE); Add DeBERTa v2 900M,1.5B models; * DeBERTa-v2 * Fix v2 model loading issue (#10129) * Doc members * Update src/transformers/models/deberta/modeling_deberta.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Address Sylvain's comments * Address Patrick's comments Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> * Style Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr> Co-authored-by: Lysandre Debut <lysandre@huggingface.co> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2021-02-19 15:34:44 -08:00
parent f6e53e3c2b
commit 9a7e63729f
19 changed files with 3012 additions and 73 deletions
--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@@ -157,6 +157,7 @@ _import_structure = {
    "models.camembert": ["CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "CamembertConfig"],
    "models.ctrl": ["CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CTRLConfig", "CTRLTokenizer"],
    "models.deberta": ["DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig", "DebertaTokenizer"],
+    "models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config", "DebertaV2Tokenizer"],
    "models.distilbert": ["DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DistilBertConfig", "DistilBertTokenizer"],
    "models.dpr": [
        "DPR_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -515,6 +516,17 @@ if is_torch_available():
            "DebertaForQuestionAnswering",
        ]
    )
+    _import_structure["models.deberta_v2"].extend(
+        [
+            "DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "DebertaV2ForSequenceClassification",
+            "DebertaV2Model",
+            "DebertaV2ForMaskedLM",
+            "DebertaV2PreTrainedModel",
+            "DebertaV2ForTokenClassification",
+            "DebertaV2ForQuestionAnswering",
+        ]
+    )
    _import_structure["models.distilbert"].extend(
        [
            "DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -1287,6 +1299,7 @@ if TYPE_CHECKING:
    from .models.convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertTokenizer
    from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer
    from .models.deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig, DebertaTokenizer
+    from .models.deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config, DebertaV2Tokenizer
    from .models.distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig, DistilBertTokenizer
    from .models.dpr import (
        DPR_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -1604,6 +1617,15 @@ if TYPE_CHECKING:
            DebertaModel,
            DebertaPreTrainedModel,
        )
+        from .models.deberta_v2 import (
+            DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST,
+            DebertaV2ForMaskedLM,
+            DebertaV2ForQuestionAnswering,
+            DebertaV2ForSequenceClassification,
+            DebertaV2ForTokenClassification,
+            DebertaV2Model,
+            DebertaV2PreTrainedModel,
+        )
        from .models.distilbert import (
            DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            DistilBertForMaskedLM,
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -31,6 +31,7 @@ from ..camembert.configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCH
 from ..convbert.configuration_convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig
 from ..ctrl.configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig
 from ..deberta.configuration_deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig
+from ..deberta_v2.configuration_deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config
 from ..distilbert.configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig
 from ..dpr.configuration_dpr import DPR_PRETRAINED_CONFIG_ARCHIVE_MAP, DPRConfig
 from ..electra.configuration_electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig
@@ -103,6 +104,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
        LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
        DPR_PRETRAINED_CONFIG_ARCHIVE_MAP,
        DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP,
        SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
        PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -138,6 +140,7 @@ CONFIG_MAPPING = OrderedDict(
        ("reformer", ReformerConfig),
        ("longformer", LongformerConfig),
        ("roberta", RobertaConfig),
+        ("deberta-v2", DebertaV2Config),
        ("deberta", DebertaConfig),
        ("flaubert", FlaubertConfig),
        ("fsmt", FSMTConfig),
@@ -199,6 +202,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
        ("encoder-decoder", "Encoder decoder"),
        ("funnel", "Funnel Transformer"),
        ("lxmert", "LXMERT"),
+        ("deberta-v2", "DeBERTa-v2"),
        ("deberta", "DeBERTa"),
        ("layoutlm", "LayoutLM"),
        ("dpr", "DPR"),
@@ -366,7 +370,6 @@ class AutoConfig:
            {'foo': False}
        """
        config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
        if "model_type" in config_dict:
            config_class = CONFIG_MAPPING[config_dict["model_type"]]
            return config_class.from_dict(config_dict, **kwargs)
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -84,6 +84,13 @@ from ..deberta.modeling_deberta import (
    DebertaForTokenClassification,
    DebertaModel,
 )
+from ..deberta_v2.modeling_deberta_v2 import (
+    DebertaV2ForMaskedLM,
+    DebertaV2ForQuestionAnswering,
+    DebertaV2ForSequenceClassification,
+    DebertaV2ForTokenClassification,
+    DebertaV2Model,
+)
 from ..distilbert.modeling_distilbert import (
    DistilBertForMaskedLM,
    DistilBertForMultipleChoice,
@@ -254,6 +261,7 @@ from .configuration_auto import (
    ConvBertConfig,
    CTRLConfig,
    DebertaConfig,
+    DebertaV2Config,
    DistilBertConfig,
    DPRConfig,
    ElectraConfig,
@@ -332,6 +340,7 @@ MODEL_MAPPING = OrderedDict(
        (LxmertConfig, LxmertModel),
        (BertGenerationConfig, BertGenerationEncoder),
        (DebertaConfig, DebertaModel),
+        (DebertaV2Config, DebertaV2Model),
        (DPRConfig, DPRQuestionEncoder),
        (XLMProphetNetConfig, XLMProphetNetModel),
        (ProphetNetConfig, ProphetNetModel),
@@ -408,6 +417,7 @@ MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
        (MPNetConfig, MPNetForMaskedLM),
        (TapasConfig, TapasForMaskedLM),
        (DebertaConfig, DebertaForMaskedLM),
+        (DebertaV2Config, DebertaV2ForMaskedLM),
    ]
 )

@@ -465,6 +475,7 @@ MODEL_FOR_MASKED_LM_MAPPING = OrderedDict(
        (MPNetConfig, MPNetForMaskedLM),
        (TapasConfig, TapasForMaskedLM),
        (DebertaConfig, DebertaForMaskedLM),
+        (DebertaV2Config, DebertaV2ForMaskedLM),
    ]
 )

@@ -510,6 +521,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
        (ElectraConfig, ElectraForSequenceClassification),
        (FunnelConfig, FunnelForSequenceClassification),
        (DebertaConfig, DebertaForSequenceClassification),
+        (DebertaV2Config, DebertaV2ForSequenceClassification),
        (GPT2Config, GPT2ForSequenceClassification),
        (OpenAIGPTConfig, OpenAIGPTForSequenceClassification),
        (ReformerConfig, ReformerForSequenceClassification),
@@ -545,6 +557,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
        (LxmertConfig, LxmertForQuestionAnswering),
        (MPNetConfig, MPNetForQuestionAnswering),
        (DebertaConfig, DebertaForQuestionAnswering),
+        (DebertaV2Config, DebertaV2ForQuestionAnswering),
    ]
 )

@@ -577,6 +590,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
        (FunnelConfig, FunnelForTokenClassification),
        (MPNetConfig, MPNetForTokenClassification),
        (DebertaConfig, DebertaForTokenClassification),
+        (DebertaV2Config, DebertaV2ForTokenClassification),
    ]
 )

--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -66,6 +66,7 @@ from .configuration_auto import (
    ConvBertConfig,
    CTRLConfig,
    DebertaConfig,
+    DebertaV2Config,
    DistilBertConfig,
    DPRConfig,
    ElectraConfig,
@@ -108,6 +109,7 @@ if is_sentencepiece_available():
    from ..barthez.tokenization_barthez import BarthezTokenizer
    from ..bert_generation.tokenization_bert_generation import BertGenerationTokenizer
    from ..camembert.tokenization_camembert import CamembertTokenizer
+    from ..deberta_v2.tokenization_deberta_v2 import DebertaV2Tokenizer
    from ..marian.tokenization_marian import MarianTokenizer
    from ..mbart.tokenization_mbart import MBartTokenizer
    from ..mt5 import MT5Tokenizer
@@ -122,6 +124,7 @@ else:
    BarthezTokenizer = None
    BertGenerationTokenizer = None
    CamembertTokenizer = None
+    DebertaV2Tokenizer = None
    MarianTokenizer = None
    MBartTokenizer = None
    MT5Tokenizer = None
@@ -233,6 +236,7 @@ TOKENIZER_MAPPING = OrderedDict(
        (FSMTConfig, (FSMTTokenizer, None)),
        (BertGenerationConfig, (BertGenerationTokenizer, None)),
        (DebertaConfig, (DebertaTokenizer, None)),
+        (DebertaV2Config, (DebertaV2Tokenizer, None)),
        (RagConfig, (RagTokenizer, None)),
        (XLMProphetNetConfig, (XLMProphetNetTokenizer, None)),
        (ProphetNetConfig, (ProphetNetTokenizer, None)),
--- a/src/transformers/models/deberta/configuration_deberta.py
+++ b/src/transformers/models/deberta/configuration_deberta.py
@@ -23,6 +23,10 @@ logger = logging.get_logger(__name__)
 DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "microsoft/deberta-base": "https://huggingface.co/microsoft/deberta-base/resolve/main/config.json",
    "microsoft/deberta-large": "https://huggingface.co/microsoft/deberta-large/resolve/main/config.json",
+    "microsoft/deberta-xlarge": "https://huggingface.co/microsoft/deberta-xlarge/resolve/main/config.json",
+    "microsoft/deberta-base-mnli": "https://huggingface.co/microsoft/deberta-base-mnli/resolve/main/config.json",
+    "microsoft/deberta-large-mnli": "https://huggingface.co/microsoft/deberta-large-mnli/resolve/main/config.json",
+    "microsoft/deberta-xlarge-mnli": "https://huggingface.co/microsoft/deberta-xlarge-mnli/resolve/main/config.json",
 }


--- a/src/transformers/models/deberta/modeling_deberta.py
+++ b/src/transformers/models/deberta/modeling_deberta.py
@@ -18,7 +18,6 @@ import math
 from collections.abc import Sequence

 import torch
-from packaging import version
 from torch import _softmax_backward_data, nn
 from torch.nn import CrossEntropyLoss

@@ -40,10 +39,15 @@ logger = logging.get_logger(__name__)

 _CONFIG_FOR_DOC = "DebertaConfig"
 _TOKENIZER_FOR_DOC = "DebertaTokenizer"
+_CHECKPOINT_FOR_DOC = "microsoft/deberta-base"

 DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/deberta-base",
    "microsoft/deberta-large",
+    "microsoft/deberta-xlarge",
+    "microsoft/deberta-base-mnli",
+    "microsoft/deberta-large-mnli",
+    "microsoft/deberta-xlarge-mnli",
 ]


@@ -54,7 +58,7 @@ class ContextPooler(nn.Module):
        self.dropout = StableDropout(config.pooler_dropout)
        self.config = config

-    def forward(self, hidden_states, mask=None):
+    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.

@@ -74,27 +78,28 @@ class XSoftmax(torch.autograd.Function):
    Masked Softmax which is optimized for saving memory

    Args:
-      input (:obj:`torch.tensor`): The input tensor that will apply softmax.
-      mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
-      dim (int): The dimension that will apply softmax
+        input (:obj:`torch.tensor`): The input tensor that will apply softmax.
+        mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
+        dim (int): The dimension that will apply softmax

    Example::
-      import torch
-      from transformers.models.deberta import XSoftmax
-      # Make a tensor
-      x = torch.randn([4,20,100])
-      # Create a mask
-      mask = (x>0).int()
-      y = XSoftmax.apply(x, mask, dim=-1)
+
+          >>> import torch
+          >>> from transformers.models.deberta.modeling_deberta import XSoftmax
+
+          >>> # Make a tensor
+          >>> x = torch.randn([4,20,100])
+
+          >>> # Create a mask
+          >>> mask = (x>0).int()
+
+          >>> y = XSoftmax.apply(x, mask, dim=-1)
    """

    @staticmethod
    def forward(self, input, mask, dim):
        self.dim = dim
-        if version.Version(torch.__version__) >= version.Version("1.2.0a"):
-            rmask = ~(mask.bool())
-        else:
-            rmask = (1 - mask).byte()  # This line is not supported by Onnx tracing.
+        rmask = ~(mask.bool())

        output = input.masked_fill(rmask, float("-inf"))
        output = torch.softmax(output, self.dim)
@@ -127,10 +132,7 @@ def get_mask(input, local_context):
        mask = local_context.mask if local_context.reuse_mask else None

    if dropout > 0 and mask is None:
-        if version.Version(torch.__version__) >= version.Version("1.2.0a"):
-            mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).bool()
-        else:
-            mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).byte()
+        mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).bool()

    if isinstance(local_context, DropoutContext):
        if local_context.mask is None:
@@ -166,9 +168,7 @@ class StableDropout(torch.nn.Module):
    Optimized dropout module for stabilizing the training

    Args:
-
        drop_prob (float): the dropout probabilities
-
    """

    def __init__(self, drop_prob):
@@ -183,8 +183,6 @@ class StableDropout(torch.nn.Module):

        Args:
            x (:obj:`torch.tensor`): The input tensor to apply dropout
-
-
        """
        if self.training and self.drop_prob > 0:
            return XDropout.apply(x, self.get_context())
@@ -302,7 +300,7 @@ class DebertaIntermediate(nn.Module):

 class DebertaOutput(nn.Module):
    def __init__(self, config):
-        super(DebertaOutput, self).__init__()
+        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps)
        self.dropout = StableDropout(config.hidden_dropout_prob)
@@ -317,7 +315,7 @@ class DebertaOutput(nn.Module):

 class DebertaLayer(nn.Module):
    def __init__(self, config):
-        super(DebertaLayer, self).__init__()
+        super().__init__()
        self.attention = DebertaAttention(config)
        self.intermediate = DebertaIntermediate(config)
        self.output = DebertaOutput(config)
@@ -701,7 +699,6 @@ class DebertaEmbeddings(nn.Module):
            self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False)
        self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps)
        self.dropout = StableDropout(config.hidden_dropout_prob)
-        self.output_to_half = False
        self.config = config

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
@@ -763,6 +760,11 @@ class DebertaPreTrainedModel(PreTrainedModel):
    config_class = DebertaConfig
    base_model_prefix = "deberta"
    _keys_to_ignore_on_load_missing = ["position_ids"]
+    _keys_to_ignore_on_load_unexpected = ["position_embeddings"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self._register_load_state_dict_pre_hook(self._pre_load_hook)

    def _init_weights(self, module):
        """ Initialize the weights """
@@ -773,6 +775,25 @@ class DebertaPreTrainedModel(PreTrainedModel):
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

+    def _pre_load_hook(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
+        """
+        Removes the classifier if it doesn't have the correct number of labels.
+        """
+        self_state = self.state_dict()
+        if (
+            ("classifier.weight" in self_state)
+            and ("classifier.weight" in state_dict)
+            and self_state["classifier.weight"].size() != state_dict["classifier.weight"].size()
+        ):
+            logger.warning(
+                f"The checkpoint classifier head has a shape {state_dict['classifier.weight'].size()} and this model "
+                f"classifier head has a shape {self_state['classifier.weight'].size()}. Ignoring the checkpoint "
+                f"weights. You should train your model on new data."
+            )
+            del state_dict["classifier.weight"]
+            if "classifier.bias" in state_dict:
+                del state_dict["classifier.bias"]
+

 DEBERTA_START_DOCSTRING = r"""
    The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
@@ -867,7 +888,7 @@ class DebertaModel(DebertaPreTrainedModel):
    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
-        checkpoint="microsoft/deberta-base",
+        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
@@ -953,7 +974,6 @@ class DebertaModel(DebertaPreTrainedModel):

@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top. """, DEBERTA_START_DOCSTRING)
 class DebertaForMaskedLM(DebertaPreTrainedModel):
-
    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]

@@ -974,7 +994,7 @@ class DebertaForMaskedLM(DebertaPreTrainedModel):
    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
-        checkpoint="microsoft/deberta-base",
+        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
@@ -1114,7 +1134,7 @@ class DebertaForSequenceClassification(DebertaPreTrainedModel):
    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
-        checkpoint="microsoft/deberta-base",
+        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
@@ -1194,7 +1214,6 @@ class DebertaForSequenceClassification(DebertaPreTrainedModel):
    DEBERTA_START_DOCSTRING,
 )
 class DebertaForTokenClassification(DebertaPreTrainedModel):
-
    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
@@ -1210,7 +1229,7 @@ class DebertaForTokenClassification(DebertaPreTrainedModel):
    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
-        checkpoint="microsoft/deberta-base",
+        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
@@ -1283,7 +1302,6 @@ class DebertaForTokenClassification(DebertaPreTrainedModel):
    DEBERTA_START_DOCSTRING,
 )
 class DebertaForQuestionAnswering(DebertaPreTrainedModel):
-
    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
@@ -1298,7 +1316,7 @@ class DebertaForQuestionAnswering(DebertaPreTrainedModel):
    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
-        checkpoint="microsoft/deberta-base",
+        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
--- a/src/transformers/models/deberta/tokenization_deberta.py
+++ b/src/transformers/models/deberta/tokenization_deberta.py
@@ -44,12 +44,20 @@ PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "microsoft/deberta-base": "https://huggingface.co/microsoft/deberta-base/resolve/main/bpe_encoder.bin",
        "microsoft/deberta-large": "https://huggingface.co/microsoft/deberta-large/resolve/main/bpe_encoder.bin",
+        "microsoft/deberta-xlarge": "https://huggingface.co/microsoft/deberta-xlarge/resolve/main/bpe_encoder.bin",
+        "microsoft/deberta-base-mnli": "https://huggingface.co/microsoft/deberta-base-mnli/resolve/main/bpe_encoder.bin",
+        "microsoft/deberta-large-mnli": "https://huggingface.co/microsoft/deberta-large-mnli/resolve/main/bpe_encoder.bin",
+        "microsoft/deberta-xlarge-mnli": "https://huggingface.co/microsoft/deberta-xlarge-mnli/resolve/main/bpe_encoder.bin",
    }
 }

 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "microsoft/deberta-base": 512,
    "microsoft/deberta-large": 512,
+    "microsoft/deberta-xlarge": 512,
+    "microsoft/deberta-base-mnli": 512,
+    "microsoft/deberta-large-mnli": 512,
+    "microsoft/deberta-xlarge-mnli": 512,
 }

 PRETRAINED_INIT_CONFIGURATION = {
--- a/src/transformers/models/deberta_v2/init.py
+++ b/src/transformers/models/deberta_v2/init.py
@@ -0,0 +1,72 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...file_utils import _BaseLazyModule, is_torch_available
+
+
+_import_structure = {
+    "configuration_deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config"],
+    "tokenization_deberta_v2": ["DebertaV2Tokenizer"],
+}
+
+if is_torch_available():
+    _import_structure["modeling_deberta_v2"] = [
+        "DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "DebertaV2ForSequenceClassification",
+        "DebertaV2Model",
+        "DebertaV2ForMaskedLM",
+        "DebertaV2PreTrainedModel",
+        "DebertaV2ForTokenClassification",
+        "DebertaV2ForQuestionAnswering",
+    ]
+
+
+if TYPE_CHECKING:
+    from .configuration_deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config
+    from .tokenization_deberta_v2 import DebertaV2Tokenizer
+
+    if is_torch_available():
+        from .modeling_deberta_v2 import (
+            DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST,
+            DebertaV2ForMaskedLM,
+            DebertaV2ForQuestionAnswering,
+            DebertaV2ForSequenceClassification,
+            DebertaV2ForTokenClassification,
+            DebertaV2Model,
+            DebertaV2PreTrainedModel,
+        )
+
+else:
+    import importlib
+    import os
+    import sys
+
+    class _LazyModule(_BaseLazyModule):
+        """
+        Module class that surfaces all objects but only performs associated imports when the objects are requested.
+        """
+
+        __file__ = globals()["__file__"]
+        __path__ = [os.path.dirname(__file__)]
+
+        def _get_module(self, module_name: str):
+            return importlib.import_module("." + module_name, self.__name__)
+
+    sys.modules[__name__] = _LazyModule(__name__, _import_structure)
--- a/src/transformers/models/deberta_v2/configuration_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/configuration_deberta_v2.py
@@ -0,0 +1,138 @@
+# coding=utf-8
+# Copyright 2020, Microsoft and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" DeBERTa-v2 model configuration """
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "microsoft/deberta-v2-xlarge": "https://huggingface.co/microsoft/deberta-v2-xlarge/resolve/main/config.json",
+    "microsoft/deberta-v2-xxlarge": "https://huggingface.co/microsoft/deberta-v2-xxlarge/resolve/main/config.json",
+    "microsoft/deberta-v2-xlarge-mnli": "https://huggingface.co/microsoft/deberta-v2-xlarge-mnli/resolve/main/config.json",
+    "microsoft/deberta-v2-xxlarge-mnli": "https://huggingface.co/microsoft/deberta-v2-xxlarge-mnli/resolve/main/config.json",
+}
+
+
+class DebertaV2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a :class:`~transformers.DebertaV2Model`. It is used
+    to instantiate a DeBERTa-v2 model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the DeBERTa
+    `microsoft/deberta-v2-xlarge <https://huggingface.co/microsoft/deberta-base>`__ architecture.
+
+    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+
+    Arguments:
+        vocab_size (:obj:`int`, `optional`, defaults to 128100):
+            Vocabulary size of the DeBERTa-v2 model. Defines the number of different tokens that can be represented by
+            the :obj:`inputs_ids` passed when calling :class:`~transformers.DebertaV2Model`.
+        hidden_size (:obj:`int`, `optional`, defaults to 1536):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (:obj:`int`, `optional`, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (:obj:`int`, `optional`, defaults to 24):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (:obj:`int`, `optional`, defaults to 6144):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string,
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
+            :obj:`"mish"`, :obj:`"linear"`, :obj:`"sigmoid"` and :obj:`"gelu_new"` are supported.
+        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, `optional`, defaults to 0):
+            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.DebertaModel` or
+            :class:`~transformers.TFDebertaModel`.
+        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-7):
+            The epsilon used by the layer normalization layers.
+        relative_attention (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether use relative position encoding.
+        max_relative_positions (:obj:`int`, `optional`, defaults to -1):
+            The range of relative positions :obj:`[-max_position_embeddings, max_position_embeddings]`. Use the same
+            value as :obj:`max_position_embeddings`.
+        pad_token_id (:obj:`int`, `optional`, defaults to 0):
+            The value used to pad input_ids.
+        position_biased_input (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether add absolute position embedding to content embedding.
+        pos_att_type (:obj:`List[str]`, `optional`):
+            The type of relative position attention, it can be a combination of :obj:`["p2c", "c2p", "p2p"]`, e.g.
+            :obj:`["p2c"]`, :obj:`["p2c", "c2p"]`, :obj:`["p2c", "c2p", 'p2p"]`.
+        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+    """
+    model_type = "deberta-v2"
+
+    def __init__(
+        self,
+        vocab_size=128100,
+        hidden_size=1536,
+        num_hidden_layers=24,
+        num_attention_heads=24,
+        intermediate_size=6144,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=0,
+        initializer_range=0.02,
+        layer_norm_eps=1e-7,
+        relative_attention=False,
+        max_relative_positions=-1,
+        pad_token_id=0,
+        position_biased_input=True,
+        pos_att_type=None,
+        pooler_dropout=0,
+        pooler_hidden_act="gelu",
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.relative_attention = relative_attention
+        self.max_relative_positions = max_relative_positions
+        self.pad_token_id = pad_token_id
+        self.position_biased_input = position_biased_input
+
+        # Backwards compatibility
+        if type(pos_att_type) == str:
+            pos_att_type = [x.strip() for x in pos_att_type.lower().split("|")]
+
+        self.pos_att_type = pos_att_type
+        self.vocab_size = vocab_size
+        self.layer_norm_eps = layer_norm_eps
+
+        self.pooler_hidden_size = kwargs.get("pooler_hidden_size", hidden_size)
+        self.pooler_dropout = pooler_dropout
+        self.pooler_hidden_act = pooler_hidden_act
--- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py
--- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
@@ -0,0 +1,491 @@
+# coding=utf-8
+# Copyright 2020 Microsoft and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Tokenization class for model DeBERTa."""
+
+import os
+import unicodedata
+from typing import Optional, Tuple
+
+import sentencepiece as sp
+import six
+
+from ...tokenization_utils import PreTrainedTokenizer
+
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "microsoft/deberta-v2-xlarge": "https://huggingface.co/microsoft/deberta-v2-xlarge/resolve/main/spm.model",
+        "microsoft/deberta-v2-xxlarge": "https://huggingface.co/microsoft/deberta-v2-xxlarge/resolve/main/spm.model",
+        "microsoft/deberta-v2-xlarge-mnli": "https://huggingface.co/microsoft/deberta-v2-xlarge-mnli/resolve/main/spm.model",
+        "microsoft/deberta-v2-xxlarge-mnli": "https://huggingface.co/microsoft/deberta-v2-xxlarge-mnli/resolve/main/spm.model",
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "microsoft/deberta-v2-xlarge": 512,
+    "microsoft/deberta-v2-xxlarge": 512,
+    "microsoft/deberta-v2-xlarge-mnli": 512,
+    "microsoft/deberta-v2-xxlarge-mnli": 512,
+}
+
+PRETRAINED_INIT_CONFIGURATION = {
+    "microsoft/deberta-v2-xlarge": {"do_lower_case": False},
+    "microsoft/deberta-v2-xxlarge": {"do_lower_case": False},
+    "microsoft/deberta-v2-xlarge-mnli": {"do_lower_case": False},
+    "microsoft/deberta-v2-xxlarge-mnli": {"do_lower_case": False},
+}
+
+VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}
+
+
+class DebertaV2Tokenizer(PreTrainedTokenizer):
+    r"""
+    Constructs a DeBERTa-v2 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+
+    Args:
+        vocab_file (:obj:`str`):
+            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether or not to lowercase the input when tokenizing.
+        unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(
+        self,
+        vocab_file,
+        do_lower_case=False,
+        split_by_punct=False,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        **kwargs
+    ):
+        super().__init__(
+            do_lower_case=do_lower_case,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            split_by_punct=split_by_punct,
+            **kwargs,
+        )
+
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
+                "model use `tokenizer = DebertaV2Tokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
+            )
+        self.do_lower_case = do_lower_case
+        self.split_by_punct = split_by_punct
+        self._tokenizer = SPMTokenizer(vocab_file, split_by_punct=split_by_punct)
+
+    @property
+    def vocab_size(self):
+        return len(self.vocab)
+
+    @property
+    def vocab(self):
+        return self._tokenizer.vocab
+
+    def get_vocab(self):
+        vocab = self.vocab.copy()
+        vocab.update(self.get_added_vocab())
+        return vocab
+
+    def _tokenize(self, text):
+        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
+        if self.do_lower_case:
+            text = text.lower()
+        return self._tokenizer.tokenize(text)
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str) in an id using the vocab. """
+        return self._tokenizer.spm.PieceToId(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self._tokenizer.spm.IdToPiece(index) if index < self.vocab_size else self.unk_token
+
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) in a single string. """
+        return self._tokenizer.decode(tokens)
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A DeBERTa sequence has the following format:
+
+        - single sequence: [CLS] X [SEP]
+        - pair of sequences: [CLS] A [SEP] B [SEP]
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+        """
+
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formatted with special tokens for the model."
+                )
+            return list(
+                map(
+                    lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
+                    token_ids_0,
+                )
+            )
+
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
+
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
+        sequence pair mask has the following format:
+
+        ::
+
+            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+            | first sequence    | second sequence |
+
+        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            sequence(s).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+        add_prefix_space = kwargs.pop("add_prefix_space", False)
+        if is_split_into_words or add_prefix_space:
+            text = " " + text
+        return (text, kwargs)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        return self._tokenizer.save_pretrained(save_directory, filename_prefix=filename_prefix)
+
+
+class SPMTokenizer:
+    def __init__(self, vocab_file, split_by_punct=False):
+        self.split_by_punct = split_by_punct
+        self.vocab_file = vocab_file
+        spm = sp.SentencePieceProcessor()
+        assert os.path.exists(vocab_file)
+        spm.load(vocab_file)
+        bpe_vocab_size = spm.GetPieceSize()
+        # Token map
+        # <unk> 0+1
+        # <s> 1+1
+        # </s> 2+1
+        self.vocab = {spm.IdToPiece(i): i for i in range(bpe_vocab_size)}
+        self.id_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]
+        # self.vocab['[PAD]'] = 0
+        # self.vocab['[CLS]'] = 1
+        # self.vocab['[SEP]'] = 2
+        # self.vocab['[UNK]'] = 3
+
+        self.spm = spm
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["spm"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+        self.spm = sp.SentencePieceProcessor()
+        self.spm.Load(self.vocab_file)
+
+    def tokenize(self, text):
+        pieces = self._encode_as_pieces(text)
+
+        def _norm(x):
+            if x not in self.vocab or x == "<unk>":
+                return "[UNK]"
+            else:
+                return x
+
+        pieces = [_norm(p) for p in pieces]
+        return pieces
+
+    def convert_ids_to_tokens(self, ids):
+        tokens = []
+        for i in ids:
+            tokens.append(self.ids_to_tokens[i])
+        return tokens
+
+    def decode(self, tokens, start=-1, end=-1, raw_text=None):
+        if raw_text is None:
+            return self.spm.decode_pieces([t for t in tokens])
+        else:
+            words = self.split_to_words(raw_text)
+            word_tokens = [self.tokenize(w) for w in words]
+            token2words = [0] * len(tokens)
+            tid = 0
+            for i, w in enumerate(word_tokens):
+                for k, t in enumerate(w):
+                    token2words[tid] = i
+                    tid += 1
+            word_start = token2words[start]
+            word_end = token2words[end] if end < len(tokens) else len(words)
+            text = "".join(words[word_start:word_end])
+            return text
+
+    def add_special_token(self, token):
+        if token not in self.special_tokens:
+            self.special_tokens.append(token)
+            if token not in self.vocab:
+                self.vocab[token] = len(self.vocab) - 1
+                self.id_to_tokens.append(token)
+        return self.id(token)
+
+    def part_of_whole_word(self, token, is_bos=False):
+        if is_bos:
+            return True
+        if (
+            len(token) == 1
+            and (_is_whitespace(list(token)[0]) or _is_control(list(token)[0]) or _is_punctuation(list(token)[0]))
+        ) or token in self.special_tokens:
+            return False
+
+        word_start = b"\xe2\x96\x81".decode("utf-8")
+        return not token.startswith(word_start)
+
+    def pad(self):
+        return "[PAD]"
+
+    def bos(self):
+        return "[CLS]"
+
+    def eos(self):
+        return "[SEP]"
+
+    def unk(self):
+        return "[UNK]"
+
+    def mask(self):
+        return "[MASK]"
+
+    def sym(self, id):
+        return self.ids_to_tokens[id]
+
+    def id(self, sym):
+        return self.vocab[sym] if sym in self.vocab else 1
+
+    def _encode_as_pieces(self, text):
+        text = convert_to_unicode(text)
+        if self.split_by_punct:
+            words = self._run_split_on_punc(text)
+            pieces = [self.spm.encode_as_pieces(w) for w in words]
+            return [p for w in pieces for p in w]
+        else:
+            return self.spm.encode_as_pieces(text)
+
+    def split_to_words(self, text):
+        pieces = self._encode_as_pieces(text)
+        word_start = b"\xe2\x96\x81".decode("utf-8")
+        words = []
+        offset = 0
+        prev_end = 0
+        for i, p in enumerate(pieces):
+            if p.startswith(word_start):
+                if offset > prev_end:
+                    words.append(text[prev_end:offset])
+                prev_end = offset
+                w = p.replace(word_start, "")
+            else:
+                w = p
+            try:
+                s = text.index(w, offset)
+                pn = ""
+                k = i + 1
+                while k < len(pieces):
+                    pn = pieces[k].replace(word_start, "")
+                    if len(pn) > 0:
+                        break
+                    k += 1
+
+                if len(pn) > 0 and pn in text[offset:s]:
+                    offset = offset + 1
+                else:
+                    offset = s + len(w)
+            except Exception:
+                offset = offset + 1
+
+        if prev_end < offset:
+            words.append(text[prev_end:offset])
+
+        return words
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":
+                continue
+            output.append(char)
+        return "".join(output)
+
+    def _run_split_on_punc(self, text):
+        """Splits punctuation on a piece of text."""
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return ["".join(x) for x in output]
+
+    def save_pretrained(self, path: str, filename_prefix: str = None):
+        filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]]
+        if filename_prefix is not None:
+            filename = filename_prefix + "-" + filename
+        full_path = os.path.join(path, filename)
+        with open(full_path, "wb") as fs:
+            fs.write(self.spm.serialized_model_proto())
+        return (full_path,)
+
+
+def _is_whitespace(char):
+    """Checks whether `chars` is a whitespace character."""
+    # \t, \n, and \r are technically contorl characters but we treat them
+    # as whitespace since they are generally considered as such.
+    if char == " " or char == "\t" or char == "\n" or char == "\r":
+        return True
+    cat = unicodedata.category(char)
+    if cat == "Zs":
+        return True
+    return False
+
+
+def _is_control(char):
+    """Checks whether `chars` is a control character."""
+    # These are technically control characters but we count them as whitespace
+    # characters.
+    if char == "\t" or char == "\n" or char == "\r":
+        return False
+    cat = unicodedata.category(char)
+    if cat.startswith("C"):
+        return True
+    return False
+
+
+def _is_punctuation(char):
+    """Checks whether `chars` is a punctuation character."""
+    cp = ord(char)
+    # We treat all non-letter/number ASCII as punctuation.
+    # Characters such as "^", "$", and "`" are not in the Unicode
+    # Punctuation class but we treat them as punctuation anyways, for
+    # consistency.
+    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
+        return True
+    cat = unicodedata.category(char)
+    if cat.startswith("P"):
+        return True
+    return False
+
+
+def convert_to_unicode(text):
+    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+    if six.PY3:
+        if isinstance(text, str):
+            return text
+        elif isinstance(text, bytes):
+            return text.decode("utf-8", "ignore")
+        else:
+            raise ValueError("Unsupported string type: %s" % (type(text)))
+    elif six.PY2:
+        if isinstance(text, str):
+            return text.decode("utf-8", "ignore")
+        else:
+            raise ValueError("Unsupported string type: %s" % (type(text)))
+    else:
+        raise ValueError("Not running on Python2 or Python 3?")
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -883,6 +883,63 @@ class DebertaPreTrainedModel:
        requires_pytorch(self)


+DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class DebertaV2ForMaskedLM:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class DebertaV2ForQuestionAnswering:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class DebertaV2ForSequenceClassification:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class DebertaV2ForTokenClassification:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class DebertaV2Model:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class DebertaV2PreTrainedModel:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
 DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None