Integrate DeBERTa v2(the 1.5B model surpassed human performance on Su… (#10018)
* Integrate DeBERTa v2(the 1.5B model surpassed human performance on SuperGLUE); Add DeBERTa v2 900M,1.5B models; * DeBERTa-v2 * Fix v2 model loading issue (#10129) * Doc members * Update src/transformers/models/deberta/modeling_deberta.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Address Sylvain's comments * Address Patrick's comments Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> * Style Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr> Co-authored-by: Lysandre Debut <lysandre@huggingface.co> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
This commit is contained in:
@@ -157,6 +157,7 @@ _import_structure = {
|
||||
"models.camembert": ["CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "CamembertConfig"],
|
||||
"models.ctrl": ["CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CTRLConfig", "CTRLTokenizer"],
|
||||
"models.deberta": ["DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig", "DebertaTokenizer"],
|
||||
"models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config", "DebertaV2Tokenizer"],
|
||||
"models.distilbert": ["DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DistilBertConfig", "DistilBertTokenizer"],
|
||||
"models.dpr": [
|
||||
"DPR_PRETRAINED_CONFIG_ARCHIVE_MAP",
|
||||
@@ -515,6 +516,17 @@ if is_torch_available():
|
||||
"DebertaForQuestionAnswering",
|
||||
]
|
||||
)
|
||||
_import_structure["models.deberta_v2"].extend(
|
||||
[
|
||||
"DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST",
|
||||
"DebertaV2ForSequenceClassification",
|
||||
"DebertaV2Model",
|
||||
"DebertaV2ForMaskedLM",
|
||||
"DebertaV2PreTrainedModel",
|
||||
"DebertaV2ForTokenClassification",
|
||||
"DebertaV2ForQuestionAnswering",
|
||||
]
|
||||
)
|
||||
_import_structure["models.distilbert"].extend(
|
||||
[
|
||||
"DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
|
||||
@@ -1287,6 +1299,7 @@ if TYPE_CHECKING:
|
||||
from .models.convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertTokenizer
|
||||
from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer
|
||||
from .models.deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig, DebertaTokenizer
|
||||
from .models.deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config, DebertaV2Tokenizer
|
||||
from .models.distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig, DistilBertTokenizer
|
||||
from .models.dpr import (
|
||||
DPR_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
@@ -1604,6 +1617,15 @@ if TYPE_CHECKING:
|
||||
DebertaModel,
|
||||
DebertaPreTrainedModel,
|
||||
)
|
||||
from .models.deberta_v2 import (
|
||||
DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
DebertaV2ForMaskedLM,
|
||||
DebertaV2ForQuestionAnswering,
|
||||
DebertaV2ForSequenceClassification,
|
||||
DebertaV2ForTokenClassification,
|
||||
DebertaV2Model,
|
||||
DebertaV2PreTrainedModel,
|
||||
)
|
||||
from .models.distilbert import (
|
||||
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
DistilBertForMaskedLM,
|
||||
|
||||
@@ -31,6 +31,7 @@ from ..camembert.configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCH
|
||||
from ..convbert.configuration_convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig
|
||||
from ..ctrl.configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig
|
||||
from ..deberta.configuration_deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig
|
||||
from ..deberta_v2.configuration_deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config
|
||||
from ..distilbert.configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig
|
||||
from ..dpr.configuration_dpr import DPR_PRETRAINED_CONFIG_ARCHIVE_MAP, DPRConfig
|
||||
from ..electra.configuration_electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig
|
||||
@@ -103,6 +104,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
|
||||
LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
DPR_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
@@ -138,6 +140,7 @@ CONFIG_MAPPING = OrderedDict(
|
||||
("reformer", ReformerConfig),
|
||||
("longformer", LongformerConfig),
|
||||
("roberta", RobertaConfig),
|
||||
("deberta-v2", DebertaV2Config),
|
||||
("deberta", DebertaConfig),
|
||||
("flaubert", FlaubertConfig),
|
||||
("fsmt", FSMTConfig),
|
||||
@@ -199,6 +202,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
|
||||
("encoder-decoder", "Encoder decoder"),
|
||||
("funnel", "Funnel Transformer"),
|
||||
("lxmert", "LXMERT"),
|
||||
("deberta-v2", "DeBERTa-v2"),
|
||||
("deberta", "DeBERTa"),
|
||||
("layoutlm", "LayoutLM"),
|
||||
("dpr", "DPR"),
|
||||
@@ -366,7 +370,6 @@ class AutoConfig:
|
||||
{'foo': False}
|
||||
"""
|
||||
config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
if "model_type" in config_dict:
|
||||
config_class = CONFIG_MAPPING[config_dict["model_type"]]
|
||||
return config_class.from_dict(config_dict, **kwargs)
|
||||
|
||||
@@ -84,6 +84,13 @@ from ..deberta.modeling_deberta import (
|
||||
DebertaForTokenClassification,
|
||||
DebertaModel,
|
||||
)
|
||||
from ..deberta_v2.modeling_deberta_v2 import (
|
||||
DebertaV2ForMaskedLM,
|
||||
DebertaV2ForQuestionAnswering,
|
||||
DebertaV2ForSequenceClassification,
|
||||
DebertaV2ForTokenClassification,
|
||||
DebertaV2Model,
|
||||
)
|
||||
from ..distilbert.modeling_distilbert import (
|
||||
DistilBertForMaskedLM,
|
||||
DistilBertForMultipleChoice,
|
||||
@@ -254,6 +261,7 @@ from .configuration_auto import (
|
||||
ConvBertConfig,
|
||||
CTRLConfig,
|
||||
DebertaConfig,
|
||||
DebertaV2Config,
|
||||
DistilBertConfig,
|
||||
DPRConfig,
|
||||
ElectraConfig,
|
||||
@@ -332,6 +340,7 @@ MODEL_MAPPING = OrderedDict(
|
||||
(LxmertConfig, LxmertModel),
|
||||
(BertGenerationConfig, BertGenerationEncoder),
|
||||
(DebertaConfig, DebertaModel),
|
||||
(DebertaV2Config, DebertaV2Model),
|
||||
(DPRConfig, DPRQuestionEncoder),
|
||||
(XLMProphetNetConfig, XLMProphetNetModel),
|
||||
(ProphetNetConfig, ProphetNetModel),
|
||||
@@ -408,6 +417,7 @@ MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
|
||||
(MPNetConfig, MPNetForMaskedLM),
|
||||
(TapasConfig, TapasForMaskedLM),
|
||||
(DebertaConfig, DebertaForMaskedLM),
|
||||
(DebertaV2Config, DebertaV2ForMaskedLM),
|
||||
]
|
||||
)
|
||||
|
||||
@@ -465,6 +475,7 @@ MODEL_FOR_MASKED_LM_MAPPING = OrderedDict(
|
||||
(MPNetConfig, MPNetForMaskedLM),
|
||||
(TapasConfig, TapasForMaskedLM),
|
||||
(DebertaConfig, DebertaForMaskedLM),
|
||||
(DebertaV2Config, DebertaV2ForMaskedLM),
|
||||
]
|
||||
)
|
||||
|
||||
@@ -510,6 +521,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
|
||||
(ElectraConfig, ElectraForSequenceClassification),
|
||||
(FunnelConfig, FunnelForSequenceClassification),
|
||||
(DebertaConfig, DebertaForSequenceClassification),
|
||||
(DebertaV2Config, DebertaV2ForSequenceClassification),
|
||||
(GPT2Config, GPT2ForSequenceClassification),
|
||||
(OpenAIGPTConfig, OpenAIGPTForSequenceClassification),
|
||||
(ReformerConfig, ReformerForSequenceClassification),
|
||||
@@ -545,6 +557,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
|
||||
(LxmertConfig, LxmertForQuestionAnswering),
|
||||
(MPNetConfig, MPNetForQuestionAnswering),
|
||||
(DebertaConfig, DebertaForQuestionAnswering),
|
||||
(DebertaV2Config, DebertaV2ForQuestionAnswering),
|
||||
]
|
||||
)
|
||||
|
||||
@@ -577,6 +590,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
|
||||
(FunnelConfig, FunnelForTokenClassification),
|
||||
(MPNetConfig, MPNetForTokenClassification),
|
||||
(DebertaConfig, DebertaForTokenClassification),
|
||||
(DebertaV2Config, DebertaV2ForTokenClassification),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@@ -66,6 +66,7 @@ from .configuration_auto import (
|
||||
ConvBertConfig,
|
||||
CTRLConfig,
|
||||
DebertaConfig,
|
||||
DebertaV2Config,
|
||||
DistilBertConfig,
|
||||
DPRConfig,
|
||||
ElectraConfig,
|
||||
@@ -108,6 +109,7 @@ if is_sentencepiece_available():
|
||||
from ..barthez.tokenization_barthez import BarthezTokenizer
|
||||
from ..bert_generation.tokenization_bert_generation import BertGenerationTokenizer
|
||||
from ..camembert.tokenization_camembert import CamembertTokenizer
|
||||
from ..deberta_v2.tokenization_deberta_v2 import DebertaV2Tokenizer
|
||||
from ..marian.tokenization_marian import MarianTokenizer
|
||||
from ..mbart.tokenization_mbart import MBartTokenizer
|
||||
from ..mt5 import MT5Tokenizer
|
||||
@@ -122,6 +124,7 @@ else:
|
||||
BarthezTokenizer = None
|
||||
BertGenerationTokenizer = None
|
||||
CamembertTokenizer = None
|
||||
DebertaV2Tokenizer = None
|
||||
MarianTokenizer = None
|
||||
MBartTokenizer = None
|
||||
MT5Tokenizer = None
|
||||
@@ -233,6 +236,7 @@ TOKENIZER_MAPPING = OrderedDict(
|
||||
(FSMTConfig, (FSMTTokenizer, None)),
|
||||
(BertGenerationConfig, (BertGenerationTokenizer, None)),
|
||||
(DebertaConfig, (DebertaTokenizer, None)),
|
||||
(DebertaV2Config, (DebertaV2Tokenizer, None)),
|
||||
(RagConfig, (RagTokenizer, None)),
|
||||
(XLMProphetNetConfig, (XLMProphetNetTokenizer, None)),
|
||||
(ProphetNetConfig, (ProphetNetTokenizer, None)),
|
||||
|
||||
@@ -23,6 +23,10 @@ logger = logging.get_logger(__name__)
|
||||
DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
"microsoft/deberta-base": "https://huggingface.co/microsoft/deberta-base/resolve/main/config.json",
|
||||
"microsoft/deberta-large": "https://huggingface.co/microsoft/deberta-large/resolve/main/config.json",
|
||||
"microsoft/deberta-xlarge": "https://huggingface.co/microsoft/deberta-xlarge/resolve/main/config.json",
|
||||
"microsoft/deberta-base-mnli": "https://huggingface.co/microsoft/deberta-base-mnli/resolve/main/config.json",
|
||||
"microsoft/deberta-large-mnli": "https://huggingface.co/microsoft/deberta-large-mnli/resolve/main/config.json",
|
||||
"microsoft/deberta-xlarge-mnli": "https://huggingface.co/microsoft/deberta-xlarge-mnli/resolve/main/config.json",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -18,7 +18,6 @@ import math
|
||||
from collections.abc import Sequence
|
||||
|
||||
import torch
|
||||
from packaging import version
|
||||
from torch import _softmax_backward_data, nn
|
||||
from torch.nn import CrossEntropyLoss
|
||||
|
||||
@@ -40,10 +39,15 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
_CONFIG_FOR_DOC = "DebertaConfig"
|
||||
_TOKENIZER_FOR_DOC = "DebertaTokenizer"
|
||||
_CHECKPOINT_FOR_DOC = "microsoft/deberta-base"
|
||||
|
||||
DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
||||
"microsoft/deberta-base",
|
||||
"microsoft/deberta-large",
|
||||
"microsoft/deberta-xlarge",
|
||||
"microsoft/deberta-base-mnli",
|
||||
"microsoft/deberta-large-mnli",
|
||||
"microsoft/deberta-xlarge-mnli",
|
||||
]
|
||||
|
||||
|
||||
@@ -54,7 +58,7 @@ class ContextPooler(nn.Module):
|
||||
self.dropout = StableDropout(config.pooler_dropout)
|
||||
self.config = config
|
||||
|
||||
def forward(self, hidden_states, mask=None):
|
||||
def forward(self, hidden_states):
|
||||
# We "pool" the model by simply taking the hidden state corresponding
|
||||
# to the first token.
|
||||
|
||||
@@ -74,27 +78,28 @@ class XSoftmax(torch.autograd.Function):
|
||||
Masked Softmax which is optimized for saving memory
|
||||
|
||||
Args:
|
||||
input (:obj:`torch.tensor`): The input tensor that will apply softmax.
|
||||
mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
|
||||
dim (int): The dimension that will apply softmax
|
||||
input (:obj:`torch.tensor`): The input tensor that will apply softmax.
|
||||
mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
|
||||
dim (int): The dimension that will apply softmax
|
||||
|
||||
Example::
|
||||
import torch
|
||||
from transformers.models.deberta import XSoftmax
|
||||
# Make a tensor
|
||||
x = torch.randn([4,20,100])
|
||||
# Create a mask
|
||||
mask = (x>0).int()
|
||||
y = XSoftmax.apply(x, mask, dim=-1)
|
||||
|
||||
>>> import torch
|
||||
>>> from transformers.models.deberta.modeling_deberta import XSoftmax
|
||||
|
||||
>>> # Make a tensor
|
||||
>>> x = torch.randn([4,20,100])
|
||||
|
||||
>>> # Create a mask
|
||||
>>> mask = (x>0).int()
|
||||
|
||||
>>> y = XSoftmax.apply(x, mask, dim=-1)
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def forward(self, input, mask, dim):
|
||||
self.dim = dim
|
||||
if version.Version(torch.__version__) >= version.Version("1.2.0a"):
|
||||
rmask = ~(mask.bool())
|
||||
else:
|
||||
rmask = (1 - mask).byte() # This line is not supported by Onnx tracing.
|
||||
rmask = ~(mask.bool())
|
||||
|
||||
output = input.masked_fill(rmask, float("-inf"))
|
||||
output = torch.softmax(output, self.dim)
|
||||
@@ -127,10 +132,7 @@ def get_mask(input, local_context):
|
||||
mask = local_context.mask if local_context.reuse_mask else None
|
||||
|
||||
if dropout > 0 and mask is None:
|
||||
if version.Version(torch.__version__) >= version.Version("1.2.0a"):
|
||||
mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).bool()
|
||||
else:
|
||||
mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).byte()
|
||||
mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).bool()
|
||||
|
||||
if isinstance(local_context, DropoutContext):
|
||||
if local_context.mask is None:
|
||||
@@ -166,9 +168,7 @@ class StableDropout(torch.nn.Module):
|
||||
Optimized dropout module for stabilizing the training
|
||||
|
||||
Args:
|
||||
|
||||
drop_prob (float): the dropout probabilities
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, drop_prob):
|
||||
@@ -183,8 +183,6 @@ class StableDropout(torch.nn.Module):
|
||||
|
||||
Args:
|
||||
x (:obj:`torch.tensor`): The input tensor to apply dropout
|
||||
|
||||
|
||||
"""
|
||||
if self.training and self.drop_prob > 0:
|
||||
return XDropout.apply(x, self.get_context())
|
||||
@@ -302,7 +300,7 @@ class DebertaIntermediate(nn.Module):
|
||||
|
||||
class DebertaOutput(nn.Module):
|
||||
def __init__(self, config):
|
||||
super(DebertaOutput, self).__init__()
|
||||
super().__init__()
|
||||
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
|
||||
self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps)
|
||||
self.dropout = StableDropout(config.hidden_dropout_prob)
|
||||
@@ -317,7 +315,7 @@ class DebertaOutput(nn.Module):
|
||||
|
||||
class DebertaLayer(nn.Module):
|
||||
def __init__(self, config):
|
||||
super(DebertaLayer, self).__init__()
|
||||
super().__init__()
|
||||
self.attention = DebertaAttention(config)
|
||||
self.intermediate = DebertaIntermediate(config)
|
||||
self.output = DebertaOutput(config)
|
||||
@@ -701,7 +699,6 @@ class DebertaEmbeddings(nn.Module):
|
||||
self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False)
|
||||
self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps)
|
||||
self.dropout = StableDropout(config.hidden_dropout_prob)
|
||||
self.output_to_half = False
|
||||
self.config = config
|
||||
|
||||
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
||||
@@ -763,6 +760,11 @@ class DebertaPreTrainedModel(PreTrainedModel):
|
||||
config_class = DebertaConfig
|
||||
base_model_prefix = "deberta"
|
||||
_keys_to_ignore_on_load_missing = ["position_ids"]
|
||||
_keys_to_ignore_on_load_unexpected = ["position_embeddings"]
|
||||
|
||||
def __init__(self, config):
    super().__init__(config)
    # Register `_pre_load_hook` as a load-state-dict pre-hook, so the hook can inspect (and prune) the incoming
    # checkpoint state dict before its tensors are copied into this model.
    self._register_load_state_dict_pre_hook(self._pre_load_hook)
|
||||
|
||||
def _init_weights(self, module):
|
||||
""" Initialize the weights """
|
||||
@@ -773,6 +775,25 @@ class DebertaPreTrainedModel(PreTrainedModel):
|
||||
if isinstance(module, nn.Linear) and module.bias is not None:
|
||||
module.bias.data.zero_()
|
||||
|
||||
def _pre_load_hook(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
    """
    Drops the checkpoint's classifier parameters when their shape does not match this model's classifier.

    ``state_dict`` is modified in place: ``classifier.weight`` (and ``classifier.bias`` when present) are removed
    from it if both the model and the checkpoint define ``classifier.weight`` with different sizes. The remaining
    arguments are accepted to satisfy the hook signature and are not used.
    """
    self_state = self.state_dict()

    # Nothing to reconcile unless both sides actually carry a classifier weight.
    if "classifier.weight" not in self_state or "classifier.weight" not in state_dict:
        return

    # Matching shapes: the checkpoint head can be loaded as is.
    if self_state["classifier.weight"].size() == state_dict["classifier.weight"].size():
        return

    logger.warning(
        f"The checkpoint classifier head has a shape {state_dict['classifier.weight'].size()} and this model "
        f"classifier head has a shape {self_state['classifier.weight'].size()}. Ignoring the checkpoint "
        f"weights. You should train your model on new data."
    )
    state_dict.pop("classifier.weight")
    # The bias is optional in the checkpoint; discard it together with the weight when it exists.
    state_dict.pop("classifier.bias", None)
|
||||
|
||||
|
||||
DEBERTA_START_DOCSTRING = r"""
|
||||
The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
|
||||
@@ -867,7 +888,7 @@ class DebertaModel(DebertaPreTrainedModel):
|
||||
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
tokenizer_class=_TOKENIZER_FOR_DOC,
|
||||
checkpoint="microsoft/deberta-base",
|
||||
checkpoint=_CHECKPOINT_FOR_DOC,
|
||||
output_type=SequenceClassifierOutput,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
)
|
||||
@@ -953,7 +974,6 @@ class DebertaModel(DebertaPreTrainedModel):
|
||||
|
||||
@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top. """, DEBERTA_START_DOCSTRING)
|
||||
class DebertaForMaskedLM(DebertaPreTrainedModel):
|
||||
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
|
||||
|
||||
@@ -974,7 +994,7 @@ class DebertaForMaskedLM(DebertaPreTrainedModel):
|
||||
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
tokenizer_class=_TOKENIZER_FOR_DOC,
|
||||
checkpoint="microsoft/deberta-base",
|
||||
checkpoint=_CHECKPOINT_FOR_DOC,
|
||||
output_type=MaskedLMOutput,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
)
|
||||
@@ -1114,7 +1134,7 @@ class DebertaForSequenceClassification(DebertaPreTrainedModel):
|
||||
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
tokenizer_class=_TOKENIZER_FOR_DOC,
|
||||
checkpoint="microsoft/deberta-base",
|
||||
checkpoint=_CHECKPOINT_FOR_DOC,
|
||||
output_type=SequenceClassifierOutput,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
)
|
||||
@@ -1194,7 +1214,6 @@ class DebertaForSequenceClassification(DebertaPreTrainedModel):
|
||||
DEBERTA_START_DOCSTRING,
|
||||
)
|
||||
class DebertaForTokenClassification(DebertaPreTrainedModel):
|
||||
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
|
||||
def __init__(self, config):
|
||||
@@ -1210,7 +1229,7 @@ class DebertaForTokenClassification(DebertaPreTrainedModel):
|
||||
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
tokenizer_class=_TOKENIZER_FOR_DOC,
|
||||
checkpoint="microsoft/deberta-base",
|
||||
checkpoint=_CHECKPOINT_FOR_DOC,
|
||||
output_type=TokenClassifierOutput,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
)
|
||||
@@ -1283,7 +1302,6 @@ class DebertaForTokenClassification(DebertaPreTrainedModel):
|
||||
DEBERTA_START_DOCSTRING,
|
||||
)
|
||||
class DebertaForQuestionAnswering(DebertaPreTrainedModel):
|
||||
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
|
||||
def __init__(self, config):
|
||||
@@ -1298,7 +1316,7 @@ class DebertaForQuestionAnswering(DebertaPreTrainedModel):
|
||||
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
tokenizer_class=_TOKENIZER_FOR_DOC,
|
||||
checkpoint="microsoft/deberta-base",
|
||||
checkpoint=_CHECKPOINT_FOR_DOC,
|
||||
output_type=QuestionAnsweringModelOutput,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
)
|
||||
|
||||
@@ -44,12 +44,20 @@ PRETRAINED_VOCAB_FILES_MAP = {
|
||||
"vocab_file": {
|
||||
"microsoft/deberta-base": "https://huggingface.co/microsoft/deberta-base/resolve/main/bpe_encoder.bin",
|
||||
"microsoft/deberta-large": "https://huggingface.co/microsoft/deberta-large/resolve/main/bpe_encoder.bin",
|
||||
"microsoft/deberta-xlarge": "https://huggingface.co/microsoft/deberta-xlarge/resolve/main/bpe_encoder.bin",
|
||||
"microsoft/deberta-base-mnli": "https://huggingface.co/microsoft/deberta-base-mnli/resolve/main/bpe_encoder.bin",
|
||||
"microsoft/deberta-large-mnli": "https://huggingface.co/microsoft/deberta-large-mnli/resolve/main/bpe_encoder.bin",
|
||||
"microsoft/deberta-xlarge-mnli": "https://huggingface.co/microsoft/deberta-xlarge-mnli/resolve/main/bpe_encoder.bin",
|
||||
}
|
||||
}
|
||||
|
||||
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
"microsoft/deberta-base": 512,
|
||||
"microsoft/deberta-large": 512,
|
||||
"microsoft/deberta-xlarge": 512,
|
||||
"microsoft/deberta-base-mnli": 512,
|
||||
"microsoft/deberta-large-mnli": 512,
|
||||
"microsoft/deberta-xlarge-mnli": 512,
|
||||
}
|
||||
|
||||
PRETRAINED_INIT_CONFIGURATION = {
|
||||
|
||||
72
src/transformers/models/deberta_v2/__init__.py
Normal file
72
src/transformers/models/deberta_v2/__init__.py
Normal file
@@ -0,0 +1,72 @@
|
||||
# flake8: noqa
|
||||
# There's no way to ignore "F401 '...' imported but unused" warnings in this
|
||||
# module, but to preserve other warnings. So, don't check this module at all.
|
||||
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from ...file_utils import _BaseLazyModule, is_torch_available
|
||||
|
||||
|
||||
_import_structure = {
|
||||
"configuration_deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config"],
|
||||
"tokenization_deberta_v2": ["DebertaV2Tokenizer"],
|
||||
}
|
||||
|
||||
# The modeling objects are only registered when PyTorch is installed.
if is_torch_available():
    # These names must be the ones the TYPE_CHECKING branch below imports from `modeling_deberta_v2`;
    # the archive list is the V2-specific `DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST`, not the DeBERTa v1 name.
    _import_structure["modeling_deberta_v2"] = [
        "DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST",
        "DebertaV2ForSequenceClassification",
        "DebertaV2Model",
        "DebertaV2ForMaskedLM",
        "DebertaV2PreTrainedModel",
        "DebertaV2ForTokenClassification",
        "DebertaV2ForQuestionAnswering",
    ]
|
||||
|
||||
|
||||
# `typing.TYPE_CHECKING` is False at runtime, so the direct imports below are only seen by static type checkers;
# at runtime the `else` branch installs a lazy module instead.
if TYPE_CHECKING:
    from .configuration_deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config
    from .tokenization_deberta_v2 import DebertaV2Tokenizer

    if is_torch_available():
        from .modeling_deberta_v2 import (
            DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST,
            DebertaV2ForMaskedLM,
            DebertaV2ForQuestionAnswering,
            DebertaV2ForSequenceClassification,
            DebertaV2ForTokenClassification,
            DebertaV2Model,
            DebertaV2PreTrainedModel,
        )

else:
    import importlib
    import os
    import sys

    class _LazyModule(_BaseLazyModule):
        """
        Module class that surfaces all objects but only performs associated imports when the objects are requested.
        """

        # Reuse this package's own file location for the replacement module object.
        __file__ = globals()["__file__"]
        __path__ = [os.path.dirname(__file__)]

        def _get_module(self, module_name: str):
            # Relative import of the submodule `module_name`, resolved against this package's name.
            return importlib.import_module("." + module_name, self.__name__)

    # Replace this module's entry in `sys.modules` with the lazy module built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, _import_structure)
|
||||
138
src/transformers/models/deberta_v2/configuration_deberta_v2.py
Normal file
138
src/transformers/models/deberta_v2/configuration_deberta_v2.py
Normal file
@@ -0,0 +1,138 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020, Microsoft and the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" DeBERTa-v2 model configuration """
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
"microsoft/deberta-v2-xlarge": "https://huggingface.co/microsoft/deberta-v2-xlarge/resolve/main/config.json",
|
||||
"microsoft/deberta-v2-xxlarge": "https://huggingface.co/microsoft/deberta-v2-xxlarge/resolve/main/config.json",
|
||||
"microsoft/deberta-v2-xlarge-mnli": "https://huggingface.co/microsoft/deberta-v2-xlarge-mnli/resolve/main/config.json",
|
||||
"microsoft/deberta-v2-xxlarge-mnli": "https://huggingface.co/microsoft/deberta-v2-xxlarge-mnli/resolve/main/config.json",
|
||||
}
|
||||
|
||||
|
||||
class DebertaV2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.DebertaV2Model`. It is used
    to instantiate a DeBERTa-v2 model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the DeBERTa
    `microsoft/deberta-v2-xlarge <https://huggingface.co/microsoft/deberta-v2-xlarge>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

    Arguments:
        vocab_size (:obj:`int`, `optional`, defaults to 128100):
            Vocabulary size of the DeBERTa-v2 model. Defines the number of different tokens that can be represented by
            the :obj:`inputs_ids` passed when calling :class:`~transformers.DebertaV2Model`.
        hidden_size (:obj:`int`, `optional`, defaults to 1536):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (:obj:`int`, `optional`, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (:obj:`int`, `optional`, defaults to 24):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (:obj:`int`, `optional`, defaults to 6144):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`, :obj:`"mish"`,
            :obj:`"linear"`, :obj:`"sigmoid"` and :obj:`"gelu_new"` are supported.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size (:obj:`int`, `optional`, defaults to 0):
            The vocabulary size of the :obj:`token_type_ids` passed when calling
            :class:`~transformers.DebertaV2Model`.
        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-7):
            The epsilon used by the layer normalization layers.
        relative_attention (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether use relative position encoding.
        max_relative_positions (:obj:`int`, `optional`, defaults to -1):
            The range of relative positions :obj:`[-max_position_embeddings, max_position_embeddings]`. Use the same
            value as :obj:`max_position_embeddings`.
        pad_token_id (:obj:`int`, `optional`, defaults to 0):
            The value used to pad input_ids.
        position_biased_input (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether add absolute position embedding to content embedding.
        pos_att_type (:obj:`List[str]`, `optional`):
            The type of relative position attention, it can be a combination of :obj:`["p2c", "c2p", "p2p"]`, e.g.
            :obj:`["p2c"]`, :obj:`["p2c", "c2p"]`, :obj:`["p2c", "c2p", "p2p"]`. For backwards compatibility a single
            string with the types separated by :obj:`"|"` (e.g. :obj:`"p2c|c2p"`) is also accepted and is converted
            to a lower-cased list.
        pooler_dropout (:obj:`float`, `optional`, defaults to 0):
            The dropout ratio stored for the pooler.
        pooler_hidden_act (:obj:`str`, `optional`, defaults to :obj:`"gelu"`):
            The activation function name stored for the pooler.
        pooler_hidden_size (:obj:`int`, `optional`):
            Read from the extra keyword arguments; falls back to :obj:`hidden_size` when it is not given.
    """
    model_type = "deberta-v2"

    def __init__(
        self,
        vocab_size=128100,
        hidden_size=1536,
        num_hidden_layers=24,
        num_attention_heads=24,
        intermediate_size=6144,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=0,
        initializer_range=0.02,
        layer_norm_eps=1e-7,
        relative_attention=False,
        max_relative_positions=-1,
        pad_token_id=0,
        position_biased_input=True,
        pos_att_type=None,
        pooler_dropout=0,
        pooler_hidden_act="gelu",
        **kwargs
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.relative_attention = relative_attention
        self.max_relative_positions = max_relative_positions
        self.pad_token_id = pad_token_id
        self.position_biased_input = position_biased_input

        # Backwards compatibility: older configs store the attention types as one "|"-separated string.
        if isinstance(pos_att_type, str):
            pos_att_type = [x.strip() for x in pos_att_type.lower().split("|")]

        self.pos_att_type = pos_att_type
        self.vocab_size = vocab_size
        self.layer_norm_eps = layer_norm_eps

        # The pooler width is not a named argument; it defaults to the encoder's hidden size.
        self.pooler_hidden_size = kwargs.get("pooler_hidden_size", hidden_size)
        self.pooler_dropout = pooler_dropout
        self.pooler_hidden_act = pooler_hidden_act
|
||||
1516
src/transformers/models/deberta_v2/modeling_deberta_v2.py
Normal file
1516
src/transformers/models/deberta_v2/modeling_deberta_v2.py
Normal file
File diff suppressed because it is too large
Load Diff
491
src/transformers/models/deberta_v2/tokenization_deberta_v2.py
Normal file
491
src/transformers/models/deberta_v2/tokenization_deberta_v2.py
Normal file
@@ -0,0 +1,491 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 Microsoft and the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Tokenization class for model DeBERTa-v2."""
|
||||
|
||||
import os
|
||||
import unicodedata
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import sentencepiece as sp
|
||||
import six
|
||||
|
||||
from ...tokenization_utils import PreTrainedTokenizer
|
||||
|
||||
|
||||
# Checkpoint name -> URL of the `spm.model` vocabulary file published for that checkpoint.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "microsoft/deberta-v2-xlarge": "https://huggingface.co/microsoft/deberta-v2-xlarge/resolve/main/spm.model",
        "microsoft/deberta-v2-xxlarge": "https://huggingface.co/microsoft/deberta-v2-xxlarge/resolve/main/spm.model",
        "microsoft/deberta-v2-xlarge-mnli": "https://huggingface.co/microsoft/deberta-v2-xlarge-mnli/resolve/main/spm.model",
        "microsoft/deberta-v2-xxlarge-mnli": "https://huggingface.co/microsoft/deberta-v2-xxlarge-mnli/resolve/main/spm.model",
    }
}

# Checkpoint name -> size exposed by the tokenizer class as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "microsoft/deberta-v2-xlarge": 512,
    "microsoft/deberta-v2-xxlarge": 512,
    "microsoft/deberta-v2-xlarge-mnli": 512,
    "microsoft/deberta-v2-xxlarge-mnli": 512,
}

# Checkpoint name -> default tokenizer constructor arguments (`pretrained_init_configuration`).
PRETRAINED_INIT_CONFIGURATION = {
    "microsoft/deberta-v2-xlarge": {"do_lower_case": False},
    "microsoft/deberta-v2-xxlarge": {"do_lower_case": False},
    "microsoft/deberta-v2-xlarge-mnli": {"do_lower_case": False},
    "microsoft/deberta-v2-xxlarge-mnli": {"do_lower_case": False},
}

# File name under which the vocabulary is stored; also used when the vocabulary is saved.
VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}
|
||||
|
||||
|
||||
class DebertaV2Tokenizer(PreTrainedTokenizer):
    r"""
    Constructs a DeBERTa-v2 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.

    Args:
        vocab_file (:obj:`str`):
            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to lowercase the input when tokenizing.
        split_by_punct (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to split the input on punctuation characters before each resulting chunk is encoded
            with SentencePiece.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        vocab_file,
        do_lower_case=False,
        split_by_punct=False,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs
    ):
        super().__init__(
            do_lower_case=do_lower_case,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            split_by_punct=split_by_punct,
            **kwargs,
        )

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = DebertaV2Tokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
            )
        self.do_lower_case = do_lower_case
        self.split_by_punct = split_by_punct
        # All SentencePiece work is delegated to this helper object.
        self._tokenizer = SPMTokenizer(vocab_file, split_by_punct=split_by_punct)

    @property
    def vocab_size(self):
        """Number of entries in the underlying SentencePiece vocabulary."""
        return len(self.vocab)

    @property
    def vocab(self):
        """The piece -> id mapping held by the wrapped :class:`SPMTokenizer`."""
        return self._tokenizer.vocab

    def get_vocab(self):
        """Returns a copy of the base vocabulary extended with the added tokens."""
        vocab = self.vocab.copy()
        vocab.update(self.get_added_vocab())
        return vocab

    def _tokenize(self, text):
        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
        if self.do_lower_case:
            text = text.lower()
        return self._tokenizer.tokenize(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        return self._tokenizer.spm.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        # Indices at or beyond the SentencePiece vocabulary size map to the unknown token.
        return self._tokenizer.spm.IdToPiece(index) if index < self.vocab_size else self.unk_token

    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (string) in a single string. """
        return self._tokenizer.decode(tokens)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A DeBERTa sequence has the following format:

        - single sequence: [CLS] X [SEP]
        - pair of sequences: [CLS] A [SEP] B [SEP]

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """

        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

        Raises:
            :obj:`ValueError`: If :obj:`token_ids_1` is given together with ``already_has_special_tokens=True``.
        """

        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            # Resolve the two special ids once instead of once per token.
            special_ids = (self.sep_token_id, self.cls_token_id)
            return [1 if token_id in special_ids else 0 for token_id in token_ids_0]

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
        sequence pair mask has the following format:

        ::

            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
            | first sequence    | second sequence |

        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
            sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        """
        Prepends a space to :obj:`text` when :obj:`is_split_into_words` is set or when ``add_prefix_space=True`` is
        passed in :obj:`kwargs`.

        Returns:
            :obj:`Tuple[str, dict]`: The (possibly prefixed) text and the remaining keyword arguments, with
            ``add_prefix_space`` removed.
        """
        add_prefix_space = kwargs.pop("add_prefix_space", False)
        if is_split_into_words or add_prefix_space:
            text = " " + text
        return (text, kwargs)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Writes the SentencePiece model into :obj:`save_directory` and returns the path of the written file."""
        return self._tokenizer.save_pretrained(save_directory, filename_prefix=filename_prefix)
|
||||
|
||||
|
||||
class SPMTokenizer:
    """
    Wrapper around a SentencePiece model that adds vocabulary lookups, optional punctuation splitting and
    word-level helpers.

    Args:
        vocab_file (:obj:`str`):
            Path of the SentencePiece model file to load.
        split_by_punct (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to split the text on punctuation before each chunk is encoded.

    Raises:
        :obj:`FileNotFoundError`: If :obj:`vocab_file` does not exist.
    """

    def __init__(self, vocab_file, split_by_punct=False):
        # Validate explicitly: an ``assert`` would be stripped when Python runs with ``-O``.
        if not os.path.exists(vocab_file):
            raise FileNotFoundError("{} does not exist!".format(vocab_file))
        self.split_by_punct = split_by_punct
        self.vocab_file = vocab_file
        spm = sp.SentencePieceProcessor()
        spm.load(vocab_file)
        bpe_vocab_size = spm.GetPieceSize()
        # id -> piece. ``convert_ids_to_tokens``/``sym`` read ``ids_to_tokens``; ``id_to_tokens`` is kept as an
        # alias of the same list for code that used the previous attribute name.
        self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]
        self.id_to_tokens = self.ids_to_tokens
        # piece -> id
        self.vocab = {piece: i for i, piece in enumerate(self.ids_to_tokens)}
        # Tokens registered through ``add_special_token``.
        self.special_tokens = []

        self.spm = spm

    def __getstate__(self):
        """Returns the instance state without the SentencePiece processor, which is reloaded on unpickling."""
        state = self.__dict__.copy()
        state["spm"] = None
        return state

    def __setstate__(self, d):
        """Restores the instance state and reloads the SentencePiece processor from ``self.vocab_file``."""
        self.__dict__ = d
        # Backfill attributes that a state saved by an earlier revision of this class may lack.
        if "ids_to_tokens" not in self.__dict__:
            self.ids_to_tokens = self.__dict__.get("id_to_tokens", [])
        self.__dict__.setdefault("id_to_tokens", self.ids_to_tokens)
        self.__dict__.setdefault("special_tokens", [])
        self.spm = sp.SentencePieceProcessor()
        self.spm.Load(self.vocab_file)

    def tokenize(self, text):
        """Encodes ``text`` into pieces, replacing ``<unk>`` and out-of-vocabulary pieces with ``[UNK]``."""
        pieces = self._encode_as_pieces(text)
        return [p if p in self.vocab and p != "<unk>" else "[UNK]" for p in pieces]

    def convert_ids_to_tokens(self, ids):
        """Maps each id in ``ids`` to its piece."""
        return [self.ids_to_tokens[i] for i in ids]

    def decode(self, tokens, start=-1, end=-1, raw_text=None):
        """
        Converts ``tokens`` back to text.

        Without ``raw_text`` the pieces are decoded by SentencePiece. With ``raw_text``, the text is split into
        words, each word is re-tokenized to map token positions to words, and the words covering the token span
        ``start``..``end`` are joined and returned.
        """
        if raw_text is None:
            return self.spm.decode_pieces(list(tokens))

        words = self.split_to_words(raw_text)
        word_tokens = [self.tokenize(w) for w in words]
        # token2words[t] is the index of the word that token position t belongs to.
        token2words = [0] * len(tokens)
        tid = 0
        for i, w in enumerate(word_tokens):
            for _ in w:
                token2words[tid] = i
                tid += 1
        word_start = token2words[start]
        word_end = token2words[end] if end < len(tokens) else len(words)
        return "".join(words[word_start:word_end])

    def add_special_token(self, token):
        """Registers ``token`` as special, appending it to the vocabulary if it is new, and returns its id."""
        if token not in self.special_tokens:
            self.special_tokens.append(token)
            if token not in self.vocab:
                # The new id must equal the index the token receives in ``ids_to_tokens``; using
                # ``len(self.vocab) - 1`` would collide with the last existing piece.
                self.vocab[token] = len(self.ids_to_tokens)
                self.ids_to_tokens.append(token)
        return self.id(token)

    def part_of_whole_word(self, token, is_bos=False):
        """
        Returns :obj:`True` when ``token`` continues a word, i.e. it does not start with the SentencePiece
        word-start marker. Single whitespace/control/punctuation characters and registered special tokens are
        never a continuation; ``is_bos=True`` always yields :obj:`True`.
        """
        if is_bos:
            return True
        if (
            len(token) == 1 and (_is_whitespace(token[0]) or _is_control(token[0]) or _is_punctuation(token[0]))
        ) or token in self.special_tokens:
            return False

        word_start = b"\xe2\x96\x81".decode("utf-8")
        return not token.startswith(word_start)

    def pad(self):
        """Returns the padding token string."""
        return "[PAD]"

    def bos(self):
        """Returns the beginning-of-sequence token string."""
        return "[CLS]"

    def eos(self):
        """Returns the end-of-sequence token string."""
        return "[SEP]"

    def unk(self):
        """Returns the unknown token string."""
        return "[UNK]"

    def mask(self):
        """Returns the mask token string."""
        return "[MASK]"

    def sym(self, id):
        """Returns the piece stored at index ``id``."""
        return self.ids_to_tokens[id]

    def id(self, sym):
        """Returns the id of ``sym``, or ``1`` when it is not in the vocabulary."""
        return self.vocab.get(sym, 1)

    def _encode_as_pieces(self, text):
        """Encodes ``text`` with SentencePiece, chunk by chunk when ``split_by_punct`` is enabled."""
        text = convert_to_unicode(text)
        if self.split_by_punct:
            words = self._run_split_on_punc(text)
            return [p for w in words for p in self.spm.encode_as_pieces(w)]
        return self.spm.encode_as_pieces(text)

    def split_to_words(self, text):
        """Splits ``text`` into consecutive substrings, cutting where a piece starts with the word-start marker."""
        pieces = self._encode_as_pieces(text)
        word_start = b"\xe2\x96\x81".decode("utf-8")
        words = []
        offset = 0
        prev_end = 0
        for i, p in enumerate(pieces):
            if p.startswith(word_start):
                # A new word begins: flush the text consumed so far.
                if offset > prev_end:
                    words.append(text[prev_end:offset])
                prev_end = offset
                w = p.replace(word_start, "")
            else:
                w = p
            try:
                s = text.index(w, offset)
                # Find the next piece that still has content after removing the marker.
                pn = ""
                k = i + 1
                while k < len(pieces):
                    pn = pieces[k].replace(word_start, "")
                    if len(pn) > 0:
                        break
                    k += 1

                if len(pn) > 0 and pn in text[offset:s]:
                    # The next piece occurs before this match, so advance a single character instead.
                    offset = offset + 1
                else:
                    offset = s + len(w)
            except ValueError:
                # ``str.index`` found no occurrence of the piece: advance a single character.
                offset = offset + 1

        if prev_end < offset:
            words.append(text[prev_end:offset])

        return words

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        # After NFD decomposition, accents are separate characters of category "Mn".
        return "".join(char for char in text if unicodedata.category(char) != "Mn")

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        start_new_word = True
        output = []
        for char in text:
            if _is_punctuation(char):
                # Each punctuation character becomes its own chunk.
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)

        return ["".join(x) for x in output]

    def save_pretrained(self, path: str, filename_prefix: Optional[str] = None):
        """
        Writes the serialized SentencePiece model into directory ``path``.

        Args:
            path (:obj:`str`): Directory to write into.
            filename_prefix (:obj:`str`, `optional`): Prefix joined to the file name with ``-``.

        Returns:
            :obj:`Tuple[str]`: One-element tuple holding the path of the written file.
        """
        filename = next(iter(VOCAB_FILES_NAMES.values()))
        if filename_prefix is not None:
            filename = filename_prefix + "-" + filename
        full_path = os.path.join(path, filename)
        with open(full_path, "wb") as fs:
            fs.write(self.spm.serialized_model_proto())
        return (full_path,)
|
||||
|
||||
|
||||
def _is_whitespace(char):
|
||||
"""Checks whether `chars` is a whitespace character."""
|
||||
# \t, \n, and \r are technically contorl characters but we treat them
|
||||
# as whitespace since they are generally considered as such.
|
||||
if char == " " or char == "\t" or char == "\n" or char == "\r":
|
||||
return True
|
||||
cat = unicodedata.category(char)
|
||||
if cat == "Zs":
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _is_control(char):
|
||||
"""Checks whether `chars` is a control character."""
|
||||
# These are technically control characters but we count them as whitespace
|
||||
# characters.
|
||||
if char == "\t" or char == "\n" or char == "\r":
|
||||
return False
|
||||
cat = unicodedata.category(char)
|
||||
if cat.startswith("C"):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _is_punctuation(char):
|
||||
"""Checks whether `chars` is a punctuation character."""
|
||||
cp = ord(char)
|
||||
# We treat all non-letter/number ASCII as punctuation.
|
||||
# Characters such as "^", "$", and "`" are not in the Unicode
|
||||
# Punctuation class but we treat them as punctuation anyways, for
|
||||
# consistency.
|
||||
if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
|
||||
return True
|
||||
cat = unicodedata.category(char)
|
||||
if cat.startswith("P"):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def convert_to_unicode(text):
    """
    Converts `text` to Unicode (if it's not already), assuming utf-8 input.

    Args:
        text (:obj:`str` or :obj:`bytes`): The text to convert.

    Returns:
        :obj:`str`: ``text`` unchanged when it is already a string, otherwise the bytes decoded as UTF-8 with
        undecodable bytes dropped.

    Raises:
        :obj:`ValueError`: If ``text`` is neither :obj:`str` nor :obj:`bytes`.
    """
    # Only Python 3 string types are handled, so no ``six`` version switch is needed.
    if isinstance(text, str):
        return text
    if isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    raise ValueError("Unsupported string type: %s" % (type(text)))
|
||||
@@ -883,6 +883,63 @@ class DebertaPreTrainedModel:
|
||||
requires_pytorch(self)
|
||||
|
||||
|
||||
# Bound to None here so the name exists.
# NOTE(review): presumably this module is the stand-in used when PyTorch is not installed — confirm.
DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST = None
|
||||
|
||||
|
||||
class DebertaV2ForMaskedLM:
    """
    Stand-in for ``DebertaV2ForMaskedLM``: both the constructor and ``from_pretrained`` only call
    ``requires_pytorch``.

    NOTE(review): ``requires_pytorch`` is defined outside this chunk; presumably it raises when PyTorch is missing.
    """

    def __init__(self, *args, **kwargs):
        requires_pytorch(self)

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_pytorch(cls)
|
||||
|
||||
|
||||
class DebertaV2ForQuestionAnswering:
    """
    Stand-in for ``DebertaV2ForQuestionAnswering``: both the constructor and ``from_pretrained`` only call
    ``requires_pytorch``.

    NOTE(review): ``requires_pytorch`` is defined outside this chunk; presumably it raises when PyTorch is missing.
    """

    def __init__(self, *args, **kwargs):
        requires_pytorch(self)

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_pytorch(cls)
|
||||
|
||||
|
||||
class DebertaV2ForSequenceClassification:
    """
    Stand-in for ``DebertaV2ForSequenceClassification``: both the constructor and ``from_pretrained`` only call
    ``requires_pytorch``.

    NOTE(review): ``requires_pytorch`` is defined outside this chunk; presumably it raises when PyTorch is missing.
    """

    def __init__(self, *args, **kwargs):
        requires_pytorch(self)

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_pytorch(cls)
|
||||
|
||||
|
||||
class DebertaV2ForTokenClassification:
    """
    Stand-in for ``DebertaV2ForTokenClassification``: both the constructor and ``from_pretrained`` only call
    ``requires_pytorch``.

    NOTE(review): ``requires_pytorch`` is defined outside this chunk; presumably it raises when PyTorch is missing.
    """

    def __init__(self, *args, **kwargs):
        requires_pytorch(self)

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_pytorch(cls)
|
||||
|
||||
|
||||
class DebertaV2Model:
    """
    Stand-in for ``DebertaV2Model``: both the constructor and ``from_pretrained`` only call
    ``requires_pytorch``.

    NOTE(review): ``requires_pytorch`` is defined outside this chunk; presumably it raises when PyTorch is missing.
    """

    def __init__(self, *args, **kwargs):
        requires_pytorch(self)

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_pytorch(cls)
|
||||
|
||||
|
||||
class DebertaV2PreTrainedModel:
    """
    Stand-in for ``DebertaV2PreTrainedModel``: both the constructor and ``from_pretrained`` only call
    ``requires_pytorch``.

    NOTE(review): ``requires_pytorch`` is defined outside this chunk; presumably it raises when PyTorch is missing.
    """

    def __init__(self, *args, **kwargs):
        requires_pytorch(self)

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_pytorch(cls)
|
||||
|
||||
|
||||
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user