add DebertaV2 fast tokenizer (#15529)
Co-authored-by: alcinos <carion.nicolas@gmail.com> Co-authored-by: SaulLu <55560583+SaulLu@users.noreply.github.com> Co-authored-by: Nicolas Carion <carion.nicolas@gmail.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
@@ -192,7 +192,7 @@ Flax), PyTorch, and/or TensorFlow.
|
|||||||
| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ |
|
| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||||
| Data2VecVision | ❌ | ❌ | ✅ | ❌ | ❌ |
|
| Data2VecVision | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||||
| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ |
|
| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||||
| DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ |
|
| DeBERTa-v2 | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||||
| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ |
|
| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||||
| DeiT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
| DeiT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||||
| DETR | ❌ | ❌ | ✅ | ❌ | ❌ |
|
| DETR | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||||
|
|||||||
@@ -71,6 +71,12 @@ contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code
|
|||||||
- create_token_type_ids_from_sequences
|
- create_token_type_ids_from_sequences
|
||||||
- save_vocabulary
|
- save_vocabulary
|
||||||
|
|
||||||
|
## DebertaV2TokenizerFast
|
||||||
|
|
||||||
|
[[autodoc]] DebertaV2TokenizerFast
|
||||||
|
- build_inputs_with_special_tokens
|
||||||
|
- create_token_type_ids_from_sequences
|
||||||
|
|
||||||
## DebertaV2Model
|
## DebertaV2Model
|
||||||
|
|
||||||
[[autodoc]] DebertaV2Model
|
[[autodoc]] DebertaV2Model
|
||||||
|
|||||||
@@ -461,6 +461,7 @@ if is_tokenizers_available():
|
|||||||
_import_structure["models.blenderbot"].append("BlenderbotTokenizerFast")
|
_import_structure["models.blenderbot"].append("BlenderbotTokenizerFast")
|
||||||
_import_structure["models.camembert"].append("CamembertTokenizerFast")
|
_import_structure["models.camembert"].append("CamembertTokenizerFast")
|
||||||
_import_structure["models.deberta"].append("DebertaTokenizerFast")
|
_import_structure["models.deberta"].append("DebertaTokenizerFast")
|
||||||
|
_import_structure["models.deberta_v2"].append("DebertaV2TokenizerFast")
|
||||||
_import_structure["models.distilbert"].append("DistilBertTokenizerFast")
|
_import_structure["models.distilbert"].append("DistilBertTokenizerFast")
|
||||||
_import_structure["models.dpr"].extend(
|
_import_structure["models.dpr"].extend(
|
||||||
["DPRContextEncoderTokenizerFast", "DPRQuestionEncoderTokenizerFast", "DPRReaderTokenizerFast"]
|
["DPRContextEncoderTokenizerFast", "DPRQuestionEncoderTokenizerFast", "DPRReaderTokenizerFast"]
|
||||||
@@ -2829,6 +2830,7 @@ if TYPE_CHECKING:
|
|||||||
from .models.clip import CLIPTokenizerFast
|
from .models.clip import CLIPTokenizerFast
|
||||||
from .models.convbert import ConvBertTokenizerFast
|
from .models.convbert import ConvBertTokenizerFast
|
||||||
from .models.deberta import DebertaTokenizerFast
|
from .models.deberta import DebertaTokenizerFast
|
||||||
|
from .models.deberta_v2 import DebertaV2TokenizerFast
|
||||||
from .models.distilbert import DistilBertTokenizerFast
|
from .models.distilbert import DistilBertTokenizerFast
|
||||||
from .models.dpr import DPRContextEncoderTokenizerFast, DPRQuestionEncoderTokenizerFast, DPRReaderTokenizerFast
|
from .models.dpr import DPRContextEncoderTokenizerFast, DPRQuestionEncoderTokenizerFast, DPRReaderTokenizerFast
|
||||||
from .models.electra import ElectraTokenizerFast
|
from .models.electra import ElectraTokenizerFast
|
||||||
|
|||||||
@@ -580,6 +580,38 @@ class CamembertConverter(SpmConverter):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class DebertaV2Converter(SpmConverter):
    """
    Converter that assembles the fast-tokenizer pipeline pieces for a slow DeBERTa-v2 tokenizer.

    The options are read from `self.original_tokenizer` (`split_by_punct`, `do_lower_case`) and from the
    SentencePiece model proto handed to `normalizer`.
    """

    def pre_tokenizer(self, replacement, add_prefix_space):
        """
        Build the pre-tokenizer sequence.

        An isolating `Punctuation` step comes first when the slow tokenizer has `split_by_punct` enabled; a
        `Metaspace` step configured with `replacement` and `add_prefix_space` is always appended last.
        """
        steps = (
            [pre_tokenizers.Punctuation(behavior="isolated")] if self.original_tokenizer.split_by_punct else []
        )
        steps.append(pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space))
        return pre_tokenizers.Sequence(steps)

    def normalizer(self, proto):
        """
        Build the normalizer sequence.

        Order: optional `Lowercase` (when `do_lower_case` is set), `Strip`, optional `Precompiled` (only when
        the proto carries a non-empty precompiled charsmap), then a replacement of every run of two or more
        spaces by a single space.
        """
        steps = [normalizers.Lowercase()] if self.original_tokenizer.do_lower_case else []
        steps.append(normalizers.Strip())

        charsmap = proto.normalizer_spec.precompiled_charsmap
        if charsmap:
            steps.append(normalizers.Precompiled(charsmap))
        steps.append(normalizers.Replace(Regex(" {2,}"), " "))

        return normalizers.Sequence(steps)

    def post_processor(self):
        """
        Build the template post-processor: `[CLS] A [SEP]` for single sequences and `[CLS] A [SEP] B [SEP]`
        for pairs, with type id 0 for the first segment and 1 for the second.
        """
        # The special-token ids are looked up in the slow tokenizer's vocabulary.
        token_to_id = self.original_tokenizer.convert_tokens_to_ids
        cls_entry = ("[CLS]", token_to_id("[CLS]"))
        sep_entry = ("[SEP]", token_to_id("[SEP]"))
        return processors.TemplateProcessing(
            single="[CLS]:0 $A:0 [SEP]:0",
            pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
            special_tokens=[cls_entry, sep_entry],
        )
|
||||||
|
|
||||||
|
|
||||||
class MBartConverter(SpmConverter):
|
class MBartConverter(SpmConverter):
|
||||||
def vocab(self, proto):
|
def vocab(self, proto):
|
||||||
vocab = [
|
vocab = [
|
||||||
@@ -979,6 +1011,7 @@ SLOW_TO_FAST_CONVERTERS = {
|
|||||||
"CLIPTokenizer": CLIPConverter,
|
"CLIPTokenizer": CLIPConverter,
|
||||||
"ConvBertTokenizer": BertConverter,
|
"ConvBertTokenizer": BertConverter,
|
||||||
"DebertaTokenizer": DebertaConverter,
|
"DebertaTokenizer": DebertaConverter,
|
||||||
|
"DebertaV2Tokenizer": DebertaV2Converter,
|
||||||
"DistilBertTokenizer": BertConverter,
|
"DistilBertTokenizer": BertConverter,
|
||||||
"DPRReaderTokenizer": BertConverter,
|
"DPRReaderTokenizer": BertConverter,
|
||||||
"DPRQuestionEncoderTokenizer": BertConverter,
|
"DPRQuestionEncoderTokenizer": BertConverter,
|
||||||
|
|||||||
@@ -150,7 +150,13 @@ else:
|
|||||||
("fsmt", ("FSMTTokenizer", None)),
|
("fsmt", ("FSMTTokenizer", None)),
|
||||||
("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)),
|
("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)),
|
||||||
("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)),
|
("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)),
|
||||||
("deberta-v2", ("DebertaV2Tokenizer" if is_sentencepiece_available() else None, None)),
|
(
|
||||||
|
"deberta-v2",
|
||||||
|
(
|
||||||
|
"DebertaV2Tokenizer" if is_sentencepiece_available() else None,
|
||||||
|
"DebertaV2TokenizerFast" if is_tokenizers_available() else None,
|
||||||
|
),
|
||||||
|
),
|
||||||
("rag", ("RagTokenizer", None)),
|
("rag", ("RagTokenizer", None)),
|
||||||
("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)),
|
("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)),
|
||||||
("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
|
("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
|
||||||
|
|||||||
@@ -18,7 +18,7 @@
|
|||||||
|
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
from ...utils import _LazyModule, is_tf_available, is_torch_available
|
from ...utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
|
||||||
|
|
||||||
|
|
||||||
_import_structure = {
|
_import_structure = {
|
||||||
@@ -26,6 +26,9 @@ _import_structure = {
|
|||||||
"tokenization_deberta_v2": ["DebertaV2Tokenizer"],
|
"tokenization_deberta_v2": ["DebertaV2Tokenizer"],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if is_tokenizers_available():
|
||||||
|
_import_structure["tokenization_deberta_v2_fast"] = ["DebertaV2TokenizerFast"]
|
||||||
|
|
||||||
if is_tf_available():
|
if is_tf_available():
|
||||||
_import_structure["modeling_tf_deberta_v2"] = [
|
_import_structure["modeling_tf_deberta_v2"] = [
|
||||||
"TF_DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST",
|
"TF_DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST",
|
||||||
@@ -53,6 +56,9 @@ if TYPE_CHECKING:
|
|||||||
from .configuration_deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config
|
from .configuration_deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config
|
||||||
from .tokenization_deberta_v2 import DebertaV2Tokenizer
|
from .tokenization_deberta_v2 import DebertaV2Tokenizer
|
||||||
|
|
||||||
|
if is_tokenizers_available():
|
||||||
|
from .tokenization_deberta_v2_fast import DebertaV2TokenizerFast
|
||||||
|
|
||||||
if is_tf_available():
|
if is_tf_available():
|
||||||
from .modeling_tf_deberta_v2 import (
|
from .modeling_tf_deberta_v2 import (
|
||||||
TF_DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST,
|
TF_DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||||
|
|||||||
@@ -142,6 +142,7 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
|
|||||||
)
|
)
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.split_by_punct = split_by_punct
|
self.split_by_punct = split_by_punct
|
||||||
|
self.vocab_file = vocab_file
|
||||||
self._tokenizer = SPMTokenizer(vocab_file, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs)
|
self._tokenizer = SPMTokenizer(vocab_file, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@@ -325,16 +326,7 @@ class SPMTokenizer:
|
|||||||
self.spm.Load(self.vocab_file)
|
self.spm.Load(self.vocab_file)
|
||||||
|
|
||||||
def tokenize(self, text):
|
def tokenize(self, text):
|
||||||
pieces = self._encode_as_pieces(text)
|
return self._encode_as_pieces(text)
|
||||||
|
|
||||||
def _norm(x):
|
|
||||||
if x not in self.vocab or x == "<unk>":
|
|
||||||
return "[UNK]"
|
|
||||||
else:
|
|
||||||
return x
|
|
||||||
|
|
||||||
pieces = [_norm(p) for p in pieces]
|
|
||||||
return pieces
|
|
||||||
|
|
||||||
def convert_ids_to_tokens(self, ids):
|
def convert_ids_to_tokens(self, ids):
|
||||||
tokens = []
|
tokens = []
|
||||||
|
|||||||
@@ -0,0 +1,243 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2020 Microsoft and the HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Fast Tokenization class for model DeBERTa."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from shutil import copyfile
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
from ...file_utils import is_sentencepiece_available
|
||||||
|
from ...tokenization_utils_fast import PreTrainedTokenizerFast
|
||||||
|
from ...utils import logging
|
||||||
|
|
||||||
|
|
||||||
|
if is_sentencepiece_available():
|
||||||
|
from .tokenization_deberta_v2 import DebertaV2Tokenizer
|
||||||
|
else:
|
||||||
|
DebertaV2Tokenizer = None
|
||||||
|
|
||||||
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
VOCAB_FILES_NAMES = {"vocab_file": "spm.model", "tokenizer_file": "tokenizer.json"}
|
||||||
|
|
||||||
|
PRETRAINED_VOCAB_FILES_MAP = {
|
||||||
|
"vocab_file": {
|
||||||
|
"microsoft/deberta-v2-xlarge": "https://huggingface.co/microsoft/deberta-v2-xlarge/resolve/main/spm.model",
|
||||||
|
"microsoft/deberta-v2-xxlarge": "https://huggingface.co/microsoft/deberta-v2-xxlarge/resolve/main/spm.model",
|
||||||
|
"microsoft/deberta-v2-xlarge-mnli": "https://huggingface.co/microsoft/deberta-v2-xlarge-mnli/resolve/main/spm.model",
|
||||||
|
"microsoft/deberta-v2-xxlarge-mnli": "https://huggingface.co/microsoft/deberta-v2-xxlarge-mnli/resolve/main/spm.model",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||||
|
"microsoft/deberta-v2-xlarge": 512,
|
||||||
|
"microsoft/deberta-v2-xxlarge": 512,
|
||||||
|
"microsoft/deberta-v2-xlarge-mnli": 512,
|
||||||
|
"microsoft/deberta-v2-xxlarge-mnli": 512,
|
||||||
|
}
|
||||||
|
|
||||||
|
PRETRAINED_INIT_CONFIGURATION = {
|
||||||
|
"microsoft/deberta-v2-xlarge": {"do_lower_case": False},
|
||||||
|
"microsoft/deberta-v2-xxlarge": {"do_lower_case": False},
|
||||||
|
"microsoft/deberta-v2-xlarge-mnli": {"do_lower_case": False},
|
||||||
|
"microsoft/deberta-v2-xxlarge-mnli": {"do_lower_case": False},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class DebertaV2TokenizerFast(PreTrainedTokenizerFast):
    r"""
    Constructs a DeBERTa-v2 fast tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    Args:
        vocab_file (`str`, *optional*):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer. When it is not provided, the vocabulary
            of the slow tokenizer cannot be saved (see `save_vocabulary`).
        tokenizer_file (`str`, *optional*):
            Path to a serialized fast tokenizer file (stored as `tokenizer.json`). It is forwarded to the base
            class.
        do_lower_case (`bool`, *optional*, defaults to `False`):
            Whether or not to lowercase the input when tokenizing.
        split_by_punct (`bool`, *optional*, defaults to `False`):
            Whether or not to isolate punctuation marks from the surrounding text when tokenizing.
        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
            The beginning of sequence token that was used during pre-training. Can be used as a sequence
            classifier token. When building a sequence using special tokens, this is not the token that is used
            for the beginning of sequence. The token used is the `cls_token`.
        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
            The end of sequence token. When building a sequence using special tokens, this is not the token that is
            used for the end of sequence. The token used is the `sep_token`.
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all the hypotheses (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    slow_tokenizer_class = DebertaV2Tokenizer

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=False,
        split_by_punct=False,
        bos_token="[CLS]",
        eos_token="[SEP]",
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs
    ) -> None:
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            split_by_punct=split_by_punct,
            **kwargs,
        )

        self.do_lower_case = do_lower_case
        self.split_by_punct = split_by_punct
        self.vocab_file = vocab_file
        # Without the original SentencePiece file there is nothing to copy in `save_vocabulary`.
        self.can_save_slow_tokenizer = bool(self.vocab_file)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A DeBERTa sequence has the following format:

        - single sequence: [CLS] X [SEP]
        - pair of sequences: [CLS] A [SEP] B [SEP]

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            # Already-formatted inputs are delegated to the base class implementation.
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Copy the SentencePiece vocabulary file this tokenizer was created from into `save_directory`.

        Args:
            save_directory (`str`):
                Existing directory in which the vocabulary file is written.
            filename_prefix (`str`, *optional*):
                Prefix prepended, followed by `-`, to the name of the saved file.

        Returns:
            `Tuple[str]`: One-element tuple holding the path of the saved vocabulary file. `None` is returned,
            after an error is logged, when `save_directory` is not a directory.

        Raises:
            ValueError: If the tokenizer was created without a `vocab_file`.
        """
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # Nothing to copy when the source file already is the destination file.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
|
||||||
@@ -80,6 +80,13 @@ class DebertaTokenizerFast(metaclass=DummyObject):
|
|||||||
requires_backends(self, ["tokenizers"])
|
requires_backends(self, ["tokenizers"])
|
||||||
|
|
||||||
|
|
||||||
|
# Placeholder for `DebertaV2TokenizerFast`: it declares the "tokenizers" backend and its constructor only
# calls `requires_backends` for that backend.
class DebertaV2TokenizerFast(metaclass=DummyObject):
|
||||||
|
_backends = ["tokenizers"]
|
||||||
|

||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
requires_backends(self, ["tokenizers"])
|
||||||
|
|
||||||
|
|
||||||
class DistilBertTokenizerFast(metaclass=DummyObject):
|
class DistilBertTokenizerFast(metaclass=DummyObject):
|
||||||
_backends = ["tokenizers"]
|
_backends = ["tokenizers"]
|
||||||
|
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ import os
|
|||||||
import unittest
|
import unittest
|
||||||
from os.path import dirname
|
from os.path import dirname
|
||||||
|
|
||||||
from transformers import DebertaV2Tokenizer
|
from transformers import DebertaV2Tokenizer, DebertaV2TokenizerFast
|
||||||
from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow
|
from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow
|
||||||
|
|
||||||
from ..test_tokenization_common import TokenizerTesterMixin
|
from ..test_tokenization_common import TokenizerTesterMixin
|
||||||
@@ -31,8 +31,7 @@ SAMPLE_VOCAB = os.path.join(dirname(dirname(os.path.abspath(__file__))), "fixtur
|
|||||||
class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||||
|
|
||||||
tokenizer_class = DebertaV2Tokenizer
|
tokenizer_class = DebertaV2Tokenizer
|
||||||
rust_tokenizer_class = None
|
rust_tokenizer_class = DebertaV2TokenizerFast
|
||||||
test_rust_tokenizer = False
|
|
||||||
test_sentencepiece = True
|
test_sentencepiece = True
|
||||||
test_sentencepiece_ignore_case = True
|
test_sentencepiece_ignore_case = True
|
||||||
|
|
||||||
@@ -67,17 +66,109 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
def test_vocab_size(self):
|
def test_vocab_size(self):
|
||||||
self.assertEqual(self.get_tokenizer().vocab_size, 30_000)
|
self.assertEqual(self.get_tokenizer().vocab_size, 30_000)
|
||||||
|
|
||||||
|
def test_do_lower_case(self):
    """With `do_lower_case=True`, the slow and fast tokenizers must both yield the expected lowercased tokens."""
    # fmt: off
    text = " \tHeLLo!how \n Are yoU? "
    expected_tokens = ["▁hello", "!", "how", "▁are", "▁you", "?"]
    # fmt: on

    slow = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=True)
    slow_ids = slow.encode(text, add_special_tokens=False)
    self.assertListEqual(slow.convert_ids_to_tokens(slow_ids), expected_tokens)

    fast = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=True)
    fast_ids = fast.encode(text, add_special_tokens=False)
    self.assertListEqual(fast.convert_ids_to_tokens(fast_ids), expected_tokens)
|
||||||
|
|
||||||
|
def test_split_by_punct(self):
    """With `split_by_punct=True`, the slow and fast tokenizers must both isolate punctuation as expected."""
    # fmt: off
    text = "I was born in 92000, and this is falsé."
    expected_tokens = ["▁", "<unk>", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "▁", "."]
    # fmt: on

    slow = DebertaV2Tokenizer(SAMPLE_VOCAB, split_by_punct=True)
    slow_ids = slow.encode(text, add_special_tokens=False)
    self.assertListEqual(slow.convert_ids_to_tokens(slow_ids), expected_tokens)

    fast = DebertaV2TokenizerFast(SAMPLE_VOCAB, split_by_punct=True)
    fast_ids = fast.encode(text, add_special_tokens=False)
    self.assertListEqual(fast.convert_ids_to_tokens(fast_ids), expected_tokens)
|
||||||
|
|
||||||
|
def test_do_lower_case_split_by_punct(self):
    """With lowercasing and punctuation splitting both enabled, slow and fast tokenizers must agree on the target."""
    # fmt: off
    text = "I was born in 92000, and this is falsé."
    expected_tokens = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "▁", "."]
    # fmt: on

    slow = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=True, split_by_punct=True)
    slow_ids = slow.encode(text, add_special_tokens=False)
    self.assertListEqual(slow.convert_ids_to_tokens(slow_ids), expected_tokens)

    fast = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=True, split_by_punct=True)
    fast_ids = fast.encode(text, add_special_tokens=False)
    self.assertListEqual(fast.convert_ids_to_tokens(fast_ids), expected_tokens)
|
||||||
|
|
||||||
|
def test_do_lower_case_split_by_punct_false(self):
    """With lowercasing on and punctuation splitting off, slow and fast tokenizers must agree on the target."""
    # fmt: off
    text = "I was born in 92000, and this is falsé."
    expected_tokens = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "."]
    # fmt: on

    slow = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=True, split_by_punct=False)
    slow_ids = slow.encode(text, add_special_tokens=False)
    self.assertListEqual(slow.convert_ids_to_tokens(slow_ids), expected_tokens)

    fast = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=True, split_by_punct=False)
    fast_ids = fast.encode(text, add_special_tokens=False)
    self.assertListEqual(fast.convert_ids_to_tokens(fast_ids), expected_tokens)
|
||||||
|
|
||||||
|
def test_do_lower_case_false_split_by_punct(self):
    """With lowercasing off and punctuation splitting on, slow and fast tokenizers must agree on the target."""
    # fmt: off
    text = "I was born in 92000, and this is falsé."
    expected_tokens = ["▁", "<unk>", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "▁", "."]
    # fmt: on

    slow = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=False, split_by_punct=True)
    slow_ids = slow.encode(text, add_special_tokens=False)
    self.assertListEqual(slow.convert_ids_to_tokens(slow_ids), expected_tokens)

    fast = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=False, split_by_punct=True)
    fast_ids = fast.encode(text, add_special_tokens=False)
    self.assertListEqual(fast.convert_ids_to_tokens(fast_ids), expected_tokens)
|
||||||
|
|
||||||
|
def test_do_lower_case_false_split_by_punct_false(self):
    """With lowercasing and punctuation splitting both off, slow and fast tokenizers must agree on the target."""
    # fmt: off
    text = " \tHeLLo!how \n Are yoU? "
    expected_tokens = ["▁", "<unk>", "e", "<unk>", "o", "!", "how", "▁", "<unk>", "re", "▁yo", "<unk>", "?"]
    # fmt: on

    slow = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=False, split_by_punct=False)
    slow_ids = slow.encode(text, add_special_tokens=False)
    self.assertListEqual(slow.convert_ids_to_tokens(slow_ids), expected_tokens)

    fast = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=False, split_by_punct=False)
    fast_ids = fast.encode(text, add_special_tokens=False)
    self.assertListEqual(fast.convert_ids_to_tokens(fast_ids), expected_tokens)
|
||||||
|
|
||||||
def test_rust_and_python_full_tokenizers(self):
|
def test_rust_and_python_full_tokenizers(self):
|
||||||
if not self.test_rust_tokenizer:
|
|
||||||
return
|
|
||||||
|
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
rust_tokenizer = self.get_rust_tokenizer()
|
rust_tokenizer = self.get_rust_tokenizer()
|
||||||
|
|
||||||
sequence = "I was born in 92000, and this is falsé."
|
sequence = "I was born in 92000, and this is falsé."
|
||||||
|
|
||||||
tokens = tokenizer.tokenize(sequence)
|
tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
|
||||||
rust_tokens = rust_tokenizer.tokenize(sequence)
|
rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False))
|
||||||
self.assertListEqual(tokens, rust_tokens)
|
self.assertListEqual(tokens, rust_tokens)
|
||||||
|
|
||||||
ids = tokenizer.encode(sequence, add_special_tokens=False)
|
ids = tokenizer.encode(sequence, add_special_tokens=False)
|
||||||
@@ -90,29 +181,49 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
self.assertListEqual(ids, rust_ids)
|
self.assertListEqual(ids, rust_ids)
|
||||||
|
|
||||||
def test_full_tokenizer(self):
|
def test_full_tokenizer(self):
|
||||||
|
sequence = "This is a test"
|
||||||
|
ids_target = [13, 1, 4398, 25, 21, 1289]
|
||||||
|
tokens_target = ["▁", "T", "his", "▁is", "▁a", "▁test"]
|
||||||
|
back_tokens_target = ["▁", "<unk>", "his", "▁is", "▁a", "▁test"]
|
||||||
|
|
||||||
tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, keep_accents=True)
|
tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, keep_accents=True)
|
||||||
|
rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, keep_accents=True)
|
||||||
|
|
||||||
tokens = tokenizer.tokenize("This is a test")
|
ids = tokenizer.encode(sequence, add_special_tokens=False)
|
||||||
self.assertListEqual(tokens, ["▁", "[UNK]", "his", "▁is", "▁a", "▁test"])
|
self.assertListEqual(ids, ids_target)
|
||||||
|
tokens = tokenizer.tokenize(sequence)
|
||||||
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [13, 1, 4398, 25, 21, 1289])
|
self.assertListEqual(tokens, tokens_target)
|
||||||
|
|
||||||
tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
|
|
||||||
# fmt: off
|
|
||||||
self.assertListEqual(
|
|
||||||
tokens,
|
|
||||||
["▁", "[UNK]", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "[UNK]", "."],
|
|
||||||
)
|
|
||||||
ids = tokenizer.convert_tokens_to_ids(tokens)
|
|
||||||
self.assertListEqual(ids, [13, 1, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])
|
|
||||||
|
|
||||||
back_tokens = tokenizer.convert_ids_to_tokens(ids)
|
back_tokens = tokenizer.convert_ids_to_tokens(ids)
|
||||||
self.assertListEqual(
|
self.assertListEqual(back_tokens, back_tokens_target)
|
||||||
back_tokens,
|
|
||||||
["▁", "<unk>", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "."],
|
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
|
||||||
)
|
self.assertListEqual(rust_ids, ids_target)
|
||||||
|
rust_tokens = rust_tokenizer.tokenize(sequence)
|
||||||
|
self.assertListEqual(rust_tokens, tokens_target)
|
||||||
|
rust_back_tokens = rust_tokenizer.convert_ids_to_tokens(rust_ids)
|
||||||
|
self.assertListEqual(rust_back_tokens, back_tokens_target)
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
|
sequence = "I was born in 92000, and this is falsé."
|
||||||
|
ids_target = [13, 1, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9]
|
||||||
|
tokens_target = ["▁", "I", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", ".", ]
|
||||||
|
back_tokens_target = ["▁", "<unk>", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", ".", ]
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
ids = tokenizer.encode(sequence, add_special_tokens=False)
|
||||||
|
self.assertListEqual(ids, ids_target)
|
||||||
|
tokens = tokenizer.tokenize(sequence)
|
||||||
|
self.assertListEqual(tokens, tokens_target)
|
||||||
|
back_tokens = tokenizer.convert_ids_to_tokens(ids)
|
||||||
|
self.assertListEqual(back_tokens, back_tokens_target)
|
||||||
|
|
||||||
|
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
|
||||||
|
self.assertListEqual(rust_ids, ids_target)
|
||||||
|
rust_tokens = rust_tokenizer.tokenize(sequence)
|
||||||
|
self.assertListEqual(rust_tokens, tokens_target)
|
||||||
|
rust_back_tokens = rust_tokenizer.convert_ids_to_tokens(rust_ids)
|
||||||
|
self.assertListEqual(rust_back_tokens, back_tokens_target)
|
||||||
|
|
||||||
def test_sequence_builders(self):
|
def test_sequence_builders(self):
|
||||||
tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB)
|
tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB)
|
||||||
|
|
||||||
|
|||||||
@@ -55,4 +55,3 @@ src/transformers/models/wav2vec2/modeling_wav2vec2.py
|
|||||||
src/transformers/models/wav2vec2/tokenization_wav2vec2.py
|
src/transformers/models/wav2vec2/tokenization_wav2vec2.py
|
||||||
src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
|
src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
|
||||||
src/transformers/models/wavlm/modeling_wavlm.py
|
src/transformers/models/wavlm/modeling_wavlm.py
|
||||||
src/transformers/models/ctrl/modeling_ctrl.py
|
|
||||||
|
|||||||
Reference in New Issue
Block a user