From ff06b177917384137af2d9585697d2d76c40cdfc Mon Sep 17 00:00:00 2001 From: Yang Ming <54308224+mingboiz@users.noreply.github.com> Date: Wed, 20 Apr 2022 16:26:51 +0800 Subject: [PATCH] add DebertaV2 fast tokenizer (#15529) Co-authored-by: alcinos Co-authored-by: SaulLu <55560583+SaulLu@users.noreply.github.com> Co-authored-by: Nicolas Carion Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- docs/source/en/index.mdx | 2 +- docs/source/en/model_doc/deberta-v2.mdx | 6 + src/transformers/__init__.py | 2 + src/transformers/convert_slow_tokenizer.py | 33 +++ .../models/auto/tokenization_auto.py | 8 +- .../models/deberta_v2/__init__.py | 8 +- .../deberta_v2/tokenization_deberta_v2.py | 12 +- .../tokenization_deberta_v2_fast.py | 243 ++++++++++++++++++ .../utils/dummy_tokenizers_objects.py | 7 + .../test_tokenization_deberta_v2.py | 161 ++++++++++-- utils/documentation_tests.txt | 1 - 11 files changed, 444 insertions(+), 39 deletions(-) create mode 100644 src/transformers/models/deberta_v2/tokenization_deberta_v2_fast.py diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index e1acfcefa0..08370fae6e 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -192,7 +192,7 @@ Flax), PyTorch, and/or TensorFlow. | Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ | | Data2VecVision | ❌ | ❌ | ✅ | ❌ | ❌ | | DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | -| DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ | +| DeBERTa-v2 | ✅ | ✅ | ✅ | ✅ | ❌ | | Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | | DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | | DETR | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/deberta-v2.mdx b/docs/source/en/model_doc/deberta-v2.mdx index 1243139ff7..7dd34790de 100644 --- a/docs/source/en/model_doc/deberta-v2.mdx +++ b/docs/source/en/model_doc/deberta-v2.mdx @@ -71,6 +71,12 @@ contributed by [kamalkraj](https://huggingface.co/kamalkraj). 
The original code - create_token_type_ids_from_sequences - save_vocabulary +## DebertaV2TokenizerFast + +[[autodoc]] DebertaV2TokenizerFast + - build_inputs_with_special_tokens + - create_token_type_ids_from_sequences + ## DebertaV2Model [[autodoc]] DebertaV2Model diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 779025cd56..88d6747787 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -461,6 +461,7 @@ if is_tokenizers_available(): _import_structure["models.blenderbot"].append("BlenderbotTokenizerFast") _import_structure["models.camembert"].append("CamembertTokenizerFast") _import_structure["models.deberta"].append("DebertaTokenizerFast") + _import_structure["models.deberta_v2"].append("DebertaV2TokenizerFast") _import_structure["models.distilbert"].append("DistilBertTokenizerFast") _import_structure["models.dpr"].extend( ["DPRContextEncoderTokenizerFast", "DPRQuestionEncoderTokenizerFast", "DPRReaderTokenizerFast"] @@ -2829,6 +2830,7 @@ if TYPE_CHECKING: from .models.clip import CLIPTokenizerFast from .models.convbert import ConvBertTokenizerFast from .models.deberta import DebertaTokenizerFast + from .models.deberta_v2 import DebertaV2TokenizerFast from .models.distilbert import DistilBertTokenizerFast from .models.dpr import DPRContextEncoderTokenizerFast, DPRQuestionEncoderTokenizerFast, DPRReaderTokenizerFast from .models.electra import ElectraTokenizerFast diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 188c1705c9..4f41f6b61f 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -580,6 +580,38 @@ class CamembertConverter(SpmConverter): ) +class DebertaV2Converter(SpmConverter): + def pre_tokenizer(self, replacement, add_prefix_space): + list_pretokenizers = [] + if self.original_tokenizer.split_by_punct: + list_pretokenizers.append(pre_tokenizers.Punctuation(behavior="isolated")) + 
list_pretokenizers.append(pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)) + return pre_tokenizers.Sequence(list_pretokenizers) + + def normalizer(self, proto): + list_normalizers = [] + if self.original_tokenizer.do_lower_case: + list_normalizers.append(normalizers.Lowercase()) + list_normalizers.append(normalizers.Strip()) + + precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap + if precompiled_charsmap: + list_normalizers.append(normalizers.Precompiled(precompiled_charsmap)) + list_normalizers.append(normalizers.Replace(Regex(" {2,}"), " ")) + + return normalizers.Sequence(list_normalizers) + + def post_processor(self): + return processors.TemplateProcessing( + single="[CLS]:0 $A:0 [SEP]:0", + pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1", + special_tokens=[ + ("[CLS]", self.original_tokenizer.convert_tokens_to_ids("[CLS]")), + ("[SEP]", self.original_tokenizer.convert_tokens_to_ids("[SEP]")), + ], + ) + + class MBartConverter(SpmConverter): def vocab(self, proto): vocab = [ @@ -979,6 +1011,7 @@ SLOW_TO_FAST_CONVERTERS = { "CLIPTokenizer": CLIPConverter, "ConvBertTokenizer": BertConverter, "DebertaTokenizer": DebertaConverter, + "DebertaV2Tokenizer": DebertaV2Converter, "DistilBertTokenizer": BertConverter, "DPRReaderTokenizer": BertConverter, "DPRQuestionEncoderTokenizer": BertConverter, diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index b3b0960b23..4a3ab5523b 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -150,7 +150,13 @@ else: ("fsmt", ("FSMTTokenizer", None)), ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)), ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)), - ("deberta-v2", ("DebertaV2Tokenizer" if is_sentencepiece_available() else None, None)), + ( + "deberta-v2", + ( + 
"DebertaV2Tokenizer" if is_sentencepiece_available() else None, + "DebertaV2TokenizerFast" if is_tokenizers_available() else None, + ), + ), ("rag", ("RagTokenizer", None)), ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)), ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)), diff --git a/src/transformers/models/deberta_v2/__init__.py b/src/transformers/models/deberta_v2/__init__.py index 1178e41301..a7f3cada93 100644 --- a/src/transformers/models/deberta_v2/__init__.py +++ b/src/transformers/models/deberta_v2/__init__.py @@ -18,7 +18,7 @@ from typing import TYPE_CHECKING -from ...utils import _LazyModule, is_tf_available, is_torch_available +from ...utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available _import_structure = { @@ -26,6 +26,9 @@ _import_structure = { "tokenization_deberta_v2": ["DebertaV2Tokenizer"], } +if is_tokenizers_available(): + _import_structure["tokenization_deberta_v2_fast"] = ["DebertaV2TokenizerFast"] + if is_tf_available(): _import_structure["modeling_tf_deberta_v2"] = [ "TF_DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -53,6 +56,9 @@ if TYPE_CHECKING: from .configuration_deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config from .tokenization_deberta_v2 import DebertaV2Tokenizer + if is_tokenizers_available(): + from .tokenization_deberta_v2_fast import DebertaV2TokenizerFast + if is_tf_available(): from .modeling_tf_deberta_v2 import ( TF_DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py index 5404b70135..0e67f85896 100644 --- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py +++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py @@ -142,6 +142,7 @@ class DebertaV2Tokenizer(PreTrainedTokenizer): ) self.do_lower_case = do_lower_case 
self.split_by_punct = split_by_punct + self.vocab_file = vocab_file self._tokenizer = SPMTokenizer(vocab_file, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs) @property @@ -325,16 +326,7 @@ class SPMTokenizer: self.spm.Load(self.vocab_file) def tokenize(self, text): - pieces = self._encode_as_pieces(text) - - def _norm(x): - if x not in self.vocab or x == "": - return "[UNK]" - else: - return x - - pieces = [_norm(p) for p in pieces] - return pieces + return self._encode_as_pieces(text) def convert_ids_to_tokens(self, ids): tokens = [] diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2_fast.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2_fast.py new file mode 100644 index 0000000000..8aa92180d6 --- /dev/null +++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2_fast.py @@ -0,0 +1,243 @@ +# coding=utf-8 +# Copyright 2020 Microsoft and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Fast Tokenization class for model DeBERTa.""" + +import os +from shutil import copyfile +from typing import Optional, Tuple + +from ...file_utils import is_sentencepiece_available +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging + + +if is_sentencepiece_available(): + from .tokenization_deberta_v2 import DebertaV2Tokenizer +else: + DebertaV2Tokenizer = None + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "spm.model", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "microsoft/deberta-v2-xlarge": "https://huggingface.co/microsoft/deberta-v2-xlarge/resolve/main/spm.model", + "microsoft/deberta-v2-xxlarge": "https://huggingface.co/microsoft/deberta-v2-xxlarge/resolve/main/spm.model", + "microsoft/deberta-v2-xlarge-mnli": "https://huggingface.co/microsoft/deberta-v2-xlarge-mnli/resolve/main/spm.model", + "microsoft/deberta-v2-xxlarge-mnli": "https://huggingface.co/microsoft/deberta-v2-xxlarge-mnli/resolve/main/spm.model", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "microsoft/deberta-v2-xlarge": 512, + "microsoft/deberta-v2-xxlarge": 512, + "microsoft/deberta-v2-xlarge-mnli": 512, + "microsoft/deberta-v2-xxlarge-mnli": 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + "microsoft/deberta-v2-xlarge": {"do_lower_case": False}, + "microsoft/deberta-v2-xxlarge": {"do_lower_case": False}, + "microsoft/deberta-v2-xlarge-mnli": {"do_lower_case": False}, + "microsoft/deberta-v2-xxlarge-mnli": {"do_lower_case": False}, +} + + +class DebertaV2TokenizerFast(PreTrainedTokenizerFast): + r""" + Constructs a DeBERTa-v2 fast tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece). + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. 
+ do_lower_case (`bool`, *optional*, defaults to `False`): + Whether or not to lowercase the input when tokenizing. + bos_token (`str`, *optional*, defaults to `"[CLS]"`): + The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token. + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + eos_token (`str`, *optional*, defaults to `"[SEP]"`): + The end of sequence token. When building a sequence using special tokens, this is not the token that is + used for the end of sequence. The token used is the `sep_token`. + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method.
The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + slow_tokenizer_class = DebertaV2Tokenizer + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + do_lower_case=False, + split_by_punct=False, + bos_token="[CLS]", + eos_token="[SEP]", + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ) -> None: + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + split_by_punct=split_by_punct, + **kwargs, + ) + + self.do_lower_case = do_lower_case + self.split_by_punct = split_by_punct + self.vocab_file = vocab_file + self.can_save_slow_tokenizer = False if not self.vocab_file else True + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and + adding special tokens.
A DeBERTa sequence has the following format: + + - single sequence: [CLS] X [SEP] + - pair of sequences: [CLS] A [SEP] B [SEP] + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. 
A DeBERTa + sequence pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " + "tokenizer." + ) + + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) diff --git a/src/transformers/utils/dummy_tokenizers_objects.py b/src/transformers/utils/dummy_tokenizers_objects.py index 5ff4d3a4a4..64c7541649 100644 --- a/src/transformers/utils/dummy_tokenizers_objects.py +++ b/src/transformers/utils/dummy_tokenizers_objects.py @@ -80,6 +80,13 @@ class DebertaTokenizerFast(metaclass=DummyObject): requires_backends(self, ["tokenizers"]) +class DebertaV2TokenizerFast(metaclass=DummyObject): + _backends = ["tokenizers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tokenizers"]) + + class 
DistilBertTokenizerFast(metaclass=DummyObject): _backends = ["tokenizers"] diff --git a/tests/deberta_v2/test_tokenization_deberta_v2.py b/tests/deberta_v2/test_tokenization_deberta_v2.py index be414551f6..ee52c8706a 100644 --- a/tests/deberta_v2/test_tokenization_deberta_v2.py +++ b/tests/deberta_v2/test_tokenization_deberta_v2.py @@ -17,7 +17,7 @@ import os import unittest from os.path import dirname -from transformers import DebertaV2Tokenizer +from transformers import DebertaV2Tokenizer, DebertaV2TokenizerFast from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow from ..test_tokenization_common import TokenizerTesterMixin @@ -31,8 +31,7 @@ SAMPLE_VOCAB = os.path.join(dirname(dirname(os.path.abspath(__file__))), "fixtur class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = DebertaV2Tokenizer - rust_tokenizer_class = None - test_rust_tokenizer = False + rust_tokenizer_class = DebertaV2TokenizerFast test_sentencepiece = True test_sentencepiece_ignore_case = True @@ -67,17 +66,109 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_vocab_size(self): self.assertEqual(self.get_tokenizer().vocab_size, 30_000) + def test_do_lower_case(self): + # fmt: off + sequence = " \tHeLLo!how \n Are yoU? " + tokens_target = ["▁hello", "!", "how", "▁are", "▁you", "?"] + # fmt: on + + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=True) + tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) + + self.assertListEqual(tokens, tokens_target) + + rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=True) + rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) + + self.assertListEqual(rust_tokens, tokens_target) + + def test_split_by_punct(self): + # fmt: off + sequence = "I was born in 92000, and this is falsé." 
+ tokens_target = ["▁", "", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", ".", ] + # fmt: on + + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, split_by_punct=True) + tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) + + self.assertListEqual(tokens, tokens_target) + + rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, split_by_punct=True) + rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) + + self.assertListEqual(rust_tokens, tokens_target) + + def test_do_lower_case_split_by_punct(self): + # fmt: off + sequence = "I was born in 92000, and this is falsé." + tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", ".", ] + # fmt: on + + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=True, split_by_punct=True) + tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) + self.assertListEqual(tokens, tokens_target) + + rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=True, split_by_punct=True) + rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) + self.assertListEqual(rust_tokens, tokens_target) + + def test_do_lower_case_split_by_punct_false(self): + # fmt: off + sequence = "I was born in 92000, and this is falsé." 
+ tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", ".", ] + # fmt: on + + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=True, split_by_punct=False) + tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) + + self.assertListEqual(tokens, tokens_target) + + rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=True, split_by_punct=False) + rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) + + self.assertListEqual(rust_tokens, tokens_target) + + def test_do_lower_case_false_split_by_punct(self): + # fmt: off + sequence = "I was born in 92000, and this is falsé." + tokens_target = ["▁", "", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", ".", ] + # fmt: on + + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=False, split_by_punct=True) + tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) + + self.assertListEqual(tokens, tokens_target) + + rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=False, split_by_punct=True) + rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) + + self.assertListEqual(rust_tokens, tokens_target) + + def test_do_lower_case_false_split_by_punct_false(self): + # fmt: off + sequence = " \tHeLLo!how \n Are yoU? 
" + tokens_target = ["▁", "", "e", "", "o", "!", "how", "▁", "", "re", "▁yo", "", "?"] + # fmt: on + + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=False, split_by_punct=False) + tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) + + self.assertListEqual(tokens, tokens_target) + + rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=False, split_by_punct=False) + rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) + + self.assertListEqual(rust_tokens, tokens_target) + def test_rust_and_python_full_tokenizers(self): - if not self.test_rust_tokenizer: - return tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() sequence = "I was born in 92000, and this is falsé." - tokens = tokenizer.tokenize(sequence) - rust_tokens = rust_tokenizer.tokenize(sequence) + tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) + rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) self.assertListEqual(tokens, rust_tokens) ids = tokenizer.encode(sequence, add_special_tokens=False) @@ -90,29 +181,49 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual(ids, rust_ids) def test_full_tokenizer(self): + sequence = "This is a test" + ids_target = [13, 1, 4398, 25, 21, 1289] + tokens_target = ["▁", "T", "his", "▁is", "▁a", "▁test"] + back_tokens_target = ["▁", "", "his", "▁is", "▁a", "▁test"] + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, keep_accents=True) + rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, keep_accents=True) - tokens = tokenizer.tokenize("This is a test") - self.assertListEqual(tokens, ["▁", "[UNK]", "his", "▁is", "▁a", "▁test"]) - - self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [13, 1, 4398, 25, 21, 1289]) - - tokens = tokenizer.tokenize("I was born in 92000, and 
this is falsé.") - # fmt: off - self.assertListEqual( - tokens, - ["▁", "[UNK]", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "[UNK]", "."], - ) - ids = tokenizer.convert_tokens_to_ids(tokens) - self.assertListEqual(ids, [13, 1, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9]) - + ids = tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, ids_target) + tokens = tokenizer.tokenize(sequence) + self.assertListEqual(tokens, tokens_target) back_tokens = tokenizer.convert_ids_to_tokens(ids) - self.assertListEqual( - back_tokens, - ["▁", "", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "."], - ) + self.assertListEqual(back_tokens, back_tokens_target) + + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(rust_ids, ids_target) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(rust_tokens, tokens_target) + rust_back_tokens = rust_tokenizer.convert_ids_to_tokens(rust_ids) + self.assertListEqual(rust_back_tokens, back_tokens_target) + + # fmt: off + sequence = "I was born in 92000, and this is falsé." 
+ ids_target = [13, 1, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9] + tokens_target = ["▁", "I", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", ".", ] + back_tokens_target = ["▁", "", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", ".", ] # fmt: on + ids = tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, ids_target) + tokens = tokenizer.tokenize(sequence) + self.assertListEqual(tokens, tokens_target) + back_tokens = tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual(back_tokens, back_tokens_target) + + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(rust_ids, ids_target) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(rust_tokens, tokens_target) + rust_back_tokens = rust_tokenizer.convert_ids_to_tokens(rust_ids) + self.assertListEqual(rust_back_tokens, back_tokens_target) + def test_sequence_builders(self): tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB) diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index 077dd5f13a..5b46c02cf5 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -55,4 +55,3 @@ src/transformers/models/wav2vec2/modeling_wav2vec2.py src/transformers/models/wav2vec2/tokenization_wav2vec2.py src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py src/transformers/models/wavlm/modeling_wavlm.py -src/transformers/models/ctrl/modeling_ctrl.py