From 45fc8c7951f978c0f8f13c8bab52c744cd5c4784 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Fri, 9 Apr 2021 11:57:44 -0400 Subject: [PATCH] Make `get_special_tokens_mask` consider all tokens (#11163) --- docs/source/model_doc/convbert.rst | 3 +- docs/source/model_doc/led.rst | 3 +- .../models/albert/tokenization_albert.py | 9 ++---- .../models/albert/tokenization_albert_fast.py | 31 ------------------ .../models/barthez/tokenization_barthez.py | 9 ++---- .../barthez/tokenization_barthez_fast.py | 30 ----------------- .../models/bert/tokenization_bert.py | 9 ++---- .../models/bertweet/tokenization_bertweet.py | 9 ++---- .../models/big_bird/tokenization_big_bird.py | 9 ++---- .../camembert/tokenization_camembert.py | 9 ++---- .../camembert/tokenization_camembert_fast.py | 30 ----------------- .../models/deberta/tokenization_deberta.py | 9 ++---- .../deberta_v2/tokenization_deberta_v2.py | 12 ++----- .../models/fsmt/tokenization_fsmt.py | 12 ++----- .../herbert/tokenization_herbert_fast.py | 9 ++---- .../models/m2m_100/tokenization_m2m_100.py | 10 +++--- .../models/mbart/tokenization_mbart.py | 10 +++--- .../models/mbart/tokenization_mbart50.py | 10 +++--- .../models/mbart/tokenization_mbart50_fast.py | 32 ------------------- .../models/mbart/tokenization_mbart_fast.py | 32 ------------------- .../models/mpnet/tokenization_mpnet.py | 9 ++---- .../models/phobert/tokenization_phobert.py | 9 ++---- .../prophetnet/tokenization_prophetnet.py | 9 ++---- .../models/roberta/tokenization_roberta.py | 9 ++---- .../tokenization_speech_to_text.py | 10 +++--- src/transformers/models/t5/tokenization_t5.py | 10 +++--- .../models/tapas/tokenization_tapas.py | 9 ++---- .../models/xlm/tokenization_xlm.py | 12 ++----- .../tokenization_xlm_prophetnet.py | 9 ++---- .../xlm_roberta/tokenization_xlm_roberta.py | 9 ++---- .../tokenization_xlm_roberta_fast.py | 31 ------------------ .../models/xlnet/tokenization_xlnet.py | 9 ++---- .../models/xlnet/tokenization_xlnet_fast.py | 31 ------------------ src/transformers/tokenization_utils.py | 10 ++++++ ...on_{{cookiecutter.lowercase_modelname}}.py | 9 ++---- .../{{cookiecutter.lowercase_modelname}}.rst | 3 +- 36 files changed, 90 insertions(+), 385 deletions(-) diff --git a/docs/source/model_doc/convbert.rst b/docs/source/model_doc/convbert.rst index 80ed9ebc37..69f7473354 100644 --- a/docs/source/model_doc/convbert.rst +++ b/docs/source/model_doc/convbert.rst @@ -56,8 +56,7 @@ ConvBertTokenizerFast ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.ConvBertTokenizerFast - :members: build_inputs_with_special_tokens, get_special_tokens_mask, - create_token_type_ids_from_sequences, save_vocabulary + :members: ConvBertModel diff --git a/docs/source/model_doc/led.rst b/docs/source/model_doc/led.rst index 4dbdbbaeb3..83a9386165 100644 --- a/docs/source/model_doc/led.rst +++ b/docs/source/model_doc/led.rst @@ -73,8 +73,7 @@ LEDTokenizerFast ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.LEDTokenizerFast - :members: build_inputs_with_special_tokens, get_special_tokens_mask, - create_token_type_ids_from_sequences, save_vocabulary + :members: LED specific outputs diff --git a/src/transformers/models/albert/tokenization_albert.py b/src/transformers/models/albert/tokenization_albert.py index a271f86064..92c06bbcde 100644 --- a/src/transformers/models/albert/tokenization_albert.py +++ b/src/transformers/models/albert/tokenization_albert.py @@ -267,12 +267,9 @@ class AlbertTokenizer(PreTrainedTokenizer): """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is not None: return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] diff --git a/src/transformers/models/albert/tokenization_albert_fast.py b/src/transformers/models/albert/tokenization_albert_fast.py index 1d6e82b12d..cb817ddcc0 100644 --- a/src/transformers/models/albert/tokenization_albert_fast.py +++ b/src/transformers/models/albert/tokenization_albert_fast.py @@ -184,37 +184,6 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast): return cls + token_ids_0 + sep return cls + token_ids_0 + sep + token_ids_1 + sep - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of ids. - token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Set to True if the token list is already formatted with special tokens for the model - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is not None: - return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] - def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: diff --git a/src/transformers/models/barthez/tokenization_barthez.py b/src/transformers/models/barthez/tokenization_barthez.py index 428f6fec65..641cc80c1d 100644 --- a/src/transformers/models/barthez/tokenization_barthez.py +++ b/src/transformers/models/barthez/tokenization_barthez.py @@ -180,12 +180,9 @@ class BarthezTokenizer(PreTrainedTokenizer): :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/barthez/tokenization_barthez_fast.py b/src/transformers/models/barthez/tokenization_barthez_fast.py index 1a9610c556..224bfb6453 100644 --- a/src/transformers/models/barthez/tokenization_barthez_fast.py +++ b/src/transformers/models/barthez/tokenization_barthez_fast.py @@ -164,36 +164,6 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast): sep = [self.sep_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] - def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: diff --git a/src/transformers/models/bert/tokenization_bert.py b/src/transformers/models/bert/tokenization_bert.py index 8f3ecfabf6..fbb2cfc029 100644 --- a/src/transformers/models/bert/tokenization_bert.py +++ b/src/transformers/models/bert/tokenization_bert.py @@ -290,12 +290,9 @@ class BertTokenizer(PreTrainedTokenizer): """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is not None: return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] diff --git a/src/transformers/models/bertweet/tokenization_bertweet.py b/src/transformers/models/bertweet/tokenization_bertweet.py index aaeffd7380..bf110274da 100644 --- a/src/transformers/models/bertweet/tokenization_bertweet.py +++ b/src/transformers/models/bertweet/tokenization_bertweet.py @@ -220,12 +220,9 @@ class BertweetTokenizer(PreTrainedTokenizer): """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/big_bird/tokenization_big_bird.py b/src/transformers/models/big_bird/tokenization_big_bird.py index 3cafcda189..4d03a0a3ac 100644 --- a/src/transformers/models/big_bird/tokenization_big_bird.py +++ b/src/transformers/models/big_bird/tokenization_big_bird.py @@ -219,12 +219,9 @@ class BigBirdTokenizer(PreTrainedTokenizer): :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/camembert/tokenization_camembert.py b/src/transformers/models/camembert/tokenization_camembert.py index eb57acec89..8337d6826c 100644 --- a/src/transformers/models/camembert/tokenization_camembert.py +++ b/src/transformers/models/camembert/tokenization_camembert.py @@ -178,12 +178,9 @@ class CamembertTokenizer(PreTrainedTokenizer): :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/camembert/tokenization_camembert_fast.py b/src/transformers/models/camembert/tokenization_camembert_fast.py index 648da8be70..a6333b98d0 100644 --- a/src/transformers/models/camembert/tokenization_camembert_fast.py +++ b/src/transformers/models/camembert/tokenization_camembert_fast.py @@ -162,36 +162,6 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast): sep = [self.sep_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] - def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: diff --git a/src/transformers/models/deberta/tokenization_deberta.py b/src/transformers/models/deberta/tokenization_deberta.py index 089c6dc509..ddd08e5286 100644 --- a/src/transformers/models/deberta/tokenization_deberta.py +++ b/src/transformers/models/deberta/tokenization_deberta.py @@ -174,12 +174,9 @@ class DebertaTokenizer(GPT2Tokenizer): :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py index a0e80f6b00..78509f88d7 100644 --- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py +++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py @@ -187,16 +187,8 @@ class DebertaV2Tokenizer(PreTrainedTokenizer): """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list( - map( - lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, - token_ids_0, - ) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True ) if token_ids_1 is not None: diff --git a/src/transformers/models/fsmt/tokenization_fsmt.py b/src/transformers/models/fsmt/tokenization_fsmt.py index 124a9541d7..226d18cc3e 100644 --- a/src/transformers/models/fsmt/tokenization_fsmt.py +++ b/src/transformers/models/fsmt/tokenization_fsmt.py @@ -437,16 +437,8 @@ class FSMTTokenizer(PreTrainedTokenizer): """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list( - map( - lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, - token_ids_0, - ) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True ) # no bos used in fairseq if token_ids_1 is not None: diff --git a/src/transformers/models/herbert/tokenization_herbert_fast.py b/src/transformers/models/herbert/tokenization_herbert_fast.py index 2e5ba1d17a..7a67d5e737 100644 --- a/src/transformers/models/herbert/tokenization_herbert_fast.py +++ b/src/transformers/models/herbert/tokenization_herbert_fast.py @@ -126,12 +126,9 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast): :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/m2m_100/tokenization_m2m_100.py b/src/transformers/models/m2m_100/tokenization_m2m_100.py index cbd8a0aa0d..3d2f273d72 100644 --- a/src/transformers/models/m2m_100/tokenization_m2m_100.py +++ b/src/transformers/models/m2m_100/tokenization_m2m_100.py @@ -207,12 +207,10 @@ class M2M100Tokenizer(PreTrainedTokenizer): """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + prefix_ones = [1] * len(self.prefix_tokens) suffix_ones = [1] * len(self.suffix_tokens) if token_ids_1 is None: diff --git a/src/transformers/models/mbart/tokenization_mbart.py b/src/transformers/models/mbart/tokenization_mbart.py index c256132d7e..a38aaf7ef3 100644 --- a/src/transformers/models/mbart/tokenization_mbart.py +++ b/src/transformers/models/mbart/tokenization_mbart.py @@ -149,12 +149,10 @@ class MBartTokenizer(XLMRobertaTokenizer): """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + prefix_ones = [1] * len(self.prefix_tokens) suffix_ones = [1] * len(self.suffix_tokens) if token_ids_1 is None: diff --git a/src/transformers/models/mbart/tokenization_mbart50.py b/src/transformers/models/mbart/tokenization_mbart50.py index f5f1a2f60f..5afd9b215f 100644 --- a/src/transformers/models/mbart/tokenization_mbart50.py +++ b/src/transformers/models/mbart/tokenization_mbart50.py @@ -241,12 +241,10 @@ class MBart50Tokenizer(PreTrainedTokenizer): """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + prefix_ones = [1] * len(self.prefix_tokens) suffix_ones = [1] * len(self.suffix_tokens) if token_ids_1 is None: diff --git a/src/transformers/models/mbart/tokenization_mbart50_fast.py b/src/transformers/models/mbart/tokenization_mbart50_fast.py index bda4b7cf36..f22d02e59b 100644 --- a/src/transformers/models/mbart/tokenization_mbart50_fast.py +++ b/src/transformers/models/mbart/tokenization_mbart50_fast.py @@ -160,38 +160,6 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast): self._src_lang = new_src_lang self.set_src_lang_special_tokens(self._src_lang) - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of ids. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - prefix_ones = [1] * len(self.prefix_tokens) - suffix_ones = [1] * len(self.suffix_tokens) - if token_ids_1 is None: - return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones - return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones - def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: diff --git a/src/transformers/models/mbart/tokenization_mbart_fast.py b/src/transformers/models/mbart/tokenization_mbart_fast.py index e690218315..bbe9ed7d5d 100644 --- a/src/transformers/models/mbart/tokenization_mbart_fast.py +++ b/src/transformers/models/mbart/tokenization_mbart_fast.py @@ -131,38 +131,6 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast): self._src_lang = new_src_lang self.set_src_lang_special_tokens(self._src_lang) - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of ids. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - prefix_ones = [1] * len(self.prefix_tokens) - suffix_ones = [1] * len(self.suffix_tokens) - if token_ids_1 is None: - return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones - return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones - def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: diff --git a/src/transformers/models/mpnet/tokenization_mpnet.py b/src/transformers/models/mpnet/tokenization_mpnet.py index 125fde68a5..8041ec4ec5 100644 --- a/src/transformers/models/mpnet/tokenization_mpnet.py +++ b/src/transformers/models/mpnet/tokenization_mpnet.py @@ -266,12 +266,9 @@ class MPNetTokenizer(PreTrainedTokenizer): :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/phobert/tokenization_phobert.py b/src/transformers/models/phobert/tokenization_phobert.py index e99e58002e..3caca9012d 100644 --- a/src/transformers/models/phobert/tokenization_phobert.py +++ b/src/transformers/models/phobert/tokenization_phobert.py @@ -201,12 +201,9 @@ class PhobertTokenizer(PreTrainedTokenizer): """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/prophetnet/tokenization_prophetnet.py b/src/transformers/models/prophetnet/tokenization_prophetnet.py index cd51662b55..25df78162e 100644 --- a/src/transformers/models/prophetnet/tokenization_prophetnet.py +++ b/src/transformers/models/prophetnet/tokenization_prophetnet.py @@ -203,12 +203,9 @@ class ProphetNetTokenizer(PreTrainedTokenizer): :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/roberta/tokenization_roberta.py b/src/transformers/models/roberta/tokenization_roberta.py index 9a037d1d15..696868fdfc 100644 --- a/src/transformers/models/roberta/tokenization_roberta.py +++ b/src/transformers/models/roberta/tokenization_roberta.py @@ -215,12 +215,9 @@ class RobertaTokenizer(GPT2Tokenizer): :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/speech_to_text/tokenization_speech_to_text.py b/src/transformers/models/speech_to_text/tokenization_speech_to_text.py index bf3402295a..502021d535 100644 --- a/src/transformers/models/speech_to_text/tokenization_speech_to_text.py +++ b/src/transformers/models/speech_to_text/tokenization_speech_to_text.py @@ -199,12 +199,10 @@ class Speech2TextTokenizer(PreTrainedTokenizer): """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.bos_token_id, self.eos_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + prefix_ones = [1] * len(self.prefix_tokens) suffix_ones = [1] if token_ids_1 is None: diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py index 74dc811c6e..90a0159aef 100644 --- a/src/transformers/models/t5/tokenization_t5.py +++ b/src/transformers/models/t5/tokenization_t5.py @@ -157,12 +157,10 @@ class T5Tokenizer(PreTrainedTokenizer): :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + # normal case: some special tokens if token_ids_1 is None: return ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py index 9716193951..3d1e82ac51 100644 --- a/src/transformers/models/tapas/tokenization_tapas.py +++ b/src/transformers/models/tapas/tokenization_tapas.py @@ -510,12 +510,9 @@ class TapasTokenizer(PreTrainedTokenizer): """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is not None: return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) diff --git a/src/transformers/models/xlm/tokenization_xlm.py b/src/transformers/models/xlm/tokenization_xlm.py index d861ccc0ed..95730451fd 100644 --- a/src/transformers/models/xlm/tokenization_xlm.py +++ b/src/transformers/models/xlm/tokenization_xlm.py @@ -906,16 +906,8 @@ class XLMTokenizer(PreTrainedTokenizer): """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list( - map( - lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, - token_ids_0, - ) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True ) if token_ids_1 is not None: diff --git a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py index ba1d160ee2..188292ed17 100644 --- a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py +++ b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py @@ -200,12 +200,9 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer): """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py index 4549d212ec..8ecec6dffe 100644 --- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py @@ -206,12 +206,9 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py index 9426d6c4aa..fbdeca2e1a 100644 --- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py +++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py @@ -172,37 +172,6 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast): sep = [self.sep_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] - def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: diff --git a/src/transformers/models/xlnet/tokenization_xlnet.py b/src/transformers/models/xlnet/tokenization_xlnet.py index 4980f450cb..97fd542c28 100644 --- a/src/transformers/models/xlnet/tokenization_xlnet.py +++ b/src/transformers/models/xlnet/tokenization_xlnet.py @@ -270,12 +270,9 @@ class XLNetTokenizer(PreTrainedTokenizer): """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is not None: return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1] diff --git a/src/transformers/models/xlnet/tokenization_xlnet_fast.py b/src/transformers/models/xlnet/tokenization_xlnet_fast.py index f3a46c2d78..364dccf3d6 100644 --- a/src/transformers/models/xlnet/tokenization_xlnet_fast.py +++ b/src/transformers/models/xlnet/tokenization_xlnet_fast.py @@ -190,37 +190,6 @@ class XLNetTokenizerFast(PreTrainedTokenizerFast): return token_ids_0 + sep + cls return token_ids_0 + sep + token_ids_1 + sep + cls - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is not None: - return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1] - return ([0] * len(token_ids_0)) + [1, 1] - def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index 0606c3f9c1..fafe8a5597 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -670,6 +670,16 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): Returns: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) @overload diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py index 7973c1e1dd..ec154a9b1c 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py @@ -225,12 +225,9 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer): :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst index 67384736e7..7510fe44e9 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst @@ -46,8 +46,7 @@ Tips: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}TokenizerFast - :members: build_inputs_with_special_tokens, get_special_tokens_mask, - create_token_type_ids_from_sequences, save_vocabulary + :members: {% if "PyTorch" in cookiecutter.generate_tensorflow_and_pytorch -%}