From 5164ea91a7b4d35cb03867233527fa383a651775 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Mon, 9 Mar 2020 13:48:58 -0400 Subject: [PATCH] Skipping outputs (#3116) * Minimal example * Proposal 2 * Proposal 2 for fast tokenizers * Typings * Docs * Revert "Docs" for easier review This reverts commit eaf0f97062e809887704a542144c537f769d5223. * Remove unnecessary assignments * Tests * Fix faulty type * Remove prints * return_outputs -> model_input_names * Revert "Revert "Docs" for easier review" This reverts commit 6fdc69408102bf695797f2dfddbb6350c6b9e722. * code quality --- src/transformers/tokenization_distilbert.py | 2 + src/transformers/tokenization_roberta.py | 2 + src/transformers/tokenization_utils.py | 348 ++++++++++++-------- tests/test_tokenization_common.py | 62 ++-- 4 files changed, 251 insertions(+), 163 deletions(-) diff --git a/src/transformers/tokenization_distilbert.py b/src/transformers/tokenization_distilbert.py index 7a5bf34367..626e65486b 100644 --- a/src/transformers/tokenization_distilbert.py +++ b/src/transformers/tokenization_distilbert.py @@ -69,6 +69,7 @@ class DistilBertTokenizer(BertTokenizer): pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + model_input_names = ["attention_mask"] class DistilBertTokenizerFast(BertTokenizerFast): @@ -76,3 +77,4 @@ class DistilBertTokenizerFast(BertTokenizerFast): pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + model_input_names = ["attention_mask"] diff --git a/src/transformers/tokenization_roberta.py b/src/transformers/tokenization_roberta.py index 7275ceb4ca..5076f8764c 100644 --- a/src/transformers/tokenization_roberta.py +++ b/src/transformers/tokenization_roberta.py @@ -119,6 +119,7 @@ class RobertaTokenizer(GPT2Tokenizer): 
vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["attention_mask"] def __init__( self, @@ -244,6 +245,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast): vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["attention_mask"] def __init__( self, diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index 75119e9285..cdb5e2839a 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -22,6 +22,7 @@ import os import re from collections import defaultdict from contextlib import contextmanager +from typing import List, Optional, Tuple, Union from tokenizers.implementations import BaseTokenizer @@ -138,6 +139,7 @@ class PreTrainedTokenizer(object): pretrained_vocab_files_map = {} pretrained_init_configuration = {} max_model_input_sizes = {} + model_input_names = ["token_type_ids", "attention_mask"] SPECIAL_TOKENS_ATTRIBUTES = [ "bos_token", @@ -316,6 +318,7 @@ class PreTrainedTokenizer(object): # Padding side is right by default and over-riden in subclasses. If specified in the kwargs, it is changed. 
self.padding_side = kwargs.pop("padding_side", self.padding_side) + self.model_input_names = kwargs.pop("model_input_names", self.model_input_names) # Added tokens self.added_tokens_encoder = {} @@ -849,14 +852,14 @@ class PreTrainedTokenizer(object): def encode( self, - text, - text_pair=None, - add_special_tokens=True, - max_length=None, - stride=0, - truncation_strategy="longest_first", - pad_to_max_length=False, - return_tensors=None, + text: str, + text_pair: Optional[str] = None, + add_special_tokens: bool = True, + max_length: Optional[int] = None, + stride: int = 0, + truncation_strategy: str = "longest_first", + pad_to_max_length: bool = False, + return_tensors: Optional[str] = None, **kwargs ): """ @@ -865,34 +868,43 @@ class PreTrainedTokenizer(object): Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``. Args: - text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using + text (:obj:`str` or :obj:`List[str]`): + The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` method) - text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized + text_pair (:obj:`str` or :obj:`List[str]`, `optional`, defaults to :obj:`None`): + Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` method) - add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative + add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): + If set to ``True``, the sequences will be encoded with the special tokens relative to their model. - max_length: if set to a number, will limit the total sequence returned so that it has a maximum length. 
+ max_length (:obj:`int`, `optional`, defaults to :obj:`None`): + If set to a number, will limit the total sequence returned so that it has a maximum length. If there are overflowing tokens, those will be added to the returned dictionary - stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens + stride (:obj:`int`, `optional`, defaults to ``0``): + If set to a number along with max_length, the overflowing tokens returned will contain some tokens from the main sequence returned. The value of this argument defines the number of additional tokens. - truncation_strategy: string selected in the following options: + truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`): + String selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length - starting from the longest one at each token (when there is a pair of input sequences) + starting from the longest one at each token (when there is a pair of input sequences) - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) - pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and - padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length. - The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings: + pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`): + If set to True, the returned sequences will be padded according to the model's padding side and + padding index, up to their max length. If no max length is specified, the padding is done up to the + model's max length. 
The tokenizer padding sides are handled by the class attribute `padding_side` + which can be set to the following strings: + - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences Defaults to False: no padding. - return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant - or PyTorch torch.Tensor instead of a list of python integers. - add_prefix_space: Only applies to GPT-2 and RoBERTa tokenizers. When `True`, this ensures that the sequence - begins with an empty space. False by default except for when using RoBERTa with `add_special_tokens=True`. + return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): + Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant` + or PyTorch :obj:`torch.Tensor` instead of a list of python integers. **kwargs: passed to the `self.tokenize()` method """ encoded_inputs = self.encode_plus( @@ -911,59 +923,79 @@ class PreTrainedTokenizer(object): def encode_plus( self, - text, - text_pair=None, - add_special_tokens=True, - max_length=None, - stride=0, - truncation_strategy="longest_first", - pad_to_max_length=False, - return_tensors=None, - return_token_type_ids=True, - return_attention_mask=True, - return_overflowing_tokens=False, - return_special_tokens_mask=False, - return_offsets_mapping=False, + text: str, + text_pair: Optional[str] = None, + add_special_tokens: bool = True, + max_length: Optional[int] = None, + stride: int = 0, + truncation_strategy: str = "longest_first", + pad_to_max_length: bool = False, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, **kwargs ): """ - Returns a dictionary containing the encoded sequence or sequence pair and additional informations: + Returns a dictionary containing the 
encoded sequence or sequence pair and additional information: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. Args: - text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using + text (:obj:`str` or :obj:`List[str]`): + The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` method) - text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized + text_pair (:obj:`str` or :obj:`List[str]`, `optional`, defaults to :obj:`None`): + Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` method) - add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative + add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): + If set to ``True``, the sequences will be encoded with the special tokens relative to their model. - max_length: if set to a number, will limit the total sequence returned so that it has a maximum length. + max_length (:obj:`int`, `optional`, defaults to :obj:`None`): + If set to a number, will limit the total sequence returned so that it has a maximum length. If there are overflowing tokens, those will be added to the returned dictionary - stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens + stride (:obj:`int`, `optional`, defaults to ``0``): + If set to a number along with max_length, the overflowing tokens returned will contain some tokens from the main sequence returned. The value of this argument defines the number of additional tokens. 
- truncation_strategy: string selected in the following options: + truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`): + String selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length - starting from the longest one at each token (when there is a pair of input sequences) + starting from the longest one at each token (when there is a pair of input sequences) - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) - pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and - padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length. - The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings: + pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`): + If set to True, the returned sequences will be padded according to the model's padding side and + padding index, up to their max length. If no max length is specified, the padding is done up to the + model's max length. The tokenizer padding sides are handled by the class attribute `padding_side` + which can be set to the following strings: + - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences Defaults to False: no padding. - return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant - or PyTorch torch.Tensor instead of a list of python integers. - add_prefix_space: Only applies to GPT-2 and RoBERTa tokenizers. When `True`, this ensures that the sequence - begins with an empty space. False by default except for when using RoBERTa with `add_special_tokens=True`. 
- return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True). - return_attention_mask: (optional) Set to False to avoid returning attention mask (default True) - return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False). - return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False). - return_offsets_mapping: (optional) Set to True to return (char_start, char_end) for each token (default False). + return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): + Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant` + or PyTorch :obj:`torch.Tensor` instead of a list of python integers. + return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`): + Whether to return token type IDs. If left to the default, will return the token type IDs according + to the specific tokenizer's default, defined by the :obj:`model_input_names` attribute. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`None`): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the :obj:`model_input_names` attribute. + + `What are attention masks? <../glossary.html#attention-mask>`__ + return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return overflowing token information (default False). + return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return special tokens mask information (default False). + return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return (char_start, char_end) for each token (default False). If using Python's tokenizer, this method will raise NotImplementedError. 
This one is only available on Rust-based tokenizers inheriting from PreTrainedTokenizerFast. **kwargs: passed to the `self.tokenize()` method @@ -981,13 +1013,14 @@ class PreTrainedTokenizer(object): } With the fields: - ``input_ids``: list of token ids to be fed to a model - ``token_type_ids``: list of token type ids to be fed to a model - ``attention_mask``: list of indices specifying which tokens should be attended to by the model - ``overflowing_tokens``: list of overflowing tokens if a max length is specified. - ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added - tokens and 1 specifying sequence tokens. + + - ``input_ids``: list of token ids to be fed to a model + - ``token_type_ids``: list of token type ids to be fed to a model + - ``attention_mask``: list of indices specifying which tokens should be attended to by the model + - ``overflowing_tokens``: list of overflowing tokens if a max length is specified. + - ``num_truncated_tokens``: number of overflowing tokens if a ``max_length`` is specified + - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 1 specifying special added + tokens and 0 specifying sequence tokens. 
""" def get_input_ids(text): @@ -1038,19 +1071,19 @@ class PreTrainedTokenizer(object): def batch_encode_plus( self, - batch_text_or_text_pairs=None, - add_special_tokens=True, - max_length=None, - stride=0, - truncation_strategy="longest_first", - pad_to_max_length=False, - return_tensors=None, - return_token_type_ids=True, - return_attention_masks=True, - return_overflowing_tokens=False, - return_special_tokens_masks=False, - return_offsets_mapping=False, - return_input_lengths=False, + batch_text_or_text_pairs: Union[str, List[str]], + add_special_tokens: bool = True, + max_length: Optional[int] = None, + stride: int = 0, + truncation_strategy: str = "longest_first", + pad_to_max_length: bool = False, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_masks: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_masks: bool = False, + return_offsets_mapping: bool = False, + return_input_lengths: bool = False, **kwargs ): """ @@ -1058,32 +1091,59 @@ class PreTrainedTokenizer(object): the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. Args: - batch_text_or_text_pairs: Batch of sequences or pair of sequences to be encoded. + batch_text_or_text_pairs (:obj:`List[str]` or :obj:`List[List[str]]`): + Batch of sequences or pair of sequences to be encoded. This can be a list of string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see details in encode_plus) - add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative + add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): + If set to ``True``, the sequences will be encoded with the special tokens relative to their model. - max_length: if set to a number, will limit the total sequence returned so that it has a maximum length. 
- If there are overflowing tokens, those will be added to the returned dictionary` - stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens + max_length (:obj:`int`, `optional`, defaults to :obj:`None`): + If set to a number, will limit the total sequence returned so that it has a maximum length. + If there are overflowing tokens, those will be added to the returned dictionary + stride (:obj:`int`, `optional`, defaults to ``0``): + If set to a number along with max_length, the overflowing tokens returned will contain some tokens from the main sequence returned. The value of this argument defines the number of additional tokens. - truncation_strategy: string selected in the following options: + truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`): + String selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length - starting from the longest one at each token (when there is a pair of input sequences) + starting from the longest one at each token (when there is a pair of input sequences) - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) - pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and - padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length. - The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings: + pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`): + If set to True, the returned sequences will be padded according to the model's padding side and + padding index, up to their max length. 
If no max length is specified, the padding is done up to the + model's max length. The tokenizer padding sides are handled by the class attribute `padding_side` + which can be set to the following strings: + - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences Defaults to False: no padding. - return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant - or PyTorch torch.Tensor instead of a list of python integers. - return_input_lengths: (optional) If set the resulting dictionary will include the length of each sample - return_attention_masks: (optional) Set to True to return the attention mask (default False) - return_offsets_mapping: (optional) Not available, should be set to False or it will throw NotImplementError + return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): + Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant` + or PyTorch :obj:`torch.Tensor` instead of a list of python integers. + return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`): + Whether to return token type IDs. If left to the default, will return the token type IDs according + to the specific tokenizer's default, defined by the :obj:`model_input_names` attribute. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + return_attention_masks (:obj:`bool`, `optional`, defaults to :obj:`None`): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the :obj:`model_input_names` attribute. + + `What are attention masks? <../glossary.html#attention-mask>`__ + return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return overflowing token information (default False). 
+ return_special_tokens_masks (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return special tokens mask information (default False). + return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return (char_start, char_end) for each token (default False). + If using Python's tokenizer, this method will raise NotImplementedError. This one is only available on + Rust-based tokenizers inheriting from PreTrainedTokenizerFast. + return_input_lengths (:obj:`bool`, `optional`, defaults to :obj:`False`): + If set, the resulting dictionary will include the length of each sample **kwargs: passed to the `self.tokenize()` method Return: @@ -1099,13 +1159,14 @@ class PreTrainedTokenizer(object): } With the fields: - ``input_ids``: list of token ids to be fed to a model - ``token_type_ids``: list of token type ids to be fed to a model - ``attention_mask``: list of indices specifying which tokens should be attended to by the model - ``overflowing_tokens``: list of overflowing tokens if a max length is specified. - ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added - tokens and 1 specifying sequence tokens. + + - ``input_ids``: list of token ids to be fed to a model + - ``token_type_ids``: list of token type ids to be fed to a model + - ``attention_mask``: list of indices specifying which tokens should be attended to by the model + - ``overflowing_tokens``: list of overflowing tokens if a max length is specified. + - ``num_truncated_tokens``: number of overflowing tokens if a ``max_length`` is specified + - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 1 specifying special added + tokens and 0 specifying sequence tokens. 
""" def get_input_ids(text): @@ -1220,18 +1281,18 @@ class PreTrainedTokenizer(object): def prepare_for_model( self, - ids, - pair_ids=None, - max_length=None, - add_special_tokens=True, - stride=0, - truncation_strategy="longest_first", - pad_to_max_length=False, - return_tensors=None, - return_token_type_ids=True, - return_attention_mask=True, - return_overflowing_tokens=False, - return_special_tokens_mask=False, + ids: List[int], + pair_ids: Optional[List[int]] = None, + max_length: Optional[int] = None, + add_special_tokens: bool = True, + stride: int = 0, + truncation_strategy: str = "longest_first", + pad_to_max_length: bool = False, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, ): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. @@ -1292,6 +1353,11 @@ class PreTrainedTokenizer(object): len_ids = len(ids) len_pair_ids = len(pair_ids) if pair else 0 + if return_token_type_ids is None: + return_token_type_ids = "token_type_ids" in self.model_input_names + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + encoded_inputs = {} # Handle max sequence length @@ -1617,6 +1683,9 @@ class PreTrainedTokenizer(object): class PreTrainedTokenizerFast(PreTrainedTokenizer): + + model_input_names = ["token_type_ids", "attention_mask"] + def __init__(self, tokenizer: BaseTokenizer, **kwargs): if tokenizer is None: raise ValueError("Provided tokenizer cannot be None") @@ -1685,16 +1754,21 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer): if self._tokenizer is not None: self._tokenizer.add_special_tokens(self.all_special_tokens) - @staticmethod def _convert_encoding( + self, encoding, return_tensors=None, - return_token_type_ids=True, - return_attention_mask=True, + 
return_token_type_ids=None, + return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_offsets_mapping=False, ): + if return_token_type_ids is None: + return_token_type_ids = "token_type_ids" in self.model_input_names + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + if return_overflowing_tokens and encoding.overflowing is not None: encodings = [encoding] + encoding.overflowing else: @@ -1774,18 +1848,18 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer): def batch_encode_plus( self, - batch_text_or_text_pairs=None, - add_special_tokens=True, - max_length=None, - stride=0, - truncation_strategy="longest_first", - pad_to_max_length=False, - return_tensors=None, - return_token_type_ids=True, - return_attention_mask=True, - return_overflowing_tokens=False, - return_special_tokens_mask=False, - return_offsets_mapping=False, + batch_text_or_text_pairs: Optional[Union[List[str], List[Tuple[str]]]] = None, + add_special_tokens: bool = True, + max_length: Optional[int] = None, + stride: int = 0, + truncation_strategy: str = "longest_first", + pad_to_max_length: bool = False, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, **kwargs ): if not add_special_tokens: @@ -1868,19 +1942,19 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer): def encode_plus( self, - text, - text_pair=None, - add_special_tokens=False, - max_length=None, - pad_to_max_length=False, - stride=0, - truncation_strategy="longest_first", - return_tensors=None, - return_token_type_ids=True, - return_attention_mask=True, - return_overflowing_tokens=False, - return_special_tokens_mask=False, - return_offsets_mapping=False, + text: str, + text_pair: Optional[str] = None, + add_special_tokens: 
bool = False, + max_length: Optional[int] = None, + pad_to_max_length: bool = False, + stride: int = 0, + truncation_strategy: str = "longest_first", + return_tensors: Optional[bool] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, **kwargs ): batched_input = [(text, text_pair)] if text_pair else [text] diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index b1f69fbfc1..3534eeef06 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -48,7 +48,7 @@ class TokenizerTesterMixin: # to the concatenated encode_plus format: [{'input_ids': [...], ...}, {'input_ids': [...], ...}] return [ {value: batch_encode_plus_sequences[value][i] for value in batch_encode_plus_sequences.keys()} - for i in range(len(batch_encode_plus_sequences)) + for i in range(len(batch_encode_plus_sequences["input_ids"])) ] def test_tokenizers_common_properties(self): @@ -261,7 +261,10 @@ class TokenizerTesterMixin: def test_mask_output(self): tokenizer = self.get_tokenizer() - if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer": + if ( + tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer" + and "token_type_ids" in tokenizer.model_input_names + ): seq_0 = "Test this method." seq_1 = "With these inputs." 
information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True) @@ -504,51 +507,58 @@ class TokenizerTesterMixin: encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True) input_ids = encoded_sequence["input_ids"] - token_type_ids = encoded_sequence["token_type_ids"] - attention_mask = encoded_sequence["attention_mask"] special_tokens_mask = encoded_sequence["special_tokens_mask"] sequence_length = len(input_ids) # Test right padding tokenizer.padding_side = "right" - padded_sequence = tokenizer.encode_plus( + right_padded_sequence = tokenizer.encode_plus( sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True, ) - padded_input_ids = padded_sequence["input_ids"] - padded_token_type_ids = padded_sequence["token_type_ids"] - padded_attention_mask = padded_sequence["attention_mask"] - padded_special_tokens_mask = padded_sequence["special_tokens_mask"] - padded_sequence_length = len(padded_input_ids) + right_padded_input_ids = right_padded_sequence["input_ids"] - assert sequence_length + padding_size == padded_sequence_length - assert input_ids + [padding_idx] * padding_size == padded_input_ids - assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids - assert attention_mask + [0] * padding_size == padded_attention_mask - assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask + right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] + right_padded_sequence_length = len(right_padded_input_ids) + + assert sequence_length + padding_size == right_padded_sequence_length + assert input_ids + [padding_idx] * padding_size == right_padded_input_ids + assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask # Test left padding tokenizer.padding_side = "left" - padded_sequence = tokenizer.encode_plus( + left_padded_sequence = tokenizer.encode_plus( sequence, max_length=sequence_length 
+ padding_size, pad_to_max_length=True, return_special_tokens_mask=True, ) - padded_input_ids = padded_sequence["input_ids"] - padded_token_type_ids = padded_sequence["token_type_ids"] - padded_attention_mask = padded_sequence["attention_mask"] - padded_special_tokens_mask = padded_sequence["special_tokens_mask"] - padded_sequence_length = len(padded_input_ids) + left_padded_input_ids = left_padded_sequence["input_ids"] + left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] + left_padded_sequence_length = len(left_padded_input_ids) - assert sequence_length + padding_size == padded_sequence_length - assert [padding_idx] * padding_size + input_ids == padded_input_ids - assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids - assert [0] * padding_size + attention_mask == padded_attention_mask - assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask + assert sequence_length + padding_size == left_padded_sequence_length + assert [padding_idx] * padding_size + input_ids == left_padded_input_ids + assert [1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask + + if "token_type_ids" in tokenizer.model_input_names: + token_type_ids = encoded_sequence["token_type_ids"] + left_padded_token_type_ids = left_padded_sequence["token_type_ids"] + right_padded_token_type_ids = right_padded_sequence["token_type_ids"] + + assert token_type_ids + [token_type_padding_idx] * padding_size == right_padded_token_type_ids + assert [token_type_padding_idx] * padding_size + token_type_ids == left_padded_token_type_ids + + if "attention_mask" in tokenizer.model_input_names: + attention_mask = encoded_sequence["attention_mask"] + right_padded_attention_mask = right_padded_sequence["attention_mask"] + left_padded_attention_mask = left_padded_sequence["attention_mask"] + + assert attention_mask + [0] * padding_size == right_padded_attention_mask + assert [0] * padding_size + attention_mask == 
left_padded_attention_mask def test_separate_tokenizers(self): # This tests that tokenizers don't impact others. Unfortunately the case where it fails is when