From 4b0418df11886547e2c701cc4504627881397a0b Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Fri, 13 Sep 2024 12:58:38 +0200 Subject: [PATCH] Enable `padding_side` as call time kwargs (#33385) * fix * add padding-side kwarg * add padding side in all models & fix tests * fix copies * fix tests --- .../layoutlmv2/tokenization_layoutlmv2.py | 29 +++++++++-- .../tokenization_layoutlmv2_fast.py | 22 +++++++-- .../layoutlmv3/tokenization_layoutlmv3.py | 29 +++++++++-- .../tokenization_layoutlmv3_fast.py | 22 +++++++-- .../layoutxlm/tokenization_layoutxlm.py | 23 +++++++-- .../layoutxlm/tokenization_layoutxlm_fast.py | 18 +++++-- .../models/led/tokenization_led.py | 2 + .../models/led/tokenization_led_fast.py | 2 + .../models/luke/tokenization_luke.py | 29 +++++++++-- .../models/markuplm/tokenization_markuplm.py | 29 +++++++++-- .../markuplm/tokenization_markuplm_fast.py | 22 +++++++-- .../models/mluke/tokenization_mluke.py | 29 +++++++++-- .../models/roc_bert/tokenization_roc_bert.py | 17 +++++-- .../models/tapas/tokenization_tapas.py | 27 ++++++++-- .../models/udop/tokenization_udop.py | 27 ++++++++-- .../models/udop/tokenization_udop_fast.py | 22 +++++++-- .../models/wav2vec2/tokenization_wav2vec2.py | 6 +++ src/transformers/tokenization_utils.py | 7 +++ src/transformers/tokenization_utils_base.py | 37 ++++++++++++-- src/transformers/tokenization_utils_fast.py | 10 +++- .../test_tokenization_layoutlmv2.py | 44 ++++++++++------- .../test_tokenization_layoutlmv3.py | 44 ++++++++++------- .../layoutxlm/test_tokenization_layoutxlm.py | 44 ++++++++++------- .../markuplm/test_tokenization_markuplm.py | 44 ++++++++++------- tests/models/tapas/test_tokenization_tapas.py | 43 +++++++++------- tests/test_tokenization_common.py | 49 ++++++++++++------- 26 files changed, 528 insertions(+), 149 deletions(-) diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py index fe03055623..c5ec79666d 100644 --- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py @@ -414,6 +414,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -517,6 +518,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -539,6 +541,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -567,6 +570,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -598,6 +602,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -625,6 +630,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -653,6 +659,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -677,6 +684,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -708,6 +716,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward + padding_side=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -728,6 +737,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -748,6 +758,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -769,6 +780,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -795,6 +807,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -838,6 +851,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -861,6 +875,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -891,6 +906,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -914,6 +930,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1100,6 +1117,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): max_length=max_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1243,6 +1261,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -1265,6 +1284,9 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -1288,7 +1310,8 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): if needs_to_be_padded: difference = max_length - len(required_input) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -1302,7 +1325,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -1317,7 +1340,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer): encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py index aa2bf6b322..a666e3d4ea 100644 --- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py +++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py @@ -165,6 +165,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -268,6 +269,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -290,6 +292,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -318,6 +321,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -349,6 +353,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -381,6 +386,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -424,6 +430,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -451,6 +458,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -470,6 +478,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, ) if is_pair: @@ -603,6 +612,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[bool] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -631,6 +641,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -663,6 +674,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -685,6 +697,9 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast): pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -708,7 +723,8 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast): if needs_to_be_padded: difference = max_length - len(required_input) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -722,7 +738,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast): if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -737,7 +753,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast): encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py index 89f899f22f..248a299c14 100644 --- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py @@ -543,6 +543,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -646,6 +647,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -668,6 +670,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -697,6 +700,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -728,6 +732,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -756,6 +761,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -784,6 +790,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -809,6 +816,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -840,6 +848,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward + padding_side=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -860,6 +869,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -881,6 +891,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -902,6 +913,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -929,6 +941,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -972,6 +985,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -996,6 +1010,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1026,6 +1041,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -1049,6 +1065,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1237,6 +1254,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): max_length=max_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1382,6 +1400,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -1404,6 +1423,9 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -1427,7 +1449,8 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): if needs_to_be_padded: difference = max_length - len(required_input) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -1441,7 +1464,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -1456,6 +1479,6 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py index 07bedf3613..63cd1022e5 100644 --- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py +++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py @@ -217,6 +217,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -320,6 +321,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -342,6 +344,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -371,6 +374,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -402,6 +406,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -436,6 +441,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -479,6 +485,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -506,6 +513,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -525,6 +533,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, ) if is_pair: @@ -664,6 +673,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[bool] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -692,6 +702,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -725,6 +736,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -747,6 +759,9 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast): pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -770,7 +785,8 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast): if needs_to_be_padded: difference = max_length - len(required_input) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -784,7 +800,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast): if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -799,7 +815,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast): encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py index 3ab57ac892..248f16af84 100644 --- a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py +++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py @@ -447,6 +447,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -550,6 +551,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -572,6 +574,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -599,6 +602,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -627,6 +631,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -651,6 +656,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -682,6 +688,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward + padding_side=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -702,6 +709,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer): padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -721,6 +729,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -751,6 +760,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -774,6 +784,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -947,6 +958,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer): max_length=max_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1090,6 +1102,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -1112,6 +1125,9 @@ class LayoutXLMTokenizer(PreTrainedTokenizer): pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -1135,7 +1151,8 @@ class LayoutXLMTokenizer(PreTrainedTokenizer): if needs_to_be_padded: difference = max_length - len(required_input) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -1149,7 +1166,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer): if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -1164,6 +1181,6 @@ class LayoutXLMTokenizer(PreTrainedTokenizer): encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py index 6d68cb9f18..7d12cec496 100644 --- a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py +++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py @@ -277,6 +277,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -380,6 +381,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -402,6 +404,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -442,6 +445,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -462,6 +466,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, ) if is_pair: @@ -595,6 +600,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[bool] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -623,6 +629,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -655,6 +662,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -677,6 +685,9 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast): pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -700,7 +711,8 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast): if needs_to_be_padded: difference = max_length - len(required_input) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -714,7 +726,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast): if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -729,7 +741,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast): encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/led/tokenization_led.py b/src/transformers/models/led/tokenization_led.py index aaf09e6d14..6c1ec9526a 100644 --- a/src/transformers/models/led/tokenization_led.py +++ b/src/transformers/models/led/tokenization_led.py @@ -412,6 +412,7 @@ class LEDTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: encoded_inputs = super()._pad( @@ -419,6 +420,7 @@ class LEDTokenizer(PreTrainedTokenizer): max_length=max_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) diff --git a/src/transformers/models/led/tokenization_led_fast.py b/src/transformers/models/led/tokenization_led_fast.py index ca15eb997b..6ee69fbe79 100644 --- a/src/transformers/models/led/tokenization_led_fast.py +++ b/src/transformers/models/led/tokenization_led_fast.py @@ -288,6 +288,7 @@ class LEDTokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: encoded_inputs = super()._pad( @@ -295,6 +296,7 @@ class LEDTokenizerFast(PreTrainedTokenizerFast): max_length=max_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) diff --git a/src/transformers/models/luke/tokenization_luke.py b/src/transformers/models/luke/tokenization_luke.py index 1a570992ff..e06b9c753f 100644 --- a/src/transformers/models/luke/tokenization_luke.py +++ b/src/transformers/models/luke/tokenization_luke.py @@ -570,6 +570,7 @@ class LukeTokenizer(PreTrainedTokenizer): stride: int = 0, is_split_into_words: Optional[bool] = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -662,6 +663,7 @@ class LukeTokenizer(PreTrainedTokenizer): stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -688,6 +690,7 @@ class LukeTokenizer(PreTrainedTokenizer): stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -715,6 +718,7 @@ class LukeTokenizer(PreTrainedTokenizer): stride: int = 0, is_split_into_words: Optional[bool] = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -769,6 +773,7 @@ class LukeTokenizer(PreTrainedTokenizer): max_entity_length=max_entity_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -796,6 +801,7 @@ class LukeTokenizer(PreTrainedTokenizer): stride: int = 0, is_split_into_words: Optional[bool] = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -876,6 +882,7 @@ class LukeTokenizer(PreTrainedTokenizer): max_entity_length=max_entity_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -1070,6 +1077,7 @@ class LukeTokenizer(PreTrainedTokenizer): max_entity_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1112,6 +1120,7 @@ class LukeTokenizer(PreTrainedTokenizer): max_entity_length=max_entity_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward + padding_side=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -1132,6 +1141,7 @@ class LukeTokenizer(PreTrainedTokenizer): padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1155,6 +1165,7 @@ class LukeTokenizer(PreTrainedTokenizer): max_entity_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1357,6 +1368,7 @@ class LukeTokenizer(PreTrainedTokenizer): max_entity_length=max_entity_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1382,6 +1394,7 @@ class LukeTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, max_entity_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, verbose: bool = True, @@ -1418,6 +1431,9 @@ class LukeTokenizer(PreTrainedTokenizer): pad_to_multiple_of (`int`, *optional*): If set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask (`bool`, *optional*): Whether to return the attention mask. If left to the default, will return the attention mask according to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are attention @@ -1495,6 +1511,7 @@ class LukeTokenizer(PreTrainedTokenizer): max_entity_length=max_entity_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) return BatchEncoding(encoded_inputs, tensor_type=return_tensors) @@ -1519,6 +1536,7 @@ class LukeTokenizer(PreTrainedTokenizer): max_entity_length=max_entity_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1536,6 +1554,7 @@ class LukeTokenizer(PreTrainedTokenizer): max_entity_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -1562,6 +1581,9 @@ class LukeTokenizer(PreTrainedTokenizer): pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -1600,9 +1622,10 @@ class LukeTokenizer(PreTrainedTokenizer): if needs_to_be_padded: difference = max_length - len(encoded_inputs["input_ids"]) + padding_side = padding_side if padding_side is not None else self.padding_side if entities_provided: entity_difference = max_entity_length - len(encoded_inputs["entity_ids"]) - if self.padding_side == "right": + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if entities_provided: @@ -1633,7 +1656,7 @@ class LukeTokenizer(PreTrainedTokenizer): encoded_inputs["entity_end_positions"] + [0] * entity_difference ) - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if entities_provided: @@ -1664,7 +1687,7 @@ class LukeTokenizer(PreTrainedTokenizer): "entity_end_positions" ] else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/markuplm/tokenization_markuplm.py b/src/transformers/models/markuplm/tokenization_markuplm.py index c77865abc9..e5de1e4e76 100644 --- a/src/transformers/models/markuplm/tokenization_markuplm.py +++ b/src/transformers/models/markuplm/tokenization_markuplm.py @@ -503,6 +503,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -602,6 +603,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -624,6 +626,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -652,6 +655,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -683,6 +687,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -710,6 +715,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -738,6 +744,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -762,6 +769,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -793,6 +801,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward + padding_side=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -813,6 +822,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer): padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -833,6 +843,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -854,6 +865,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -880,6 +892,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -923,6 +936,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -946,6 +960,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -976,6 +991,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -999,6 +1015,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1203,6 +1220,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer): max_length=max_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1357,6 +1375,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -1376,6 +1395,9 @@ class MarkupLMTokenizer(PreTrainedTokenizer): pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -1399,7 +1421,8 @@ class MarkupLMTokenizer(PreTrainedTokenizer): if needs_to_be_padded: difference = max_length - len(required_input) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -1419,7 +1442,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer): if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -1440,6 +1463,6 @@ class MarkupLMTokenizer(PreTrainedTokenizer): encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/markuplm/tokenization_markuplm_fast.py b/src/transformers/models/markuplm/tokenization_markuplm_fast.py index ff0e4ffeb5..7964598764 100644 --- a/src/transformers/models/markuplm/tokenization_markuplm_fast.py +++ b/src/transformers/models/markuplm/tokenization_markuplm_fast.py @@ -286,6 +286,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -385,6 +386,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -407,6 +409,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -435,6 +438,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -466,6 +470,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -498,6 +503,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -541,6 +547,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -568,6 +575,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -587,6 +595,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, ) if is_pair: @@ -721,6 +730,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[bool] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -749,6 +759,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -781,6 +792,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -800,6 +812,9 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast): pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -823,7 +838,8 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast): if needs_to_be_padded: difference = max_length - len(required_input) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -843,7 +859,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast): if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -864,7 +880,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast): encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/mluke/tokenization_mluke.py b/src/transformers/models/mluke/tokenization_mluke.py index 3ac8191402..f087c0d92f 100644 --- a/src/transformers/models/mluke/tokenization_mluke.py +++ b/src/transformers/models/mluke/tokenization_mluke.py @@ -399,6 +399,7 @@ class MLukeTokenizer(PreTrainedTokenizer): stride: int = 0, is_split_into_words: Optional[bool] = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -491,6 +492,7 @@ class MLukeTokenizer(PreTrainedTokenizer): stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -517,6 +519,7 @@ class MLukeTokenizer(PreTrainedTokenizer): stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -545,6 +548,7 @@ class MLukeTokenizer(PreTrainedTokenizer): stride: int = 0, is_split_into_words: Optional[bool] = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -599,6 +603,7 @@ class MLukeTokenizer(PreTrainedTokenizer): max_entity_length=max_entity_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -627,6 +632,7 @@ class MLukeTokenizer(PreTrainedTokenizer): stride: int = 0, is_split_into_words: Optional[bool] = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -707,6 +713,7 @@ class MLukeTokenizer(PreTrainedTokenizer): max_entity_length=max_entity_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -904,6 +911,7 @@ class MLukeTokenizer(PreTrainedTokenizer): max_entity_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -946,6 +954,7 @@ class MLukeTokenizer(PreTrainedTokenizer): max_entity_length=max_entity_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward + padding_side=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -966,6 +975,7 @@ class MLukeTokenizer(PreTrainedTokenizer): padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -990,6 +1000,7 @@ class MLukeTokenizer(PreTrainedTokenizer): max_entity_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1192,6 +1203,7 @@ class MLukeTokenizer(PreTrainedTokenizer): max_entity_length=max_entity_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1218,6 +1230,7 @@ class MLukeTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, max_entity_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, verbose: bool = True, @@ -1254,6 +1267,9 @@ class MLukeTokenizer(PreTrainedTokenizer): pad_to_multiple_of (`int`, *optional*): If set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask (`bool`, *optional*): Whether to return the attention mask. If left to the default, will return the attention mask according to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are attention @@ -1331,6 +1347,7 @@ class MLukeTokenizer(PreTrainedTokenizer): max_entity_length=max_entity_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) return BatchEncoding(encoded_inputs, tensor_type=return_tensors) @@ -1355,6 +1372,7 @@ class MLukeTokenizer(PreTrainedTokenizer): max_entity_length=max_entity_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1373,6 +1391,7 @@ class MLukeTokenizer(PreTrainedTokenizer): max_entity_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -1399,6 +1418,9 @@ class MLukeTokenizer(PreTrainedTokenizer): pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -1437,9 +1459,10 @@ class MLukeTokenizer(PreTrainedTokenizer): if needs_to_be_padded: difference = max_length - len(encoded_inputs["input_ids"]) + padding_side = padding_side if padding_side is not None else self.padding_side if entities_provided: entity_difference = max_entity_length - len(encoded_inputs["entity_ids"]) - if self.padding_side == "right": + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if entities_provided: @@ -1470,7 +1493,7 @@ class MLukeTokenizer(PreTrainedTokenizer): encoded_inputs["entity_end_positions"] + [0] * entity_difference ) - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if entities_provided: @@ -1501,7 +1524,7 @@ class MLukeTokenizer(PreTrainedTokenizer): "entity_end_positions" ] else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/roc_bert/tokenization_roc_bert.py b/src/transformers/models/roc_bert/tokenization_roc_bert.py index eaf2a1a491..3a980c0ae6 100644 --- a/src/transformers/models/roc_bert/tokenization_roc_bert.py +++ b/src/transformers/models/roc_bert/tokenization_roc_bert.py @@ -210,6 +210,7 @@ class RoCBertTokenizer(PreTrainedTokenizer): stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -283,6 +284,7 @@ class RoCBertTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -308,6 +310,7 @@ class RoCBertTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -462,6 +465,7 @@ class RoCBertTokenizer(PreTrainedTokenizer): max_length=max_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -480,6 +484,7 @@ class RoCBertTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: # Load from model defaults @@ -502,8 +507,9 @@ class RoCBertTokenizer(PreTrainedTokenizer): if needs_to_be_padded: difference = max_length - len(required_input) + padding_side = padding_side if padding_side is not None else self.padding_side - if self.padding_side == "right": + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -516,7 +522,7 @@ class RoCBertTokenizer(PreTrainedTokenizer): if key in encoded_inputs: encoded_inputs[key] = encoded_inputs[key] + [self.pad_token_id] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -530,7 +536,7 @@ class RoCBertTokenizer(PreTrainedTokenizer): encoded_inputs[key] = [self.pad_token_id] * difference + encoded_inputs[key] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs @@ -551,6 +557,7 @@ class RoCBertTokenizer(PreTrainedTokenizer): stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -627,6 +634,7 @@ class RoCBertTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -650,6 +658,7 @@ class RoCBertTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -686,6 +695,7 @@ class RoCBertTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward + padding_side=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -706,6 +716,7 @@ class RoCBertTokenizer(PreTrainedTokenizer): padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py index 2da9fe40c1..867e53ff89 100644 --- a/src/transformers/models/tapas/tokenization_tapas.py +++ b/src/transformers/models/tapas/tokenization_tapas.py @@ -517,6 +517,7 @@ class TapasTokenizer(PreTrainedTokenizer): truncation: Union[bool, str, TapasTruncationStrategy] = False, max_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -581,6 +582,7 @@ class TapasTokenizer(PreTrainedTokenizer): truncation=truncation, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -602,6 +604,7 @@ class TapasTokenizer(PreTrainedTokenizer): truncation=truncation, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -631,6 +634,7 @@ class TapasTokenizer(PreTrainedTokenizer): truncation: Union[bool, str, TapasTruncationStrategy] = False, max_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -699,6 +703,7 @@ class TapasTokenizer(PreTrainedTokenizer): truncation=truncation, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -738,6 +743,7 @@ class TapasTokenizer(PreTrainedTokenizer): truncation: Union[bool, str, TapasTruncationStrategy] = False, max_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = True, return_attention_mask: Optional[bool] = None, @@ -768,6 +774,7 @@ class TapasTokenizer(PreTrainedTokenizer): add_special_tokens=add_special_tokens, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -797,6 +804,7 @@ class TapasTokenizer(PreTrainedTokenizer): truncation: Union[bool, str, TapasTruncationStrategy] = False, max_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = True, return_attention_mask: Optional[bool] = True, @@ -823,6 +831,7 @@ class TapasTokenizer(PreTrainedTokenizer): truncation=truncation, max_length=max_length, pad_to_multiple_of=None, # we pad in batch afterwards + padding_side=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterwards return_token_type_ids=return_token_type_ids, return_special_tokens_mask=return_special_tokens_mask, @@ -844,6 +853,7 @@ class TapasTokenizer(PreTrainedTokenizer): padding=padding, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -912,6 +922,7 @@ class TapasTokenizer(PreTrainedTokenizer): truncation: Union[bool, str, TapasTruncationStrategy] = False, max_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -968,6 +979,7 @@ class TapasTokenizer(PreTrainedTokenizer): padding=padding, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -993,6 +1005,7 @@ class TapasTokenizer(PreTrainedTokenizer): truncation: Union[bool, str, TapasTruncationStrategy] = False, max_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = True, return_attention_mask: Optional[bool] = True, @@ -1024,6 +1037,7 @@ class TapasTokenizer(PreTrainedTokenizer): padding=padding, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -1051,6 +1065,7 @@ class TapasTokenizer(PreTrainedTokenizer): truncation: Union[bool, str, TapasTruncationStrategy] = False, max_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = True, return_attention_mask: Optional[bool] = True, @@ -1214,6 +1229,7 @@ class TapasTokenizer(PreTrainedTokenizer): max_length=max_length, padding=padding.value, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1754,6 +1770,7 @@ class TapasTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -1776,6 +1793,9 @@ class TapasTokenizer(PreTrainedTokenizer): pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -1799,7 +1819,8 @@ class TapasTokenizer(PreTrainedTokenizer): if needs_to_be_padded: difference = max_length - len(encoded_inputs["input_ids"]) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -1817,7 +1838,7 @@ class TapasTokenizer(PreTrainedTokenizer): if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -1836,7 +1857,7 @@ class TapasTokenizer(PreTrainedTokenizer): encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"] else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/udop/tokenization_udop.py b/src/transformers/models/udop/tokenization_udop.py index 4be9799819..e40c07a58a 100644 --- a/src/transformers/models/udop/tokenization_udop.py +++ b/src/transformers/models/udop/tokenization_udop.py @@ -551,6 +551,7 @@ class UdopTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -654,6 +655,7 @@ class UdopTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -676,6 +678,7 @@ class UdopTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -704,6 +707,7 @@ class UdopTokenizer(PreTrainedTokenizer): stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -746,6 +750,7 @@ class UdopTokenizer(PreTrainedTokenizer): stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -813,6 +818,7 @@ class UdopTokenizer(PreTrainedTokenizer): stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -865,6 +871,7 @@ class UdopTokenizer(PreTrainedTokenizer): stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -892,6 +899,7 @@ class UdopTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -920,6 +928,7 @@ class UdopTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -944,6 +953,7 @@ class UdopTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -975,6 +985,7 @@ class UdopTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward + padding_side=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -995,6 +1006,7 @@ class UdopTokenizer(PreTrainedTokenizer): padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1014,6 +1026,7 @@ class UdopTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1044,6 +1057,7 @@ class UdopTokenizer(PreTrainedTokenizer): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -1067,6 +1081,7 @@ class UdopTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1240,6 +1255,7 @@ class UdopTokenizer(PreTrainedTokenizer): max_length=max_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1385,6 +1401,7 @@ class UdopTokenizer(PreTrainedTokenizer): max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -1407,6 +1424,9 @@ class UdopTokenizer(PreTrainedTokenizer): pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -1430,7 +1450,8 @@ class UdopTokenizer(PreTrainedTokenizer): if needs_to_be_padded: difference = max_length - len(required_input) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -1444,7 +1465,7 @@ class UdopTokenizer(PreTrainedTokenizer): if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -1459,6 +1480,6 @@ class UdopTokenizer(PreTrainedTokenizer): encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/udop/tokenization_udop_fast.py b/src/transformers/models/udop/tokenization_udop_fast.py index 8340c4af4e..8ee0577fa1 100644 --- a/src/transformers/models/udop/tokenization_udop_fast.py +++ b/src/transformers/models/udop/tokenization_udop_fast.py @@ -286,6 +286,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -389,6 +390,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -411,6 +413,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -453,6 +456,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast): stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -501,6 +505,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast): stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -528,6 +533,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -548,6 +554,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, ) if is_pair: @@ -684,6 +691,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[bool] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -712,6 +720,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -794,6 +803,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast): stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -846,6 +856,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast): stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -864,6 +875,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast): max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -886,6 +898,9 @@ class UdopTokenizerFast(PreTrainedTokenizerFast): pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -909,7 +924,8 @@ class UdopTokenizerFast(PreTrainedTokenizerFast): if needs_to_be_padded: difference = max_length - len(required_input) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -923,7 +939,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast): if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -938,7 +954,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast): encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py index 647b18521d..c1a333fe48 100644 --- a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py +++ b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py @@ -781,6 +781,7 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer): padding: Union[bool, str, PaddingStrategy] = False, max_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, verbose: bool = True, **kwargs, @@ -794,6 +795,10 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer): The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float values, a list of numpy array or a list of list of float values. Must be mono channel audio, not stereo, i.e. single float per timestep. + + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. """ is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1 @@ -825,6 +830,7 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer): padding=padding, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=self.return_attention_mask, return_tensors=return_tensors, verbose=verbose, diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index f04eaae452..6a5bff3679 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -749,6 +749,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -806,6 +807,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -833,6 +835,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -891,6 +894,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -913,6 +917,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -942,6 +947,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): max_length=max_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward + padding_side=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -963,6 +969,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 5e9170456a..93dea5ba09 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1427,6 +1427,9 @@ ENCODE_KWARGS_DOCSTRING = r""" If set will pad the sequence to a multiple of the provided value. Requires `padding` to be activated. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors instead of list of python integers. Acceptable values are: @@ -2767,6 +2770,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): truncation: Union[bool, str, TruncationStrategy] = None, max_length: Optional[int] = None, stride: int = 0, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs, ) -> List[int]: @@ -2793,6 +2797,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): truncation=truncation, max_length=max_length, stride=stride, + padding_side=padding_side, return_tensors=return_tensors, **kwargs, ) @@ -2956,6 +2961,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -2997,6 +3003,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): "stride": stride, "is_split_into_words": is_split_into_words, "pad_to_multiple_of": pad_to_multiple_of, + "padding_side": padding_side, "return_tensors": return_tensors, "return_token_type_ids": return_token_type_ids, "return_attention_mask": return_attention_mask, @@ -3041,6 +3048,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -3111,6 +3119,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -3133,6 +3142,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -3157,6 +3167,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -3207,6 +3218,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -3230,6 +3242,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -3261,6 +3274,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -3307,6 +3321,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -3336,6 +3351,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -3361,6 +3377,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): padding: Union[bool, str, PaddingStrategy] = True, max_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, verbose: bool = True, @@ -3409,6 +3426,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask (`bool`, *optional*): Whether to return the attention mask. If left to the default, will return the attention mask according to the specific tokenizer's default, defined by the `return_outputs` attribute. @@ -3491,6 +3511,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): max_length=max_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) return BatchEncoding(encoded_inputs, tensor_type=return_tensors) @@ -3512,6 +3533,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): max_length=max_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -3573,6 +3595,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -3686,6 +3709,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): max_length=max_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -3828,6 +3852,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -3843,13 +3868,16 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): - PaddingStrategy.LONGEST Pad to the longest sequence in the batch - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) - PaddingStrategy.DO_NOT_PAD: Do not pad - The tokenizer padding sides are defined in self.padding_side: + The tokenizer padding sides are defined in `padding_side` argument: - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -3873,8 +3901,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): if needs_to_be_padded: difference = max_length - len(required_input) + padding_side = padding_side if padding_side is not None else self.padding_side - if self.padding_side == "right": + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -3884,7 +3913,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -3895,7 +3924,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError(f"Invalid padding strategy:{self.padding_side}") + raise ValueError(f"Invalid padding strategy:{padding_side}") return encoded_inputs diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index 7d5446d7cb..724484b3b3 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -429,6 +429,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): max_length: int, stride: int, pad_to_multiple_of: Optional[int], + padding_side: Optional[bool], ): """ Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers @@ -450,6 +451,9 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): pad_to_multiple_of (`int`, *optional*): If set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. """ _truncation = self._tokenizer.truncation _padding = self._tokenizer.padding @@ -484,7 +488,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None target = { "length": length, - "direction": self.padding_side, + "direction": padding_side if padding_side is not None else self.padding_side, "pad_id": self.pad_token_id, "pad_token": self.pad_token, "pad_type_id": self.pad_token_type_id, @@ -505,6 +509,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -527,6 +532,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, ) if self._tokenizer.encode_special_tokens != split_special_tokens: @@ -593,6 +599,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[bool] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -614,6 +621,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, diff --git a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py index bb526e140e..19a6aeec46 100644 --- a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py @@ -21,6 +21,8 @@ import tempfile import unittest from typing import List +from parameterized import parameterized + from transformers import ( AddedToken, LayoutLMv2TokenizerFast, @@ -393,7 +395,8 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_split_special_tokens(self): pass - def test_encode_plus_with_padding(self): + @parameterized.expand([(True,), (False,)]) + def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): @@ -444,15 +447,18 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask) # Test right padding - tokenizer.padding_side = "right" + tokenizer_kwargs_right = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } - right_padded_sequence = tokenizer.encode_plus( - words, - boxes=boxes, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "right" + else: + tokenizer_kwargs_right["padding_side"] = "right" + + right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right) right_padded_input_ids = right_padded_sequence["input_ids"] right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] @@ -463,14 +469,18 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask) # Test left padding - tokenizer.padding_side = "left" - left_padded_sequence = tokenizer.encode_plus( - words, - boxes=boxes, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + tokenizer_kwargs_left = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "left" + else: + tokenizer_kwargs_left["padding_side"] = "left" + + left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left) left_padded_input_ids = left_padded_sequence["input_ids"] left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] left_padded_sequence_length = len(left_padded_input_ids) diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py index 5ea384f0b2..007e23430b 100644 --- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py @@ -22,6 +22,8 @@ import tempfile import unittest from typing import List +from parameterized import parameterized + from transformers import ( AddedToken, LayoutLMv3TokenizerFast, @@ -273,7 +275,8 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_split_special_tokens(self): pass - def test_encode_plus_with_padding(self): + @parameterized.expand([(True,), (False,)]) + def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): @@ -324,15 +327,18 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask) # Test right padding - tokenizer.padding_side = "right" + tokenizer_kwargs_right = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } - right_padded_sequence = tokenizer.encode_plus( - words, - boxes=boxes, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "right" + else: + tokenizer_kwargs_right["padding_side"] = "right" + + right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right) right_padded_input_ids = right_padded_sequence["input_ids"] right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] @@ -343,14 +349,18 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask) # Test left padding - tokenizer.padding_side = "left" - left_padded_sequence = tokenizer.encode_plus( - words, - boxes=boxes, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + tokenizer_kwargs_left = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "left" + else: + tokenizer_kwargs_left["padding_side"] = "left" + + left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left) left_padded_input_ids = left_padded_sequence["input_ids"] left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] left_padded_sequence_length = len(left_padded_input_ids) diff --git a/tests/models/layoutxlm/test_tokenization_layoutxlm.py b/tests/models/layoutxlm/test_tokenization_layoutxlm.py index c0e44fcb30..8acd3716cf 100644 --- a/tests/models/layoutxlm/test_tokenization_layoutxlm.py +++ b/tests/models/layoutxlm/test_tokenization_layoutxlm.py @@ -19,6 +19,8 @@ import tempfile import unittest from typing import List +from parameterized import parameterized + from transformers import ( AddedToken, LayoutXLMTokenizerFast, @@ -324,7 +326,8 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens) self.assertIn(decoded, [output, output.lower()]) - def test_encode_plus_with_padding(self): + @parameterized.expand([(True,), (False,)]) + def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): @@ -375,15 +378,18 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask) # Test right padding - tokenizer.padding_side = "right" + tokenizer_kwargs_right = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } - right_padded_sequence = tokenizer.encode_plus( - words, - boxes=boxes, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "right" + else: + tokenizer_kwargs_right["padding_side"] = "right" + + right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right) right_padded_input_ids = right_padded_sequence["input_ids"] right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] @@ -394,14 +400,18 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask) # Test left padding - tokenizer.padding_side = "left" - left_padded_sequence = tokenizer.encode_plus( - words, - boxes=boxes, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + tokenizer_kwargs_left = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "left" + else: + tokenizer_kwargs_left["padding_side"] = "left" + + left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left) left_padded_input_ids = left_padded_sequence["input_ids"] left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] left_padded_sequence_length = len(left_padded_input_ids) diff --git a/tests/models/markuplm/test_tokenization_markuplm.py b/tests/models/markuplm/test_tokenization_markuplm.py index 458df94ec2..fcdde2eb8a 100644 --- a/tests/models/markuplm/test_tokenization_markuplm.py +++ b/tests/models/markuplm/test_tokenization_markuplm.py @@ -22,6 +22,8 @@ import tempfile import unittest from typing import List +from parameterized import parameterized + from transformers import ( AddedToken, MarkupLMTokenizerFast, @@ -211,7 +213,8 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_right_and_left_truncation(self): pass - def test_encode_plus_with_padding(self): + @parameterized.expand([(True,), (False,)]) + def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): @@ -262,15 +265,18 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask) # Test right padding - tokenizer.padding_side = "right" + tokenizer_kwargs_right = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } - right_padded_sequence = tokenizer.encode_plus( - nodes, - xpaths=xpaths, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "right" + else: + tokenizer_kwargs_right["padding_side"] = "right" + + right_padded_sequence = tokenizer.encode_plus(nodes, xpaths=xpaths, **tokenizer_kwargs_right) right_padded_input_ids = right_padded_sequence["input_ids"] right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] @@ -281,14 +287,18 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask) # Test left padding - tokenizer.padding_side = "left" - left_padded_sequence = tokenizer.encode_plus( - nodes, - xpaths=xpaths, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + tokenizer_kwargs_left = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "left" + else: + tokenizer_kwargs_left["padding_side"] = "left" + + left_padded_sequence = tokenizer.encode_plus(nodes, xpaths=xpaths, **tokenizer_kwargs_left) left_padded_input_ids = left_padded_sequence["input_ids"] left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] left_padded_sequence_length = len(left_padded_input_ids) diff --git a/tests/models/tapas/test_tokenization_tapas.py b/tests/models/tapas/test_tokenization_tapas.py index a9b8e9a0c7..49327a39cd 100644 --- a/tests/models/tapas/test_tokenization_tapas.py +++ b/tests/models/tapas/test_tokenization_tapas.py @@ -21,6 +21,7 @@ from typing import List import numpy as np import pandas as pd +from parameterized import parameterized from transformers import AddedToken, is_torch_available from transformers.models.tapas.tokenization_tapas import ( @@ -494,7 +495,8 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens) self.assertIn(decoded, [output, output.lower()]) - def test_encode_plus_with_padding(self): + @parameterized.expand([(True,), (False,)]) + def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): @@ -547,15 +549,18 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): assert special_tokens_mask == not_padded_special_tokens_mask # Test right padding - tokenizer.padding_side = "right" + tokenizer_kwargs_right = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } - right_padded_sequence = tokenizer.encode_plus( - table, - sequence, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "right" + else: + tokenizer_kwargs_right["padding_side"] = "right" + + right_padded_sequence = tokenizer.encode_plus(table, sequence, **tokenizer_kwargs_right) right_padded_input_ids = right_padded_sequence["input_ids"] right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] @@ -566,14 +571,18 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask # Test left padding - tokenizer.padding_side = "left" - left_padded_sequence = tokenizer.encode_plus( - table, - sequence, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + tokenizer_kwargs_left = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "left" + else: + tokenizer_kwargs_left["padding_side"] = "left" + + left_padded_sequence = tokenizer.encode_plus(table, sequence, **tokenizer_kwargs_left) left_padded_input_ids = left_padded_sequence["input_ids"] left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] left_padded_sequence_length = len(left_padded_input_ids) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 64c860e3fc..342254dfbd 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -2225,7 +2225,15 @@ class TokenizerTesterMixin: else: self.assertListEqual(padded_features["attention_mask"], [[1, 1, 1, 1, 1, 0], [0, 0, 0, 1, 1, 0]]) - def test_encode_plus_with_padding(self): + @parameterized.expand([(True,), (False,)]) + def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool): + """ + This test checks that padding works as expected when tokenizing a sequence. + Padding is expected to have no effect when the input is a single sequence and + the padding-strategy is not `max_length`. Otherwise it pads to the specified max-length + using tokenizer classes `padding_side` attribute. Also, we check that passing `padding_side` + as call time kwarg works same way as when one sets `tokenizer.padding_side` attribute. + """ tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): @@ -2244,8 +2252,6 @@ class TokenizerTesterMixin: sequence_length = len(input_ids) # Test 'longest' and 'no_padding' don't do anything - tokenizer.padding_side = "right" - not_padded_sequence = tokenizer.encode_plus( sequence, padding=True, @@ -2275,14 +2281,18 @@ class TokenizerTesterMixin: self.assertEqual(special_tokens_mask, not_padded_special_tokens_mask) # Test right padding - tokenizer.padding_side = "right" + tokenizer_kwargs_right = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } - right_padded_sequence = tokenizer.encode_plus( - sequence, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "right" + else: + tokenizer_kwargs_right["padding_side"] = "right" + + right_padded_sequence = tokenizer.encode_plus(sequence, **tokenizer_kwargs_right) right_padded_input_ids = right_padded_sequence["input_ids"] right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] @@ -2293,13 +2303,18 @@ class TokenizerTesterMixin: self.assertEqual(special_tokens_mask + [1] * padding_size, right_padded_special_tokens_mask) # Test left padding - tokenizer.padding_side = "left" - left_padded_sequence = tokenizer.encode_plus( - sequence, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + tokenizer_kwargs_left = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "left" + else: + tokenizer_kwargs_left["padding_side"] = "left" + + left_padded_sequence = tokenizer.encode_plus(sequence, **tokenizer_kwargs_left) left_padded_input_ids = left_padded_sequence["input_ids"] left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] left_padded_sequence_length = len(left_padded_input_ids)