Enable padding_side as call time kwargs (#33385)

* fix

* add padding-side kwarg

* add padding side in all models & fix tests

* fix copies

* fix tests
This commit is contained in:
Raushan Turganbay
2024-09-13 12:58:38 +02:00
committed by GitHub
parent 1027a532c5
commit 4b0418df11
26 changed files with 528 additions and 149 deletions

View File

@@ -414,6 +414,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -517,6 +518,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -539,6 +541,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -567,6 +570,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -598,6 +602,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -625,6 +630,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -653,6 +659,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -677,6 +684,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -708,6 +716,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=None, # we pad in batch afterward
padding_side=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -728,6 +737,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -748,6 +758,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -769,6 +780,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -795,6 +807,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -838,6 +851,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -861,6 +875,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -891,6 +906,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -914,6 +930,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1100,6 +1117,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
max_length=max_length,
padding=padding_strategy.value,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1243,6 +1261,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -1265,6 +1284,9 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
padding_side:
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -1288,7 +1310,8 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
if needs_to_be_padded:
difference = max_length - len(required_input)
if self.padding_side == "right":
padding_side = padding_side if padding_side is not None else self.padding_side
if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -1302,7 +1325,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
elif self.padding_side == "left":
elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -1317,7 +1340,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs

View File

@@ -165,6 +165,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -268,6 +269,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -290,6 +292,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -318,6 +321,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -349,6 +353,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -381,6 +386,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -424,6 +430,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -451,6 +458,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -470,6 +478,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
)
if is_pair:
@@ -603,6 +612,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[bool] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -631,6 +641,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -663,6 +674,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -685,6 +697,9 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
padding_side:
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -708,7 +723,8 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
if needs_to_be_padded:
difference = max_length - len(required_input)
if self.padding_side == "right":
padding_side = padding_side if padding_side is not None else self.padding_side
if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -722,7 +738,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
elif self.padding_side == "left":
elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -737,7 +753,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs

View File

@@ -543,6 +543,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -646,6 +647,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -668,6 +670,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -697,6 +700,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -728,6 +732,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -756,6 +761,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -784,6 +790,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -809,6 +816,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -840,6 +848,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=None, # we pad in batch afterward
padding_side=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -860,6 +869,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -881,6 +891,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -902,6 +913,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -929,6 +941,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -972,6 +985,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -996,6 +1010,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1026,6 +1041,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -1049,6 +1065,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1237,6 +1254,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
max_length=max_length,
padding=padding_strategy.value,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1382,6 +1400,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -1404,6 +1423,9 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
padding_side:
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -1427,7 +1449,8 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
if needs_to_be_padded:
difference = max_length - len(required_input)
if self.padding_side == "right":
padding_side = padding_side if padding_side is not None else self.padding_side
if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -1441,7 +1464,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
elif self.padding_side == "left":
elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -1456,6 +1479,6 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs

View File

@@ -217,6 +217,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -320,6 +321,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -342,6 +344,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -371,6 +374,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -402,6 +406,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -436,6 +441,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -479,6 +485,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -506,6 +513,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -525,6 +533,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
)
if is_pair:
@@ -664,6 +673,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[bool] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -692,6 +702,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -725,6 +736,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -747,6 +759,9 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
padding_side:
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -770,7 +785,8 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
if needs_to_be_padded:
difference = max_length - len(required_input)
if self.padding_side == "right":
padding_side = padding_side if padding_side is not None else self.padding_side
if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -784,7 +800,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
elif self.padding_side == "left":
elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -799,7 +815,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs

View File

@@ -447,6 +447,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -550,6 +551,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -572,6 +574,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -599,6 +602,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -627,6 +631,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -651,6 +656,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -682,6 +688,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=None, # we pad in batch afterward
padding_side=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -702,6 +709,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -721,6 +729,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -751,6 +760,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -774,6 +784,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -947,6 +958,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
max_length=max_length,
padding=padding_strategy.value,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1090,6 +1102,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -1112,6 +1125,9 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
padding_side (`str`, *optional*):
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -1135,7 +1151,8 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
if needs_to_be_padded:
difference = max_length - len(required_input)
if self.padding_side == "right":
padding_side = padding_side if padding_side is not None else self.padding_side
if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -1149,7 +1166,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
elif self.padding_side == "left":
elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -1164,6 +1181,6 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs

View File

@@ -277,6 +277,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -380,6 +381,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -402,6 +404,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -442,6 +445,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -462,6 +466,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
)
if is_pair:
@@ -595,6 +600,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[bool] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -623,6 +629,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -655,6 +662,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -677,6 +685,9 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
padding_side (`str`, *optional*):
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -700,7 +711,8 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
if needs_to_be_padded:
difference = max_length - len(required_input)
if self.padding_side == "right":
padding_side = padding_side if padding_side is not None else self.padding_side
if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -714,7 +726,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
elif self.padding_side == "left":
elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -729,7 +741,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs

View File

@@ -412,6 +412,7 @@ class LEDTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
encoded_inputs = super()._pad(
@@ -419,6 +420,7 @@ class LEDTokenizer(PreTrainedTokenizer):
max_length=max_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)

View File

@@ -288,6 +288,7 @@ class LEDTokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
encoded_inputs = super()._pad(
@@ -295,6 +296,7 @@ class LEDTokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)

View File

@@ -570,6 +570,7 @@ class LukeTokenizer(PreTrainedTokenizer):
stride: int = 0,
is_split_into_words: Optional[bool] = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -662,6 +663,7 @@ class LukeTokenizer(PreTrainedTokenizer):
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -688,6 +690,7 @@ class LukeTokenizer(PreTrainedTokenizer):
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -715,6 +718,7 @@ class LukeTokenizer(PreTrainedTokenizer):
stride: int = 0,
is_split_into_words: Optional[bool] = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -769,6 +773,7 @@ class LukeTokenizer(PreTrainedTokenizer):
max_entity_length=max_entity_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -796,6 +801,7 @@ class LukeTokenizer(PreTrainedTokenizer):
stride: int = 0,
is_split_into_words: Optional[bool] = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -876,6 +882,7 @@ class LukeTokenizer(PreTrainedTokenizer):
max_entity_length=max_entity_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -1070,6 +1077,7 @@ class LukeTokenizer(PreTrainedTokenizer):
max_entity_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1112,6 +1120,7 @@ class LukeTokenizer(PreTrainedTokenizer):
max_entity_length=max_entity_length,
stride=stride,
pad_to_multiple_of=None, # we pad in batch afterward
padding_side=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -1132,6 +1141,7 @@ class LukeTokenizer(PreTrainedTokenizer):
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1155,6 +1165,7 @@ class LukeTokenizer(PreTrainedTokenizer):
max_entity_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1357,6 +1368,7 @@ class LukeTokenizer(PreTrainedTokenizer):
max_entity_length=max_entity_length,
padding=padding_strategy.value,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1382,6 +1394,7 @@ class LukeTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
max_entity_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
verbose: bool = True,
@@ -1418,6 +1431,9 @@ class LukeTokenizer(PreTrainedTokenizer):
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
padding_side:
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_attention_mask (`bool`, *optional*):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are attention
@@ -1495,6 +1511,7 @@ class LukeTokenizer(PreTrainedTokenizer):
max_entity_length=max_entity_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
@@ -1519,6 +1536,7 @@ class LukeTokenizer(PreTrainedTokenizer):
max_entity_length=max_entity_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1536,6 +1554,7 @@ class LukeTokenizer(PreTrainedTokenizer):
max_entity_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -1562,6 +1581,9 @@ class LukeTokenizer(PreTrainedTokenizer):
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
padding_side:
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -1600,9 +1622,10 @@ class LukeTokenizer(PreTrainedTokenizer):
if needs_to_be_padded:
difference = max_length - len(encoded_inputs["input_ids"])
padding_side = padding_side if padding_side is not None else self.padding_side
if entities_provided:
entity_difference = max_entity_length - len(encoded_inputs["entity_ids"])
if self.padding_side == "right":
if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if entities_provided:
@@ -1633,7 +1656,7 @@ class LukeTokenizer(PreTrainedTokenizer):
encoded_inputs["entity_end_positions"] + [0] * entity_difference
)
elif self.padding_side == "left":
elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if entities_provided:
@@ -1664,7 +1687,7 @@ class LukeTokenizer(PreTrainedTokenizer):
"entity_end_positions"
]
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs

View File

@@ -503,6 +503,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -602,6 +603,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -624,6 +626,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -652,6 +655,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -683,6 +687,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -710,6 +715,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -738,6 +744,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -762,6 +769,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -793,6 +801,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=None, # we pad in batch afterward
padding_side=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -813,6 +822,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -833,6 +843,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -854,6 +865,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -880,6 +892,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -923,6 +936,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -946,6 +960,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -976,6 +991,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -999,6 +1015,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1203,6 +1220,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
max_length=max_length,
padding=padding_strategy.value,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1357,6 +1375,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -1376,6 +1395,9 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
padding_side:
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -1399,7 +1421,8 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
if needs_to_be_padded:
difference = max_length - len(required_input)
if self.padding_side == "right":
padding_side = padding_side if padding_side is not None else self.padding_side
if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -1419,7 +1442,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
elif self.padding_side == "left":
elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -1440,6 +1463,6 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs

View File

@@ -286,6 +286,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -385,6 +386,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -407,6 +409,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -435,6 +438,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -466,6 +470,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -498,6 +503,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -541,6 +547,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -568,6 +575,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -587,6 +595,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
)
if is_pair:
@@ -721,6 +730,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[bool] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -749,6 +759,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -781,6 +792,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -800,6 +812,9 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
padding_side:
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -823,7 +838,8 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
if needs_to_be_padded:
difference = max_length - len(required_input)
if self.padding_side == "right":
padding_side = padding_side if padding_side is not None else self.padding_side
if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -843,7 +859,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
elif self.padding_side == "left":
elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -864,7 +880,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs

View File

@@ -399,6 +399,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
stride: int = 0,
is_split_into_words: Optional[bool] = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -491,6 +492,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -517,6 +519,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -545,6 +548,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
stride: int = 0,
is_split_into_words: Optional[bool] = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -599,6 +603,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
max_entity_length=max_entity_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -627,6 +632,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
stride: int = 0,
is_split_into_words: Optional[bool] = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -707,6 +713,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
max_entity_length=max_entity_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -904,6 +911,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
max_entity_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -946,6 +954,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
max_entity_length=max_entity_length,
stride=stride,
pad_to_multiple_of=None, # we pad in batch afterward
padding_side=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -966,6 +975,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -990,6 +1000,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
max_entity_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1192,6 +1203,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
max_entity_length=max_entity_length,
padding=padding_strategy.value,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1218,6 +1230,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
max_entity_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
verbose: bool = True,
@@ -1254,6 +1267,9 @@ class MLukeTokenizer(PreTrainedTokenizer):
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
padding_side:
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_attention_mask (`bool`, *optional*):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are attention
@@ -1331,6 +1347,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
max_entity_length=max_entity_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
@@ -1355,6 +1372,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
max_entity_length=max_entity_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1373,6 +1391,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
max_entity_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -1399,6 +1418,9 @@ class MLukeTokenizer(PreTrainedTokenizer):
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
padding_side:
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -1437,9 +1459,10 @@ class MLukeTokenizer(PreTrainedTokenizer):
if needs_to_be_padded:
difference = max_length - len(encoded_inputs["input_ids"])
padding_side = padding_side if padding_side is not None else self.padding_side
if entities_provided:
entity_difference = max_entity_length - len(encoded_inputs["entity_ids"])
if self.padding_side == "right":
if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if entities_provided:
@@ -1470,7 +1493,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
encoded_inputs["entity_end_positions"] + [0] * entity_difference
)
elif self.padding_side == "left":
elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if entities_provided:
@@ -1501,7 +1524,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
"entity_end_positions"
]
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs

View File

@@ -210,6 +210,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -283,6 +284,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -308,6 +310,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -462,6 +465,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
max_length=max_length,
padding=padding_strategy.value,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -480,6 +484,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
# Load from model defaults
@@ -502,8 +507,9 @@ class RoCBertTokenizer(PreTrainedTokenizer):
if needs_to_be_padded:
difference = max_length - len(required_input)
padding_side = padding_side if padding_side is not None else self.padding_side
if self.padding_side == "right":
if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -516,7 +522,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
if key in encoded_inputs:
encoded_inputs[key] = encoded_inputs[key] + [self.pad_token_id] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
elif self.padding_side == "left":
elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -530,7 +536,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
encoded_inputs[key] = [self.pad_token_id] * difference + encoded_inputs[key]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs
@@ -551,6 +557,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -627,6 +634,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -650,6 +658,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -686,6 +695,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=None, # we pad in batch afterward
padding_side=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -706,6 +716,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)

View File

@@ -517,6 +517,7 @@ class TapasTokenizer(PreTrainedTokenizer):
truncation: Union[bool, str, TapasTruncationStrategy] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -581,6 +582,7 @@ class TapasTokenizer(PreTrainedTokenizer):
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -602,6 +604,7 @@ class TapasTokenizer(PreTrainedTokenizer):
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -631,6 +634,7 @@ class TapasTokenizer(PreTrainedTokenizer):
truncation: Union[bool, str, TapasTruncationStrategy] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -699,6 +703,7 @@ class TapasTokenizer(PreTrainedTokenizer):
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -738,6 +743,7 @@ class TapasTokenizer(PreTrainedTokenizer):
truncation: Union[bool, str, TapasTruncationStrategy] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = True,
return_attention_mask: Optional[bool] = None,
@@ -768,6 +774,7 @@ class TapasTokenizer(PreTrainedTokenizer):
add_special_tokens=add_special_tokens,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -797,6 +804,7 @@ class TapasTokenizer(PreTrainedTokenizer):
truncation: Union[bool, str, TapasTruncationStrategy] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = True,
return_attention_mask: Optional[bool] = True,
@@ -823,6 +831,7 @@ class TapasTokenizer(PreTrainedTokenizer):
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=None, # we pad in batch afterwards
padding_side=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterwards
return_token_type_ids=return_token_type_ids,
return_special_tokens_mask=return_special_tokens_mask,
@@ -844,6 +853,7 @@ class TapasTokenizer(PreTrainedTokenizer):
padding=padding,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -912,6 +922,7 @@ class TapasTokenizer(PreTrainedTokenizer):
truncation: Union[bool, str, TapasTruncationStrategy] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -968,6 +979,7 @@ class TapasTokenizer(PreTrainedTokenizer):
padding=padding,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -993,6 +1005,7 @@ class TapasTokenizer(PreTrainedTokenizer):
truncation: Union[bool, str, TapasTruncationStrategy] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = True,
return_attention_mask: Optional[bool] = True,
@@ -1024,6 +1037,7 @@ class TapasTokenizer(PreTrainedTokenizer):
padding=padding,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -1051,6 +1065,7 @@ class TapasTokenizer(PreTrainedTokenizer):
truncation: Union[bool, str, TapasTruncationStrategy] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = True,
return_attention_mask: Optional[bool] = True,
@@ -1214,6 +1229,7 @@ class TapasTokenizer(PreTrainedTokenizer):
max_length=max_length,
padding=padding.value,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1754,6 +1770,7 @@ class TapasTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -1776,6 +1793,9 @@ class TapasTokenizer(PreTrainedTokenizer):
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
padding_side:
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -1799,7 +1819,8 @@ class TapasTokenizer(PreTrainedTokenizer):
if needs_to_be_padded:
difference = max_length - len(encoded_inputs["input_ids"])
if self.padding_side == "right":
padding_side = padding_side if padding_side is not None else self.padding_side
if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -1817,7 +1838,7 @@ class TapasTokenizer(PreTrainedTokenizer):
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
elif self.padding_side == "left":
elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -1836,7 +1857,7 @@ class TapasTokenizer(PreTrainedTokenizer):
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"]
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs

View File

@@ -551,6 +551,7 @@ class UdopTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -654,6 +655,7 @@ class UdopTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -676,6 +678,7 @@ class UdopTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -704,6 +707,7 @@ class UdopTokenizer(PreTrainedTokenizer):
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -746,6 +750,7 @@ class UdopTokenizer(PreTrainedTokenizer):
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -813,6 +818,7 @@ class UdopTokenizer(PreTrainedTokenizer):
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -865,6 +871,7 @@ class UdopTokenizer(PreTrainedTokenizer):
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -892,6 +899,7 @@ class UdopTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -920,6 +928,7 @@ class UdopTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -944,6 +953,7 @@ class UdopTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -975,6 +985,7 @@ class UdopTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=None, # we pad in batch afterward
padding_side=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -995,6 +1006,7 @@ class UdopTokenizer(PreTrainedTokenizer):
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1014,6 +1026,7 @@ class UdopTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1044,6 +1057,7 @@ class UdopTokenizer(PreTrainedTokenizer):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -1067,6 +1081,7 @@ class UdopTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1240,6 +1255,7 @@ class UdopTokenizer(PreTrainedTokenizer):
max_length=max_length,
padding=padding_strategy.value,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1385,6 +1401,7 @@ class UdopTokenizer(PreTrainedTokenizer):
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -1407,6 +1424,9 @@ class UdopTokenizer(PreTrainedTokenizer):
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
padding_side (`str`, *optional*):
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -1430,7 +1450,8 @@ class UdopTokenizer(PreTrainedTokenizer):
if needs_to_be_padded:
difference = max_length - len(required_input)
if self.padding_side == "right":
padding_side = padding_side if padding_side is not None else self.padding_side
if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -1444,7 +1465,7 @@ class UdopTokenizer(PreTrainedTokenizer):
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
elif self.padding_side == "left":
elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -1459,6 +1480,6 @@ class UdopTokenizer(PreTrainedTokenizer):
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs

View File

@@ -286,6 +286,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -389,6 +390,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -411,6 +413,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -453,6 +456,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -501,6 +505,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -528,6 +533,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -548,6 +554,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
)
if is_pair:
@@ -684,6 +691,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[bool] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -712,6 +720,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -794,6 +803,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -846,6 +856,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -864,6 +875,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -886,6 +898,9 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
padding_side (`str`, *optional*):
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -909,7 +924,8 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
if needs_to_be_padded:
difference = max_length - len(required_input)
if self.padding_side == "right":
padding_side = padding_side if padding_side is not None else self.padding_side
if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -923,7 +939,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
elif self.padding_side == "left":
elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -938,7 +954,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs

View File

@@ -781,6 +781,7 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
padding: Union[bool, str, PaddingStrategy] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
verbose: bool = True,
**kwargs,
@@ -794,6 +795,10 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
values, a list of numpy array or a list of list of float values. Must be mono channel audio, not
stereo, i.e. single float per timestep.
padding_side (`str`, *optional*):
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
"""
is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
@@ -825,6 +830,7 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
padding=padding,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=self.return_attention_mask,
return_tensors=return_tensors,
verbose=verbose,

View File

@@ -749,6 +749,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -806,6 +807,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -833,6 +835,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -891,6 +894,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -913,6 +917,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -942,6 +947,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
max_length=max_length,
stride=stride,
pad_to_multiple_of=None, # we pad in batch afterward
padding_side=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -963,6 +969,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)

View File

@@ -1427,6 +1427,9 @@ ENCODE_KWARGS_DOCSTRING = r"""
If set will pad the sequence to a multiple of the provided value. Requires `padding` to be activated.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
padding_side (`str`, *optional*):
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
@@ -2767,6 +2770,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> List[int]:
@@ -2793,6 +2797,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
truncation=truncation,
max_length=max_length,
stride=stride,
padding_side=padding_side,
return_tensors=return_tensors,
**kwargs,
)
@@ -2956,6 +2961,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -2997,6 +3003,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
"stride": stride,
"is_split_into_words": is_split_into_words,
"pad_to_multiple_of": pad_to_multiple_of,
"padding_side": padding_side,
"return_tensors": return_tensors,
"return_token_type_ids": return_token_type_ids,
"return_attention_mask": return_attention_mask,
@@ -3041,6 +3048,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -3111,6 +3119,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -3133,6 +3142,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -3157,6 +3167,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -3207,6 +3218,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -3230,6 +3242,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -3261,6 +3274,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -3307,6 +3321,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -3336,6 +3351,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -3361,6 +3377,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
padding: Union[bool, str, PaddingStrategy] = True,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
verbose: bool = True,
@@ -3409,6 +3426,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
padding_side (`str`, *optional*):
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_attention_mask (`bool`, *optional*):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific tokenizer's default, defined by the `return_outputs` attribute.
@@ -3491,6 +3511,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
max_length=max_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
@@ -3512,6 +3533,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
max_length=max_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -3573,6 +3595,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -3686,6 +3709,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
max_length=max_length,
padding=padding_strategy.value,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -3828,6 +3852,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -3843,13 +3868,16 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
- PaddingStrategy.LONGEST Pad to the longest sequence in the batch
- PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
- PaddingStrategy.DO_NOT_PAD: Do not pad
The tokenizer padding sides are defined in self.padding_side:
The tokenizer padding sides are defined in `padding_side` argument:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
padding_side:
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -3873,8 +3901,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
if needs_to_be_padded:
difference = max_length - len(required_input)
padding_side = padding_side if padding_side is not None else self.padding_side
if self.padding_side == "right":
if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -3884,7 +3913,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
elif self.padding_side == "left":
elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -3895,7 +3924,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
raise ValueError(f"Invalid padding strategy:{self.padding_side}")
raise ValueError(f"Invalid padding strategy:{padding_side}")
return encoded_inputs

View File

@@ -429,6 +429,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
max_length: int,
stride: int,
pad_to_multiple_of: Optional[int],
padding_side: Optional[bool],
):
"""
Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
@@ -450,6 +451,9 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
padding_side (`str`, *optional*):
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
"""
_truncation = self._tokenizer.truncation
_padding = self._tokenizer.padding
@@ -484,7 +488,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None
target = {
"length": length,
"direction": self.padding_side,
"direction": padding_side if padding_side is not None else self.padding_side,
"pad_id": self.pad_token_id,
"pad_token": self.pad_token,
"pad_type_id": self.pad_token_type_id,
@@ -505,6 +509,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -527,6 +532,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
)
if self._tokenizer.encode_special_tokens != split_special_tokens:
@@ -593,6 +599,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[bool] = None,
return_tensors: Optional[bool] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -614,6 +621,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,

View File

@@ -21,6 +21,8 @@ import tempfile
import unittest
from typing import List
from parameterized import parameterized
from transformers import (
AddedToken,
LayoutLMv2TokenizerFast,
@@ -393,7 +395,8 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_split_special_tokens(self):
pass
def test_encode_plus_with_padding(self):
@parameterized.expand([(True,), (False,)])
def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -444,15 +447,18 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
# Test right padding
tokenizer.padding_side = "right"
tokenizer_kwargs_right = {
"max_length": sequence_length + padding_size,
"padding": "max_length",
"return_special_tokens_mask": True,
}
right_padded_sequence = tokenizer.encode_plus(
words,
boxes=boxes,
max_length=sequence_length + padding_size,
padding="max_length",
return_special_tokens_mask=True,
)
if not use_padding_as_call_kwarg:
tokenizer.padding_side = "right"
else:
tokenizer_kwargs_right["padding_side"] = "right"
right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right)
right_padded_input_ids = right_padded_sequence["input_ids"]
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
@@ -463,14 +469,18 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)
# Test left padding
tokenizer.padding_side = "left"
left_padded_sequence = tokenizer.encode_plus(
words,
boxes=boxes,
max_length=sequence_length + padding_size,
padding="max_length",
return_special_tokens_mask=True,
)
tokenizer_kwargs_left = {
"max_length": sequence_length + padding_size,
"padding": "max_length",
"return_special_tokens_mask": True,
}
if not use_padding_as_call_kwarg:
tokenizer.padding_side = "left"
else:
tokenizer_kwargs_left["padding_side"] = "left"
left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left)
left_padded_input_ids = left_padded_sequence["input_ids"]
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
left_padded_sequence_length = len(left_padded_input_ids)

View File

@@ -22,6 +22,8 @@ import tempfile
import unittest
from typing import List
from parameterized import parameterized
from transformers import (
AddedToken,
LayoutLMv3TokenizerFast,
@@ -273,7 +275,8 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_split_special_tokens(self):
pass
def test_encode_plus_with_padding(self):
@parameterized.expand([(True,), (False,)])
def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -324,15 +327,18 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
# Test right padding
tokenizer.padding_side = "right"
tokenizer_kwargs_right = {
"max_length": sequence_length + padding_size,
"padding": "max_length",
"return_special_tokens_mask": True,
}
right_padded_sequence = tokenizer.encode_plus(
words,
boxes=boxes,
max_length=sequence_length + padding_size,
padding="max_length",
return_special_tokens_mask=True,
)
if not use_padding_as_call_kwarg:
tokenizer.padding_side = "right"
else:
tokenizer_kwargs_right["padding_side"] = "right"
right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right)
right_padded_input_ids = right_padded_sequence["input_ids"]
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
@@ -343,14 +349,18 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)
# Test left padding
tokenizer.padding_side = "left"
left_padded_sequence = tokenizer.encode_plus(
words,
boxes=boxes,
max_length=sequence_length + padding_size,
padding="max_length",
return_special_tokens_mask=True,
)
tokenizer_kwargs_left = {
"max_length": sequence_length + padding_size,
"padding": "max_length",
"return_special_tokens_mask": True,
}
if not use_padding_as_call_kwarg:
tokenizer.padding_side = "left"
else:
tokenizer_kwargs_left["padding_side"] = "left"
left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left)
left_padded_input_ids = left_padded_sequence["input_ids"]
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
left_padded_sequence_length = len(left_padded_input_ids)

View File

@@ -19,6 +19,8 @@ import tempfile
import unittest
from typing import List
from parameterized import parameterized
from transformers import (
AddedToken,
LayoutXLMTokenizerFast,
@@ -324,7 +326,8 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
self.assertIn(decoded, [output, output.lower()])
def test_encode_plus_with_padding(self):
@parameterized.expand([(True,), (False,)])
def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -375,15 +378,18 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
# Test right padding
tokenizer.padding_side = "right"
tokenizer_kwargs_right = {
"max_length": sequence_length + padding_size,
"padding": "max_length",
"return_special_tokens_mask": True,
}
right_padded_sequence = tokenizer.encode_plus(
words,
boxes=boxes,
max_length=sequence_length + padding_size,
padding="max_length",
return_special_tokens_mask=True,
)
if not use_padding_as_call_kwarg:
tokenizer.padding_side = "right"
else:
tokenizer_kwargs_right["padding_side"] = "right"
right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right)
right_padded_input_ids = right_padded_sequence["input_ids"]
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
@@ -394,14 +400,18 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)
# Test left padding
tokenizer.padding_side = "left"
left_padded_sequence = tokenizer.encode_plus(
words,
boxes=boxes,
max_length=sequence_length + padding_size,
padding="max_length",
return_special_tokens_mask=True,
)
tokenizer_kwargs_left = {
"max_length": sequence_length + padding_size,
"padding": "max_length",
"return_special_tokens_mask": True,
}
if not use_padding_as_call_kwarg:
tokenizer.padding_side = "left"
else:
tokenizer_kwargs_left["padding_side"] = "left"
left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left)
left_padded_input_ids = left_padded_sequence["input_ids"]
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
left_padded_sequence_length = len(left_padded_input_ids)

View File

@@ -22,6 +22,8 @@ import tempfile
import unittest
from typing import List
from parameterized import parameterized
from transformers import (
AddedToken,
MarkupLMTokenizerFast,
@@ -211,7 +213,8 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_right_and_left_truncation(self):
pass
def test_encode_plus_with_padding(self):
@parameterized.expand([(True,), (False,)])
def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -262,15 +265,18 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
# Test right padding
tokenizer.padding_side = "right"
tokenizer_kwargs_right = {
"max_length": sequence_length + padding_size,
"padding": "max_length",
"return_special_tokens_mask": True,
}
right_padded_sequence = tokenizer.encode_plus(
nodes,
xpaths=xpaths,
max_length=sequence_length + padding_size,
padding="max_length",
return_special_tokens_mask=True,
)
if not use_padding_as_call_kwarg:
tokenizer.padding_side = "right"
else:
tokenizer_kwargs_right["padding_side"] = "right"
right_padded_sequence = tokenizer.encode_plus(nodes, xpaths=xpaths, **tokenizer_kwargs_right)
right_padded_input_ids = right_padded_sequence["input_ids"]
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
@@ -281,14 +287,18 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)
# Test left padding
tokenizer.padding_side = "left"
left_padded_sequence = tokenizer.encode_plus(
nodes,
xpaths=xpaths,
max_length=sequence_length + padding_size,
padding="max_length",
return_special_tokens_mask=True,
)
tokenizer_kwargs_left = {
"max_length": sequence_length + padding_size,
"padding": "max_length",
"return_special_tokens_mask": True,
}
if not use_padding_as_call_kwarg:
tokenizer.padding_side = "left"
else:
tokenizer_kwargs_left["padding_side"] = "left"
left_padded_sequence = tokenizer.encode_plus(nodes, xpaths=xpaths, **tokenizer_kwargs_left)
left_padded_input_ids = left_padded_sequence["input_ids"]
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
left_padded_sequence_length = len(left_padded_input_ids)

View File

@@ -21,6 +21,7 @@ from typing import List
import numpy as np
import pandas as pd
from parameterized import parameterized
from transformers import AddedToken, is_torch_available
from transformers.models.tapas.tokenization_tapas import (
@@ -494,7 +495,8 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
self.assertIn(decoded, [output, output.lower()])
def test_encode_plus_with_padding(self):
@parameterized.expand([(True,), (False,)])
def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -547,15 +549,18 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
assert special_tokens_mask == not_padded_special_tokens_mask
# Test right padding
tokenizer.padding_side = "right"
tokenizer_kwargs_right = {
"max_length": sequence_length + padding_size,
"padding": "max_length",
"return_special_tokens_mask": True,
}
right_padded_sequence = tokenizer.encode_plus(
table,
sequence,
max_length=sequence_length + padding_size,
padding="max_length",
return_special_tokens_mask=True,
)
if not use_padding_as_call_kwarg:
tokenizer.padding_side = "right"
else:
tokenizer_kwargs_right["padding_side"] = "right"
right_padded_sequence = tokenizer.encode_plus(table, sequence, **tokenizer_kwargs_right)
right_padded_input_ids = right_padded_sequence["input_ids"]
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
@@ -566,14 +571,18 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask
# Test left padding
tokenizer.padding_side = "left"
left_padded_sequence = tokenizer.encode_plus(
table,
sequence,
max_length=sequence_length + padding_size,
padding="max_length",
return_special_tokens_mask=True,
)
tokenizer_kwargs_left = {
"max_length": sequence_length + padding_size,
"padding": "max_length",
"return_special_tokens_mask": True,
}
if not use_padding_as_call_kwarg:
tokenizer.padding_side = "left"
else:
tokenizer_kwargs_left["padding_side"] = "left"
left_padded_sequence = tokenizer.encode_plus(table, sequence, **tokenizer_kwargs_left)
left_padded_input_ids = left_padded_sequence["input_ids"]
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
left_padded_sequence_length = len(left_padded_input_ids)

View File

@@ -2225,7 +2225,15 @@ class TokenizerTesterMixin:
else:
self.assertListEqual(padded_features["attention_mask"], [[1, 1, 1, 1, 1, 0], [0, 0, 0, 1, 1, 0]])
def test_encode_plus_with_padding(self):
@parameterized.expand([(True,), (False,)])
def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
"""
This test checks that padding works as expected when tokenizing a sequence.
Padding is expected to have no effect when the input is a single sequence and
the padding-strategy is not `max_length`. Otherwise it pads to the specified max-length
using tokenizer classes `padding_side` attribute. Also, we check that passing `padding_side`
as call time kwarg works same way as when one sets `tokenizer.padding_side` attribute.
"""
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -2244,8 +2252,6 @@ class TokenizerTesterMixin:
sequence_length = len(input_ids)
# Test 'longest' and 'no_padding' don't do anything
tokenizer.padding_side = "right"
not_padded_sequence = tokenizer.encode_plus(
sequence,
padding=True,
@@ -2275,14 +2281,18 @@ class TokenizerTesterMixin:
self.assertEqual(special_tokens_mask, not_padded_special_tokens_mask)
# Test right padding
tokenizer.padding_side = "right"
tokenizer_kwargs_right = {
"max_length": sequence_length + padding_size,
"padding": "max_length",
"return_special_tokens_mask": True,
}
right_padded_sequence = tokenizer.encode_plus(
sequence,
max_length=sequence_length + padding_size,
padding="max_length",
return_special_tokens_mask=True,
)
if not use_padding_as_call_kwarg:
tokenizer.padding_side = "right"
else:
tokenizer_kwargs_right["padding_side"] = "right"
right_padded_sequence = tokenizer.encode_plus(sequence, **tokenizer_kwargs_right)
right_padded_input_ids = right_padded_sequence["input_ids"]
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
@@ -2293,13 +2303,18 @@ class TokenizerTesterMixin:
self.assertEqual(special_tokens_mask + [1] * padding_size, right_padded_special_tokens_mask)
# Test left padding
tokenizer.padding_side = "left"
left_padded_sequence = tokenizer.encode_plus(
sequence,
max_length=sequence_length + padding_size,
padding="max_length",
return_special_tokens_mask=True,
)
tokenizer_kwargs_left = {
"max_length": sequence_length + padding_size,
"padding": "max_length",
"return_special_tokens_mask": True,
}
if not use_padding_as_call_kwarg:
tokenizer.padding_side = "left"
else:
tokenizer_kwargs_left["padding_side"] = "left"
left_padded_sequence = tokenizer.encode_plus(sequence, **tokenizer_kwargs_left)
left_padded_input_ids = left_padded_sequence["input_ids"]
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
left_padded_sequence_length = len(left_padded_input_ids)