Enable padding_side as call time kwargs (#33385)
* fix
* add padding-side kwarg
* add padding side in all models & fix tests
* fix copies
* fix tests
This commit is contained in:
committed by
GitHub
parent
1027a532c5
commit
4b0418df11
@@ -414,6 +414,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -517,6 +518,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -539,6 +541,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -567,6 +570,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -598,6 +602,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -625,6 +630,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -653,6 +659,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_overflowing_tokens=return_overflowing_tokens,
|
||||
@@ -677,6 +684,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[str] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -708,6 +716,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=None, # we pad in batch afterward
|
||||
padding_side=None, # we pad in batch afterward
|
||||
return_attention_mask=False, # we pad in batch afterward
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_overflowing_tokens=return_overflowing_tokens,
|
||||
@@ -728,6 +737,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
padding=padding_strategy.value,
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -748,6 +758,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -769,6 +780,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -795,6 +807,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -838,6 +851,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -861,6 +875,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -891,6 +906,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
prepend_batch_axis=True,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -914,6 +930,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -1100,6 +1117,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
padding=padding_strategy.value,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -1243,6 +1261,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
) -> dict:
|
||||
"""
|
||||
@@ -1265,6 +1284,9 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
|
||||
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
|
||||
`>= 7.5` (Volta).
|
||||
padding_side:
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
return_attention_mask:
|
||||
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
|
||||
"""
|
||||
@@ -1288,7 +1310,8 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
|
||||
if needs_to_be_padded:
|
||||
difference = max_length - len(required_input)
|
||||
if self.padding_side == "right":
|
||||
padding_side = padding_side if padding_side is not None else self.padding_side
|
||||
if padding_side == "right":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -1302,7 +1325,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
if "special_tokens_mask" in encoded_inputs:
|
||||
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
|
||||
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
|
||||
elif self.padding_side == "left":
|
||||
elif padding_side == "left":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -1317,7 +1340,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
|
||||
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
|
||||
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
|
||||
else:
|
||||
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
|
||||
raise ValueError("Invalid padding strategy:" + str(padding_side))
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
|
||||
@@ -165,6 +165,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -268,6 +269,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -290,6 +292,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -318,6 +321,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -349,6 +353,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -381,6 +386,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -424,6 +430,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -451,6 +458,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[str] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -470,6 +478,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
)
|
||||
|
||||
if is_pair:
|
||||
@@ -603,6 +612,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[bool] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -631,6 +641,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -663,6 +674,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
) -> dict:
|
||||
"""
|
||||
@@ -685,6 +697,9 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
|
||||
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
|
||||
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
|
||||
`>= 7.5` (Volta).
|
||||
padding_side:
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
return_attention_mask:
|
||||
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
|
||||
"""
|
||||
@@ -708,7 +723,8 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
|
||||
|
||||
if needs_to_be_padded:
|
||||
difference = max_length - len(required_input)
|
||||
if self.padding_side == "right":
|
||||
padding_side = padding_side if padding_side is not None else self.padding_side
|
||||
if padding_side == "right":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -722,7 +738,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
|
||||
if "special_tokens_mask" in encoded_inputs:
|
||||
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
|
||||
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
|
||||
elif self.padding_side == "left":
|
||||
elif padding_side == "left":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -737,7 +753,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
|
||||
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
|
||||
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
|
||||
else:
|
||||
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
|
||||
raise ValueError("Invalid padding strategy:" + str(padding_side))
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
|
||||
@@ -543,6 +543,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -646,6 +647,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -668,6 +670,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -697,6 +700,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -728,6 +732,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -756,6 +761,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -784,6 +790,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_overflowing_tokens=return_overflowing_tokens,
|
||||
@@ -809,6 +816,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[str] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -840,6 +848,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=None, # we pad in batch afterward
|
||||
padding_side=None, # we pad in batch afterward
|
||||
return_attention_mask=False, # we pad in batch afterward
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_overflowing_tokens=return_overflowing_tokens,
|
||||
@@ -860,6 +869,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
padding=padding_strategy.value,
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -881,6 +891,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -902,6 +913,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -929,6 +941,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -972,6 +985,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -996,6 +1010,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -1026,6 +1041,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
prepend_batch_axis=True,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -1049,6 +1065,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -1237,6 +1254,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
padding=padding_strategy.value,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -1382,6 +1400,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
) -> dict:
|
||||
"""
|
||||
@@ -1404,6 +1423,9 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
|
||||
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
|
||||
`>= 7.5` (Volta).
|
||||
padding_side:
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
return_attention_mask:
|
||||
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
|
||||
"""
|
||||
@@ -1427,7 +1449,8 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
|
||||
if needs_to_be_padded:
|
||||
difference = max_length - len(required_input)
|
||||
if self.padding_side == "right":
|
||||
padding_side = padding_side if padding_side is not None else self.padding_side
|
||||
if padding_side == "right":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -1441,7 +1464,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
if "special_tokens_mask" in encoded_inputs:
|
||||
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
|
||||
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
|
||||
elif self.padding_side == "left":
|
||||
elif padding_side == "left":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -1456,6 +1479,6 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
|
||||
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
|
||||
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
|
||||
else:
|
||||
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
|
||||
raise ValueError("Invalid padding strategy:" + str(padding_side))
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
@@ -217,6 +217,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -320,6 +321,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -342,6 +344,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -371,6 +374,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -402,6 +406,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -436,6 +441,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -479,6 +485,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -506,6 +513,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[str] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -525,6 +533,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
)
|
||||
|
||||
if is_pair:
|
||||
@@ -664,6 +673,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[bool] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -692,6 +702,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -725,6 +736,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
) -> dict:
|
||||
"""
|
||||
@@ -747,6 +759,9 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
||||
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
|
||||
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
|
||||
`>= 7.5` (Volta).
|
||||
padding_side:
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
return_attention_mask:
|
||||
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
|
||||
"""
|
||||
@@ -770,7 +785,8 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
||||
|
||||
if needs_to_be_padded:
|
||||
difference = max_length - len(required_input)
|
||||
if self.padding_side == "right":
|
||||
padding_side = padding_side if padding_side is not None else self.padding_side
|
||||
if padding_side == "right":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -784,7 +800,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
||||
if "special_tokens_mask" in encoded_inputs:
|
||||
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
|
||||
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
|
||||
elif self.padding_side == "left":
|
||||
elif padding_side == "left":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -799,7 +815,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
||||
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
|
||||
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
|
||||
else:
|
||||
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
|
||||
raise ValueError("Invalid padding strategy:" + str(padding_side))
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
|
||||
@@ -447,6 +447,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -550,6 +551,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -572,6 +574,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -599,6 +602,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -627,6 +631,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_overflowing_tokens=return_overflowing_tokens,
|
||||
@@ -651,6 +656,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[str] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -682,6 +688,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=None, # we pad in batch afterward
|
||||
padding_side=None, # we pad in batch afterward
|
||||
return_attention_mask=False, # we pad in batch afterward
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_overflowing_tokens=return_overflowing_tokens,
|
||||
@@ -702,6 +709,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
|
||||
padding=padding_strategy.value,
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -721,6 +729,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -751,6 +760,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
prepend_batch_axis=True,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -774,6 +784,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -947,6 +958,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
padding=padding_strategy.value,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -1090,6 +1102,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
) -> dict:
|
||||
"""
|
||||
@@ -1112,6 +1125,9 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
|
||||
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
|
||||
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
|
||||
`>= 7.5` (Volta).
|
||||
padding_side (`str`, *optional*):
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
return_attention_mask:
|
||||
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
|
||||
"""
|
||||
@@ -1135,7 +1151,8 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
|
||||
|
||||
if needs_to_be_padded:
|
||||
difference = max_length - len(required_input)
|
||||
if self.padding_side == "right":
|
||||
padding_side = padding_side if padding_side is not None else self.padding_side
|
||||
if padding_side == "right":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -1149,7 +1166,7 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
|
||||
if "special_tokens_mask" in encoded_inputs:
|
||||
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
|
||||
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
|
||||
elif self.padding_side == "left":
|
||||
elif padding_side == "left":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -1164,6 +1181,6 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
|
||||
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
|
||||
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
|
||||
else:
|
||||
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
|
||||
raise ValueError("Invalid padding strategy:" + str(padding_side))
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
@@ -277,6 +277,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -380,6 +381,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -402,6 +404,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -442,6 +445,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[str] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -462,6 +466,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
)
|
||||
|
||||
if is_pair:
|
||||
@@ -595,6 +600,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[bool] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -623,6 +629,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -655,6 +662,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
) -> dict:
|
||||
"""
|
||||
@@ -677,6 +685,9 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
|
||||
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
|
||||
`>= 7.5` (Volta).
|
||||
padding_side (`str`, *optional*):
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
return_attention_mask:
|
||||
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
|
||||
"""
|
||||
@@ -700,7 +711,8 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
|
||||
if needs_to_be_padded:
|
||||
difference = max_length - len(required_input)
|
||||
if self.padding_side == "right":
|
||||
padding_side = padding_side if padding_side is not None else self.padding_side
|
||||
if padding_side == "right":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -714,7 +726,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
if "special_tokens_mask" in encoded_inputs:
|
||||
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
|
||||
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
|
||||
elif self.padding_side == "left":
|
||||
elif padding_side == "left":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -729,7 +741,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
|
||||
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
|
||||
else:
|
||||
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
|
||||
raise ValueError("Invalid padding strategy:" + str(padding_side))
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
|
||||
@@ -412,6 +412,7 @@ class LEDTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
) -> dict:
|
||||
encoded_inputs = super()._pad(
|
||||
@@ -419,6 +420,7 @@ class LEDTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
padding_strategy=padding_strategy,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
|
||||
@@ -288,6 +288,7 @@ class LEDTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
) -> dict:
|
||||
encoded_inputs = super()._pad(
|
||||
@@ -295,6 +296,7 @@ class LEDTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
padding_strategy=padding_strategy,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
|
||||
@@ -570,6 +570,7 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
stride: int = 0,
|
||||
is_split_into_words: Optional[bool] = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -662,6 +663,7 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
stride=stride,
|
||||
is_split_into_words=is_split_into_words,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -688,6 +690,7 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
stride=stride,
|
||||
is_split_into_words=is_split_into_words,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -715,6 +718,7 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
stride: int = 0,
|
||||
is_split_into_words: Optional[bool] = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -769,6 +773,7 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
max_entity_length=max_entity_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
prepend_batch_axis=True,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -796,6 +801,7 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
stride: int = 0,
|
||||
is_split_into_words: Optional[bool] = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -876,6 +882,7 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
max_entity_length=max_entity_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_overflowing_tokens=return_overflowing_tokens,
|
||||
@@ -1070,6 +1077,7 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
max_entity_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[str] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -1112,6 +1120,7 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
max_entity_length=max_entity_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=None, # we pad in batch afterward
|
||||
padding_side=None, # we pad in batch afterward
|
||||
return_attention_mask=False, # we pad in batch afterward
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_overflowing_tokens=return_overflowing_tokens,
|
||||
@@ -1132,6 +1141,7 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
padding=padding_strategy.value,
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -1155,6 +1165,7 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
max_entity_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -1357,6 +1368,7 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
max_entity_length=max_entity_length,
|
||||
padding=padding_strategy.value,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -1382,6 +1394,7 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
max_entity_length: Optional[int] = None,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
verbose: bool = True,
|
||||
@@ -1418,6 +1431,9 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
pad_to_multiple_of (`int`, *optional*):
|
||||
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
|
||||
the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
|
||||
padding_side (`str`, *optional*):
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
return_attention_mask (`bool`, *optional*):
|
||||
Whether to return the attention mask. If left to the default, will return the attention mask according
|
||||
to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are attention
|
||||
@@ -1495,6 +1511,7 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
max_entity_length=max_entity_length,
|
||||
padding_strategy=padding_strategy,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
|
||||
@@ -1519,6 +1536,7 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
max_entity_length=max_entity_length,
|
||||
padding_strategy=padding_strategy,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -1536,6 +1554,7 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
max_entity_length: Optional[int] = None,
|
||||
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
) -> dict:
|
||||
"""
|
||||
@@ -1562,6 +1581,9 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
|
||||
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
|
||||
`>= 7.5` (Volta).
|
||||
padding_side (`str`, *optional*):
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
return_attention_mask:
|
||||
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
|
||||
"""
|
||||
@@ -1600,9 +1622,10 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
|
||||
if needs_to_be_padded:
|
||||
difference = max_length - len(encoded_inputs["input_ids"])
|
||||
padding_side = padding_side if padding_side is not None else self.padding_side
|
||||
if entities_provided:
|
||||
entity_difference = max_entity_length - len(encoded_inputs["entity_ids"])
|
||||
if self.padding_side == "right":
|
||||
if padding_side == "right":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
|
||||
if entities_provided:
|
||||
@@ -1633,7 +1656,7 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
encoded_inputs["entity_end_positions"] + [0] * entity_difference
|
||||
)
|
||||
|
||||
elif self.padding_side == "left":
|
||||
elif padding_side == "left":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
|
||||
if entities_provided:
|
||||
@@ -1664,7 +1687,7 @@ class LukeTokenizer(PreTrainedTokenizer):
|
||||
"entity_end_positions"
|
||||
]
|
||||
else:
|
||||
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
|
||||
raise ValueError("Invalid padding strategy:" + str(padding_side))
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
|
||||
@@ -503,6 +503,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -602,6 +603,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -624,6 +626,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -652,6 +655,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -683,6 +687,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -710,6 +715,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -738,6 +744,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_overflowing_tokens=return_overflowing_tokens,
|
||||
@@ -762,6 +769,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[str] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -793,6 +801,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=None, # we pad in batch afterward
|
||||
padding_side=None, # we pad in batch afterward
|
||||
return_attention_mask=False, # we pad in batch afterward
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_overflowing_tokens=return_overflowing_tokens,
|
||||
@@ -813,6 +822,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
padding=padding_strategy.value,
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -833,6 +843,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -854,6 +865,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -880,6 +892,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -923,6 +936,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -946,6 +960,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -976,6 +991,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
prepend_batch_axis=True,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -999,6 +1015,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -1203,6 +1220,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
padding=padding_strategy.value,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -1357,6 +1375,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
) -> dict:
|
||||
"""
|
||||
@@ -1376,6 +1395,9 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
|
||||
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
|
||||
`>= 7.5` (Volta).
|
||||
padding_side (`str`, *optional*):
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
return_attention_mask:
|
||||
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
|
||||
"""
|
||||
@@ -1399,7 +1421,8 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
|
||||
if needs_to_be_padded:
|
||||
difference = max_length - len(required_input)
|
||||
if self.padding_side == "right":
|
||||
padding_side = padding_side if padding_side is not None else self.padding_side
|
||||
if padding_side == "right":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -1419,7 +1442,7 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
if "special_tokens_mask" in encoded_inputs:
|
||||
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
|
||||
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
|
||||
elif self.padding_side == "left":
|
||||
elif padding_side == "left":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -1440,6 +1463,6 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
|
||||
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
|
||||
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
|
||||
else:
|
||||
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
|
||||
raise ValueError("Invalid padding strategy:" + str(padding_side))
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
@@ -286,6 +286,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -385,6 +386,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -407,6 +409,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -435,6 +438,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -466,6 +470,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -498,6 +503,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -541,6 +547,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -568,6 +575,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[str] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -587,6 +595,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
)
|
||||
|
||||
if is_pair:
|
||||
@@ -721,6 +730,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[bool] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -749,6 +759,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -781,6 +792,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
) -> dict:
|
||||
"""
|
||||
@@ -800,6 +812,9 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
|
||||
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
|
||||
`>= 7.0` (Volta).
|
||||
padding_side:
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
return_attention_mask:
|
||||
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
|
||||
"""
|
||||
@@ -823,7 +838,8 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
|
||||
if needs_to_be_padded:
|
||||
difference = max_length - len(required_input)
|
||||
if self.padding_side == "right":
|
||||
padding_side = padding_side if padding_side is not None else self.padding_side
|
||||
if padding_side == "right":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -843,7 +859,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
if "special_tokens_mask" in encoded_inputs:
|
||||
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
|
||||
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
|
||||
elif self.padding_side == "left":
|
||||
elif padding_side == "left":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -864,7 +880,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
|
||||
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
|
||||
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
|
||||
else:
|
||||
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
|
||||
raise ValueError("Invalid padding strategy:" + str(padding_side))
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
|
||||
@@ -399,6 +399,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
stride: int = 0,
|
||||
is_split_into_words: Optional[bool] = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -491,6 +492,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
stride=stride,
|
||||
is_split_into_words=is_split_into_words,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -517,6 +519,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
stride=stride,
|
||||
is_split_into_words=is_split_into_words,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -545,6 +548,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
stride: int = 0,
|
||||
is_split_into_words: Optional[bool] = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -599,6 +603,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
max_entity_length=max_entity_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
prepend_batch_axis=True,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -627,6 +632,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
stride: int = 0,
|
||||
is_split_into_words: Optional[bool] = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -707,6 +713,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
max_entity_length=max_entity_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_overflowing_tokens=return_overflowing_tokens,
|
||||
@@ -904,6 +911,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
max_entity_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[str] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -946,6 +954,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
max_entity_length=max_entity_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=None, # we pad in batch afterward
|
||||
padding_side=None, # we pad in batch afterward
|
||||
return_attention_mask=False, # we pad in batch afterward
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_overflowing_tokens=return_overflowing_tokens,
|
||||
@@ -966,6 +975,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
padding=padding_strategy.value,
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -990,6 +1000,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
max_entity_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -1192,6 +1203,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
max_entity_length=max_entity_length,
|
||||
padding=padding_strategy.value,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -1218,6 +1230,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
max_entity_length: Optional[int] = None,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
verbose: bool = True,
|
||||
@@ -1254,6 +1267,9 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
pad_to_multiple_of (`int`, *optional*):
|
||||
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
|
||||
the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.0` (Volta).
|
||||
padding_side:
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
return_attention_mask (`bool`, *optional*):
|
||||
Whether to return the attention mask. If left to the default, will return the attention mask according
|
||||
to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are attention
|
||||
@@ -1331,6 +1347,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
max_entity_length=max_entity_length,
|
||||
padding_strategy=padding_strategy,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
|
||||
@@ -1355,6 +1372,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
max_entity_length=max_entity_length,
|
||||
padding_strategy=padding_strategy,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -1373,6 +1391,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
max_entity_length: Optional[int] = None,
|
||||
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
) -> dict:
|
||||
"""
|
||||
@@ -1399,6 +1418,9 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
|
||||
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
|
||||
`>= 7.0` (Volta).
|
||||
padding_side:
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
return_attention_mask:
|
||||
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
|
||||
"""
|
||||
@@ -1437,9 +1459,10 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
|
||||
if needs_to_be_padded:
|
||||
difference = max_length - len(encoded_inputs["input_ids"])
|
||||
padding_side = padding_side if padding_side is not None else self.padding_side
|
||||
if entities_provided:
|
||||
entity_difference = max_entity_length - len(encoded_inputs["entity_ids"])
|
||||
if self.padding_side == "right":
|
||||
if padding_side == "right":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
|
||||
if entities_provided:
|
||||
@@ -1470,7 +1493,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
encoded_inputs["entity_end_positions"] + [0] * entity_difference
|
||||
)
|
||||
|
||||
elif self.padding_side == "left":
|
||||
elif padding_side == "left":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
|
||||
if entities_provided:
|
||||
@@ -1501,7 +1524,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
|
||||
"entity_end_positions"
|
||||
]
|
||||
else:
|
||||
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
|
||||
raise ValueError("Invalid padding strategy:" + str(padding_side))
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
|
||||
@@ -210,6 +210,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
|
||||
stride: int = 0,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -283,6 +284,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
prepend_batch_axis=True,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -308,6 +310,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -462,6 +465,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
padding=padding_strategy.value,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -480,6 +484,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
) -> dict:
|
||||
# Load from model defaults
|
||||
@@ -502,8 +507,9 @@ class RoCBertTokenizer(PreTrainedTokenizer):
|
||||
|
||||
if needs_to_be_padded:
|
||||
difference = max_length - len(required_input)
|
||||
padding_side = padding_side if padding_side is not None else self.padding_side
|
||||
|
||||
if self.padding_side == "right":
|
||||
if padding_side == "right":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -516,7 +522,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
|
||||
if key in encoded_inputs:
|
||||
encoded_inputs[key] = encoded_inputs[key] + [self.pad_token_id] * difference
|
||||
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
|
||||
elif self.padding_side == "left":
|
||||
elif padding_side == "left":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -530,7 +536,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
|
||||
encoded_inputs[key] = [self.pad_token_id] * difference + encoded_inputs[key]
|
||||
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
|
||||
else:
|
||||
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
|
||||
raise ValueError("Invalid padding strategy:" + str(padding_side))
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
@@ -551,6 +557,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
|
||||
stride: int = 0,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -627,6 +634,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_overflowing_tokens=return_overflowing_tokens,
|
||||
@@ -650,6 +658,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[str] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -686,6 +695,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=None, # we pad in batch afterward
|
||||
padding_side=None, # we pad in batch afterward
|
||||
return_attention_mask=False, # we pad in batch afterward
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_overflowing_tokens=return_overflowing_tokens,
|
||||
@@ -706,6 +716,7 @@ class RoCBertTokenizer(PreTrainedTokenizer):
|
||||
padding=padding_strategy.value,
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
|
||||
@@ -517,6 +517,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
truncation: Union[bool, str, TapasTruncationStrategy] = False,
|
||||
max_length: Optional[int] = None,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -581,6 +582,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
truncation=truncation,
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -602,6 +604,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
truncation=truncation,
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -631,6 +634,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
truncation: Union[bool, str, TapasTruncationStrategy] = False,
|
||||
max_length: Optional[int] = None,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -699,6 +703,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
truncation=truncation,
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -738,6 +743,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
truncation: Union[bool, str, TapasTruncationStrategy] = False,
|
||||
max_length: Optional[int] = None,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = True,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -768,6 +774,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
add_special_tokens=add_special_tokens,
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
prepend_batch_axis=True,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -797,6 +804,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
truncation: Union[bool, str, TapasTruncationStrategy] = False,
|
||||
max_length: Optional[int] = None,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = True,
|
||||
return_attention_mask: Optional[bool] = True,
|
||||
@@ -823,6 +831,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
truncation=truncation,
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=None, # we pad in batch afterwards
|
||||
padding_side=None, # we pad in batch afterward
|
||||
return_attention_mask=False, # we pad in batch afterwards
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_special_tokens_mask=return_special_tokens_mask,
|
||||
@@ -844,6 +853,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
padding=padding,
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -912,6 +922,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
truncation: Union[bool, str, TapasTruncationStrategy] = False,
|
||||
max_length: Optional[int] = None,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -968,6 +979,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
padding=padding,
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -993,6 +1005,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
truncation: Union[bool, str, TapasTruncationStrategy] = False,
|
||||
max_length: Optional[int] = None,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = True,
|
||||
return_attention_mask: Optional[bool] = True,
|
||||
@@ -1024,6 +1037,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
padding=padding,
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
prepend_batch_axis=True,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -1051,6 +1065,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
truncation: Union[bool, str, TapasTruncationStrategy] = False,
|
||||
max_length: Optional[int] = None,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = True,
|
||||
return_attention_mask: Optional[bool] = True,
|
||||
@@ -1214,6 +1229,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
padding=padding.value,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -1754,6 +1770,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
) -> dict:
|
||||
"""
|
||||
@@ -1776,6 +1793,9 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
|
||||
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
|
||||
`>= 7.0` (Volta).
|
||||
padding_side:
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
return_attention_mask:
|
||||
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
|
||||
"""
|
||||
@@ -1799,7 +1819,8 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
|
||||
if needs_to_be_padded:
|
||||
difference = max_length - len(encoded_inputs["input_ids"])
|
||||
if self.padding_side == "right":
|
||||
padding_side = padding_side if padding_side is not None else self.padding_side
|
||||
if padding_side == "right":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -1817,7 +1838,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
if "special_tokens_mask" in encoded_inputs:
|
||||
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
|
||||
encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
|
||||
elif self.padding_side == "left":
|
||||
elif padding_side == "left":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -1836,7 +1857,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
||||
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
|
||||
encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"]
|
||||
else:
|
||||
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
|
||||
raise ValueError("Invalid padding strategy:" + str(padding_side))
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
|
||||
@@ -551,6 +551,7 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -654,6 +655,7 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -676,6 +678,7 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -704,6 +707,7 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
stride: int = 0,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -746,6 +750,7 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
stride=stride,
|
||||
is_split_into_words=is_split_into_words,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -813,6 +818,7 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
stride: int = 0,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -865,6 +871,7 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
stride=stride,
|
||||
is_split_into_words=is_split_into_words,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -892,6 +899,7 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -920,6 +928,7 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_overflowing_tokens=return_overflowing_tokens,
|
||||
@@ -944,6 +953,7 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[str] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -975,6 +985,7 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=None, # we pad in batch afterward
|
||||
padding_side=None, # we pad in batch afterward
|
||||
return_attention_mask=False, # we pad in batch afterward
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_overflowing_tokens=return_overflowing_tokens,
|
||||
@@ -995,6 +1006,7 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
padding=padding_strategy.value,
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -1014,6 +1026,7 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -1044,6 +1057,7 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
prepend_batch_axis=True,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -1067,6 +1081,7 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -1240,6 +1255,7 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
max_length=max_length,
|
||||
padding=padding_strategy.value,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -1385,6 +1401,7 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
max_length: Optional[int] = None,
|
||||
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
) -> dict:
|
||||
"""
|
||||
@@ -1407,6 +1424,9 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
|
||||
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
|
||||
`>= 7.5` (Volta).
|
||||
padding_side (`str`, *optional*):
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
return_attention_mask:
|
||||
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
|
||||
"""
|
||||
@@ -1430,7 +1450,8 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
|
||||
if needs_to_be_padded:
|
||||
difference = max_length - len(required_input)
|
||||
if self.padding_side == "right":
|
||||
padding_side = padding_side if padding_side is not None else self.padding_side
|
||||
if padding_side == "right":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -1444,7 +1465,7 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
if "special_tokens_mask" in encoded_inputs:
|
||||
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
|
||||
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
|
||||
elif self.padding_side == "left":
|
||||
elif padding_side == "left":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -1459,6 +1480,6 @@ class UdopTokenizer(PreTrainedTokenizer):
|
||||
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
|
||||
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
|
||||
else:
|
||||
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
|
||||
raise ValueError("Invalid padding strategy:" + str(padding_side))
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
@@ -286,6 +286,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -389,6 +390,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -411,6 +413,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -453,6 +456,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
|
||||
stride: int = 0,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -501,6 +505,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
|
||||
stride=stride,
|
||||
is_split_into_words=is_split_into_words,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -528,6 +533,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[str] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -548,6 +554,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
)
|
||||
|
||||
if is_pair:
|
||||
@@ -684,6 +691,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[bool] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -712,6 +720,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -794,6 +803,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
|
||||
stride: int = 0,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -846,6 +856,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
|
||||
stride=stride,
|
||||
is_split_into_words=is_split_into_words,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -864,6 +875,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
|
||||
max_length: Optional[int] = None,
|
||||
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
) -> dict:
|
||||
"""
|
||||
@@ -886,6 +898,9 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
|
||||
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
|
||||
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
|
||||
`>= 7.5` (Volta).
|
||||
padding_side (`str`, *optional*):
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
return_attention_mask:
|
||||
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
|
||||
"""
|
||||
@@ -909,7 +924,8 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
|
||||
|
||||
if needs_to_be_padded:
|
||||
difference = max_length - len(required_input)
|
||||
if self.padding_side == "right":
|
||||
padding_side = padding_side if padding_side is not None else self.padding_side
|
||||
if padding_side == "right":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -923,7 +939,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
|
||||
if "special_tokens_mask" in encoded_inputs:
|
||||
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
|
||||
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
|
||||
elif self.padding_side == "left":
|
||||
elif padding_side == "left":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -938,7 +954,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
|
||||
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
|
||||
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
|
||||
else:
|
||||
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
|
||||
raise ValueError("Invalid padding strategy:" + str(padding_side))
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
|
||||
@@ -781,6 +781,7 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
|
||||
padding: Union[bool, str, PaddingStrategy] = False,
|
||||
max_length: Optional[int] = None,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
verbose: bool = True,
|
||||
**kwargs,
|
||||
@@ -794,6 +795,10 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
|
||||
The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
|
||||
values, a list of numpy array or a list of list of float values. Must be mono channel audio, not
|
||||
stereo, i.e. single float per timestep.
|
||||
|
||||
padding_side (`str`, *optional*):
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
"""
|
||||
|
||||
is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
|
||||
@@ -825,6 +830,7 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
|
||||
padding=padding,
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=self.return_attention_mask,
|
||||
return_tensors=return_tensors,
|
||||
verbose=verbose,
|
||||
|
||||
@@ -749,6 +749,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
|
||||
stride: int = 0,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -806,6 +807,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
prepend_batch_axis=True,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -833,6 +835,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
|
||||
stride: int = 0,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -891,6 +894,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_overflowing_tokens=return_overflowing_tokens,
|
||||
@@ -913,6 +917,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[str] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -942,6 +947,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=None, # we pad in batch afterward
|
||||
padding_side=None, # we pad in batch afterward
|
||||
return_attention_mask=False, # we pad in batch afterward
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_overflowing_tokens=return_overflowing_tokens,
|
||||
@@ -963,6 +969,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
|
||||
padding=padding_strategy.value,
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
|
||||
@@ -1427,6 +1427,9 @@ ENCODE_KWARGS_DOCSTRING = r"""
|
||||
If set will pad the sequence to a multiple of the provided value. Requires `padding` to be activated.
|
||||
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
|
||||
`>= 7.5` (Volta).
|
||||
padding_side (`str`, *optional*):
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
return_tensors (`str` or [`~utils.TensorType`], *optional*):
|
||||
If set, will return tensors instead of list of python integers. Acceptable values are:
|
||||
|
||||
@@ -2767,6 +2770,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
truncation: Union[bool, str, TruncationStrategy] = None,
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
**kwargs,
|
||||
) -> List[int]:
|
||||
@@ -2793,6 +2797,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
truncation=truncation,
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
**kwargs,
|
||||
)
|
||||
@@ -2956,6 +2961,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
stride: int = 0,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -2997,6 +3003,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
"stride": stride,
|
||||
"is_split_into_words": is_split_into_words,
|
||||
"pad_to_multiple_of": pad_to_multiple_of,
|
||||
"padding_side": padding_side,
|
||||
"return_tensors": return_tensors,
|
||||
"return_token_type_ids": return_token_type_ids,
|
||||
"return_attention_mask": return_attention_mask,
|
||||
@@ -3041,6 +3048,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
stride: int = 0,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -3111,6 +3119,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
stride=stride,
|
||||
is_split_into_words=is_split_into_words,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -3133,6 +3142,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
stride=stride,
|
||||
is_split_into_words=is_split_into_words,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -3157,6 +3167,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
stride: int = 0,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -3207,6 +3218,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
stride=stride,
|
||||
is_split_into_words=is_split_into_words,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -3230,6 +3242,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
stride: int = 0,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -3261,6 +3274,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
stride: int = 0,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -3307,6 +3321,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
stride=stride,
|
||||
is_split_into_words=is_split_into_words,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
@@ -3336,6 +3351,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
stride: int = 0,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -3361,6 +3377,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
padding: Union[bool, str, PaddingStrategy] = True,
|
||||
max_length: Optional[int] = None,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
verbose: bool = True,
|
||||
@@ -3409,6 +3426,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
|
||||
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
|
||||
`>= 7.5` (Volta).
|
||||
padding_side (`str`, *optional*):
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
return_attention_mask (`bool`, *optional*):
|
||||
Whether to return the attention mask. If left to the default, will return the attention mask according
|
||||
to the specific tokenizer's default, defined by the `return_outputs` attribute.
|
||||
@@ -3491,6 +3511,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
max_length=max_length,
|
||||
padding_strategy=padding_strategy,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
|
||||
@@ -3512,6 +3533,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
max_length=max_length,
|
||||
padding_strategy=padding_strategy,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -3573,6 +3595,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -3686,6 +3709,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
max_length=max_length,
|
||||
padding=padding_strategy.value,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
@@ -3828,6 +3852,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
max_length: Optional[int] = None,
|
||||
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
) -> dict:
|
||||
"""
|
||||
@@ -3843,13 +3868,16 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
- PaddingStrategy.LONGEST Pad to the longest sequence in the batch
|
||||
- PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
|
||||
- PaddingStrategy.DO_NOT_PAD: Do not pad
|
||||
The tokenizer padding sides are defined in self.padding_side:
|
||||
The tokenizer padding sides are defined in `padding_side` argument:
|
||||
|
||||
- 'left': pads on the left of the sequences
|
||||
- 'right': pads on the right of the sequences
|
||||
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
|
||||
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
|
||||
`>= 7.5` (Volta).
|
||||
padding_side:
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
return_attention_mask:
|
||||
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
|
||||
"""
|
||||
@@ -3873,8 +3901,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
|
||||
if needs_to_be_padded:
|
||||
difference = max_length - len(required_input)
|
||||
padding_side = padding_side if padding_side is not None else self.padding_side
|
||||
|
||||
if self.padding_side == "right":
|
||||
if padding_side == "right":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -3884,7 +3913,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
if "special_tokens_mask" in encoded_inputs:
|
||||
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
|
||||
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
|
||||
elif self.padding_side == "left":
|
||||
elif padding_side == "left":
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
|
||||
if "token_type_ids" in encoded_inputs:
|
||||
@@ -3895,7 +3924,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
|
||||
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
|
||||
else:
|
||||
raise ValueError(f"Invalid padding strategy:{self.padding_side}")
|
||||
raise ValueError(f"Invalid padding strategy:{padding_side}")
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
|
||||
@@ -429,6 +429,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
|
||||
max_length: int,
|
||||
stride: int,
|
||||
pad_to_multiple_of: Optional[int],
|
||||
padding_side: Optional[bool],
|
||||
):
|
||||
"""
|
||||
Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
|
||||
@@ -450,6 +451,9 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
|
||||
pad_to_multiple_of (`int`, *optional*):
|
||||
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
|
||||
the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
|
||||
padding_side (`str`, *optional*):
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
"""
|
||||
_truncation = self._tokenizer.truncation
|
||||
_padding = self._tokenizer.padding
|
||||
@@ -484,7 +488,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
|
||||
length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None
|
||||
target = {
|
||||
"length": length,
|
||||
"direction": self.padding_side,
|
||||
"direction": padding_side if padding_side is not None else self.padding_side,
|
||||
"pad_id": self.pad_token_id,
|
||||
"pad_token": self.pad_token,
|
||||
"pad_type_id": self.pad_token_type_id,
|
||||
@@ -505,6 +509,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
|
||||
stride: int = 0,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[str] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -527,6 +532,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
)
|
||||
|
||||
if self._tokenizer.encode_special_tokens != split_special_tokens:
|
||||
@@ -593,6 +599,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
|
||||
stride: int = 0,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
padding_side: Optional[bool] = None,
|
||||
return_tensors: Optional[bool] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
@@ -614,6 +621,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
padding_side=padding_side,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
|
||||
@@ -21,6 +21,8 @@ import tempfile
|
||||
import unittest
|
||||
from typing import List
|
||||
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import (
|
||||
AddedToken,
|
||||
LayoutLMv2TokenizerFast,
|
||||
@@ -393,7 +395,8 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
def test_split_special_tokens(self):
|
||||
pass
|
||||
|
||||
def test_encode_plus_with_padding(self):
|
||||
@parameterized.expand([(True,), (False,)])
|
||||
def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
for tokenizer in tokenizers:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
@@ -444,15 +447,18 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
|
||||
|
||||
# Test right padding
|
||||
tokenizer.padding_side = "right"
|
||||
tokenizer_kwargs_right = {
|
||||
"max_length": sequence_length + padding_size,
|
||||
"padding": "max_length",
|
||||
"return_special_tokens_mask": True,
|
||||
}
|
||||
|
||||
right_padded_sequence = tokenizer.encode_plus(
|
||||
words,
|
||||
boxes=boxes,
|
||||
max_length=sequence_length + padding_size,
|
||||
padding="max_length",
|
||||
return_special_tokens_mask=True,
|
||||
)
|
||||
if not use_padding_as_call_kwarg:
|
||||
tokenizer.padding_side = "right"
|
||||
else:
|
||||
tokenizer_kwargs_right["padding_side"] = "right"
|
||||
|
||||
right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right)
|
||||
right_padded_input_ids = right_padded_sequence["input_ids"]
|
||||
|
||||
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
|
||||
@@ -463,14 +469,18 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)
|
||||
|
||||
# Test left padding
|
||||
tokenizer.padding_side = "left"
|
||||
left_padded_sequence = tokenizer.encode_plus(
|
||||
words,
|
||||
boxes=boxes,
|
||||
max_length=sequence_length + padding_size,
|
||||
padding="max_length",
|
||||
return_special_tokens_mask=True,
|
||||
)
|
||||
tokenizer_kwargs_left = {
|
||||
"max_length": sequence_length + padding_size,
|
||||
"padding": "max_length",
|
||||
"return_special_tokens_mask": True,
|
||||
}
|
||||
|
||||
if not use_padding_as_call_kwarg:
|
||||
tokenizer.padding_side = "left"
|
||||
else:
|
||||
tokenizer_kwargs_left["padding_side"] = "left"
|
||||
|
||||
left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left)
|
||||
left_padded_input_ids = left_padded_sequence["input_ids"]
|
||||
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
|
||||
left_padded_sequence_length = len(left_padded_input_ids)
|
||||
|
||||
@@ -22,6 +22,8 @@ import tempfile
|
||||
import unittest
|
||||
from typing import List
|
||||
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import (
|
||||
AddedToken,
|
||||
LayoutLMv3TokenizerFast,
|
||||
@@ -273,7 +275,8 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
def test_split_special_tokens(self):
|
||||
pass
|
||||
|
||||
def test_encode_plus_with_padding(self):
|
||||
@parameterized.expand([(True,), (False,)])
|
||||
def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
for tokenizer in tokenizers:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
@@ -324,15 +327,18 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
|
||||
|
||||
# Test right padding
|
||||
tokenizer.padding_side = "right"
|
||||
tokenizer_kwargs_right = {
|
||||
"max_length": sequence_length + padding_size,
|
||||
"padding": "max_length",
|
||||
"return_special_tokens_mask": True,
|
||||
}
|
||||
|
||||
right_padded_sequence = tokenizer.encode_plus(
|
||||
words,
|
||||
boxes=boxes,
|
||||
max_length=sequence_length + padding_size,
|
||||
padding="max_length",
|
||||
return_special_tokens_mask=True,
|
||||
)
|
||||
if not use_padding_as_call_kwarg:
|
||||
tokenizer.padding_side = "right"
|
||||
else:
|
||||
tokenizer_kwargs_right["padding_side"] = "right"
|
||||
|
||||
right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right)
|
||||
right_padded_input_ids = right_padded_sequence["input_ids"]
|
||||
|
||||
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
|
||||
@@ -343,14 +349,18 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)
|
||||
|
||||
# Test left padding
|
||||
tokenizer.padding_side = "left"
|
||||
left_padded_sequence = tokenizer.encode_plus(
|
||||
words,
|
||||
boxes=boxes,
|
||||
max_length=sequence_length + padding_size,
|
||||
padding="max_length",
|
||||
return_special_tokens_mask=True,
|
||||
)
|
||||
tokenizer_kwargs_left = {
|
||||
"max_length": sequence_length + padding_size,
|
||||
"padding": "max_length",
|
||||
"return_special_tokens_mask": True,
|
||||
}
|
||||
|
||||
if not use_padding_as_call_kwarg:
|
||||
tokenizer.padding_side = "left"
|
||||
else:
|
||||
tokenizer_kwargs_left["padding_side"] = "left"
|
||||
|
||||
left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left)
|
||||
left_padded_input_ids = left_padded_sequence["input_ids"]
|
||||
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
|
||||
left_padded_sequence_length = len(left_padded_input_ids)
|
||||
|
||||
@@ -19,6 +19,8 @@ import tempfile
|
||||
import unittest
|
||||
from typing import List
|
||||
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import (
|
||||
AddedToken,
|
||||
LayoutXLMTokenizerFast,
|
||||
@@ -324,7 +326,8 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
|
||||
self.assertIn(decoded, [output, output.lower()])
|
||||
|
||||
def test_encode_plus_with_padding(self):
|
||||
@parameterized.expand([(True,), (False,)])
|
||||
def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
for tokenizer in tokenizers:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
@@ -375,15 +378,18 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
|
||||
|
||||
# Test right padding
|
||||
tokenizer.padding_side = "right"
|
||||
tokenizer_kwargs_right = {
|
||||
"max_length": sequence_length + padding_size,
|
||||
"padding": "max_length",
|
||||
"return_special_tokens_mask": True,
|
||||
}
|
||||
|
||||
right_padded_sequence = tokenizer.encode_plus(
|
||||
words,
|
||||
boxes=boxes,
|
||||
max_length=sequence_length + padding_size,
|
||||
padding="max_length",
|
||||
return_special_tokens_mask=True,
|
||||
)
|
||||
if not use_padding_as_call_kwarg:
|
||||
tokenizer.padding_side = "right"
|
||||
else:
|
||||
tokenizer_kwargs_right["padding_side"] = "right"
|
||||
|
||||
right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right)
|
||||
right_padded_input_ids = right_padded_sequence["input_ids"]
|
||||
|
||||
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
|
||||
@@ -394,14 +400,18 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)
|
||||
|
||||
# Test left padding
|
||||
tokenizer.padding_side = "left"
|
||||
left_padded_sequence = tokenizer.encode_plus(
|
||||
words,
|
||||
boxes=boxes,
|
||||
max_length=sequence_length + padding_size,
|
||||
padding="max_length",
|
||||
return_special_tokens_mask=True,
|
||||
)
|
||||
tokenizer_kwargs_left = {
|
||||
"max_length": sequence_length + padding_size,
|
||||
"padding": "max_length",
|
||||
"return_special_tokens_mask": True,
|
||||
}
|
||||
|
||||
if not use_padding_as_call_kwarg:
|
||||
tokenizer.padding_side = "left"
|
||||
else:
|
||||
tokenizer_kwargs_left["padding_side"] = "left"
|
||||
|
||||
left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left)
|
||||
left_padded_input_ids = left_padded_sequence["input_ids"]
|
||||
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
|
||||
left_padded_sequence_length = len(left_padded_input_ids)
|
||||
|
||||
@@ -22,6 +22,8 @@ import tempfile
|
||||
import unittest
|
||||
from typing import List
|
||||
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import (
|
||||
AddedToken,
|
||||
MarkupLMTokenizerFast,
|
||||
@@ -211,7 +213,8 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
def test_right_and_left_truncation(self):
|
||||
pass
|
||||
|
||||
def test_encode_plus_with_padding(self):
|
||||
@parameterized.expand([(True,), (False,)])
|
||||
def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
for tokenizer in tokenizers:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
@@ -262,15 +265,18 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
|
||||
|
||||
# Test right padding
|
||||
tokenizer.padding_side = "right"
|
||||
tokenizer_kwargs_right = {
|
||||
"max_length": sequence_length + padding_size,
|
||||
"padding": "max_length",
|
||||
"return_special_tokens_mask": True,
|
||||
}
|
||||
|
||||
right_padded_sequence = tokenizer.encode_plus(
|
||||
nodes,
|
||||
xpaths=xpaths,
|
||||
max_length=sequence_length + padding_size,
|
||||
padding="max_length",
|
||||
return_special_tokens_mask=True,
|
||||
)
|
||||
if not use_padding_as_call_kwarg:
|
||||
tokenizer.padding_side = "right"
|
||||
else:
|
||||
tokenizer_kwargs_right["padding_side"] = "right"
|
||||
|
||||
right_padded_sequence = tokenizer.encode_plus(nodes, xpaths=xpaths, **tokenizer_kwargs_right)
|
||||
right_padded_input_ids = right_padded_sequence["input_ids"]
|
||||
|
||||
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
|
||||
@@ -281,14 +287,18 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)
|
||||
|
||||
# Test left padding
|
||||
tokenizer.padding_side = "left"
|
||||
left_padded_sequence = tokenizer.encode_plus(
|
||||
nodes,
|
||||
xpaths=xpaths,
|
||||
max_length=sequence_length + padding_size,
|
||||
padding="max_length",
|
||||
return_special_tokens_mask=True,
|
||||
)
|
||||
tokenizer_kwargs_left = {
|
||||
"max_length": sequence_length + padding_size,
|
||||
"padding": "max_length",
|
||||
"return_special_tokens_mask": True,
|
||||
}
|
||||
|
||||
if not use_padding_as_call_kwarg:
|
||||
tokenizer.padding_side = "left"
|
||||
else:
|
||||
tokenizer_kwargs_left["padding_side"] = "left"
|
||||
|
||||
left_padded_sequence = tokenizer.encode_plus(nodes, xpaths=xpaths, **tokenizer_kwargs_left)
|
||||
left_padded_input_ids = left_padded_sequence["input_ids"]
|
||||
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
|
||||
left_padded_sequence_length = len(left_padded_input_ids)
|
||||
|
||||
@@ -21,6 +21,7 @@ from typing import List
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import AddedToken, is_torch_available
|
||||
from transformers.models.tapas.tokenization_tapas import (
|
||||
@@ -494,7 +495,8 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
|
||||
self.assertIn(decoded, [output, output.lower()])
|
||||
|
||||
def test_encode_plus_with_padding(self):
|
||||
@parameterized.expand([(True,), (False,)])
|
||||
def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
for tokenizer in tokenizers:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
@@ -547,15 +549,18 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
assert special_tokens_mask == not_padded_special_tokens_mask
|
||||
|
||||
# Test right padding
|
||||
tokenizer.padding_side = "right"
|
||||
tokenizer_kwargs_right = {
|
||||
"max_length": sequence_length + padding_size,
|
||||
"padding": "max_length",
|
||||
"return_special_tokens_mask": True,
|
||||
}
|
||||
|
||||
right_padded_sequence = tokenizer.encode_plus(
|
||||
table,
|
||||
sequence,
|
||||
max_length=sequence_length + padding_size,
|
||||
padding="max_length",
|
||||
return_special_tokens_mask=True,
|
||||
)
|
||||
if not use_padding_as_call_kwarg:
|
||||
tokenizer.padding_side = "right"
|
||||
else:
|
||||
tokenizer_kwargs_right["padding_side"] = "right"
|
||||
|
||||
right_padded_sequence = tokenizer.encode_plus(table, sequence, **tokenizer_kwargs_right)
|
||||
right_padded_input_ids = right_padded_sequence["input_ids"]
|
||||
|
||||
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
|
||||
@@ -566,14 +571,18 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask
|
||||
|
||||
# Test left padding
|
||||
tokenizer.padding_side = "left"
|
||||
left_padded_sequence = tokenizer.encode_plus(
|
||||
table,
|
||||
sequence,
|
||||
max_length=sequence_length + padding_size,
|
||||
padding="max_length",
|
||||
return_special_tokens_mask=True,
|
||||
)
|
||||
tokenizer_kwargs_left = {
|
||||
"max_length": sequence_length + padding_size,
|
||||
"padding": "max_length",
|
||||
"return_special_tokens_mask": True,
|
||||
}
|
||||
|
||||
if not use_padding_as_call_kwarg:
|
||||
tokenizer.padding_side = "left"
|
||||
else:
|
||||
tokenizer_kwargs_left["padding_side"] = "left"
|
||||
|
||||
left_padded_sequence = tokenizer.encode_plus(table, sequence, **tokenizer_kwargs_left)
|
||||
left_padded_input_ids = left_padded_sequence["input_ids"]
|
||||
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
|
||||
left_padded_sequence_length = len(left_padded_input_ids)
|
||||
|
||||
@@ -2225,7 +2225,15 @@ class TokenizerTesterMixin:
|
||||
else:
|
||||
self.assertListEqual(padded_features["attention_mask"], [[1, 1, 1, 1, 1, 0], [0, 0, 0, 1, 1, 0]])
|
||||
|
||||
def test_encode_plus_with_padding(self):
|
||||
@parameterized.expand([(True,), (False,)])
|
||||
def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
|
||||
"""
|
||||
This test checks that padding works as expected when tokenizing a sequence.
|
||||
Padding is expected to have no effect when the input is a single sequence and
|
||||
the padding-strategy is not `max_length`. Otherwise it pads to the specified max-length
|
||||
using the tokenizer class's `padding_side` attribute. Also, we check that passing `padding_side`
|
||||
as a call-time kwarg works the same way as setting the `tokenizer.padding_side` attribute.
|
||||
"""
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
for tokenizer in tokenizers:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
@@ -2244,8 +2252,6 @@ class TokenizerTesterMixin:
|
||||
sequence_length = len(input_ids)
|
||||
|
||||
# Test 'longest' and 'no_padding' don't do anything
|
||||
tokenizer.padding_side = "right"
|
||||
|
||||
not_padded_sequence = tokenizer.encode_plus(
|
||||
sequence,
|
||||
padding=True,
|
||||
@@ -2275,14 +2281,18 @@ class TokenizerTesterMixin:
|
||||
self.assertEqual(special_tokens_mask, not_padded_special_tokens_mask)
|
||||
|
||||
# Test right padding
|
||||
tokenizer.padding_side = "right"
|
||||
tokenizer_kwargs_right = {
|
||||
"max_length": sequence_length + padding_size,
|
||||
"padding": "max_length",
|
||||
"return_special_tokens_mask": True,
|
||||
}
|
||||
|
||||
right_padded_sequence = tokenizer.encode_plus(
|
||||
sequence,
|
||||
max_length=sequence_length + padding_size,
|
||||
padding="max_length",
|
||||
return_special_tokens_mask=True,
|
||||
)
|
||||
if not use_padding_as_call_kwarg:
|
||||
tokenizer.padding_side = "right"
|
||||
else:
|
||||
tokenizer_kwargs_right["padding_side"] = "right"
|
||||
|
||||
right_padded_sequence = tokenizer.encode_plus(sequence, **tokenizer_kwargs_right)
|
||||
right_padded_input_ids = right_padded_sequence["input_ids"]
|
||||
|
||||
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
|
||||
@@ -2293,13 +2303,18 @@ class TokenizerTesterMixin:
|
||||
self.assertEqual(special_tokens_mask + [1] * padding_size, right_padded_special_tokens_mask)
|
||||
|
||||
# Test left padding
|
||||
tokenizer.padding_side = "left"
|
||||
left_padded_sequence = tokenizer.encode_plus(
|
||||
sequence,
|
||||
max_length=sequence_length + padding_size,
|
||||
padding="max_length",
|
||||
return_special_tokens_mask=True,
|
||||
)
|
||||
tokenizer_kwargs_left = {
|
||||
"max_length": sequence_length + padding_size,
|
||||
"padding": "max_length",
|
||||
"return_special_tokens_mask": True,
|
||||
}
|
||||
|
||||
if not use_padding_as_call_kwarg:
|
||||
tokenizer.padding_side = "left"
|
||||
else:
|
||||
tokenizer_kwargs_left["padding_side"] = "left"
|
||||
|
||||
left_padded_sequence = tokenizer.encode_plus(sequence, **tokenizer_kwargs_left)
|
||||
left_padded_input_ids = left_padded_sequence["input_ids"]
|
||||
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
|
||||
left_padded_sequence_length = len(left_padded_input_ids)
|
||||
|
||||
Reference in New Issue
Block a user