From 7bd865051237137ec1df666034595408a7e38e24 Mon Sep 17 00:00:00 2001 From: Maria Khalusova Date: Mon, 20 Mar 2023 14:18:55 -0400 Subject: [PATCH] Example of pad_to_multiple_of for padding and truncation guide & docstring update (#22278) * added an example of pad_to_multiple_of * make style * addressed feedback --- docs/source/en/pad_truncation.mdx | 1 + src/transformers/tokenization_utils_base.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/en/pad_truncation.mdx b/docs/source/en/pad_truncation.mdx index f848e23bed..8862e0be00 100644 --- a/docs/source/en/pad_truncation.mdx +++ b/docs/source/en/pad_truncation.mdx @@ -50,6 +50,7 @@ The following table summarizes the recommended way to setup padding and truncati | | | `tokenizer(batch_sentences, padding='longest')` | | | padding to max model input length | `tokenizer(batch_sentences, padding='max_length')` | | | padding to specific length | `tokenizer(batch_sentences, padding='max_length', max_length=42)` | +| | padding to a multiple of a value | `tokenizer(batch_sentences, padding=True, pad_to_multiple_of=8)` | | truncation to max model input length | no padding | `tokenizer(batch_sentences, truncation=True)` or | | | | `tokenizer(batch_sentences, truncation=STRATEGY)` | | | padding to max sequence in batch | `tokenizer(batch_sentences, padding=True, truncation=True)` or | diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index eb52ef0adb..66164c2778 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1342,8 +1342,9 @@ ENCODE_KWARGS_DOCSTRING = r""" tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace) which it will tokenize. This is useful for NER or token classification. pad_to_multiple_of (`int`, *optional*): - If set will pad the sequence to a multiple of the provided value. 
This is especially useful to enable - the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). + If set will pad the sequence to a multiple of the provided value. Requires `padding` to be activated. + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + `>= 7.5` (Volta). return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors instead of list of python integers. Acceptable values are: