From 438698085cfb92b6b7e9b3b94a226d3a32b77a85 Mon Sep 17 00:00:00 2001 From: SaulLu <55560583+SaulLu@users.noreply.github.com> Date: Tue, 23 Aug 2022 13:23:51 +0200 Subject: [PATCH] improve `add_tokens` docstring (#18687) * improve add_tokens documentation * format --- src/transformers/tokenization_utils_base.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 566fd3fbf9..fd8e1ee585 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -915,10 +915,12 @@ class SpecialTokensMixin: ) -> int: """ Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to - it with indices starting from length of the current vocabulary. + it with indices starting from length of the current vocabulary and will be isolated before the tokenization + algorithm is applied. Added tokens and tokens from the vocabulary of the tokenization algorithm are therefore + not treated in the same way. - Note,None When adding new tokens to the vocabulary, you should make sure to also resize the token embedding - matrix of the model so that its embedding matrix matches the tokenizer. + Note, when adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix + of the model so that its embedding matrix matches the tokenizer. In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.