From e73a3e1891775a915846cc0f24b7e9a26d6688fb Mon Sep 17 00:00:00 2001
From: Lysandre Debut
Date: Mon, 22 Feb 2021 15:48:20 +0100
Subject: [PATCH] Add note to resize token embeddings matrix when adding new tokens to voc (#10331)

---
 src/transformers/tokenization_utils_base.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index a895df01c0..00ecdc7e40 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -971,6 +971,12 @@ class SpecialTokensMixin:
         Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
         it with indices starting from length of the current vocabulary.
 
+        .. Note::
+            When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of
+            the model so that its embedding matrix matches the tokenizer.
+
+            In order to do that, please use the :meth:`~transformers.PreTrainedModel.resize_token_embeddings` method.
+
         Args:
             new_tokens (:obj:`str`, :obj:`tokenizers.AddedToken` or a list of `str` or :obj:`tokenizers.AddedToken`):
                 Tokens are only added if they are not already in the vocabulary. :obj:`tokenizers.AddedToken` wraps a