From 5af3a1aa48ac5909fce2f870c72c2a8297715c94 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Fri, 9 Jun 2023 16:30:20 +0200 Subject: [PATCH] [lamaTokenizerFast] Update documentation (#24132) * Update documentation * nits --- docs/source/en/model_doc/llama.mdx | 1 + src/transformers/models/llama/tokenization_llama_fast.py | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/docs/source/en/model_doc/llama.mdx b/docs/source/en/model_doc/llama.mdx index a5f0553358..5a107eaa84 100644 --- a/docs/source/en/model_doc/llama.mdx +++ b/docs/source/en/model_doc/llama.mdx @@ -65,6 +65,7 @@ This model was contributed by [zphang](https://huggingface.co/zphang) with contr - build_inputs_with_special_tokens - get_special_tokens_mask - create_token_type_ids_from_sequences + - update_post_processor - save_vocabulary ## LlamaModel diff --git a/src/transformers/models/llama/tokenization_llama_fast.py b/src/transformers/models/llama/tokenization_llama_fast.py index 095f65b628..28e9413a50 100644 --- a/src/transformers/models/llama/tokenization_llama_fast.py +++ b/src/transformers/models/llama/tokenization_llama_fast.py @@ -48,6 +48,12 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast): >>> [1, 15043, 445, 338, 263, 1243] ``` + If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or + call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the + values of the first token and final token of an encoded sequence will not be correct). For more details, check out the + [post-processors](https://huggingface.co/docs/tokenizers/api/post-processors) documentation. + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. 
@@ -108,6 +114,9 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast): self.can_save_slow_tokenizer = False if not self.vocab_file else True def update_post_processor(self): + """ + Updates the underlying post processor with the current `bos_token` and `eos_token`. + """ bos = self.bos_token bos_token_id = self.bos_token_id