From f1e2e423ab46d65bba98757fba420eecce14ea52 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Mon, 6 Jul 2020 18:45:01 -0400 Subject: [PATCH] Fix fast tokenizers too (#5562) --- src/transformers/tokenization_gpt2.py | 29 ++++++++++++++++-------- src/transformers/tokenization_roberta.py | 27 ++++++++++++++-------- 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/src/transformers/tokenization_gpt2.py b/src/transformers/tokenization_gpt2.py index 04a04d34d9..05376b1f6e 100644 --- a/src/transformers/tokenization_gpt2.py +++ b/src/transformers/tokenization_gpt2.py @@ -297,21 +297,30 @@ class GPT2Tokenizer(PreTrainedTokenizer): class GPT2TokenizerFast(PreTrainedTokenizerFast): """ - Constructs a "Fast" GPT-2 BPE tokenizer (backed by HuggingFace's `tokenizers` library). + Constructs a "Fast" GPT-2 BPE tokenizer (backed by HuggingFace's `tokenizers` library), using byte-level + Byte-Pair-Encoding. - Peculiarities: - - - Byte-level Byte-Pair-Encoding - - Requires a space to start the input string => the encoding methods should be called with the - ``add_prefix_space`` flag set to ``True``. - Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve - the absence of a space at the beginning of a string: + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: :: - tokenizer.decode(tokenizer.encode("Hello")) = " Hello" + >>> from transformers import GPT2TokenizerFast + >>> tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") + >>> tokenizer("Hello world")['input_ids'] + [15496, 995] + >>> tokenizer(" Hello world")['input_ids'] + [18435, 995] - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. 
Users + You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you + call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. + + .. note:: + + When used with ``is_pretokenized=True``, this tokenizer needs to be instantiated with + ``add_prefix_space=True``. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: diff --git a/src/transformers/tokenization_roberta.py b/src/transformers/tokenization_roberta.py index 452bc8a91e..450e659f8f 100644 --- a/src/transformers/tokenization_roberta.py +++ b/src/transformers/tokenization_roberta.py @@ -260,19 +260,28 @@ class RobertaTokenizer(GPT2Tokenizer): class RobertaTokenizerFast(GPT2TokenizerFast): """ - Constructs a "Fast" RoBERTa BPE tokenizer (backed by HuggingFace's `tokenizers` library). + Constructs a "Fast" RoBERTa BPE tokenizer (backed by HuggingFace's `tokenizers` library), derived from the GPT-2 + tokenizer, using byte-level Byte-Pair-Encoding. - Peculiarities: - - - Byte-level Byte-Pair-Encoding - - Requires a space to start the input string => the encoding methods should be called with the - ``add_prefix_space`` flag set to ``True``. 
- Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve - the absence of a space at the beginning of a string: + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: :: - tokenizer.decode(tokenizer.encode("Hello")) = " Hello" + >>> from transformers import RobertaTokenizerFast + >>> tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base") + >>> tokenizer("Hello world")['input_ids'] + [0, 31414, 232, 2] + >>> tokenizer(" Hello world")['input_ids'] + [0, 20920, 232, 2] + + You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you + call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. + + .. note:: + + When used with ``is_pretokenized=True``, this tokenizer needs to be instantiated with + ``add_prefix_space=True``. This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users should refer to the superclass for more information regarding methods.