From 4e6a3172cecef53f790f1c995c7569ca11e04444 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 12:23:37 +0200 Subject: [PATCH] update roberta docstring as well --- pytorch_transformers/tokenization_roberta.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py index 26805d9f4e..f290168c95 100644 --- a/pytorch_transformers/tokenization_roberta.py +++ b/pytorch_transformers/tokenization_roberta.py @@ -64,7 +64,11 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { class RobertaTokenizer(GPT2Tokenizer): """ - RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: Byte-level BPE + RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: + - Byte-level Byte-Pair-Encoding + - Requires a space to start the input string => will add a space if there isn't one. + As a consequence, this tokenizer's `encode` and `decode` methods will not conserve + the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) == " Hello"` """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP