From fd10d79b55d159d845a30adb238cd7019965aa23 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 12:23:12 +0200 Subject: [PATCH] update GPT2 docstring --- pytorch_transformers/tokenization_gpt2.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py index eb56e7303e..1fa7cbd06b 100644 --- a/pytorch_transformers/tokenization_gpt2.py +++ b/pytorch_transformers/tokenization_gpt2.py @@ -99,7 +99,10 @@ def get_pairs(word): class GPT2Tokenizer(PreTrainedTokenizer): """ GPT-2 BPE tokenizer. Peculiarities: - - Byte-level BPE + - Byte-level Byte-Pair-Encoding + - Requires a space to start the input string => will add a space if there isn't. + As a consequence, this tokenizer's `encode` and `decode` methods will not conserve + the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) == " Hello"` """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP