update roberta docstring as well

2019-08-30 12:23:37 +02:00
parent fd10d79b55
commit 4e6a3172ce
1 changed files with 5 additions and 1 deletions
--- a/pytorch_transformers/tokenization_roberta.py
+++ b/pytorch_transformers/tokenization_roberta.py
@@ -64,7 +64,11 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {

 class RobertaTokenizer(GPT2Tokenizer):
    """
-    RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: Byte-level BPE
+    RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
+        - Byte-level Byte-Pair-Encoding
+        - Requires a space to start the input string => will add a space if there isn't one.
+          As a consequence, this tokenizer's `encode` and `decode` methods will not conserve
+          the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) == " Hello"`
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP