update roberta docstring as well
This commit is contained in:
@@ -64,7 +64,11 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
|||||||
|
|
||||||
class RobertaTokenizer(GPT2Tokenizer):
|
class RobertaTokenizer(GPT2Tokenizer):
|
||||||
"""
|
"""
|
||||||
RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: Byte-level BPE
|
RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
|
||||||
|
- Byte-level Byte-Pair-Encoding
|
||||||
|
- Requires a space to start the input string => will add a space if there isn't.
|
||||||
|
As a consequence, this tokenizer's `encode` and `decode` methods will not conserve
|
||||||
|
the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
|
||||||
"""
|
"""
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||||
|
|||||||
Reference in New Issue
Block a user