This commit is contained in:
Lysandre
2019-10-22 11:26:42 -04:00
parent b8c9ea0010
commit 777faa8ae7

View File

@@ -63,11 +63,7 @@ def get_pairs(word):
class CTRLTokenizer(PreTrainedTokenizer): class CTRLTokenizer(PreTrainedTokenizer):
""" """
CTRL BPE tokenizer. Peculiarities: CTRL BPE tokenizer. Peculiarities:
- Byte-level Byte-Pair-Encoding - Byte-Pair-Encoding
- Requires a space to start the input string => the encoding methods should be called with the
``add_prefix_space`` flag set to ``True``.
Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP