Documentation (#2989)

* All Tokenizers

BertTokenizer + few fixes
RobertaTokenizer
OpenAIGPTTokenizer + Fixes
GPT2Tokenizer + fixes
TransfoXLTokenizer
Correct rst for TransformerXL
XLMTokenizer + fixes
XLNet Tokenizer + Style
DistilBERT + Fix XLNet RST
CTRLTokenizer
CamemBERT Tokenizer
FlaubertTokenizer
XLMRobertaTokenizer
cleanup

* cleanup
This commit is contained in:
Lysandre Debut
2020-02-25 18:43:36 -05:00
committed by GitHub
parent c913eb9c38
commit bb7c468520
30 changed files with 866 additions and 242 deletions

View File

@@ -116,8 +116,21 @@ def get_pairs(word):
class CTRLTokenizer(PreTrainedTokenizer):
"""
CTRL BPE tokenizer. Peculiarities:
- Byte-Pair-Encoding
Constructs a CTRL tokenizer. Peculiarities:
- Byte-Pair-Encoding
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -219,7 +232,16 @@ class CTRLTokenizer(PreTrainedTokenizer):
return out_string
def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary and merge files to a directory."""
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return