diff --git a/docs/source/model_doc/albert.rst b/docs/source/model_doc/albert.rst index 06a9b5bfd5..0740fd5b98 100644 --- a/docs/source/model_doc/albert.rst +++ b/docs/source/model_doc/albert.rst @@ -41,7 +41,8 @@ AlbertTokenizer ~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.AlbertTokenizer - :members: + :members: build_inputs_with_special_tokens, get_special_tokens_mask, + create_token_type_ids_from_sequences, save_vocabulary AlbertModel diff --git a/docs/source/model_doc/bert.rst b/docs/source/model_doc/bert.rst index 5e785eed1c..b66189493f 100644 --- a/docs/source/model_doc/bert.rst +++ b/docs/source/model_doc/bert.rst @@ -46,7 +46,8 @@ BertTokenizer ~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.BertTokenizer - :members: + :members: build_inputs_with_special_tokens, get_special_tokens_mask, + create_token_type_ids_from_sequences, save_vocabulary BertModel diff --git a/docs/source/model_doc/camembert.rst b/docs/source/model_doc/camembert.rst index 611d930d6e..5db97c4402 100644 --- a/docs/source/model_doc/camembert.rst +++ b/docs/source/model_doc/camembert.rst @@ -33,7 +33,8 @@ CamembertTokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.CamembertTokenizer - :members: + :members: build_inputs_with_special_tokens, get_special_tokens_mask, + create_token_type_ids_from_sequences, save_vocabulary CamembertModel diff --git a/docs/source/model_doc/ctrl.rst b/docs/source/model_doc/ctrl.rst index a8a04837d7..8d8ca57bbd 100644 --- a/docs/source/model_doc/ctrl.rst +++ b/docs/source/model_doc/ctrl.rst @@ -43,7 +43,7 @@ CTRLTokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.CTRLTokenizer - :members: + :members: save_vocabulary CTRLModel diff --git a/docs/source/model_doc/gpt.rst b/docs/source/model_doc/gpt.rst index 9604b39cea..0b693cd928 100644 --- a/docs/source/model_doc/gpt.rst +++ b/docs/source/model_doc/gpt.rst @@ -47,7 +47,7 @@ OpenAIGPTTokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.OpenAIGPTTokenizer - :members: + :members: save_vocabulary OpenAIGPTModel diff --git a/docs/source/model_doc/gpt2.rst b/docs/source/model_doc/gpt2.rst index 54ef3cea08..e86713f40f 100644 --- a/docs/source/model_doc/gpt2.rst +++ b/docs/source/model_doc/gpt2.rst @@ -5,7 +5,7 @@ Overview ~~~~~~~~~~~~~~~~~~~~~ OpenAI GPT-2 model was proposed in -`Language Models are Unsupervised Multitask Learners`_ +`Language Models are Unsupervised Multitask Learners `_ by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. It's a causal (unidirectional) transformer pre-trained using language modeling on a very large corpus of ~40 GB of text data. @@ -46,7 +46,7 @@ GPT2Tokenizer ~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.GPT2Tokenizer - :members: + :members: save_vocabulary GPT2Model diff --git a/docs/source/model_doc/roberta.rst b/docs/source/model_doc/roberta.rst index 62138bb72e..92438abd91 100644 --- a/docs/source/model_doc/roberta.rst +++ b/docs/source/model_doc/roberta.rst @@ -39,7 +39,8 @@ RobertaTokenizer ~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.RobertaTokenizer - :members: + :members: build_inputs_with_special_tokens, get_special_tokens_mask, + create_token_type_ids_from_sequences, save_vocabulary RobertaModel diff --git a/docs/source/model_doc/transformerxl.rst b/docs/source/model_doc/transformerxl.rst index 5240df3df4..84951f8a6d 100644 --- a/docs/source/model_doc/transformerxl.rst +++ b/docs/source/model_doc/transformerxl.rst @@ -42,7 +42,7 @@ TransfoXLTokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.TransfoXLTokenizer - :members: + :members: save_vocabulary TransfoXLModel diff --git a/docs/source/model_doc/xlm.rst b/docs/source/model_doc/xlm.rst index 7346693752..b65c4f4dbd 100644 --- a/docs/source/model_doc/xlm.rst +++ b/docs/source/model_doc/xlm.rst @@ -41,7 +41,8 @@ XLMTokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.XLMTokenizer - :members: + :members: build_inputs_with_special_tokens, get_special_tokens_mask, + create_token_type_ids_from_sequences, save_vocabulary XLMModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/xlmroberta.rst b/docs/source/model_doc/xlmroberta.rst index 6e5bcce7c9..38735696ef 100644 --- a/docs/source/model_doc/xlmroberta.rst +++ b/docs/source/model_doc/xlmroberta.rst @@ -39,7 +39,8 @@ XLMRobertaTokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.XLMRobertaTokenizer - :members: + :members: build_inputs_with_special_tokens, get_special_tokens_mask, + create_token_type_ids_from_sequences, save_vocabulary XLMRobertaModel diff --git a/docs/source/model_doc/xlnet.rst b/docs/source/model_doc/xlnet.rst index 0f8c61098c..b768e6ec75 100644 --- a/docs/source/model_doc/xlnet.rst +++ b/docs/source/model_doc/xlnet.rst @@ -44,7 +44,8 @@ XLNetTokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.XLNetTokenizer - :members: + :members: build_inputs_with_special_tokens, get_special_tokens_mask, + create_token_type_ids_from_sequences, save_vocabulary XLNetModel diff --git a/src/transformers/configuration_flaubert.py b/src/transformers/configuration_flaubert.py index 5110330819..0c9860cbed 100644 --- a/src/transformers/configuration_flaubert.py +++ b/src/transformers/configuration_flaubert.py @@ -109,11 +109,12 @@ class FlaubertConfig(XLMConfig): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers.XLMForSequenceClassification`. Is one of the following options: - - 'last' => take the last token hidden state (like XLNet) - - 'first' => take the first token hidden state (like Bert) - - 'mean' => take the mean of all tokens hidden states - - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - - 'attn' => Not implemented now, use multi-head attention + + - 'last' => take the last token hidden state (like XLNet) + - 'first' => take the first token hidden state (like Bert) + - 'mean' => take the mean of all tokens hidden states + - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) + - 'attn' => Not implemented now, use multi-head attention summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers.XLMForSequenceClassification`. diff --git a/src/transformers/configuration_gpt2.py b/src/transformers/configuration_gpt2.py index 7fff0b6c49..ed639dc18c 100644 --- a/src/transformers/configuration_gpt2.py +++ b/src/transformers/configuration_gpt2.py @@ -73,11 +73,12 @@ class GPT2Config(PretrainedConfig): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers.GPT2DoubleHeadsModel`. Is one of the following options: - - 'last' => take the last token hidden state (like XLNet) - - 'first' => take the first token hidden state (like Bert) - - 'mean' => take the mean of all tokens hidden states - - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - - 'attn' => Not implemented now, use multi-head attention + + - 'last' => take the last token hidden state (like XLNet) + - 'first' => take the first token hidden state (like Bert) + - 'mean' => take the mean of all tokens hidden states + - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) + - 'attn' => Not implemented now, use multi-head attention summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers.GPT2DoubleHeadsModel`. diff --git a/src/transformers/configuration_openai.py b/src/transformers/configuration_openai.py index d4a965bde1..528558144a 100644 --- a/src/transformers/configuration_openai.py +++ b/src/transformers/configuration_openai.py @@ -73,11 +73,12 @@ class OpenAIGPTConfig(PretrainedConfig): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers.OpenAIGPTDoubleHeadsModel`. Is one of the following options: - - 'last' => take the last token hidden state (like XLNet) - - 'first' => take the first token hidden state (like Bert) - - 'mean' => take the mean of all tokens hidden states - - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - - 'attn' => Not implemented now, use multi-head attention + + - 'last' => take the last token hidden state (like XLNet) + - 'first' => take the first token hidden state (like Bert) + - 'mean' => take the mean of all tokens hidden states + - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) + - 'attn' => Not implemented now, use multi-head attention summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers.OpenAIGPTDoubleHeadsModel`. diff --git a/src/transformers/configuration_xlm.py b/src/transformers/configuration_xlm.py index c4d61808d6..36b3bd0711 100644 --- a/src/transformers/configuration_xlm.py +++ b/src/transformers/configuration_xlm.py @@ -108,11 +108,12 @@ class XLMConfig(PretrainedConfig): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers.XLMForSequenceClassification`. Is one of the following options: - - 'last' => take the last token hidden state (like XLNet) - - 'first' => take the first token hidden state (like Bert) - - 'mean' => take the mean of all tokens hidden states - - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - - 'attn' => Not implemented now, use multi-head attention + + - 'last' => take the last token hidden state (like XLNet) + - 'first' => take the first token hidden state (like Bert) + - 'mean' => take the mean of all tokens hidden states + - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) + - 'attn' => Not implemented now, use multi-head attention summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers.XLMForSequenceClassification`. diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 400ecb33ff..47d7b2301f 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -1230,7 +1230,7 @@ class BertForMultipleChoice(BertPreTrainedModel): Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor`` of shape ``(1,)`, `optional`, returned when :obj:`labels` is provided): + loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): Classification loss. classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): `num_choices` is the second dimension of the input tensors. (see `input_ids` above). diff --git a/src/transformers/modeling_tf_bert.py b/src/transformers/modeling_tf_bert.py index 01bc1c2be7..1904623581 100644 --- a/src/transformers/modeling_tf_bert.py +++ b/src/transformers/modeling_tf_bert.py @@ -668,38 +668,39 @@ class TFBertModel(TFBertPreTrainedModel): @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" - Returns: + Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): - Last layer hidden-state of the first token of the sequence (classification token) - further processed by a Linear layer and a Tanh activation function. The Linear - layer weights are trained from the next sentence prediction (classification) - objective during Bert pretraining. This output is usually *not* a good summary - of the semantic content of the input, you're often better with averaging or pooling - the sequence of hidden-states for the whole input sequence. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. + last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. The Linear + layer weights are trained from the next sentence prediction (classification) + objective during Bert pretraining. This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - Examples:: - import tensorflow as tf - from transformers import BertTokenizer, TFBertModel + Examples:: - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = TFBertModel.from_pretrained('bert-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 - outputs = model(input_ids) - last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + import tensorflow as tf + from transformers import BertTokenizer, TFBertModel + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertModel.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.bert(inputs, **kwargs) return outputs diff --git a/src/transformers/tokenization_albert.py b/src/transformers/tokenization_albert.py index 224636c997..e85efbb6a9 100644 --- a/src/transformers/tokenization_albert.py +++ b/src/transformers/tokenization_albert.py @@ -19,6 +19,7 @@ import logging import os import unicodedata from shutil import copyfile +from typing import List, Optional from .tokenization_utils import PreTrainedTokenizer @@ -55,9 +56,55 @@ SPIECE_UNDERLINE = "▁" class AlbertTokenizer(PreTrainedTokenizer): """ - SentencePiece based tokenizer. Peculiarities: + Constructs an ALBERT tokenizer. Based on `SentencePiece `__ - - requires `SentencePiece `_ + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users + should refer to the superclass for more information regarding methods. + + Args: + vocab_file (:obj:`string`): + `SentencePiece `__ file (generally has a .spm extension) that + contains the vocabulary necessary to instantiate a tokenizer. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to lowercase the input when tokenizing. + remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to strip the text when tokenizing (removing excess spaces before and after the string). + keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to keep accents when tokenizing. + bos_token (:obj:`string`, `optional`, defaults to "[CLS]"): + The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the beginning + of sequence. The token used is the :obj:`cls_token`. + eos_token (:obj:`string`, `optional`, defaults to "[SEP]"): + The end of sequence token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the end + of sequence. The token used is the :obj:`sep_token`. + unk_token (:obj:`string`, `optional`, defaults to ""): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (:obj:`string`, `optional`, defaults to "[SEP]"): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences + for sequence classification or for a text and a question for question answering. + It is also used as the last token of a sequence built with special tokens. + pad_token (:obj:`string`, `optional`, defaults to ""): + The token used for padding, for example when batching sequences of different lengths. + cls_token (:obj:`string`, `optional`, defaults to "[CLS]"): + The classifier token which is used when doing sequence classification (classification of the whole + sequence instead of per-token classification). It is the first token of the sequence when built with + special tokens. + mask_token (:obj:`string`, `optional`, defaults to "[MASK]"): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + + Attributes: + sp_model (:obj:`SentencePieceProcessor`): + The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES @@ -185,17 +232,28 @@ class AlbertTokenizer(PreTrainedTokenizer): return self.sp_model.IdToPiece(index) def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (strings for sub-words) in a single string.""" out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. An ALBERT sequence has the following format: - single sequence: [CLS] X [SEP] - pair of sequences: [CLS] A [SEP] B [SEP] + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. """ sep = [self.sep_token_id] cls = [self.cls_token_id] @@ -203,27 +261,30 @@ class AlbertTokenizer(PreTrainedTokenizer): return cls + token_ids_0 + sep return cls + token_ids_0 + sep + token_ids_1 + sep - def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: - token_ids_0: list of ids (must not contain special tokens) - token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids - for sequence pairs - already_has_special_tokens: (default False) Set to True if the token list is already formated with - special tokens for the model + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True if the token list is already formatted with special tokens for the model Returns: - A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. + :obj:`List[int]`: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. """ if already_has_special_tokens: if token_ids_1 is not None: raise ValueError( "You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model." + "ids is already formatted with special tokens for the model." ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) @@ -231,14 +292,29 @@ class AlbertTokenizer(PreTrainedTokenizer): return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1] - def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT sequence pair mask has the following format: - 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence - if token_ids_1 is None, only returns the first portion of the mask (0's). + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + if token_ids_1 is None, only returns the first portion of the mask (0s). + + Args: + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). """ sep = [self.sep_token_id] cls = [self.cls_token_id] @@ -248,8 +324,15 @@ class AlbertTokenizer(PreTrainedTokenizer): return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] def save_vocabulary(self, save_directory): - """ Save the sentencepiece vocabulary (copy original file) and special tokens file - to a directory. + """ + Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. + + Args: + save_directory (:obj:`str`): + The directory in which to save the vocabulary. + + Returns: + :obj:`Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) diff --git a/src/transformers/tokenization_bert.py b/src/transformers/tokenization_bert.py index de74ee579c..159600a37c 100644 --- a/src/transformers/tokenization_bert.py +++ b/src/transformers/tokenization_bert.py @@ -19,6 +19,7 @@ import collections import logging import os import unicodedata +from typing import List, Optional from tokenizers import BertWordPieceTokenizer @@ -117,17 +118,41 @@ def whitespace_tokenize(text): class BertTokenizer(PreTrainedTokenizer): r""" - Constructs a BertTokenizer. - :class:`~transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece + Constructs a BERT tokenizer. Based on WordPiece. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users + should refer to the superclass for more information regarding methods. Args: - vocab_file: Path to a one-wordpiece-per-line vocabulary file - do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True - do_basic_tokenize: Whether to do basic tokenization before wordpiece. - max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the - minimum of this value (if specified) and the underlying BERT model's sequence length. - never_split: List of tokens which will never be split during tokenization. Only has an effect when - do_basic_tokenize=True + vocab_file (:obj:`string`): + File containing the vocabulary. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to lowercase the input when tokenizing. + do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to do basic tokenization before WordPiece. + never_split (:obj:`bool`, `optional`, defaults to :obj:`True`): + List of tokens which will never be split during tokenization. Only has an effect when + :obj:`do_basic_tokenize=True` + unk_token (:obj:`string`, `optional`, defaults to "[UNK]"): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (:obj:`string`, `optional`, defaults to "[SEP]"): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences + for sequence classification or for a text and a question for question answering. + It is also used as the last token of a sequence built with special tokens. + pad_token (:obj:`string`, `optional`, defaults to "[PAD]"): + The token used for padding, for example when batching sequences of different lengths. + cls_token (:obj:`string`, `optional`, defaults to "[CLS]"): + The classifier token which is used when doing sequence classification (classification of the whole + sequence instead of per-token classification). It is the first token of the sequence when built with + special tokens. + mask_token (:obj:`string`, `optional`, defaults to "[MASK]"): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to tokenize Chinese characters. + This should likely be deactivated for Japanese: + see: https://github.com/huggingface/transformers/issues/328 """ vocab_files_names = VOCAB_FILES_NAMES @@ -149,23 +174,6 @@ class BertTokenizer(PreTrainedTokenizer): tokenize_chinese_chars=True, **kwargs ): - """Constructs a BertTokenizer. - - Args: - **vocab_file**: Path to a one-wordpiece-per-line vocabulary file - **do_lower_case**: (`optional`) boolean (default True) - Whether to lower case the input - Only has an effect when do_basic_tokenize=True - **do_basic_tokenize**: (`optional`) boolean (default True) - Whether to do basic tokenization before wordpiece. - **never_split**: (`optional`) list of string - List of tokens which will never be split during tokenization. - Only has an effect when do_basic_tokenize=True - **tokenize_chinese_chars**: (`optional`) boolean (default True) - Whether to tokenize Chinese characters. - This should likely be deactivated for Japanese: - see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 - """ super().__init__( unk_token=unk_token, sep_token=sep_token, @@ -221,13 +229,25 @@ class BertTokenizer(PreTrainedTokenizer): out_string = " ".join(tokens).replace(" ##", "").strip() return out_string - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A BERT sequence has the following format: - single sequence: [CLS] X [SEP] - pair of sequences: [CLS] A [SEP] B [SEP] + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. """ if token_ids_1 is None: return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] @@ -235,20 +255,23 @@ class BertTokenizer(PreTrainedTokenizer): sep = [self.sep_token_id] return cls + token_ids_0 + sep + token_ids_1 + sep - def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: - token_ids_0: list of ids (must not contain special tokens) - token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids - for sequence pairs - already_has_special_tokens: (default False) Set to True if the token list is already formated with - special tokens for the model + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True if the token list is already formatted with special tokens for the model Returns: - A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + :obj:`List[int]`: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. """ if already_has_special_tokens: @@ -263,14 +286,29 @@ class BertTokenizer(PreTrainedTokenizer): return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1] - def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence pair mask has the following format: - 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence + + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | if token_ids_1 is None, only returns the first portion of the mask (0's). + + Args: + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). """ sep = [self.sep_token_id] cls = [self.cls_token_id] @@ -279,7 +317,16 @@ class BertTokenizer(PreTrainedTokenizer): return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] def save_vocabulary(self, vocab_path): - """Save the tokenizer vocabulary to a directory or file.""" + """ + Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. + + Args: + vocab_path (:obj:`str`): + The directory in which to save the vocabulary. + + Returns: + :obj:`Tuple(str)`: Paths to the files saved. + """ index = 0 if os.path.isdir(vocab_path): vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"]) diff --git a/src/transformers/tokenization_camembert.py b/src/transformers/tokenization_camembert.py index a158419470..cc4fdc650b 100644 --- a/src/transformers/tokenization_camembert.py +++ b/src/transformers/tokenization_camembert.py @@ -18,6 +18,7 @@ import logging import os from shutil import copyfile +from typing import List, Optional import sentencepiece as spm @@ -53,7 +54,50 @@ class CamembertTokenizer(PreTrainedTokenizer): Adapted from RobertaTokenizer and XLNetTokenizer SentencePiece based tokenizer. Peculiarities: - - requires `SentencePiece `_ + - requires `SentencePiece `_ + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users + should refer to the superclass for more information regarding methods. + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. + bos_token (:obj:`string`, `optional`, defaults to ""): + The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the beginning + of sequence. The token used is the :obj:`cls_token`. + eos_token (:obj:`string`, `optional`, defaults to ""): + The end of sequence token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the end + of sequence. The token used is the :obj:`sep_token`. + sep_token (:obj:`string`, `optional`, defaults to ""): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences + for sequence classification or for a text and a question for question answering. + It is also used as the last token of a sequence built with special tokens. + cls_token (:obj:`string`, `optional`, defaults to ""): + The classifier token which is used when doing sequence classification (classification of the whole + sequence instead of per-token classification). It is the first token of the sequence when built with + special tokens. + unk_token (:obj:`string`, `optional`, defaults to ""): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (:obj:`string`, `optional`, defaults to ""): + The token used for padding, for example when batching sequences of different lengths. + mask_token (:obj:`string`, `optional`, defaults to ""): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): + Additional special tokens used by the tokenizer. + + Attributes: + sp_model (:obj:`SentencePieceProcessor`): + The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES @@ -97,34 +141,50 @@ class CamembertTokenizer(PreTrainedTokenizer): self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.fairseq_tokens_to_ids) self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. - A RoBERTa sequence has the following format: - single sequence: X - pair of sequences: A B + A CamemBERT sequence has the following format: + + - single sequence: `` X `` + - pair of sequences: `` A B `` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. """ + if token_ids_1 is None: return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] cls = [self.cls_token_id] sep = [self.sep_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep - def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: - token_ids_0: list of ids (must not contain special tokens) - token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids - for sequence pairs - already_has_special_tokens: (default False) Set to True if the token list is already formated with - special tokens for the model + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True if the token list is already formatted with special tokens for the model Returns: - A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + :obj:`List[int]`: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. """ if already_has_special_tokens: if token_ids_1 is not None: @@ -138,14 +198,29 @@ class CamembertTokenizer(PreTrainedTokenizer): return [1] + ([0] * len(token_ids_0)) + [1] return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] - def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. - A RoBERTa sequence pair mask has the following format: - 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence + A CamemBERT sequence pair mask has the following format: - if token_ids_1 is None, only returns the first portion of the mask (0's). + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | | second sequence | + + if token_ids_1 is None, only returns the first portion of the mask (0s). + + Args: + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). """ sep = [self.sep_token_id] cls = [self.cls_token_id] @@ -200,8 +275,15 @@ class CamembertTokenizer(PreTrainedTokenizer): return out_string def save_vocabulary(self, save_directory): - """ Save the sentencepiece vocabulary (copy original file) and special tokens file - to a directory. + """ + Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. + + Args: + save_directory (:obj:`str`): + The directory in which to save the vocabulary. + + Returns: + :obj:`Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) diff --git a/src/transformers/tokenization_ctrl.py b/src/transformers/tokenization_ctrl.py index 691824b92b..5c487952c4 100644 --- a/src/transformers/tokenization_ctrl.py +++ b/src/transformers/tokenization_ctrl.py @@ -116,8 +116,21 @@ def get_pairs(word): class CTRLTokenizer(PreTrainedTokenizer): """ - CTRL BPE tokenizer. Peculiarities: - - Byte-Pair-Encoding + Constructs a CTRL tokenizer. Peculiarities: + + - Byte-Pair-Encoding + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users + should refer to the superclass for more information regarding methods. + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. + merges_file (:obj:`str`): + Path to the merges file. + unk_token (:obj:`string`, `optional`, defaults to ""): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. """ vocab_files_names = VOCAB_FILES_NAMES @@ -219,7 +232,16 @@ class CTRLTokenizer(PreTrainedTokenizer): return out_string def save_vocabulary(self, save_directory): - """Save the tokenizer vocabulary and merge files to a directory.""" + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (:obj:`str`): + The directory in which to save the vocabulary. + + Returns: + :obj:`Tuple(str)`: Paths to the files saved. + """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return diff --git a/src/transformers/tokenization_distilbert.py b/src/transformers/tokenization_distilbert.py index 8ca0a531d3..7a5bf34367 100644 --- a/src/transformers/tokenization_distilbert.py +++ b/src/transformers/tokenization_distilbert.py @@ -58,16 +58,11 @@ PRETRAINED_INIT_CONFIGURATION = { class DistilBertTokenizer(BertTokenizer): r""" Constructs a DistilBertTokenizer. - :class:`~transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece + :class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end + tokenization: punctuation splitting + wordpiece. - Args: - vocab_file: Path to a one-wordpiece-per-line vocabulary file - do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True - do_basic_tokenize: Whether to do basic tokenization before wordpiece. - max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the - minimum of this value (if specified) and the underlying BERT model's sequence length. - never_split: List of tokens which will never be split during tokenization. Only has an effect when - do_basic_tokenize=True + Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning + parameters. """ vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/tokenization_flaubert.py b/src/transformers/tokenization_flaubert.py index e648a61c94..dd0115b0cd 100644 --- a/src/transformers/tokenization_flaubert.py +++ b/src/transformers/tokenization_flaubert.py @@ -80,14 +80,14 @@ class FlaubertTokenizer(XLMTokenizer): """ BPE tokenizer for Flaubert - - Moses preprocessing & tokenization + - Moses preprocessing & tokenization + - Normalize all inputs text + - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \ + (ex: "__classify__") to a vocabulary + - `do_lowercase` controle lower casing (automatically set for pretrained vocabularies) - - Normalize all inputs text - - - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \ - (ex: "__classify__") to a vocabulary - - - `do_lowercase` controle lower casing (automatically set for pretrained vocabularies) + This tokenizer inherits from :class:`~transformers.XLMTokenizer`. Please check the superclass for usage examples + and documentation regarding arguments. """ vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/tokenization_gpt2.py b/src/transformers/tokenization_gpt2.py index 961797b97a..45331445b1 100644 --- a/src/transformers/tokenization_gpt2.py +++ b/src/transformers/tokenization_gpt2.py @@ -101,11 +101,35 @@ def get_pairs(word): class GPT2Tokenizer(PreTrainedTokenizer): """ GPT-2 BPE tokenizer. Peculiarities: - - Byte-level Byte-Pair-Encoding - - Requires a space to start the input string => the encoding and tokenize methods should be called with the - ``add_prefix_space`` flag set to ``True``. - Otherwise, this tokenizer's ``encode``, ``decode``, and ``tokenize`` methods will not conserve - the spaces at the beginning of a string: `tokenizer.decode(tokenizer.encode(" Hello")) = "Hello"` + + - Byte-level Byte-Pair-Encoding + - Requires a space to start the input string => the encoding methods should be called with the + ``add_prefix_space`` flag set to ``True``. + Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve + the absence of a space at the beginning of a string: + + :: + + tokenizer.decode(tokenizer.encode("Hello")) = " Hello" + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users + should refer to the superclass for more information regarding methods. + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. + merges_file (:obj:`str`): + Path to the merges file. + errors (:obj:`str`, `optional`, defaults to "replace"): + Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode + `__ for more information. + unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): + The beginning of sequence token. + eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): + The end of sequence token. """ vocab_files_names = VOCAB_FILES_NAMES @@ -219,7 +243,16 @@ class GPT2Tokenizer(PreTrainedTokenizer): return text def save_vocabulary(self, save_directory): - """Save the tokenizer vocabulary and merge files to a directory.""" + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (:obj:`str`): + The directory in which to save the vocabulary. + + Returns: + :obj:`Tuple(str)`: Paths to the files saved. + """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return diff --git a/src/transformers/tokenization_openai.py b/src/transformers/tokenization_openai.py index 912ab852a7..ea0f52a806 100644 --- a/src/transformers/tokenization_openai.py +++ b/src/transformers/tokenization_openai.py @@ -82,8 +82,21 @@ def text_standardize(text): class OpenAIGPTTokenizer(PreTrainedTokenizer): """ BPE tokenizer. Peculiarities: - - lower case all inputs - - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not. + + - lower case all inputs + - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users + should refer to the superclass for more information regarding methods. + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. + merges_file (:obj:`str`): + Path to the merges file. + unk_token (:obj:`string`, `optional`, defaults to ""): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. """ vocab_files_names = VOCAB_FILES_NAMES @@ -201,7 +214,16 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): return out_string def save_vocabulary(self, save_directory): - """Save the tokenizer vocabulary and merge files to a directory.""" + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (:obj:`str`): + The directory in which to save the vocabulary. + + Returns: + :obj:`Tuple(str)`: Paths to the files saved. + """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return diff --git a/src/transformers/tokenization_roberta.py b/src/transformers/tokenization_roberta.py index fda82fb307..7275ceb4ca 100644 --- a/src/transformers/tokenization_roberta.py +++ b/src/transformers/tokenization_roberta.py @@ -16,6 +16,7 @@ import logging +from typing import List, Optional from tokenizers.processors import RobertaProcessing @@ -60,12 +61,59 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { class RobertaTokenizer(GPT2Tokenizer): """ - RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: - - Byte-level Byte-Pair-Encoding - - Requires a space to start the input string => the encoding methods should be called with the - ``add_prefix_space`` flag set to ``True``. - Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve - the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"` + Constructs a RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: + + - Byte-level Byte-Pair-Encoding + - Requires a space to start the input string => the encoding methods should be called with the + ``add_prefix_space`` flag set to ``True``. + Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve + the absence of a space at the beginning of a string: + + :: + + tokenizer.decode(tokenizer.encode("Hello")) = " Hello" + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users + should refer to the superclass for more information regarding methods. + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. + merges_file (:obj:`str`): + Path to the merges file. + errors (:obj:`str`, `optional`, defaults to "replace"): + Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode + `__ for more information. + bos_token (:obj:`string`, `optional`, defaults to ""): + The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the beginning + of sequence. The token used is the :obj:`cls_token`. + eos_token (:obj:`string`, `optional`, defaults to ""): + The end of sequence token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the end + of sequence. The token used is the :obj:`sep_token`. + sep_token (:obj:`string`, `optional`, defaults to ""): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences + for sequence classification or for a text and a question for question answering. + It is also used as the last token of a sequence built with special tokens. + cls_token (:obj:`string`, `optional`, defaults to ""): + The classifier token which is used when doing sequence classification (classification of the whole + sequence instead of per-token classification). It is the first token of the sequence when built with + special tokens. + unk_token (:obj:`string`, `optional`, defaults to ""): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (:obj:`string`, `optional`, defaults to ""): + The token used for padding, for example when batching sequences of different lengths. + mask_token (:obj:`string`, `optional`, defaults to ""): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. """ vocab_files_names = VOCAB_FILES_NAMES @@ -102,13 +150,25 @@ class RobertaTokenizer(GPT2Tokenizer): self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A RoBERTa sequence has the following format: - single sequence: X - pair of sequences: A B + + - single sequence: `` X `` + - pair of sequences: `` A B `` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. """ if token_ids_1 is None: return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] @@ -116,20 +176,23 @@ class RobertaTokenizer(GPT2Tokenizer): sep = [self.sep_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep - def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: - token_ids_0: list of ids (must not contain special tokens) - token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids - for sequence pairs - already_has_special_tokens: (default False) Set to True if the token list is already formated with - special tokens for the model + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True if the token list is already formatted with special tokens for the model Returns: - A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + :obj:`List[int]`: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. """ if already_has_special_tokens: if token_ids_1 is not None: @@ -143,12 +206,22 @@ class RobertaTokenizer(GPT2Tokenizer): return [1] + ([0] * len(token_ids_0)) + [1] return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] - def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not make use of token type ids, therefore a list of zeros is returned. - if token_ids_1 is None, only returns the first portion of the mask (0's). + Args: + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of zeros. + """ sep = [self.sep_token_id] cls = [self.cls_token_id] diff --git a/src/transformers/tokenization_transfo_xl.py b/src/transformers/tokenization_transfo_xl.py index c09cc5acd6..a26d9b6371 100644 --- a/src/transformers/tokenization_transfo_xl.py +++ b/src/transformers/tokenization_transfo_xl.py @@ -72,6 +72,9 @@ CORPUS_NAME = "corpus.bin" class TransfoXLTokenizer(PreTrainedTokenizer): """ Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users + should refer to the superclass for more information regarding methods. """ vocab_files_names = VOCAB_FILES_NAMES @@ -189,7 +192,16 @@ class TransfoXLTokenizer(PreTrainedTokenizer): raise ValueError("No token in vocabulary") def save_vocabulary(self, vocab_path): - """Save the tokenizer vocabulary to a directory or file.""" + """ + Save the vocabulary and special tokens file to a directory. + + Args: + vocab_path (:obj:`str`): + The directory in which to save the vocabulary. + + Returns: + :obj:`Tuple(str)`: Paths to the files saved. + """ logger.warning( "Please note you will not be able to load the save vocabulary in" diff --git a/src/transformers/tokenization_xlm.py b/src/transformers/tokenization_xlm.py index 93b8092abc..5afe1d29f0 100644 --- a/src/transformers/tokenization_xlm.py +++ b/src/transformers/tokenization_xlm.py @@ -21,6 +21,7 @@ import os import re import sys import unicodedata +from typing import List, Optional import sacremoses as sm @@ -530,20 +531,59 @@ class XLMTokenizer(PreTrainedTokenizer): """ BPE tokenizer for XLM - - Moses preprocessing & tokenization for most supported languages + - Moses preprocessing & tokenization for most supported languages + - Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP) + - (optionally) lower case & normalize all inputs text + - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \ + (ex: "__classify__") to a vocabulary + - `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies) + - `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies) - - Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP) + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users + should refer to the superclass for more information regarding methods. - - (optionally) lower case & normalize all inputs text + Args: + vocab_file (:obj:`string`): + Vocabulary file. + merges_file (:obj:`string`): + Merges file. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to lowercase the input when tokenizing. + remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to strip the text when tokenizing (removing excess spaces before and after the string). + keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to keep accents when tokenizing. + unk_token (:obj:`string`, `optional`, defaults to ""): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (:obj:`string`, `optional`, defaults to ""): + The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. - - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \ - (ex: "__classify__") to a vocabulary + .. note:: - - `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies) - - - `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies) - - - `do_lowercase_and_remove_accent` controle lower casing and accent (automatically set for pretrained vocabularies) + When building a sequence using special tokens, this is not the token that is used for the beginning + of sequence. The token used is the :obj:`cls_token`. + sep_token (:obj:`string`, `optional`, defaults to ""): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences + for sequence classification or for a text and a question for question answering. + It is also used as the last token of a sequence built with special tokens. + pad_token (:obj:`string`, `optional`, defaults to ""): + The token used for padding, for example when batching sequences of different lengths. + cls_token (:obj:`string`, `optional`, defaults to ""): + The classifier token which is used when doing sequence classification (classification of the whole + sequence instead of per-token classification). It is the first token of the sequence when built with + special tokens. + mask_token (:obj:`string`, `optional`, defaults to ""): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["","","","","","","","","",""]`): + List of additional special tokens. + lang2id (:obj:`Dict[str, int]`, `optional`, defaults to :obj:`None`): + Dictionary mapping languages string identifiers to their IDs. + id2lang (:obj:`Dict[int, str`, `optional`, defaults to :obj:`None`): + Dictionary mapping language IDs to their string identifiers. + do_lowercase_and_remove_accent (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to lowercase and remove accents when tokenizing. """ vocab_files_names = VOCAB_FILES_NAMES @@ -812,13 +852,26 @@ class XLMTokenizer(PreTrainedTokenizer): out_string = "".join(tokens).replace("", " ").strip() return out_string - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A XLM sequence has the following format: - single sequence: X - pair of sequences: A B + + - single sequence: `` X `` + - pair of sequences: `` A B `` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ if token_ids_1 is None: return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] @@ -826,20 +879,23 @@ class XLMTokenizer(PreTrainedTokenizer): cls = [self.cls_token_id] return cls + token_ids_0 + sep + token_ids_1 + sep - def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: - token_ids_0: list of ids (must not contain special tokens) - token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids - for sequence pairs - already_has_special_tokens: (default False) Set to True if the token list is already formated with - special tokens for the model + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True if the token list is already formatted with special tokens for the model Returns: - A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + :obj:`List[int]`: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. """ if already_has_special_tokens: @@ -854,14 +910,29 @@ class XLMTokenizer(PreTrainedTokenizer): return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1] - def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An XLM sequence pair mask has the following format: - 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence - if token_ids_1 is None, only returns the first portion of the mask (0's). + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + if token_ids_1 is None, only returns the first portion of the mask (0s). + + Args: + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). """ sep = [self.sep_token_id] cls = [self.cls_token_id] @@ -870,7 +941,16 @@ class XLMTokenizer(PreTrainedTokenizer): return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] def save_vocabulary(self, save_directory): - """Save the tokenizer vocabulary and merge files to a directory.""" + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (:obj:`str`): + The directory in which to save the vocabulary. + + Returns: + :obj:`Tuple(str)`: Paths to the files saved. + """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return diff --git a/src/transformers/tokenization_xlm_roberta.py b/src/transformers/tokenization_xlm_roberta.py index 1e903d8a2b..810ef6c4a7 100644 --- a/src/transformers/tokenization_xlm_roberta.py +++ b/src/transformers/tokenization_xlm_roberta.py @@ -18,6 +18,7 @@ import logging import os from shutil import copyfile +from typing import List, Optional from transformers.tokenization_utils import PreTrainedTokenizer @@ -54,7 +55,50 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): Adapted from RobertaTokenizer and XLNetTokenizer SentencePiece based tokenizer. Peculiarities: - - requires `SentencePiece `_ + - requires `SentencePiece `_ + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users + should refer to the superclass for more information regarding methods. + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. + bos_token (:obj:`string`, `optional`, defaults to ""): + The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the beginning + of sequence. The token used is the :obj:`cls_token`. + eos_token (:obj:`string`, `optional`, defaults to ""): + The end of sequence token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the end + of sequence. The token used is the :obj:`sep_token`. + sep_token (:obj:`string`, `optional`, defaults to ""): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences + for sequence classification or for a text and a question for question answering. + It is also used as the last token of a sequence built with special tokens. + cls_token (:obj:`string`, `optional`, defaults to ""): + The classifier token which is used when doing sequence classification (classification of the whole + sequence instead of per-token classification). It is the first token of the sequence when built with + special tokens. + unk_token (:obj:`string`, `optional`, defaults to ""): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (:obj:`string`, `optional`, defaults to ""): + The token used for padding, for example when batching sequences of different lengths. + mask_token (:obj:`string`, `optional`, defaults to ""): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): + Additional special tokens used by the tokenizer. + + Attributes: + sp_model (:obj:`SentencePieceProcessor`): + The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES @@ -132,35 +176,52 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. - A RoBERTa sequence has the following format: - single sequence: X - pair of sequences: A B + A XLM-R sequence has the following format: + + - single sequence: `` X `` + - pair of sequences: `` A B `` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. """ + if token_ids_1 is None: return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] cls = [self.cls_token_id] sep = [self.sep_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep - def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: - token_ids_0: list of ids (must not contain special tokens) - token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids - for sequence pairs - already_has_special_tokens: (default False) Set to True if the token list is already formated with - special tokens for the model + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True if the token list is already formatted with special tokens for the model Returns: - A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + :obj:`List[int]`: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. """ + if already_has_special_tokens: if token_ids_1 is not None: raise ValueError( @@ -173,12 +234,24 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): return [1] + ([0] * len(token_ids_0)) + [1] return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] - def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. - RoBERTa does not make use of token type ids, therefore a list of zeros is returned. - if token_ids_1 is None, only returns the first portion of the mask (0's). + XLM-R does not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of zeros. + """ + sep = [self.sep_token_id] cls = [self.cls_token_id] @@ -216,8 +289,15 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): return out_string def save_vocabulary(self, save_directory): - """ Save the sentencepiece vocabulary (copy original file) and special tokens file - to a directory. + """ + Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. + + Args: + save_directory (:obj:`str`): + The directory in which to save the vocabulary. + + Returns: + :obj:`Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) diff --git a/src/transformers/tokenization_xlnet.py b/src/transformers/tokenization_xlnet.py index 30fdfda22e..800ef09c99 100644 --- a/src/transformers/tokenization_xlnet.py +++ b/src/transformers/tokenization_xlnet.py @@ -19,6 +19,7 @@ import logging import os import unicodedata from shutil import copyfile +from typing import List, Optional from .tokenization_utils import PreTrainedTokenizer @@ -51,9 +52,57 @@ SEG_ID_PAD = 4 class XLNetTokenizer(PreTrainedTokenizer): """ - SentencePiece based tokenizer. Peculiarities: + Constructs an XLNet tokenizer. Based on `SentencePiece `__ - - requires `SentencePiece `_ + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users + should refer to the superclass for more information regarding methods. + + Args: + vocab_file (:obj:`string`): + `SentencePiece `__ file (generally has a .spm extension) that + contains the vocabulary necessary to instantiate a tokenizer. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to lowercase the input when tokenizing. + remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to strip the text when tokenizing (removing excess spaces before and after the string). + keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to keep accents when tokenizing. + bos_token (:obj:`string`, `optional`, defaults to ""): + The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the beginning + of sequence. The token used is the :obj:`cls_token`. + eos_token (:obj:`string`, `optional`, defaults to ""): + The end of sequence token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the end + of sequence. The token used is the :obj:`sep_token`. + unk_token (:obj:`string`, `optional`, defaults to ""): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (:obj:`string`, `optional`, defaults to ""): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences + for sequence classification or for a text and a question for question answering. + It is also used as the last token of a sequence built with special tokens. + pad_token (:obj:`string`, `optional`, defaults to ""): + The token used for padding, for example when batching sequences of different lengths. + cls_token (:obj:`string`, `optional`, defaults to ""): + The classifier token which is used when doing sequence classification (classification of the whole + sequence instead of per-token classification). It is the first token of the sequence when built with + special tokens. + mask_token (:obj:`string`, `optional`, defaults to ""): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["", ""]`): + Additional special tokens used by the tokenizer. + + Attributes: + sp_model (:obj:`SentencePieceProcessor`): + The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES @@ -189,13 +238,25 @@ class XLNetTokenizer(PreTrainedTokenizer): out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. An XLNet sequence has the following format: - single sequence: X - pair of sequences: A B + + - single sequence: ``X `` + - pair of sequences: ``A B `` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. """ sep = [self.sep_token_id] cls = [self.cls_token_id] @@ -203,20 +264,23 @@ class XLNetTokenizer(PreTrainedTokenizer): return token_ids_0 + sep + cls return token_ids_0 + sep + token_ids_1 + sep + cls - def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: - token_ids_0: list of ids (must not contain special tokens) - token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids - for sequence pairs - already_has_special_tokens: (default False) Set to True if the token list is already formated with - special tokens for the model + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True if the token list is already formatted with special tokens for the model Returns: - A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + :obj:`List[int]`: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. """ if already_has_special_tokens: @@ -231,7 +295,9 @@ class XLNetTokenizer(PreTrainedTokenizer): return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1] return ([0] * len(token_ids_0)) + [1, 1] - def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet sequence pair mask has the following format: @@ -239,6 +305,16 @@ class XLNetTokenizer(PreTrainedTokenizer): | first sequence | second sequence | CLS segment ID if token_ids_1 is None, only returns the first portion of the mask (0's). + + Args: + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). """ sep = [self.sep_token_id] cls_segment_id = [2] @@ -248,8 +324,15 @@ class XLNetTokenizer(PreTrainedTokenizer): return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id def save_vocabulary(self, save_directory): - """ Save the sentencepiece vocabulary (copy original file) and special tokens file - to a directory. + """ + Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. + + Args: + save_directory (:obj:`str`): + The directory in which to save the vocabulary. + + Returns: + :obj:`Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory))