Doc styling (#8067)

* Important files * Styling them all * Revert "Styling them all" This reverts commit 7d029395fdae8513b8281cbc2a6c239f8093503e. * Syling them for realsies * Fix syntax error * Fix benchmark_utils * More fixes * Fix modeling auto and script * Remove new line * Fixes * More fixes * Fix more files * Style * Add FSMT * More fixes * More fixes * More fixes * More fixes * Fixes * More fixes * More fixes * Last fixes * Make sphinx happy
2020-10-26 18:26:02 -04:00
parent 04a17f8550
commit 08f534d2da
271 changed files with 9726 additions and 8991 deletions
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -12,10 +12,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Base classes common to both the slow and the fast tokenization classes:
-    PreTrainedTokenizerBase (host all the user fronting encoding methodes)
-    Special token mixing (host the special tokens logic) and
-    BatchEncoding (wrap the dictionary of output with special method for the Fast tokenizers)
+"""
+Base classes common to both the slow and the fast tokenization classes: PreTrainedTokenizerBase (hosts all the
+user-facing encoding methods), SpecialTokensMixin (hosts the special tokens logic) and BatchEncoding (wraps the
+dictionary of output with special methods for the fast tokenizers).
 """

 import copy
@@ -58,8 +58,9 @@ else:

    @dataclass(frozen=True, eq=True)
    class AddedToken:
-        """AddedToken represents a token to be added to a Tokenizer
-        An AddedToken can have special options defining the way it should behave.
+        """
+        AddedToken represents a token to be added to a Tokenizer. An AddedToken can have special options defining the
+        way it should behave.
        """

        content: str = field(default_factory=str)
@@ -116,8 +117,8 @@ class ExplicitEnum(Enum):

 class TruncationStrategy(ExplicitEnum):
    """
-    Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`.
-    Useful for tab-completion in an IDE.
+    Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
+    tab-completion in an IDE.
    """

    ONLY_FIRST = "only_first"
@@ -128,8 +129,8 @@ class TruncationStrategy(ExplicitEnum):

 class PaddingStrategy(ExplicitEnum):
    """
-    Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`.
-    Useful for tab-completion in an IDE.
+    Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion
+    in an IDE.
    """

    LONGEST = "longest"
@@ -139,8 +140,8 @@ class PaddingStrategy(ExplicitEnum):

 class TensorType(ExplicitEnum):
    """
-    Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`.
-    Useful for tab-completion in an IDE.
+    Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
+    tab-completion in an IDE.
    """

    PYTORCH = "pt"
@@ -177,8 +178,7 @@ class TokenSpan(NamedTuple):

 def to_py_obj(obj):
    """
-    Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list
-    to a python list.
+    Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list.
    """
    if isinstance(obj, (list, tuple)):
        return [to_py_obj(o) for o in obj]
@@ -194,8 +194,8 @@ def to_py_obj(obj):

 class BatchEncoding(UserDict):
    """
-    Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus`
-    and :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.batch_encode` methods (tokens,
+    Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus` and
+    :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.batch_encode` methods (tokens,
    attention_masks, etc).

    This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes
@@ -242,8 +242,8 @@ class BatchEncoding(UserDict):

    def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:
        """
-        If the key is a string, returns the value of the dict associated to :obj:`key` ('input_ids',
-        'attention_mask', etc.).
+        If the key is a string, returns the value of the dict associated to :obj:`key` ('input_ids', 'attention_mask',
+        etc.).

        If the key is an integer, get the :obj:`tokenizers.Encoding` for batch item with index :obj:`key`.
        """
@@ -289,15 +289,15 @@ class BatchEncoding(UserDict):
    @property
    def encodings(self) -> Optional[List[EncodingFast]]:
        """
-        :obj:`Optional[List[tokenizers.Encoding]]`: The list all encodings from the tokenization process.
-        Returns :obj:`None` if the input was tokenized through Python (i.e., not a fast) tokenizer.
+        :obj:`Optional[List[tokenizers.Encoding]]`: The list all encodings from the tokenization process. Returns
+        :obj:`None` if the input was tokenized through Python (i.e., not a fast) tokenizer.
        """
        return self._encodings

    def tokens(self, batch_index: int = 0) -> List[str]:
        """
-        Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion
-        to integer indices) at a given batch index (only works for the output of a fast tokenizer).
+        Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to
+        integer indices) at a given batch index (only works for the output of a fast tokenizer).

        Args:
            batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.
@@ -327,25 +327,24 @@ class BatchEncoding(UserDict):

    def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
        """
-        Get the index of the word corresponding (i.e. comprising) to an encoded token
-        in a sequence of the batch.
+        Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch.

        Can be called as:

        - ``self.token_to_word(token_index)`` if batch size is 1
        - ``self.token_to_word(batch_index, token_index)`` if batch size is greater than 1

-        This method is particularly suited when the input sequences are provided as
-        pre-tokenized sequences (i.e., words are defined by the user). In this case it allows
-        to easily associate encoded tokens with provided tokenized words.
+        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
+        words are defined by the user). In this case it allows to easily associate encoded tokens with provided
+        tokenized words.

        Args:
            batch_or_token_index (:obj:`int`):
-                Index of the sequence in the batch. If the batch only comprise one sequence,
-                this can be the index of the token in the sequence.
+                Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
+                the token in the sequence.
            token_index (:obj:`int`, `optional`):
-                If a batch index is provided in `batch_or_token_index`, this can be the index
-                of the token in the sequence.
+                If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the
+                sequence.

        Returns:
            :obj:`int`: Index of the word in the input sequence.
@@ -378,22 +377,21 @@ class BatchEncoding(UserDict):
        - ``self.word_to_tokens(word_index)`` if batch size is 1
        - ``self.word_to_tokens(batch_index, word_index)`` if batch size is greater or equal to 1

-        This method is particularly suited when the input sequences are provided as
-        pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
-        to easily associate encoded tokens with provided tokenized words.
+        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
+        are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
+        words.

        Args:
            batch_or_word_index (:obj:`int`):
-                Index of the sequence in the batch. If the batch only comprises one sequence,
-                this can be the index of the word in the sequence.
+                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
+                the word in the sequence.
            word_index (:obj:`int`, `optional`):
-                If a batch index is provided in `batch_or_token_index`, this can be the index
-                of the word in the sequence.
+                If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
+                sequence.

        Returns:
-            Optional :class:`~transformers.tokenization_utils_base.TokenSpan`
-            Span of tokens in the encoded sequence. Returns :obj:`None` if no tokens correspond
-            to the word.
+            Optional :class:`~transformers.tokenization_utils_base.TokenSpan`: Span of tokens in the encoded
+            sequence. Returns :obj:`None` if no tokens correspond to the word.
        """

        if not self._encodings:
@@ -427,15 +425,14 @@ class BatchEncoding(UserDict):

        Args:
            batch_or_token_index (:obj:`int`):
-                Index of the sequence in the batch. If the batch only comprise one sequence,
-                this can be the index of the token in the sequence.
+                Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
+                the token in the sequence.
            token_index (:obj:`int`, `optional`):
-                If a batch index is provided in `batch_or_token_index`, this can be the index
-                of the token or tokens in the sequence.
+                If a batch index is provided in `batch_or_token_index`, this can be the index of the token or tokens in
+                the sequence.

        Returns:
-            :class:`~transformers.tokenization_utils_base.CharSpan`:
-            Span of characters in the original string.
+            :class:`~transformers.tokenization_utils_base.CharSpan`: Span of characters in the original string.
        """

        if not self._encodings:
@@ -449,25 +446,25 @@ class BatchEncoding(UserDict):

    def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
        """
-        Get the index of the token in the encoded output comprising a character
-        in the original string for a sequence of the batch.
+        Get the index of the token in the encoded output comprising a character in the original string for a sequence
+        of the batch.

        Can be called as:

        - ``self.char_to_token(char_index)`` if batch size is 1
        - ``self.char_to_token(batch_index, char_index)`` if batch size is greater or equal to 1

-        This method is particularly suited when the input sequences are provided as
-        pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
-        to easily associate encoded tokens with provided tokenized words.
+        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
+        are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
+        words.

        Args:
            batch_or_char_index (:obj:`int`):
-                Index of the sequence in the batch. If the batch only comprise one sequence,
-                this can be the index of the word in the sequence
+                Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
+                the word in the sequence
            char_index (:obj:`int`, `optional`):
-                If a batch index is provided in `batch_or_token_index`, this can be the index
-                of the word in the sequence.
+                If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
+                sequence.


        Returns:
@@ -485,8 +482,7 @@ class BatchEncoding(UserDict):

    def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = None) -> CharSpan:
        """
-        Get the character span in the original string corresponding to given word in a sequence
-        of the batch.
+        Get the character span in the original string corresponding to given word in a sequence of the batch.

        Character spans are returned as a CharSpan NamedTuple with:

@@ -500,19 +496,19 @@ class BatchEncoding(UserDict):

        Args:
            batch_or_word_index (:obj:`int`):
-                Index of the sequence in the batch. If the batch only comprise one sequence,
-                this can be the index of the word in the sequence
+                Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
+                the word in the sequence
            word_index (:obj:`int`, `optional`):
-                If a batch index is provided in `batch_or_token_index`, this can be the index
-                of the word in the sequence.
+                If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
+                sequence.

        Returns:
-            :obj:`CharSpan` or :obj:`List[CharSpan]`:
-                Span(s) of the associated character or characters in the string.
-                CharSpan are NamedTuple with:
+            :obj:`CharSpan` or :obj:`List[CharSpan]`: Span(s) of the associated character or characters in the string.
+            CharSpan are NamedTuple with:

                - start: index of the first character associated to the token in the original string
-                - end: index of the character following the last character associated to the token in the original string
+                - end: index of the character following the last character associated to the token in the original
+                  string
        """

        if not self._encodings:
@@ -526,30 +522,29 @@ class BatchEncoding(UserDict):

    def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
        """
-        Get the word in the original string corresponding to a character in the original string of
-        a sequence of the batch.
+        Get the word in the original string corresponding to a character in the original string of a sequence of the
+        batch.

        Can be called as:

        - ``self.char_to_word(char_index)`` if batch size is 1
        - ``self.char_to_word(batch_index, char_index)`` if batch size is greater than 1

-        This method is particularly suited when the input sequences are provided as
-        pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
-        to easily associate encoded tokens with provided tokenized words.
+        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
+        are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
+        words.

        Args:
            batch_or_char_index (:obj:`int`):
-                Index of the sequence in the batch. If the batch only comprise one sequence,
-                this can be the index of the character in the orginal string.
+                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
+                the character in the original string.
            char_index (:obj:`int`, `optional`):
-                If a batch index is provided in `batch_or_token_index`, this can be the index
-                of the character in the orginal string.
+                If a batch index is provided in `batch_or_char_index`, this can be the index of the character in the
+                original string.


        Returns:
-            :obj:`int` or :obj:`List[int]`:
-                Index or indices of the associated encoded token(s).
+            :obj:`int` or :obj:`List[int]`: Index or indices of the associated encoded token(s).
        """

        if not self._encodings:
@@ -642,8 +637,8 @@ class BatchEncoding(UserDict):
            device (:obj:`str` or :obj:`torch.device`): The device to put the tensors on.

        Returns:
-            :class:`~transformers.BatchEncoding`:
-            The same instance of :class:`~transformers.BatchEncoding` after modification.
+            :class:`~transformers.BatchEncoding`: The same instance of :class:`~transformers.BatchEncoding` after
+            modification.
        """
        self.data = {k: v.to(device) for k, v in self.data.items()}
        return self
@@ -651,8 +646,8 @@ class BatchEncoding(UserDict):

 class SpecialTokensMixin:
    """
-    A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`
-    to handle specific behaviors related to special tokens. In particular, this class hold the attributes which can be
+    A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` to
+    handle specific behaviors related to special tokens. In particular, this class holds the attributes which can be
    used to directly access these special tokens in a model-independent manner and allows setting and updating the special
    tokens.

@@ -874,8 +869,8 @@ class SpecialTokensMixin:
    @property
    def sep_token(self) -> str:
        """
-        :obj:`str`: Separation token, to separate context and query in an input sequence.
-        Log an error if used while not having been set.
+        :obj:`str`: Separation token, to separate context and query in an input sequence. Log an error if used while
+        not having been set.
        """
        if self._sep_token is None and self.verbose:
            logger.error("Using sep_token, but it is not set yet.")
@@ -895,8 +890,8 @@ class SpecialTokensMixin:
    @property
    def cls_token(self) -> str:
        """
-        :obj:`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along
-        the full depth of the model. Log an error if used while not having been set.
+        :obj:`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the
+        full depth of the model. Log an error if used while not having been set.
        """
        if self._cls_token is None and self.verbose:
            logger.error("Using cls_token, but it is not set yet.")
@@ -1039,8 +1034,8 @@ class SpecialTokensMixin:
    @property
    def additional_special_tokens_ids(self) -> List[int]:
        """
-        :obj:`List[int]`: Ids of all the additional special tokens in the vocabulary.
-        Log an error if used while not having been set.
+        :obj:`List[int]`: Ids of all the additional special tokens in the vocabulary. Log an error if used while not
+        having been set.
        """
        return self.convert_tokens_to_ids(self.additional_special_tokens)

@@ -1079,8 +1074,8 @@ class SpecialTokensMixin:
    @property
    def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
        """
-        :obj:`Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes
-        (:obj:`cls_token`, :obj:`unk_token`, etc.) to their values (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).
+        :obj:`Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes (:obj:`cls_token`,
+        :obj:`unk_token`, etc.) to their values (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).

        Convert potential tokens of :obj:`tokenizers.AddedToken` type to string.
        """
@@ -1199,8 +1194,8 @@ ENCODE_KWARGS_DOCSTRING = r"""

 ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
            return_token_type_ids (:obj:`bool`, `optional`):
-                Whether to return token type IDs. If left to the default, will return the token type IDs according
-                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+                Whether to return token type IDs. If left to the default, will return the token type IDs according to
+                the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.

                `What are token type IDs? <../glossary.html#token-type-ids>`__
            return_attention_mask (:obj:`bool`, `optional`):
@@ -1230,14 +1225,17 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
            - **input_ids** -- List of token ids to be fed to a model.

              `What are input IDs? <../glossary.html#input-ids>`__
+
            - **token_type_ids** -- List of token type ids to be fed to a model (when :obj:`return_token_type_ids=True`
              or if `"token_type_ids"` is in :obj:`self.model_input_names`).

              `What are token type IDs? <../glossary.html#token-type-ids>`__
+
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`).

              `What are attention masks? <../glossary.html#attention-mask>`__
+
            - **overflowing_tokens** -- List of overflowing tokens sequences (when a :obj:`max_length` is specified and
              :obj:`return_overflowing_tokens=True`).
            - **num_truncated_tokens** -- Number of tokens truncated (when a :obj:`max_length` is specified and
@@ -1249,6 +1247,7 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""

 INIT_TOKENIZER_DOCSTRING = r"""
    Class attributes (overridden by derived classes)
+
        - **vocab_files_names** (:obj:`Dict[str, str]`) -- A dictionary with, as keys, the ``__init__`` keyword name of
          each vocabulary file required by the model, and as associated values, the filename for saving the associated
          file (string).
@@ -1260,8 +1259,8 @@ INIT_TOKENIZER_DOCSTRING = r"""
          :obj:`short-cut-names` of the pretrained models, and as associated values, the maximum length of the sequence
          inputs of this model, or :obj:`None` if the model has no maximum input size.
        - **pretrained_init_configuration** (:obj:`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the
-          :obj:`short-cut-names` of the pretrained models, and as associated values, a dictionary of specific
-          arguments to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading the
+          :obj:`short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments
+          to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading the
          tokenizer with the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`
          method.
        - **model_input_names** (:obj:`List[str]`) -- A list of inputs expected in the forward pass of the model.
@@ -1270,11 +1269,10 @@ INIT_TOKENIZER_DOCSTRING = r"""

    Args:
        model_max_length (:obj:`int`, `optional`):
-            The maximum length (in number of tokens) for the inputs to the transformer model.
-            When the tokenizer is loaded with
-            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`, this will be set to
-            the value stored for the associated model in ``max_model_input_sizes`` (see above). If no value is
-            provided, will default to VERY_LARGE_INTEGER (:obj:`int(1e30)`).
+            The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is
+            loaded with :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`, this
+            will be set to the value stored for the associated model in ``max_model_input_sizes`` (see above). If no
+            value is provided, will default to VERY_LARGE_INTEGER (:obj:`int(1e30)`).
        padding_side: (:obj:`str`, `optional`):
            The side on which the model should have padding applied. Should be selected between ['right', 'left'].
            Default value is picked from the class attribute of the same name.
@@ -1319,13 +1317,13 @@ PREPARE_SEQ2SEQ_BATCH_DOCSTRING = """
            tgt_texts (:obj:`list`, `optional`):
                List of summaries or target language texts.
            max_length (:obj:`int`, `optional`):
-                Controls the maximum length for encoder inputs (documents to summarize or source language texts)
-                If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
-                length is required by one of the truncation/padding parameters. If the model has no specific maximum
-                input length (like XLNet) truncation/padding to a maximum length will be deactivated.
+                Controls the maximum length for encoder inputs (documents to summarize or source language texts). If
+                left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length
+                is required by one of the truncation/padding parameters. If the model has no specific maximum input
+                length (like XLNet) truncation/padding to a maximum length will be deactivated.
            max_target_length (:obj:`int`, `optional`):
-                Controls the maximum length of decoder inputs (target language texts or summaries)
-                If left unset or set to :obj:`None`, this will use the max_length value.
+                Controls the maximum length of decoder inputs (target language texts or summaries). If left unset or
+                set to :obj:`None`, this will use the max_length value.
            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
                Activates and controls padding. Accepts the following values:

@@ -1366,8 +1364,8 @@ PREPARE_SEQ2SEQ_BATCH_DOCSTRING = """
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
            - **labels** -- List of token ids for tgt_texts.

-            The full set of keys ``[input_ids, attention_mask, labels]``,
-            will only be returned if tgt_texts is passed. Otherwise, input_ids, attention_mask will be the only keys.
+            The full set of keys ``[input_ids, attention_mask, labels]``, will only be returned if tgt_texts is passed.
+            Otherwise, input_ids, attention_mask will be the only keys.

 """

@@ -1515,9 +1513,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
                Whether or not to delete incompletely received files. Attempt to resume the download if such a file
                exists.
            proxies (:obj:`Dict[str, str]`, `optional`):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.,
-                :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each
-                request.
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            inputs (additional positional arguments, `optional`):
                Will be passed along to the Tokenizer ``__init__`` method.
            kwargs (additional keyword arguments, `optional`):
@@ -1792,10 +1789,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method.

        .. Note::
-            A "fast" tokenizer (instance of :class:`transformers.PreTrainedTokenizerFast`) saved with
-            this method will not be possible to load back
-            in a "slow" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizer` instance. It can only be loaded
-            in a "fast" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizerFast` instance.
+            A "fast" tokenizer (instance of :class:`transformers.PreTrainedTokenizerFast`) saved with this method will
+            not be possible to load back in a "slow" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizer`
+            instance. It can only be loaded in a "fast" tokenizer, i.e. in a
+            :class:`transformers.PreTrainedTokenizerFast` instance.

        .. Warning::
           This won't save modifications you may have applied to the tokenizer after the instantiation (for instance,
@@ -1804,10 +1801,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        Args:
            save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
            legacy_format (:obj:`bool`, `optional`, defaults to :obj:`True`):
-                Whether to save the tokenizer in legacy format (default), i.e. with tokenizer specific vocabulary and
-                a separate added_tokens files or in the unified JSON file format for the `tokenizers` library.
-                It's only possible to save a Fast tokenizer in the unified JSON format and this format is incompatible
-                with "slow" tokenizers (not powered by the `tokenizers` library).
+                Whether to save the tokenizer in legacy format (default), i.e. with tokenizer specific vocabulary and a
+                separate added_tokens files or in the unified JSON file format for the `tokenizers` library. It's only
+                possible to save a Fast tokenizer in the unified JSON format and this format is incompatible with
+                "slow" tokenizers (not powered by the `tokenizers` library).
            filename_prefix: (:obj:`str`, `optional`):
                A prefix to add to the names of the files saved by the tokenizer.

@@ -1871,10 +1868,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        legacy_format: bool = True,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
-        """Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.
+        """
+        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.

-        Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens}
-        using the specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`
+        Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
+        specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`
        """
        if not legacy_format:
            raise ValueError(
@@ -1898,9 +1896,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        """
        Save only the vocabulary of the tokenizer (vocabulary + added tokens).

-        This method won't save the configuration and special token mappings of the tokenizer.
-        Use :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` to save
-        the whole state of the tokenizer.
+        This method won't save the configuration and special token mappings of the tokenizer. Use
+        :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` to save the whole state of the tokenizer.

        Args:
            save_directory (:obj:`str`):
@@ -1918,10 +1915,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        Converts a string in a sequence of tokens, using the backend Rust tokenizer.

        Note that this method behaves differently between fast and slow tokenizers:
-            - in fast tokenizers (instances of :class:`~transformers.PreTrainedTokenizerFast`), this method
-                will replace the unknown tokens with the :obj:`unk_token`,
-            - in slow tokenizers (instances of :class:`~transformers.PreTrainedTokenizer`), this method
-                keep unknown tokens unchanged.
+
+            - in fast tokenizers (instances of :class:`~transformers.PreTrainedTokenizerFast`), this method will
+              replace the unknown tokens with the :obj:`unk_token`,
+            - in slow tokenizers (instances of :class:`~transformers.PreTrainedTokenizer`), this method keeps unknown
+              tokens unchanged.

        Args:
            text (:obj:`str`):
@@ -1931,8 +1929,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
            add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to add the special tokens associated with the corresponding model.
            kwargs (additional keyword arguments, `optional`):
-                Will be passed to the underlying model specific encode method.
-                See details in :meth:`~transformers.PreTrainedTokenizer.__call__`
+                Will be passed to the underlying model specific encode method. See details in
+                :meth:`~transformers.PreTrainedTokenizer.__call__`

        Returns:
            :obj:`List[str]`: The list of tokens.
@@ -1946,8 +1944,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        """,
        """
        Returns:
-            :obj:`List[int]`, :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`:
-            The tokenized ids of the text.
+            :obj:`List[int]`, :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`: The tokenized ids of the
+            text.
        """,
    )
    def encode(
@@ -1969,12 +1967,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):

        Args:
            text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`):
-                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
-                the ``tokenize`` method) or a list of integers (tokenized string ids using the
-                ``convert_tokens_to_ids`` method).
+                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
+                ``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids``
+                method).
            text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
-                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
-                string using the ``tokenize`` method) or a list of integers (tokenized string ids using the
+                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
+                the ``tokenize`` method) or a list of integers (tokenized string ids using the
                ``convert_tokens_to_ids`` method).
        """
        encoded_inputs = self.encode_plus(
@@ -1998,8 +1996,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
    ):
        """
-        Find the correct padding/truncation strategy with backward compatibility
-        for old arguments (truncation_strategy and pad_to_max_length) and behaviors.
+        Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy
+        and pad_to_max_length) and behaviors.
        """
        old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
        old_pad_to_max_length = kwargs.pop("pad_to_max_length", False)
@@ -2150,14 +2148,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):

        Args:
            text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
-                The sequence or batch of sequences to be encoded.
-                Each sequence can be a string or a list of strings (pretokenized string).
-                If the sequences are provided as list of strings (pretokenized), you must set
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
-                The sequence or batch of sequences to be encoded.
-                Each sequence can be a string or a list of strings (pretokenized string).
-                If the sequences are provided as list of strings (pretokenized), you must set
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
        """
        # Input type checking for clearer error
@@ -2276,12 +2272,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):

        Args:
            text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the latter only for not-fast tokenizers)):
-                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
-                the ``tokenize`` method) or a list of integers (tokenized string ids using the
-                ``convert_tokens_to_ids`` method).
+                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
+                ``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids``
+                method).
            text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
-                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
-                string using the ``tokenize`` method) or a list of integers (tokenized string ids using the
+                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
+                the ``tokenize`` method) or a list of integers (tokenized string ids using the
                ``convert_tokens_to_ids`` method).
        """

@@ -2375,9 +2371,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):

        Args:
            batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`, :obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also :obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`):
-                Batch of sequences or pair of sequences to be encoded.
-                This can be a list of string/string-sequences/int-sequences or a list of pair of
-                string/string-sequences/int-sequence (see details in ``encode_plus``).
+                Batch of sequences or pair of sequences to be encoded. This can be a list of
+                string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see
+                details in ``encode_plus``).
        """

        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
@@ -2459,8 +2455,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
        in the batch.

-        Padding side (left/right) padding token ids are defined at the tokenizer level
-        (with ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``)
+        Padding side (left/right) and padding token ids are defined at the tokenizer level (with ``self.padding_side``,
+        ``self.pad_token_id`` and ``self.pad_token_type_id``).

        .. note::

@@ -2470,10 +2466,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):

        Args:
            encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]]` or :obj:`List[Dict[str, List[int]]]`):
-                Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or
-                :obj:`Dict[str, List[int]]`) or a batch of tokenized inputs (list of
-                :class:`~transformers.BatchEncoding`, `Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`) so
-                you can use this method during preprocessing as well as in a PyTorch Dataloader collate function.
+                Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
+                List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
+                List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
+                well as in a PyTorch Dataloader collate function.

                Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
                see the note above for the return type.
@@ -2592,8 +2588,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
-        Create the token type IDs corresponding to the sequences passed.
-        `What are token type IDs? <../glossary.html#token-type-ids>`__
+        Create the token type IDs corresponding to the sequences passed. `What are token type IDs?
+        <../glossary.html#token-type-ids>`__

        Should be overridden in a subclass if the model has a special way of building those.

@@ -2612,8 +2608,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
-        by concatenating and adding special tokens.
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
+        and adding special tokens.

        This implementation does not add special tokens and this method should be overridden in a subclass.

@@ -2651,17 +2647,17 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        **kwargs
    ) -> BatchEncoding:
        """
-        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
-        It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
+        Prepares a sequence of input ids, or a pair of sequences of input ids so that it can be used by the model. It
+        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
        manages a moving window (with user defined stride) for overflowing tokens

        Args:
            ids (:obj:`List[int]`):
-                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the
-                ``tokenize`` and ``convert_tokens_to_ids`` methods.
+                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize``
+                and ``convert_tokens_to_ids`` methods.
            pair_ids (:obj:`List[int]`, `optional`):
-                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the
-                ``tokenize`` and ``convert_tokens_to_ids`` methods.
+                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize``
+                and ``convert_tokens_to_ids`` methods.
        """

        if "return_lengths" in kwargs:
@@ -2780,28 +2776,28 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):

        Args:
            ids (:obj:`List[int]`):
-                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the
-                ``tokenize`` and ``convert_tokens_to_ids`` methods.
+                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize``
+                and ``convert_tokens_to_ids`` methods.
            pair_ids (:obj:`List[int]`, `optional`):
-                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the
-                ``tokenize`` and ``convert_tokens_to_ids`` methods.
+                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize``
+                and ``convert_tokens_to_ids`` methods.
            num_tokens_to_remove (:obj:`int`, `optional`, defaults to 0):
                Number of tokens to remove using the truncation strategy.
            truncation (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
                The strategy to follow for truncation. Can be:

-                * :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
-                  :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
-                  provided. This will truncate token by token, removing a token from the longest sequence in the pair
-                  if a pair of sequences (or a batch of pairs) is provided.
+                * :obj:`'longest_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
+                  to the maximum acceptable input length for the model if that argument is not provided. This will
+                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
+                  sequences (or a batch of pairs) is provided.
                * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
                  the maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
                  to the maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                * :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
-                  sequence lengths greater than the model maximum admissible input size).
+                * :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+                  greater than the model maximum admissible input size).
            max_length (:obj:`int`, `optional`):
                Controls the maximum length to use by one of the truncation/padding parameters.

@@ -2809,12 +2805,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
                length is required by one of the truncation/padding parameters. If the model has no specific maximum
                input length (like XLNet) truncation/padding to a maximum length will be deactivated.
            stride (:obj:`int`, `optional`, defaults to 0):
-                If set to a positive number, the overflowing tokens returned will contain some tokens
-                from the main sequence returned. The value of this argument defines the number of additional tokens.
+                If set to a positive number, the overflowing tokens returned will contain some tokens from the main
+                sequence returned. The value of this argument defines the number of additional tokens.

        Returns:
-            :obj:`Tuple[List[int], List[int], List[int]]`:
-            The truncated ``ids``, the truncated ``pair_ids`` and the list of overflowing tokens.
+            :obj:`Tuple[List[int], List[int], List[int]]`: The truncated ``ids``, the truncated ``pair_ids`` and the
+            list of overflowing tokens.
        """
        if num_tokens_to_remove <= 0:
            return ids, pair_ids, []
@@ -2882,10 +2878,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.
+
                - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:
+
                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
@@ -2939,9 +2937,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """
-        Converts a sequence of token ids in a single string.
-        The most simple way to do it is ``" ".join(tokens)`` but we often want to remove
-        sub-word tokenization artifacts at the same time.
+        Converts a sequence of tokens into a single string. The most simple way to do it is ``" ".join(tokens)`` but
+        we often want to remove sub-word tokenization artifacts at the same time.
+
        Args:
            tokens (:obj:`List[str]`): The token to join in a string.
        Return: The joined tokens.
@@ -2989,8 +2987,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        **kwargs
    ) -> str:
        """
-        Converts a sequence of ids in a string, using the tokenizer and vocabulary
-        with options to remove special tokens and clean up tokenization spaces.
+        Converts a sequence of ids into a string, using the tokenizer and vocabulary with options to remove special
+        tokens and clean up tokenization spaces.

        Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.