Doc styling (#8067)

* Important files

* Styling them all

* Revert "Styling them all"

This reverts commit 7d029395fdae8513b8281cbc2a6c239f8093503e.

* Syling them for realsies

* Fix syntax error

* Fix benchmark_utils

* More fixes

* Fix modeling auto and script

* Remove new line

* Fixes

* More fixes

* Fix more files

* Style

* Add FSMT

* More fixes

* More fixes

* More fixes

* More fixes

* Fixes

* More fixes

* More fixes

* Last fixes

* Make sphinx happy
This commit is contained in:
Sylvain Gugger
2020-10-26 18:26:02 -04:00
committed by GitHub
parent 04a17f8550
commit 08f534d2da
271 changed files with 9726 additions and 8991 deletions

View File

@@ -12,10 +12,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Base classes common to both the slow and the fast tokenization classes:
PreTrainedTokenizerBase (host all the user fronting encoding methodes)
Special token mixing (host the special tokens logic) and
BatchEncoding (wrap the dictionary of output with special method for the Fast tokenizers)
"""
Base classes common to both the slow and the fast tokenization classes: PreTrainedTokenizerBase (hosts all the
user-facing encoding methods), SpecialTokensMixin (hosts the special tokens logic) and BatchEncoding (wraps the
dictionary of outputs with special methods for the fast tokenizers).
"""
import copy
@@ -58,8 +58,9 @@ else:
@dataclass(frozen=True, eq=True)
class AddedToken:
"""AddedToken represents a token to be added to a Tokenizer
An AddedToken can have special options defining the way it should behave.
"""
AddedToken represents a token to be added to a Tokenizer. An AddedToken can have special options defining the
way it should behave.
"""
content: str = field(default_factory=str)
@@ -116,8 +117,8 @@ class ExplicitEnum(Enum):
class TruncationStrategy(ExplicitEnum):
"""
Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`.
Useful for tab-completion in an IDE.
Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
tab-completion in an IDE.
"""
ONLY_FIRST = "only_first"
@@ -128,8 +129,8 @@ class TruncationStrategy(ExplicitEnum):
class PaddingStrategy(ExplicitEnum):
"""
Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`.
Useful for tab-completion in an IDE.
Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion
in an IDE.
"""
LONGEST = "longest"
@@ -139,8 +140,8 @@ class PaddingStrategy(ExplicitEnum):
class TensorType(ExplicitEnum):
"""
Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`.
Useful for tab-completion in an IDE.
Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
tab-completion in an IDE.
"""
PYTORCH = "pt"
@@ -177,8 +178,7 @@ class TokenSpan(NamedTuple):
def to_py_obj(obj):
"""
Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list
to a python list.
Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list.
"""
if isinstance(obj, (list, tuple)):
return [to_py_obj(o) for o in obj]
@@ -194,8 +194,8 @@ def to_py_obj(obj):
class BatchEncoding(UserDict):
"""
Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus`
and :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.batch_encode` methods (tokens,
Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus` and
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.batch_encode` methods (tokens,
attention_masks, etc).
This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes
@@ -242,8 +242,8 @@ class BatchEncoding(UserDict):
def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:
"""
If the key is a string, returns the value of the dict associated to :obj:`key` ('input_ids',
'attention_mask', etc.).
If the key is a string, returns the value of the dict associated to :obj:`key` ('input_ids', 'attention_mask',
etc.).
If the key is an integer, get the :obj:`tokenizers.Encoding` for batch item with index :obj:`key`.
"""
@@ -289,15 +289,15 @@ class BatchEncoding(UserDict):
@property
def encodings(self) -> Optional[List[EncodingFast]]:
"""
:obj:`Optional[List[tokenizers.Encoding]]`: The list all encodings from the tokenization process.
Returns :obj:`None` if the input was tokenized through Python (i.e., not a fast) tokenizer.
:obj:`Optional[List[tokenizers.Encoding]]`: The list of all encodings from the tokenization process. Returns
:obj:`None` if the input was tokenized through a Python (i.e., not a fast) tokenizer.
"""
return self._encodings
def tokens(self, batch_index: int = 0) -> List[str]:
"""
Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion
to integer indices) at a given batch index (only works for the output of a fast tokenizer).
Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to
integer indices) at a given batch index (only works for the output of a fast tokenizer).
Args:
batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.
@@ -327,25 +327,24 @@ class BatchEncoding(UserDict):
def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
"""
Get the index of the word corresponding (i.e. comprising) to an encoded token
in a sequence of the batch.
Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch.
Can be called as:
- ``self.token_to_word(token_index)`` if batch size is 1
- ``self.token_to_word(batch_index, token_index)`` if batch size is greater than 1
This method is particularly suited when the input sequences are provided as
pre-tokenized sequences (i.e., words are defined by the user). In this case it allows
to easily associate encoded tokens with provided tokenized words.
This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
words are defined by the user). In this case it allows to easily associate encoded tokens with provided
tokenized words.
Args:
batch_or_token_index (:obj:`int`):
Index of the sequence in the batch. If the batch only comprise one sequence,
this can be the index of the token in the sequence.
Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
the token in the sequence.
token_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index
of the token in the sequence.
If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the
sequence.
Returns:
:obj:`int`: Index of the word in the input sequence.
@@ -378,22 +377,21 @@ class BatchEncoding(UserDict):
- ``self.word_to_tokens(word_index)`` if batch size is 1
- ``self.word_to_tokens(batch_index, word_index)`` if batch size is greater or equal to 1
This method is particularly suited when the input sequences are provided as
pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
to easily associate encoded tokens with provided tokenized words.
This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
words.
Args:
batch_or_word_index (:obj:`int`):
Index of the sequence in the batch. If the batch only comprises one sequence,
this can be the index of the word in the sequence.
Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
the word in the sequence.
word_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index
of the word in the sequence.
If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
sequence.
Returns:
Optional :class:`~transformers.tokenization_utils_base.TokenSpan`
Span of tokens in the encoded sequence. Returns :obj:`None` if no tokens correspond
to the word.
Optional :class:`~transformers.tokenization_utils_base.TokenSpan` Span of tokens in the encoded sequence.
Returns :obj:`None` if no tokens correspond to the word.
"""
if not self._encodings:
@@ -427,15 +425,14 @@ class BatchEncoding(UserDict):
Args:
batch_or_token_index (:obj:`int`):
Index of the sequence in the batch. If the batch only comprise one sequence,
this can be the index of the token in the sequence.
Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
the token in the sequence.
token_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index
of the token or tokens in the sequence.
If a batch index is provided in `batch_or_token_index`, this can be the index of the token or tokens in
the sequence.
Returns:
:class:`~transformers.tokenization_utils_base.CharSpan`:
Span of characters in the original string.
:class:`~transformers.tokenization_utils_base.CharSpan`: Span of characters in the original string.
"""
if not self._encodings:
@@ -449,25 +446,25 @@ class BatchEncoding(UserDict):
def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
"""
Get the index of the token in the encoded output comprising a character
in the original string for a sequence of the batch.
Get the index of the token in the encoded output comprising a character in the original string for a sequence
of the batch.
Can be called as:
- ``self.char_to_token(char_index)`` if batch size is 1
- ``self.char_to_token(batch_index, char_index)`` if batch size is greater or equal to 1
This method is particularly suited when the input sequences are provided as
pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
to easily associate encoded tokens with provided tokenized words.
This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
words.
Args:
batch_or_char_index (:obj:`int`):
Index of the sequence in the batch. If the batch only comprise one sequence,
this can be the index of the word in the sequence
Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
the word in the sequence
char_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index
of the word in the sequence.
If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
sequence.
Returns:
@@ -485,8 +482,7 @@ class BatchEncoding(UserDict):
def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = None) -> CharSpan:
"""
Get the character span in the original string corresponding to given word in a sequence
of the batch.
Get the character span in the original string corresponding to given word in a sequence of the batch.
Character spans are returned as a CharSpan NamedTuple with:
@@ -500,19 +496,19 @@ class BatchEncoding(UserDict):
Args:
batch_or_word_index (:obj:`int`):
Index of the sequence in the batch. If the batch only comprise one sequence,
this can be the index of the word in the sequence
Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
the word in the sequence
word_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index
of the word in the sequence.
If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
sequence.
Returns:
:obj:`CharSpan` or :obj:`List[CharSpan]`:
Span(s) of the associated character or characters in the string.
CharSpan are NamedTuple with:
:obj:`CharSpan` or :obj:`List[CharSpan]`: Span(s) of the associated character or characters in the string.
CharSpan are NamedTuple with:
- start: index of the first character associated to the token in the original string
- end: index of the character following the last character associated to the token in the original string
- end: index of the character following the last character associated to the token in the original
string
"""
if not self._encodings:
@@ -526,30 +522,29 @@ class BatchEncoding(UserDict):
def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
"""
Get the word in the original string corresponding to a character in the original string of
a sequence of the batch.
Get the word in the original string corresponding to a character in the original string of a sequence of the
batch.
Can be called as:
- ``self.char_to_word(char_index)`` if batch size is 1
- ``self.char_to_word(batch_index, char_index)`` if batch size is greater than 1
This method is particularly suited when the input sequences are provided as
pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
to easily associate encoded tokens with provided tokenized words.
This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
words.
Args:
batch_or_char_index (:obj:`int`):
Index of the sequence in the batch. If the batch only comprise one sequence,
this can be the index of the character in the orginal string.
Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
the character in the original string.
char_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index
of the character in the orginal string.
If a batch index is provided in `batch_or_char_index`, this can be the index of the character in the
original string.
Returns:
:obj:`int` or :obj:`List[int]`:
Index or indices of the associated encoded token(s).
:obj:`int` or :obj:`List[int]`: Index or indices of the associated word(s) in the original string.
"""
if not self._encodings:
@@ -642,8 +637,8 @@ class BatchEncoding(UserDict):
device (:obj:`str` or :obj:`torch.device`): The device to put the tensors on.
Returns:
:class:`~transformers.BatchEncoding`:
The same instance of :class:`~transformers.BatchEncoding` after modification.
:class:`~transformers.BatchEncoding`: The same instance of :class:`~transformers.BatchEncoding` after
modification.
"""
self.data = {k: v.to(device) for k, v in self.data.items()}
return self
@@ -651,8 +646,8 @@ class BatchEncoding(UserDict):
class SpecialTokensMixin:
"""
A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`
to handle specific behaviors related to special tokens. In particular, this class hold the attributes which can be
A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` to
handle specific behaviors related to special tokens. In particular, this class holds the attributes which can be
used to directly access these special tokens in a model-independent manner and allows setting and updating the
special tokens.
@@ -874,8 +869,8 @@ class SpecialTokensMixin:
@property
def sep_token(self) -> str:
"""
:obj:`str`: Separation token, to separate context and query in an input sequence.
Log an error if used while not having been set.
:obj:`str`: Separation token, to separate context and query in an input sequence. Log an error if used while
not having been set.
"""
if self._sep_token is None and self.verbose:
logger.error("Using sep_token, but it is not set yet.")
@@ -895,8 +890,8 @@ class SpecialTokensMixin:
@property
def cls_token(self) -> str:
"""
:obj:`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along
the full depth of the model. Log an error if used while not having been set.
:obj:`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the
full depth of the model. Log an error if used while not having been set.
"""
if self._cls_token is None and self.verbose:
logger.error("Using cls_token, but it is not set yet.")
@@ -1039,8 +1034,8 @@ class SpecialTokensMixin:
@property
def additional_special_tokens_ids(self) -> List[int]:
"""
:obj:`List[int]`: Ids of all the additional special tokens in the vocabulary.
Log an error if used while not having been set.
:obj:`List[int]`: Ids of all the additional special tokens in the vocabulary. Log an error if used while not
having been set.
"""
return self.convert_tokens_to_ids(self.additional_special_tokens)
@@ -1079,8 +1074,8 @@ class SpecialTokensMixin:
@property
def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
"""
:obj:`Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes
(:obj:`cls_token`, :obj:`unk_token`, etc.) to their values (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).
:obj:`Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes (:obj:`cls_token`,
:obj:`unk_token`, etc.) to their values (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).
Convert potential tokens of :obj:`tokenizers.AddedToken` type to string.
"""
@@ -1199,8 +1194,8 @@ ENCODE_KWARGS_DOCSTRING = r"""
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
return_token_type_ids (:obj:`bool`, `optional`):
Whether to return token type IDs. If left to the default, will return the token type IDs according
to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
Whether to return token type IDs. If left to the default, will return the token type IDs according to
the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
`What are token type IDs? <../glossary.html#token-type-ids>`__
return_attention_mask (:obj:`bool`, `optional`):
@@ -1230,14 +1225,17 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
- **input_ids** -- List of token ids to be fed to a model.
`What are input IDs? <../glossary.html#input-ids>`__
- **token_type_ids** -- List of token type ids to be fed to a model (when :obj:`return_token_type_ids=True`
or if `"token_type_ids"` is in :obj:`self.model_input_names`).
`What are token type IDs? <../glossary.html#token-type-ids>`__
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
:obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`).
`What are attention masks? <../glossary.html#attention-mask>`__
- **overflowing_tokens** -- List of overflowing tokens sequences (when a :obj:`max_length` is specified and
:obj:`return_overflowing_tokens=True`).
- **num_truncated_tokens** -- Number of tokens truncated (when a :obj:`max_length` is specified and
@@ -1249,6 +1247,7 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
INIT_TOKENIZER_DOCSTRING = r"""
Class attributes (overridden by derived classes)
- **vocab_files_names** (:obj:`Dict[str, str]`) -- A dictionary with, as keys, the ``__init__`` keyword name of
each vocabulary file required by the model, and as associated values, the filename for saving the associated
file (string).
@@ -1260,8 +1259,8 @@ INIT_TOKENIZER_DOCSTRING = r"""
:obj:`short-cut-names` of the pretrained models, and as associated values, the maximum length of the sequence
inputs of this model, or :obj:`None` if the model has no maximum input size.
- **pretrained_init_configuration** (:obj:`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the
:obj:`short-cut-names` of the pretrained models, and as associated values, a dictionary of specific
arguments to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading the
:obj:`short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments
to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading the
tokenizer with the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`
method.
- **model_input_names** (:obj:`List[str]`) -- A list of inputs expected in the forward pass of the model.
@@ -1270,11 +1269,10 @@ INIT_TOKENIZER_DOCSTRING = r"""
Args:
model_max_length (:obj:`int`, `optional`):
The maximum length (in number of tokens) for the inputs to the transformer model.
When the tokenizer is loaded with
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`, this will be set to
the value stored for the associated model in ``max_model_input_sizes`` (see above). If no value is
provided, will default to VERY_LARGE_INTEGER (:obj:`int(1e30)`).
The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is
loaded with :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`, this
will be set to the value stored for the associated model in ``max_model_input_sizes`` (see above). If no
value is provided, will default to VERY_LARGE_INTEGER (:obj:`int(1e30)`).
padding_side: (:obj:`str`, `optional`):
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
@@ -1319,13 +1317,13 @@ PREPARE_SEQ2SEQ_BATCH_DOCSTRING = """
tgt_texts (:obj:`list`, `optional`):
List of summaries or target language texts.
max_length (:obj:`int`, `optional`):
Controls the maximum length for encoder inputs (documents to summarize or source language texts)
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
length is required by one of the truncation/padding parameters. If the model has no specific maximum
input length (like XLNet) truncation/padding to a maximum length will be deactivated.
Controls the maximum length for encoder inputs (documents to summarize or source language texts). If
left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length
is required by one of the truncation/padding parameters. If the model has no specific maximum input
length (like XLNet), truncation/padding to a maximum length will be deactivated.
max_target_length (:obj:`int`, `optional`):
Controls the maximum length of decoder inputs (target language texts or summaries)
If left unset or set to :obj:`None`, this will use the max_length value.
Controls the maximum length of decoder inputs (target language texts or summaries). If left unset or set
to :obj:`None`, this will use the max_length value.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values:
@@ -1366,8 +1364,8 @@ PREPARE_SEQ2SEQ_BATCH_DOCSTRING = """
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
- **labels** -- List of token ids for tgt_texts.
The full set of keys ``[input_ids, attention_mask, labels]``,
will only be returned if tgt_texts is passed. Otherwise, input_ids, attention_mask will be the only keys.
The full set of keys ``[input_ids, attention_mask, labels]``, will only be returned if tgt_texts is passed.
Otherwise, input_ids, attention_mask will be the only keys.
"""
@@ -1515,9 +1513,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Whether or not to delete incompletely received files. Attempt to resume the download if such a file
exists.
proxies (:obj:`Dict[str, str]`, `optional`):
A dictionary of proxy servers to use by protocol or endpoint, e.g.,
:obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each
request.
A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
inputs (additional positional arguments, `optional`):
Will be passed along to the Tokenizer ``__init__`` method.
kwargs (additional keyword arguments, `optional`):
@@ -1792,10 +1789,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method.
.. Note::
A "fast" tokenizer (instance of :class:`transformers.PreTrainedTokenizerFast`) saved with
this method will not be possible to load back
in a "slow" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizer` instance. It can only be loaded
in a "fast" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizerFast` instance.
A "fast" tokenizer (instance of :class:`transformers.PreTrainedTokenizerFast`) saved with this method will
not be possible to load back in a "slow" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizer`
instance. It can only be loaded in a "fast" tokenizer, i.e. in a
:class:`transformers.PreTrainedTokenizerFast` instance.
.. Warning::
This won't save modifications you may have applied to the tokenizer after the instantiation (for instance,
@@ -1804,10 +1801,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Args:
save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
legacy_format (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to save the tokenizer in legacy format (default), i.e. with tokenizer specific vocabulary and
a separate added_tokens files or in the unified JSON file format for the `tokenizers` library.
It's only possible to save a Fast tokenizer in the unified JSON format and this format is incompatible
with "slow" tokenizers (not powered by the `tokenizers` library).
Whether to save the tokenizer in legacy format (default), i.e. with tokenizer specific vocabulary and a
separate added_tokens files or in the unified JSON file format for the `tokenizers` library. It's only
possible to save a Fast tokenizer in the unified JSON format and this format is incompatible with
"slow" tokenizers (not powered by the `tokenizers` library).
filename_prefix: (:obj:`str`, `optional`):
A prefix to add to the names of the files saved by the tokenizer.
@@ -1871,10 +1868,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
legacy_format: bool = True,
filename_prefix: Optional[str] = None,
) -> Tuple[str]:
"""Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.
"""
Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.
Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens}
using the specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`
Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`
"""
if not legacy_format:
raise ValueError(
@@ -1898,9 +1896,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
"""
Save only the vocabulary of the tokenizer (vocabulary + added tokens).
This method won't save the configuration and special token mappings of the tokenizer.
Use :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` to save
the whole state of the tokenizer.
This method won't save the configuration and special token mappings of the tokenizer. Use
:meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` to save the whole state of the tokenizer.
Args:
save_directory (:obj:`str`):
@@ -1918,10 +1915,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Converts a string in a sequence of tokens, using the backend Rust tokenizer.
Note that this method behave differently between fast and slow tokenizers:
- in fast tokenizers (instances of :class:`~transformers.PreTrainedTokenizerFast`), this method
will replace the unknown tokens with the :obj:`unk_token`,
- in slow tokenizers (instances of :class:`~transformers.PreTrainedTokenizer`), this method
keep unknown tokens unchanged.
- in fast tokenizers (instances of :class:`~transformers.PreTrainedTokenizerFast`), this method will
replace the unknown tokens with the :obj:`unk_token`,
- in slow tokenizers (instances of :class:`~transformers.PreTrainedTokenizer`), this method keeps unknown
tokens unchanged.
Args:
text (:obj:`str`):
@@ -1931,8 +1929,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to add the special tokens associated with the corresponding model.
kwargs (additional keyword arguments, `optional`):
Will be passed to the underlying model specific encode method.
See details in :meth:`~transformers.PreTrainedTokenizer.__call__`
Will be passed to the underlying model specific encode method. See details in
:meth:`~transformers.PreTrainedTokenizer.__call__`
Returns:
:obj:`List[str]`: The list of tokens.
@@ -1946,8 +1944,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
""",
"""
Returns:
:obj:`List[int]`, :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`:
The tokenized ids of the text.
:obj:`List[int]`, :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`: The tokenized ids of the
text.
""",
)
def encode(
@@ -1969,12 +1967,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Args:
text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
the ``tokenize`` method) or a list of integers (tokenized string ids using the
``convert_tokens_to_ids`` method).
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids``
method).
text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
string using the ``tokenize`` method) or a list of integers (tokenized string ids using the
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
the ``tokenize`` method) or a list of integers (tokenized string ids using the
``convert_tokens_to_ids`` method).
"""
encoded_inputs = self.encode_plus(
@@ -1998,8 +1996,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
):
"""
Find the correct padding/truncation strategy with backward compatibility
for old arguments (truncation_strategy and pad_to_max_length) and behaviors.
Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy
and pad_to_max_length) and behaviors.
"""
old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
old_pad_to_max_length = kwargs.pop("pad_to_max_length", False)
@@ -2150,14 +2148,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Args:
text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
The sequence or batch of sequences to be encoded.
Each sequence can be a string or a list of strings (pretokenized string).
If the sequences are provided as list of strings (pretokenized), you must set
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
:obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
The sequence or batch of sequences to be encoded.
Each sequence can be a string or a list of strings (pretokenized string).
If the sequences are provided as list of strings (pretokenized), you must set
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
:obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
"""
# Input type checking for clearer error
@@ -2276,12 +2272,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Args:
text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the latter only for not-fast tokenizers)):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
the ``tokenize`` method) or a list of integers (tokenized string ids using the
``convert_tokens_to_ids`` method).
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids``
method).
text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
string using the ``tokenize`` method) or a list of integers (tokenized string ids using the
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
the ``tokenize`` method) or a list of integers (tokenized string ids using the
``convert_tokens_to_ids`` method).
"""
@@ -2375,9 +2371,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Args:
batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`, :obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also :obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`):
Batch of sequences or pair of sequences to be encoded.
This can be a list of string/string-sequences/int-sequences or a list of pair of
string/string-sequences/int-sequence (see details in ``encode_plus``).
Batch of sequences or pair of sequences to be encoded. This can be a list of
string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see
details in ``encode_plus``).
"""
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
@@ -2459,8 +2455,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
in the batch.
Padding side (left/right) padding token ids are defined at the tokenizer level
(with ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``)
Padding side (left/right) and padding token ids are defined at the tokenizer level (with ``self.padding_side``,
``self.pad_token_id`` and ``self.pad_token_type_id``).
.. note::
@@ -2470,10 +2466,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Args:
encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]]` or :obj:`List[Dict[str, List[int]]]`):
Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or
:obj:`Dict[str, List[int]]`) or a batch of tokenized inputs (list of
:class:`~transformers.BatchEncoding`, `Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`) so
you can use this method during preprocessing as well as in a PyTorch Dataloader collate function.
Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
well as in a PyTorch Dataloader collate function.
Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
see the note above for the return type.
@@ -2592,8 +2588,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create the token type IDs corresponding to the sequences passed.
`What are token type IDs? <../glossary.html#token-type-ids>`__
Create the token type IDs corresponding to the sequences passed. `What are token type IDs?
<../glossary.html#token-type-ids>`__
Should be overridden in a subclass if the model has a special way of building those.
@@ -2612,8 +2608,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
and adding special tokens.
This implementation does not add special tokens and this method should be overridden in a subclass.
@@ -2651,17 +2647,17 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
**kwargs
) -> BatchEncoding:
"""
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
Prepares a sequence of input ids, or a pair of sequences of input ids so that it can be used by the model. It
adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
manages a moving window (with user defined stride) for overflowing tokens.
Args:
ids (:obj:`List[int]`):
Tokenized input ids of the first sequence. Can be obtained from a string by chaining the
``tokenize`` and ``convert_tokens_to_ids`` methods.
Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize``
and ``convert_tokens_to_ids`` methods.
pair_ids (:obj:`List[int]`, `optional`):
Tokenized input ids of the second sequence. Can be obtained from a string by chaining the
``tokenize`` and ``convert_tokens_to_ids`` methods.
Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize``
and ``convert_tokens_to_ids`` methods.
"""
if "return_lengths" in kwargs:
@@ -2780,28 +2776,28 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Args:
ids (:obj:`List[int]`):
Tokenized input ids of the first sequence. Can be obtained from a string by chaining the
``tokenize`` and ``convert_tokens_to_ids`` methods.
Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize``
and ``convert_tokens_to_ids`` methods.
pair_ids (:obj:`List[int]`, `optional`):
Tokenized input ids of the second sequence. Can be obtained from a string by chaining the
``tokenize`` and ``convert_tokens_to_ids`` methods.
Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize``
and ``convert_tokens_to_ids`` methods.
num_tokens_to_remove (:obj:`int`, `optional`, defaults to 0):
Number of tokens to remove using the truncation strategy.
truncation (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
The strategy to follow for truncation. Can be:
* :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
:obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
provided. This will truncate token by token, removing a token from the longest sequence in the pair
if a pair of sequences (or a batch of pairs) is provided.
* :obj:`'longest_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
to the maximum acceptable input length for the model if that argument is not provided. This will
truncate token by token, removing a token from the longest sequence in the pair if a pair of
sequences (or a batch of pairs) is provided.
* :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
the maximum acceptable input length for the model if that argument is not provided. This will only
truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
* :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
to the maximum acceptable input length for the model if that argument is not provided. This will only
truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
* :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
sequence lengths greater than the model maximum admissible input size).
* :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
greater than the model maximum admissible input size).
max_length (:obj:`int`, `optional`):
Controls the maximum length to use by one of the truncation/padding parameters.
@@ -2809,12 +2805,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
length is required by one of the truncation/padding parameters. If the model has no specific maximum
input length (like XLNet) truncation/padding to a maximum length will be deactivated.
stride (:obj:`int`, `optional`, defaults to 0):
If set to a positive number, the overflowing tokens returned will contain some tokens
from the main sequence returned. The value of this argument defines the number of additional tokens.
If set to a positive number, the overflowing tokens returned will contain some tokens from the main
sequence returned. The value of this argument defines the number of additional tokens.
Returns:
:obj:`Tuple[List[int], List[int], List[int]]`:
The truncated ``ids``, the truncated ``pair_ids`` and the list of overflowing tokens.
:obj:`Tuple[List[int], List[int], List[int]]`: The truncated ``ids``, the truncated ``pair_ids`` and the
list of overflowing tokens.
"""
if num_tokens_to_remove <= 0:
return ids, pair_ids, []
@@ -2882,10 +2878,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
max_length: maximum length of the returned list and optionally padding length (see below).
Will truncate by taking into account the special tokens.
padding_strategy: PaddingStrategy to use for padding.
- PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
- PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
- PaddingStrategy.DO_NOT_PAD: Do not pad
The tokenizer padding sides are defined in self.padding_side:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
@@ -2939,9 +2937,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""
Converts a sequence of token ids in a single string.
The most simple way to do it is ``" ".join(tokens)`` but we often want to remove
sub-word tokenization artifacts at the same time.
Converts a sequence of tokens into a single string. The most simple way to do it is ``" ".join(tokens)`` but
we often want to remove sub-word tokenization artifacts at the same time.
Args:
tokens (:obj:`List[str]`): The token to join in a string.
Return: The joined tokens.
@@ -2989,8 +2987,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
**kwargs
) -> str:
"""
Converts a sequence of ids in a string, using the tokenizer and vocabulary
with options to remove special tokens and clean up tokenization spaces.
Converts a sequence of ids into a string, using the tokenizer and vocabulary with options to remove special
tokens and clean up tokenization spaces.
Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.