Doc styling (#8067)
* Important files * Styling them all * Revert "Styling them all" This reverts commit 7d029395fdae8513b8281cbc2a6c239f8093503e. * Syling them for realsies * Fix syntax error * Fix benchmark_utils * More fixes * Fix modeling auto and script * Remove new line * Fixes * More fixes * Fix more files * Style * Add FSMT * More fixes * More fixes * More fixes * More fixes * Fixes * More fixes * More fixes * Last fixes * Make sphinx happy
This commit is contained in:
@@ -12,10 +12,10 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Base classes common to both the slow and the fast tokenization classes:
|
||||
PreTrainedTokenizerBase (host all the user fronting encoding methodes)
|
||||
Special token mixing (host the special tokens logic) and
|
||||
BatchEncoding (wrap the dictionary of output with special method for the Fast tokenizers)
|
||||
"""
|
||||
Base classes common to both the slow and the fast tokenization classes: PreTrainedTokenizerBase (host all the user
|
||||
fronting encoding methods) Special token mixin (host the special tokens logic) and BatchEncoding (wrap the dictionary
|
||||
of output with special method for the Fast tokenizers)
|
||||
"""
|
||||
|
||||
import copy
|
||||
@@ -58,8 +58,9 @@ else:
|
||||
|
||||
@dataclass(frozen=True, eq=True)
|
||||
class AddedToken:
|
||||
"""AddedToken represents a token to be added to a Tokenizer
|
||||
An AddedToken can have special options defining the way it should behave.
|
||||
"""
|
||||
AddedToken represents a token to be added to a Tokenizer. An AddedToken can have special options defining the
|
||||
way it should behave.
|
||||
"""
|
||||
|
||||
content: str = field(default_factory=str)
|
||||
@@ -116,8 +117,8 @@ class ExplicitEnum(Enum):
|
||||
|
||||
class TruncationStrategy(ExplicitEnum):
|
||||
"""
|
||||
Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`.
|
||||
Useful for tab-completion in an IDE.
|
||||
Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
|
||||
tab-completion in an IDE.
|
||||
"""
|
||||
|
||||
ONLY_FIRST = "only_first"
|
||||
@@ -128,8 +129,8 @@ class TruncationStrategy(ExplicitEnum):
|
||||
|
||||
class PaddingStrategy(ExplicitEnum):
|
||||
"""
|
||||
Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`.
|
||||
Useful for tab-completion in an IDE.
|
||||
Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion
|
||||
in an IDE.
|
||||
"""
|
||||
|
||||
LONGEST = "longest"
|
||||
@@ -139,8 +140,8 @@ class PaddingStrategy(ExplicitEnum):
|
||||
|
||||
class TensorType(ExplicitEnum):
|
||||
"""
|
||||
Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`.
|
||||
Useful for tab-completion in an IDE.
|
||||
Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
|
||||
tab-completion in an IDE.
|
||||
"""
|
||||
|
||||
PYTORCH = "pt"
|
||||
@@ -177,8 +178,7 @@ class TokenSpan(NamedTuple):
|
||||
|
||||
def to_py_obj(obj):
|
||||
"""
|
||||
Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list
|
||||
to a python list.
|
||||
Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list.
|
||||
"""
|
||||
if isinstance(obj, (list, tuple)):
|
||||
return [to_py_obj(o) for o in obj]
|
||||
@@ -194,8 +194,8 @@ def to_py_obj(obj):
|
||||
|
||||
class BatchEncoding(UserDict):
|
||||
"""
|
||||
Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus`
|
||||
and :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.batch_encode` methods (tokens,
|
||||
Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus` and
|
||||
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.batch_encode` methods (tokens,
|
||||
attention_masks, etc).
|
||||
|
||||
This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes
|
||||
@@ -242,8 +242,8 @@ class BatchEncoding(UserDict):
|
||||
|
||||
def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:
|
||||
"""
|
||||
If the key is a string, returns the value of the dict associated to :obj:`key` ('input_ids',
|
||||
'attention_mask', etc.).
|
||||
If the key is a string, returns the value of the dict associated to :obj:`key` ('input_ids', 'attention_mask',
|
||||
etc.).
|
||||
|
||||
If the key is an integer, get the :obj:`tokenizers.Encoding` for batch item with index :obj:`key`.
|
||||
"""
|
||||
@@ -289,15 +289,15 @@ class BatchEncoding(UserDict):
|
||||
@property
|
||||
def encodings(self) -> Optional[List[EncodingFast]]:
|
||||
"""
|
||||
:obj:`Optional[List[tokenizers.Encoding]]`: The list all encodings from the tokenization process.
|
||||
Returns :obj:`None` if the input was tokenized through Python (i.e., not a fast) tokenizer.
|
||||
:obj:`Optional[List[tokenizers.Encoding]]`: The list of all encodings from the tokenization process. Returns
|
||||
:obj:`None` if the input was tokenized through Python (i.e., not a fast) tokenizer.
|
||||
"""
|
||||
return self._encodings
|
||||
|
||||
def tokens(self, batch_index: int = 0) -> List[str]:
|
||||
"""
|
||||
Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion
|
||||
to integer indices) at a given batch index (only works for the output of a fast tokenizer).
|
||||
Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to
|
||||
integer indices) at a given batch index (only works for the output of a fast tokenizer).
|
||||
|
||||
Args:
|
||||
batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.
|
||||
@@ -327,25 +327,24 @@ class BatchEncoding(UserDict):
|
||||
|
||||
def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
|
||||
"""
|
||||
Get the index of the word corresponding (i.e. comprising) to an encoded token
|
||||
in a sequence of the batch.
|
||||
Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch.
|
||||
|
||||
Can be called as:
|
||||
|
||||
- ``self.token_to_word(token_index)`` if batch size is 1
|
||||
- ``self.token_to_word(batch_index, token_index)`` if batch size is greater than 1
|
||||
|
||||
This method is particularly suited when the input sequences are provided as
|
||||
pre-tokenized sequences (i.e., words are defined by the user). In this case it allows
|
||||
to easily associate encoded tokens with provided tokenized words.
|
||||
This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
|
||||
words are defined by the user). In this case it allows to easily associate encoded tokens with provided
|
||||
tokenized words.
|
||||
|
||||
Args:
|
||||
batch_or_token_index (:obj:`int`):
|
||||
Index of the sequence in the batch. If the batch only comprise one sequence,
|
||||
this can be the index of the token in the sequence.
|
||||
Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
|
||||
the token in the sequence.
|
||||
token_index (:obj:`int`, `optional`):
|
||||
If a batch index is provided in `batch_or_token_index`, this can be the index
|
||||
of the token in the sequence.
|
||||
If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the
|
||||
sequence.
|
||||
|
||||
Returns:
|
||||
:obj:`int`: Index of the word in the input sequence.
|
||||
@@ -378,22 +377,21 @@ class BatchEncoding(UserDict):
|
||||
- ``self.word_to_tokens(word_index)`` if batch size is 1
|
||||
- ``self.word_to_tokens(batch_index, word_index)`` if batch size is greater or equal to 1
|
||||
|
||||
This method is particularly suited when the input sequences are provided as
|
||||
pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
|
||||
to easily associate encoded tokens with provided tokenized words.
|
||||
This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
|
||||
are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
|
||||
words.
|
||||
|
||||
Args:
|
||||
batch_or_word_index (:obj:`int`):
|
||||
Index of the sequence in the batch. If the batch only comprises one sequence,
|
||||
this can be the index of the word in the sequence.
|
||||
Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
|
||||
the word in the sequence.
|
||||
word_index (:obj:`int`, `optional`):
|
||||
If a batch index is provided in `batch_or_token_index`, this can be the index
|
||||
of the word in the sequence.
|
||||
If a batch index is provided in `batch_or_word_index`, this can be the index of the word in the
|
||||
sequence.
|
||||
|
||||
Returns:
|
||||
Optional :class:`~transformers.tokenization_utils_base.TokenSpan`
|
||||
Span of tokens in the encoded sequence. Returns :obj:`None` if no tokens correspond
|
||||
to the word.
|
||||
Optional :class:`~transformers.tokenization_utils_base.TokenSpan`: Span of tokens in the encoded sequence.
|
||||
Returns :obj:`None` if no tokens correspond to the word.
|
||||
"""
|
||||
|
||||
if not self._encodings:
|
||||
@@ -427,15 +425,14 @@ class BatchEncoding(UserDict):
|
||||
|
||||
Args:
|
||||
batch_or_token_index (:obj:`int`):
|
||||
Index of the sequence in the batch. If the batch only comprise one sequence,
|
||||
this can be the index of the token in the sequence.
|
||||
Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
|
||||
the token in the sequence.
|
||||
token_index (:obj:`int`, `optional`):
|
||||
If a batch index is provided in `batch_or_token_index`, this can be the index
|
||||
of the token or tokens in the sequence.
|
||||
If a batch index is provided in `batch_or_token_index`, this can be the index of the token or tokens in
|
||||
the sequence.
|
||||
|
||||
Returns:
|
||||
:class:`~transformers.tokenization_utils_base.CharSpan`:
|
||||
Span of characters in the original string.
|
||||
:class:`~transformers.tokenization_utils_base.CharSpan`: Span of characters in the original string.
|
||||
"""
|
||||
|
||||
if not self._encodings:
|
||||
@@ -449,25 +446,25 @@ class BatchEncoding(UserDict):
|
||||
|
||||
def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
|
||||
"""
|
||||
Get the index of the token in the encoded output comprising a character
|
||||
in the original string for a sequence of the batch.
|
||||
Get the index of the token in the encoded output comprising a character in the original string for a sequence
|
||||
of the batch.
|
||||
|
||||
Can be called as:
|
||||
|
||||
- ``self.char_to_token(char_index)`` if batch size is 1
|
||||
- ``self.char_to_token(batch_index, char_index)`` if batch size is greater or equal to 1
|
||||
|
||||
This method is particularly suited when the input sequences are provided as
|
||||
pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
|
||||
to easily associate encoded tokens with provided tokenized words.
|
||||
This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
|
||||
are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
|
||||
words.
|
||||
|
||||
Args:
|
||||
batch_or_char_index (:obj:`int`):
|
||||
Index of the sequence in the batch. If the batch only comprise one sequence,
|
||||
this can be the index of the word in the sequence
|
||||
Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
|
||||
the character in the sequence.
|
||||
char_index (:obj:`int`, `optional`):
|
||||
If a batch index is provided in `batch_or_token_index`, this can be the index
|
||||
of the word in the sequence.
|
||||
If a batch index is provided in `batch_or_char_index`, this can be the index of the character in the
|
||||
sequence.
|
||||
|
||||
|
||||
Returns:
|
||||
@@ -485,8 +482,7 @@ class BatchEncoding(UserDict):
|
||||
|
||||
def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = None) -> CharSpan:
|
||||
"""
|
||||
Get the character span in the original string corresponding to given word in a sequence
|
||||
of the batch.
|
||||
Get the character span in the original string corresponding to given word in a sequence of the batch.
|
||||
|
||||
Character spans are returned as a CharSpan NamedTuple with:
|
||||
|
||||
@@ -500,19 +496,19 @@ class BatchEncoding(UserDict):
|
||||
|
||||
Args:
|
||||
batch_or_word_index (:obj:`int`):
|
||||
Index of the sequence in the batch. If the batch only comprise one sequence,
|
||||
this can be the index of the word in the sequence
|
||||
Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
|
||||
the word in the sequence
|
||||
word_index (:obj:`int`, `optional`):
|
||||
If a batch index is provided in `batch_or_token_index`, this can be the index
|
||||
of the word in the sequence.
|
||||
If a batch index is provided in `batch_or_word_index`, this can be the index of the word in the
|
||||
sequence.
|
||||
|
||||
Returns:
|
||||
:obj:`CharSpan` or :obj:`List[CharSpan]`:
|
||||
Span(s) of the associated character or characters in the string.
|
||||
CharSpan are NamedTuple with:
|
||||
:obj:`CharSpan` or :obj:`List[CharSpan]`: Span(s) of the associated character or characters in the string.
|
||||
CharSpan are NamedTuple with:
|
||||
|
||||
- start: index of the first character associated to the token in the original string
|
||||
- end: index of the character following the last character associated to the token in the original string
|
||||
- end: index of the character following the last character associated to the token in the original
|
||||
string
|
||||
"""
|
||||
|
||||
if not self._encodings:
|
||||
@@ -526,30 +522,29 @@ class BatchEncoding(UserDict):
|
||||
|
||||
def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
|
||||
"""
|
||||
Get the word in the original string corresponding to a character in the original string of
|
||||
a sequence of the batch.
|
||||
Get the word in the original string corresponding to a character in the original string of a sequence of the
|
||||
batch.
|
||||
|
||||
Can be called as:
|
||||
|
||||
- ``self.char_to_word(char_index)`` if batch size is 1
|
||||
- ``self.char_to_word(batch_index, char_index)`` if batch size is greater than 1
|
||||
|
||||
This method is particularly suited when the input sequences are provided as
|
||||
pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
|
||||
to easily associate encoded tokens with provided tokenized words.
|
||||
This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
|
||||
are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
|
||||
words.
|
||||
|
||||
Args:
|
||||
batch_or_char_index (:obj:`int`):
|
||||
Index of the sequence in the batch. If the batch only comprise one sequence,
|
||||
this can be the index of the character in the orginal string.
|
||||
Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
|
||||
the character in the original string.
|
||||
char_index (:obj:`int`, `optional`):
|
||||
If a batch index is provided in `batch_or_token_index`, this can be the index
|
||||
of the character in the orginal string.
|
||||
If a batch index is provided in `batch_or_char_index`, this can be the index of the character in the
|
||||
original string.
|
||||
|
||||
|
||||
Returns:
|
||||
:obj:`int` or :obj:`List[int]`:
|
||||
Index or indices of the associated encoded token(s).
|
||||
:obj:`int` or :obj:`List[int]`: Index or indices of the associated encoded token(s).
|
||||
"""
|
||||
|
||||
if not self._encodings:
|
||||
@@ -642,8 +637,8 @@ class BatchEncoding(UserDict):
|
||||
device (:obj:`str` or :obj:`torch.device`): The device to put the tensors on.
|
||||
|
||||
Returns:
|
||||
:class:`~transformers.BatchEncoding`:
|
||||
The same instance of :class:`~transformers.BatchEncoding` after modification.
|
||||
:class:`~transformers.BatchEncoding`: The same instance of :class:`~transformers.BatchEncoding` after
|
||||
modification.
|
||||
"""
|
||||
self.data = {k: v.to(device) for k, v in self.data.items()}
|
||||
return self
|
||||
@@ -651,8 +646,8 @@ class BatchEncoding(UserDict):
|
||||
|
||||
class SpecialTokensMixin:
|
||||
"""
|
||||
A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`
|
||||
to handle specific behaviors related to special tokens. In particular, this class hold the attributes which can be
|
||||
A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` to
|
||||
handle specific behaviors related to special tokens. In particular, this class holds the attributes which can be
|
||||
used to directly access these special tokens in a model-independent manner and allow to set and update the special
|
||||
tokens.
|
||||
|
||||
@@ -874,8 +869,8 @@ class SpecialTokensMixin:
|
||||
@property
|
||||
def sep_token(self) -> str:
|
||||
"""
|
||||
:obj:`str`: Separation token, to separate context and query in an input sequence.
|
||||
Log an error if used while not having been set.
|
||||
:obj:`str`: Separation token, to separate context and query in an input sequence. Log an error if used while
|
||||
not having been set.
|
||||
"""
|
||||
if self._sep_token is None and self.verbose:
|
||||
logger.error("Using sep_token, but it is not set yet.")
|
||||
@@ -895,8 +890,8 @@ class SpecialTokensMixin:
|
||||
@property
|
||||
def cls_token(self) -> str:
|
||||
"""
|
||||
:obj:`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along
|
||||
the full depth of the model. Log an error if used while not having been set.
|
||||
:obj:`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the
|
||||
full depth of the model. Log an error if used while not having been set.
|
||||
"""
|
||||
if self._cls_token is None and self.verbose:
|
||||
logger.error("Using cls_token, but it is not set yet.")
|
||||
@@ -1039,8 +1034,8 @@ class SpecialTokensMixin:
|
||||
@property
|
||||
def additional_special_tokens_ids(self) -> List[int]:
|
||||
"""
|
||||
:obj:`List[int]`: Ids of all the additional special tokens in the vocabulary.
|
||||
Log an error if used while not having been set.
|
||||
:obj:`List[int]`: Ids of all the additional special tokens in the vocabulary. Log an error if used while not
|
||||
having been set.
|
||||
"""
|
||||
return self.convert_tokens_to_ids(self.additional_special_tokens)
|
||||
|
||||
@@ -1079,8 +1074,8 @@ class SpecialTokensMixin:
|
||||
@property
|
||||
def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
|
||||
"""
|
||||
:obj:`Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes
|
||||
(:obj:`cls_token`, :obj:`unk_token`, etc.) to their values (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).
|
||||
:obj:`Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes (:obj:`cls_token`,
|
||||
:obj:`unk_token`, etc.) to their values (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).
|
||||
|
||||
Convert potential tokens of :obj:`tokenizers.AddedToken` type to string.
|
||||
"""
|
||||
@@ -1199,8 +1194,8 @@ ENCODE_KWARGS_DOCSTRING = r"""
|
||||
|
||||
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
|
||||
return_token_type_ids (:obj:`bool`, `optional`):
|
||||
Whether to return token type IDs. If left to the default, will return the token type IDs according
|
||||
to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
|
||||
Whether to return token type IDs. If left to the default, will return the token type IDs according to
|
||||
the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
|
||||
|
||||
`What are token type IDs? <../glossary.html#token-type-ids>`__
|
||||
return_attention_mask (:obj:`bool`, `optional`):
|
||||
@@ -1230,14 +1225,17 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
|
||||
- **input_ids** -- List of token ids to be fed to a model.
|
||||
|
||||
`What are input IDs? <../glossary.html#input-ids>`__
|
||||
|
||||
- **token_type_ids** -- List of token type ids to be fed to a model (when :obj:`return_token_type_ids=True`
|
||||
or if `"token_type_ids"` is in :obj:`self.model_input_names`).
|
||||
|
||||
`What are token type IDs? <../glossary.html#token-type-ids>`__
|
||||
|
||||
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
|
||||
:obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`).
|
||||
|
||||
`What are attention masks? <../glossary.html#attention-mask>`__
|
||||
|
||||
- **overflowing_tokens** -- List of overflowing tokens sequences (when a :obj:`max_length` is specified and
|
||||
:obj:`return_overflowing_tokens=True`).
|
||||
- **num_truncated_tokens** -- Number of tokens truncated (when a :obj:`max_length` is specified and
|
||||
@@ -1249,6 +1247,7 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
|
||||
|
||||
INIT_TOKENIZER_DOCSTRING = r"""
|
||||
Class attributes (overridden by derived classes)
|
||||
|
||||
- **vocab_files_names** (:obj:`Dict[str, str]`) -- A dictionary with, as keys, the ``__init__`` keyword name of
|
||||
each vocabulary file required by the model, and as associated values, the filename for saving the associated
|
||||
file (string).
|
||||
@@ -1260,8 +1259,8 @@ INIT_TOKENIZER_DOCSTRING = r"""
|
||||
:obj:`short-cut-names` of the pretrained models, and as associated values, the maximum length of the sequence
|
||||
inputs of this model, or :obj:`None` if the model has no maximum input size.
|
||||
- **pretrained_init_configuration** (:obj:`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the
|
||||
:obj:`short-cut-names` of the pretrained models, and as associated values, a dictionary of specific
|
||||
arguments to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading the
|
||||
:obj:`short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments
|
||||
to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading the
|
||||
tokenizer with the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`
|
||||
method.
|
||||
- **model_input_names** (:obj:`List[str]`) -- A list of inputs expected in the forward pass of the model.
|
||||
@@ -1270,11 +1269,10 @@ INIT_TOKENIZER_DOCSTRING = r"""
|
||||
|
||||
Args:
|
||||
model_max_length (:obj:`int`, `optional`):
|
||||
The maximum length (in number of tokens) for the inputs to the transformer model.
|
||||
When the tokenizer is loaded with
|
||||
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`, this will be set to
|
||||
the value stored for the associated model in ``max_model_input_sizes`` (see above). If no value is
|
||||
provided, will default to VERY_LARGE_INTEGER (:obj:`int(1e30)`).
|
||||
The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is
|
||||
loaded with :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`, this
|
||||
will be set to the value stored for the associated model in ``max_model_input_sizes`` (see above). If no
|
||||
value is provided, will default to VERY_LARGE_INTEGER (:obj:`int(1e30)`).
|
||||
padding_side: (:obj:`str`, `optional`):
|
||||
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
|
||||
Default value is picked from the class attribute of the same name.
|
||||
@@ -1319,13 +1317,13 @@ PREPARE_SEQ2SEQ_BATCH_DOCSTRING = """
|
||||
tgt_texts (:obj:`list`, `optional`):
|
||||
List of summaries or target language texts.
|
||||
max_length (:obj:`int`, `optional`):
|
||||
Controls the maximum length for encoder inputs (documents to summarize or source language texts)
|
||||
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
|
||||
length is required by one of the truncation/padding parameters. If the model has no specific maximum
|
||||
input length (like XLNet) truncation/padding to a maximum length will be deactivated.
|
||||
Controls the maximum length for encoder inputs (documents to summarize or source language texts). If
|
||||
left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length
|
||||
is required by one of the truncation/padding parameters. If the model has no specific maximum input
|
||||
length (like XLNet) truncation/padding to a maximum length will be deactivated.
|
||||
max_target_length (:obj:`int`, `optional`):
|
||||
Controls the maximum length of decoder inputs (target language texts or summaries)
|
||||
If left unset or set to :obj:`None`, this will use the max_length value.
|
||||
Controls the maximum length of decoder inputs (target language texts or summaries). If left unset or set
|
||||
to :obj:`None`, this will use the max_length value.
|
||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
||||
Activates and controls padding. Accepts the following values:
|
||||
|
||||
@@ -1366,8 +1364,8 @@ PREPARE_SEQ2SEQ_BATCH_DOCSTRING = """
|
||||
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
|
||||
- **labels** -- List of token ids for tgt_texts.
|
||||
|
||||
The full set of keys ``[input_ids, attention_mask, labels]``,
|
||||
will only be returned if tgt_texts is passed. Otherwise, input_ids, attention_mask will be the only keys.
|
||||
The full set of keys ``[input_ids, attention_mask, labels]``, will only be returned if tgt_texts is passed.
|
||||
Otherwise, input_ids, attention_mask will be the only keys.
|
||||
|
||||
"""
|
||||
|
||||
@@ -1515,9 +1513,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
Whether or not to delete incompletely received files. Attempt to resume the download if such a file
|
||||
exists.
|
||||
proxies (:obj:`Dict[str, str]`, `optional`):
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.,
|
||||
:obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each
|
||||
request.
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
|
||||
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
|
||||
inputs (additional positional arguments, `optional`):
|
||||
Will be passed along to the Tokenizer ``__init__`` method.
|
||||
kwargs (additional keyword arguments, `optional`):
|
||||
@@ -1792,10 +1789,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained` class method.
|
||||
|
||||
.. Note::
|
||||
A "fast" tokenizer (instance of :class:`transformers.PreTrainedTokenizerFast`) saved with
|
||||
this method will not be possible to load back
|
||||
in a "slow" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizer` instance. It can only be loaded
|
||||
in a "fast" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizerFast` instance.
|
||||
A "fast" tokenizer (instance of :class:`transformers.PreTrainedTokenizerFast`) saved with this method will
|
||||
not be possible to load back in a "slow" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizer`
|
||||
instance. It can only be loaded in a "fast" tokenizer, i.e. in a
|
||||
:class:`transformers.PreTrainedTokenizerFast` instance.
|
||||
|
||||
.. Warning::
|
||||
This won't save modifications you may have applied to the tokenizer after the instantiation (for instance,
|
||||
@@ -1804,10 +1801,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
Args:
|
||||
save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
|
||||
legacy_format (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether to save the tokenizer in legacy format (default), i.e. with tokenizer specific vocabulary and
|
||||
a separate added_tokens files or in the unified JSON file format for the `tokenizers` library.
|
||||
It's only possible to save a Fast tokenizer in the unified JSON format and this format is incompatible
|
||||
with "slow" tokenizers (not powered by the `tokenizers` library).
|
||||
Whether to save the tokenizer in legacy format (default), i.e. with tokenizer specific vocabulary and a
|
||||
separate added_tokens files or in the unified JSON file format for the `tokenizers` library. It's only
|
||||
possible to save a Fast tokenizer in the unified JSON format and this format is incompatible with
|
||||
"slow" tokenizers (not powered by the `tokenizers` library).
|
||||
filename_prefix: (:obj:`str`, `optional`):
|
||||
A prefix to add to the names of the files saved by the tokenizer.
|
||||
|
||||
@@ -1871,10 +1868,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
legacy_format: bool = True,
|
||||
filename_prefix: Optional[str] = None,
|
||||
) -> Tuple[str]:
|
||||
"""Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.
|
||||
"""
|
||||
Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.
|
||||
|
||||
Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens}
|
||||
using the specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`
|
||||
Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
|
||||
specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`
|
||||
"""
|
||||
if not legacy_format:
|
||||
raise ValueError(
|
||||
@@ -1898,9 +1896,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
"""
|
||||
Save only the vocabulary of the tokenizer (vocabulary + added tokens).
|
||||
|
||||
This method won't save the configuration and special token mappings of the tokenizer.
|
||||
Use :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` to save
|
||||
the whole state of the tokenizer.
|
||||
This method won't save the configuration and special token mappings of the tokenizer. Use
|
||||
:meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` to save the whole state of the tokenizer.
|
||||
|
||||
Args:
|
||||
save_directory (:obj:`str`):
|
||||
@@ -1918,10 +1915,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
Converts a string into a sequence of tokens, using the backend Rust tokenizer.
|
||||
|
||||
Note that this method behaves differently between fast and slow tokenizers:
|
||||
- in fast tokenizers (instances of :class:`~transformers.PreTrainedTokenizerFast`), this method
|
||||
will replace the unknown tokens with the :obj:`unk_token`,
|
||||
- in slow tokenizers (instances of :class:`~transformers.PreTrainedTokenizer`), this method
|
||||
keep unknown tokens unchanged.
|
||||
|
||||
- in fast tokenizers (instances of :class:`~transformers.PreTrainedTokenizerFast`), this method will
|
||||
replace the unknown tokens with the :obj:`unk_token`,
|
||||
- in slow tokenizers (instances of :class:`~transformers.PreTrainedTokenizer`), this method keeps unknown
|
||||
tokens unchanged.
|
||||
|
||||
Args:
|
||||
text (:obj:`str`):
|
||||
@@ -1931,8 +1929,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not to add the special tokens associated with the corresponding model.
|
||||
kwargs (additional keyword arguments, `optional`):
|
||||
Will be passed to the underlying model specific encode method.
|
||||
See details in :meth:`~transformers.PreTrainedTokenizer.__call__`
|
||||
Will be passed to the underlying model specific encode method. See details in
|
||||
:meth:`~transformers.PreTrainedTokenizer.__call__`
|
||||
|
||||
Returns:
|
||||
:obj:`List[str]`: The list of tokens.
|
||||
@@ -1946,8 +1944,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
""",
|
||||
"""
|
||||
Returns:
|
||||
:obj:`List[int]`, :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`:
|
||||
The tokenized ids of the text.
|
||||
:obj:`List[int]`, :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`: The tokenized ids of the
|
||||
text.
|
||||
""",
|
||||
)
|
||||
def encode(
|
||||
@@ -1969,12 +1967,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
|
||||
Args:
|
||||
text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`):
|
||||
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
|
||||
the ``tokenize`` method) or a list of integers (tokenized string ids using the
|
||||
``convert_tokens_to_ids`` method).
|
||||
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
|
||||
``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids``
|
||||
method).
|
||||
text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
|
||||
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
|
||||
string using the ``tokenize`` method) or a list of integers (tokenized string ids using the
|
||||
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
|
||||
the ``tokenize`` method) or a list of integers (tokenized string ids using the
|
||||
``convert_tokens_to_ids`` method).
|
||||
"""
|
||||
encoded_inputs = self.encode_plus(
|
||||
@@ -1998,8 +1996,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
|
||||
):
|
||||
"""
|
||||
Find the correct padding/truncation strategy with backward compatibility
|
||||
for old arguments (truncation_strategy and pad_to_max_length) and behaviors.
|
||||
Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy
|
||||
and pad_to_max_length) and behaviors.
|
||||
"""
|
||||
old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
|
||||
old_pad_to_max_length = kwargs.pop("pad_to_max_length", False)
|
||||
@@ -2150,14 +2148,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
|
||||
Args:
|
||||
text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
|
||||
The sequence or batch of sequences to be encoded.
|
||||
Each sequence can be a string or a list of strings (pretokenized string).
|
||||
If the sequences are provided as list of strings (pretokenized), you must set
|
||||
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
|
||||
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
|
||||
:obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
||||
text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
|
||||
The sequence or batch of sequences to be encoded.
|
||||
Each sequence can be a string or a list of strings (pretokenized string).
|
||||
If the sequences are provided as list of strings (pretokenized), you must set
|
||||
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
|
||||
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
|
||||
:obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
||||
"""
|
||||
# Input type checking for clearer error
|
||||
@@ -2276,12 +2272,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
|
||||
Args:
|
||||
text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the latter only for not-fast tokenizers)):
|
||||
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
|
||||
the ``tokenize`` method) or a list of integers (tokenized string ids using the
|
||||
``convert_tokens_to_ids`` method).
|
||||
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
|
||||
``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids``
|
||||
method).
|
||||
text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
|
||||
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
|
||||
string using the ``tokenize`` method) or a list of integers (tokenized string ids using the
|
||||
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
|
||||
the ``tokenize`` method) or a list of integers (tokenized string ids using the
|
||||
``convert_tokens_to_ids`` method).
|
||||
"""
|
||||
|
||||
@@ -2375,9 +2371,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
|
||||
Args:
|
||||
batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`, :obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also :obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`):
|
||||
Batch of sequences or pair of sequences to be encoded.
|
||||
This can be a list of string/string-sequences/int-sequences or a list of pair of
|
||||
string/string-sequences/int-sequence (see details in ``encode_plus``).
|
||||
Batch of sequences or pair of sequences to be encoded. This can be a list of
|
||||
string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see
|
||||
details in ``encode_plus``).
|
||||
"""
|
||||
|
||||
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
|
||||
@@ -2459,8 +2455,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
|
||||
in the batch.
|
||||
|
||||
Padding side (left/right) padding token ids are defined at the tokenizer level
|
||||
(with ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``)
|
||||
Padding side (left/right) padding token ids are defined at the tokenizer level (with ``self.padding_side``,
|
||||
``self.pad_token_id`` and ``self.pad_token_type_id``)
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -2470,10 +2466,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
|
||||
Args:
|
||||
encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]]` or :obj:`List[Dict[str, List[int]]]`):
|
||||
Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or
|
||||
:obj:`Dict[str, List[int]]`) or a batch of tokenized inputs (list of
|
||||
:class:`~transformers.BatchEncoding`, `Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`) so
|
||||
you can use this method during preprocessing as well as in a PyTorch Dataloader collate function.
|
||||
Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
|
||||
List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
|
||||
List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
|
||||
well as in a PyTorch Dataloader collate function.
|
||||
|
||||
Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
|
||||
see the note above for the return type.
|
||||
@@ -2592,8 +2588,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||
) -> List[int]:
|
||||
"""
|
||||
Create the token type IDs corresponding to the sequences passed.
|
||||
`What are token type IDs? <../glossary.html#token-type-ids>`__
|
||||
Create the token type IDs corresponding to the sequences passed. `What are token type IDs?
|
||||
<../glossary.html#token-type-ids>`__
|
||||
|
||||
Should be overridden in a subclass if the model has a special way of building those.
|
||||
|
||||
@@ -2612,8 +2608,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||
) -> List[int]:
|
||||
"""
|
||||
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
||||
by concatenating and adding special tokens.
|
||||
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
|
||||
adding special tokens.
|
||||
|
||||
This implementation does not add special tokens and this method should be overridden in a subclass.
|
||||
|
||||
@@ -2651,17 +2647,17 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
**kwargs
|
||||
) -> BatchEncoding:
|
||||
"""
|
||||
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
|
||||
It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
|
||||
Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model. It
|
||||
adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
|
||||
manages a moving window (with user defined stride) for overflowing tokens
|
||||
|
||||
Args:
|
||||
ids (:obj:`List[int]`):
|
||||
Tokenized input ids of the first sequence. Can be obtained from a string by chaining the
|
||||
``tokenize`` and ``convert_tokens_to_ids`` methods.
|
||||
Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize``
|
||||
and ``convert_tokens_to_ids`` methods.
|
||||
pair_ids (:obj:`List[int]`, `optional`):
|
||||
Tokenized input ids of the second sequence. Can be obtained from a string by chaining the
|
||||
``tokenize`` and ``convert_tokens_to_ids`` methods.
|
||||
Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize``
|
||||
and ``convert_tokens_to_ids`` methods.
|
||||
"""
|
||||
|
||||
if "return_lengths" in kwargs:
|
||||
@@ -2780,28 +2776,28 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
|
||||
Args:
|
||||
ids (:obj:`List[int]`):
|
||||
Tokenized input ids of the first sequence. Can be obtained from a string by chaining the
|
||||
``tokenize`` and ``convert_tokens_to_ids`` methods.
|
||||
Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize``
|
||||
and ``convert_tokens_to_ids`` methods.
|
||||
pair_ids (:obj:`List[int]`, `optional`):
|
||||
Tokenized input ids of the second sequence. Can be obtained from a string by chaining the
|
||||
``tokenize`` and ``convert_tokens_to_ids`` methods.
|
||||
Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize``
|
||||
and ``convert_tokens_to_ids`` methods.
|
||||
num_tokens_to_remove (:obj:`int`, `optional`, defaults to 0):
|
||||
Number of tokens to remove using the truncation strategy.
|
||||
truncation (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
|
||||
The strategy to follow for truncation. Can be:
|
||||
|
||||
* :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
|
||||
:obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
|
||||
provided. This will truncate token by token, removing a token from the longest sequence in the pair
|
||||
if a pair of sequences (or a batch of pairs) is provided.
|
||||
* :obj:`'longest_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
|
||||
to the maximum acceptable input length for the model if that argument is not provided. This will
|
||||
truncate token by token, removing a token from the longest sequence in the pair if a pair of
|
||||
sequences (or a batch of pairs) is provided.
|
||||
* :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
|
||||
the maximum acceptable input length for the model if that argument is not provided. This will only
|
||||
truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
|
||||
* :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
|
||||
to the maximum acceptable input length for the model if that argument is not provided. This will only
|
||||
truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
|
||||
* :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
|
||||
sequence lengths greater than the model maximum admissible input size).
|
||||
* :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
|
||||
greater than the model maximum admissible input size).
|
||||
max_length (:obj:`int`, `optional`):
|
||||
Controls the maximum length to use by one of the truncation/padding parameters.
|
||||
|
||||
@@ -2809,12 +2805,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
length is required by one of the truncation/padding parameters. If the model has no specific maximum
|
||||
input length (like XLNet) truncation/padding to a maximum length will be deactivated.
|
||||
stride (:obj:`int`, `optional`, defaults to 0):
|
||||
If set to a positive number, the overflowing tokens returned will contain some tokens
|
||||
from the main sequence returned. The value of this argument defines the number of additional tokens.
|
||||
If set to a positive number, the overflowing tokens returned will contain some tokens from the main
|
||||
sequence returned. The value of this argument defines the number of additional tokens.
|
||||
|
||||
Returns:
|
||||
:obj:`Tuple[List[int], List[int], List[int]]`:
|
||||
The truncated ``ids``, the truncated ``pair_ids`` and the list of overflowing tokens.
|
||||
:obj:`Tuple[List[int], List[int], List[int]]`: The truncated ``ids``, the truncated ``pair_ids`` and the
|
||||
list of overflowing tokens.
|
||||
"""
|
||||
if num_tokens_to_remove <= 0:
|
||||
return ids, pair_ids, []
|
||||
@@ -2882,10 +2878,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
max_length: maximum length of the returned list and optionally padding length (see below).
|
||||
Will truncate by taking into account the special tokens.
|
||||
padding_strategy: PaddingStrategy to use for padding.
|
||||
|
||||
- PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
|
||||
- PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
|
||||
- PaddingStrategy.DO_NOT_PAD: Do not pad
|
||||
The tokenizer padding sides are defined in self.padding_side:
|
||||
|
||||
- 'left': pads on the left of the sequences
|
||||
- 'right': pads on the right of the sequences
|
||||
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
|
||||
@@ -2939,9 +2937,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
|
||||
def convert_tokens_to_string(self, tokens: List[str]) -> str:
|
||||
"""
|
||||
Converts a sequence of token ids in a single string.
|
||||
The most simple way to do it is ``" ".join(tokens)`` but we often want to remove
|
||||
sub-word tokenization artifacts at the same time.
|
||||
Converts a sequence of tokens into a single string. The most simple way to do it is ``" ".join(tokens)`` but
|
||||
we often want to remove sub-word tokenization artifacts at the same time.
|
||||
|
||||
Args:
|
||||
tokens (:obj:`List[str]`): The token to join in a string.
|
||||
Return: The joined tokens.
|
||||
@@ -2989,8 +2987,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
**kwargs
|
||||
) -> str:
|
||||
"""
|
||||
Converts a sequence of ids in a string, using the tokenizer and vocabulary
|
||||
with options to remove special tokens and clean up tokenization spaces.
|
||||
Converts a sequence of ids into a string, using the tokenizer and vocabulary with options to remove special
|
||||
tokens and clean up tokenization spaces.
|
||||
|
||||
Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user