fixed lots of typos. (#7758)

This commit is contained in:
Tiger
2020-10-13 09:00:20 -05:00
committed by GitHub
parent 8cb4ecca25
commit 7e73c12805
22 changed files with 36 additions and 36 deletions

View File

@@ -15,7 +15,7 @@
""" Base classes common to both the slow and the fast tokenization classes:
PreTrainedTokenizerBase (hosts all the user-facing encoding methods)
SpecialTokensMixin (hosts the special tokens logic) and
BatchEncoding (wrap the dictionnary of output with special method for the Fast tokenizers)
BatchEncoding (wrap the dictionary of output with special method for the Fast tokenizers)
"""
import copy
@@ -249,7 +249,7 @@ class BatchEncoding(UserDict):
def tokens(self, batch_index: int = 0) -> List[str]:
"""
Return the list of tokens (sub-parts of the input strings after word/subword splitting and before converstion
Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion
to integer indices) at a given batch index (only works for the output of a fast tokenizer).
Args:
@@ -1121,7 +1121,7 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to return overflowing token sequences.
return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
Wheter or not to return special tokens mask information.
Whether or not to return special tokens mask information.
return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to return :obj:`(char_start, char_end)` for each token.
@@ -1153,13 +1153,13 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
- **num_truncated_tokens** -- Number of tokens truncated (when a :obj:`max_length` is specified and
:obj:`return_overflowing_tokens=True`).
- **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
regual sequence tokens (when :obj:`add_special_tokens=True` and :obj:`return_special_tokens_mask=True`).
regular sequence tokens (when :obj:`add_special_tokens=True` and :obj:`return_special_tokens_mask=True`).
- **length** -- The length of the inputs (when :obj:`return_length=True`)
"""
INIT_TOKENIZER_DOCSTRING = r"""
Class attributes (overridden by derived classes)
- **vocab_files_names** (:obj:`Dict[str, str]`) -- A ditionary with, as keys, the ``__init__`` keyword name of
- **vocab_files_names** (:obj:`Dict[str, str]`) -- A dictionary with, as keys, the ``__init__`` keyword name of
each vocabulary file required by the model, and as associated values, the filename for saving the associated
file (string).
- **pretrained_vocab_files_map** (:obj:`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the
@@ -1170,7 +1170,7 @@ INIT_TOKENIZER_DOCSTRING = r"""
:obj:`short-cut-names` of the pretrained models, and as associated values, the maximum length of the sequence
inputs of this model, or :obj:`None` if the model has no maximum input size.
- **pretrained_init_configuration** (:obj:`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the
:obj:`short-cut-names` of the pretrained models, and as associated values, a dictionnary of specific
:obj:`short-cut-names` of the pretrained models, and as associated values, a dictionary of specific
arguments to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading the
tokenizer with the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`
method.
@@ -1688,7 +1688,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
modifying :obj:`tokenizer.do_lower_case` after creation).
Args:
save_directory (:obj:`str`): The path to adirectory where the tokenizer will be saved.
save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
Returns:
A tuple of :obj:`str`: The files saved.
@@ -2383,7 +2383,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
batch_size = len(encoded_inputs["input_ids"])
assert all(
len(v) == batch_size for v in encoded_inputs.values()
), "Some items in the output dictionnary have a different batch size than others."
), "Some items in the output dictionary have a different batch size than others."
if padding_strategy == PaddingStrategy.LONGEST:
max_length = max(len(inputs) for inputs in encoded_inputs["input_ids"])
@@ -2547,7 +2547,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
sequence = ids + pair_ids if pair else ids
token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
# Build output dictionnary
# Build output dictionary
encoded_inputs["input_ids"] = sequence
if return_token_type_ids:
encoded_inputs["token_type_ids"] = token_type_ids
@@ -2819,7 +2819,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
token_ids_1 (:obj:`List[int]`, `optional`):
List of ids of the second sequence.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Wheter or not the token list is already formated with special tokens for the model.
Whether or not the token list is already formatted with special tokens for the model.
Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.