Black preview (#17217)

* Black preview

* Fixup too!

* Fix check copies

* Use the same version as the CI

* Bump black
This commit is contained in:
Sylvain Gugger
2022-05-12 16:25:55 -04:00
committed by GitHub
parent 9bd67ac7bb
commit afe5d42d8d
578 changed files with 8274 additions and 3296 deletions

View File

@@ -1502,12 +1502,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose:
if not self.deprecation_warnings.get("max_len_single_sentence", False):
logger.warning(
"Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
"Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
)
self.deprecation_warnings["max_len_single_sentence"] = True
else:
raise ValueError(
"Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
"Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
)
@max_len_sentences_pair.setter
@@ -1516,13 +1516,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose:
if not self.deprecation_warnings.get("max_len_sentences_pair", False):
logger.warning(
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
"Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up."
)
self.deprecation_warnings["max_len_sentences_pair"] = True
else:
raise ValueError(
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
)
raise ValueError("Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up.")
def _set_processor_class(self, processor_class: str):
"""Sets processor class as an attribute."""
@@ -1530,9 +1528,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
def __repr__(self) -> str:
return (
f"{'PreTrainedTokenizerFast' if self.is_fast else 'PreTrainedTokenizer'}(name_or_path='{self.name_or_path}', "
f"vocab_size={self.vocab_size}, model_max_len={self.model_max_length}, is_fast={self.is_fast}, "
f"padding_side='{self.padding_side}', truncation_side='{self.truncation_side}', special_tokens={self.special_tokens_map_extended})"
f"{'PreTrainedTokenizerFast' if self.is_fast else 'PreTrainedTokenizer'}(name_or_path='{self.name_or_path}',"
f" vocab_size={self.vocab_size}, model_max_len={self.model_max_length}, is_fast={self.is_fast},"
f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}',"
f" special_tokens={self.special_tokens_map_extended})"
)
def get_vocab(self) -> Dict[str, int]:
@@ -1873,10 +1872,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
if config_tokenizer_class is not None:
if cls.__name__.replace("Fast", "") != config_tokenizer_class.replace("Fast", ""):
logger.warning(
"The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. "
"It may result in unexpected tokenization. \n"
f"The tokenizer class you load from this checkpoint is '{config_tokenizer_class}'. \n"
f"The class this function is called from is '{cls.__name__}'."
"The tokenizer class you load from this checkpoint is not the same type as the class this"
" function is called from. It may result in unexpected tokenization. \nThe tokenizer class you"
f" load from this checkpoint is '{config_tokenizer_class}'. \nThe class this function is called"
f" from is '{cls.__name__}'."
)
# Update with newly provided kwargs
@@ -1988,7 +1987,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
added_tokens = tokenizer.sanitize_special_tokens()
if added_tokens:
logger.warning_advice(
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained."
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are"
" fine-tuned or trained."
)
return tokenizer
@@ -2270,11 +2270,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
if verbose:
if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False):
logger.warning(
"Truncation was not explicitly activated but `max_length` is provided a specific value, "
"please use `truncation=True` to explicitly truncate examples to max length. "
"Defaulting to 'longest_first' truncation strategy. "
"If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
"more precisely by providing a specific strategy to `truncation`."
"Truncation was not explicitly activated but `max_length` is provided a specific value, please"
" use `truncation=True` to explicitly truncate examples to max length. Defaulting to"
" 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the"
" tokenizer you can select this strategy more precisely by providing a specific strategy to"
" `truncation`."
)
self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
truncation = "longest_first"
@@ -2316,14 +2316,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
if truncation is False and old_truncation_strategy != "do_not_truncate":
if verbose:
warnings.warn(
"The `truncation_strategy` argument is deprecated and will be removed in a future version, "
"use `truncation=True` to truncate examples to a max length. You can give a specific "
"length with `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the "
"maximal input size of the model (e.g. 512 for Bert). "
" If you have pairs of inputs, you can give a specific truncation strategy selected among "
"`truncation='only_first'` (will only truncate the first sentence in the pairs) "
"`truncation='only_second'` (will only truncate the second sentence in the pairs) "
"or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).",
"The `truncation_strategy` argument is deprecated and will be removed in a future version, use"
" `truncation=True` to truncate examples to a max length. You can give a specific length with"
" `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the maximal input"
" size of the model (e.g. 512 for Bert). If you have pairs of inputs, you can give a specific"
" truncation strategy selected among `truncation='only_first'` (will only truncate the first"
" sentence in the pairs) `truncation='only_second'` (will only truncate the second sentence in the"
" pairs) or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence"
" in the pairs).",
FutureWarning,
)
truncation_strategy = TruncationStrategy(old_truncation_strategy)
@@ -2346,8 +2346,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
if verbose:
if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
logger.warning(
"Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
"Default to no padding."
"Asking to pad to max_length but no maximum length is provided and the model has no"
" predefined maximum length. Default to no padding."
)
self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
padding_strategy = PaddingStrategy.DO_NOT_PAD
@@ -2359,8 +2359,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
if verbose:
if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
logger.warning(
"Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
"Default to no truncation."
"Asking to truncate to max_length but no maximum length is provided and the model has"
" no predefined maximum length. Default to no truncation."
)
self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
@@ -2384,7 +2384,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
and (max_length % pad_to_multiple_of != 0)
):
raise ValueError(
f"Truncation and padding are both activated but "
"Truncation and padding are both activated but "
f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
)
@@ -2467,11 +2467,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
if is_batched:
if isinstance(text_pair, str):
raise TypeError(
"when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as `text`."
"when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as"
" `text`."
)
if text_pair is not None and len(text) != len(text_pair):
raise ValueError(
f"batch length of `text`: {len(text)} does not match batch length of `text_pair`: {len(text_pair)}."
f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
f" {len(text_pair)}."
)
batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
return self.batch_encode_plus(
@@ -2826,7 +2828,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
else:
raise ValueError(
f"type of {first_element} unknown: {type(first_element)}. "
f"Should be one of a python, numpy, pytorch or tensorflow object."
"Should be one of a python, numpy, pytorch or tensorflow object."
)
for key, value in encoded_inputs.items():
@@ -3123,16 +3125,17 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
)
if truncation_strategy == TruncationStrategy.ONLY_FIRST:
error_msg = (
error_msg + "Please select another truncation strategy than "
error_msg
+ "Please select another truncation strategy than "
f"{truncation_strategy}, for instance 'longest_first' or 'only_second'."
)
logger.error(error_msg)
elif truncation_strategy == TruncationStrategy.LONGEST_FIRST:
logger.warning(
f"Be aware, overflowing tokens are not returned for the setting you have chosen,"
"Be aware, overflowing tokens are not returned for the setting you have chosen,"
f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' "
f"truncation strategy. So the returned list will always be empty even if some "
f"tokens have been removed."
"truncation strategy. So the returned list will always be empty even if some "
"tokens have been removed."
)
for _ in range(num_tokens_to_remove):
if pair_ids is None or len(ids) > len(pair_ids):
@@ -3165,7 +3168,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
f"We need to remove {num_tokens_to_remove} to truncate the input "
f"but the second sequence has a length {len(pair_ids)}. "
f"Please select another truncation strategy than {truncation_strategy}, "
f"for instance 'longest_first' or 'only_first'."
"for instance 'longest_first' or 'only_first'."
)
return (ids, pair_ids, overflowing_tokens)