Black preview (#17217)
* Black preview
* Fixup too!
* Fix check copies
* Use the same version as the CI
* Bump black
This commit is contained in:
@@ -1502,12 +1502,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose:
|
||||
if not self.deprecation_warnings.get("max_len_single_sentence", False):
|
||||
logger.warning(
|
||||
"Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
|
||||
"Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
|
||||
)
|
||||
self.deprecation_warnings["max_len_single_sentence"] = True
|
||||
else:
|
||||
raise ValueError(
|
||||
"Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
|
||||
"Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
|
||||
)
|
||||
|
||||
@max_len_sentences_pair.setter
|
||||
@@ -1516,13 +1516,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose:
|
||||
if not self.deprecation_warnings.get("max_len_sentences_pair", False):
|
||||
logger.warning(
|
||||
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
|
||||
"Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up."
|
||||
)
|
||||
self.deprecation_warnings["max_len_sentences_pair"] = True
|
||||
else:
|
||||
raise ValueError(
|
||||
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
|
||||
)
|
||||
raise ValueError("Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up.")
|
||||
|
||||
def _set_processor_class(self, processor_class: str):
|
||||
"""Sets processor class as an attribute."""
|
||||
@@ -1530,9 +1528,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return (
|
||||
f"{'PreTrainedTokenizerFast' if self.is_fast else 'PreTrainedTokenizer'}(name_or_path='{self.name_or_path}', "
|
||||
f"vocab_size={self.vocab_size}, model_max_len={self.model_max_length}, is_fast={self.is_fast}, "
|
||||
f"padding_side='{self.padding_side}', truncation_side='{self.truncation_side}', special_tokens={self.special_tokens_map_extended})"
|
||||
f"{'PreTrainedTokenizerFast' if self.is_fast else 'PreTrainedTokenizer'}(name_or_path='{self.name_or_path}',"
|
||||
f" vocab_size={self.vocab_size}, model_max_len={self.model_max_length}, is_fast={self.is_fast},"
|
||||
f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}',"
|
||||
f" special_tokens={self.special_tokens_map_extended})"
|
||||
)
|
||||
|
||||
def get_vocab(self) -> Dict[str, int]:
|
||||
@@ -1873,10 +1872,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
if config_tokenizer_class is not None:
|
||||
if cls.__name__.replace("Fast", "") != config_tokenizer_class.replace("Fast", ""):
|
||||
logger.warning(
|
||||
"The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. "
|
||||
"It may result in unexpected tokenization. \n"
|
||||
f"The tokenizer class you load from this checkpoint is '{config_tokenizer_class}'. \n"
|
||||
f"The class this function is called from is '{cls.__name__}'."
|
||||
"The tokenizer class you load from this checkpoint is not the same type as the class this"
|
||||
" function is called from. It may result in unexpected tokenization. \nThe tokenizer class you"
|
||||
f" load from this checkpoint is '{config_tokenizer_class}'. \nThe class this function is called"
|
||||
f" from is '{cls.__name__}'."
|
||||
)
|
||||
|
||||
# Update with newly provided kwargs
|
||||
@@ -1988,7 +1987,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
added_tokens = tokenizer.sanitize_special_tokens()
|
||||
if added_tokens:
|
||||
logger.warning_advice(
|
||||
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained."
|
||||
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are"
|
||||
" fine-tuned or trained."
|
||||
)
|
||||
|
||||
return tokenizer
|
||||
@@ -2270,11 +2270,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
if verbose:
|
||||
if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False):
|
||||
logger.warning(
|
||||
"Truncation was not explicitly activated but `max_length` is provided a specific value, "
|
||||
"please use `truncation=True` to explicitly truncate examples to max length. "
|
||||
"Defaulting to 'longest_first' truncation strategy. "
|
||||
"If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
|
||||
"more precisely by providing a specific strategy to `truncation`."
|
||||
"Truncation was not explicitly activated but `max_length` is provided a specific value, please"
|
||||
" use `truncation=True` to explicitly truncate examples to max length. Defaulting to"
|
||||
" 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the"
|
||||
" tokenizer you can select this strategy more precisely by providing a specific strategy to"
|
||||
" `truncation`."
|
||||
)
|
||||
self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
|
||||
truncation = "longest_first"
|
||||
@@ -2316,14 +2316,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
if truncation is False and old_truncation_strategy != "do_not_truncate":
|
||||
if verbose:
|
||||
warnings.warn(
|
||||
"The `truncation_strategy` argument is deprecated and will be removed in a future version, "
|
||||
"use `truncation=True` to truncate examples to a max length. You can give a specific "
|
||||
"length with `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the "
|
||||
"maximal input size of the model (e.g. 512 for Bert). "
|
||||
" If you have pairs of inputs, you can give a specific truncation strategy selected among "
|
||||
"`truncation='only_first'` (will only truncate the first sentence in the pairs) "
|
||||
"`truncation='only_second'` (will only truncate the second sentence in the pairs) "
|
||||
"or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).",
|
||||
"The `truncation_strategy` argument is deprecated and will be removed in a future version, use"
|
||||
" `truncation=True` to truncate examples to a max length. You can give a specific length with"
|
||||
" `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the maximal input"
|
||||
" size of the model (e.g. 512 for Bert). If you have pairs of inputs, you can give a specific"
|
||||
" truncation strategy selected among `truncation='only_first'` (will only truncate the first"
|
||||
" sentence in the pairs) `truncation='only_second'` (will only truncate the second sentence in the"
|
||||
" pairs) or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence"
|
||||
" in the pairs).",
|
||||
FutureWarning,
|
||||
)
|
||||
truncation_strategy = TruncationStrategy(old_truncation_strategy)
|
||||
@@ -2346,8 +2346,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
if verbose:
|
||||
if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
|
||||
logger.warning(
|
||||
"Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
|
||||
"Default to no padding."
|
||||
"Asking to pad to max_length but no maximum length is provided and the model has no"
|
||||
" predefined maximum length. Default to no padding."
|
||||
)
|
||||
self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
|
||||
padding_strategy = PaddingStrategy.DO_NOT_PAD
|
||||
@@ -2359,8 +2359,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
if verbose:
|
||||
if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
|
||||
logger.warning(
|
||||
"Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
|
||||
"Default to no truncation."
|
||||
"Asking to truncate to max_length but no maximum length is provided and the model has"
|
||||
" no predefined maximum length. Default to no truncation."
|
||||
)
|
||||
self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
|
||||
truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
|
||||
@@ -2384,7 +2384,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
and (max_length % pad_to_multiple_of != 0)
|
||||
):
|
||||
raise ValueError(
|
||||
f"Truncation and padding are both activated but "
|
||||
"Truncation and padding are both activated but "
|
||||
f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
|
||||
)
|
||||
|
||||
@@ -2467,11 +2467,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
if is_batched:
|
||||
if isinstance(text_pair, str):
|
||||
raise TypeError(
|
||||
"when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as `text`."
|
||||
"when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as"
|
||||
" `text`."
|
||||
)
|
||||
if text_pair is not None and len(text) != len(text_pair):
|
||||
raise ValueError(
|
||||
f"batch length of `text`: {len(text)} does not match batch length of `text_pair`: {len(text_pair)}."
|
||||
f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
|
||||
f" {len(text_pair)}."
|
||||
)
|
||||
batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
|
||||
return self.batch_encode_plus(
|
||||
@@ -2826,7 +2828,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
else:
|
||||
raise ValueError(
|
||||
f"type of {first_element} unknown: {type(first_element)}. "
|
||||
f"Should be one of a python, numpy, pytorch or tensorflow object."
|
||||
"Should be one of a python, numpy, pytorch or tensorflow object."
|
||||
)
|
||||
|
||||
for key, value in encoded_inputs.items():
|
||||
@@ -3123,16 +3125,17 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
)
|
||||
if truncation_strategy == TruncationStrategy.ONLY_FIRST:
|
||||
error_msg = (
|
||||
error_msg + "Please select another truncation strategy than "
|
||||
error_msg
|
||||
+ "Please select another truncation strategy than "
|
||||
f"{truncation_strategy}, for instance 'longest_first' or 'only_second'."
|
||||
)
|
||||
logger.error(error_msg)
|
||||
elif truncation_strategy == TruncationStrategy.LONGEST_FIRST:
|
||||
logger.warning(
|
||||
f"Be aware, overflowing tokens are not returned for the setting you have chosen,"
|
||||
"Be aware, overflowing tokens are not returned for the setting you have chosen,"
|
||||
f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' "
|
||||
f"truncation strategy. So the returned list will always be empty even if some "
|
||||
f"tokens have been removed."
|
||||
"truncation strategy. So the returned list will always be empty even if some "
|
||||
"tokens have been removed."
|
||||
)
|
||||
for _ in range(num_tokens_to_remove):
|
||||
if pair_ids is None or len(ids) > len(pair_ids):
|
||||
@@ -3165,7 +3168,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
f"We need to remove {num_tokens_to_remove} to truncate the input "
|
||||
f"but the second sequence has a length {len(pair_ids)}. "
|
||||
f"Please select another truncation strategy than {truncation_strategy}, "
|
||||
f"for instance 'longest_first' or 'only_first'."
|
||||
"for instance 'longest_first' or 'only_first'."
|
||||
)
|
||||
|
||||
return (ids, pair_ids, overflowing_tokens)
|
||||
|
||||
Reference in New Issue
Block a user