Update quality tooling for formatting (#21480)

* Result of black 23.1

* Update target to Python 3.7

* Switch flake8 to ruff

* Configure isort

* Configure isort

* Apply isort with line limit

* Use the correct black version

* Adapt black in check copies

* Fix copies
This commit is contained in:
Sylvain Gugger
2023-02-06 18:10:56 -05:00
committed by GitHub
parent b7bb2b59f7
commit 6f79d26442
1211 changed files with 1532 additions and 2687 deletions

View File

@@ -1824,7 +1824,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
cache_dir=None,
local_files_only=False,
_commit_hash=None,
**kwargs
**kwargs,
):
# We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
# file or if `from_slow` is set to True.
@@ -1932,7 +1932,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path]
if model_max_length is not None and isinstance(model_max_length, (int, float)):
model_max_length = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length)
# TODO(PVP) - uncomment following line in Transformers v5
# init_kwargs["model_max_length"] = model_max_length
@@ -2278,7 +2277,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
max_length: Optional[int] = None,
stride: int = 0,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs
**kwargs,
) -> List[int]:
"""
Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.
@@ -2474,7 +2473,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
**kwargs,
) -> BatchEncoding:
"""
Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
@@ -2558,7 +2557,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
**kwargs,
) -> BatchEncoding:
# Input type checking for clearer error
def _is_valid_text_input(t):
@@ -2671,7 +2670,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
**kwargs,
) -> BatchEncoding:
"""
Tokenize and prepare for the model a sequence or a pair of sequences.
@@ -2743,7 +2742,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
**kwargs,
) -> BatchEncoding:
raise NotImplementedError
@@ -2773,7 +2772,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
**kwargs,
) -> BatchEncoding:
"""
Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.
@@ -2846,7 +2845,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
**kwargs,
) -> BatchEncoding:
raise NotImplementedError
@@ -3083,7 +3082,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
return_length: bool = False,
verbose: bool = True,
prepend_batch_axis: bool = False,
**kwargs
**kwargs,
) -> BatchEncoding:
"""
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
@@ -3271,8 +3270,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
)
if truncation_strategy == TruncationStrategy.ONLY_FIRST:
error_msg = (
error_msg
+ "Please select another truncation strategy than "
error_msg + "Please select another truncation strategy than "
f"{truncation_strategy}, for instance 'longest_first' or 'only_second'."
)
logger.error(error_msg)
@@ -3373,7 +3371,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
if self.padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
encoded_inputs["token_type_ids"] = (
@@ -3415,7 +3412,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = True,
**kwargs
**kwargs,
) -> List[str]:
"""
Convert a list of lists of token ids into a list of strings by calling decode.
@@ -3448,7 +3445,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = True,
**kwargs
**kwargs,
) -> str:
"""
Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
@@ -3484,7 +3481,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
token_ids: Union[int, List[int]],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = True,
**kwargs
**kwargs,
) -> str:
raise NotImplementedError