update examples after ruff being updated (#36972)

* update * update --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-03-25 18:15:47 +01:00
parent a41677a68b
commit 121830ab47
20 changed files with 42 additions and 45 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -154,7 +154,7 @@ jobs:
                  path: ~/transformers/installed.txt
            - run: python -c "from transformers import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1)
            - run: ruff check examples tests src utils
-            - run: ruff format tests src utils --check
+            - run: ruff format examples tests src utils --check
            - run: python utils/custom_init_isort.py --check_only
            - run: python utils/sort_auto_mappings.py --check_only
            - run: python utils/check_doc_toc.py
--- a/examples/flax/language-modeling/run_bert_flax.py
+++ b/examples/flax/language-modeling/run_bert_flax.py
@@ -53,4 +53,4 @@ for _ in range(nbenchmark):
    func()
 end = time.time()
 print(end - start)
-print(f"Throughput: {((nbenchmark * BS)/(end-start)):.3f} examples/sec")
+print(f"Throughput: {((nbenchmark * BS) / (end - start)):.3f} examples/sec")
--- a/examples/legacy/seq2seq/finetune_trainer.py
+++ b/examples/legacy/seq2seq/finetune_trainer.py
@@ -231,9 +231,9 @@ def main():
    # set decoder_start_token_id for MBart
    if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
-        assert (
+        assert data_args.tgt_lang is not None and data_args.src_lang is not None, (
-            data_args.tgt_lang is not None and data_args.src_lang is not None
+            "mBart requires --tgt_lang and --src_lang"
-        ), "mBart requires --tgt_lang and --src_lang"
+        )
        if isinstance(tokenizer, MBartTokenizer):
            model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.tgt_lang]
        else:
--- a/examples/legacy/seq2seq/run_eval_search.py
+++ b/examples/legacy/seq2seq/run_eval_search.py
@@ -128,7 +128,7 @@ def run_search():
    results_sorted = sorted(results, key=operator.itemgetter(*task_score_names[task]), reverse=True)
    print(" | ".join([f"{col:{col_widths[col]}}" for col in col_names]))
-    print(" | ".join([f"{'-'*col_widths[col]}" for col in col_names]))
+    print(" | ".join([f"{'-' * col_widths[col]}" for col in col_names]))
    for row in results_sorted:
        print(" | ".join([f"{row[col]:{col_widths[col]}}" for col in col_names]))
--- a/examples/legacy/seq2seq/utils.py
+++ b/examples/legacy/seq2seq/utils.py
@@ -282,9 +282,9 @@ class Seq2SeqDataCollator:
        self.tokenizer = tokenizer
        self.pad_token_id = tokenizer.pad_token_id
        self.decoder_start_token_id = decoder_start_token_id
-        assert (
+        assert self.pad_token_id is not None, (
-            self.pad_token_id is not None
+            f"pad_token_id is not defined for ({self.tokenizer.__class__.__name__}), it must be defined."
-        ), f"pad_token_id is not defined for ({self.tokenizer.__class__.__name__}), it must be defined."
+        )
        self.data_args = data_args
        self.tpu_num_cores = tpu_num_cores
        self.dataset_kwargs = {"add_prefix_space": True} if isinstance(tokenizer, BartTokenizer) else {}
@@ -593,7 +593,7 @@ def assert_all_frozen(model):
    model_grads: List[bool] = list(grad_status(model))
    n_require_grad = sum(lmap(int, model_grads))
    npars = len(model_grads)
-    assert not any(model_grads), f"{n_require_grad/npars:.1%} of {npars} weights require grad"
+    assert not any(model_grads), f"{n_require_grad / npars:.1%} of {npars} weights require grad"
 def assert_not_all_frozen(model):
--- a/examples/legacy/token-classification/tasks.py
+++ b/examples/legacy/token-classification/tasks.py
@@ -131,7 +131,7 @@ class POS(TokenClassificationTask):
            s_p = preds_list[example_id]
            out = ""
            for token in sentence:
-                out += f'{token["form"]} ({token["upos"]}|{s_p.pop(0)}) '
+                out += f"{token['form']} ({token['upos']}|{s_p.pop(0)}) "
            out += "\n"
            writer.write(out)
            example_id += 1
--- a/examples/modular-transformers/modeling_multimodal2.py
+++ b/examples/modular-transformers/modeling_multimodal2.py
@@ -534,7 +534,7 @@ class Multimodal2VisionEmbeddings(nn.Module):
        batch_size, _, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
-                f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
+                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
--- a/examples/pytorch/language-modeling/run_clm.py
+++ b/examples/pytorch/language-modeling/run_clm.py
@@ -438,7 +438,7 @@ def main():
    else:
        model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code)
        n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
-        logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
+        logger.info(f"Training new model from scratch - Total size={n_params / 2**20:.2f}M params")
    # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
    # on a small vocab and want a smaller embedding size, remove this test.
--- a/examples/pytorch/language-modeling/run_fim.py
+++ b/examples/pytorch/language-modeling/run_fim.py
@@ -265,8 +265,7 @@ class DataTrainingArguments:
        default="<fim_pad>",
        metadata={
            "help": (
-                "Fill-in-Middle Pad token. Used only when 'truncate_or_pad' is set to True. "
+                "Fill-in-Middle Pad token. Used only when 'truncate_or_pad' is set to True. Defaults to '<fim_pad>'."
-                "Defaults to '<fim_pad>'."
            )
        },
    )
@@ -514,7 +513,7 @@ def main():
            attn_implementation=model_args.attn_implementation,
        )
        n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
-        logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
+        logger.info(f"Training new model from scratch - Total size={n_params / 2**20:.2f}M params")
    # Add the new FIM tokens to the tokenizer and resize model's vocab embeddings
    special_tokens = [data_args.fim_prefix_token, data_args.fim_middle_token, data_args.fim_suffix_token]
--- a/examples/pytorch/language-modeling/run_fim_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py
@@ -234,9 +234,7 @@ def parse_args():
        "--fim_pad_token",
        type=str,
        default="<fim_pad>",
-        help=(
+        help=("Fill-in-Middle Pad token. Used only when 'truncate_or_pad' is set to True. Defaults to '<fim_pad>'."),
-            "Fill-in-Middle Pad token. Used only when 'truncate_or_pad' is set to True." " Defaults to '<fim_pad>'."
-        ),
    )
    parser.add_argument(
        "--preprocessing_num_workers",
--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
@@ -491,7 +491,7 @@ def main():
    # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
    # that could be easily picked up by the model
    chars_to_ignore_regex = (
-        f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
+        f"[{''.join(data_args.chars_to_ignore)}]" if data_args.chars_to_ignore is not None else None
    )
    text_column_name = data_args.text_column_name
--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py
@@ -471,7 +471,7 @@ def main():
    # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
    # that could be easily picked up by the model
    chars_to_ignore_regex = (
-        f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
+        f"[{''.join(data_args.chars_to_ignore)}]" if data_args.chars_to_ignore is not None else None
    )
    text_column_name = data_args.text_column_name
--- a/examples/pytorch/summarization/run_summarization.py
+++ b/examples/pytorch/summarization/run_summarization.py
@@ -505,9 +505,9 @@ def main():
        return
    if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
-        assert (
+        assert data_args.lang is not None, (
-            data_args.lang is not None
+            f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument"
-        ), f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument"
+        )
        tokenizer.src_lang = data_args.lang
        tokenizer.tgt_lang = data_args.lang
--- a/examples/pytorch/text-classification/run_classification.py
+++ b/examples/pytorch/text-classification/run_classification.py
@@ -199,9 +199,9 @@ class DataTrainingArguments:
            train_extension = self.train_file.split(".")[-1]
            assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
            validation_extension = self.validation_file.split(".")[-1]
-            assert (
+            assert validation_extension == train_extension, (
-                validation_extension == train_extension
+                "`validation_file` should have the same extension (csv or json) as `train_file`."
-            ), "`validation_file` should have the same extension (csv or json) as `train_file`."
+            )
@dataclass
@@ -357,9 +357,9 @@ def main():
            if data_args.test_file is not None:
                train_extension = data_args.train_file.split(".")[-1]
                test_extension = data_args.test_file.split(".")[-1]
-                assert (
+                assert test_extension == train_extension, (
-                    test_extension == train_extension
+                    "`test_file` should have the same extension (csv or json) as `train_file`."
-                ), "`test_file` should have the same extension (csv or json) as `train_file`."
+                )
                data_files["test"] = data_args.test_file
            else:
                raise ValueError("Need either a dataset name or a test file for `do_predict`.")
--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@@ -156,9 +156,9 @@ class DataTrainingArguments:
            train_extension = self.train_file.split(".")[-1]
            assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
            validation_extension = self.validation_file.split(".")[-1]
-            assert (
+            assert validation_extension == train_extension, (
-                validation_extension == train_extension
+                "`validation_file` should have the same extension (csv or json) as `train_file`."
-            ), "`validation_file` should have the same extension (csv or json) as `train_file`."
+            )
@dataclass
@@ -313,9 +313,9 @@ def main():
            if data_args.test_file is not None:
                train_extension = data_args.train_file.split(".")[-1]
                test_extension = data_args.test_file.split(".")[-1]
-                assert (
+                assert test_extension == train_extension, (
-                    test_extension == train_extension
+                    "`test_file` should have the same extension (csv or json) as `train_file`."
-                ), "`test_file` should have the same extension (csv or json) as `train_file`."
+                )
                data_files["test"] = data_args.test_file
            else:
                raise ValueError("Need either a GLUE task or a test file for `do_predict`.")
--- a/examples/pytorch/text-generation/run_generation.py
+++ b/examples/pytorch/text-generation/run_generation.py
@@ -322,7 +322,7 @@ def main():
    parser.add_argument(
        "--use_cpu",
        action="store_true",
-        help="Whether or not to use cpu. If set to False, " "we will use gpu/npu or mps device if available",
+        help="Whether or not to use cpu. If set to False, we will use gpu/npu or mps device if available",
    )
    parser.add_argument("--num_return_sequences", type=int, default=1, help="The number of samples to generate.")
    parser.add_argument(
--- a/examples/pytorch/text-generation/run_generation_contrastive_search.py
+++ b/examples/pytorch/text-generation/run_generation_contrastive_search.py
@@ -68,7 +68,7 @@ def main():
    parser.add_argument(
        "--use_cpu",
        action="store_true",
-        help="Whether or not to use cpu. If set to False, " "we will use gpu/npu or mps device if available",
+        help="Whether or not to use cpu. If set to False, we will use gpu/npu or mps device if available",
    )
    parser.add_argument(
        "--fp16",
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@@ -436,9 +436,9 @@ def main():
    # Set decoder_start_token_id
    if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
-        assert (
+        assert args.target_lang is not None and args.source_lang is not None, (
-            args.target_lang is not None and args.source_lang is not None
+            "mBart requires --target_lang and --source_lang"
-        ), "mBart requires --target_lang and --source_lang"
+        )
        if isinstance(tokenizer, MBartTokenizer):
            model.config.decoder_start_token_id = tokenizer.lang_code_to_id[args.target_lang]
        else:
--- a/examples/run_on_remote.py
+++ b/examples/run_on_remote.py
@@ -56,7 +56,7 @@ if __name__ == "__main__":
    cluster.run(["pip install torch --upgrade --extra-index-url https://download.pytorch.org/whl/cu117"])
    # Run example. You can bypass the CLI wrapper and paste your own code here.
-    cluster.run([f'python transformers/examples/{args.example} {" ".join(shlex.quote(arg) for arg in unknown)}'])
+    cluster.run([f"python transformers/examples/{args.example} {' '.join(shlex.quote(arg) for arg in unknown)}"])
    # Alternatively, we can just import and run a training function (especially if there's no wrapper CLI):
    # from my_script... import train
--- a/examples/tensorflow/translation/run_translation.py
+++ b/examples/tensorflow/translation/run_translation.py
@@ -501,9 +501,9 @@ def main():
        # region Set decoder_start_token_id
        if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
-            assert (
+            assert data_args.target_lang is not None and data_args.source_lang is not None, (
-                data_args.target_lang is not None and data_args.source_lang is not None
+                "mBart requires --target_lang and --source_lang"
-            ), "mBart requires --target_lang and --source_lang"
+            )
            if isinstance(tokenizer, MBartTokenizer):
                model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.target_lang]
            else: