diff --git a/examples/flax/image-captioning/run_image_captioning_flax.py b/examples/flax/image-captioning/run_image_captioning_flax.py
index cb7dad8d58..414e1c62e3 100644
--- a/examples/flax/image-captioning/run_image_captioning_flax.py
+++ b/examples/flax/image-captioning/run_image_captioning_flax.py
@@ -906,7 +906,7 @@ def main():
         layer_norm_named_params = {
             layer[-2:]
             for layer_norm_name in layer_norm_candidates
-            for layer in flat_params.keys()
+            for layer in flat_params
             if layer_norm_name in "".join(layer).lower()
         }
         flat_mask = {path: (path[-1] != "bias" and path[-2:] not in layer_norm_named_params) for path in flat_params}
diff --git a/examples/flax/language-modeling/run_bart_dlm_flax.py b/examples/flax/language-modeling/run_bart_dlm_flax.py
index f3e27be1cc..2ddadcdf99 100644
--- a/examples/flax/language-modeling/run_bart_dlm_flax.py
+++ b/examples/flax/language-modeling/run_bart_dlm_flax.py
@@ -530,7 +530,7 @@ def main():
             trust_remote_code=data_args.trust_remote_code,
         )
 
-        if "validation" not in datasets.keys():
+        if "validation" not in datasets:
             datasets["validation"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
@@ -567,7 +567,7 @@ def main():
             num_proc=data_args.preprocessing_num_workers,
         )
 
-        if "validation" not in datasets.keys():
+        if "validation" not in datasets:
             datasets["validation"] = load_dataset(
                 extension,
                 data_files=data_files,
@@ -671,7 +671,7 @@ def main():
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -777,7 +777,7 @@ def main():
         layer_norm_named_params = {
             layer[-2:]
             for layer_norm_name in layer_norm_candidates
-            for layer in flat_params.keys()
+            for layer in flat_params
             if layer_norm_name in "".join(layer).lower()
         }
         flat_mask = {path: (path[-1] != "bias" and path[-2:] not in layer_norm_named_params) for path in flat_params}
diff --git a/examples/flax/language-modeling/run_clm_flax.py b/examples/flax/language-modeling/run_clm_flax.py
index 8d2daaf517..fc1367fb6c 100755
--- a/examples/flax/language-modeling/run_clm_flax.py
+++ b/examples/flax/language-modeling/run_clm_flax.py
@@ -407,7 +407,7 @@ def main():
             trust_remote_code=model_args.trust_remote_code,
         )
 
-        if "validation" not in dataset.keys():
+        if "validation" not in dataset:
             dataset["validation"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
@@ -447,7 +447,7 @@ def main():
             num_proc=data_args.preprocessing_num_workers,
         )
 
-        if "validation" not in dataset.keys():
+        if "validation" not in dataset:
             dataset["validation"] = load_dataset(
                 extension,
                 data_files=data_files,
@@ -580,7 +580,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -674,7 +674,7 @@ def main():
         layer_norm_named_params = {
             layer[-2:]
             for layer_norm_name in layer_norm_candidates
-            for layer in flat_params.keys()
+            for layer in flat_params
             if layer_norm_name in "".join(layer).lower()
         }
         flat_mask = {path: (path[-1] != "bias" and path[-2:] not in layer_norm_named_params) for path in flat_params}
diff --git a/examples/flax/language-modeling/run_mlm_flax.py b/examples/flax/language-modeling/run_mlm_flax.py
index df548de619..0d80b7b0bf 100755
--- a/examples/flax/language-modeling/run_mlm_flax.py
+++ b/examples/flax/language-modeling/run_mlm_flax.py
@@ -448,7 +448,7 @@ def main():
             trust_remote_code=model_args.trust_remote_code,
         )
 
-        if "validation" not in datasets.keys():
+        if "validation" not in datasets:
             datasets["validation"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
@@ -485,7 +485,7 @@ def main():
             num_proc=data_args.preprocessing_num_workers,
         )
 
-        if "validation" not in datasets.keys():
+        if "validation" not in datasets:
             datasets["validation"] = load_dataset(
                 extension,
                 data_files=data_files,
@@ -603,7 +603,7 @@ def main():
         # max_seq_length.
         def group_texts(examples):
             # Concatenate all texts.
-            concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+            concatenated_examples = {k: list(chain(*examples[k])) for k in examples}
             total_length = len(concatenated_examples[list(examples.keys())[0]])
             # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
             # customize this part to your needs.
@@ -707,7 +707,7 @@ def main():
         layer_norm_named_params = {
             layer[-2:]
             for layer_norm_name in layer_norm_candidates
-            for layer in flat_params.keys()
+            for layer in flat_params
             if layer_norm_name in "".join(layer).lower()
         }
         flat_mask = {path: (path[-1] != "bias" and path[-2:] not in layer_norm_named_params) for path in flat_params}
diff --git a/examples/flax/language-modeling/run_t5_mlm_flax.py b/examples/flax/language-modeling/run_t5_mlm_flax.py
index 9a64b6d716..af3394cccb 100755
--- a/examples/flax/language-modeling/run_t5_mlm_flax.py
+++ b/examples/flax/language-modeling/run_t5_mlm_flax.py
@@ -572,7 +572,7 @@ def main():
             trust_remote_code=data_args.trust_remote_code,
         )
 
-        if "validation" not in datasets.keys():
+        if "validation" not in datasets:
             datasets["validation"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
@@ -609,7 +609,7 @@ def main():
             num_proc=data_args.preprocessing_num_workers,
         )
 
-        if "validation" not in datasets.keys():
+        if "validation" not in datasets:
             datasets["validation"] = load_dataset(
                 extension,
                 data_files=data_files,
@@ -703,7 +703,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -814,7 +814,7 @@ def main():
         layer_norm_named_params = {
             layer[-2:]
             for layer_norm_name in layer_norm_candidates
-            for layer in flat_params.keys()
+            for layer in flat_params
             if layer_norm_name in "".join(layer).lower()
         }
         flat_mask = {path: (path[-1] != "bias" and path[-2:] not in layer_norm_named_params) for path in flat_params}
diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py
index 55fb4d9c62..d17304c702 100644
--- a/examples/flax/question-answering/run_qa.py
+++ b/examples/flax/question-answering/run_qa.py
@@ -345,7 +345,7 @@ def create_train_state(
         layer_norm_named_params = {
             layer[-2:]
             for layer_norm_name in layer_norm_candidates
-            for layer in flat_params.keys()
+            for layer in flat_params
             if layer_norm_name in "".join(layer).lower()
         }
         flat_mask = {path: (path[-1] != "bias" and path[-2:] not in layer_norm_named_params) for path in flat_params}
diff --git a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py
index a1b7519b9e..40267a95c6 100644
--- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py
+++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py
@@ -668,7 +668,7 @@ def main():
         layer_norm_named_params = {
             layer[-2:]
             for layer_norm_name in layer_norm_candidates
-            for layer in flat_params.keys()
+            for layer in flat_params
             if layer_norm_name in "".join(layer).lower()
         }
         flat_mask = {path: (path[-1] != "bias" and path[-2:] not in layer_norm_named_params) for path in flat_params}
diff --git a/examples/flax/summarization/run_summarization_flax.py b/examples/flax/summarization/run_summarization_flax.py
index 6c9777cf32..5240db323f 100644
--- a/examples/flax/summarization/run_summarization_flax.py
+++ b/examples/flax/summarization/run_summarization_flax.py
@@ -768,7 +768,7 @@ def main():
         layer_norm_named_params = {
             layer[-2:]
             for layer_norm_name in layer_norm_candidates
-            for layer in flat_params.keys()
+            for layer in flat_params
             if layer_norm_name in "".join(layer).lower()
         }
         flat_mask = {path: (path[-1] != "bias" and path[-2:] not in layer_norm_named_params) for path in flat_params}
diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py
index 2f84235945..ade6bc0e49 100755
--- a/examples/flax/text-classification/run_flax_glue.py
+++ b/examples/flax/text-classification/run_flax_glue.py
@@ -249,7 +249,7 @@ def create_train_state(
         layer_norm_named_params = {
             layer[-2:]
             for layer_norm_name in layer_norm_candidates
-            for layer in flat_params.keys()
+            for layer in flat_params
             if layer_norm_name in "".join(layer).lower()
         }
         flat_mask = {path: (path[-1] != "bias" and path[-2:] not in layer_norm_named_params) for path in flat_params}
diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py
index ccdabd1c71..93130cc52c 100644
--- a/examples/flax/token-classification/run_flax_ner.py
+++ b/examples/flax/token-classification/run_flax_ner.py
@@ -310,7 +310,7 @@ def create_train_state(
         layer_norm_named_params = {
             layer[-2:]
             for layer_norm_name in layer_norm_candidates
-            for layer in flat_params.keys()
+            for layer in flat_params
             if layer_norm_name in "".join(layer).lower()
         }
         flat_mask = {path: (path[-1] != "bias" and path[-2:] not in layer_norm_named_params) for path in flat_params}
diff --git a/examples/legacy/pytorch-lightning/lightning_base.py b/examples/legacy/pytorch-lightning/lightning_base.py
index 5c34fbe7e0..228fc87ca3 100644
--- a/examples/legacy/pytorch-lightning/lightning_base.py
+++ b/examples/legacy/pytorch-lightning/lightning_base.py
@@ -379,8 +379,8 @@ def generic_train(
         train_params["distributed_backend"] = "ddp"
 
     train_params["accumulate_grad_batches"] = args.accumulate_grad_batches
-    train_params["accelerator"] = extra_train_kwargs.get("accelerator", None)
-    train_params["profiler"] = extra_train_kwargs.get("profiler", None)
+    train_params["accelerator"] = extra_train_kwargs.get("accelerator")
+    train_params["profiler"] = extra_train_kwargs.get("profiler")
 
     trainer = pl.Trainer.from_argparse_args(
         args,
diff --git a/examples/legacy/seq2seq/download_wmt.py b/examples/legacy/seq2seq/download_wmt.py
index c52c0c7b4f..2cff6f5992 100755
--- a/examples/legacy/seq2seq/download_wmt.py
+++ b/examples/legacy/seq2seq/download_wmt.py
@@ -44,7 +44,7 @@ def download_wmt_dataset(src_lang="ro", tgt_lang="en", dataset="wmt16", save_dir
     save_dir = Path(save_dir)
     save_dir.mkdir(exist_ok=True)
 
-    for split in ds.keys():
+    for split in ds:
         print(f"Splitting {split} with {ds[split].num_rows} records")
 
         # to save to val.source, val.target like summary datasets
diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py
index 96f8a83bc3..10baf5f8a0 100755
--- a/examples/pytorch/image-classification/run_image_classification.py
+++ b/examples/pytorch/image-classification/run_image_classification.py
@@ -288,7 +288,7 @@ def main():
         return {"pixel_values": pixel_values, "labels": labels}
 
     # If we don't have a validation split, split off a percentage of train as validation.
-    data_args.train_val_split = None if "validation" in dataset.keys() else data_args.train_val_split
+    data_args.train_val_split = None if "validation" in dataset else data_args.train_val_split
     if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0:
         split = dataset["train"].train_test_split(data_args.train_val_split)
         dataset["train"] = split["train"]
diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
index a44959812c..e27605b8ed 100644
--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@@ -324,7 +324,7 @@ def main():
         )
 
     # If we don't have a validation split, split off a percentage of train as validation.
-    args.train_val_split = None if "validation" in dataset.keys() else args.train_val_split
+    args.train_val_split = None if "validation" in dataset else args.train_val_split
     if isinstance(args.train_val_split, float) and args.train_val_split > 0.0:
         split = dataset["train"].train_test_split(args.train_val_split)
         dataset["train"] = split["train"]
diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py
index f9102b27cd..1ae581b223 100644
--- a/examples/pytorch/image-pretraining/run_mae.py
+++ b/examples/pytorch/image-pretraining/run_mae.py
@@ -247,7 +247,7 @@ def main():
     )
 
     # If we don't have a validation split, split off a percentage of train as validation.
-    data_args.train_val_split = None if "validation" in ds.keys() else data_args.train_val_split
+    data_args.train_val_split = None if "validation" in ds else data_args.train_val_split
     if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0:
         split = ds["train"].train_test_split(data_args.train_val_split)
         ds["train"] = split["train"]
diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py
index 538db2038f..f3d47bfff3 100644
--- a/examples/pytorch/image-pretraining/run_mim.py
+++ b/examples/pytorch/image-pretraining/run_mim.py
@@ -311,7 +311,7 @@ def main():
     )
 
     # If we don't have a validation split, split off a percentage of train as validation.
-    data_args.train_val_split = None if "validation" in ds.keys() else data_args.train_val_split
+    data_args.train_val_split = None if "validation" in ds else data_args.train_val_split
     if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0:
         split = ds["train"].train_test_split(data_args.train_val_split)
         ds["train"] = split["train"]
diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py
index 429ad7d1c6..581a610137 100644
--- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py
+++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py
@@ -452,7 +452,7 @@ def main():
     )
 
     # If we don't have a validation split, split off a percentage of train as validation.
-    args.train_val_split = None if "validation" in ds.keys() else args.train_val_split
+    args.train_val_split = None if "validation" in ds else args.train_val_split
     if isinstance(args.train_val_split, float) and args.train_val_split > 0.0:
         split = ds["train"].train_test_split(args.train_val_split)
         ds["train"] = split["train"]
diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py
index 28ebaf9a33..0c5829818d 100755
--- a/examples/pytorch/language-modeling/run_clm.py
+++ b/examples/pytorch/language-modeling/run_clm.py
@@ -358,7 +358,7 @@ def main():
             streaming=data_args.streaming,
             trust_remote_code=model_args.trust_remote_code,
         )
-        if "validation" not in raw_datasets.keys():
+        if "validation" not in raw_datasets:
             if data_args.streaming:
                 dataset_stream = load_dataset(
                     data_args.dataset_name,
@@ -412,7 +412,7 @@ def main():
             **dataset_args,
         )
         # If no validation data is there, validation_split_percentage will be used to divide the dataset.
-        if "validation" not in raw_datasets.keys():
+        if "validation" not in raw_datasets:
             if data_args.streaming:
                 dataset_stream = load_dataset(
                     extension,
@@ -579,7 +579,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, and if the total_length < block_size  we exclude this batch and return an empty dict.
         # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index f5ac527e84..0c397bc28c 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -335,7 +335,7 @@ def main():
         raw_datasets = load_dataset(
             args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
         )
-        if "validation" not in raw_datasets.keys():
+        if "validation" not in raw_datasets:
             raw_datasets["validation"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
@@ -362,7 +362,7 @@ def main():
             dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks
         raw_datasets = load_dataset(extension, data_files=data_files, **dataset_args)
         # If no validation data is there, validation_split_percentage will be used to divide the dataset.
-        if "validation" not in raw_datasets.keys():
+        if "validation" not in raw_datasets:
             raw_datasets["validation"] = load_dataset(
                 extension,
                 data_files=data_files,
@@ -465,7 +465,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, and if the total_length < block_size  we exclude this batch and return an empty dict.
         # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
diff --git a/examples/pytorch/language-modeling/run_fim.py b/examples/pytorch/language-modeling/run_fim.py
index 7e82238e6e..bce3adabfc 100644
--- a/examples/pytorch/language-modeling/run_fim.py
+++ b/examples/pytorch/language-modeling/run_fim.py
@@ -388,7 +388,7 @@ def main():
             streaming=data_args.streaming,
             trust_remote_code=model_args.trust_remote_code,
         )
-        if "validation" not in raw_datasets.keys():
+        if "validation" not in raw_datasets:
             raw_datasets["validation"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
@@ -430,7 +430,7 @@ def main():
             **dataset_args,
         )
         # If no validation data is there, validation_split_percentage will be used to divide the dataset.
-        if "validation" not in raw_datasets.keys():
+        if "validation" not in raw_datasets:
             raw_datasets["validation"] = load_dataset(
                 extension,
                 data_files=data_files,
@@ -652,7 +652,7 @@ def main():
     # Data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, and if the total_length < block_size  we exclude this batch and return an empty dict.
         # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
diff --git a/examples/pytorch/language-modeling/run_fim_no_trainer.py b/examples/pytorch/language-modeling/run_fim_no_trainer.py
index 400dd643a5..6e5cc427f4 100644
--- a/examples/pytorch/language-modeling/run_fim_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py
@@ -401,7 +401,7 @@ def main():
         raw_datasets = load_dataset(
             args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
         )
-        if "validation" not in raw_datasets.keys():
+        if "validation" not in raw_datasets:
             raw_datasets["validation"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
@@ -427,7 +427,7 @@ def main():
             dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks
         raw_datasets = load_dataset(extension, data_files=data_files, **dataset_args)
         # If no validation data is there, validation_split_percentage will be used to divide the dataset.
-        if "validation" not in raw_datasets.keys():
+        if "validation" not in raw_datasets:
             raw_datasets["validation"] = load_dataset(
                 extension,
                 data_files=data_files,
@@ -599,7 +599,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, and if the total_length < block_size  we exclude this batch and return an empty dict.
         # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py
index 57f8a34189..4573d343a7 100755
--- a/examples/pytorch/language-modeling/run_mlm.py
+++ b/examples/pytorch/language-modeling/run_mlm.py
@@ -331,7 +331,7 @@ def main():
             streaming=data_args.streaming,
             trust_remote_code=model_args.trust_remote_code,
         )
-        if "validation" not in raw_datasets.keys():
+        if "validation" not in raw_datasets:
             raw_datasets["validation"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
@@ -368,7 +368,7 @@ def main():
         )
 
         # If no validation data is there, validation_split_percentage will be used to divide the dataset.
-        if "validation" not in raw_datasets.keys():
+        if "validation" not in raw_datasets:
             raw_datasets["validation"] = load_dataset(
                 extension,
                 data_files=data_files,
@@ -541,7 +541,7 @@ def main():
         # max_seq_length.
         def group_texts(examples):
             # Concatenate all texts.
-            concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+            concatenated_examples = {k: list(chain(*examples[k])) for k in examples}
             total_length = len(concatenated_examples[list(examples.keys())[0]])
             # We drop the small remainder, and if the total_length < max_seq_length  we exclude this batch and return an empty dict.
             # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index 559194aaa9..d975e1acf6 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -342,7 +342,7 @@ def main():
         raw_datasets = load_dataset(
             args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
         )
-        if "validation" not in raw_datasets.keys():
+        if "validation" not in raw_datasets:
             raw_datasets["validation"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
@@ -367,7 +367,7 @@ def main():
             extension = "text"
         raw_datasets = load_dataset(extension, data_files=data_files)
         # If no validation data is there, validation_split_percentage will be used to divide the dataset.
-        if "validation" not in raw_datasets.keys():
+        if "validation" not in raw_datasets:
             raw_datasets["validation"] = load_dataset(
                 extension,
                 data_files=data_files,
@@ -496,7 +496,7 @@ def main():
         # max_seq_length.
         def group_texts(examples):
             # Concatenate all texts.
-            concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+            concatenated_examples = {k: list(chain(*examples[k])) for k in examples}
             total_length = len(concatenated_examples[list(examples.keys())[0]])
             # We drop the small remainder, and if the total_length < max_seq_length  we exclude this batch and return an empty dict.
             # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py
index 31c70d6a57..d7bb35d595 100755
--- a/examples/pytorch/language-modeling/run_plm.py
+++ b/examples/pytorch/language-modeling/run_plm.py
@@ -309,7 +309,7 @@ def main():
             token=model_args.token,
             trust_remote_code=data_args.trust_remote_code,
         )
-        if "validation" not in raw_datasets.keys():
+        if "validation" not in raw_datasets:
             raw_datasets["validation"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
@@ -338,7 +338,7 @@ def main():
             extension = "text"
         raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
         # If no validation data is there, validation_split_percentage will be used to divide the dataset.
-        if "validation" not in raw_datasets.keys():
+        if "validation" not in raw_datasets:
             raw_datasets["validation"] = load_dataset(
                 extension,
                 data_files=data_files,
@@ -466,7 +466,7 @@ def main():
         # max_seq_length.
         def group_texts(examples):
             # Concatenate all texts.
-            concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+            concatenated_examples = {k: list(chain(*examples[k])) for k in examples}
             total_length = len(concatenated_examples[list(examples.keys())[0]])
             # We drop the small remainder, and if the total_length < max_seq_length  we exclude this batch and return an empty dict.
             # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
index 60467170ca..d99cd869f0 100755
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@@ -316,7 +316,7 @@ def main():
         raw_datasets = load_dataset(extension, data_files=data_files)
     # Trim a number of training examples
     if args.debug:
-        for split in raw_datasets.keys():
+        for split in raw_datasets:
             raw_datasets[split] = raw_datasets[split].select(range(100))
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.
diff --git a/examples/pytorch/object-detection/run_object_detection.py b/examples/pytorch/object-detection/run_object_detection.py
index f97be95d0b..71fff54cca 100644
--- a/examples/pytorch/object-detection/run_object_detection.py
+++ b/examples/pytorch/object-detection/run_object_detection.py
@@ -403,7 +403,7 @@ def main():
     )
 
     # If we don't have a validation split, split off a percentage of train as validation
-    data_args.train_val_split = None if "validation" in dataset.keys() else data_args.train_val_split
+    data_args.train_val_split = None if "validation" in dataset else data_args.train_val_split
     if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0:
         split = dataset["train"].train_test_split(data_args.train_val_split, seed=training_args.seed)
         dataset["train"] = split["train"]
diff --git a/examples/pytorch/object-detection/run_object_detection_no_trainer.py b/examples/pytorch/object-detection/run_object_detection_no_trainer.py
index 805cc18d89..aaf54138f0 100644
--- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py
+++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py
@@ -465,7 +465,7 @@ def main():
     dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir, trust_remote_code=args.trust_remote_code)
 
     # If we don't have a validation split, split off a percentage of train as validation.
-    args.train_val_split = None if "validation" in dataset.keys() else args.train_val_split
+    args.train_val_split = None if "validation" in dataset else args.train_val_split
     if isinstance(args.train_val_split, float) and args.train_val_split > 0.0:
         split = dataset["train"].train_test_split(args.train_val_split, seed=args.seed)
         dataset["train"] = split["train"]
diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
index 9db580c376..3facad307e 100644
--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
@@ -255,7 +255,7 @@ def main():
         dataset = dataset.rename_columns({"annotation": "label"})
 
     # If we don't have a validation split, split off a percentage of train as validation.
-    data_args.train_val_split = None if "validation" in dataset.keys() else data_args.train_val_split
+    data_args.train_val_split = None if "validation" in dataset else data_args.train_val_split
     if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0:
         split = dataset["train"].train_test_split(data_args.train_val_split)
         dataset["train"] = split["train"]
diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
index d30e72cb64..3d2caf88bf 100644
--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
@@ -314,7 +314,7 @@ def main():
         dataset = dataset.rename_columns({"annotation": "label"})
 
     # If we don't have a validation split, split off a percentage of train as validation.
-    args.train_val_split = None if "validation" in dataset.keys() else args.train_val_split
+    args.train_val_split = None if "validation" in dataset else args.train_val_split
     if isinstance(args.train_val_split, float) and args.train_val_split > 0.0:
         split = dataset["train"].train_test_split(args.train_val_split)
         dataset["train"] = split["train"]
diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py
index c146d4c800..80c616ee74 100755
--- a/examples/pytorch/text-classification/run_classification.py
+++ b/examples/pytorch/text-classification/run_classification.py
@@ -378,7 +378,7 @@ def main():
             else:
                 raise ValueError("Need either a dataset name or a test file for `do_predict`.")
 
-        for key in data_files.keys():
+        for key in data_files:
             logger.info(f"load a local file for {key}: {data_files[key]}")
 
         if data_args.train_file.endswith(".csv"):
@@ -422,13 +422,13 @@ def main():
         raw_datasets.pop(data_args.test_split_name)
 
     if data_args.remove_columns is not None:
-        for split in raw_datasets.keys():
+        for split in raw_datasets:
             for column in data_args.remove_columns.split(","):
                 logger.info(f"removing column {column} from split {split}")
                 raw_datasets[split] = raw_datasets[split].remove_columns(column)
 
     if data_args.label_column_name is not None and data_args.label_column_name != "label":
-        for key in raw_datasets.keys():
+        for key in raw_datasets:
             raw_datasets[key] = raw_datasets[key].rename_column(data_args.label_column_name, "label")
 
     # Trying to have good defaults here, don't hesitate to tweak to your needs.
@@ -444,7 +444,7 @@ def main():
         label_list = None
         num_labels = 1
         # regression requires float as label type, let's cast it if needed
-        for split in raw_datasets.keys():
+        for split in raw_datasets:
             if raw_datasets[split].features["label"].dtype not in ["float32", "float64"]:
                 logger.warning(
                     f"Label type for {split} set to float32, was {raw_datasets[split].features['label'].dtype}"
diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py
index 587f75084e..63b3b9ab8f 100755
--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@@ -161,7 +161,7 @@ class DataTrainingArguments:
     def __post_init__(self):
         if self.task_name is not None:
             self.task_name = self.task_name.lower()
-            if self.task_name not in task_to_keys.keys():
+            if self.task_name not in task_to_keys:
                 raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys()))
         elif self.dataset_name is not None:
             pass
@@ -335,7 +335,7 @@ def main():
             else:
                 raise ValueError("Need either a GLUE task or a test file for `do_predict`.")
 
-        for key in data_files.keys():
+        for key in data_files:
             logger.info(f"load a local file for {key}: {data_files[key]}")
 
         if data_args.train_file.endswith(".csv"):
diff --git a/examples/pytorch/text-generation/run_generation.py b/examples/pytorch/text-generation/run_generation.py
index cea4c881ea..f89ca96eef 100755
--- a/examples/pytorch/text-generation/run_generation.py
+++ b/examples/pytorch/text-generation/run_generation.py
@@ -370,7 +370,7 @@ def main():
     prompt_text = args.prompt if args.prompt else input("Model prompt >>> ")
 
     # Different models need different input formatting and/or extra arguments
-    requires_preprocessing = args.model_type in PREPROCESSING_FUNCTIONS.keys()
+    requires_preprocessing = args.model_type in PREPROCESSING_FUNCTIONS
     if requires_preprocessing:
         prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type)
         preprocessed_prompt_text = prepare_input(args, model, tokenizer, prompt_text)
diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py
index c87a3f29fd..841337d676 100755
--- a/examples/pytorch/token-classification/run_ner_no_trainer.py
+++ b/examples/pytorch/token-classification/run_ner_no_trainer.py
@@ -357,7 +357,7 @@ def main():
         raw_datasets = load_dataset(extension, data_files=data_files)
     # Trim a number of training examples
     if args.debug:
-        for split in raw_datasets.keys():
+        for split in raw_datasets:
             raw_datasets[split] = raw_datasets[split].select(range(100))
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.
diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py
index 5339e77c22..097ef4c67d 100644
--- a/examples/tensorflow/image-classification/run_image_classification.py
+++ b/examples/tensorflow/image-classification/run_image_classification.py
@@ -327,7 +327,7 @@ def main():
     )
 
     # If we don't have a validation split, split off a percentage of train as validation.
-    data_args.train_val_split = None if "validation" in dataset.keys() else data_args.train_val_split
+    data_args.train_val_split = None if "validation" in dataset else data_args.train_val_split
     if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0:
         split = dataset["train"].train_test_split(data_args.train_val_split)
         dataset["train"] = split["train"]
diff --git a/examples/tensorflow/language-modeling-tpu/prepare_tfrecord_shards.py b/examples/tensorflow/language-modeling-tpu/prepare_tfrecord_shards.py
index aa90a9db52..a839c79a5c 100644
--- a/examples/tensorflow/language-modeling-tpu/prepare_tfrecord_shards.py
+++ b/examples/tensorflow/language-modeling-tpu/prepare_tfrecord_shards.py
@@ -151,7 +151,7 @@ def main(args):
 
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: sum(examples[k], []) for k in examples}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, though you could add padding instead if the model supports it
         # In this, as in all things, we advise you to follow your heart 🫀
diff --git a/examples/tensorflow/language-modeling/run_clm.py b/examples/tensorflow/language-modeling/run_clm.py
index 0d776229d2..28a955734b 100755
--- a/examples/tensorflow/language-modeling/run_clm.py
+++ b/examples/tensorflow/language-modeling/run_clm.py
@@ -299,7 +299,7 @@ def main():
             token=model_args.token,
             trust_remote_code=model_args.trust_remote_code,
         )
-        if "validation" not in raw_datasets.keys():
+        if "validation" not in raw_datasets:
             raw_datasets["validation"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
@@ -339,7 +339,7 @@ def main():
             **dataset_args,
         )
         # If no validation data is there, validation_split_percentage will be used to divide the dataset.
-        if "validation" not in raw_datasets.keys():
+        if "validation" not in raw_datasets:
             raw_datasets["validation"] = load_dataset(
                 extension,
                 data_files=data_files,
@@ -429,7 +429,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
diff --git a/examples/tensorflow/language-modeling/run_mlm.py b/examples/tensorflow/language-modeling/run_mlm.py
index 5dcbd35729..ef2c43c69e 100755
--- a/examples/tensorflow/language-modeling/run_mlm.py
+++ b/examples/tensorflow/language-modeling/run_mlm.py
@@ -308,7 +308,7 @@ def main():
             token=model_args.token,
             trust_remote_code=model_args.trust_remote_code,
         )
-        if "validation" not in raw_datasets.keys():
+        if "validation" not in raw_datasets:
             raw_datasets["validation"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
@@ -446,7 +446,7 @@ def main():
         # max_seq_length.
         def group_texts(examples):
             # Concatenate all texts.
-            concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+            concatenated_examples = {k: list(chain(*examples[k])) for k in examples}
             total_length = len(concatenated_examples[list(examples.keys())[0]])
             # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
             # customize this part to your needs.
diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py
index a42d5966b6..da09963028 100644
--- a/examples/tensorflow/text-classification/run_glue.py
+++ b/examples/tensorflow/text-classification/run_glue.py
@@ -132,7 +132,7 @@ class DataTrainingArguments:
 
     def __post_init__(self):
         self.task_name = self.task_name.lower()
-        if self.task_name not in task_to_keys.keys():
+        if self.task_name not in task_to_keys:
             raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys()))
 
 
@@ -268,7 +268,7 @@ def main():
 
         data_files = {"data": data_args.predict_file}
 
-        for key in data_files.keys():
+        for key in data_files:
             logger.info(f"Loading a local file for {key}: {data_files[key]}")
 
         if data_args.predict_file.endswith(".csv"):
@@ -406,7 +406,7 @@ def main():
             "test_mismatched": data_args.max_predict_samples,
             "user_data": None,
         }
-        for key in datasets.keys():
+        for key in datasets:
             if key == "train" or key.startswith("validation"):
                 assert "label" in datasets[key].features, f"Missing labels from {key} data!"
             if key == "train":
diff --git a/examples/tensorflow/text-classification/run_text_classification.py b/examples/tensorflow/text-classification/run_text_classification.py
index 7546c7bd32..c87c904008 100644
--- a/examples/tensorflow/text-classification/run_text_classification.py
+++ b/examples/tensorflow/text-classification/run_text_classification.py
@@ -272,7 +272,7 @@ def main():
     data_files = {"train": data_args.train_file, "validation": data_args.validation_file, "test": data_args.test_file}
     data_files = {key: file for key, file in data_files.items() if file is not None}
 
-    for key in data_files.keys():
+    for key in data_files:
         logger.info(f"Loading a local file for {key}: {data_files[key]}")
 
     if data_args.input_file_extension == "csv":
diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index acfd450e87..70f9979a8f 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -678,7 +678,7 @@ class PretrainedConfig(PushToHubMixin):
         from_auto_class = kwargs.pop("_from_auto", False)
         commit_hash = kwargs.pop("_commit_hash", None)
 
-        gguf_file = kwargs.get("gguf_file", None)
+        gguf_file = kwargs.get("gguf_file")
 
         if trust_remote_code is True:
             logger.warning(
@@ -1033,7 +1033,7 @@ class PretrainedConfig(PushToHubMixin):
         converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into *"float32"*
         string, which can then be stored in the json format.
         """
-        if d.get("torch_dtype", None) is not None:
+        if d.get("torch_dtype") is not None:
             if isinstance(d["torch_dtype"], dict):
                 d["torch_dtype"] = {k: str(v).split(".")[-1] for k, v in d["torch_dtype"].items()}
             elif not isinstance(d["torch_dtype"], str):
diff --git a/src/transformers/convert_pytorch_checkpoint_to_tf2.py b/src/transformers/convert_pytorch_checkpoint_to_tf2.py
index fad6463e98..24e1f85064 100755
--- a/src/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/src/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -267,7 +267,7 @@ def convert_pt_checkpoint_to_tf(
     tf_model = model_class(config)
 
     # Load weights from tf checkpoint
-    if pytorch_checkpoint_path in aws_config_map.keys():
+    if pytorch_checkpoint_path in aws_config_map:
         pytorch_checkpoint_path = cached_file(
             pytorch_checkpoint_path, WEIGHTS_NAME, force_download=not use_cached_models
         )
diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py
index fcefc9270e..18f2980a38 100644
--- a/src/transformers/data/data_collator.py
+++ b/src/transformers/data/data_collator.py
@@ -326,8 +326,8 @@ class DataCollatorForTokenClassification(DataCollatorMixin):
     def torch_call(self, features):
         import torch
 
-        label_name = "label" if "label" in features[0].keys() else "labels"
-        labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
+        label_name = "label" if "label" in features[0] else "labels"
+        labels = [feature[label_name] for feature in features] if label_name in features[0] else None
 
         no_labels_features = [{k: v for k, v in feature.items() if k != label_name} for feature in features]
 
@@ -366,8 +366,8 @@ class DataCollatorForTokenClassification(DataCollatorMixin):
     def tf_call(self, features):
         import tensorflow as tf
 
-        label_name = "label" if "label" in features[0].keys() else "labels"
-        labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
+        label_name = "label" if "label" in features[0] else "labels"
+        labels = [feature[label_name] for feature in features] if label_name in features[0] else None
         batch = pad_without_fast_tokenizer_warning(
             self.tokenizer,
             features,
@@ -396,8 +396,8 @@ class DataCollatorForTokenClassification(DataCollatorMixin):
         return batch
 
     def numpy_call(self, features):
-        label_name = "label" if "label" in features[0].keys() else "labels"
-        labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
+        label_name = "label" if "label" in features[0] else "labels"
+        labels = [feature[label_name] for feature in features] if label_name in features[0] else None
         batch = pad_without_fast_tokenizer_warning(
             self.tokenizer,
             features,
@@ -573,7 +573,7 @@ class DataCollatorForMultipleChoice(DataCollatorMixin):
         import torch
 
         # Take labels out of the examples beforehand, because they aren't nested.
-        label_name = "label" if "label" in examples[0].keys() else "labels"
+        label_name = "label" if "label" in examples[0] else "labels"
         labels = [example.pop(label_name) for example in examples]
 
         batch_size = len(examples)
@@ -602,7 +602,7 @@ class DataCollatorForMultipleChoice(DataCollatorMixin):
     def tf_call(self, features):  # Implementation taken from the docs.
         import tensorflow as tf
 
-        label_name = "label" if "label" in features[0].keys() else "labels"
+        label_name = "label" if "label" in features[0] else "labels"
         labels = [feature.pop(label_name) for feature in features]
         batch_size = len(features)
         num_choices = len(features[0]["input_ids"])
@@ -671,8 +671,8 @@ class DataCollatorForSeq2Seq:
         if return_tensors is None:
             return_tensors = self.return_tensors
 
-        label_name = "label" if "label" in features[0].keys() else "labels"
-        labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
+        label_name = "label" if "label" in features[0] else "labels"
+        labels = [feature[label_name] for feature in features] if label_name in features[0] else None
         # reconvert list[None] to None if necessary
         # this might occur when we pass {..., "labels": None}
         if labels is not None and all(label is None for label in labels):
diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py
index 186f3e3111..e0be17bd7d 100644
--- a/src/transformers/feature_extraction_sequence_utils.py
+++ b/src/transformers/feature_extraction_sequence_utils.py
@@ -124,7 +124,7 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
         # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
         if isinstance(processed_features, (list, tuple)) and isinstance(processed_features[0], (dict, BatchFeature)):
             processed_features = {
-                key: [example[key] for example in processed_features] for key in processed_features[0].keys()
+                key: [example[key] for example in processed_features] for key in processed_features[0]
             }
 
         # The model's main input name, usually `input_values`, has be passed for padding
diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py
index d9d6ca0e21..59e1a40191 100644
--- a/src/transformers/feature_extraction_utils.py
+++ b/src/transformers/feature_extraction_utils.py
@@ -405,7 +405,7 @@ class FeatureExtractionMixin(PushToHubMixin):
                 "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                 FutureWarning,
             )
-            if kwargs.get("token", None) is not None:
+            if kwargs.get("token") is not None:
                 raise ValueError(
                     "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                 )
diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py
index b62d6e7ed1..38a15f3dae 100644
--- a/src/transformers/generation/candidate_generator.py
+++ b/src/transformers/generation/candidate_generator.py
@@ -132,7 +132,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
                 )
 
         # Remove potential default "logits_to_keep" key
-        if "logits_to_keep" in assistant_kwargs.keys() and not assistant_model._supports_logits_to_keep():
+        if "logits_to_keep" in assistant_kwargs and not assistant_model._supports_logits_to_keep():
             del assistant_kwargs["logits_to_keep"]
 
         # If the assistant is an encoder-decoder model, assume the encoder is different on the assistant.
diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py
index 697046b3ed..77ce12d969 100644
--- a/src/transformers/generation/configuration_utils.py
+++ b/src/transformers/generation/configuration_utils.py
@@ -614,10 +614,7 @@ class GenerationConfig(PushToHubMixin):
             )
         # 1.4. Watermarking attributes
         if self.watermarking_config is not None:
-            if not (
-                isinstance(self.watermarking_config, WatermarkingConfig)
-                or isinstance(self.watermarking_config, SynthIDTextWatermarkingConfig)
-            ):
+            if not (isinstance(self.watermarking_config, (WatermarkingConfig, SynthIDTextWatermarkingConfig))):
                 minor_issues["watermarking_config"] = (
                     "`watermarking_config` as a dict is deprecated and will be removed in v4.54.0. Please construct "
                     "`watermarking_config` object with `WatermarkingConfig` or `SynthIDTextWatermarkingConfig` class."
@@ -847,7 +844,7 @@ class GenerationConfig(PushToHubMixin):
                 "Please use `token` instead.",
                 FutureWarning,
             )
-            if kwargs.get("token", None) is not None:
+            if kwargs.get("token") is not None:
                 raise ValueError(
                     "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                 )
@@ -1110,7 +1107,7 @@ class GenerationConfig(PushToHubMixin):
         converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into *"float32"*
         string, which can then be stored in the json format.
         """
-        if d.get("torch_dtype", None) is not None and not isinstance(d["torch_dtype"], str):
+        if d.get("torch_dtype") is not None and not isinstance(d["torch_dtype"], str):
             d["torch_dtype"] = str(d["torch_dtype"]).split(".")[1]
         for value in d.values():
             if isinstance(value, dict):
@@ -1247,7 +1244,7 @@ class GenerationConfig(PushToHubMixin):
         if decoder_config is not model_config:
             default_generation_config = GenerationConfig()
             decoder_config_dict = decoder_config.to_dict()
-            for attr in generation_config.to_dict().keys():
+            for attr in generation_config.to_dict():
                 is_unset = getattr(generation_config, attr) == getattr(default_generation_config, attr)
                 if attr in decoder_config_dict and is_unset:
                     setattr(generation_config, attr, decoder_config_dict[attr])
diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py
index 4948ad8452..c7b6c7a39d 100644
--- a/src/transformers/generation/logits_process.py
+++ b/src/transformers/generation/logits_process.py
@@ -1241,13 +1241,13 @@ class SequenceBiasLogitsProcessor(LogitsProcessor):
                 f"`sequence_bias` has to be a non-empty dictionary, or non-empty list of lists but is {sequence_bias}."
             )
         if isinstance(sequence_bias, dict) and any(
-            not isinstance(sequence_ids, tuple) for sequence_ids in sequence_bias.keys()
+            not isinstance(sequence_ids, tuple) for sequence_ids in sequence_bias
         ):
             raise ValueError(f"`sequence_bias` has to be a dict with tuples as keys, but is {sequence_bias}.")
         if isinstance(sequence_bias, dict) and any(
             any((not isinstance(token_id, (int, np.integer)) or token_id < 0) for token_id in sequence_ids)
             or len(sequence_ids) == 0
-            for sequence_ids in sequence_bias.keys()
+            for sequence_ids in sequence_bias
         ):
             raise ValueError(
                 f"Each key in `sequence_bias` has to be a non-empty tuple of positive integers, but is "
diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index 509adaa8e2..acd53a20b7 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -1867,7 +1867,7 @@ class GenerationMixin(ContinuousMixin):
                 )
 
             decoder_mapped_modules = [
-                module_name for module_name in execution_device_map.keys() if decoder_name in module_name
+                module_name for module_name in execution_device_map if decoder_name in module_name
             ]
             # The decoder name may be present in `execution_device_map` in two forms:
             # a) each layer has a device mapping
@@ -5275,7 +5275,7 @@ def stack_model_outputs(model_outputs: list[ModelOutput], config: PretrainedConf
     # Use a dictionary comprehension to gather attributes from all objects and concatenate them
     concatenated_data = {
         k: _concat([getattr(model_output, k) for model_output in model_outputs])
-        for k in model_output_cls.__dataclass_fields__.keys()
+        for k in model_output_cls.__dataclass_fields__
     }
 
     # Return a new object of the inferred class with the concatenated attributes
diff --git a/src/transformers/generation/watermarking.py b/src/transformers/generation/watermarking.py
index 9ce2539851..0b0743f99e 100644
--- a/src/transformers/generation/watermarking.py
+++ b/src/transformers/generation/watermarking.py
@@ -161,7 +161,7 @@ class WatermarkDetector:
         for batch_idx in range(ngram_tensors.shape[0]):
             frequencies_table = collections.Counter(ngram_tensors[batch_idx])
             ngram_to_watermark_lookup = {}
-            for ngram_example in frequencies_table.keys():
+            for ngram_example in frequencies_table:
                 prefix = ngram_example if selfhash else ngram_example[:-1]
                 target = ngram_example[-1]
                 ngram_to_watermark_lookup[ngram_example] = self._get_ngram_score_cached(prefix, target)
diff --git a/src/transformers/image_processing_base.py b/src/transformers/image_processing_base.py
index 9b40b0da7b..4d708efb7c 100644
--- a/src/transformers/image_processing_base.py
+++ b/src/transformers/image_processing_base.py
@@ -229,7 +229,7 @@ class ImageProcessingMixin(PushToHubMixin):
                 "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                 FutureWarning,
             )
-            if kwargs.get("token", None) is not None:
+            if kwargs.get("token") is not None:
                 raise ValueError(
                     "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                 )
diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py
index 55bd84cf27..c575770a65 100644
--- a/src/transformers/image_processing_utils_fast.py
+++ b/src/transformers/image_processing_utils_fast.py
@@ -225,7 +225,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
         )
         crop_size = kwargs.pop("crop_size", self.crop_size)
         self.crop_size = get_size_dict(crop_size, param_name="crop_size") if crop_size is not None else None
-        for key in self.valid_kwargs.__annotations__.keys():
+        for key in self.valid_kwargs.__annotations__:
             kwarg = kwargs.pop(key, None)
             if kwarg is not None:
                 setattr(self, key, kwarg)
diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py
index deb988171a..a0bcff7a18 100644
--- a/src/transformers/image_transforms.py
+++ b/src/transformers/image_transforms.py
@@ -860,14 +860,14 @@ def _group_images_by_shape(nested_images, is_nested: bool = False):
 def _reconstruct_nested_structure(indices, processed_images):
     """Helper function to reconstruct a single level nested structure."""
     # Find the maximum outer index
-    max_outer_idx = max(idx[0] for idx in indices.keys())
+    max_outer_idx = max(idx[0] for idx in indices)
 
     # Create the outer list
     result = [None] * (max_outer_idx + 1)
 
     # Group indices by outer index
     nested_indices = defaultdict(list)
-    for i, j in indices.keys():
+    for i, j in indices:
         nested_indices[i].append(j)
 
     for i in range(max_outer_idx + 1):
diff --git a/src/transformers/integrations/accelerate.py b/src/transformers/integrations/accelerate.py
index a404af54b2..9464a4a675 100644
--- a/src/transformers/integrations/accelerate.py
+++ b/src/transformers/integrations/accelerate.py
@@ -130,7 +130,7 @@ def init_on_device(device: "torch.device", include_buffers: bool = False):
         nn.Module.register_parameter = register_empty_parameter
         if include_buffers:
             nn.Module.register_buffer = register_empty_buffer
-        for torch_function_name in tensor_constructors_to_patch.keys():
+        for torch_function_name in tensor_constructors_to_patch:
             setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name)))
         yield
     finally:
diff --git a/src/transformers/integrations/bitsandbytes.py b/src/transformers/integrations/bitsandbytes.py
index 0508477cd9..1dd32b4268 100644
--- a/src/transformers/integrations/bitsandbytes.py
+++ b/src/transformers/integrations/bitsandbytes.py
@@ -164,7 +164,7 @@ def _replace_with_bnb_linear(
             current_key_name = []
         current_key_name.append(name)
 
-        if (isinstance(module, nn.Linear) or isinstance(module, Conv1D)) and name not in modules_to_not_convert:
+        if (isinstance(module, (nn.Linear, Conv1D))) and name not in modules_to_not_convert:
             # Check if the current key is not in the `modules_to_not_convert`
             current_key_name_str = ".".join(current_key_name)
             if not any(
@@ -382,7 +382,7 @@ def _create_accelerate_new_hook(old_hook):
     old_hook_attr = old_hook.__dict__
     filtered_old_hook_attr = {}
     old_hook_init_signature = inspect.signature(old_hook_cls.__init__)
-    for k in old_hook_attr.keys():
+    for k in old_hook_attr:
         if k in old_hook_init_signature.parameters:
             filtered_old_hook_attr[k] = old_hook_attr[k]
     new_hook = old_hook_cls(**filtered_old_hook_attr)
diff --git a/src/transformers/integrations/deepspeed.py b/src/transformers/integrations/deepspeed.py
index dabc9a6ef9..24e8546a50 100644
--- a/src/transformers/integrations/deepspeed.py
+++ b/src/transformers/integrations/deepspeed.py
@@ -333,7 +333,7 @@ def _load_state_dict_into_zero3_model(model_to_load, state_dict):
             # In sharded models, each shard has only part of the full state_dict, so only gather
             # parameters that are in the current state_dict.
             named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
-            params_to_gather = [named_parameters[k] for k in state_dict.keys() if k in named_parameters]
+            params_to_gather = [named_parameters[k] for k in state_dict if k in named_parameters]
             if len(params_to_gather) > 0:
                 # because zero3 puts placeholders in model params, this context
                 # manager gathers (unpartitions) the params of the current layer, then loads from
diff --git a/src/transformers/integrations/flash_attention.py b/src/transformers/integrations/flash_attention.py
index 43c65b46c8..9af6eba11f 100644
--- a/src/transformers/integrations/flash_attention.py
+++ b/src/transformers/integrations/flash_attention.py
@@ -23,7 +23,7 @@ def flash_attention_forward(
     softcap: Optional[float] = None,
     **kwargs,
 ) -> tuple[torch.Tensor, None]:
-    if kwargs.get("output_attentions", False) or kwargs.get("head_mask", None) is not None:
+    if kwargs.get("output_attentions", False) or kwargs.get("head_mask") is not None:
         logger.warning_once(
             "`flash_attention_2` does not support `output_attentions=True` or `head_mask`."
             " Please set your attention to `eager` if you want any of these features."
diff --git a/src/transformers/integrations/flex_attention.py b/src/transformers/integrations/flex_attention.py
index 9abff30e39..e310ff8ac5 100644
--- a/src/transformers/integrations/flex_attention.py
+++ b/src/transformers/integrations/flex_attention.py
@@ -282,7 +282,7 @@ def flex_attention_forward(
         value = repeat_kv(value, query.shape[1] // value.shape[1])
         enable_gqa = False
 
-    kernel_options = kwargs.get("kernel_options", None)
+    kernel_options = kwargs.get("kernel_options")
     attn_output, attention_weights = compile_friendly_flex_attention(
         query,
         key,
diff --git a/src/transformers/integrations/hqq.py b/src/transformers/integrations/hqq.py
index fb0f604bfe..083ec53a2f 100755
--- a/src/transformers/integrations/hqq.py
+++ b/src/transformers/integrations/hqq.py
@@ -104,7 +104,7 @@ def prepare_for_hqq_linear(model, quantization_config=None, modules_to_not_conve
     quant_config = quantization_config.quant_config
     linear_tags = list(set(linear_tags) - set(skip_modules) - set(modules_to_not_convert))
 
-    if any(key in linear_tags for key in quant_config.keys()):
+    if any(key in linear_tags for key in quant_config):
         # If the user doesn't specify a key from get_linear_tags, the layer is not quantized via (key, None)
         patch_params = dict.fromkeys(linear_tags)
         patch_params.update(quant_config)
diff --git a/src/transformers/integrations/sdpa_attention.py b/src/transformers/integrations/sdpa_attention.py
index bd56e462e3..f46c6f0f70 100644
--- a/src/transformers/integrations/sdpa_attention.py
+++ b/src/transformers/integrations/sdpa_attention.py
@@ -51,7 +51,7 @@ def sdpa_attention_forward(
     is_causal: Optional[bool] = None,
     **kwargs,
 ) -> tuple[torch.Tensor, None]:
-    if kwargs.get("output_attentions", False) or kwargs.get("head_mask", None) is not None:
+    if kwargs.get("output_attentions", False) or kwargs.get("head_mask") is not None:
         logger.warning_once(
             "`sdpa` attention does not support `output_attentions=True` or `head_mask`."
             " Please set your attention to `eager` if you want any of these features."
diff --git a/src/transformers/keras_callbacks.py b/src/transformers/keras_callbacks.py
index 77c8fe428c..b6669a7b45 100644
--- a/src/transformers/keras_callbacks.py
+++ b/src/transformers/keras_callbacks.py
@@ -162,7 +162,7 @@ class KerasMetricCallback(keras.callbacks.Callback):
     def _postprocess_predictions_or_labels(self, inputs):
         if isinstance(inputs[0], dict):
             outputs = {}
-            for key in inputs[0].keys():
+            for key in inputs[0]:
                 outputs[key] = self._concatenate_batches([batch[key] for batch in inputs])
             # If it's a dict with only one key, just return the array
             if len(outputs) == 1:
@@ -242,7 +242,7 @@ class KerasMetricCallback(keras.callbacks.Callback):
                 labels = {key: batch[key].numpy() for key in self.label_cols}
             elif isinstance(labels, dict):
                 labels = {key: array.numpy() for key, array in labels.items()}
-            elif isinstance(labels, list) or isinstance(labels, tuple):
+            elif isinstance(labels, (list, tuple)):
                 labels = [array.numpy() for array in labels]
             elif isinstance(labels, tf.Tensor):
                 labels = labels.numpy()
diff --git a/src/transformers/loss/loss_deformable_detr.py b/src/transformers/loss/loss_deformable_detr.py
index 62080bcb3f..37a93a319d 100644
--- a/src/transformers/loss/loss_deformable_detr.py
+++ b/src/transformers/loss/loss_deformable_detr.py
@@ -136,7 +136,7 @@ def DeformableDetrForSegmentationLoss(
             aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
         weight_dict.update(aux_weight_dict)
 
-    loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
+    loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict if k in weight_dict)
     return loss, loss_dict, auxiliary_outputs
 
 
@@ -174,5 +174,5 @@ def DeformableDetrForObjectDetectionLoss(
         for i in range(config.decoder_layers - 1):
             aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
         weight_dict.update(aux_weight_dict)
-    loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
+    loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict if k in weight_dict)
     return loss, loss_dict, auxiliary_outputs
diff --git a/src/transformers/loss/loss_for_object_detection.py b/src/transformers/loss/loss_for_object_detection.py
index 20b16f58bd..a6b2f9646e 100644
--- a/src/transformers/loss/loss_for_object_detection.py
+++ b/src/transformers/loss/loss_for_object_detection.py
@@ -522,7 +522,7 @@ def ForSegmentationLoss(
         for i in range(config.decoder_layers - 1):
             aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
         weight_dict.update(aux_weight_dict)
-    loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
+    loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict if k in weight_dict)
     return loss, loss_dict, auxiliary_outputs
 
 
@@ -558,5 +558,5 @@ def ForObjectDetectionLoss(
         for i in range(config.decoder_layers - 1):
             aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
         weight_dict.update(aux_weight_dict)
-    loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
+    loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict if k in weight_dict)
     return loss, loss_dict, auxiliary_outputs
diff --git a/src/transformers/loss/loss_grounding_dino.py b/src/transformers/loss/loss_grounding_dino.py
index 17ea065e84..3e18fac866 100644
--- a/src/transformers/loss/loss_grounding_dino.py
+++ b/src/transformers/loss/loss_grounding_dino.py
@@ -267,5 +267,5 @@ def GroundingDinoForObjectDetectionLoss(
             aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
         weight_dict.update(aux_weight_dict)
 
-    loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
+    loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict if k in weight_dict)
     return loss, loss_dict, auxiliary_outputs
diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py
index 2005390c85..8c68d8b8af 100644
--- a/src/transformers/modelcard.py
+++ b/src/transformers/modelcard.py
@@ -322,7 +322,7 @@ def infer_metric_tags_from_eval_results(eval_results):
     if eval_results is None:
         return {}
     result = {}
-    for key in eval_results.keys():
+    for key in eval_results:
         if key.lower().replace(" ", "_") in METRIC_TAGS:
             result[key.lower().replace(" ", "_")] = key
         elif key.lower() == "rouge1":
@@ -839,7 +839,7 @@ def make_markdown_table(lines):
     """
     if lines is None or len(lines) == 0:
         return ""
-    col_widths = {key: len(str(key)) for key in lines[0].keys()}
+    col_widths = {key: len(str(key)) for key in lines[0]}
     for line in lines:
         for key, value in line.items():
             if col_widths[key] < len(_maybe_round(value)):
diff --git a/src/transformers/modeling_flax_pytorch_utils.py b/src/transformers/modeling_flax_pytorch_utils.py
index 7d9f189799..07d83d5e4a 100644
--- a/src/transformers/modeling_flax_pytorch_utils.py
+++ b/src/transformers/modeling_flax_pytorch_utils.py
@@ -179,10 +179,10 @@ def convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model):
     flax_state_dict = {}
 
     load_model_with_head_into_base_model = (model_prefix not in flax_model_params) and (
-        model_prefix in {k.split(".")[0] for k in pt_state_dict.keys()}
+        model_prefix in {k.split(".")[0] for k in pt_state_dict}
     )
     load_base_model_into_model_with_head = (model_prefix in flax_model_params) and (
-        model_prefix not in {k.split(".")[0] for k in pt_state_dict.keys()}
+        model_prefix not in {k.split(".")[0] for k in pt_state_dict}
     )
 
     # Need to change some parameters name to match Flax names
@@ -267,10 +267,10 @@ def convert_pytorch_sharded_state_dict_to_flax(shard_filenames, flax_model):
             random_flax_state_dict = flatten_dict(flax_model_params)
 
         load_model_with_head_into_base_model = (model_prefix not in flax_model_params) and (
-            model_prefix in {k.split(".")[0] for k in pt_state_dict.keys()}
+            model_prefix in {k.split(".")[0] for k in pt_state_dict}
         )
         load_base_model_into_model_with_head = (model_prefix in flax_model_params) and (
-            model_prefix not in {k.split(".")[0] for k in pt_state_dict.keys()}
+            model_prefix not in {k.split(".")[0] for k in pt_state_dict}
         )
         # Need to change some parameters name to match Flax names
         for pt_key, pt_tensor in pt_state_dict.items():
@@ -381,10 +381,10 @@ def load_flax_weights_in_pytorch_model(pt_model, flax_state):
     pt_model_dict = pt_model.state_dict()
 
     load_model_with_head_into_base_model = (pt_model.base_model_prefix in flax_state) and (
-        pt_model.base_model_prefix not in {k.split(".")[0] for k in pt_model_dict.keys()}
+        pt_model.base_model_prefix not in {k.split(".")[0] for k in pt_model_dict}
     )
     load_base_model_into_model_with_head = (pt_model.base_model_prefix not in flax_state) and (
-        pt_model.base_model_prefix in {k.split(".")[0] for k in pt_model_dict.keys()}
+        pt_model.base_model_prefix in {k.split(".")[0] for k in pt_model_dict}
     )
 
     # keep track of unexpected & missing keys
diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py
index 3f94a3c6ce..bc9a4d473f 100644
--- a/src/transformers/modeling_flax_utils.py
+++ b/src/transformers/modeling_flax_utils.py
@@ -138,7 +138,7 @@ def flax_shard_checkpoint(params, max_shard_size="10GB"):
     for idx, shard in enumerate(sharded_state_dicts):
         shard_file = FLAX_WEIGHTS_NAME.replace(".msgpack", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.msgpack")
         shards[shard_file] = shard
-        for weight_name in shard.keys():
+        for weight_name in shard:
             weight_map[weight_name] = shard_file
 
     # Add the metadata
@@ -963,7 +963,7 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
         # Mismatched keys contains tuples key/shape1/shape2 of weights in the checkpoint that have a shape not
         # matching the weights in the model.
         mismatched_keys = []
-        for key in state.keys():
+        for key in state:
             if key in random_state and state[key].shape != random_state[key].shape:
                 if ignore_mismatched_sizes:
                     mismatched_keys.append((key, state[key].shape, random_state[key].shape))
@@ -1169,11 +1169,7 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
         for filename in os.listdir(save_directory):
             full_filename = os.path.join(save_directory, filename)
             weights_no_suffix = weights_name.replace(".bin", "").replace(".safetensors", "")
-            if (
-                filename.startswith(weights_no_suffix)
-                and os.path.isfile(full_filename)
-                and filename not in shards.keys()
-            ):
+            if filename.startswith(weights_no_suffix) and os.path.isfile(full_filename) and filename not in shards:
                 os.remove(full_filename)
 
         if index is None:
diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py
index f736e37d4e..feb6c6c391 100644
--- a/src/transformers/modeling_gguf_pytorch_utils.py
+++ b/src/transformers/modeling_gguf_pytorch_utils.py
@@ -315,7 +315,7 @@ def get_gguf_hf_weights_map(
     # hf => gguf and gguf => hf mappings are reversed
     gguf_to_hf_name_map = {}
     state_dict = hf_model.state_dict()
-    for hf_name in state_dict.keys():
+    for hf_name in state_dict:
         # An exception for qwen2moe model, where the expert layers are packed
         if model_type == "qwen2moe" and "mlp.experts." in hf_name:
             hf_name = re.sub(r"mlp.experts.\d+.", "mlp.experts.", hf_name)
diff --git a/src/transformers/modeling_tf_pytorch_utils.py b/src/transformers/modeling_tf_pytorch_utils.py
index 5b9f38e1bc..8f688af7be 100644
--- a/src/transformers/modeling_tf_pytorch_utils.py
+++ b/src/transformers/modeling_tf_pytorch_utils.py
@@ -330,7 +330,7 @@ def load_pytorch_state_dict_in_tf2_model(
             tf_model(tf_inputs, training=False)  # Make sure model is built
     # Convert old format to new format if needed from a PyTorch state_dict
     tf_keys_to_pt_keys = {}
-    for key in pt_state_dict.keys():
+    for key in pt_state_dict:
         new_key = None
         if "gamma" in key:
             new_key = key.replace("gamma", "weight")
@@ -361,7 +361,7 @@ def load_pytorch_state_dict_in_tf2_model(
     # and there is no MainLayer class. This means that TF base classes have one
     # extra layer in their weight names, corresponding to the MainLayer class. This code block compensates for that.
     start_prefix_to_remove = ""
-    if not any(s.startswith(tf_model.base_model_prefix) for s in tf_keys_to_pt_keys.keys()):
+    if not any(s.startswith(tf_model.base_model_prefix) for s in tf_keys_to_pt_keys):
         start_prefix_to_remove = tf_model.base_model_prefix + "."
 
     symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights
@@ -573,7 +573,7 @@ def load_tf2_state_dict_in_pytorch_model(pt_model, tf_state_dict, allow_missing_
     # Make sure we are able to load PyTorch base models as well as derived models (with heads)
     # TF models always have a prefix, some of PyTorch models (base ones) don't
     start_prefix_to_remove = ""
-    if not any(s.startswith(pt_model.base_model_prefix) for s in current_pt_params_dict.keys()):
+    if not any(s.startswith(pt_model.base_model_prefix) for s in current_pt_params_dict):
         start_prefix_to_remove = pt_model.base_model_prefix + "."
 
     # Build a map from potential PyTorch weight names to TF 2.0 Variables
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 3e2564bc7a..c7bb80656d 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -991,13 +991,13 @@ def load_tf_weights_from_h5(model, resolved_archive_file, ignore_mismatched_size
                     # here we check if the current weight is among the weights from the H5 file
                     # If yes, get the weight_value of the corresponding weight from the H5 file
                     # If not, make the value to None
-                    saved_weight_value = saved_weights.get(symbolic_weight_name, None)
+                    saved_weight_value = saved_weights.get(symbolic_weight_name)
 
                     # Retrocompatibility patch: some embeddings are stored with the weights name (e.g. Bart's
                     # `model.shared/embeddings:0` are stored as `model.shared/weights:0`)
                     if saved_weight_value is None and symbolic_weight_name.endswith("embeddings:0"):
                         symbolic_weight_name = symbolic_weight_name[:-12] + "weight:0"
-                        saved_weight_value = saved_weights.get(symbolic_weight_name, None)
+                        saved_weight_value = saved_weights.get(symbolic_weight_name)
 
                     # Add the updated name to the final list for computing missing/unexpected values
                     symbolic_weights_names.add(symbolic_weight_name)
@@ -1637,7 +1637,7 @@ class TFPreTrainedModel(keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushT
                 for key, val in y.items():
                     if key in arg_names and key not in x:
                         x[key] = val
-                    elif output_to_label.get(key, None) in arg_names and key not in x:
+                    elif output_to_label.get(key) in arg_names and key not in x:
                         x[output_to_label[key]] = val
         if y is None:
             y = {key: val for key, val in x.items() if key in label_kwargs}
@@ -1662,7 +1662,7 @@ class TFPreTrainedModel(keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushT
             # This next block matches outputs to label keys. Tensorflow's standard method for doing this
             # can get very confused if any of the keys contain nested values (e.g. lists/tuples of Tensors)
             if isinstance(y, dict) and len(y) == 1:
-                if list(y.keys())[0] in y_pred.keys():
+                if list(y.keys())[0] in y_pred:
                     y_pred = y_pred[list(y.keys())[0]]
                 elif list(y_pred.keys())[0] == "loss":
                     y_pred = y_pred[1]
@@ -1672,7 +1672,7 @@ class TFPreTrainedModel(keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushT
             elif isinstance(y, dict):
                 # If the labels are a dict, match keys from the output by name
                 y_pred = {key: val for key, val in y_pred.items() if key in y}
-            elif isinstance(y, tuple) or isinstance(y, list):
+            elif isinstance(y, (tuple, list)):
                 # If the labels are a tuple/list, match keys to the output by order, skipping the loss.
                 if list(y_pred.keys())[0] == "loss":
                     y_pred = y_pred.to_tuple()[1:]
@@ -1745,7 +1745,7 @@ class TFPreTrainedModel(keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushT
                 for key, val in y.items():
                     if key in arg_names and key not in x:
                         x[key] = val
-                    elif output_to_label.get(key, None) in arg_names and key not in x:
+                    elif output_to_label.get(key) in arg_names and key not in x:
                         x[output_to_label[key]] = val
         if y is None:
             y = {key: val for key, val in x.items() if key in label_kwargs}
@@ -1769,7 +1769,7 @@ class TFPreTrainedModel(keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushT
         # This next block matches outputs to label keys. Tensorflow's standard method for doing this
         # can get very confused if any of the keys contain nested values (e.g. lists/tuples of Tensors)
         if isinstance(y, dict) and len(y) == 1:
-            if list(y.keys())[0] in y_pred.keys():
+            if list(y.keys())[0] in y_pred:
                 y_pred = y_pred[list(y.keys())[0]]
             elif list(y_pred.keys())[0] == "loss":
                 y_pred = y_pred[1]
@@ -1779,7 +1779,7 @@ class TFPreTrainedModel(keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushT
         elif isinstance(y, dict):
             # If the labels are a dict, match keys from the output by name
             y_pred = {key: val for key, val in y_pred.items() if key in y}
-        elif isinstance(y, tuple) or isinstance(y, list):
+        elif isinstance(y, (tuple, list)):
             # If the labels are a tuple/list, match keys to the output by order, skipping the loss.
             if list(y_pred.keys())[0] == "loss":
                 y_pred = y_pred.to_tuple()[1:]
@@ -2464,11 +2464,7 @@ class TFPreTrainedModel(keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushT
             # If we have a shard file that is not going to be replaced, we delete it, but only from the main process
             # in distributed settings to avoid race conditions.
             weights_no_suffix = weights_name.replace(".bin", "").replace(".safetensors", "")
-            if (
-                filename.startswith(weights_no_suffix)
-                and os.path.isfile(full_filename)
-                and filename not in shards.keys()
-            ):
+            if filename.startswith(weights_no_suffix) and os.path.isfile(full_filename) and filename not in shards:
                 os.remove(full_filename)
 
         if index is None:
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index a97b446c96..5a7c6eb087 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1357,12 +1357,12 @@ def _get_torch_dtype(
             elif hasattr(torch, torch_dtype):
                 torch_dtype = getattr(torch, torch_dtype)
                 config.torch_dtype = torch_dtype
-                for sub_config_key in config.sub_configs.keys():
+                for sub_config_key in config.sub_configs:
                     sub_config = getattr(config, sub_config_key)
                     sub_config.torch_dtype = torch_dtype
         elif isinstance(torch_dtype, torch.dtype):
             config.torch_dtype = torch_dtype
-            for sub_config_key in config.sub_configs.keys():
+            for sub_config_key in config.sub_configs:
                 sub_config = getattr(config, sub_config_key)
                 sub_config.torch_dtype = torch_dtype
         elif isinstance(torch_dtype, dict):
@@ -1388,7 +1388,7 @@ def _get_torch_dtype(
         # set fp32 as the default dtype for BC
         default_dtype = torch.get_default_dtype()
         config.torch_dtype = default_dtype
-        for key in config.sub_configs.keys():
+        for key in config.sub_configs:
             value = getattr(config, key)
             value.torch_dtype = default_dtype
 
@@ -1446,7 +1446,7 @@ def _get_device_map(
 
         # `inferred_max_memory` contains non-reserved memory. There may be *unused* reserved memory in the GPU,
         # which we can use to allocate parameters.
-        for device_name in inferred_max_memory.keys():
+        for device_name in inferred_max_memory:
             if isinstance(device_name, int):  # it's a GPU device
                 if is_torch_xpu_available():
                     unused_memory = torch.xpu.memory_reserved(device_name) - torch.xpu.memory_allocated(device_name)
@@ -3002,9 +3002,9 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
                     f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}"
                 )
 
-                all_encoder_weights = {module_name + "/" + sub_name for sub_name in encoder_modules.keys()}
+                all_encoder_weights = {module_name + "/" + sub_name for sub_name in encoder_modules}
                 encoder_layer_pos = 0
-                for name in decoder_modules.keys():
+                for name in decoder_modules:
                     if name.isdigit():
                         encoder_name = str(int(name) + encoder_layer_pos)
                         decoder_name = name
@@ -3942,7 +3942,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
         # Handle the case where some state_dict keys shouldn't be saved
         if self._keys_to_ignore_on_save is not None:
             for ignore_key in self._keys_to_ignore_on_save:
-                if ignore_key in state_dict.keys():
+                if ignore_key in state_dict:
                     del state_dict[ignore_key]
 
         # Rename state_dict keys before saving to file. Do nothing unless overridden in a particular model.
@@ -4057,7 +4057,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
             if (
                 filename.startswith(weights_no_suffix)
                 and os.path.isfile(full_filename)
-                and filename not in state_dict_split.filename_to_tensors.keys()
+                and filename not in state_dict_split.filename_to_tensors
                 and is_main_process
                 and reg.fullmatch(filename_no_suffix) is not None
             ):
@@ -5334,7 +5334,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
             if device_map is not None:
                 device_map = {k[len(_prefix) :] if k.startswith(_prefix) else k: v for k, v in device_map.items()}
             # small sanity check: the base model should not contain task-specific head keys
-            task_specific_expected_keys = [s for s in model.state_dict().keys() if not s.startswith(_prefix)]
+            task_specific_expected_keys = [s for s in model.state_dict() if not s.startswith(_prefix)]
             base_model_expected_keys = list(model_to_load.state_dict().keys())
             if any(
                 key in task_specific_expected_keys and key not in base_model_expected_keys for key in checkpoint_keys
diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py
index 9144cc6bdd..9e6125720d 100644
--- a/src/transformers/models/aria/modeling_aria.py
+++ b/src/transformers/models/aria/modeling_aria.py
@@ -191,7 +191,7 @@ class AriaProjector(nn.Module):
         """
         batch_size, num_patches = key_value_states.shape[0], key_value_states.shape[1]
 
-        if num_patches not in self.patch_to_query_dict.keys():
+        if num_patches not in self.patch_to_query_dict:
             raise KeyError(
                 f"Number of patches {num_patches} not found in patch_to_query_dict amongst possible values {self.patch_to_query_dict.keys()}."
             )
diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py
index a531bc43b3..c27916a0df 100644
--- a/src/transformers/models/aria/modular_aria.py
+++ b/src/transformers/models/aria/modular_aria.py
@@ -438,7 +438,7 @@ class AriaProjector(nn.Module):
         """
         batch_size, num_patches = key_value_states.shape[0], key_value_states.shape[1]
 
-        if num_patches not in self.patch_to_query_dict.keys():
+        if num_patches not in self.patch_to_query_dict:
             raise KeyError(
                 f"Number of patches {num_patches} not found in patch_to_query_dict amongst possible values {self.patch_to_query_dict.keys()}."
             )
diff --git a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py
index 119114033c..325e0f65b4 100644
--- a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py
+++ b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py
@@ -105,7 +105,7 @@ def rename_key(name):
 
 
 def convert_state_dict(orig_state_dict, config):
-    for key in orig_state_dict.copy().keys():
+    for key in orig_state_dict.copy():
         val = orig_state_dict.pop(key)
 
         if "qkv" in key:
diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py
index eeb959de18..8439affd6d 100644
--- a/src/transformers/models/auto/auto_factory.py
+++ b/src/transformers/models/auto/auto_factory.py
@@ -425,7 +425,7 @@ class _BaseAutoModelClass:
     def from_config(cls, config, **kwargs):
         trust_remote_code = kwargs.pop("trust_remote_code", None)
         has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map
-        has_local_code = type(config) in cls._model_mapping.keys()
+        has_local_code = type(config) in cls._model_mapping
         if has_remote_code:
             class_ref = config.auto_map[cls.__name__]
             if "--" in class_ref:
@@ -451,13 +451,13 @@ class _BaseAutoModelClass:
             _ = kwargs.pop("code_revision", None)
             model_class = add_generation_mixin_to_remote_model(model_class)
             return model_class._from_config(config, **kwargs)
-        elif type(config) in cls._model_mapping.keys():
+        elif type(config) in cls._model_mapping:
             model_class = _get_model_class(config, cls._model_mapping)
             return model_class._from_config(config, **kwargs)
 
         raise ValueError(
             f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
-            f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
+            f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping)}."
         )
 
     @classmethod
@@ -468,7 +468,7 @@ class _BaseAutoModelClass:
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike[str]], *model_args, **kwargs):
         config = kwargs.pop("config", None)
-        trust_remote_code = kwargs.get("trust_remote_code", None)
+        trust_remote_code = kwargs.get("trust_remote_code")
         kwargs["_from_auto"] = True
         hub_kwargs_names = [
             "cache_dir",
@@ -538,10 +538,10 @@ class _BaseAutoModelClass:
             kwargs_orig = copy.deepcopy(kwargs)
             # ensure not to pollute the config object with torch_dtype="auto" - since it's
             # meaningless in the context of the config object - torch.dtype values are acceptable
-            if kwargs.get("torch_dtype", None) == "auto":
+            if kwargs.get("torch_dtype") == "auto":
                 _ = kwargs.pop("torch_dtype")
             # to not overwrite the quantization_config if config has a quantization_config
-            if kwargs.get("quantization_config", None) is not None:
+            if kwargs.get("quantization_config") is not None:
                 _ = kwargs.pop("quantization_config")
 
             config, kwargs = AutoConfig.from_pretrained(
@@ -560,7 +560,7 @@ class _BaseAutoModelClass:
                 kwargs["quantization_config"] = kwargs_orig["quantization_config"]
 
         has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map
-        has_local_code = type(config) in cls._model_mapping.keys()
+        has_local_code = type(config) in cls._model_mapping
         upstream_repo = None
         if has_remote_code:
             class_ref = config.auto_map[cls.__name__]
@@ -593,7 +593,7 @@ class _BaseAutoModelClass:
             return model_class.from_pretrained(
                 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
             )
-        elif type(config) in cls._model_mapping.keys():
+        elif type(config) in cls._model_mapping:
             model_class = _get_model_class(config, cls._model_mapping)
             if model_class.config_class == config.sub_configs.get("text_config", None):
                 config = config.get_text_config()
@@ -602,7 +602,7 @@ class _BaseAutoModelClass:
             )
         raise ValueError(
             f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
-            f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
+            f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping)}."
         )
 
     @classmethod
@@ -636,7 +636,7 @@ class _BaseAutoBackboneClass(_BaseAutoModelClass):
 
         config = kwargs.pop("config", TimmBackboneConfig())
 
-        if kwargs.get("out_features", None) is not None:
+        if kwargs.get("out_features") is not None:
             raise ValueError("Cannot specify `out_features` for timm backbones")
 
         if kwargs.get("output_loading_info", False):
@@ -820,7 +820,7 @@ class _LazyAutoMapping(OrderedDict[type[PretrainedConfig], _LazyAutoMappingValue
         mapping_keys = [
             self._load_attr_from_module(key, name)
             for key, name in self._config_mapping.items()
-            if key in self._model_mapping.keys()
+            if key in self._model_mapping
         ]
         return mapping_keys + list(self._extra_content.keys())
 
@@ -837,7 +837,7 @@ class _LazyAutoMapping(OrderedDict[type[PretrainedConfig], _LazyAutoMappingValue
         mapping_values = [
             self._load_attr_from_module(key, name)
             for key, name in self._model_mapping.items()
-            if key in self._config_mapping.keys()
+            if key in self._config_mapping
         ]
         return mapping_values + list(self._extra_content.values())
 
@@ -847,8 +847,8 @@ class _LazyAutoMapping(OrderedDict[type[PretrainedConfig], _LazyAutoMappingValue
                 self._load_attr_from_module(key, self._config_mapping[key]),
                 self._load_attr_from_module(key, self._model_mapping[key]),
             )
-            for key in self._model_mapping.keys()
-            if key in self._config_mapping.keys()
+            for key in self._model_mapping
+            if key in self._config_mapping
         ]
         return mapping_items + list(self._extra_content.items())
 
@@ -869,7 +869,7 @@ class _LazyAutoMapping(OrderedDict[type[PretrainedConfig], _LazyAutoMappingValue
         """
         if hasattr(key, "__name__") and key.__name__ in self._reverse_config_mapping:
             model_type = self._reverse_config_mapping[key.__name__]
-            if model_type in self._model_mapping.keys() and not exist_ok:
+            if model_type in self._model_mapping and not exist_ok:
                 raise ValueError(f"'{key}' is already used by a Transformers model.")
 
         self._extra_content[key] = value
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 12f8f4f4c5..99ba934986 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -974,10 +974,10 @@ class _LazyConfigMapping(OrderedDict[str, type[PretrainedConfig]]):
         return list(self._mapping.keys()) + list(self._extra_content.keys())
 
     def values(self) -> list[type[PretrainedConfig]]:
-        return [self[k] for k in self._mapping.keys()] + list(self._extra_content.values())
+        return [self[k] for k in self._mapping] + list(self._extra_content.values())
 
     def items(self) -> list[tuple[str, type[PretrainedConfig]]]:
-        return [(k, self[k]) for k in self._mapping.keys()] + list(self._extra_content.items())
+        return [(k, self[k]) for k in self._mapping] + list(self._extra_content.items())
 
     def __iter__(self) -> Iterator[str]:
         return iter(list(self._mapping.keys()) + list(self._extra_content.keys()))
@@ -989,7 +989,7 @@ class _LazyConfigMapping(OrderedDict[str, type[PretrainedConfig]]):
         """
         Register a new configuration in this mapping.
         """
-        if key in self._mapping.keys() and not exist_ok:
+        if key in self._mapping and not exist_ok:
             raise ValueError(f"'{key}' is already used by a Transformers config, pick another name.")
         self._extra_content[key] = value
 
@@ -1230,7 +1230,7 @@ class AutoConfig:
                 "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                 FutureWarning,
             )
-            if kwargs.get("token", None) is not None:
+            if kwargs.get("token") is not None:
                 raise ValueError(
                     "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                 )
diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py
index 0878c5ce30..7816e739c6 100644
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@@ -342,7 +342,7 @@ class AutoFeatureExtractor:
                 "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                 FutureWarning,
             )
-            if kwargs.get("token", None) is not None:
+            if kwargs.get("token") is not None:
                 raise ValueError(
                     "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                 )
@@ -400,7 +400,7 @@ class AutoFeatureExtractor:
         raise ValueError(
             f"Unrecognized feature extractor in {pretrained_model_name_or_path}. Should have a "
             f"`feature_extractor_type` key in its {FEATURE_EXTRACTOR_NAME} of {CONFIG_NAME}, or one of the following "
-            f"`model_type` keys in its {CONFIG_NAME}: {', '.join(c for c in FEATURE_EXTRACTOR_MAPPING_NAMES.keys())}"
+            f"`model_type` keys in its {CONFIG_NAME}: {', '.join(c for c in FEATURE_EXTRACTOR_MAPPING_NAMES)}"
         )
 
     @staticmethod
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index ec55316484..cefa1335eb 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -451,7 +451,7 @@ class AutoImageProcessor:
                 "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                 FutureWarning,
             )
-            if kwargs.get("token", None) is not None:
+            if kwargs.get("token") is not None:
                 raise ValueError(
                     "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                 )
@@ -625,7 +625,7 @@ class AutoImageProcessor:
         raise ValueError(
             f"Unrecognized image processor in {pretrained_model_name_or_path}. Should have a "
             f"`image_processor_type` key in its {IMAGE_PROCESSOR_NAME} of {CONFIG_NAME}, or one of the following "
-            f"`model_type` keys in its {CONFIG_NAME}: {', '.join(c for c in IMAGE_PROCESSOR_MAPPING_NAMES.keys())}"
+            f"`model_type` keys in its {CONFIG_NAME}: {', '.join(c for c in IMAGE_PROCESSOR_MAPPING_NAMES)}"
         )
 
     @staticmethod
diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
index cc2be544f4..0d711cee06 100644
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -261,7 +261,7 @@ class AutoProcessor:
                 "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                 FutureWarning,
             )
-            if kwargs.get("token", None) is not None:
+            if kwargs.get("token") is not None:
                 raise ValueError(
                     "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                 )
@@ -276,9 +276,7 @@ class AutoProcessor:
 
         # First, let's see if we have a processor or preprocessor config.
         # Filter the kwargs for `cached_file`.
-        cached_file_kwargs = {
-            key: kwargs[key] for key in inspect.signature(cached_file).parameters.keys() if key in kwargs
-        }
+        cached_file_kwargs = {key: kwargs[key] for key in inspect.signature(cached_file).parameters if key in kwargs}
         # We don't want to raise
         cached_file_kwargs.update(
             {
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index 29849fdb24..f9832df525 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -875,7 +875,7 @@ def get_tokenizer_config(
             raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
         token = use_auth_token
 
-    commit_hash = kwargs.get("_commit_hash", None)
+    commit_hash = kwargs.get("_commit_hash")
     resolved_config_file = cached_file(
         pretrained_model_name_or_path,
         TOKENIZER_CONFIG_FILE,
@@ -1000,7 +1000,7 @@ class AutoTokenizer:
                 "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                 FutureWarning,
             )
-            if kwargs.get("token", None) is not None:
+            if kwargs.get("token") is not None:
                 raise ValueError(
                     "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                 )
@@ -1012,7 +1012,7 @@ class AutoTokenizer:
         use_fast = kwargs.pop("use_fast", True)
         tokenizer_type = kwargs.pop("tokenizer_type", None)
         trust_remote_code = kwargs.pop("trust_remote_code", None)
-        gguf_file = kwargs.get("gguf_file", None)
+        gguf_file = kwargs.get("gguf_file")
 
         # First, let's see whether the tokenizer_type is passed so that we can leverage it
         if tokenizer_type is not None:
@@ -1022,7 +1022,7 @@ class AutoTokenizer:
             if tokenizer_class_tuple is None:
                 raise ValueError(
                     f"Passed `tokenizer_type` {tokenizer_type} does not exist. `tokenizer_type` should be one of "
-                    f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES.keys())}."
+                    f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES)}."
                 )
 
             tokenizer_class_name, tokenizer_fast_class_name = tokenizer_class_tuple
@@ -1142,7 +1142,7 @@ class AutoTokenizer:
 
         raise ValueError(
             f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n"
-            f"Model type should be one of {', '.join(c.__name__ for c in TOKENIZER_MAPPING.keys())}."
+            f"Model type should be one of {', '.join(c.__name__ for c in TOKENIZER_MAPPING)}."
         )
 
     @staticmethod
diff --git a/src/transformers/models/auto/video_processing_auto.py b/src/transformers/models/auto/video_processing_auto.py
index 77a8c458bd..545fcc4d92 100644
--- a/src/transformers/models/auto/video_processing_auto.py
+++ b/src/transformers/models/auto/video_processing_auto.py
@@ -291,7 +291,7 @@ class AutoVideoProcessor:
                 "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                 FutureWarning,
             )
-            if kwargs.get("token", None) is not None:
+            if kwargs.get("token") is not None:
                 raise ValueError(
                     "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                 )
@@ -364,7 +364,7 @@ class AutoVideoProcessor:
         raise ValueError(
             f"Unrecognized video processor in {pretrained_model_name_or_path}. Should have a "
             f"`video_processor_type` key in its {VIDEO_PROCESSOR_NAME} of {CONFIG_NAME}, or one of the following "
-            f"`model_type` keys in its {CONFIG_NAME}: {', '.join(c for c in VIDEO_PROCESSOR_MAPPING_NAMES.keys())}"
+            f"`model_type` keys in its {CONFIG_NAME}: {', '.join(c for c in VIDEO_PROCESSOR_MAPPING_NAMES)}"
         )
 
     @staticmethod
diff --git a/src/transformers/models/bit/convert_bit_to_pytorch.py b/src/transformers/models/bit/convert_bit_to_pytorch.py
index abc24290ab..814db3ca4f 100644
--- a/src/transformers/models/bit/convert_bit_to_pytorch.py
+++ b/src/transformers/models/bit/convert_bit_to_pytorch.py
@@ -94,7 +94,7 @@ def convert_bit_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=Fal
 
     # load state_dict of original model
     state_dict = timm_model.state_dict()
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         val = state_dict.pop(key)
         state_dict[rename_key(key)] = val.squeeze() if "head" in key else val
 
diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py
index 3644c71cc0..267b0ffcb0 100644
--- a/src/transformers/models/blip/modeling_blip.py
+++ b/src/transformers/models/blip/modeling_blip.py
@@ -443,7 +443,7 @@ class BlipPreTrainedModel(PreTrainedModel):
     def _init_weights(self, module):
         """Initialize the weights"""
         factor = self.config.initializer_range
-        if isinstance(module, nn.Conv2d) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear):
+        if isinstance(module, (nn.Conv2d, nn.Embedding, nn.Linear)):
             module.weight.data.normal_(mean=0.0, std=factor)
             if hasattr(module, "bias") and module.bias is not None:
                 module.bias.data.zero_()
diff --git a/src/transformers/models/blip/modeling_tf_blip_text.py b/src/transformers/models/blip/modeling_tf_blip_text.py
index ec86f6d26a..7dae1126e0 100644
--- a/src/transformers/models/blip/modeling_tf_blip_text.py
+++ b/src/transformers/models/blip/modeling_tf_blip_text.py
@@ -1096,8 +1096,8 @@ class TFBlipTextLMHeadModel(TFBlipTextPreTrainedModel):
             "input_ids": input_ids,
             "attention_mask": attention_mask,
             "past_key_values": past_key_values,
-            "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
-            "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
+            "encoder_hidden_states": model_kwargs.get("encoder_hidden_states"),
+            "encoder_attention_mask": model_kwargs.get("encoder_attention_mask"),
             "is_decoder": True,
         }
 
diff --git a/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py b/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py
index b9c9b27bd3..26be31dcbb 100644
--- a/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py
+++ b/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py
@@ -114,7 +114,7 @@ def convert_bloom_checkpoint_to_pytorch(
                 if tensors is None:
                     tensors = temp
                 else:
-                    for key in tensors.keys():
+                    for key in tensors:
                         if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH):
                             # We average (sum and then divide) some weights across TP ranks (see https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/olruwase/sync_layer_norms/megatron/training.py#L425)
                             tensors[key] += temp[key]
@@ -125,7 +125,7 @@ def convert_bloom_checkpoint_to_pytorch(
                             tensors[key] = torch.cat([tensors[key], temp[key]], dim=cat_dim)
 
             # Divide by the number of TP the weights we want to average
-            for key in tensors.keys():
+            for key in tensors:
                 if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH):
                     tensors[key] = tensors[key] / pretraining_tp
             torch.save(
@@ -136,7 +136,7 @@ def convert_bloom_checkpoint_to_pytorch(
                 ),
             )
 
-            for key in tensors.keys():
+            for key in tensors:
                 value = tensors[key]
                 total_size += value.numel() * get_dtype_size(value.dtype)
                 if key not in index_dict["weight_map"]:
@@ -174,7 +174,7 @@ def convert_bloom_checkpoint_to_pytorch(
                 if tensors is None:
                     tensors = temp
                 else:
-                    for key in tensors.keys():
+                    for key in tensors:
                         # We average (sum and then divide) some weights across TP ranks (see https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/olruwase/sync_layer_norms/megatron/training.py#L425)
                         if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH):
                             tensors[key] += temp[key]
@@ -185,7 +185,7 @@ def convert_bloom_checkpoint_to_pytorch(
                             tensors[key] = torch.cat([tensors[key], temp[key]], dim=cat_dim)
 
             # Divide by the number of TP the weights we want to average
-            for key in tensors.keys():
+            for key in tensors:
                 if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH):
                     tensors[key] = tensors[key] / pretraining_tp
 
diff --git a/src/transformers/models/bros/convert_bros_to_pytorch.py b/src/transformers/models/bros/convert_bros_to_pytorch.py
index c0984f2c74..35c89a88da 100644
--- a/src/transformers/models/bros/convert_bros_to_pytorch.py
+++ b/src/transformers/models/bros/convert_bros_to_pytorch.py
@@ -55,7 +55,7 @@ def rename_key(name):
 
 def convert_state_dict(orig_state_dict, model):
     # rename keys
-    for key in orig_state_dict.copy().keys():
+    for key in orig_state_dict.copy():
         val = orig_state_dict.pop(key)
         orig_state_dict[rename_key(key)] = val
 
diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py
index 6f4a60737b..86b766eabf 100644
--- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py
+++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py
@@ -112,7 +112,7 @@ def rename_key(name):
 
 
 def convert_state_dict(orig_state_dict, config):
-    for key in orig_state_dict.copy().keys():
+    for key in orig_state_dict.copy():
         val = orig_state_dict.pop(key)
 
         if key.startswith("clip_model") and "attn.in_proj" in key:
@@ -172,7 +172,7 @@ def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_
     state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
 
     # remove some keys
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         if key.startswith("model"):
             state_dict.pop(key, None)
 
diff --git a/src/transformers/models/clvp/tokenization_clvp.py b/src/transformers/models/clvp/tokenization_clvp.py
index 61bd5964ec..4b0b285561 100644
--- a/src/transformers/models/clvp/tokenization_clvp.py
+++ b/src/transformers/models/clvp/tokenization_clvp.py
@@ -300,7 +300,7 @@ class ClvpTokenizer(PreTrainedTokenizer):
 
             # if the token is "Ġ" we replace it with "[SPACE]" (if "[SPACE]" is present in the vocab), otherwise we keep the "Ġ".
             bpe_tokens.extend(
-                "[SPACE]" if bpe_token == "\u0120" and "[SPACE]" in self.encoder.keys() else bpe_token
+                "[SPACE]" if bpe_token == "\u0120" and "[SPACE]" in self.encoder else bpe_token
                 for bpe_token in self.bpe(token).split(" ")
             )
 
diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py
index 91f00668be..22658419eb 100644
--- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py
@@ -267,7 +267,7 @@ def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path):
     read_in_q_k_v(state_dict, is_panoptic=is_panoptic)
     # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them
     prefix = "conditional_detr.model." if is_panoptic else "model."
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         if is_panoptic:
             if (
                 key.startswith("conditional_detr")
diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr_fast.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr_fast.py
index c63598e0d6..f48af23ccf 100644
--- a/src/transformers/models/conditional_detr/image_processing_conditional_detr_fast.py
+++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr_fast.py
@@ -327,8 +327,8 @@ class ConditionalDetrImageProcessorFast(BaseImageProcessorFast):
         self.size = get_size_dict(size, max_size=max_size, default_to_square=False)
 
         # Backwards compatibility
-        do_convert_annotations = kwargs.get("do_convert_annotations", None)
-        do_normalize = kwargs.get("do_normalize", None)
+        do_convert_annotations = kwargs.get("do_convert_annotations")
+        do_normalize = kwargs.get("do_normalize")
         if do_convert_annotations is None and getattr(self, "do_convert_annotations", None) is None:
             self.do_convert_annotations = do_normalize if do_normalize is not None else self.do_normalize
 
diff --git a/src/transformers/models/convnext/convert_convnext_to_pytorch.py b/src/transformers/models/convnext/convert_convnext_to_pytorch.py
index 27315ed73f..426ed98b88 100644
--- a/src/transformers/models/convnext/convert_convnext_to_pytorch.py
+++ b/src/transformers/models/convnext/convert_convnext_to_pytorch.py
@@ -128,11 +128,11 @@ def convert_convnext_checkpoint(checkpoint_url, pytorch_dump_folder_path):
     # load original state_dict from URL
     state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)["model"]
     # rename keys
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         val = state_dict.pop(key)
         state_dict[rename_key(key)] = val
     # add prefix to all keys expect classifier head
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         val = state_dict.pop(key)
         if not key.startswith("classifier"):
             key = "convnext." + key
diff --git a/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py b/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py
index 8094ecf0d6..d23f248816 100644
--- a/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py
+++ b/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py
@@ -153,11 +153,11 @@ def convert_convnextv2_checkpoint(checkpoint_url, pytorch_dump_folder_path, save
 
     print("Converting model parameters...")
     # rename keys
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         val = state_dict.pop(key)
         state_dict[rename_key(key)] = val
     # add prefix to all keys expect classifier head
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         val = state_dict.pop(key)
         if not key.startswith("classifier"):
             key = "convnextv2." + key
diff --git a/src/transformers/models/ctrl/modeling_tf_ctrl.py b/src/transformers/models/ctrl/modeling_tf_ctrl.py
index e774621e19..1dce90147b 100644
--- a/src/transformers/models/ctrl/modeling_tf_ctrl.py
+++ b/src/transformers/models/ctrl/modeling_tf_ctrl.py
@@ -686,15 +686,15 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss):
 
     # Copied from transformers.models.gpt2.modeling_tf_gpt2.TFGPT2LMHeadModel.prepare_inputs_for_generation
     def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs):
-        token_type_ids = kwargs.get("token_type_ids", None)
+        token_type_ids = kwargs.get("token_type_ids")
         # only last token for inputs_ids if past is defined in kwargs
         if past_key_values:
             inputs = tf.expand_dims(inputs[:, -1], -1)
             if token_type_ids is not None:
                 token_type_ids = tf.expand_dims(token_type_ids[:, -1], -1)
 
-        position_ids = kwargs.get("position_ids", None)
-        attention_mask = kwargs.get("attention_mask", None)
+        position_ids = kwargs.get("position_ids")
+        attention_mask = kwargs.get("attention_mask")
 
         if attention_mask is not None and position_ids is None:
             position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)
diff --git a/src/transformers/models/d_fine/convert_d_fine_original_pytorch_checkpoint_to_hf.py b/src/transformers/models/d_fine/convert_d_fine_original_pytorch_checkpoint_to_hf.py
index 8a344c520e..0b77ee3557 100644
--- a/src/transformers/models/d_fine/convert_d_fine_original_pytorch_checkpoint_to_hf.py
+++ b/src/transformers/models/d_fine/convert_d_fine_original_pytorch_checkpoint_to_hf.py
@@ -158,7 +158,7 @@ def load_original_state_dict(repo_id, model_name):
 
     original_state_dict = {}
     model = torch.load(directory_path, map_location="cpu")["model"]
-    for key in model.keys():
+    for key in model:
         original_state_dict[key] = model[key]
 
     return original_state_dict
@@ -406,7 +406,7 @@ def convert_d_fine_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub,
     # query, key and value matrices need special treatment
     read_in_q_k_v(state_dict, config, model_name)
     # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         if key.endswith("num_batches_tracked"):
             del state_dict[key]
         # for two_stage
diff --git a/src/transformers/models/dab_detr/convert_dab_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/dab_detr/convert_dab_detr_original_pytorch_checkpoint_to_pytorch.py
index 3d8cf3e279..efaac368f6 100644
--- a/src/transformers/models/dab_detr/convert_dab_detr_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/dab_detr/convert_dab_detr_original_pytorch_checkpoint_to_pytorch.py
@@ -180,7 +180,7 @@ def write_model(model_name, pretrained_model_weights_path, pytorch_dump_folder_p
     gc.collect()
     # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them
     prefix = "model."
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         if not key.startswith("class_embed") and not key.startswith("bbox_predictor"):
             val = state_dict.pop(key)
             state_dict[prefix + key] = val
diff --git a/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py b/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py
index c88582eacc..dbd7fa3f4d 100644
--- a/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py
+++ b/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py
@@ -127,14 +127,14 @@ def convert_deformable_detr_checkpoint(
     # load original state dict
     state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"]
     # rename keys
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         val = state_dict.pop(key)
         state_dict[rename_key(key)] = val
     # query, key and value matrices need special treatment
     read_in_q_k_v(state_dict)
     # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them
     prefix = "model."
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         if not key.startswith("class_embed") and not key.startswith("bbox_embed"):
             val = state_dict.pop(key)
             state_dict[prefix + key] = val
diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py
index 172bdf80d4..b78ae6ee66 100644
--- a/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py
+++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py
@@ -318,8 +318,8 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
         self.size = get_size_dict(size, max_size=max_size, default_to_square=False)
 
         # Backwards compatibility
-        do_convert_annotations = kwargs.get("do_convert_annotations", None)
-        do_normalize = kwargs.get("do_normalize", None)
+        do_convert_annotations = kwargs.get("do_convert_annotations")
+        do_normalize = kwargs.get("do_normalize")
         if do_convert_annotations is None and getattr(self, "do_convert_annotations", None) is None:
             self.do_convert_annotations = do_normalize if do_normalize is not None else self.do_normalize
 
diff --git a/src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py b/src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py
index 6436451190..2a38bc05cc 100644
--- a/src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py
+++ b/src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py
@@ -238,7 +238,7 @@ def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
     read_in_decoder_q_k_v(state_dict, config)
 
     # fix some prefixes
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key:
             val = state_dict.pop(key)
             state_dict[key.replace("transformer.decoder", "model.decoder")] = val
diff --git a/src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py b/src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py
index c2e1ae6001..a72c8c5422 100644
--- a/src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py
+++ b/src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py
@@ -244,7 +244,7 @@ def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
     read_in_decoder_q_k_v(state_dict, config)
 
     # fix some prefixes
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key:
             val = state_dict.pop(key)
             state_dict[key.replace("transformer.decoder", "model.decoder")] = val
diff --git a/src/transformers/models/deprecated/deta/modeling_deta.py b/src/transformers/models/deprecated/deta/modeling_deta.py
index e109b84938..edc0f2598a 100644
--- a/src/transformers/models/deprecated/deta/modeling_deta.py
+++ b/src/transformers/models/deprecated/deta/modeling_deta.py
@@ -2042,7 +2042,7 @@ class DetaForObjectDetection(DetaPreTrainedModel):
                     aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
                 aux_weight_dict.update({k + "_enc": v for k, v in weight_dict.items()})
                 weight_dict.update(aux_weight_dict)
-            loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
+            loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict if k in weight_dict)
 
         if not return_dict:
             if auxiliary_outputs is not None:
diff --git a/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py
index 80f16881b5..7b1a4aa5f2 100644
--- a/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py
@@ -105,7 +105,7 @@ def rename_key(old_name, num_meta4D_last_stage):
 
 
 def convert_torch_checkpoint(checkpoint, num_meta4D_last_stage):
-    for key in checkpoint.copy().keys():
+    for key in checkpoint.copy():
         val = checkpoint.pop(key)
         checkpoint[rename_key(key, num_meta4D_last_stage)] = val
 
diff --git a/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py
index 8aa927d821..76b9c9cf32 100644
--- a/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py
+++ b/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py
@@ -38,7 +38,7 @@ def convert_tf_gptsan_to_pt(args):
     with tf.device("/CPU:0"):
         reader = tf.train.load_checkpoint(args.tf_model_dir)
         shapes = reader.get_variable_to_shape_map()
-        for key_name in shapes.keys():
+        for key_name in shapes:
             vnp = reader.get_tensor(key_name).astype(np.float16)
             if key_name.endswith("/adam_m") or key_name.endswith("/adam_v"):
                 continue
diff --git a/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py b/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py
index 5eab135c05..c67b27f64f 100644
--- a/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py
+++ b/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py
@@ -398,7 +398,7 @@ class SubWordJapaneseTokenizer:
         self.vocab = vocab  # same as swe
         self.ids_to_tokens = ids_to_tokens  # same as bpe
         self.emoji = emoji
-        self.maxlen = np.max([len(w) for w in self.vocab.keys()])
+        self.maxlen = np.max([len(w) for w in self.vocab])
         self.content_repatter1 = re.compile(r"(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+$,%#]+)")
         self.content_repatter2 = re.compile(r"[A-Za-z0-9\._+]*@[\-_0-9A-Za-z]+(\.[A-Za-z]+)*")
         self.content_repatter3 = re.compile(r"[\(]{0,1}[0-9]{2,4}[\)\-\(]{0,1}[0-9]{2,4}[\)\-]{0,1}[0-9]{3,4}")
diff --git a/src/transformers/models/deprecated/graphormer/collating_graphormer.py b/src/transformers/models/deprecated/graphormer/collating_graphormer.py
index a5300fe8a5..19bcaac3f5 100644
--- a/src/transformers/models/deprecated/graphormer/collating_graphormer.py
+++ b/src/transformers/models/deprecated/graphormer/collating_graphormer.py
@@ -27,12 +27,12 @@ def convert_to_single_emb(x, offset: int = 512):
 def preprocess_item(item, keep_features=True):
     requires_backends(preprocess_item, ["cython"])
 
-    if keep_features and "edge_attr" in item.keys():  # edge_attr
+    if keep_features and "edge_attr" in item:  # edge_attr
         edge_attr = np.asarray(item["edge_attr"], dtype=np.int64)
     else:
         edge_attr = np.ones((len(item["edge_index"][0]), 1), dtype=np.int64)  # same embedding for all
 
-    if keep_features and "node_feat" in item.keys():  # input_nodes
+    if keep_features and "node_feat" in item:  # input_nodes
         node_feature = np.asarray(item["node_feat"], dtype=np.int64)
     else:
         node_feature = np.ones((item["num_nodes"], 1), dtype=np.int64)  # same embedding for all
diff --git a/src/transformers/models/deprecated/jukebox/convert_jukebox.py b/src/transformers/models/deprecated/jukebox/convert_jukebox.py
index 3380e38693..29763daaa3 100644
--- a/src/transformers/models/deprecated/jukebox/convert_jukebox.py
+++ b/src/transformers/models/deprecated/jukebox/convert_jukebox.py
@@ -231,7 +231,7 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
         old_dic = torch.load(f"{pytorch_dump_folder_path}/{dict_name.split('/')[-1]}", weights_only=True)["model"]
 
         new_dic = {}
-        for k in old_dic.keys():
+        for k in old_dic:
             if k.endswith(".b"):
                 new_dic[k.replace("b", "bias")] = old_dic[k]
             elif k.endswith(".w"):
diff --git a/src/transformers/models/deprecated/jukebox/modeling_jukebox.py b/src/transformers/models/deprecated/jukebox/modeling_jukebox.py
index 4dfd1c6924..f928d49cf5 100755
--- a/src/transformers/models/deprecated/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/deprecated/jukebox/modeling_jukebox.py
@@ -2269,7 +2269,7 @@ class JukeboxPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = False
 
     def _init_weights(self, module):
-        if isinstance(module, JukeboxPrior) or isinstance(module, JukeboxVQVAE):
+        if isinstance(module, (JukeboxPrior, JukeboxVQVAE)):
             module.apply(module._init_weights)
 
     def __init__(self, *inputs, **kwargs):
diff --git a/src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py
index c6dbb12890..6ac5dd4df1 100644
--- a/src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py
@@ -188,7 +188,7 @@ def convert_checkpoint_to_huggingface(pretrained_checkpoint_path, output_path, i
     # also renaming previously confusing parameter names
     original_state_dict = original_mlm.mega.encoders.state_dict()
     updated_keys = {}
-    for module_name in original_state_dict.keys():
+    for module_name in original_state_dict:
         new_module_name = None
         # have to handle gamma, beta, and alpha differently due to their use
         # in multiple modules within the original repository;
diff --git a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py
index abdcc2b40e..235cb81717 100644
--- a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py
+++ b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py
@@ -787,7 +787,7 @@ class OpenLlamaForCausalLM(OpenLlamaPreTrainedModel):
 
             input_ids = input_ids[:, remove_prefix_length:]
 
-        position_ids = kwargs.get("position_ids", None)
+        position_ids = kwargs.get("position_ids")
         if attention_mask is not None and position_ids is None:
             # create position_ids on the fly for batch generation
             position_ids = attention_mask.long().cumsum(-1) - 1
diff --git a/src/transformers/models/deprecated/tapex/tokenization_tapex.py b/src/transformers/models/deprecated/tapex/tokenization_tapex.py
index 4ad87dd4e7..b32383ddd4 100644
--- a/src/transformers/models/deprecated/tapex/tokenization_tapex.py
+++ b/src/transformers/models/deprecated/tapex/tokenization_tapex.py
@@ -1354,12 +1354,12 @@ class TapexTokenizer(PreTrainedTokenizer):
         # modify the answer list
         if answer is not None:
             for i, case in enumerate(answer):
-                if case in cell_mapping.keys():
+                if case in cell_mapping:
                     answer[i] = cell_mapping[case]
 
     def truncate_cell(self, cell_value):
         # do not process on these cases
-        if isinstance(cell_value, int) or isinstance(cell_value, float):
+        if isinstance(cell_value, (int, float)):
             return cell_value
         if cell_value.strip() != "":
             try_tokens = self.tokenize(cell_value)
diff --git a/src/transformers/models/deprecated/van/convert_van_to_pytorch.py b/src/transformers/models/deprecated/van/convert_van_to_pytorch.py
index bdd357f04a..ec43af68d7 100644
--- a/src/transformers/models/deprecated/van/convert_van_to_pytorch.py
+++ b/src/transformers/models/deprecated/van/convert_van_to_pytorch.py
@@ -45,7 +45,7 @@ class Tracker:
     handles: list = field(default_factory=list)
 
     def _forward_hook(self, m, inputs: Tensor, outputs: Tensor):
-        has_not_submodules = len(list(m.modules())) == 1 or isinstance(m, nn.Conv2d) or isinstance(m, nn.BatchNorm2d)
+        has_not_submodules = len(list(m.modules())) == 1 or isinstance(m, (nn.Conv2d, nn.BatchNorm2d))
         if has_not_submodules:
             if not isinstance(m, VanLayerScaling):
                 self.traced.append(m)
diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py
index bbd2eea60b..6bc14a0e15 100644
--- a/src/transformers/models/depth_pro/configuration_depth_pro.py
+++ b/src/transformers/models/depth_pro/configuration_depth_pro.py
@@ -158,7 +158,7 @@ class DepthProConfig(PretrainedConfig):
         self.patch_model_config = patch_model_config
         self.fov_model_config = fov_model_config
 
-        for sub_config_key in self.sub_configs.keys():
+        for sub_config_key in self.sub_configs:
             sub_config = getattr(self, sub_config_key)
 
             if sub_config is None:
diff --git a/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py
index ba98514501..8a7a2e0e0a 100644
--- a/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py
@@ -224,7 +224,7 @@ def convert_detr_checkpoint(model_name, pytorch_dump_folder_path):
     read_in_q_k_v(state_dict, is_panoptic=is_panoptic)
     # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them
     prefix = "detr.model." if is_panoptic else "model."
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         if is_panoptic:
             if (
                 key.startswith("detr")
diff --git a/src/transformers/models/detr/convert_detr_to_pytorch.py b/src/transformers/models/detr/convert_detr_to_pytorch.py
index 6ba6a0e292..ffc755074d 100644
--- a/src/transformers/models/detr/convert_detr_to_pytorch.py
+++ b/src/transformers/models/detr/convert_detr_to_pytorch.py
@@ -310,7 +310,7 @@ def convert_detr_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_h
     read_in_q_k_v(state_dict, is_panoptic=is_panoptic)
     # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them
     prefix = "detr.model." if is_panoptic else "model."
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         if is_panoptic:
             if (
                 key.startswith("detr")
diff --git a/src/transformers/models/detr/image_processing_detr_fast.py b/src/transformers/models/detr/image_processing_detr_fast.py
index 8a3235a41d..70ae52fef4 100644
--- a/src/transformers/models/detr/image_processing_detr_fast.py
+++ b/src/transformers/models/detr/image_processing_detr_fast.py
@@ -339,8 +339,8 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
         self.size = get_size_dict(size, max_size=max_size, default_to_square=False)
 
         # Backwards compatibility
-        do_convert_annotations = kwargs.get("do_convert_annotations", None)
-        do_normalize = kwargs.get("do_normalize", None)
+        do_convert_annotations = kwargs.get("do_convert_annotations")
+        do_normalize = kwargs.get("do_normalize")
         if do_convert_annotations is None and getattr(self, "do_convert_annotations", None) is None:
             self.do_convert_annotations = do_normalize if do_normalize is not None else self.do_normalize
 
diff --git a/src/transformers/models/dia/generation_dia.py b/src/transformers/models/dia/generation_dia.py
index 0ca5998bf2..5111e77644 100644
--- a/src/transformers/models/dia/generation_dia.py
+++ b/src/transformers/models/dia/generation_dia.py
@@ -421,7 +421,7 @@ class DiaGenerationMixin(GenerationMixin):
         **kwargs,
     ) -> Union[GenerateOutput, torch.LongTensor]:
         # We expect the initial input ids to be the complete mask (delayed input)
-        delay_mask = kwargs.get("decoder_input_ids", None)
+        delay_mask = kwargs.get("decoder_input_ids")
         if delay_mask is not None:
             delay_mask = delay_mask.clone()
 
diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py
index f6f14f6d08..d58cdd6224 100644
--- a/src/transformers/models/donut/convert_donut_to_pytorch.py
+++ b/src/transformers/models/donut/convert_donut_to_pytorch.py
@@ -93,7 +93,7 @@ def rename_key(name):
 
 
 def convert_state_dict(orig_state_dict, model):
-    for key in orig_state_dict.copy().keys():
+    for key in orig_state_dict.copy():
         val = orig_state_dict.pop(key)
 
         if "qkv" in key:
diff --git a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py
index ceae9b8471..ce53018a76 100644
--- a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py
+++ b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py
@@ -230,7 +230,7 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
     # remove certain keys
     remove_ignore_keys_(state_dict)
     # rename keys
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         val = state_dict.pop(key)
         state_dict[rename_key(key)] = val
     # read in qkv matrices
diff --git a/src/transformers/models/dpt/convert_dpt_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_to_pytorch.py
index 55e0a444e8..1341f8908b 100644
--- a/src/transformers/models/dpt/convert_dpt_to_pytorch.py
+++ b/src/transformers/models/dpt/convert_dpt_to_pytorch.py
@@ -197,7 +197,7 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
     # remove certain keys
     remove_ignore_keys_(state_dict)
     # rename keys
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         val = state_dict.pop(key)
         state_dict[rename_key(key)] = val
     # read in qkv matrices
diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
index 44d8e6cd7e..b065266795 100644
--- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
@@ -373,9 +373,9 @@ class EncoderDecoderModel(PreTrainedModel, GenerationMixin):
         }
 
         # remove encoder, decoder kwargs from kwargs
-        for key in kwargs_encoder.keys():
+        for key in kwargs_encoder:
             del kwargs["encoder_" + key]
-        for key in kwargs_decoder.keys():
+        for key in kwargs_decoder:
             del kwargs["decoder_" + key]
 
         # Load and initialize the encoder and decoder
diff --git a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
index 13da12b1aa..67fa50ac36 100644
--- a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
@@ -818,9 +818,9 @@ class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
         }
 
         # remove encoder, decoder kwargs from kwargs
-        for key in kwargs_encoder.keys():
+        for key in kwargs_encoder:
             del kwargs["encoder_" + key]
-        for key in kwargs_decoder.keys():
+        for key in kwargs_decoder:
             del kwargs["decoder_" + key]
 
         # Load and initialize the encoder and decoder
diff --git a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
index 05cd02e3bc..da0dde6915 100644
--- a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
@@ -374,9 +374,9 @@ class TFEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLoss):
         }
 
         # remove encoder, decoder kwargs from kwargs
-        for key in kwargs_encoder.keys():
+        for key in kwargs_encoder:
             del kwargs["encoder_" + key]
-        for key in kwargs_decoder.keys():
+        for key in kwargs_decoder:
             del kwargs["decoder_" + key]
 
         # Load and initialize the encoder and decoder
diff --git a/src/transformers/models/eomt/image_processing_eomt.py b/src/transformers/models/eomt/image_processing_eomt.py
index e63a1be95f..37b9b11103 100644
--- a/src/transformers/models/eomt/image_processing_eomt.py
+++ b/src/transformers/models/eomt/image_processing_eomt.py
@@ -239,7 +239,7 @@ def compute_segments(
 def get_target_size(size_dict: dict[str, int]) -> tuple[int, int]:
     """Returns the height and width from a size dict."""
     target_height = size_dict["shortest_edge"]
-    target_width = size_dict.get("longest_edge", None) or target_height
+    target_width = size_dict.get("longest_edge") or target_height
 
     return target_height, target_width
 
diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py
index 5cd2bd5058..a6b73b80ce 100644
--- a/src/transformers/models/falcon/modeling_falcon.py
+++ b/src/transformers/models/falcon/modeling_falcon.py
@@ -650,7 +650,7 @@ class FalconPreTrainedModel(PreTrainedModel):
 
     def _init_weights(self, module: nn.Module):
         """Initialize the weights."""
-        if isinstance(module, nn.Linear) or isinstance(module, FalconLinear):
+        if isinstance(module, (nn.Linear, FalconLinear)):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
diff --git a/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py b/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py
index 4aed159280..d6af117e56 100644
--- a/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py
+++ b/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py
@@ -141,7 +141,7 @@ def convert_focalnet_checkpoint(model_name, pytorch_dump_folder_path, push_to_hu
     state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"]
 
     # rename keys
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         val = state_dict.pop(key)
         state_dict[rename_key(key)] = val
 
diff --git a/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py
index c86afddc79..35e8265850 100755
--- a/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py
@@ -134,7 +134,7 @@ def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder
     # detect whether this is a do_lower_case situation, which can be derived by checking whether we
     # have at least one uppercase letter in the source vocab
     do_lower_case = True
-    for k in src_vocab.keys():
+    for k in src_vocab:
         if not k.islower():
             do_lower_case = False
             break
diff --git a/src/transformers/models/fuyu/processing_fuyu.py b/src/transformers/models/fuyu/processing_fuyu.py
index f13869a146..54e5054efb 100644
--- a/src/transformers/models/fuyu/processing_fuyu.py
+++ b/src/transformers/models/fuyu/processing_fuyu.py
@@ -607,7 +607,7 @@ class FuyuProcessor(ProcessorMixin):
 
         vision_data = {}
         if image_sizes is not None:
-            size = kwargs.get("size", None) or self.image_processor.size
+            size = kwargs.get("size") or self.image_processor.size
             padded_height, padded_width = size["height"], size["width"]
 
             num_image_tokens = []
diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py
index 19013e7fdf..978df567c7 100644
--- a/src/transformers/models/gemma3n/configuration_gemma3n.py
+++ b/src/transformers/models/gemma3n/configuration_gemma3n.py
@@ -509,7 +509,7 @@ class Gemma3nVisionConfig(PretrainedConfig):
 
     @classmethod
     def from_dict(cls, config_dict: dict[str, Any], **kwargs):
-        label_names = config_dict.get("label_names", None)
+        label_names = config_dict.get("label_names")
         is_custom_model = "num_labels" in kwargs or "id2label" in kwargs
 
         # if no labels added to config, use imagenet labeller in timm
diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py
index 3312e80c79..5a195b06ec 100644
--- a/src/transformers/models/git/modeling_git.py
+++ b/src/transformers/models/git/modeling_git.py
@@ -1451,7 +1451,7 @@ class GitForCausalLM(GitPreTrainedModel, GenerationMixin):
         return {
             "input_ids": input_ids,
             "attention_mask": attention_mask,
-            "pixel_values": kwargs.get("pixel_values", None),
+            "pixel_values": kwargs.get("pixel_values"),
             "past_key_values": past_key_values,
             "use_cache": use_cache,
         }
diff --git a/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py b/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py
index a1e09375dc..a9398805e9 100644
--- a/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py
+++ b/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py
@@ -232,7 +232,7 @@ def save_sharded_model(state_dict, output_path, max_shard_size_gb=5, num_layers=
         shard_filename = f"model-{i + 1:05d}-of-{len(shards):05d}.safetensors"
         shard_path = os.path.join(output_path, shard_filename)
 
-        for param_name in shard.keys():
+        for param_name in shard:
             index_dict["weight_map"][param_name] = shard_filename
 
         save_file(shard, shard_path, metadata={"format": "pt"})
diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py
index c3155b17ea..7e1100d64e 100644
--- a/src/transformers/models/gpt2/modeling_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_gpt2.py
@@ -718,7 +718,7 @@ class GPT2Model(GPT2PreTrainedModel):
         )
         assert_device_map(self.device_map, len(self.h))
         self.model_parallel = True
-        self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
+        self.first_device = "cpu" if "cpu" in self.device_map else "cuda:" + str(min(self.device_map.keys()))
         self.last_device = "cuda:" + str(max(self.device_map.keys()))
         self.wte = self.wte.to(self.first_device)
         self.wpe = self.wpe.to(self.first_device)
diff --git a/src/transformers/models/gpt2/modeling_tf_gpt2.py b/src/transformers/models/gpt2/modeling_tf_gpt2.py
index 27b2e20c37..42e23fc290 100644
--- a/src/transformers/models/gpt2/modeling_tf_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py
@@ -845,15 +845,15 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
         self.set_input_embeddings(value)
 
     def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs):
-        token_type_ids = kwargs.get("token_type_ids", None)
+        token_type_ids = kwargs.get("token_type_ids")
         # only last token for inputs_ids if past is defined in kwargs
         if past_key_values:
             inputs = tf.expand_dims(inputs[:, -1], -1)
             if token_type_ids is not None:
                 token_type_ids = tf.expand_dims(token_type_ids[:, -1], -1)
 
-        position_ids = kwargs.get("position_ids", None)
-        attention_mask = kwargs.get("attention_mask", None)
+        position_ids = kwargs.get("position_ids")
+        attention_mask = kwargs.get("attention_mask")
 
         if attention_mask is not None and position_ids is None:
             position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)
diff --git a/src/transformers/models/gpt2/tokenization_gpt2_tf.py b/src/transformers/models/gpt2/tokenization_gpt2_tf.py
index a1facd9570..145a45da0d 100644
--- a/src/transformers/models/gpt2/tokenization_gpt2_tf.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2_tf.py
@@ -61,7 +61,7 @@ class TFGPT2Tokenizer(keras.layers.Layer):
         tf_tokenizer = TFGPT2Tokenizer.from_tokenizer(tokenizer)
         ```
         """
-        merges = [" ".join(m) for m in tokenizer.bpe_ranks.keys()]
+        merges = [" ".join(m) for m in tokenizer.bpe_ranks]
         vocab = tokenizer.get_vocab()
         return cls(vocab, merges, *args, **kwargs)
 
diff --git a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
index b65d9b53be..891f77ece3 100644
--- a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
+++ b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
@@ -221,7 +221,7 @@ class SubWordJapaneseTokenizer:
         self.vocab = vocab  # same as swe
         self.ids_to_tokens = ids_to_tokens  # same as bpe
         self.emoji = emoji
-        self.maxlen = np.max([len(w) for w in self.vocab.keys()])
+        self.maxlen = np.max([len(w) for w in self.vocab])
         self.content_repatter1 = re.compile(r"(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+$,%#]+)")
         self.content_repatter2 = re.compile(r"[A-Za-z0-9\._+]*@[\-_0-9A-Za-z]+(\.[A-Za-z]+)*")
         self.content_repatter3 = re.compile(r"[\(]{0,1}[0-9]{2,4}[\)\-\(]{0,1}[0-9]{2,4}[\)\-]{0,1}[0-9]{3,4}")
diff --git a/src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py b/src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py
index c4e2ff67c5..27ec2f20d8 100644
--- a/src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py
+++ b/src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py
@@ -34,7 +34,7 @@ def recursive_print(name, val, spaces=0):
     if isinstance(val, dict):
         if msg is not None:
             print(msg)
-        for k in val.keys():
+        for k in val:
             recursive_print(k, val[k], spaces + 2)
     elif isinstance(val, torch.Tensor):
         print(msg, ":", val.size())
diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py
index 0622bf5ed0..d95d83d2c9 100644
--- a/src/transformers/models/gptj/modeling_gptj.py
+++ b/src/transformers/models/gptj/modeling_gptj.py
@@ -581,7 +581,7 @@ class GPTJModel(GPTJPreTrainedModel):
         )
         assert_device_map(self.device_map, len(self.h))
         self.model_parallel = True
-        self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
+        self.first_device = "cpu" if "cpu" in self.device_map else "cuda:" + str(min(self.device_map.keys()))
         self.last_device = "cuda:" + str(max(self.device_map.keys()))
         self.wte = self.wte.to(self.first_device)
         # Load onto devices
diff --git a/src/transformers/models/gptj/modeling_tf_gptj.py b/src/transformers/models/gptj/modeling_tf_gptj.py
index d9327bb50a..0ec3225822 100644
--- a/src/transformers/models/gptj/modeling_tf_gptj.py
+++ b/src/transformers/models/gptj/modeling_tf_gptj.py
@@ -752,15 +752,15 @@ class TFGPTJForCausalLM(TFGPTJPreTrainedModel, TFCausalLanguageModelingLoss):
         self.config = config
 
     def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs):
-        token_type_ids = kwargs.get("token_type_ids", None)
+        token_type_ids = kwargs.get("token_type_ids")
         # only last token for inputs_ids if past is defined in kwargs
         if past_key_values:
             inputs = tf.expand_dims(inputs[:, -1], -1)
             if token_type_ids is not None:
                 token_type_ids = tf.expand_dims(token_type_ids[:, -1], -1)
 
-        position_ids = kwargs.get("position_ids", None)
-        attention_mask = kwargs.get("attention_mask", None)
+        position_ids = kwargs.get("position_ids")
+        attention_mask = kwargs.get("attention_mask")
 
         if attention_mask is not None and position_ids is None:
             position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)
diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py
index 9f3f70bc57..b7358e2a01 100644
--- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py
+++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py
@@ -239,7 +239,7 @@ def create_rename_keys(state_dict, config):
     ########################################## DECODER - END
 
     ########################################## Additional - START
-    for layer_name in state_dict.keys():
+    for layer_name in state_dict:
         #### TEXT BACKBONE
         if "bert" in layer_name:
             rename_keys.append((layer_name, layer_name.replace("bert", "model.text_backbone")))
diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py
index 6505ddd896..ff291b3d67 100644
--- a/src/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py
+++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py
@@ -349,8 +349,8 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
         self.size = get_size_dict(size, max_size=max_size, default_to_square=False)
 
         # Backwards compatibility
-        do_convert_annotations = kwargs.get("do_convert_annotations", None)
-        do_normalize = kwargs.get("do_normalize", None)
+        do_convert_annotations = kwargs.get("do_convert_annotations")
+        do_normalize = kwargs.get("do_normalize")
         if do_convert_annotations is None and getattr(self, "do_convert_annotations", None) is None:
             self.do_convert_annotations = do_normalize if do_normalize is not None else self.do_normalize
 
diff --git a/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py b/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py
index 6bc2818498..ac6844bd34 100644
--- a/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py
+++ b/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py
@@ -85,7 +85,7 @@ def rename_key(name):
 
 
 def convert_state_dict(orig_state_dict, config):
-    for key in orig_state_dict.copy().keys():
+    for key in orig_state_dict.copy():
         val = orig_state_dict.pop(key)
 
         if "qkv" in key:
diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py
index 0b7ed531db..d3b462692a 100644
--- a/src/transformers/models/hubert/modeling_tf_hubert.py
+++ b/src/transformers/models/hubert/modeling_tf_hubert.py
@@ -1276,7 +1276,7 @@ class TFHubertMainLayer(keras.layers.Layer):
 
         hidden_states = self.feature_projection(hidden_states, training=training)
 
-        mask_time_indices = kwargs.get("mask_time_indices", None)
+        mask_time_indices = kwargs.get("mask_time_indices")
         if training:
             hidden_states = self._mask_hidden_states(hidden_states, mask_time_indices=mask_time_indices)
 
diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py
index ac8b7776c5..e47ddbe273 100644
--- a/src/transformers/models/idefics/modeling_idefics.py
+++ b/src/transformers/models/idefics/modeling_idefics.py
@@ -132,10 +132,10 @@ def expand_inputs_for_generation(
         torch.arange(input_ids.shape[0]).view(-1, 1).repeat(1, expand_size).view(-1).to(input_ids.device)
     )
     input_ids = input_ids.index_select(0, expanded_return_idx)
-    model_kwargs["pixel_values"] = model_kwargs.get("pixel_values", None)
-    model_kwargs["image_encoder_embeddings"] = model_kwargs.get("image_encoder_embeddings", None)
-    model_kwargs["perceiver_embeddings"] = model_kwargs.get("perceiver_embeddings", None)
-    model_kwargs["image_attention_mask"] = model_kwargs.get("image_attention_mask", None)
+    model_kwargs["pixel_values"] = model_kwargs.get("pixel_values")
+    model_kwargs["image_encoder_embeddings"] = model_kwargs.get("image_encoder_embeddings")
+    model_kwargs["perceiver_embeddings"] = model_kwargs.get("perceiver_embeddings")
+    model_kwargs["image_attention_mask"] = model_kwargs.get("image_attention_mask")
 
     if "token_type_ids" in model_kwargs:
         token_type_ids = model_kwargs["token_type_ids"]
diff --git a/src/transformers/models/idefics/modeling_tf_idefics.py b/src/transformers/models/idefics/modeling_tf_idefics.py
index f006862284..28bffb7921 100644
--- a/src/transformers/models/idefics/modeling_tf_idefics.py
+++ b/src/transformers/models/idefics/modeling_tf_idefics.py
@@ -149,10 +149,10 @@ def expand_inputs_for_generation(
 ):
     expanded_return_idx = tf.reshape(tf.repeat(tf.range(tf.shape(input_ids)[0]), expand_size), [-1])
     input_ids = tf.gather(input_ids, expanded_return_idx)
-    model_kwargs["pixel_values"] = model_kwargs.get("pixel_values", None)
-    model_kwargs["image_encoder_embeddings"] = model_kwargs.get("image_encoder_embeddings", None)
-    model_kwargs["perceiver_embeddings"] = model_kwargs.get("perceiver_embeddings", None)
-    model_kwargs["image_attention_mask"] = model_kwargs.get("image_attention_mask", None)
+    model_kwargs["pixel_values"] = model_kwargs.get("pixel_values")
+    model_kwargs["image_encoder_embeddings"] = model_kwargs.get("image_encoder_embeddings")
+    model_kwargs["perceiver_embeddings"] = model_kwargs.get("perceiver_embeddings")
+    model_kwargs["image_attention_mask"] = model_kwargs.get("image_attention_mask")
 
     if "token_type_ids" in model_kwargs:
         token_type_ids = model_kwargs["token_type_ids"]
@@ -208,15 +208,15 @@ def update_model_kwargs_for_generation(outputs, model_kwargs):
 
 
 def prepare_inputs_for_generation(input_ids, past_key_values=None, **kwargs):
-    token_type_ids = kwargs.get("token_type_ids", None)
+    token_type_ids = kwargs.get("token_type_ids")
     # only last token for inputs_ids if past is defined in kwargs
     if past_key_values is not None:
         input_ids = input_ids[:, -1:]
         if token_type_ids is not None:
             token_type_ids = token_type_ids[:, -1:]
 
-    attention_mask = kwargs.get("attention_mask", None)
-    position_ids = kwargs.get("position_ids", None)
+    attention_mask = kwargs.get("attention_mask")
+    position_ids = kwargs.get("position_ids")
 
     if attention_mask is not None and position_ids is None:
         # create position_ids on the fly for batch generation
@@ -225,10 +225,10 @@ def prepare_inputs_for_generation(input_ids, past_key_values=None, **kwargs):
         if past_key_values is not None:
             position_ids = position_ids[:, -1:]
 
-    pixel_values = kwargs.get("pixel_values", None)
-    image_encoder_embeddings = kwargs.get("image_encoder_embeddings", None)
-    perceiver_embeddings = kwargs.get("perceiver_embeddings", None)
-    image_attention_mask = kwargs.get("image_attention_mask", None)
+    pixel_values = kwargs.get("pixel_values")
+    image_encoder_embeddings = kwargs.get("image_encoder_embeddings")
+    perceiver_embeddings = kwargs.get("perceiver_embeddings")
+    image_attention_mask = kwargs.get("image_attention_mask")
     interpolate_pos_encoding = kwargs.get("interpolate_pos_encoding", False)
 
     return {
diff --git a/src/transformers/models/internvl/convert_internvl_weights_to_hf.py b/src/transformers/models/internvl/convert_internvl_weights_to_hf.py
index a390166a04..e20fcf4f36 100644
--- a/src/transformers/models/internvl/convert_internvl_weights_to_hf.py
+++ b/src/transformers/models/internvl/convert_internvl_weights_to_hf.py
@@ -128,7 +128,7 @@ def get_lm_type(path: str) -> Literal["qwen2", "llama"]:
     """
     Determine the type of language model (either 'qwen2' or 'llama') based on a given model path.
     """
-    if path not in LM_TYPE_CORRESPONDENCE.keys():
+    if path not in LM_TYPE_CORRESPONDENCE:
         base_config = AutoModel.from_pretrained(path, trust_remote_code=True).config
 
         lm_arch = base_config.llm_config.architectures[0]
diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
index a7c1502058..8e324ee0b8 100644
--- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
+++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
@@ -519,7 +519,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
         # To match each overflowing sample with the original sample in the batch
         # we add an overflow_to_sample_mapping array (see below)
         sanitized_tokens = {}
-        for key in tokens_and_encodings[0][0].keys():
+        for key in tokens_and_encodings[0][0]:
             stack = [e for item, _ in tokens_and_encodings for e in item[key]]
             sanitized_tokens[key] = stack
         sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py
index aa2fb67b13..d040763859 100644
--- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py
+++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py
@@ -566,7 +566,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
         # To match each overflowing sample with the original sample in the batch
         # we add an overflow_to_sample_mapping array (see below)
         sanitized_tokens = {}
-        for key in tokens_and_encodings[0][0].keys():
+        for key in tokens_and_encodings[0][0]:
             stack = [e for item, _ in tokens_and_encodings for e in item[key]]
             sanitized_tokens[key] = stack
         sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
index cca7a805fe..6710c6c8cb 100644
--- a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
+++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
@@ -503,7 +503,7 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
         # To match each overflowing sample with the original sample in the batch
         # we add an overflow_to_sample_mapping array (see below)
         sanitized_tokens = {}
-        for key in tokens_and_encodings[0][0].keys():
+        for key in tokens_and_encodings[0][0]:
             stack = [e for item, _ in tokens_and_encodings for e in item[key]]
             sanitized_tokens[key] = stack
         sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
diff --git a/src/transformers/models/luke/tokenization_luke.py b/src/transformers/models/luke/tokenization_luke.py
index 8b33055877..efbc757e86 100644
--- a/src/transformers/models/luke/tokenization_luke.py
+++ b/src/transformers/models/luke/tokenization_luke.py
@@ -1450,7 +1450,7 @@ class LukeTokenizer(PreTrainedTokenizer):
         # If we have a list of dicts, let's convert it in a dict of lists
         # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
         if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
-            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
+            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0]}
 
         # The model's main input name, usually `input_ids`, has be passed for padding
         if self.model_input_names[0] not in encoded_inputs:
diff --git a/src/transformers/models/markuplm/tokenization_markuplm_fast.py b/src/transformers/models/markuplm/tokenization_markuplm_fast.py
index f844893641..4033ef319f 100644
--- a/src/transformers/models/markuplm/tokenization_markuplm_fast.py
+++ b/src/transformers/models/markuplm/tokenization_markuplm_fast.py
@@ -625,7 +625,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
         # To match each overflowing sample with the original sample in the batch
         # we add an overflow_to_sample_mapping array (see below)
         sanitized_tokens = {}
-        for key in tokens_and_encodings[0][0].keys():
+        for key in tokens_and_encodings[0][0]:
             stack = [e for item, _ in tokens_and_encodings for e in item[key]]
             sanitized_tokens[key] = stack
         sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
diff --git a/src/transformers/models/mask2former/convert_mask2former_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/mask2former/convert_mask2former_original_pytorch_checkpoint_to_pytorch.py
index 09acc3b97a..33cba259ee 100644
--- a/src/transformers/models/mask2former/convert_mask2former_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/mask2former/convert_mask2former_original_pytorch_checkpoint_to_pytorch.py
@@ -804,7 +804,7 @@ class OriginalMask2FormerCheckpointToOursConverter:
         logger.info(f"Not copied keys are {pformat(src_state_dict.keys())}")
         logger.info("🙌 Done")
 
-        state_dict = {key: dst_state_dict[key] for key in dst_state_dict.to_track.keys()}
+        state_dict = {key: dst_state_dict[key] for key in dst_state_dict.to_track}
         mask2former.load_state_dict(state_dict)
         return mask2former
 
@@ -816,7 +816,7 @@ class OriginalMask2FormerCheckpointToOursConverter:
 
         self.replace_universal_segmentation_module(dst_state_dict, src_state_dict)
 
-        state_dict = {key: dst_state_dict[key] for key in dst_state_dict.to_track.keys()}
+        state_dict = {key: dst_state_dict[key] for key in dst_state_dict.to_track}
         mask2former.load_state_dict(state_dict)
 
         return mask2former
diff --git a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py
index cd29b7ad78..69ebed5aa8 100644
--- a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py
+++ b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py
@@ -57,7 +57,7 @@ def recursive_print(name, val, spaces=0):
     if isinstance(val, dict):
         if msg is not None:
             print(msg)
-        for k in val.keys():
+        for k in val:
             recursive_print(k, val[k], spaces + 2)
     elif isinstance(val, torch.Tensor):
         print(msg, ":", val.size())
@@ -116,7 +116,7 @@ def convert_megatron_checkpoint(args, input_state_dict, config):
     # The hidden_size per head.
     hidden_size_per_head = config.hidden_size // heads
     # Megatron-LM checkpoint version
-    if "checkpoint_version" in input_state_dict.keys():
+    if "checkpoint_version" in input_state_dict:
         checkpoint_version = input_state_dict["checkpoint_version"]
     else:
         checkpoint_version = 0.0
@@ -147,7 +147,7 @@ def convert_megatron_checkpoint(args, input_state_dict, config):
     output_state_dict["bert.embeddings.token_type_embeddings.weight"] = tokentype_embeddings
 
     # The transformer.
-    transformer = lm["transformer"] if "transformer" in lm.keys() else lm["encoder"]
+    transformer = lm["transformer"] if "transformer" in lm else lm["encoder"]
 
     # The regex to extract layer names.
     layer_re = re.compile(r"layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)")
diff --git a/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py b/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py
index 2e50a3b6b6..ab866834a2 100644
--- a/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py
+++ b/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py
@@ -189,7 +189,7 @@ def recursive_print(name, val, spaces=0):
     if isinstance(val, dict):
         if msg is not None:
             print(msg)
-        for k in val.keys():
+        for k in val:
             recursive_print(k, val[k], spaces + 2)
     elif isinstance(val, torch.Tensor):
         print(msg, ":", val.size())
@@ -448,7 +448,7 @@ def convert_checkpoint_from_megatron_to_transformers(args):
         # The transformer.
         path = (
             "model.language_model.transformer"
-            if "transformer" in get_element_from_dict_by_path(tp_state_dicts[0], "model.language_model").keys()
+            if "transformer" in get_element_from_dict_by_path(tp_state_dicts[0], "model.language_model")
             else "model.language_model.encoder"
         )
         # Extract the layers.
@@ -793,9 +793,7 @@ def convert_checkpoint_from_transformers_to_megatron(args):
         for layer in range(num_layers):
             pp_layer_id = layer + layer_offset
             layers_to_copy = [
-                layer_name
-                for layer_name in state_dict.keys()
-                if layer_name.startswith(f"transformer.h.{pp_layer_id}.")
+                layer_name for layer_name in state_dict if layer_name.startswith(f"transformer.h.{pp_layer_id}.")
             ]
 
             for layer_name in layers_to_copy:
@@ -844,7 +842,7 @@ def convert_checkpoint_from_transformers_to_megatron(args):
 
                 # handle attention and mlp weights
                 elif weight_or_bias == "weight":
-                    out_name = transformers_to_megatron.get(op_name, None)
+                    out_name = transformers_to_megatron.get(op_name)
                     if out_name is None:
                         continue
                     params = params.transpose(0, 1)
@@ -852,7 +850,7 @@ def convert_checkpoint_from_transformers_to_megatron(args):
 
                 # handle attention and mlp bias
                 elif weight_or_bias == "bias":
-                    out_name = transformers_to_megatron.get(op_name, None)
+                    out_name = transformers_to_megatron.get(op_name)
                     if out_name is None:
                         continue
                     layer_name = f"layers.{layer}.{out_name}.{weight_or_bias}"
diff --git a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py
index c0054fab3f..d1953f50ba 100644
--- a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py
+++ b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py
@@ -57,7 +57,7 @@ def recursive_print(name, val, spaces=0):
     if isinstance(val, dict):
         if msg is not None:
             print(msg)
-        for k in val.keys():
+        for k in val:
             recursive_print(k, val[k], spaces + 2)
     elif isinstance(val, torch.Tensor):
         print(msg, ":", val.size())
@@ -115,7 +115,7 @@ def convert_megatron_checkpoint(args, input_state_dict, config):
     # The hidden_size per head.
     hidden_size_per_head = config.n_embd // config.n_head
     # Megatron-LM checkpoint version
-    if "checkpoint_version" in input_state_dict.keys():
+    if "checkpoint_version" in input_state_dict:
         checkpoint_version = input_state_dict["checkpoint_version"]
     else:
         checkpoint_version = 0.0
@@ -145,7 +145,7 @@ def convert_megatron_checkpoint(args, input_state_dict, config):
     output_state_dict["transformer.wpe.weight"] = pos_embeddings
 
     # The transformer.
-    transformer = lm["transformer"] if "transformer" in lm.keys() else lm["encoder"]
+    transformer = lm["transformer"] if "transformer" in lm else lm["encoder"]
 
     # The regex to extract layer names.
     layer_re = re.compile(r"layers\.(\d+)\.([a-z0-9_.]+)\.([a-z0-9_]+)")
diff --git a/src/transformers/models/mistral/convert_mistral_weights_to_hf.py b/src/transformers/models/mistral/convert_mistral_weights_to_hf.py
index 0f9cf597d5..a790fed81d 100644
--- a/src/transformers/models/mistral/convert_mistral_weights_to_hf.py
+++ b/src/transformers/models/mistral/convert_mistral_weights_to_hf.py
@@ -176,7 +176,7 @@ def convert_config(original_config: dict, max_position_embeddings: int = 32768):
     new_config_kwargs.update({k: v for k, v in original_config.items() if k in similar_keys_to_keep})
 
     # These are not always defined depending on `params.json`
-    new_config_kwargs["sliding_window"] = original_config.get("sliding_window", None)
+    new_config_kwargs["sliding_window"] = original_config.get("sliding_window")
     new_config_kwargs["num_key_value_heads"] = original_config.get(
         "n_kv_heads", new_config_kwargs["num_attention_heads"]
     )
diff --git a/src/transformers/models/mistral/modeling_tf_mistral.py b/src/transformers/models/mistral/modeling_tf_mistral.py
index b9afd8ab26..30ca8d5769 100644
--- a/src/transformers/models/mistral/modeling_tf_mistral.py
+++ b/src/transformers/models/mistral/modeling_tf_mistral.py
@@ -880,7 +880,7 @@ class TFMistralForCausalLM(TFMistralPreTrainedModel, TFCausalLanguageModelingLos
         if past_key_values:
             input_ids = tf.expand_dims(input_ids[:, -1], -1)
 
-        position_ids = kwargs.get("position_ids", None)
+        position_ids = kwargs.get("position_ids")
         if attention_mask is not None and position_ids is None:
             position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)
             if past_key_values:
diff --git a/src/transformers/models/mluke/convert_mluke_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/mluke/convert_mluke_original_pytorch_checkpoint_to_pytorch.py
index 1edcae80f6..be0f52a70e 100644
--- a/src/transformers/models/mluke/convert_mluke_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/mluke/convert_mluke_original_pytorch_checkpoint_to_pytorch.py
@@ -100,7 +100,7 @@ def convert_luke_checkpoint(checkpoint_path, metadata_path, entity_vocab_path, p
     state_dict.pop("lm_head.decoder.weight")
     state_dict.pop("lm_head.decoder.bias")
     state_dict_for_hugging_face = OrderedDict()
-    for key in state_dict.keys():
+    for key in state_dict:
         if not (key.startswith("lm_head") or key.startswith("entity_predictions")):
             state_dict_for_hugging_face[f"luke.{key}"] = state_dict[key]
         else:
diff --git a/src/transformers/models/mluke/tokenization_mluke.py b/src/transformers/models/mluke/tokenization_mluke.py
index f858eca12d..15f4db5328 100644
--- a/src/transformers/models/mluke/tokenization_mluke.py
+++ b/src/transformers/models/mluke/tokenization_mluke.py
@@ -1288,7 +1288,7 @@ class MLukeTokenizer(PreTrainedTokenizer):
         # If we have a list of dicts, let's convert it in a dict of lists
         # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
         if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
-            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
+            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0]}
 
         # The model's main input name, usually `input_ids`, has be passed for padding
         if self.model_input_names[0] not in encoded_inputs:
diff --git a/src/transformers/models/mobilevit/convert_mlcvnets_to_pytorch.py b/src/transformers/models/mobilevit/convert_mlcvnets_to_pytorch.py
index 7dc6dfa288..a8159b446f 100644
--- a/src/transformers/models/mobilevit/convert_mlcvnets_to_pytorch.py
+++ b/src/transformers/models/mobilevit/convert_mlcvnets_to_pytorch.py
@@ -155,7 +155,7 @@ def convert_state_dict(orig_state_dict, model, base_model=False):
     else:
         model_prefix = "mobilevit."
 
-    for key in orig_state_dict.copy().keys():
+    for key in orig_state_dict.copy():
         val = orig_state_dict.pop(key)
 
         if key[:8] == "encoder.":
diff --git a/src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py b/src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py
index 1fc473e093..8d462c7dd4 100644
--- a/src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py
+++ b/src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py
@@ -134,7 +134,7 @@ def create_rename_keys(state_dict, base_model=False):
         model_prefix = "mobilevitv2."
 
     rename_keys = []
-    for k in state_dict.keys():
+    for k in state_dict:
         if k[:8] == "encoder.":
             k_new = k[8:]
         else:
@@ -216,7 +216,7 @@ def create_rename_keys(state_dict, base_model=False):
 def remove_unused_keys(state_dict):
     """remove unused keys (e.g.: seg_head.aux_head)"""
     keys_to_ignore = []
-    for k in state_dict.keys():
+    for k in state_dict:
         if k.startswith("seg_head.aux_head."):
             keys_to_ignore.append(k)
     for k in keys_to_ignore:
diff --git a/src/transformers/models/moonshine/convert_usefulsensors_to_hf.py b/src/transformers/models/moonshine/convert_usefulsensors_to_hf.py
index 39d8df0f3f..f29da8c8e2 100644
--- a/src/transformers/models/moonshine/convert_usefulsensors_to_hf.py
+++ b/src/transformers/models/moonshine/convert_usefulsensors_to_hf.py
@@ -33,7 +33,7 @@ def _get_weights(model_name):
 
 
 def _read_h5_weights(group, current_key="", weights={}):
-    for key in group.keys():
+    for key in group:
         full_key = f"{current_key}.{key}" if current_key else key
         if isinstance(group[key], h5py.Dataset):
             w = np.array(group[key])
diff --git a/src/transformers/models/mra/convert_mra_pytorch_to_pytorch.py b/src/transformers/models/mra/convert_mra_pytorch_to_pytorch.py
index cdee89efda..6330e2fe92 100644
--- a/src/transformers/models/mra/convert_mra_pytorch_to_pytorch.py
+++ b/src/transformers/models/mra/convert_mra_pytorch_to_pytorch.py
@@ -62,7 +62,7 @@ def rename_key(orig_key):
 
 
 def convert_checkpoint_helper(max_position_embeddings, orig_state_dict):
-    for key in orig_state_dict.copy().keys():
+    for key in orig_state_dict.copy():
         val = orig_state_dict.pop(key)
 
         if ("pooler" in key) or ("sen_class" in key):
diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py
index 071010abf1..0a414cdf11 100644
--- a/src/transformers/models/mt5/modeling_mt5.py
+++ b/src/transformers/models/mt5/modeling_mt5.py
@@ -898,7 +898,7 @@ class MT5Stack(MT5PreTrainedModel):
         )
         assert_device_map(self.device_map, len(self.block))
         self.model_parallel = True
-        self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
+        self.first_device = "cpu" if "cpu" in self.device_map else "cuda:" + str(min(self.device_map.keys()))
         self.last_device = "cuda:" + str(max(self.device_map.keys()))
         # Load onto devices
         for k, v in self.device_map.items():
diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py
index 91c505c636..bc0dc7122b 100644
--- a/src/transformers/models/musicgen/modeling_musicgen.py
+++ b/src/transformers/models/musicgen/modeling_musicgen.py
@@ -1589,11 +1589,11 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin)
         }
 
         # remove text encoder, audio encoder and decoder kwargs from kwargs
-        for key in kwargs_text_encoder.keys():
+        for key in kwargs_text_encoder:
             del kwargs["text_encoder_" + key]
-        for key in kwargs_audio_encoder.keys():
+        for key in kwargs_audio_encoder:
             del kwargs["audio_encoder_" + key]
-        for key in kwargs_decoder.keys():
+        for key in kwargs_decoder:
             del kwargs["decoder_" + key]
 
         # Load and initialize the encoder and decoder
diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py
index 23c5314c54..aca49d8e83 100644
--- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py
+++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py
@@ -1472,11 +1472,11 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel, GenerationMixin):
         }
 
         # remove text encoder, audio encoder and decoder kwargs from kwargs
-        for key in kwargs_text_encoder.keys():
+        for key in kwargs_text_encoder:
             del kwargs["text_encoder_" + key]
-        for key in kwargs_audio_encoder.keys():
+        for key in kwargs_audio_encoder:
             del kwargs["audio_encoder_" + key]
-        for key in kwargs_decoder.keys():
+        for key in kwargs_decoder:
             del kwargs["decoder_" + key]
 
         # Load and initialize the encoder and decoder
diff --git a/src/transformers/models/nllb_moe/convert_nllb_moe_sharded_original_checkpoint_to_pytorch.py b/src/transformers/models/nllb_moe/convert_nllb_moe_sharded_original_checkpoint_to_pytorch.py
index 317c5c713c..ef2e3d0d90 100644
--- a/src/transformers/models/nllb_moe/convert_nllb_moe_sharded_original_checkpoint_to_pytorch.py
+++ b/src/transformers/models/nllb_moe/convert_nllb_moe_sharded_original_checkpoint_to_pytorch.py
@@ -46,7 +46,7 @@ def make_linear_from_emb(emb):
 
 def rename_fairseq_keys(state_dict, expert_idx=None):
     new_dict = {}
-    for old_key in state_dict.keys():
+    for old_key in state_dict:
         key = old_key
         if "moe_layer.experts." in key:
             if expert_idx is not None:
diff --git a/src/transformers/models/nougat/convert_nougat_to_hf.py b/src/transformers/models/nougat/convert_nougat_to_hf.py
index e42f8553ac..d8096ad864 100644
--- a/src/transformers/models/nougat/convert_nougat_to_hf.py
+++ b/src/transformers/models/nougat/convert_nougat_to_hf.py
@@ -100,7 +100,7 @@ def rename_key(name):
 
 # Copied from transformers.models.donut.convert_donut_to_pytorch.convert_state_dict
 def convert_state_dict(orig_state_dict, model):
-    for key in orig_state_dict.copy().keys():
+    for key in orig_state_dict.copy():
         val = orig_state_dict.pop(key)
 
         if "qkv" in key:
diff --git a/src/transformers/models/nystromformer/convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/nystromformer/convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py
index 7be4869181..934a23e010 100644
--- a/src/transformers/models/nystromformer/convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/nystromformer/convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py
@@ -61,7 +61,7 @@ def rename_key(orig_key):
 
 
 def convert_checkpoint_helper(config, orig_state_dict):
-    for key in orig_state_dict.copy().keys():
+    for key in orig_state_dict.copy():
         val = orig_state_dict.pop(key)
 
         if ("pooler" in key) or ("sen_class" in key) or ("conv.bias" in key):
diff --git a/src/transformers/models/omdet_turbo/convert_omdet_turbo_to_hf.py b/src/transformers/models/omdet_turbo/convert_omdet_turbo_to_hf.py
index e4e31e4d8a..da358d7119 100644
--- a/src/transformers/models/omdet_turbo/convert_omdet_turbo_to_hf.py
+++ b/src/transformers/models/omdet_turbo/convert_omdet_turbo_to_hf.py
@@ -66,7 +66,7 @@ def create_rename_keys_vision(state_dict, config):
     rename_keys = []
     # fmt: off
     ########################################## VISION BACKBONE - START
-    for layer_name in state_dict.keys():
+    for layer_name in state_dict:
         if layer_name.startswith("backbone") and not layer_name.startswith("backbone.norm"):
             if config.use_timm_backbone:
                 layer_name_replace = layer_name.replace("backbone", "vision_backbone.vision_backbone._backbone")
@@ -100,7 +100,7 @@ def create_rename_keys_vision(state_dict, config):
     ########################################## VISION BACKBONE - END
 
     ########################################## ENCODER - START
-    for layer_name in state_dict.keys():
+    for layer_name in state_dict:
         if "neck" in layer_name:
             layer_name_replace = layer_name.replace("neck", "encoder")
             layer_name_replace = layer_name_replace.replace("input_proj", "channel_projection_layers")
@@ -117,7 +117,7 @@ def create_rename_keys_vision(state_dict, config):
     ########################################## ENCODER - END
 
     ########################################## DECODER - START
-    for layer_name in state_dict.keys():
+    for layer_name in state_dict:
         if layer_name.startswith("decoder"):
             layer_name_replace = layer_name.replace("decoder.decoder.layers", "decoder.layers")
             layer_name_replace = layer_name_replace.replace("input_proj", "channel_projection_layers")
@@ -136,7 +136,7 @@ def create_rename_keys_vision(state_dict, config):
 def create_rename_keys_language(state_dict):
     rename_keys = []
     # fmt: off
-    for layer_name in state_dict.keys():
+    for layer_name in state_dict:
         if layer_name.startswith("language_backbone") and not layer_name.startswith("language_backbone.text_projection"):
             layer_name_replace = layer_name.replace("language_backbone", "language_backbone.model.text_model")
             layer_name_replace = layer_name_replace.replace("transformer.resblocks", "encoder.layers")
diff --git a/src/transformers/models/opt/convert_opt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/opt/convert_opt_original_pytorch_checkpoint_to_pytorch.py
index 9a9b0c306c..98731ed212 100644
--- a/src/transformers/models/opt/convert_opt_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/opt/convert_opt_original_pytorch_checkpoint_to_pytorch.py
@@ -30,7 +30,7 @@ logger = logging.get_logger(__name__)
 def load_checkpoint(checkpoint_path):
     """Checkpoint path should end in model.pt"""
     sd = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
-    if "model" in sd.keys():
+    if "model" in sd:
         sd = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"]
 
     # pop unnecessary weights
diff --git a/src/transformers/models/opt/modeling_tf_opt.py b/src/transformers/models/opt/modeling_tf_opt.py
index b7c817a034..f996256063 100644
--- a/src/transformers/models/opt/modeling_tf_opt.py
+++ b/src/transformers/models/opt/modeling_tf_opt.py
@@ -934,7 +934,7 @@ class TFOPTForCausalLM(TFOPTPreTrainedModel, TFCausalLanguageModelingLoss):
         return self.model.get_input_embeddings()
 
     def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs):
-        attention_mask = kwargs.get("attention_mask", None)
+        attention_mask = kwargs.get("attention_mask")
 
         # only last token for inputs_ids if past is defined in kwargs
         if past_key_values:
diff --git a/src/transformers/models/owlvit/convert_owlvit_original_flax_to_hf.py b/src/transformers/models/owlvit/convert_owlvit_original_flax_to_hf.py
index 49ce578d00..ea766c366f 100644
--- a/src/transformers/models/owlvit/convert_owlvit_original_flax_to_hf.py
+++ b/src/transformers/models/owlvit/convert_owlvit_original_flax_to_hf.py
@@ -220,7 +220,7 @@ def copy_class_box_heads(hf_model, flax_params):
 
     # Copy flax params to PyTorch params
     for name, param in new_params.items():
-        if name in pt_params.keys():
+        if name in pt_params:
             pt_params[name].copy_(param)
 
 
@@ -313,7 +313,7 @@ def convert_clip_backbone(flax_params, torch_config):
 
     # Copy flax CLIP backbone params to PyTorch params
     for name, param in new_torch_params.items():
-        if name in torch_clip_params.keys():
+        if name in torch_clip_params:
             new_param = torch.from_numpy(param)
             torch_clip_params[name].copy_(new_param)
         else:
diff --git a/src/transformers/models/paligemma/convert_paligemma2_weights_to_hf.py b/src/transformers/models/paligemma/convert_paligemma2_weights_to_hf.py
index 62619ea1d3..630ff7b0b0 100644
--- a/src/transformers/models/paligemma/convert_paligemma2_weights_to_hf.py
+++ b/src/transformers/models/paligemma/convert_paligemma2_weights_to_hf.py
@@ -263,7 +263,7 @@ def slice_state_dict(state_dict, config):
         state_dict[f"language_model.model.layers.{i}.post_feedforward_layernorm.weight"] = llm_post_feedforward_layernorm[i]
     state_dict["language_model.model.norm.weight"] = state_dict.pop("llm/final_norm/scale")
     state_dict["language_model.lm_head.weight"] = embedding_vector # weights are tied.
-    [k for k in state_dict.keys() if not k.startswith('vision') and not k.startswith('language')]
+    [k for k in state_dict if not k.startswith('vision') and not k.startswith('language')]
     # fmt: on
     for key, value in state_dict.items():
         if not isinstance(value, torch.Tensor):
diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py
index bd3a2d9e8c..d07fa49ddc 100755
--- a/src/transformers/models/perceiver/modeling_perceiver.py
+++ b/src/transformers/models/perceiver/modeling_perceiver.py
@@ -577,7 +577,7 @@ class PerceiverPreTrainedModel(PreTrainedModel):
         elif hasattr(module, "position_embeddings") and isinstance(module, PerceiverTrainablePositionEncoding):
             module.position_embeddings.data.normal_(mean=0.0, std=self.config.initializer_range)
         elif isinstance(module, nn.ParameterDict):
-            for modality in module.keys():
+            for modality in module:
                 module[modality].data.normal_(mean=0.0, std=self.config.initializer_range)
         elif isinstance(module, nn.Embedding):
             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py
index b775f2f0fd..919a120312 100644
--- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py
+++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py
@@ -218,7 +218,7 @@ class PerceptionLMImageProcessorFast(BaseImageProcessorFast):
         closest_aspect_ratio = None
         if target_aspect_ratio >= 1:
             closest_aspect_ratio = min(
-                [k for k in asp_dict.keys() if k <= target_aspect_ratio],
+                [k for k in asp_dict if k <= target_aspect_ratio],
                 key=lambda x: abs(x - target_aspect_ratio),
             )
             tiles_given_aspect_ratio = asp_dict[closest_aspect_ratio]
@@ -226,7 +226,7 @@ class PerceptionLMImageProcessorFast(BaseImageProcessorFast):
             return max(tiles_given_aspect_ratio, key=lambda x: x[0])
         else:
             closest_aspect_ratio = min(
-                [k for k in asp_dict.keys() if k > target_aspect_ratio],
+                [k for k in asp_dict if k > target_aspect_ratio],
                 key=lambda x: abs(1 / x - 1 / target_aspect_ratio),
             )
             tiles_given_aspect_ratio = asp_dict[closest_aspect_ratio]
diff --git a/src/transformers/models/phi/convert_phi_weights_to_hf.py b/src/transformers/models/phi/convert_phi_weights_to_hf.py
index 0974233661..bbaa9b4c0c 100644
--- a/src/transformers/models/phi/convert_phi_weights_to_hf.py
+++ b/src/transformers/models/phi/convert_phi_weights_to_hf.py
@@ -105,7 +105,7 @@ def _download(url: str, root: str):
 def convert_phi_weights(
     model_name, checkpoint_path, pytorch_dump_folder_path, use_cuda, save_weights_directly, _MODELS
 ):
-    _MODELS = _MODELS if model_name not in _MODELS.keys() else {model_name: _MODELS.get(model_name)}
+    _MODELS = _MODELS if model_name not in _MODELS else {model_name: _MODELS.get(model_name)}
     device = "cuda" if torch.cuda.is_available() and use_cuda else "cpu"
     for model_name, model_url in _MODELS.items():
         converted_checkpoint = {}
diff --git a/src/transformers/models/pix2struct/convert_pix2struct_original_pytorch_to_hf.py b/src/transformers/models/pix2struct/convert_pix2struct_original_pytorch_to_hf.py
index f19d864f46..bf3bf2b723 100644
--- a/src/transformers/models/pix2struct/convert_pix2struct_original_pytorch_to_hf.py
+++ b/src/transformers/models/pix2struct/convert_pix2struct_original_pytorch_to_hf.py
@@ -67,7 +67,7 @@ def rename_and_convert_flax_params(flax_dict):
         "decoder.logits_dense.weight": "decoder.lm_head.weight",
     }
 
-    for key in flax_dict.keys():
+    for key in flax_dict:
         if "target" in key:
             # remove the first prefix from the key
             new_key = ".".join(key[1:])
diff --git a/src/transformers/models/pix2struct/image_processing_pix2struct.py b/src/transformers/models/pix2struct/image_processing_pix2struct.py
index e434c01fe1..06dadb8a9f 100644
--- a/src/transformers/models/pix2struct/image_processing_pix2struct.py
+++ b/src/transformers/models/pix2struct/image_processing_pix2struct.py
@@ -404,7 +404,7 @@ class Pix2StructImageProcessor(BaseImageProcessor):
         max_patches = max_patches if max_patches is not None else self.max_patches
         is_vqa = self.is_vqa
 
-        if kwargs.get("data_format", None) is not None:
+        if kwargs.get("data_format") is not None:
             raise ValueError("data_format is not an accepted input as the outputs are ")
 
         images = make_list_of_images(images)
diff --git a/src/transformers/models/plbart/tokenization_plbart.py b/src/transformers/models/plbart/tokenization_plbart.py
index 7adeb11d6a..821c4c8172 100644
--- a/src/transformers/models/plbart/tokenization_plbart.py
+++ b/src/transformers/models/plbart/tokenization_plbart.py
@@ -425,7 +425,7 @@ class PLBartTokenizer(PreTrainedTokenizer):
 
     def _convert_lang_code_special_format(self, lang: str) -> str:
         """Convert Language Codes to format tokenizer uses if required"""
-        lang = FAIRSEQ_LANGUAGE_CODES_MAP[lang] if lang in FAIRSEQ_LANGUAGE_CODES_MAP.keys() else lang
+        lang = FAIRSEQ_LANGUAGE_CODES_MAP[lang] if lang in FAIRSEQ_LANGUAGE_CODES_MAP else lang
         return lang
 
 
diff --git a/src/transformers/models/pop2piano/modeling_pop2piano.py b/src/transformers/models/pop2piano/modeling_pop2piano.py
index 71a1397cc3..9676c39455 100644
--- a/src/transformers/models/pop2piano/modeling_pop2piano.py
+++ b/src/transformers/models/pop2piano/modeling_pop2piano.py
@@ -1072,7 +1072,7 @@ class Pop2PianoForConditionalGeneration(Pop2PianoPreTrainedModel, GenerationMixi
                 - 0 for tokens that are **padded**.
         """
         composer_to_feature_token = generation_config.composer_to_feature_token
-        if composer not in composer_to_feature_token.keys():
+        if composer not in composer_to_feature_token:
             raise ValueError(
                 f"Please choose a composer from {list(composer_to_feature_token.keys())}. Composer received - {composer}"
             )
diff --git a/src/transformers/models/pop2piano/tokenization_pop2piano.py b/src/transformers/models/pop2piano/tokenization_pop2piano.py
index bf0c1afdfa..4356b5d93f 100644
--- a/src/transformers/models/pop2piano/tokenization_pop2piano.py
+++ b/src/transformers/models/pop2piano/tokenization_pop2piano.py
@@ -265,7 +265,7 @@ class Pop2PianoTokenizer(PreTrainedTokenizer):
 
         current_idx = start_idx
         current_velocity = 0
-        note_onsets_ready = [None for i in range(sum([k.endswith("NOTE") for k in self.encoder.keys()]) + 1)]
+        note_onsets_ready = [None for i in range(sum([k.endswith("NOTE") for k in self.encoder]) + 1)]
         notes = []
         for token_type, number in words:
             if token_type == "TOKEN_SPECIAL":
diff --git a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py
index f690ca5108..a10a0955f8 100644
--- a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py
@@ -3872,7 +3872,7 @@ class Qwen2_5OmniForConditionalGeneration(Qwen2_5OmniPreTrainedModel, Generation
 
         # 2. Generate speech tokens from talker module
         embeds_to_talker = thinker_result.hidden_states[0][0].clone().to(input_ids.device)
-        if thinker_kwargs.get("input_features", None) is not None:
+        if thinker_kwargs.get("input_features") is not None:
             audio_ids_mask = input_ids == self.config.thinker_config.audio_token_index
             audio_mask = audio_ids_mask.unsqueeze(-1).expand_as(embeds_to_talker)
             audio_mask_tensor = torch.zeros(
@@ -3881,7 +3881,7 @@ class Qwen2_5OmniForConditionalGeneration(Qwen2_5OmniPreTrainedModel, Generation
                 device=input_ids.device,
             )
             embeds_to_talker.masked_scatter_(audio_mask, audio_mask_tensor)
-        if thinker_kwargs.get("pixel_values", None) is not None:
+        if thinker_kwargs.get("pixel_values") is not None:
             image_ids_mask = input_ids == self.config.thinker_config.image_token_index
             image_mask = image_ids_mask.unsqueeze(-1).expand_as(embeds_to_talker)
             image_mask_tensor = torch.zeros(
@@ -3890,7 +3890,7 @@ class Qwen2_5OmniForConditionalGeneration(Qwen2_5OmniPreTrainedModel, Generation
                 device=input_ids.device,
             )
             embeds_to_talker.masked_scatter_(image_mask, image_mask_tensor)
-        if thinker_kwargs.get("pixel_values_videos", None) is not None:
+        if thinker_kwargs.get("pixel_values_videos") is not None:
             video_ids_mask = input_ids == self.config.thinker_config.video_token_index
             video_mask = video_ids_mask.unsqueeze(-1).expand_as(embeds_to_talker)
             video_mask_tensor = torch.zeros(
diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
index b8e5bc6216..56fdff57e5 100644
--- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
@@ -4169,7 +4169,7 @@ class Qwen2_5OmniForConditionalGeneration(Qwen2_5OmniPreTrainedModel, Generation
 
         # 2. Generate speech tokens from talker module
         embeds_to_talker = thinker_result.hidden_states[0][0].clone().to(input_ids.device)
-        if thinker_kwargs.get("input_features", None) is not None:
+        if thinker_kwargs.get("input_features") is not None:
             audio_ids_mask = input_ids == self.config.thinker_config.audio_token_index
             audio_mask = audio_ids_mask.unsqueeze(-1).expand_as(embeds_to_talker)
             audio_mask_tensor = torch.zeros(
@@ -4178,7 +4178,7 @@ class Qwen2_5OmniForConditionalGeneration(Qwen2_5OmniPreTrainedModel, Generation
                 device=input_ids.device,
             )
             embeds_to_talker.masked_scatter_(audio_mask, audio_mask_tensor)
-        if thinker_kwargs.get("pixel_values", None) is not None:
+        if thinker_kwargs.get("pixel_values") is not None:
             image_ids_mask = input_ids == self.config.thinker_config.image_token_index
             image_mask = image_ids_mask.unsqueeze(-1).expand_as(embeds_to_talker)
             image_mask_tensor = torch.zeros(
@@ -4187,7 +4187,7 @@ class Qwen2_5OmniForConditionalGeneration(Qwen2_5OmniPreTrainedModel, Generation
                 device=input_ids.device,
             )
             embeds_to_talker.masked_scatter_(image_mask, image_mask_tensor)
-        if thinker_kwargs.get("pixel_values_videos", None) is not None:
+        if thinker_kwargs.get("pixel_values_videos") is not None:
             video_ids_mask = input_ids == self.config.thinker_config.video_token_index
             video_mask = video_ids_mask.unsqueeze(-1).expand_as(embeds_to_talker)
             video_mask_tensor = torch.zeros(
diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py
index ffc1f60c8f..367b4dc456 100644
--- a/src/transformers/models/rag/modeling_rag.py
+++ b/src/transformers/models/rag/modeling_rag.py
@@ -318,9 +318,9 @@ class RagPreTrainedModel(PreTrainedModel):
         }
 
         # remove question_encoder, generator kwargs from kwargs
-        for key in kwargs_question_encoder.keys():
+        for key in kwargs_question_encoder:
             del kwargs["question_encoder_" + key]
-        for key in kwargs_generator.keys():
+        for key in kwargs_generator:
             del kwargs["generator_" + key]
 
         # Load and initialize the question_encoder and generator
@@ -370,7 +370,7 @@ class RagPreTrainedModel(PreTrainedModel):
             )
 
         # instantiate config with corresponding kwargs
-        config = kwargs.get("config", None)
+        config = kwargs.get("config")
         if config is None:
             config = RagConfig.from_question_encoder_generator_configs(
                 question_encoder.config, generator.config, **kwargs
diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py
index 1d8c3f1fc8..f430239e0d 100644
--- a/src/transformers/models/rag/modeling_tf_rag.py
+++ b/src/transformers/models/rag/modeling_tf_rag.py
@@ -322,9 +322,9 @@ class TFRagPreTrainedModel(TFPreTrainedModel):
         }
 
         # remove question_encoder, generator kwargs from kwargs
-        for key in kwargs_question_encoder.keys():
+        for key in kwargs_question_encoder:
             del kwargs["question_encoder_" + key]
-        for key in kwargs_generator.keys():
+        for key in kwargs_generator:
             del kwargs["generator_" + key]
 
         # Load and initialize the question_encoder and generator
@@ -376,7 +376,7 @@ class TFRagPreTrainedModel(TFPreTrainedModel):
             )
 
         # instantiate config with corresponding kwargs
-        config = kwargs.get("config", None)
+        config = kwargs.get("config")
         if config is None:
             config = RagConfig.from_question_encoder_generator_configs(
                 question_encoder.config, generator.config, **kwargs
diff --git a/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py b/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py
index 9bf3a17e91..ed4bc48035 100644
--- a/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py
+++ b/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py
@@ -51,7 +51,7 @@ class Tracker:
     name2module: dict[str, nn.Module] = field(default_factory=OrderedDict)
 
     def _forward_hook(self, m, inputs: Tensor, outputs: Tensor, name: str):
-        has_not_submodules = len(list(m.modules())) == 1 or isinstance(m, nn.Conv2d) or isinstance(m, nn.BatchNorm2d)
+        has_not_submodules = len(list(m.modules())) == 1 or isinstance(m, (nn.Conv2d, nn.BatchNorm2d))
         if has_not_submodules:
             self.traced.append(m)
             self.name2module[name] = m
@@ -217,7 +217,7 @@ def convert_weights_and_push(save_directory: Path, model_name: Optional[str] = N
         not_used_keys = list(from_state_dict.keys())
         regex = r"\.block.-part."
         # this is "interesting", so the original checkpoints have `block[0,1]-part` in each key name, we remove it
-        for key in from_state_dict.keys():
+        for key in from_state_dict:
             # remove the weird "block[0,1]-part" from the key
             src_key = re.sub(regex, "", key)
             # now src_key from the model checkpoints is the one we got from the original model after tracing, so use it to get the correct destination key
diff --git a/src/transformers/models/regnet/convert_regnet_to_pytorch.py b/src/transformers/models/regnet/convert_regnet_to_pytorch.py
index b7b48568a1..9d6659d768 100644
--- a/src/transformers/models/regnet/convert_regnet_to_pytorch.py
+++ b/src/transformers/models/regnet/convert_regnet_to_pytorch.py
@@ -44,7 +44,7 @@ class Tracker:
     handles: list = field(default_factory=list)
 
     def _forward_hook(self, m, inputs: Tensor, outputs: Tensor):
-        has_not_submodules = len(list(m.modules())) == 1 or isinstance(m, nn.Conv2d) or isinstance(m, nn.BatchNorm2d)
+        has_not_submodules = len(list(m.modules())) == 1 or isinstance(m, (nn.Conv2d, nn.BatchNorm2d))
         if has_not_submodules:
             self.traced.append(m)
 
diff --git a/src/transformers/models/resnet/convert_resnet_to_pytorch.py b/src/transformers/models/resnet/convert_resnet_to_pytorch.py
index 7443b9cf50..11b09c372c 100644
--- a/src/transformers/models/resnet/convert_resnet_to_pytorch.py
+++ b/src/transformers/models/resnet/convert_resnet_to_pytorch.py
@@ -42,7 +42,7 @@ class Tracker:
     handles: list = field(default_factory=list)
 
     def _forward_hook(self, m, inputs: Tensor, outputs: Tensor):
-        has_not_submodules = len(list(m.modules())) == 1 or isinstance(m, nn.Conv2d) or isinstance(m, nn.BatchNorm2d)
+        has_not_submodules = len(list(m.modules())) == 1 or isinstance(m, (nn.Conv2d, nn.BatchNorm2d))
         if has_not_submodules:
             self.traced.append(m)
 
diff --git a/src/transformers/models/rt_detr/convert_rt_detr_original_pytorch_checkpoint_to_hf.py b/src/transformers/models/rt_detr/convert_rt_detr_original_pytorch_checkpoint_to_hf.py
index 9f2271930e..8a76fa4b4d 100644
--- a/src/transformers/models/rt_detr/convert_rt_detr_original_pytorch_checkpoint_to_hf.py
+++ b/src/transformers/models/rt_detr/convert_rt_detr_original_pytorch_checkpoint_to_hf.py
@@ -574,7 +574,7 @@ def convert_rt_detr_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub
     # query, key and value matrices need special treatment
     read_in_q_k_v(state_dict, config)
     # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         if key.endswith("num_batches_tracked"):
             del state_dict[key]
         # for two_stage
diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py b/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py
index 76321cc22e..b6f52c28f8 100644
--- a/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py
+++ b/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py
@@ -160,8 +160,8 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast):
 
     def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorKwargs]) -> None:
         # Backwards compatibility
-        do_convert_annotations = kwargs.get("do_convert_annotations", None)
-        do_normalize = kwargs.get("do_normalize", None)
+        do_convert_annotations = kwargs.get("do_convert_annotations")
+        do_normalize = kwargs.get("do_normalize")
         if do_convert_annotations is None and getattr(self, "do_convert_annotations", None) is None:
             self.do_convert_annotations = do_normalize if do_normalize is not None else self.do_normalize
 
diff --git a/src/transformers/models/rt_detr/modular_rt_detr.py b/src/transformers/models/rt_detr/modular_rt_detr.py
index 641519fafe..9a77a92b5a 100644
--- a/src/transformers/models/rt_detr/modular_rt_detr.py
+++ b/src/transformers/models/rt_detr/modular_rt_detr.py
@@ -123,8 +123,8 @@ class RTDetrImageProcessorFast(DetrImageProcessorFast, BaseImageProcessorFast):
 
     def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorKwargs]) -> None:
         # Backwards compatibility
-        do_convert_annotations = kwargs.get("do_convert_annotations", None)
-        do_normalize = kwargs.get("do_normalize", None)
+        do_convert_annotations = kwargs.get("do_convert_annotations")
+        do_normalize = kwargs.get("do_normalize")
         if do_convert_annotations is None and getattr(self, "do_convert_annotations", None) is None:
             self.do_convert_annotations = do_normalize if do_normalize is not None else self.do_normalize
 
diff --git a/src/transformers/models/rt_detr_v2/convert_rt_detr_v2_weights_to_hf.py b/src/transformers/models/rt_detr_v2/convert_rt_detr_v2_weights_to_hf.py
index 1a86ac02af..d2f9b200df 100644
--- a/src/transformers/models/rt_detr_v2/convert_rt_detr_v2_weights_to_hf.py
+++ b/src/transformers/models/rt_detr_v2/convert_rt_detr_v2_weights_to_hf.py
@@ -239,13 +239,13 @@ def write_model_and_image_processor(model_name, output_dir, push_to_hub, repo_id
     ]["module"]
     # rename keys
     state_dict = convert_old_keys_to_new_keys(state_dict)
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         if key.endswith("num_batches_tracked"):
             del state_dict[key]
     # query, key and value matrices need special treatment
     read_in_q_k_v(state_dict, config)
     # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         if key.endswith("num_batches_tracked"):
             del state_dict[key]
         # for two_stage
diff --git a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py
index 1c47aec148..619b6448b4 100755
--- a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py
+++ b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py
@@ -3910,7 +3910,7 @@ class SeamlessM4TModel(SeamlessM4TPreTrainedModel, GenerationMixin):
               shape `(batch_size, sequence_length)` and `waveform_lengths` which gives the length of each sample.
             - If `generate_speech=False`, it will returns `ModelOutput`.
         """
-        if input_ids is None and input_features is None and kwargs.get("inputs_embeds", None) is None:
+        if input_ids is None and input_features is None and kwargs.get("inputs_embeds") is None:
             raise ValueError(
                 "`input_ids`,`input_features` and `inputs_embeds` are all empty. Make sure at least one of them is not."
             )
diff --git a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py
index 6160d311c8..950c0d3b8f 100644
--- a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py
+++ b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py
@@ -4208,7 +4208,7 @@ class SeamlessM4Tv2Model(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
               shape `(batch_size, sequence_length)` and `waveform_lengths` which gives the length of each sample.
             - If `generate_speech=False`, it will returns `ModelOutput`.
         """
-        if input_ids is None and input_features is None and kwargs.get("inputs_embeds", None) is None:
+        if input_ids is None and input_features is None and kwargs.get("inputs_embeds") is None:
             raise ValueError(
                 "`input_ids`,`input_features` and `inputs_embeds` are all empty. Make sure at least one of them is not."
             )
diff --git a/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py
index c809b804c8..bc541262c7 100644
--- a/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py
+++ b/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py
@@ -844,9 +844,9 @@ class FlaxSpeechEncoderDecoderModel(FlaxPreTrainedModel):
         }
 
         # remove encoder, decoder kwargs from kwargs
-        for key in kwargs_encoder.keys():
+        for key in kwargs_encoder:
             del kwargs["encoder_" + key]
-        for key in kwargs_decoder.keys():
+        for key in kwargs_decoder:
             del kwargs["decoder_" + key]
 
         # Load and initialize the encoder and decoder
diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
index 0b17ce1548..f4e804dac2 100644
--- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
+++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
@@ -242,9 +242,9 @@ class SpeechEncoderDecoderModel(PreTrainedModel, GenerationMixin):
         }
 
         # remove encoder, decoder kwargs from kwargs
-        for key in kwargs_encoder.keys():
+        for key in kwargs_encoder:
             del kwargs["encoder_" + key]
-        for key in kwargs_decoder.keys():
+        for key in kwargs_decoder:
             del kwargs["decoder_" + key]
 
         # Load and initialize the encoder and decoder
diff --git a/src/transformers/models/swiftformer/convert_swiftformer_original_to_hf.py b/src/transformers/models/swiftformer/convert_swiftformer_original_to_hf.py
index 3567bb674e..a2d8893b5d 100644
--- a/src/transformers/models/swiftformer/convert_swiftformer_original_to_hf.py
+++ b/src/transformers/models/swiftformer/convert_swiftformer_original_to_hf.py
@@ -65,7 +65,7 @@ def rename_key(dct, old, new):
 
 def create_rename_keys(state_dict):
     rename_keys = []
-    for k in state_dict.keys():
+    for k in state_dict:
         k_new = k
         if ".pwconv" in k:
             k_new = k_new.replace(".pwconv", ".point_wise_conv")
diff --git a/src/transformers/models/swin/convert_swin_simmim_to_pytorch.py b/src/transformers/models/swin/convert_swin_simmim_to_pytorch.py
index 9a87ff693a..dbaeeb31ef 100644
--- a/src/transformers/models/swin/convert_swin_simmim_to_pytorch.py
+++ b/src/transformers/models/swin/convert_swin_simmim_to_pytorch.py
@@ -83,7 +83,7 @@ def rename_key(name):
 
 
 def convert_state_dict(orig_state_dict, model):
-    for key in orig_state_dict.copy().keys():
+    for key in orig_state_dict.copy():
         val = orig_state_dict.pop(key)
 
         if "attn_mask" in key:
diff --git a/src/transformers/models/swin/convert_swin_timm_to_pytorch.py b/src/transformers/models/swin/convert_swin_timm_to_pytorch.py
index c91249b272..9971da844a 100644
--- a/src/transformers/models/swin/convert_swin_timm_to_pytorch.py
+++ b/src/transformers/models/swin/convert_swin_timm_to_pytorch.py
@@ -90,7 +90,7 @@ def rename_key(name):
 
 
 def convert_state_dict(orig_state_dict, model):
-    for key in orig_state_dict.copy().keys():
+    for key in orig_state_dict.copy():
         val = orig_state_dict.pop(key)
 
         if "mask" in key:
diff --git a/src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py b/src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py
index 192e58d9db..e827070ed5 100644
--- a/src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py
+++ b/src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py
@@ -124,7 +124,7 @@ def rename_key(name, config):
 
 
 def convert_state_dict(orig_state_dict, config):
-    for key in orig_state_dict.copy().keys():
+    for key in orig_state_dict.copy():
         val = orig_state_dict.pop(key)
 
         if "qkv" in key:
diff --git a/src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py b/src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py
index 0e6e837a7e..60ea55edee 100644
--- a/src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py
+++ b/src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py
@@ -130,7 +130,7 @@ def rename_key(name):
 
 
 def convert_state_dict(orig_state_dict, model):
-    for key in orig_state_dict.copy().keys():
+    for key in orig_state_dict.copy():
         val = orig_state_dict.pop(key)
 
         if "mask" in key:
diff --git a/src/transformers/models/switch_transformers/convert_big_switch.py b/src/transformers/models/switch_transformers/convert_big_switch.py
index 8d19daf1c1..e6ef99a310 100644
--- a/src/transformers/models/switch_transformers/convert_big_switch.py
+++ b/src/transformers/models/switch_transformers/convert_big_switch.py
@@ -80,7 +80,7 @@ def shard_on_the_fly(switch_checkpoint_path, dump_path, max_shard_size, dtype, w
         checkpoint_info = flatten_dict(checkpoint_info, sep="/")
 
     all_layers = {}
-    for layer in checkpoint_info.keys():
+    for layer in checkpoint_info:
         curr_real_layer_name, split_layer, content = get_key_and_tensorstore_dict(
             layer, checkpoint_info, switch_checkpoint_path
         )
diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py
index e39c4b2f99..aeb8de0424 100644
--- a/src/transformers/models/t5/modeling_t5.py
+++ b/src/transformers/models/t5/modeling_t5.py
@@ -911,7 +911,7 @@ class T5Stack(T5PreTrainedModel):
         )
         assert_device_map(self.device_map, len(self.block))
         self.model_parallel = True
-        self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
+        self.first_device = "cpu" if "cpu" in self.device_map else "cuda:" + str(min(self.device_map.keys()))
         self.last_device = "cuda:" + str(max(self.device_map.keys()))
         # Load onto devices
         for k, v in self.device_map.items():
diff --git a/src/transformers/models/table_transformer/convert_table_transformer_to_hf.py b/src/transformers/models/table_transformer/convert_table_transformer_to_hf.py
index 487cdc4819..aeba012ad0 100644
--- a/src/transformers/models/table_transformer/convert_table_transformer_to_hf.py
+++ b/src/transformers/models/table_transformer/convert_table_transformer_to_hf.py
@@ -203,7 +203,7 @@ def convert_table_transformer_checkpoint(checkpoint_url, pytorch_dump_folder_pat
     read_in_q_k_v(state_dict)
     # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them
     prefix = "model."
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"):
             val = state_dict.pop(key)
             state_dict[prefix + key] = val
diff --git a/src/transformers/models/table_transformer/convert_table_transformer_to_hf_no_timm.py b/src/transformers/models/table_transformer/convert_table_transformer_to_hf_no_timm.py
index 1073d48877..f9964369bf 100644
--- a/src/transformers/models/table_transformer/convert_table_transformer_to_hf_no_timm.py
+++ b/src/transformers/models/table_transformer/convert_table_transformer_to_hf_no_timm.py
@@ -335,7 +335,7 @@ def convert_table_transformer_checkpoint(checkpoint_url, pytorch_dump_folder_pat
     read_in_q_k_v(state_dict)
     # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them
     prefix = "model."
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"):
             val = state_dict.pop(key)
             state_dict[prefix + key] = val
diff --git a/src/transformers/models/timesformer/convert_timesformer_to_pytorch.py b/src/transformers/models/timesformer/convert_timesformer_to_pytorch.py
index cda9b0c182..5db24e6367 100644
--- a/src/transformers/models/timesformer/convert_timesformer_to_pytorch.py
+++ b/src/transformers/models/timesformer/convert_timesformer_to_pytorch.py
@@ -101,7 +101,7 @@ def rename_key(name):
 
 
 def convert_state_dict(orig_state_dict, config):
-    for key in orig_state_dict.copy().keys():
+    for key in orig_state_dict.copy():
         val = orig_state_dict.pop(key)
 
         if key.startswith("model."):
diff --git a/src/transformers/models/timm_wrapper/configuration_timm_wrapper.py b/src/transformers/models/timm_wrapper/configuration_timm_wrapper.py
index 39ed2098d6..5fa115a054 100644
--- a/src/transformers/models/timm_wrapper/configuration_timm_wrapper.py
+++ b/src/transformers/models/timm_wrapper/configuration_timm_wrapper.py
@@ -77,7 +77,7 @@ class TimmWrapperConfig(PretrainedConfig):
 
     @classmethod
     def from_dict(cls, config_dict: dict[str, Any], **kwargs):
-        label_names = config_dict.get("label_names", None)
+        label_names = config_dict.get("label_names")
         is_custom_model = "num_labels" in kwargs or "id2label" in kwargs
 
         # if no labels added to config, use imagenet labeller in timm
diff --git a/src/transformers/models/udop/tokenization_udop_fast.py b/src/transformers/models/udop/tokenization_udop_fast.py
index a9f18c1b2b..a8878b9b51 100644
--- a/src/transformers/models/udop/tokenization_udop_fast.py
+++ b/src/transformers/models/udop/tokenization_udop_fast.py
@@ -591,7 +591,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
         # To match each overflowing sample with the original sample in the batch
         # we add an overflow_to_sample_mapping array (see below)
         sanitized_tokens = {}
-        for key in tokens_and_encodings[0][0].keys():
+        for key in tokens_and_encodings[0][0]:
             stack = [e for item, _ in tokens_and_encodings for e in item[key]]
             sanitized_tokens[key] = stack
         sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
diff --git a/src/transformers/models/upernet/convert_convnext_upernet_to_pytorch.py b/src/transformers/models/upernet/convert_convnext_upernet_to_pytorch.py
index eeb3ab5fc9..17d110c577 100644
--- a/src/transformers/models/upernet/convert_convnext_upernet_to_pytorch.py
+++ b/src/transformers/models/upernet/convert_convnext_upernet_to_pytorch.py
@@ -134,7 +134,7 @@ def convert_upernet_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub
     model.eval()
 
     # replace "bn" => "batch_norm"
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         val = state_dict.pop(key)
         if "bn" in key:
             key = key.replace("bn", "batch_norm")
diff --git a/src/transformers/models/upernet/convert_swin_upernet_to_pytorch.py b/src/transformers/models/upernet/convert_swin_upernet_to_pytorch.py
index 9580af7c46..edf0e142da 100644
--- a/src/transformers/models/upernet/convert_swin_upernet_to_pytorch.py
+++ b/src/transformers/models/upernet/convert_swin_upernet_to_pytorch.py
@@ -208,7 +208,7 @@ def convert_upernet_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub
     model.eval()
 
     # replace "bn" => "batch_norm"
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         val = state_dict.pop(key)
         if "bn" in key:
             key = key.replace("bn", "batch_norm")
diff --git a/src/transformers/models/videomae/convert_videomae_to_pytorch.py b/src/transformers/models/videomae/convert_videomae_to_pytorch.py
index 011c1862eb..2a1ab62c6a 100644
--- a/src/transformers/models/videomae/convert_videomae_to_pytorch.py
+++ b/src/transformers/models/videomae/convert_videomae_to_pytorch.py
@@ -136,7 +136,7 @@ def rename_key(name):
 
 
 def convert_state_dict(orig_state_dict, config):
-    for key in orig_state_dict.copy().keys():
+    for key in orig_state_dict.copy():
         val = orig_state_dict.pop(key)
 
         if key.startswith("encoder."):
diff --git a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
index c5467852c2..4c7869184e 100644
--- a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
@@ -785,9 +785,9 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
         }
 
         # remove encoder, decoder kwargs from kwargs
-        for key in kwargs_encoder.keys():
+        for key in kwargs_encoder:
             del kwargs["encoder_" + key]
-        for key in kwargs_decoder.keys():
+        for key in kwargs_decoder:
             del kwargs["decoder_" + key]
 
         # Load and initialize the encoder and decoder
diff --git a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py
index e1274cacfb..5818a74271 100644
--- a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py
@@ -375,9 +375,9 @@ class TFVisionEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLos
         }
 
         # remove encoder, decoder kwargs from kwargs
-        for key in kwargs_encoder.keys():
+        for key in kwargs_encoder:
             del kwargs["encoder_" + key]
-        for key in kwargs_decoder.keys():
+        for key in kwargs_decoder:
             del kwargs["decoder_" + key]
 
         # Load and initialize the encoder and decoder
diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
index 96d8228ea5..541c8bc9b9 100644
--- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
@@ -257,7 +257,7 @@ class VisionEncoderDecoderModel(PreTrainedModel, GenerationMixin):
                 del tf_model
                 gc.collect()
 
-                attn_implementation = kwargs.get("attn_implementation", None)
+                attn_implementation = kwargs.get("attn_implementation")
                 kwargs_encoder_decoder = {}
                 if attn_implementation:
                     kwargs_encoder_decoder = {
@@ -360,9 +360,9 @@ class VisionEncoderDecoderModel(PreTrainedModel, GenerationMixin):
         }
 
         # remove encoder, decoder kwargs from kwargs
-        for key in kwargs_encoder.keys():
+        for key in kwargs_encoder:
             del kwargs["encoder_" + key]
-        for key in kwargs_decoder.keys():
+        for key in kwargs_decoder:
             del kwargs["decoder_" + key]
 
         # Load and initialize the encoder and decoder
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
index 44c5e6c172..663d6944c6 100644
--- a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
@@ -480,9 +480,9 @@ class FlaxVisionTextDualEncoderModel(FlaxPreTrainedModel):
         }
 
         # remove text, vision kwargs from kwargs
-        for key in kwargs_vision.keys():
+        for key in kwargs_vision:
             del kwargs["vision_" + key]
-        for key in kwargs_text.keys():
+        for key in kwargs_text:
             del kwargs["text_" + key]
 
         # Load and initialize the text and vision model
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py
index 62ec6c955b..42ff0be7a9 100644
--- a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py
@@ -525,9 +525,9 @@ class TFVisionTextDualEncoderModel(TFPreTrainedModel):
         }
 
         # remove vision, text kwargs from kwargs
-        for key in kwargs_vision.keys():
+        for key in kwargs_vision:
             del kwargs["vision_" + key]
-        for key in kwargs_text.keys():
+        for key in kwargs_text:
             del kwargs["text_" + key]
 
         # Load and initialize the vision and text model
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
index fb1abc020a..fd36c7afaf 100755
--- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
@@ -366,9 +366,9 @@ class VisionTextDualEncoderModel(PreTrainedModel):
         }
 
         # remove vision, text kwargs from kwargs
-        for key in kwargs_vision.keys():
+        for key in kwargs_vision:
             del kwargs["vision_" + key]
-        for key in kwargs_text.keys():
+        for key in kwargs_text:
             del kwargs["text_" + key]
 
         # Load and initialize the vision and text model
diff --git a/src/transformers/models/vit_mae/convert_vit_mae_to_pytorch.py b/src/transformers/models/vit_mae/convert_vit_mae_to_pytorch.py
index 47e77593f6..c7e4a7dc3b 100644
--- a/src/transformers/models/vit_mae/convert_vit_mae_to_pytorch.py
+++ b/src/transformers/models/vit_mae/convert_vit_mae_to_pytorch.py
@@ -67,7 +67,7 @@ def rename_key(name):
 
 
 def convert_state_dict(orig_state_dict, config):
-    for key in orig_state_dict.copy().keys():
+    for key in orig_state_dict.copy():
         val = orig_state_dict.pop(key)
 
         if "qkv" in key:
diff --git a/src/transformers/models/vitmatte/convert_vitmatte_to_hf.py b/src/transformers/models/vitmatte/convert_vitmatte_to_hf.py
index 5153e1faf5..e9b171876a 100644
--- a/src/transformers/models/vitmatte/convert_vitmatte_to_hf.py
+++ b/src/transformers/models/vitmatte/convert_vitmatte_to_hf.py
@@ -85,7 +85,7 @@ def convert_vitmatte_checkpoint(model_name, pytorch_dump_folder_path, push_to_hu
     state_dict = torch.load(filepath, map_location="cpu", weights_only=True)
 
     # rename keys
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         val = state_dict.pop(key)
         if "backbone.blocks" in key:
             key = key.replace("backbone.blocks", "backbone.encoder.layer")
diff --git a/src/transformers/models/vivit/convert_vivit_flax_to_pytorch.py b/src/transformers/models/vivit/convert_vivit_flax_to_pytorch.py
index f4b5e1cfda..bf6aa8e4a3 100644
--- a/src/transformers/models/vivit/convert_vivit_flax_to_pytorch.py
+++ b/src/transformers/models/vivit/convert_vivit_flax_to_pytorch.py
@@ -129,7 +129,7 @@ def transform_state_encoder_block(state_dict, i):
 
 
 def get_n_layers(state_dict):
-    return sum([1 if "encoderblock_" in k else 0 for k in state_dict["optimizer"]["target"]["Transformer"].keys()])
+    return sum([1 if "encoderblock_" in k else 0 for k in state_dict["optimizer"]["target"]["Transformer"]])
 
 
 def transform_state(state_dict, classification_head=False):
diff --git a/src/transformers/models/voxtral/processing_voxtral.py b/src/transformers/models/voxtral/processing_voxtral.py
index 598529bf5c..7867827322 100644
--- a/src/transformers/models/voxtral/processing_voxtral.py
+++ b/src/transformers/models/voxtral/processing_voxtral.py
@@ -166,7 +166,7 @@ class VoxtralProcessor(ProcessorMixin):
         }
 
         for kwarg_type in processed_kwargs:
-            for key in AllKwargsForChatTemplate.__annotations__[kwarg_type].__annotations__.keys():
+            for key in AllKwargsForChatTemplate.__annotations__[kwarg_type].__annotations__:
                 kwarg_type_defaults = AllKwargsForChatTemplate.__annotations__[kwarg_type]
                 default_value = getattr(kwarg_type_defaults, key, None)
                 value = kwargs.pop(key, default_value)
diff --git a/src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py
index 1e0f00f477..361684319c 100644
--- a/src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py
@@ -89,7 +89,7 @@ def set_recursively(key, value, full_name, weight_type, hf_pointer):
         hf_pointer = getattr(hf_pointer, attribute)
 
     hf_param_name = None
-    for param_key in PARAM_MAPPING.keys():
+    for param_key in PARAM_MAPPING:
         if full_name.endswith(param_key):
             hf_param_name = PARAM_MAPPING[full_name.split(".")[-1]]
             weight_type = "param"
@@ -148,7 +148,7 @@ def set_recursively(key, value, full_name, weight_type, hf_pointer):
 
 def rename_dict(key, value, full_name, weight_type, hf_dict):
     hf_param_name = None
-    for param_key in PARAM_MAPPING.keys():
+    for param_key in PARAM_MAPPING:
         if full_name.endswith(param_key):
             hf_param_name = PARAM_MAPPING[full_name.split(".")[-1]]
             weight_type = "param"
diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
index dd116a7c80..427cd7261f 100644
--- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
@@ -1300,7 +1300,7 @@ class TFWav2Vec2MainLayer(keras.layers.Layer):
 
         hidden_states, extract_features = self.feature_projection(extract_features, training=training)
 
-        mask_time_indices = kwargs.get("mask_time_indices", None)
+        mask_time_indices = kwargs.get("mask_time_indices")
         if training:
             hidden_states = self._mask_hidden_states(hidden_states, mask_time_indices=mask_time_indices)
 
diff --git a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
index 14e61ec513..7fab737064 100644
--- a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
@@ -191,7 +191,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
 
         # make sure that tokens made of several
         # characters are not split at tokenization
-        for token in self.encoder.keys():
+        for token in self.encoder:
             if len(token) > 1:
                 self.add_tokens(AddedToken(token, rstrip=True, lstrip=True, normalized=False))
 
@@ -212,7 +212,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
 
         # make sure that tokens made of several
         # characters are not split at tokenization
-        for token in self.encoder.keys():
+        for token in self.encoder:
             if len(token) > 1:
                 self.add_tokens(AddedToken(token, rstrip=True, lstrip=True, normalized=False))
 
diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py
index 1cdca9ba03..a5cbcd9534 100644
--- a/src/transformers/models/whisper/generation_whisper.py
+++ b/src/transformers/models/whisper/generation_whisper.py
@@ -1181,7 +1181,7 @@ class WhisperGenerationMixin(GenerationMixin):
     def _stack_split_outputs(self, seek_outputs, model_output_type, device, kwargs):
         # Stack back seek_outputs tensors after splitting them with the split_by_batch_index method
         outputs = {}
-        for key in seek_outputs[0].keys():
+        for key in seek_outputs[0]:
             if key in ["sequences", "beam_indices", "token_timestamps"]:
                 outputs[key] = torch.stack([v[key] for v in seek_outputs], dim=0).to(device)
             elif key in ["scores", "encoder_attentions", "encoder_hidden_states", "logits"]:
@@ -1212,7 +1212,7 @@ class WhisperGenerationMixin(GenerationMixin):
                 else:
                     outputs[key] = None
 
-        token_timestamps = outputs.get("token_timestamps", None)
+        token_timestamps = outputs.get("token_timestamps")
         if token_timestamps is not None:
             model_output_type = dict
 
@@ -1442,9 +1442,9 @@ class WhisperGenerationMixin(GenerationMixin):
 
         def language_to_id(language: str) -> int:
             language = language.lower()
-            if language in generation_config.lang_to_id.keys():
+            if language in generation_config.lang_to_id:
                 language_token = language
-            elif language in TO_LANGUAGE_CODE.keys():
+            elif language in TO_LANGUAGE_CODE:
                 language_token = f"<|{TO_LANGUAGE_CODE[language]}|>"
             elif language in TO_LANGUAGE_CODE.values():
                 language_token = f"<|{language}|>"
diff --git a/src/transformers/models/whisper/modeling_tf_whisper.py b/src/transformers/models/whisper/modeling_tf_whisper.py
index a3c71745c5..b11df907b3 100644
--- a/src/transformers/models/whisper/modeling_tf_whisper.py
+++ b/src/transformers/models/whisper/modeling_tf_whisper.py
@@ -1592,14 +1592,14 @@ class TFWhisperForConditionalGeneration(TFWhisperPreTrainedModel, TFCausalLangua
         ):
             forced_decoder_ids = self.generation_config.forced_decoder_ids
         else:
-            forced_decoder_ids = kwargs.get("forced_decoder_ids", None)
+            forced_decoder_ids = kwargs.get("forced_decoder_ids")
 
         if task is not None or language is not None or (forced_decoder_ids is None and prompt_ids is not None):
             forced_decoder_ids = []
             if hasattr(generation_config, "language"):
-                if generation_config.language in generation_config.lang_to_id.keys():
+                if generation_config.language in generation_config.lang_to_id:
                     language_token = generation_config.language
-                elif generation_config.language in TO_LANGUAGE_CODE.keys():
+                elif generation_config.language in TO_LANGUAGE_CODE:
                     language_token = f"<|{TO_LANGUAGE_CODE[generation_config.language]}|>"
                 elif generation_config.language in TO_LANGUAGE_CODE.values():
                     language_token = f"<|{generation_config.language}|>"
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index 2d9dd6845c..4147f14d86 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -1007,7 +1007,7 @@ def _decode_asr(tokenizer, model_outputs, *, return_timestamps, return_language,
                 text = tokenizer.decode([token])
                 # Removing outer shell <|XX|>
                 text = text[2:-2]
-                language = LANGUAGES.get(text, None)
+                language = LANGUAGES.get(text)
                 if language is not None:
                     # 1/ Indeed some language
                     # TODO Handle when language is different from the previous
diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py
index 6f36b19055..fbd2762cef 100644
--- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py
+++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py
@@ -118,7 +118,7 @@ def rename_key(name):
 
 
 def convert_state_dict(orig_state_dict, config):
-    for key in orig_state_dict.copy().keys():
+    for key in orig_state_dict.copy():
         val = orig_state_dict.pop(key)
 
         if "attn.in_proj" in key:
diff --git a/src/transformers/models/xglm/modeling_tf_xglm.py b/src/transformers/models/xglm/modeling_tf_xglm.py
index 5915e2e9ae..d799ced792 100644
--- a/src/transformers/models/xglm/modeling_tf_xglm.py
+++ b/src/transformers/models/xglm/modeling_tf_xglm.py
@@ -890,8 +890,8 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
         if past_key_values:
             inputs = tf.expand_dims(inputs[:, -1], -1)
 
-        position_ids = kwargs.get("position_ids", None)
-        attention_mask = kwargs.get("attention_mask", None)
+        position_ids = kwargs.get("position_ids")
+        attention_mask = kwargs.get("attention_mask")
 
         if attention_mask is not None and position_ids is None:
             position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)
diff --git a/src/transformers/models/xmod/convert_xmod_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/xmod/convert_xmod_original_pytorch_checkpoint_to_pytorch.py
index d43c05cd62..c110c005af 100644
--- a/src/transformers/models/xmod/convert_xmod_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/xmod/convert_xmod_original_pytorch_checkpoint_to_pytorch.py
@@ -144,7 +144,7 @@ def convert_xmod_checkpoint_to_pytorch(
 
         if sorted(bert_output.adapter_modules.keys()) != sorted(xmod_layer.adapter_modules.keys()):
             raise AssertionError("Lists of language adapters do not match.")
-        for lang_code in xmod_layer.adapter_modules.keys():
+        for lang_code in xmod_layer.adapter_modules:
             to_adapter = bert_output.adapter_modules[lang_code]
             from_adapter = xmod_layer.adapter_modules[lang_code]
             to_adapter.dense1.weight = from_adapter.fc1.weight
diff --git a/src/transformers/models/yolos/convert_yolos_to_pytorch.py b/src/transformers/models/yolos/convert_yolos_to_pytorch.py
index 907a11d067..54fbd18e06 100644
--- a/src/transformers/models/yolos/convert_yolos_to_pytorch.py
+++ b/src/transformers/models/yolos/convert_yolos_to_pytorch.py
@@ -123,7 +123,7 @@ def rename_key(name: str) -> str:
 
 
 def convert_state_dict(orig_state_dict: dict, model: YolosForObjectDetection) -> dict:
-    for key in orig_state_dict.copy().keys():
+    for key in orig_state_dict.copy():
         val = orig_state_dict.pop(key)
 
         if "qkv" in key:
diff --git a/src/transformers/models/yolos/image_processing_yolos_fast.py b/src/transformers/models/yolos/image_processing_yolos_fast.py
index 85fb1d142d..d7efb99e46 100644
--- a/src/transformers/models/yolos/image_processing_yolos_fast.py
+++ b/src/transformers/models/yolos/image_processing_yolos_fast.py
@@ -366,8 +366,8 @@ class YolosImageProcessorFast(BaseImageProcessorFast):
         self.size = get_size_dict(size, max_size=max_size, default_to_square=False)
 
         # Backwards compatibility
-        do_convert_annotations = kwargs.get("do_convert_annotations", None)
-        do_normalize = kwargs.get("do_normalize", None)
+        do_convert_annotations = kwargs.get("do_convert_annotations")
+        do_normalize = kwargs.get("do_normalize")
         if do_convert_annotations is None and getattr(self, "do_convert_annotations", None) is None:
             self.do_convert_annotations = do_normalize if do_normalize is not None else self.do_normalize
 
diff --git a/src/transformers/models/yoso/convert_yoso_pytorch_to_pytorch.py b/src/transformers/models/yoso/convert_yoso_pytorch_to_pytorch.py
index 07652ed021..b1d3e96859 100644
--- a/src/transformers/models/yoso/convert_yoso_pytorch_to_pytorch.py
+++ b/src/transformers/models/yoso/convert_yoso_pytorch_to_pytorch.py
@@ -60,7 +60,7 @@ def rename_key(orig_key):
 
 
 def convert_checkpoint_helper(max_position_embeddings, orig_state_dict):
-    for key in orig_state_dict.copy().keys():
+    for key in orig_state_dict.copy():
         val = orig_state_dict.pop(key)
 
         if ("pooler" in key) or ("sen_class" in key):
diff --git a/src/transformers/models/zoedepth/convert_zoedepth_to_hf.py b/src/transformers/models/zoedepth/convert_zoedepth_to_hf.py
index 81fcb66afa..03f2145418 100644
--- a/src/transformers/models/zoedepth/convert_zoedepth_to_hf.py
+++ b/src/transformers/models/zoedepth/convert_zoedepth_to_hf.py
@@ -255,7 +255,7 @@ def read_in_q_k_v_metric_head(state_dict):
 
 
 def convert_state_dict(orig_state_dict):
-    for key in orig_state_dict.copy().keys():
+    for key in orig_state_dict.copy():
         val = orig_state_dict.pop(key)
 
         # rename key
@@ -266,7 +266,7 @@ def convert_state_dict(orig_state_dict):
 
 
 def remove_ignore_keys(state_dict):
-    for key in state_dict.copy().keys():
+    for key in state_dict.copy():
         if (
             "fc_norm" in key
             or "relative_position_index" in key
diff --git a/src/transformers/onnx/convert.py b/src/transformers/onnx/convert.py
index 27a47c4ea9..152bc98d6d 100644
--- a/src/transformers/onnx/convert.py
+++ b/src/transformers/onnx/convert.py
@@ -457,5 +457,5 @@ def ensure_model_and_config_inputs_match(
 
     # Make sure the input order match (VERY IMPORTANT !!!!)
     matching_inputs = forward_inputs_set.intersection(model_inputs_set)
-    ordered_inputs = [parameter for parameter in forward_parameters.keys() if parameter in matching_inputs]
+    ordered_inputs = [parameter for parameter in forward_parameters if parameter in matching_inputs]
     return is_ok, ordered_inputs
diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py
index 1c71487dbb..688d0f8db5 100644
--- a/src/transformers/optimization.py
+++ b/src/transformers/optimization.py
@@ -625,7 +625,7 @@ def get_scheduler(
         optimizer_dict = optimizer.optimizer_dict
         scheduler_dict = {}
 
-        for param in optimizer_dict.keys():
+        for param in optimizer_dict:
             scheduler_dict[param] = get_scheduler(
                 name,
                 optimizer=optimizer_dict[param],
@@ -639,7 +639,7 @@ def get_scheduler(
             # attach the scheduler hook, the gradients have been zeroed here
             scheduler_dict[param].step()
 
-        for param in optimizer_dict.keys():
+        for param in optimizer_dict:
             if param.requires_grad:
                 param.register_post_accumulate_grad_hook(scheduler_hook)
 
diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index 238467ee29..3e73980095 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -1141,7 +1141,7 @@ class Pipeline(_ScikitCompat, PushToHubMixin):
                 "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                 FutureWarning,
             )
-            if kwargs.get("token", None) is not None:
+            if kwargs.get("token") is not None:
                 raise ValueError(
                     "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                 )
diff --git a/src/transformers/pipelines/image_to_image.py b/src/transformers/pipelines/image_to_image.py
index 52fc99bd05..d87f6928c9 100644
--- a/src/transformers/pipelines/image_to_image.py
+++ b/src/transformers/pipelines/image_to_image.py
@@ -136,7 +136,7 @@ class ImageToImagePipeline(Pipeline):
 
     def postprocess(self, model_outputs):
         images = []
-        if "reconstruction" in model_outputs.keys():
+        if "reconstruction" in model_outputs:
             outputs = model_outputs.reconstruction
         for output in outputs:
             output = output.data.squeeze().float().cpu().clamp_(0, 1).numpy()
diff --git a/src/transformers/pipelines/question_answering.py b/src/transformers/pipelines/question_answering.py
index 2eee80a907..ed21e39084 100644
--- a/src/transformers/pipelines/question_answering.py
+++ b/src/transformers/pipelines/question_answering.py
@@ -524,7 +524,7 @@ class QuestionAnsweringPipeline(ChunkPipeline):
         model_inputs = {k: inputs[k] for k in self.tokenizer.model_input_names}
         # `XXXForSequenceClassification` models should not use `use_cache=True` even if it's supported
         model_forward = self.model.forward if self.framework == "pt" else self.model.call
-        if "use_cache" in inspect.signature(model_forward).parameters.keys():
+        if "use_cache" in inspect.signature(model_forward).parameters:
             model_inputs["use_cache"] = False
         output = self.model(**model_inputs)
         if isinstance(output, dict):
diff --git a/src/transformers/pipelines/text_classification.py b/src/transformers/pipelines/text_classification.py
index 367d867d0e..6f11f3bc97 100644
--- a/src/transformers/pipelines/text_classification.py
+++ b/src/transformers/pipelines/text_classification.py
@@ -194,7 +194,7 @@ class TextClassificationPipeline(Pipeline):
     def _forward(self, model_inputs):
         # `XXXForSequenceClassification` models should not use `use_cache=True` even if it's supported
         model_forward = self.model.forward if self.framework == "pt" else self.model.call
-        if "use_cache" in inspect.signature(model_forward).parameters.keys():
+        if "use_cache" in inspect.signature(model_forward).parameters:
             model_inputs["use_cache"] = False
         return self.model(**model_inputs)
 
diff --git a/src/transformers/pipelines/token_classification.py b/src/transformers/pipelines/token_classification.py
index d7663eb170..a9d1467482 100644
--- a/src/transformers/pipelines/token_classification.py
+++ b/src/transformers/pipelines/token_classification.py
@@ -31,7 +31,7 @@ class TokenClassificationArgumentHandler(ArgumentHandler):
 
     def __call__(self, inputs: Union[str, list[str]], **kwargs):
         is_split_into_words = kwargs.get("is_split_into_words", False)
-        delimiter = kwargs.get("delimiter", None)
+        delimiter = kwargs.get("delimiter")
 
         if inputs is not None and isinstance(inputs, (list, tuple)) and len(inputs) > 0:
             inputs = list(inputs)
diff --git a/src/transformers/pipelines/zero_shot_classification.py b/src/transformers/pipelines/zero_shot_classification.py
index b571a7896b..20675d4a29 100644
--- a/src/transformers/pipelines/zero_shot_classification.py
+++ b/src/transformers/pipelines/zero_shot_classification.py
@@ -146,7 +146,7 @@ class ZeroShotClassificationPipeline(ChunkPipeline):
         return inputs
 
     def _sanitize_parameters(self, **kwargs):
-        if kwargs.get("multi_class", None) is not None:
+        if kwargs.get("multi_class") is not None:
             kwargs["multi_label"] = kwargs["multi_class"]
             logger.warning(
                 "The `multi_class` argument has been deprecated and renamed to `multi_label`. "
@@ -227,7 +227,7 @@ class ZeroShotClassificationPipeline(ChunkPipeline):
         model_inputs = {k: inputs[k] for k in self.tokenizer.model_input_names}
         # `XXXForSequenceClassification` models should not use `use_cache=True` even if it's supported
         model_forward = self.model.forward if self.framework == "pt" else self.model.call
-        if "use_cache" in inspect.signature(model_forward).parameters.keys():
+        if "use_cache" in inspect.signature(model_forward).parameters:
             model_inputs["use_cache"] = False
         outputs = self.model(**model_inputs)
 
diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
index a40f2ad7fd..7f77daa297 100644
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -678,7 +678,7 @@ class ProcessorMixin(PushToHubMixin):
                 "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                 FutureWarning,
             )
-            if kwargs.get("token", None) is not None:
+            if kwargs.get("token") is not None:
                 raise ValueError(
                     "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                 )
@@ -1187,7 +1187,7 @@ class ProcessorMixin(PushToHubMixin):
         for modality in default_kwargs:  # noqa: PLC0206
             default_kwargs[modality] = ModelProcessorKwargs._defaults.get(modality, {}).copy()
             # update defaults with arguments from tokenizer init
-            for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys():
+            for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__:
                 # init with tokenizer init kwargs if necessary
                 if tokenizer_init_kwargs is not None and modality_key in tokenizer_init_kwargs:
                     value = (
@@ -1203,7 +1203,7 @@ class ProcessorMixin(PushToHubMixin):
         # update modality kwargs with passed kwargs
         non_modality_kwargs = set(kwargs) - set(output_kwargs)
         for modality, output_kwarg in output_kwargs.items():
-            for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys():
+            for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__:
                 # check if we received a structured kwarg dict or not to handle it correctly
                 if modality in kwargs:
                     kwarg_value = kwargs[modality].pop(modality_key, "__empty__")
@@ -1236,7 +1236,7 @@ class ProcessorMixin(PushToHubMixin):
             # kwargs is a flat dictionary
             for key, kwarg in kwargs.items():
                 if key not in used_keys:
-                    if key in ModelProcessorKwargs.__annotations__["common_kwargs"].__annotations__.keys():
+                    if key in ModelProcessorKwargs.__annotations__["common_kwargs"].__annotations__:
                         output_kwargs["common_kwargs"][key] = kwarg
                     elif key not in possible_modality_keywords:
                         logger.warning_once(
@@ -1350,7 +1350,7 @@ class ProcessorMixin(PushToHubMixin):
                 classes = tuple(cls.get_possibly_dynamic_module(n) if n is not None else None for n in class_name)
                 if attribute_name == "image_processor":
                     # TODO: @yoni, change logic in v4.52 (when use_fast set to True by default)
-                    use_fast = kwargs.get("use_fast", None)
+                    use_fast = kwargs.get("use_fast")
                     if use_fast is None:
                         logger.warning_once(
                             "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. "
@@ -1495,7 +1495,7 @@ class ProcessorMixin(PushToHubMixin):
         }
 
         for kwarg_type in processed_kwargs:
-            for key in AllKwargsForChatTemplate.__annotations__[kwarg_type].__annotations__.keys():
+            for key in AllKwargsForChatTemplate.__annotations__[kwarg_type].__annotations__:
                 kwarg_type_defaults = AllKwargsForChatTemplate.__annotations__[kwarg_type]
                 default_value = getattr(kwarg_type_defaults, key, None)
                 value = kwargs.pop(key, default_value)
@@ -1641,7 +1641,7 @@ class ProcessorMixin(PushToHubMixin):
                                 current_mask[token_id] = 1
                         assistant_masks.append(current_mask)
                     out["assistant_masks"] = assistant_masks
-                    out.convert_to_tensors(tensor_type=kwargs.get("return_tensors", None))
+                    out.convert_to_tensors(tensor_type=kwargs.get("return_tensors"))
                 return out
             else:
                 return out["input_ids"]
diff --git a/src/transformers/quantizers/auto.py b/src/transformers/quantizers/auto.py
index e4fbaadb5d..161951d340 100644
--- a/src/transformers/quantizers/auto.py
+++ b/src/transformers/quantizers/auto.py
@@ -116,7 +116,7 @@ class AutoQuantizationConfig:
 
     @classmethod
     def from_dict(cls, quantization_config_dict: dict):
-        quant_method = quantization_config_dict.get("quant_method", None)
+        quant_method = quantization_config_dict.get("quant_method")
         # We need a special care for bnb models to make sure everything is BC ..
         if quantization_config_dict.get("load_in_8bit", False) or quantization_config_dict.get("load_in_4bit", False):
             suffix = "_4bit" if quantization_config_dict.get("load_in_4bit", False) else "_8bit"
@@ -126,7 +126,7 @@ class AutoQuantizationConfig:
                 "The model's quantization config from the arguments has no `quant_method` attribute. Make sure that the model has been correctly quantized"
             )
 
-        if quant_method not in AUTO_QUANTIZATION_CONFIG_MAPPING.keys():
+        if quant_method not in AUTO_QUANTIZATION_CONFIG_MAPPING:
             raise ValueError(
                 f"Unknown quantization type, got {quant_method} - supported types are:"
                 f" {list(AUTO_QUANTIZER_MAPPING.keys())}"
@@ -171,7 +171,7 @@ class AutoHfQuantizer:
             else:
                 quant_method += "_4bit"
 
-        if quant_method not in AUTO_QUANTIZER_MAPPING.keys():
+        if quant_method not in AUTO_QUANTIZER_MAPPING:
             raise ValueError(
                 f"Unknown quantization type, got {quant_method} - supported types are:"
                 f" {list(AUTO_QUANTIZER_MAPPING.keys())}"
@@ -238,7 +238,7 @@ class AutoHfQuantizer:
                 "The model's quantization config from the arguments has no `quant_method` attribute. Make sure that the model has been correctly quantized"
             )
 
-        if quant_method not in AUTO_QUANTIZATION_CONFIG_MAPPING.keys():
+        if quant_method not in AUTO_QUANTIZATION_CONFIG_MAPPING:
             logger.warning(
                 f"Unknown quantization type, got {quant_method} - supported types are:"
                 f" {list(AUTO_QUANTIZER_MAPPING.keys())}. Hence, we will skip the quantization. "
diff --git a/src/transformers/quantizers/base.py b/src/transformers/quantizers/base.py
index 0a4ddf6804..fb53c4c0f6 100644
--- a/src/transformers/quantizers/base.py
+++ b/src/transformers/quantizers/base.py
@@ -319,7 +319,7 @@ class HfQuantizer(ABC):
 
         for name, module in model.named_modules():
             module_class_name = module.__class__.__name__
-            if module_class_name in MODULES_TO_PATCH_FOR_QUANTIZATION.keys() and (
+            if module_class_name in MODULES_TO_PATCH_FOR_QUANTIZATION and (
                 self.quantization_config.quant_method
                 in MODULES_TO_PATCH_FOR_QUANTIZATION[module_class_name]["quantization_methods"]
             ):
diff --git a/src/transformers/quantizers/quantizer_auto_round.py b/src/transformers/quantizers/quantizer_auto_round.py
index 9f6783daee..2f73567860 100644
--- a/src/transformers/quantizers/quantizer_auto_round.py
+++ b/src/transformers/quantizers/quantizer_auto_round.py
@@ -42,7 +42,7 @@ class AutoRoundQuantizer(HfQuantizer):
         super().__init__(quantization_config, **kwargs)
 
     def validate_environment(self, *args, **kwargs):
-        self.device_map = kwargs.get("device_map", None)
+        self.device_map = kwargs.get("device_map")
         if not is_auto_round_available():
             raise ImportError(
                 "Loading an AutoRound quantized model requires auto-round library (`pip install 'auto-round>=0.5'`)"
diff --git a/src/transformers/quantizers/quantizer_bitnet.py b/src/transformers/quantizers/quantizer_bitnet.py
index 90cbab4303..a57e732b98 100644
--- a/src/transformers/quantizers/quantizer_bitnet.py
+++ b/src/transformers/quantizers/quantizer_bitnet.py
@@ -62,7 +62,7 @@ class BitNetHfQuantizer(HfQuantizer):
             )
             return
 
-        device_map = kwargs.get("device_map", None)
+        device_map = kwargs.get("device_map")
         if device_map is None:
             logger.warning_once(
                 "You have loaded a BitNet model on CPU and have a CUDA device available, make sure to set "
diff --git a/src/transformers/quantizers/quantizer_bnb_4bit.py b/src/transformers/quantizers/quantizer_bnb_4bit.py
index 409c5d47b0..fb268777cd 100644
--- a/src/transformers/quantizers/quantizer_bnb_4bit.py
+++ b/src/transformers/quantizers/quantizer_bnb_4bit.py
@@ -102,14 +102,14 @@ class Bnb4BitHfQuantizer(HfQuantizer):
                 " sure the weights are in PyTorch format."
             )
 
-        device_map = kwargs.get("device_map", None)
+        device_map = kwargs.get("device_map")
         if (
             device_map is not None
             and isinstance(device_map, dict)
             and not self.quantization_config.llm_int8_enable_fp32_cpu_offload
         ):
             device_map_without_lm_head = {
-                key: device_map[key] for key in device_map.keys() if key not in self.modules_to_not_convert
+                key: device_map[key] for key in device_map if key not in self.modules_to_not_convert
             }
             if set(device_map.values()) == {"cpu"} and bnb_multibackend_is_enabled:
                 pass
diff --git a/src/transformers/quantizers/quantizer_bnb_8bit.py b/src/transformers/quantizers/quantizer_bnb_8bit.py
index e7d914170e..3dc759dfd2 100644
--- a/src/transformers/quantizers/quantizer_bnb_8bit.py
+++ b/src/transformers/quantizers/quantizer_bnb_8bit.py
@@ -99,14 +99,14 @@ class Bnb8BitHfQuantizer(HfQuantizer):
                 " sure the weights are in PyTorch format."
             )
 
-        device_map = kwargs.get("device_map", None)
+        device_map = kwargs.get("device_map")
         if (
             device_map is not None
             and isinstance(device_map, dict)
             and not self.quantization_config.llm_int8_enable_fp32_cpu_offload
         ):
             device_map_without_lm_head = {
-                key: device_map[key] for key in device_map.keys() if key not in self.modules_to_not_convert
+                key: device_map[key] for key in device_map if key not in self.modules_to_not_convert
             }
             if set(device_map.values()) == {"cpu"} and bnb_multibackend_is_enabled:
                 pass
@@ -177,7 +177,7 @@ class Bnb8BitHfQuantizer(HfQuantizer):
         module, tensor_name = get_module_from_name(model, param_name)
         if isinstance(module._parameters.get(tensor_name, None), bnb.nn.Int8Params):
             if self.pre_quantized:
-                if param_name.replace("weight", "SCB") not in state_dict.keys():
+                if param_name.replace("weight", "SCB") not in state_dict:
                     raise ValueError("Missing quantization component `SCB`")
                 if param_value.dtype != torch.int8:
                     raise ValueError(
@@ -204,8 +204,8 @@ class Bnb8BitHfQuantizer(HfQuantizer):
         fp16_statistics_key = param_name.replace("weight", "SCB")
         fp16_weights_format_key = param_name.replace("weight", "weight_format")
 
-        fp16_statistics = state_dict.get(fp16_statistics_key, None)
-        fp16_weights_format = state_dict.get(fp16_weights_format_key, None)
+        fp16_statistics = state_dict.get(fp16_statistics_key)
+        fp16_weights_format = state_dict.get(fp16_weights_format_key)
 
         module, tensor_name = get_module_from_name(model, param_name)
         if tensor_name not in module._parameters:
diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py
index 7df4fb7ca9..0e94d87f32 100644
--- a/src/transformers/quantizers/quantizer_compressed_tensors.py
+++ b/src/transformers/quantizers/quantizer_compressed_tensors.py
@@ -130,7 +130,7 @@ class CompressedTensorsHfQuantizer(HfQuantizer):
         if (
             self.quantization_config.is_quantization_compressed and not self.run_compressed
         ) or self.quantization_config.is_sparsification_compressed:
-            config = kwargs.get("config", None)
+            config = kwargs.get("config")
             cache_path = config._name_or_path
 
             if not os.path.exists(cache_path):
diff --git a/src/transformers/quantizers/quantizer_eetq.py b/src/transformers/quantizers/quantizer_eetq.py
index 61c9390900..bb7a78c536 100644
--- a/src/transformers/quantizers/quantizer_eetq.py
+++ b/src/transformers/quantizers/quantizer_eetq.py
@@ -79,7 +79,7 @@ class EetqHfQuantizer(HfQuantizer):
         if not torch.cuda.is_available():
             raise RuntimeError("No GPU found. A GPU is needed for quantization.")
 
-        device_map = kwargs.get("device_map", None)
+        device_map = kwargs.get("device_map")
         if device_map is None:
             logger.warning_once(
                 "You have loaded an EETQ model on CPU and have a CUDA device available, make sure to set "
diff --git a/src/transformers/quantizers/quantizer_fbgemm_fp8.py b/src/transformers/quantizers/quantizer_fbgemm_fp8.py
index 003ea515b3..c00a141d67 100644
--- a/src/transformers/quantizers/quantizer_fbgemm_fp8.py
+++ b/src/transformers/quantizers/quantizer_fbgemm_fp8.py
@@ -71,7 +71,7 @@ class FbgemmFp8HfQuantizer(HfQuantizer):
                 "FP8 quantized models is only supported on GPUs with compute capability >= 9.0 (e.g H100)"
             )
 
-        device_map = kwargs.get("device_map", None)
+        device_map = kwargs.get("device_map")
         if device_map is None:
             logger.warning_once(
                 "You have loaded an FP8 model on CPU and have a CUDA device available, make sure to set "
@@ -231,7 +231,7 @@ class FbgemmFp8HfQuantizer(HfQuantizer):
 
         not_missing_keys = []
         for name, module in model.named_modules():
-            if isinstance(module, FbgemmFp8Linear) or isinstance(module, FbgemmFp8Llama4TextExperts):
+            if isinstance(module, (FbgemmFp8Linear, FbgemmFp8Llama4TextExperts)):
                 for missing in missing_keys:
                     if (
                         (name in missing or name in f"{prefix}.{missing}")
diff --git a/src/transformers/quantizers/quantizer_finegrained_fp8.py b/src/transformers/quantizers/quantizer_finegrained_fp8.py
index 1a2ab58d98..58c747effa 100644
--- a/src/transformers/quantizers/quantizer_finegrained_fp8.py
+++ b/src/transformers/quantizers/quantizer_finegrained_fp8.py
@@ -56,7 +56,7 @@ class FineGrainedFP8HfQuantizer(HfQuantizer):
                     f", actual = `{major}.{minor}`"
                 )
 
-        device_map = kwargs.get("device_map", None)
+        device_map = kwargs.get("device_map")
         if device_map is None:
             logger.warning_once(
                 "You have loaded an FP8 model on CPU and have a CUDA device available, make sure to set "
diff --git a/src/transformers/quantizers/quantizer_hqq.py b/src/transformers/quantizers/quantizer_hqq.py
index 4c6a64705b..f52ac36a8d 100755
--- a/src/transformers/quantizers/quantizer_hqq.py
+++ b/src/transformers/quantizers/quantizer_hqq.py
@@ -78,7 +78,7 @@ class HqqHfQuantizer(HfQuantizer):
                 self.torch_dtype = torch.float32
                 logger.info("Setting torch_dtype to torch.float32 as the default value since it was not specified.")
 
-        device_map = kwargs.get("device_map", None)
+        device_map = kwargs.get("device_map")
         if isinstance(device_map, dict):
             if "cpu" in device_map.values() or "disk" in device_map.values():
                 raise ValueError(
@@ -171,7 +171,7 @@ class HqqHfQuantizer(HfQuantizer):
         module, tensor_name = get_module_from_name(model, param_name)
 
         if self.pre_quantized:
-            return (isinstance(module, torch.nn.Linear) or isinstance(module, HQQLinear)) and tensor_name != "weight"
+            return (isinstance(module, (torch.nn.Linear, HQQLinear))) and tensor_name != "weight"
         else:
             return (
                 isinstance(module, torch.nn.Linear)
diff --git a/src/transformers/quantizers/quantizer_quanto.py b/src/transformers/quantizers/quantizer_quanto.py
index 675d54ed2f..d273ddda59 100644
--- a/src/transformers/quantizers/quantizer_quanto.py
+++ b/src/transformers/quantizers/quantizer_quanto.py
@@ -117,8 +117,8 @@ class QuantoHfQuantizer(HfQuantizer):
         if is_optimum_quanto_available():
             from optimum.quanto import QModuleMixin
 
-        device_map = kwargs.get("device_map", None)
-        param_device = kwargs.get("param_device", None)
+        device_map = kwargs.get("device_map")
+        param_device = kwargs.get("param_device")
         # we don't quantize the model if the module is going to be offloaded to the cpu
         if device_map is not None and param_device is not None:
             device_map_values = set(device_map.values())
diff --git a/src/transformers/quantizers/quantizer_torchao.py b/src/transformers/quantizers/quantizer_torchao.py
index c0b337b566..23c9d7e327 100644
--- a/src/transformers/quantizers/quantizer_torchao.py
+++ b/src/transformers/quantizers/quantizer_torchao.py
@@ -98,7 +98,7 @@ class TorchAoHfQuantizer(HfQuantizer):
             raise ImportError("Loading an torchao quantized model requires torchao library (`pip install torchao`)")
 
         self.offload = False
-        device_map = kwargs.get("device_map", None)
+        device_map = kwargs.get("device_map")
         if isinstance(device_map, dict):
             if "cpu" in device_map.values() or "disk" in device_map.values():
                 if self.pre_quantized:
@@ -109,7 +109,7 @@ class TorchAoHfQuantizer(HfQuantizer):
                 else:
                     self.offload = True
         if self.pre_quantized:
-            weights_only = kwargs.get("weights_only", None)
+            weights_only = kwargs.get("weights_only")
             if weights_only:
                 torch_version = version.parse(importlib.metadata.version("torch"))
                 if torch_version < version.parse("2.5.0"):
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index fd5f62ec28..a41ea8166e 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -3304,7 +3304,7 @@ def compare_pipeline_output_to_hub_spec(output, hub_spec):
     missing_keys = []
     unexpected_keys = []
     all_field_names = {field.name for field in fields(hub_spec)}
-    matching_keys = sorted([key for key in output.keys() if key in all_field_names])
+    matching_keys = sorted([key for key in output if key in all_field_names])
 
     # Fields with a MISSING default are required and must be in the output
     for field in fields(hub_spec):
diff --git a/src/transformers/tokenization_mistral_common.py b/src/transformers/tokenization_mistral_common.py
index 95bd64049b..cd85a641cb 100644
--- a/src/transformers/tokenization_mistral_common.py
+++ b/src/transformers/tokenization_mistral_common.py
@@ -1203,7 +1203,7 @@ class MistralCommonTokenizer(PushToHubMixin):
         # If we have a list of dicts, let's convert it in a dict of lists
         # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
         if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
-            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
+            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0]}
 
         # The model's main input name, usually `input_ids`, has been passed for padding
         if self.model_input_names[0] not in encoded_inputs:
@@ -1449,7 +1449,7 @@ class MistralCommonTokenizer(PushToHubMixin):
             if not isinstance(message, dict):
                 return
             maybe_list_content: Optional[Union[str, list[dict[str, Union[str, dict[str, Any]]]]]] = message.get(
-                "content", None
+                "content"
             )
             if not maybe_list_content or isinstance(maybe_list_content, str):
                 return
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index a8d0336b46..beb91a9472 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -271,7 +271,7 @@ class BatchEncoding(UserDict):
         elif self._encodings is not None:
             return self._encodings[item]
         elif isinstance(item, slice):
-            return {key: self.data[key][item] for key in self.data.keys()}
+            return {key: self.data[key][item] for key in self.data}
         else:
             raise KeyError(
                 "Invalid key. Only three types of key are available: "
@@ -1846,7 +1846,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         from_pipeline = kwargs.pop("_from_pipeline", None)
         from_auto_class = kwargs.pop("_from_auto", False)
         commit_hash = kwargs.pop("_commit_hash", None)
-        gguf_file = kwargs.get("gguf_file", None)
+        gguf_file = kwargs.get("gguf_file")
 
         if use_auth_token is not None:
             warnings.warn(
@@ -2042,7 +2042,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
         # file or if `from_slow` is set to True.
         from_slow = kwargs.get("from_slow", False)
-        gguf_file = kwargs.get("gguf_file", None)
+        gguf_file = kwargs.get("gguf_file")
         has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
 
         # If one passes a GGUF file path to `gguf_file` there is no need for this check as the tokenizer will be
@@ -2136,7 +2136,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 else:
                     # Fallback: use pattern matching on the string.
                     model_type = None
-                    for pattern in TOKENIZER_MAPPING_NAMES.keys():
+                    for pattern in TOKENIZER_MAPPING_NAMES:
                         if pattern in str(pretrained_model_name_or_path):
                             model_type = pattern
                             break
@@ -2415,7 +2415,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                 FutureWarning,
             )
-            if kwargs.get("token", None) is not None:
+            if kwargs.get("token") is not None:
                 raise ValueError(
                     "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                 )
@@ -2464,7 +2464,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
 
         if len(self.init_inputs) > 0:
             tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
-        for file_id in self.vocab_files_names.keys():
+        for file_id in self.vocab_files_names:
             tokenizer_config.pop(file_id, None)
 
         # no typefields, this way old fast and slow can load it
@@ -3283,7 +3283,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         # If we have a list of dicts, let's convert it in a dict of lists
         # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
         if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
-            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
+            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0]}
 
         # The model's main input name, usually `input_ids`, has been passed for padding
         if self.model_input_names[0] not in encoded_inputs:
diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index 3fecfa0e1d..4ca5bed605 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -134,7 +134,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
             fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
         elif not slow_tokenizer:
             # We tried loading a slow_tokenizer with spm and failed, try to load with tiktoken
-            self.vocab_file = kwargs.get("vocab_file", None)
+            self.vocab_file = kwargs.get("vocab_file")
             self.additional_special_tokens = kwargs.get("additional_special_tokens", [])
             fast_tokenizer = convert_slow_tokenizer(self, from_tiktoken=True)
             slow_tokenizer = None
@@ -583,7 +583,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         # To match each overflowing sample with the original sample in the batch
         # we add an overflow_to_sample_mapping array (see below)
         sanitized_tokens = {}
-        for key in tokens_and_encodings[0][0].keys():
+        for key in tokens_and_encodings[0][0]:
             stack = [e for item, _ in tokens_and_encodings for e in item[key]]
             sanitized_tokens[key] = stack
         sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 52dc9c3557..c0d4c12db5 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -438,7 +438,7 @@ class Trainer:
             logger.info(f"No `TrainingArguments` passed, using `output_dir={output_dir}`.")
             args = TrainingArguments(output_dir=output_dir)
         if args.batch_eval_metrics and compute_metrics is not None:
-            if "compute_result" not in inspect.signature(compute_metrics).parameters.keys():
+            if "compute_result" not in inspect.signature(compute_metrics).parameters:
                 raise ValueError(
                     "When using `batch_eval_metrics`, your `compute_metrics` function must take a `compute_result`"
                     " boolean argument which will be triggered after the last batch of the eval set to signal that the"
@@ -4614,7 +4614,7 @@ class Trainer:
         # For CLIP-like models capable of returning loss values.
         # If `return_loss` is not specified or being `None` in `inputs`, we check if the default value of `return_loss`
         # is `True` in `model.forward`.
-        return_loss = inputs.get("return_loss", None)
+        return_loss = inputs.get("return_loss")
         if return_loss is None:
             return_loss = self.can_return_loss
         loss_without_labels = True if len(self.label_names) == 0 and return_loss else False
@@ -5259,7 +5259,7 @@ class Trainer:
         # some Trainer classes need to use `gather` instead of `gather_for_metrics`, thus we store a flag
         self.gather_function = self.accelerator.gather_for_metrics
 
-        if "use_gather_object" in inspect.signature(self.gather_function).parameters.keys():
+        if "use_gather_object" in inspect.signature(self.gather_function).parameters:
             self.gather_function = functools.partial(
                 self.gather_function, use_gather_object=self.args.eval_use_gather_object
             )
diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py
index 84f65c9544..e93e279052 100644
--- a/src/transformers/trainer_pt_utils.py
+++ b/src/transformers/trainer_pt_utils.py
@@ -1047,7 +1047,7 @@ def log_metrics(self, split, metrics):
 
     print(f"***** {split} metrics *****")
     metrics_formatted = self.metrics_format(metrics)
-    k_width = max(len(str(x)) for x in metrics_formatted.keys())
+    k_width = max(len(str(x)) for x in metrics_formatted)
     v_width = max(len(str(x)) for x in metrics_formatted.values())
     for key in sorted(metrics_formatted.keys()):
         print(f"  {key: <{k_width}} = {metrics_formatted[key]:>{v_width}}")
@@ -1139,9 +1139,7 @@ def get_parameter_names(model, forbidden_layer_types, forbidden_layer_names=None
         ]
     # Add model specific parameters that are not in any child
     result += [
-        k
-        for k in model._parameters.keys()
-        if not any(pattern.search(k.lower()) for pattern in forbidden_layer_patterns)
+        k for k in model._parameters if not any(pattern.search(k.lower()) for pattern in forbidden_layer_patterns)
     ]
 
     return result
@@ -1333,7 +1331,7 @@ class AcceleratorConfig:
         with open_file(json_file, "r", encoding="utf-8") as f:
             config_dict = json.load(f)
         # Check for keys and load sensible defaults
-        extra_keys = sorted(key for key in config_dict.keys() if key not in cls.__dataclass_fields__.keys())
+        extra_keys = sorted(key for key in config_dict if key not in cls.__dataclass_fields__)
         if len(extra_keys) > 0:
             raise ValueError(
                 f"The config file at {json_file} had unknown keys ({extra_keys}), please try upgrading your `transformers`"
diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py
index 317e50fb68..c435c1f6df 100644
--- a/src/transformers/trainer_utils.py
+++ b/src/transformers/trainer_utils.py
@@ -288,9 +288,7 @@ def default_compute_objective(metrics: dict[str, float]) -> float:
     _ = metrics.pop("epoch", None)
     # Remove speed metrics
     speed_metrics = [
-        m
-        for m in metrics.keys()
-        if m.endswith("_runtime") or m.endswith("_per_second") or m.endswith("_compilation_time")
+        m for m in metrics if m.endswith("_runtime") or m.endswith("_per_second") or m.endswith("_compilation_time")
     ]
     for sm in speed_metrics:
         _ = metrics.pop(sm, None)
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 8314e27134..a9bbebab94 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -2532,7 +2532,7 @@ class TrainingArguments:
         converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into *"float32"*
         string, which can then be stored in the json format.
         """
-        if d.get("torch_dtype", None) is not None and not isinstance(d["torch_dtype"], str):
+        if d.get("torch_dtype") is not None and not isinstance(d["torch_dtype"], str):
             d["torch_dtype"] = str(d["torch_dtype"]).split(".")[1]
         for value in d.values():
             if isinstance(value, dict):
diff --git a/src/transformers/utils/attention_visualizer.py b/src/transformers/utils/attention_visualizer.py
index d9a4041f7f..8f5b9bc827 100644
--- a/src/transformers/utils/attention_visualizer.py
+++ b/src/transformers/utils/attention_visualizer.py
@@ -233,7 +233,7 @@ class AttentionMaskVisualizer:
             attention_mask,
             img_token=self.image_token,
             sliding_window=getattr(self.config, "sliding_window", None),
-            token_type_ids=kwargs.get("token_type_ids", None),
+            token_type_ids=kwargs.get("token_type_ids"),
             image_seq_length=image_seq_length,
         )
         print(f_string)
diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py
index e13d3bbe28..335a5d5717 100644
--- a/src/transformers/utils/generic.py
+++ b/src/transformers/utils/generic.py
@@ -830,7 +830,7 @@ def filter_out_non_signature_kwargs(extra: Optional[list] = None):
                     invalid_kwargs[k] = v
 
             if invalid_kwargs:
-                invalid_kwargs_names = [f"'{k}'" for k in invalid_kwargs.keys()]
+                invalid_kwargs_names = [f"'{k}'" for k in invalid_kwargs]
                 invalid_kwargs_names = ", ".join(invalid_kwargs_names)
 
                 # Get the class name for better warning message
@@ -993,7 +993,7 @@ def check_model_inputs(func):
 
     @wraps(func)
     def wrapper(self, *args, **kwargs):
-        use_cache = kwargs.get("use_cache", None)
+        use_cache = kwargs.get("use_cache")
         if use_cache is None:
             use_cache = getattr(self.config, "use_cache", False)
 
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index fb9d56e160..106e8abe11 100644
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -939,9 +939,9 @@ def is_torch_hpu_available():
     original_compile = torch.compile
 
     def hpu_backend_compile(*args, **kwargs):
-        if kwargs.get("backend", None) not in ["hpu_backend", "eager"]:
+        if kwargs.get("backend") not in ["hpu_backend", "eager"]:
             logger.warning(
-                f"Calling torch.compile with backend={kwargs.get('backend', None)} on a Gaudi device is not supported. "
+                f"Calling torch.compile with backend={kwargs.get('backend')} on a Gaudi device is not supported. "
                 "We will override the backend with 'hpu_backend' to avoid errors."
             )
             kwargs["backend"] = "hpu_backend"
@@ -2149,7 +2149,7 @@ class _LazyModule(ModuleType):
         self._object_missing_backend = {}
         self._explicit_import_shortcut = explicit_import_shortcut if explicit_import_shortcut else {}
 
-        if any(isinstance(key, frozenset) for key in import_structure.keys()):
+        if any(isinstance(key, frozenset) for key in import_structure):
             self._modules = set()
             self._class_to_module = {}
             self.__all__ = []
@@ -2247,7 +2247,7 @@ class _LazyModule(ModuleType):
     def __getattr__(self, name: str) -> Any:
         if name in self._objects:
             return self._objects[name]
-        if name in self._object_missing_backend.keys():
+        if name in self._object_missing_backend:
             missing_backends = self._object_missing_backend[name]
 
             class Placeholder(metaclass=DummyObject):
@@ -2271,7 +2271,7 @@ class _LazyModule(ModuleType):
             Placeholder.__module__ = module_name
 
             value = Placeholder
-        elif name in self._class_to_module.keys():
+        elif name in self._class_to_module:
             try:
                 module = self._get_module(self._class_to_module[name])
                 value = getattr(module, name)
@@ -2726,7 +2726,7 @@ def spread_import_structure(nested_import_structure):
             if not isinstance(_value, dict):
                 frozenset_first_import_structure[_key] = _value
 
-            elif any(isinstance(v, frozenset) for v in _value.keys()):
+            elif any(isinstance(v, frozenset) for v in _value):
                 for k, v in _value.items():
                     if isinstance(k, frozenset):
                         # Here we want to switch around _key and k to propagate k upstream if it is a frozenset
diff --git a/src/transformers/utils/notebook.py b/src/transformers/utils/notebook.py
index 42e24c7781..b270a14105 100644
--- a/src/transformers/utils/notebook.py
+++ b/src/transformers/utils/notebook.py
@@ -244,7 +244,7 @@ class NotebookTrainingTracker(NotebookProgressBar):
             self.inner_table = [list(values.keys()), list(values.values())]
         else:
             columns = self.inner_table[0]
-            for key in values.keys():
+            for key in values:
                 if key not in columns:
                     columns.append(key)
             self.inner_table[0] = columns
@@ -258,7 +258,7 @@ class NotebookTrainingTracker(NotebookProgressBar):
                     # update last line
                     new_values = values
                     for c in columns:
-                        if c not in new_values.keys():
+                        if c not in new_values:
                             new_values[c] = last_values[columns.index(c)]
                     self.inner_table[-1] = [new_values[c] for c in columns]
             else:
diff --git a/src/transformers/video_processing_utils.py b/src/transformers/video_processing_utils.py
index 0db55024e6..b59b387485 100644
--- a/src/transformers/video_processing_utils.py
+++ b/src/transformers/video_processing_utils.py
@@ -553,7 +553,7 @@ class BaseVideoProcessor(BaseImageProcessorFast):
                 "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                 FutureWarning,
             )
-            if kwargs.get("token", None) is not None:
+            if kwargs.get("token") is not None:
                 raise ValueError(
                     "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                 )
diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py
index dc087559d4..a31c23adfe 100644
--- a/tests/extended/test_trainer_ext.py
+++ b/tests/extended/test_trainer_ext.py
@@ -83,7 +83,7 @@ class TestTrainerExt(TestCasePlus):
         if not do_eval:
             self.skipTest(reason="do_eval is False")
 
-        eval_metrics = [log for log in logs if "eval_loss" in log.keys()]
+        eval_metrics = [log for log in logs if "eval_loss" in log]
 
         first_step_stats = eval_metrics[0]
         if predict_with_generate:
@@ -168,7 +168,7 @@ class TestTrainerExt(TestCasePlus):
 
         # Check metrics
         logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history
-        eval_metrics = [log for log in logs if "eval_loss" in log.keys()]
+        eval_metrics = [log for log in logs if "eval_loss" in log]
         first_step_stats = eval_metrics[0]
         last_step_stats = eval_metrics[-1]
 
diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index 0f94837b78..d73bfa2274 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -1440,7 +1440,7 @@ class GenerationTesterMixin:
         #   added support for it yet. We skip these models for now.
         has_encoder_attributes = any(
             attr_name
-            for attr_name in config.to_dict().keys()
+            for attr_name in config.to_dict()
             if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size"
         )
         if has_encoder_attributes:
@@ -1644,7 +1644,7 @@ class GenerationTesterMixin:
             config.is_decoder = True
 
             model = model_class(config).to(torch_device).eval()
-            if "inputs_embeds" not in inspect.signature(model.prepare_inputs_for_generation).parameters.keys():
+            if "inputs_embeds" not in inspect.signature(model.prepare_inputs_for_generation).parameters:
                 continue
 
             #  No easy fix, let's skip the test for now
@@ -1701,7 +1701,7 @@ class GenerationTesterMixin:
             # Skip models without explicit support
             model = model_class(config).to(torch_device).eval()
             set_model_for_less_flaky_test(model)
-            if "inputs_embeds" not in inspect.signature(model.prepare_inputs_for_generation).parameters.keys():
+            if "inputs_embeds" not in inspect.signature(model.prepare_inputs_for_generation).parameters:
                 continue
 
             # There are a few exception patterns in this test:
@@ -1773,7 +1773,7 @@ class GenerationTesterMixin:
                 self.skipTest(reason="This model is encoder-decoder and has Encoder-Decoder Cache")
 
             model = model_class(config).to(torch_device).eval()
-            if "inputs_embeds" not in inspect.signature(model.prepare_inputs_for_generation).parameters.keys():
+            if "inputs_embeds" not in inspect.signature(model.prepare_inputs_for_generation).parameters:
                 self.skipTest(reason="This model does not support `inputs_embeds` in generation")
 
             input_ids = inputs_dict.pop("input_ids")
@@ -1929,7 +1929,7 @@ class GenerationTesterMixin:
 
             model = model_class(config).to(torch_device).eval()
 
-            if "inputs_embeds" not in inspect.signature(model.prepare_inputs_for_generation).parameters.keys():
+            if "inputs_embeds" not in inspect.signature(model.prepare_inputs_for_generation).parameters:
                 self.skipTest(reason="This model does not support `inputs_embeds` in generation")
 
             # If "past_key_values" is not returned, skip the test (e.g. RWKV uses a different cache name and format)
diff --git a/tests/models/align/test_modeling_align.py b/tests/models/align/test_modeling_align.py
index 4e897a554c..167cb1ff7c 100644
--- a/tests/models/align/test_modeling_align.py
+++ b/tests/models/align/test_modeling_align.py
@@ -562,8 +562,8 @@ class AlignModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
diff --git a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py
index 73e0d1df91..9205e5f4f2 100644
--- a/tests/models/align/test_processor_align.py
+++ b/tests/models/align/test_processor_align.py
@@ -135,7 +135,7 @@ class AlignProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_image_proc = image_processor(image_input, return_tensors="np")
         input_processor = processor(images=image_input, return_tensors="np")
 
-        for key in input_image_proc.keys():
+        for key in input_image_proc:
             self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -149,7 +149,7 @@ class AlignProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         encoded_processor = processor(text=input_str)
 
         encoded_tok = tokenizer(input_str, padding="max_length", max_length=64)
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_processor(self):
diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py
index d1656d06fe..2a36470051 100755
--- a/tests/models/altclip/test_modeling_altclip.py
+++ b/tests/models/altclip/test_modeling_altclip.py
@@ -532,8 +532,8 @@ class AltCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
diff --git a/tests/models/bamba/test_modeling_bamba.py b/tests/models/bamba/test_modeling_bamba.py
index cf660b7fa0..245163d672 100644
--- a/tests/models/bamba/test_modeling_bamba.py
+++ b/tests/models/bamba/test_modeling_bamba.py
@@ -448,7 +448,7 @@ class BambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
         #   added support for it yet. We skip these models for now.
         has_encoder_attributes = any(
             attr_name
-            for attr_name in config.to_dict().keys()
+            for attr_name in config.to_dict()
             if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size"
         )
         if has_encoder_attributes:
diff --git a/tests/models/bark/test_processor_bark.py b/tests/models/bark/test_processor_bark.py
index 15b0871d81..447d38b956 100644
--- a/tests/models/bark/test_processor_bark.py
+++ b/tests/models/bark/test_processor_bark.py
@@ -123,5 +123,5 @@ class BarkProcessorTest(unittest.TestCase):
             return_token_type_ids=False,
         )
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key].squeeze().tolist())
diff --git a/tests/models/biogpt/test_modeling_biogpt.py b/tests/models/biogpt/test_modeling_biogpt.py
index 232f7176b2..26f2053a93 100644
--- a/tests/models/biogpt/test_modeling_biogpt.py
+++ b/tests/models/biogpt/test_modeling_biogpt.py
@@ -224,7 +224,7 @@ class BioGptModelTester:
     def create_and_check_biogpt_weight_initialization(self, config, *args):
         model = BioGptModel(config)
         model_std = model.config.initializer_range / math.sqrt(2 * model.config.num_hidden_layers)
-        for key in model.state_dict().keys():
+        for key in model.state_dict():
             if "c_proj" in key and "weight" in key:
                 self.parent.assertLessEqual(abs(torch.std(model.state_dict()[key]) - model_std), 0.001)
                 self.parent.assertLessEqual(abs(torch.mean(model.state_dict()[key]) - 0.0), 0.01)
diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py
index d340d3e569..e8cdb26cc4 100644
--- a/tests/models/blip/test_modeling_blip.py
+++ b/tests/models/blip/test_modeling_blip.py
@@ -536,8 +536,8 @@ class BlipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
@@ -1056,8 +1056,8 @@ class BlipTextRetrievalModelTest(ModelTesterMixin, unittest.TestCase):
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
@@ -1274,8 +1274,8 @@ class BlipTextImageModelTest(ModelTesterMixin, unittest.TestCase):
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
diff --git a/tests/models/blip/test_processor_blip.py b/tests/models/blip/test_processor_blip.py
index 6ff81e991a..254eede275 100644
--- a/tests/models/blip/test_processor_blip.py
+++ b/tests/models/blip/test_processor_blip.py
@@ -81,7 +81,7 @@ class BlipProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_feat_extract = image_processor(image_input, return_tensors="np")
         input_processor = processor(images=image_input, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -96,7 +96,7 @@ class BlipProcessorTest(ProcessorTesterMixin, unittest.TestCase):
 
         encoded_tok = tokenizer(input_str, return_token_type_ids=False)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_processor(self):
diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py
index d5606ac705..952595bbe6 100644
--- a/tests/models/blip_2/test_modeling_blip_2.py
+++ b/tests/models/blip_2/test_modeling_blip_2.py
@@ -627,7 +627,7 @@ class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationT
         #   added support for it yet. We skip these models for now.
         has_encoder_attributes = any(
             attr_name
-            for attr_name in config.to_dict().keys()
+            for attr_name in config.to_dict()
             if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size"
         )
         if has_encoder_attributes:
diff --git a/tests/models/blip_2/test_processor_blip_2.py b/tests/models/blip_2/test_processor_blip_2.py
index d2b63f8e40..a24ccc4aeb 100644
--- a/tests/models/blip_2/test_processor_blip_2.py
+++ b/tests/models/blip_2/test_processor_blip_2.py
@@ -84,7 +84,7 @@ class Blip2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_feat_extract = image_processor(image_input, return_tensors="np")
         input_processor = processor(images=image_input, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_processor(self):
diff --git a/tests/models/bloom/test_modeling_bloom.py b/tests/models/bloom/test_modeling_bloom.py
index 787a99c932..0c685f0536 100644
--- a/tests/models/bloom/test_modeling_bloom.py
+++ b/tests/models/bloom/test_modeling_bloom.py
@@ -290,7 +290,7 @@ class BloomModelTester:
     def create_and_check_bloom_weight_initialization(self, config, *args):
         model = BloomModel(config)
         model_std = model.config.initializer_range / math.sqrt(2 * model.config.n_layer)
-        for key in model.state_dict().keys():
+        for key in model.state_dict():
             if "c_proj" in key and "weight" in key:
                 self.parent.assertLessEqual(abs(torch.std(model.state_dict()[key]) - model_std), 0.001)
                 self.parent.assertLessEqual(abs(torch.mean(model.state_dict()[key]) - 0.0), 0.01)
@@ -747,7 +747,7 @@ class BloomEmbeddingTest(unittest.TestCase):
             output_dict["max"][idx] = embeddings.max(dim=-1).values[0][i].item()
             output_dict["mean"][idx] = embeddings.mean(dim=-1)[0][i].item()
 
-        for key in TEST_EMBEDDINGS[str(model.dtype)].keys():
+        for key in TEST_EMBEDDINGS[str(model.dtype)]:
             self.assertDictEqual(TEST_EMBEDDINGS[str(model.dtype)][key], output_dict[key])
 
         output_dict_norm = {"min": {}, "max": {}, "mean": {}}
diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py
index 7ec9dd42dc..dc8e9a145b 100644
--- a/tests/models/chinese_clip/test_modeling_chinese_clip.py
+++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py
@@ -649,8 +649,8 @@ class ChineseCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
diff --git a/tests/models/chinese_clip/test_processor_chinese_clip.py b/tests/models/chinese_clip/test_processor_chinese_clip.py
index 3c2a2247c5..83f68c0361 100644
--- a/tests/models/chinese_clip/test_processor_chinese_clip.py
+++ b/tests/models/chinese_clip/test_processor_chinese_clip.py
@@ -155,7 +155,7 @@ class ChineseCLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_feat_extract = image_processor(image_input, return_tensors="np")
         input_processor = processor(images=image_input, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -170,7 +170,7 @@ class ChineseCLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase):
 
         encoded_tok = tokenizer(input_str)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_processor(self):
diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py
index 22579fa99f..0dab34123d 100644
--- a/tests/models/clap/test_modeling_clap.py
+++ b/tests/models/clap/test_modeling_clap.py
@@ -594,8 +594,8 @@ class ClapModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
diff --git a/tests/models/clap/test_processor_clap.py b/tests/models/clap/test_processor_clap.py
index 49e9972ea0..43192cee2e 100644
--- a/tests/models/clap/test_processor_clap.py
+++ b/tests/models/clap/test_processor_clap.py
@@ -81,7 +81,7 @@ class ClapProcessorTest(unittest.TestCase):
         input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
         input_processor = processor(audios=raw_speech, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -96,7 +96,7 @@ class ClapProcessorTest(unittest.TestCase):
 
         encoded_tok = tokenizer(input_str)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_tokenizer_decode(self):
diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py
index 92de6d4583..ad7d817f96 100644
--- a/tests/models/clip/test_modeling_clip.py
+++ b/tests/models/clip/test_modeling_clip.py
@@ -641,8 +641,8 @@ class CLIPModelTest(CLIPModelTesterMixin, PipelineTesterMixin, unittest.TestCase
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
diff --git a/tests/models/clip/test_processor_clip.py b/tests/models/clip/test_processor_clip.py
index 1b6eed7534..bb7fae4a86 100644
--- a/tests/models/clip/test_processor_clip.py
+++ b/tests/models/clip/test_processor_clip.py
@@ -137,7 +137,7 @@ class CLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_image_proc = image_processor(image_input, return_tensors="np")
         input_processor = processor(images=image_input, return_tensors="np")
 
-        for key in input_image_proc.keys():
+        for key in input_image_proc:
             self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -152,7 +152,7 @@ class CLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase):
 
         encoded_tok = tokenizer(input_str)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_processor(self):
diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py
index dcbaeeb661..08a21f9dcf 100644
--- a/tests/models/clipseg/test_modeling_clipseg.py
+++ b/tests/models/clipseg/test_modeling_clipseg.py
@@ -562,8 +562,8 @@ class CLIPSegModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
diff --git a/tests/models/clipseg/test_processor_clipseg.py b/tests/models/clipseg/test_processor_clipseg.py
index 5147ed3975..f7255838ca 100644
--- a/tests/models/clipseg/test_processor_clipseg.py
+++ b/tests/models/clipseg/test_processor_clipseg.py
@@ -128,7 +128,7 @@ class CLIPSegProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_feat_extract = image_processor(image_input, return_tensors="np")
         input_processor = processor(images=image_input, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -143,7 +143,7 @@ class CLIPSegProcessorTest(ProcessorTesterMixin, unittest.TestCase):
 
         encoded_tok = tokenizer(input_str)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_processor_text(self):
diff --git a/tests/models/clvp/test_processor_clvp.py b/tests/models/clvp/test_processor_clvp.py
index f751ab92d0..817cbbff88 100644
--- a/tests/models/clvp/test_processor_clvp.py
+++ b/tests/models/clvp/test_processor_clvp.py
@@ -71,7 +71,7 @@ class ClvpProcessorTest(unittest.TestCase):
         input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
         input_processor = processor(raw_speech=raw_speech, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     # Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.test_tokenizer with Whisper->Clvp
@@ -87,7 +87,7 @@ class ClvpProcessorTest(unittest.TestCase):
 
         encoded_tok = tokenizer(input_str)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     # Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.test_tokenizer_decode with Whisper->Clvp
diff --git a/tests/models/csm/test_processor_csm.py b/tests/models/csm/test_processor_csm.py
index 2abb7eb2d6..910a0e7868 100644
--- a/tests/models/csm/test_processor_csm.py
+++ b/tests/models/csm/test_processor_csm.py
@@ -57,7 +57,7 @@ class CsmProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
         processor_dict_loaded = json.loads(processor_loaded.to_json_string())
         # chat templates aren't serialized to json in processors
-        self.assertFalse("chat_template" in processor_dict_loaded.keys())
+        self.assertFalse("chat_template" in processor_dict_loaded)
 
         # they have to be saved as separate file and loaded back from that file
         # so we check if the same template is loaded
diff --git a/tests/models/d_fine/test_modeling_d_fine.py b/tests/models/d_fine/test_modeling_d_fine.py
index b26db579d0..94429260ec 100644
--- a/tests/models/d_fine/test_modeling_d_fine.py
+++ b/tests/models/d_fine/test_modeling_d_fine.py
@@ -647,7 +647,7 @@ class DFineModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
             # Skip the check for the backbone
             for name, module in model.named_modules():
                 if module.__class__.__name__ == "DFineConvEncoder":
-                    backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()]
+                    backbone_params = [f"{name}.{key}" for key in module.state_dict()]
                     break
 
             for name, param in model.named_parameters():
diff --git a/tests/models/dac/test_modeling_dac.py b/tests/models/dac/test_modeling_dac.py
index 93f61f4186..bfd6e7416b 100644
--- a/tests/models/dac/test_modeling_dac.py
+++ b/tests/models/dac/test_modeling_dac.py
@@ -228,8 +228,8 @@ class DacModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py
index fc30b10e14..df4ebd3614 100644
--- a/tests/models/deformable_detr/test_modeling_deformable_detr.py
+++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py
@@ -770,7 +770,7 @@ class DeformableDetrModelIntegrationTests(unittest.TestCase):
 
         # 3. assert equivalence
         # (on A10, the differences get larger than on T4)
-        for key in cpu_outputs.keys():
+        for key in cpu_outputs:
             torch.testing.assert_close(cpu_outputs[key], gpu_outputs[key].cpu(), atol=2e-2, rtol=2e-2)
 
         expected_logits = torch.tensor(
diff --git a/tests/models/dia/test_processor_dia.py b/tests/models/dia/test_processor_dia.py
index 8ce15f4330..b015e3c197 100644
--- a/tests/models/dia/test_processor_dia.py
+++ b/tests/models/dia/test_processor_dia.py
@@ -135,7 +135,7 @@ class DiaProcessorTest(unittest.TestCase):
         input_tokenizer = tokenizer(random_text, padding=True, return_tensors="pt")
         input_processor = self.processor(random_text)
 
-        for key in input_tokenizer.keys():
+        for key in input_tokenizer:
             self.assertTrue((input_tokenizer[key] == input_processor[key]).all())
 
     def test_no_audio(self):
diff --git a/tests/models/dpt/test_modeling_dpt.py b/tests/models/dpt/test_modeling_dpt.py
index eb968ad9f6..8b47a18a71 100644
--- a/tests/models/dpt/test_modeling_dpt.py
+++ b/tests/models/dpt/test_modeling_dpt.py
@@ -268,7 +268,7 @@ class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
             backbone_params = []
             for name, module in model.named_modules():
                 if module.__class__.__name__ == "DPTViTHybridEmbeddings":
-                    backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()]
+                    backbone_params = [f"{name}.{key}" for key in module.state_dict()]
                     break
 
             for name, param in model.named_parameters():
diff --git a/tests/models/dpt/test_modeling_dpt_auto_backbone.py b/tests/models/dpt/test_modeling_dpt_auto_backbone.py
index 1505be27cf..b9068631bd 100644
--- a/tests/models/dpt/test_modeling_dpt_auto_backbone.py
+++ b/tests/models/dpt/test_modeling_dpt_auto_backbone.py
@@ -205,7 +205,7 @@ class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
             backbone_params = []
             for name, module in model.named_modules():
                 if module.__class__.__name__ == "DPTViTHybridEmbeddings":
-                    backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()]
+                    backbone_params = [f"{name}.{key}" for key in module.state_dict()]
                     break
 
             for name, param in model.named_parameters():
diff --git a/tests/models/dpt/test_modeling_dpt_hybrid.py b/tests/models/dpt/test_modeling_dpt_hybrid.py
index 79cad886db..e7a184c400 100644
--- a/tests/models/dpt/test_modeling_dpt_hybrid.py
+++ b/tests/models/dpt/test_modeling_dpt_hybrid.py
@@ -281,7 +281,7 @@ class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
             backbone_params = []
             for name, module in model.named_modules():
                 if module.__class__.__name__ == "DPTViTHybridEmbeddings":
-                    backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()]
+                    backbone_params = [f"{name}.{key}" for key in module.state_dict()]
                     break
 
             for name, param in model.named_parameters():
diff --git a/tests/models/encodec/test_modeling_encodec.py b/tests/models/encodec/test_modeling_encodec.py
index 157e174631..6dae1643d2 100644
--- a/tests/models/encodec/test_modeling_encodec.py
+++ b/tests/models/encodec/test_modeling_encodec.py
@@ -258,8 +258,8 @@ class EncodecModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
diff --git a/tests/models/falcon_h1/test_modeling_falcon_h1.py b/tests/models/falcon_h1/test_modeling_falcon_h1.py
index 5c538ac4b4..37afc2cceb 100644
--- a/tests/models/falcon_h1/test_modeling_falcon_h1.py
+++ b/tests/models/falcon_h1/test_modeling_falcon_h1.py
@@ -460,7 +460,7 @@ class FalconH1ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
         #   added support for it yet. We skip these models for now.
         has_encoder_attributes = any(
             attr_name
-            for attr_name in config.to_dict().keys()
+            for attr_name in config.to_dict()
             if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size"
         )
         if has_encoder_attributes:
diff --git a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py
index 855d0f2103..9686b1660f 100644
--- a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py
+++ b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py
@@ -292,7 +292,7 @@ class FalconMambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest
         """
         if isinstance(member, torch.Tensor):
             max_value, min_value = member.max().item(), member.min().item()
-        elif isinstance(member, list) or isinstance(member, tuple):
+        elif isinstance(member, (list, tuple)):
             max_value, min_value = max(member), min(member)
 
         if not isinstance(container, list):
diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py
index f5cafdc957..896ce25695 100644
--- a/tests/models/flava/test_modeling_flava.py
+++ b/tests/models/flava/test_modeling_flava.py
@@ -1001,8 +1001,8 @@ class FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
             loaded_model_state_dict.pop("text_model.embeddings.token_type_ids", None)
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
diff --git a/tests/models/flava/test_processor_flava.py b/tests/models/flava/test_processor_flava.py
index 8489322efd..afba82b383 100644
--- a/tests/models/flava/test_processor_flava.py
+++ b/tests/models/flava/test_processor_flava.py
@@ -144,7 +144,7 @@ class FlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_feat_extract = image_processor(image_input, return_tensors="np")
         input_processor = processor(images=image_input, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
         # With rest of the args
@@ -157,7 +157,7 @@ class FlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
             images=image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="np"
         )
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -172,7 +172,7 @@ class FlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
 
         encoded_tok = tokenizer(input_str)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_processor(self):
diff --git a/tests/models/gemma3n/test_processing_gemma3n.py b/tests/models/gemma3n/test_processing_gemma3n.py
index ffedb2b98a..5590be01ba 100644
--- a/tests/models/gemma3n/test_processing_gemma3n.py
+++ b/tests/models/gemma3n/test_processing_gemma3n.py
@@ -116,7 +116,7 @@ class Gemma3nProcessorTest(unittest.TestCase):
         input_image_processor = image_processor(raw_image, return_tensors="pt")
         input_processor = processor(text="Describe:", images=raw_image, return_tensors="pt")
 
-        for key in input_image_processor.keys():
+        for key in input_image_processor:
             self.assertAlmostEqual(input_image_processor[key].sum(), input_processor[key].sum(), delta=1e-2)
             if "pixel_values" in key:
                 # NOTE: all images should be re-scaled to 768x768
@@ -135,7 +135,7 @@ class Gemma3nProcessorTest(unittest.TestCase):
         input_feat_extract = feature_extractor(raw_speech, return_tensors="pt")
         input_processor = processor(text="Transcribe:", audio=raw_speech, return_tensors="pt")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -152,7 +152,7 @@ class Gemma3nProcessorTest(unittest.TestCase):
 
         encoded_tok = tokenizer(input_str)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key][0])
 
     def test_tokenizer_decode(self):
diff --git a/tests/models/git/test_processor_git.py b/tests/models/git/test_processor_git.py
index c15301a587..65b7819319 100644
--- a/tests/models/git/test_processor_git.py
+++ b/tests/models/git/test_processor_git.py
@@ -83,7 +83,7 @@ class GitProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_feat_extract = image_processor(image_input, return_tensors="np")
         input_processor = processor(images=image_input, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -98,7 +98,7 @@ class GitProcessorTest(ProcessorTesterMixin, unittest.TestCase):
 
         encoded_tok = tokenizer(input_str, return_token_type_ids=False)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_processor(self):
diff --git a/tests/models/gpt2/test_modeling_gpt2.py b/tests/models/gpt2/test_modeling_gpt2.py
index d0796468c3..5155f6d9a0 100644
--- a/tests/models/gpt2/test_modeling_gpt2.py
+++ b/tests/models/gpt2/test_modeling_gpt2.py
@@ -419,7 +419,7 @@ class GPT2ModelTester:
     def create_and_check_gpt2_weight_initialization(self, config, *args):
         model = GPT2Model(config)
         model_std = model.config.initializer_range / math.sqrt(2 * model.config.n_layer)
-        for key in model.state_dict().keys():
+        for key in model.state_dict():
             if "c_proj" in key and "weight" in key:
                 self.parent.assertLessEqual(abs(torch.std(model.state_dict()[key]) - model_std), 0.001)
                 self.parent.assertLessEqual(abs(torch.mean(model.state_dict()[key]) - 0.0), 0.01)
diff --git a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py
index 26737a6473..21d1764c76 100644
--- a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py
+++ b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py
@@ -344,7 +344,7 @@ class GPTBigCodeModelTester:
     def create_and_check_gpt_bigcode_weight_initialization(self, config, *args):
         model = GPTBigCodeModel(config)
         model_std = model.config.initializer_range / math.sqrt(2 * model.config.n_layer)
-        for key in model.state_dict().keys():
+        for key in model.state_dict():
             if "c_proj" in key and "weight" in key:
                 self.parent.assertLessEqual(abs(torch.std(model.state_dict()[key]) - model_std), 0.001)
                 self.parent.assertLessEqual(abs(torch.mean(model.state_dict()[key]) - 0.0), 0.01)
diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py
index 8d94a42100..041d9af20b 100644
--- a/tests/models/grounding_dino/test_modeling_grounding_dino.py
+++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py
@@ -763,7 +763,7 @@ class GroundingDinoModelIntegrationTests(unittest.TestCase):
             gpu_outputs = model(**encoding)
 
         # 3. assert equivalence
-        for key in cpu_outputs.keys():
+        for key in cpu_outputs:
             torch.testing.assert_close(cpu_outputs[key], gpu_outputs[key].cpu(), rtol=1e-3, atol=1e-3)
 
         expected_logits = torch.tensor(
diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py
index 35b77c39f2..89f14153b1 100644
--- a/tests/models/grounding_dino/test_processor_grounding_dino.py
+++ b/tests/models/grounding_dino/test_processor_grounding_dino.py
@@ -206,7 +206,7 @@ class GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_image_proc = image_processor(image_input, return_tensors="np")
         input_processor = processor(images=image_input, return_tensors="np")
 
-        for key in input_image_proc.keys():
+        for key in input_image_proc:
             self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_tokenizer with CLIP->GroundingDino
@@ -222,7 +222,7 @@ class GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
 
         encoded_tok = tokenizer(input_str)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_processor(self):
diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py
index 73af5e4b4b..a4d521ff2a 100644
--- a/tests/models/groupvit/test_modeling_groupvit.py
+++ b/tests/models/groupvit/test_modeling_groupvit.py
@@ -635,8 +635,8 @@ class GroupViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py
index fe309e700c..fae6092eca 100644
--- a/tests/models/instructblip/test_modeling_instructblip.py
+++ b/tests/models/instructblip/test_modeling_instructblip.py
@@ -597,7 +597,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene
         #   added support for it yet. We skip these models for now.
         has_encoder_attributes = any(
             attr_name
-            for attr_name in config.to_dict().keys()
+            for attr_name in config.to_dict()
             if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size"
         )
         if has_encoder_attributes:
diff --git a/tests/models/instructblip/test_processor_instructblip.py b/tests/models/instructblip/test_processor_instructblip.py
index 984826aaa2..82da35ce79 100644
--- a/tests/models/instructblip/test_processor_instructblip.py
+++ b/tests/models/instructblip/test_processor_instructblip.py
@@ -107,7 +107,7 @@ class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_feat_extract = image_processor(image_input, return_tensors="np")
         input_processor = processor(images=image_input, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_processor(self):
diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py
index 5f056d4464..129a2d045c 100644
--- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py
+++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py
@@ -609,7 +609,7 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
         #   added support for it yet. We skip these models for now.
         has_encoder_attributes = any(
             attr_name
-            for attr_name in config.to_dict().keys()
+            for attr_name in config.to_dict()
             if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size"
         )
         if has_encoder_attributes:
diff --git a/tests/models/instructblipvideo/test_processor_instructblipvideo.py b/tests/models/instructblipvideo/test_processor_instructblipvideo.py
index 3d3633365a..2c1b897dd1 100644
--- a/tests/models/instructblipvideo/test_processor_instructblipvideo.py
+++ b/tests/models/instructblipvideo/test_processor_instructblipvideo.py
@@ -110,7 +110,7 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_feat_extract = video_processor(image_input, return_tensors="pt")
         input_processor = processor(images=image_input, return_tensors="pt")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -131,10 +131,10 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         encoded_tokens = tokenizer(input_str, return_token_type_ids=False)
         encoded_tokens_qformer = qformer_tokenizer(input_str, return_token_type_ids=False)
 
-        for key in encoded_tokens.keys():
+        for key in encoded_tokens:
             self.assertListEqual(encoded_tokens[key], encoded_processor[key])
 
-        for key in encoded_tokens_qformer.keys():
+        for key in encoded_tokens_qformer:
             self.assertListEqual(encoded_tokens_qformer[key], encoded_processor["qformer_" + key])
 
     def test_processor(self):
diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py
index 6bb86406ce..6e58c13f03 100644
--- a/tests/models/kosmos2/test_modeling_kosmos2.py
+++ b/tests/models/kosmos2/test_modeling_kosmos2.py
@@ -579,8 +579,8 @@ class Kosmos2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
diff --git a/tests/models/kosmos2/test_processor_kosmos2.py b/tests/models/kosmos2/test_processor_kosmos2.py
index 587db26ef1..6f539e866b 100644
--- a/tests/models/kosmos2/test_processor_kosmos2.py
+++ b/tests/models/kosmos2/test_processor_kosmos2.py
@@ -135,7 +135,7 @@ class Kosmos2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_image_processor = image_processor(image_input, return_tensors="np")
         input_processor = processor(images=image_input, return_tensors="np")
 
-        for key in input_image_processor.keys():
+        for key in input_image_processor:
             self.assertAlmostEqual(input_image_processor[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -150,7 +150,7 @@ class Kosmos2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
 
         encoded_tok = tokenizer(input_str, return_token_type_ids=False)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_processor(self):
diff --git a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py
index 6be8879afb..35898887fa 100644
--- a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py
+++ b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py
@@ -390,7 +390,7 @@ class KyutaiSpeechToTextModelTest(ModelTesterMixin, GenerationTesterMixin, Pipel
         #   added support for it yet. We skip these models for now.
         has_encoder_attributes = any(
             attr_name
-            for attr_name in config.to_dict().keys()
+            for attr_name in config.to_dict()
             if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size"
         )
         if has_encoder_attributes:
diff --git a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py
index 6eb8abf0b5..ab34b1007d 100644
--- a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py
@@ -860,7 +860,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                     words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding="longest"
                 )
-                for key in encoded_sequences_batch_padded_1.keys():
+                for key in encoded_sequences_batch_padded_1:
                     self.assertListEqual(
                         encoded_sequences_batch_padded_1[key],
                         encoded_sequences_batch_padded_2[key],
@@ -873,7 +873,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                     words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding=False
                 )
-                for key in encoded_sequences_batch_padded_1.keys():
+                for key in encoded_sequences_batch_padded_1:
                     self.assertListEqual(
                         encoded_sequences_batch_padded_1[key],
                         encoded_sequences_batch_padded_2[key],
@@ -1358,7 +1358,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                     add_special_tokens=True,
                 )
 
-                for key in tokens_p.keys():
+                for key in tokens_p:
                     self.assertEqual(tokens_r[key], tokens_p[key])
 
                 if "token_type_ids" in tokens_r:
@@ -1389,7 +1389,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 # encode_plus()
                 no_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=False)
                 with_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=True)
-                for key in no_special_tokens.keys():
+                for key in no_special_tokens:
                     self.assertEqual(
                         len(no_special_tokens[key]),
                         len(with_special_tokens[key]) - simple_num_special_tokens_to_add,
@@ -1400,7 +1400,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 
                 no_special_tokens = tokenizer_r.batch_encode_plus(words, boxes=boxes, add_special_tokens=False)
                 with_special_tokens = tokenizer_r.batch_encode_plus(words, boxes=boxes, add_special_tokens=True)
-                for key in no_special_tokens.keys():
+                for key in no_special_tokens:
                     for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]):
                         self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add)
 
diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
index deee8d31d2..fcc997c769 100644
--- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
+++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
@@ -753,7 +753,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                     words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding="longest"
                 )
-                for key in encoded_sequences_batch_padded_1.keys():
+                for key in encoded_sequences_batch_padded_1:
                     self.assertListEqual(
                         encoded_sequences_batch_padded_1[key],
                         encoded_sequences_batch_padded_2[key],
@@ -766,7 +766,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                     words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding=False
                 )
-                for key in encoded_sequences_batch_padded_1.keys():
+                for key in encoded_sequences_batch_padded_1:
                     self.assertListEqual(
                         encoded_sequences_batch_padded_1[key],
                         encoded_sequences_batch_padded_2[key],
@@ -1249,7 +1249,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                     add_special_tokens=True,
                 )
 
-                for key in tokens_p.keys():
+                for key in tokens_p:
                     self.assertEqual(tokens_r[key], tokens_p[key])
 
                 if "token_type_ids" in tokens_r:
@@ -1280,7 +1280,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 # encode_plus()
                 no_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=False)
                 with_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=True)
-                for key in no_special_tokens.keys():
+                for key in no_special_tokens:
                     self.assertEqual(
                         len(no_special_tokens[key]),
                         len(with_special_tokens[key]) - simple_num_special_tokens_to_add,
@@ -1291,7 +1291,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 
                 no_special_tokens = tokenizer_r.batch_encode_plus(words, boxes=boxes, add_special_tokens=False)
                 with_special_tokens = tokenizer_r.batch_encode_plus(words, boxes=boxes, add_special_tokens=True)
-                for key in no_special_tokens.keys():
+                for key in no_special_tokens:
                     for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]):
                         self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add)
 
diff --git a/tests/models/layoutxlm/test_tokenization_layoutxlm.py b/tests/models/layoutxlm/test_tokenization_layoutxlm.py
index 056726f004..506ddd1f24 100644
--- a/tests/models/layoutxlm/test_tokenization_layoutxlm.py
+++ b/tests/models/layoutxlm/test_tokenization_layoutxlm.py
@@ -791,7 +791,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                     words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding="longest"
                 )
-                for key in encoded_sequences_batch_padded_1.keys():
+                for key in encoded_sequences_batch_padded_1:
                     self.assertListEqual(
                         encoded_sequences_batch_padded_1[key],
                         encoded_sequences_batch_padded_2[key],
@@ -804,7 +804,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                     words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding=False
                 )
-                for key in encoded_sequences_batch_padded_1.keys():
+                for key in encoded_sequences_batch_padded_1:
                     self.assertListEqual(
                         encoded_sequences_batch_padded_1[key],
                         encoded_sequences_batch_padded_2[key],
@@ -1287,7 +1287,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                     add_special_tokens=True,
                 )
 
-                for key in tokens_p.keys():
+                for key in tokens_p:
                     self.assertEqual(tokens_r[key], tokens_p[key])
 
                 if "token_type_ids" in tokens_r:
@@ -1318,7 +1318,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 # encode_plus()
                 no_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=False)
                 with_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=True)
-                for key in no_special_tokens.keys():
+                for key in no_special_tokens:
                     self.assertEqual(
                         len(no_special_tokens[key]),
                         len(with_special_tokens[key]) - simple_num_special_tokens_to_add,
@@ -1329,7 +1329,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 
                 no_special_tokens = tokenizer_r.batch_encode_plus(words, boxes=boxes, add_special_tokens=False)
                 with_special_tokens = tokenizer_r.batch_encode_plus(words, boxes=boxes, add_special_tokens=True)
-                for key in no_special_tokens.keys():
+                for key in no_special_tokens:
                     for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]):
                         self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add)
 
diff --git a/tests/models/llava/test_processor_llava.py b/tests/models/llava/test_processor_llava.py
index 41b9d8a09e..7e3be530d4 100644
--- a/tests/models/llava/test_processor_llava.py
+++ b/tests/models/llava/test_processor_llava.py
@@ -77,7 +77,7 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
         processor_dict_loaded = json.loads(processor_loaded.to_json_string())
         # chat templates aren't serialized to json in processors
-        self.assertFalse("chat_template" in processor_dict_loaded.keys())
+        self.assertFalse("chat_template" in processor_dict_loaded)
 
         # they have to be saved as separate file and loaded back from that file
         # so we check if the same template is loaded
diff --git a/tests/models/llava_next/test_processor_llava_next.py b/tests/models/llava_next/test_processor_llava_next.py
index d6156adb75..46d385d2d8 100644
--- a/tests/models/llava_next/test_processor_llava_next.py
+++ b/tests/models/llava_next/test_processor_llava_next.py
@@ -84,7 +84,7 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
         processor_dict_loaded = json.loads(processor_loaded.to_json_string())
         # chat templates aren't serialized to json in processors
-        self.assertFalse("chat_template" in processor_dict_loaded.keys())
+        self.assertFalse("chat_template" in processor_dict_loaded)
 
         # they have to be saved as separate file and loaded back from that file
         # so we check if the same template is loaded
diff --git a/tests/models/llava_next_video/test_processor_llava_next_video.py b/tests/models/llava_next_video/test_processor_llava_next_video.py
index 17bcb3657d..3426d1e453 100644
--- a/tests/models/llava_next_video/test_processor_llava_next_video.py
+++ b/tests/models/llava_next_video/test_processor_llava_next_video.py
@@ -93,7 +93,7 @@ class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
         processor_dict_loaded = json.loads(processor_loaded.to_json_string())
         # chat templates aren't serialized to json in processors
-        self.assertFalse("chat_template" in processor_dict_loaded.keys())
+        self.assertFalse("chat_template" in processor_dict_loaded)
 
         # they have to be saved as separate file and loaded back from that file
         # so we check if the same template is loaded
diff --git a/tests/models/llava_onevision/test_processor_llava_onevision.py b/tests/models/llava_onevision/test_processor_llava_onevision.py
index 1eb3b0d0d4..577ba19cd8 100644
--- a/tests/models/llava_onevision/test_processor_llava_onevision.py
+++ b/tests/models/llava_onevision/test_processor_llava_onevision.py
@@ -97,7 +97,7 @@ class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
         processor_dict_loaded = json.loads(processor_loaded.to_json_string())
         # chat templates aren't serialized to json in processors
-        self.assertFalse("chat_template" in processor_dict_loaded.keys())
+        self.assertFalse("chat_template" in processor_dict_loaded)
 
         # they have to be saved as separate file and loaded back from that file
         # so we check if the same template is loaded
diff --git a/tests/models/luke/test_modeling_luke.py b/tests/models/luke/test_modeling_luke.py
index 7cc282e49a..5959b1877c 100644
--- a/tests/models/luke/test_modeling_luke.py
+++ b/tests/models/luke/test_modeling_luke.py
@@ -897,7 +897,7 @@ class LukeModelIntegrationTests(unittest.TestCase):
         encoding = tokenizer(text, entity_spans=[span], add_prefix_space=True, return_tensors="pt")
 
         # move all values to device
-        for key in encoding.keys():
+        for key in encoding:
             encoding[key] = encoding[key].to(torch_device)
 
         outputs = model(**encoding)
@@ -932,7 +932,7 @@ class LukeModelIntegrationTests(unittest.TestCase):
         encoding = tokenizer(text, entity_spans=[span], add_prefix_space=True, return_tensors="pt")
 
         # move all values to device
-        for key in encoding.keys():
+        for key in encoding:
             encoding[key] = encoding[key].to(torch_device)
 
         outputs = model(**encoding)
diff --git a/tests/models/mamba/test_modeling_mamba.py b/tests/models/mamba/test_modeling_mamba.py
index e99c8b1e57..aff98558d5 100644
--- a/tests/models/mamba/test_modeling_mamba.py
+++ b/tests/models/mamba/test_modeling_mamba.py
@@ -260,7 +260,7 @@ class MambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
         """
         if isinstance(member, torch.Tensor):
             max_value, min_value = member.max().item(), member.min().item()
-        elif isinstance(member, list) or isinstance(member, tuple):
+        elif isinstance(member, (list, tuple)):
             max_value, min_value = max(member), min(member)
 
         if not isinstance(container, list):
diff --git a/tests/models/markuplm/test_tokenization_markuplm.py b/tests/models/markuplm/test_tokenization_markuplm.py
index 3cdbd4acf8..3bf526407f 100644
--- a/tests/models/markuplm/test_tokenization_markuplm.py
+++ b/tests/models/markuplm/test_tokenization_markuplm.py
@@ -627,7 +627,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                     nodes, is_pair=False, xpaths=xpaths, max_length=maximum_length + 10, padding="longest"
                 )
-                for key in encoded_sequences_batch_padded_1.keys():
+                for key in encoded_sequences_batch_padded_1:
                     self.assertListEqual(
                         encoded_sequences_batch_padded_1[key],
                         encoded_sequences_batch_padded_2[key],
@@ -640,7 +640,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                     nodes, is_pair=False, xpaths=xpaths, max_length=maximum_length + 10, padding=False
                 )
-                for key in encoded_sequences_batch_padded_1.keys():
+                for key in encoded_sequences_batch_padded_1:
                     self.assertListEqual(
                         encoded_sequences_batch_padded_1[key],
                         encoded_sequences_batch_padded_2[key],
@@ -1120,7 +1120,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 tokens_r = tokenizer_r.encode_plus(nodes, xpaths=xpaths, add_special_tokens=True)
                 tokens_p = tokenizer_p.encode_plus(nodes, xpaths=xpaths, add_special_tokens=True)
 
-                for key in tokens_p.keys():
+                for key in tokens_p:
                     self.assertEqual(tokens_r[key], tokens_p[key])
 
                 if "token_type_ids" in tokens_r:
@@ -1151,7 +1151,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 # encode_plus()
                 no_special_tokens = tokenizer_r.encode_plus(nodes, xpaths=xpaths, add_special_tokens=False)
                 with_special_tokens = tokenizer_r.encode_plus(nodes, xpaths=xpaths, add_special_tokens=True)
-                for key in no_special_tokens.keys():
+                for key in no_special_tokens:
                     self.assertEqual(
                         len(no_special_tokens[key]),
                         len(with_special_tokens[key]) - simple_num_special_tokens_to_add,
@@ -1162,7 +1162,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 
                 no_special_tokens = tokenizer_r.batch_encode_plus(nodes, xpaths=xpaths, add_special_tokens=False)
                 with_special_tokens = tokenizer_r.batch_encode_plus(nodes, xpaths=xpaths, add_special_tokens=True)
-                for key in no_special_tokens.keys():
+                for key in no_special_tokens:
                     for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]):
                         self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add)
 
diff --git a/tests/models/mgp_str/test_processor_mgp_str.py b/tests/models/mgp_str/test_processor_mgp_str.py
index ccfb6dba68..7157655abe 100644
--- a/tests/models/mgp_str/test_processor_mgp_str.py
+++ b/tests/models/mgp_str/test_processor_mgp_str.py
@@ -134,7 +134,7 @@ class MgpstrProcessorTest(unittest.TestCase):
         input_image_proc = image_processor(image_input, return_tensors="np")
         input_processor = processor(images=image_input, return_tensors="np")
 
-        for key in input_image_proc.keys():
+        for key in input_image_proc:
             self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -148,7 +148,7 @@ class MgpstrProcessorTest(unittest.TestCase):
         encoded_processor = processor(text=input_str)
 
         encoded_tok = tokenizer(input_str)
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_processor(self):
diff --git a/tests/models/mimi/test_modeling_mimi.py b/tests/models/mimi/test_modeling_mimi.py
index d9b0216b15..2123e93c69 100644
--- a/tests/models/mimi/test_modeling_mimi.py
+++ b/tests/models/mimi/test_modeling_mimi.py
@@ -270,8 +270,8 @@ class MimiModelTest(ModelTesterMixin, unittest.TestCase):
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
diff --git a/tests/models/mllama/test_processor_mllama.py b/tests/models/mllama/test_processor_mllama.py
index e87a7ae751..f20dd5d277 100644
--- a/tests/models/mllama/test_processor_mllama.py
+++ b/tests/models/mllama/test_processor_mllama.py
@@ -60,7 +60,7 @@ class MllamaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
         processor_dict_loaded = json.loads(processor_loaded.to_json_string())
         # chat templates aren't serialized to json in processors
-        self.assertFalse("chat_template" in processor_dict_loaded.keys())
+        self.assertFalse("chat_template" in processor_dict_loaded)
 
         # they have to be saved as separate file and loaded back from that file
         # so we check if the same template is loaded
diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py
index 4f5b168959..f750f35cf3 100644
--- a/tests/models/moshi/test_modeling_moshi.py
+++ b/tests/models/moshi/test_modeling_moshi.py
@@ -69,7 +69,7 @@ if is_torch_available():
 
 def _config_zero_init(config):
     configs_no_init = copy.deepcopy(config)
-    for key in configs_no_init.__dict__.keys():
+    for key in configs_no_init.__dict__:
         if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key:
             setattr(configs_no_init, key, 1e-10)
         if isinstance(getattr(configs_no_init, key, None), PretrainedConfig):
diff --git a/tests/models/mpt/test_modeling_mpt.py b/tests/models/mpt/test_modeling_mpt.py
index 15d8fddb9f..75c9f65e92 100644
--- a/tests/models/mpt/test_modeling_mpt.py
+++ b/tests/models/mpt/test_modeling_mpt.py
@@ -310,7 +310,7 @@ class MptModelTester:
     def create_and_check_mpt_weight_initialization(self, config, *args):
         model = MptModel(config)
         model_std = model.config.initializer_range / math.sqrt(2 * model.config.n_layers)
-        for key in model.state_dict().keys():
+        for key in model.state_dict():
             if "c_proj" in key and "weight" in key:
                 self.parent.assertLessEqual(abs(torch.std(model.state_dict()[key]) - model_std), 0.001)
                 self.parent.assertLessEqual(abs(torch.mean(model.state_dict()[key]) - 0.0), 0.01)
diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py
index 8a41c47d6f..bd4290f05c 100644
--- a/tests/models/musicgen/test_modeling_musicgen.py
+++ b/tests/models/musicgen/test_modeling_musicgen.py
@@ -64,7 +64,7 @@ if is_torch_available():
 
 def _config_zero_init(config):
     configs_no_init = copy.deepcopy(config)
-    for key in configs_no_init.__dict__.keys():
+    for key in configs_no_init.__dict__:
         if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key:
             setattr(configs_no_init, key, 1e-10)
         if isinstance(getattr(configs_no_init, key, None), PretrainedConfig):
diff --git a/tests/models/musicgen/test_processor_musicgen.py b/tests/models/musicgen/test_processor_musicgen.py
index be1e855725..9070b753e5 100644
--- a/tests/models/musicgen/test_processor_musicgen.py
+++ b/tests/models/musicgen/test_processor_musicgen.py
@@ -106,7 +106,7 @@ class MusicgenProcessorTest(unittest.TestCase):
         input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
         input_processor = processor(raw_speech, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -121,7 +121,7 @@ class MusicgenProcessorTest(unittest.TestCase):
 
         encoded_tok = tokenizer(input_str)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_tokenizer_decode(self):
diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
index 72b20f345b..a09f37d115 100644
--- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
+++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
@@ -68,7 +68,7 @@ if is_torchaudio_available():
 
 def _config_zero_init(config):
     configs_no_init = copy.deepcopy(config)
-    for key in configs_no_init.__dict__.keys():
+    for key in configs_no_init.__dict__:
         if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key:
             setattr(configs_no_init, key, 1e-10)
         if isinstance(getattr(configs_no_init, key, None), PretrainedConfig):
diff --git a/tests/models/musicgen_melody/test_processor_musicgen_melody.py b/tests/models/musicgen_melody/test_processor_musicgen_melody.py
index 04fb94c64c..fbeee0543b 100644
--- a/tests/models/musicgen_melody/test_processor_musicgen_melody.py
+++ b/tests/models/musicgen_melody/test_processor_musicgen_melody.py
@@ -111,7 +111,7 @@ class MusicgenMelodyProcessorTest(unittest.TestCase):
         input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
         input_processor = processor(raw_speech, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -126,7 +126,7 @@ class MusicgenMelodyProcessorTest(unittest.TestCase):
 
         encoded_tok = tokenizer(input_str)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_tokenizer_decode(self):
diff --git a/tests/models/omdet_turbo/test_processor_omdet_turbo.py b/tests/models/omdet_turbo/test_processor_omdet_turbo.py
index 500e83c56b..262cff8947 100644
--- a/tests/models/omdet_turbo/test_processor_omdet_turbo.py
+++ b/tests/models/omdet_turbo/test_processor_omdet_turbo.py
@@ -142,7 +142,7 @@ class OmDetTurboProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_image_proc = image_processor(image_input, return_tensors="np")
         input_processor = processor(images=image_input, return_tensors="np")
 
-        for key in input_image_proc.keys():
+        for key in input_image_proc:
             self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -157,7 +157,7 @@ class OmDetTurboProcessorTest(ProcessorTesterMixin, unittest.TestCase):
 
         encoded_tok = tokenizer(input_str, padding="max_length", truncation=True, max_length=77)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_processor(self):
diff --git a/tests/models/owlv2/test_modeling_owlv2.py b/tests/models/owlv2/test_modeling_owlv2.py
index bbbe6824be..d35cd77159 100644
--- a/tests/models/owlv2/test_modeling_owlv2.py
+++ b/tests/models/owlv2/test_modeling_owlv2.py
@@ -524,8 +524,8 @@ class Owlv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
@@ -738,8 +738,8 @@ class Owlv2ForObjectDetectionTest(ModelTesterMixin, unittest.TestCase):
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py
index d150ae7899..01d45d7e43 100644
--- a/tests/models/owlvit/test_modeling_owlvit.py
+++ b/tests/models/owlvit/test_modeling_owlvit.py
@@ -519,8 +519,8 @@ class OwlViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
@@ -731,8 +731,8 @@ class OwlViTForObjectDetectionTest(ModelTesterMixin, unittest.TestCase):
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
diff --git a/tests/models/owlvit/test_processor_owlvit.py b/tests/models/owlvit/test_processor_owlvit.py
index f31dbaf9fb..069fa7b776 100644
--- a/tests/models/owlvit/test_processor_owlvit.py
+++ b/tests/models/owlvit/test_processor_owlvit.py
@@ -128,7 +128,7 @@ class OwlViTProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_image_proc = image_processor(image_input, return_tensors="np")
         input_processor = processor(images=image_input, return_tensors="np")
 
-        for key in input_image_proc.keys():
+        for key in input_image_proc:
             self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -143,7 +143,7 @@ class OwlViTProcessorTest(ProcessorTesterMixin, unittest.TestCase):
 
         encoded_tok = tokenizer(input_str, return_tensors="np")
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key][0].tolist(), encoded_processor[key][0].tolist())
 
     def test_processor(self):
diff --git a/tests/models/perceiver/test_modeling_perceiver.py b/tests/models/perceiver/test_modeling_perceiver.py
index 6c2aceea53..3966f4a3a0 100644
--- a/tests/models/perceiver/test_modeling_perceiver.py
+++ b/tests/models/perceiver/test_modeling_perceiver.py
@@ -431,7 +431,7 @@ class PerceiverModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
 
             if model_class.__name__ == "PerceiverForMultimodalAutoencoding":
                 # model outputs a dictionary with logits per modality, let's verify each modality
-                for modality in first.keys():
+                for modality in first:
                     out_1 = first[modality].cpu().numpy()
                     out_2 = second[modality].cpu().numpy()
                     out_1 = out_1[~np.isnan(out_1)]
@@ -677,7 +677,7 @@ class PerceiverModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
             hidden_states_with_chunk = model(**self._prepare_for_class(inputs_dict, model_class))[0]
             if model_class.__name__ == "PerceiverForMultimodalAutoencoding":
                 # model outputs a dictionary with logits for each modality
-                for modality in hidden_states_no_chunk.keys():
+                for modality in hidden_states_no_chunk:
                     self.assertTrue(
                         torch.allclose(hidden_states_no_chunk[modality], hidden_states_with_chunk[modality], atol=1e-3)
                     )
@@ -695,7 +695,7 @@ class PerceiverModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
                 outputs = model(**self._prepare_for_class(inputs_dict, model_class))
 
             if model_class.__name__ == "PerceiverForMultimodalAutoencoding":
-                for modality in outputs[0].keys():
+                for modality in outputs[0]:
                     out_2 = outputs[0][modality].cpu().numpy()
                     out_2[np.isnan(out_2)] = 0
 
diff --git a/tests/models/perception_lm/test_processor_perception_lm.py b/tests/models/perception_lm/test_processor_perception_lm.py
index 28f5a56e4c..c6384e4b45 100644
--- a/tests/models/perception_lm/test_processor_perception_lm.py
+++ b/tests/models/perception_lm/test_processor_perception_lm.py
@@ -83,7 +83,7 @@ class PerceptionLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
         processor_dict_loaded = json.loads(processor_loaded.to_json_string())
         # chat templates aren't serialized to json in processors
-        self.assertFalse("chat_template" in processor_dict_loaded.keys())
+        self.assertFalse("chat_template" in processor_dict_loaded)
 
         # they have to be saved as separate file and loaded back from that file
         # so we check if the same template is loaded
diff --git a/tests/models/pix2struct/test_modeling_pix2struct.py b/tests/models/pix2struct/test_modeling_pix2struct.py
index d6a0fa5f08..64ebdd4fb5 100644
--- a/tests/models/pix2struct/test_modeling_pix2struct.py
+++ b/tests/models/pix2struct/test_modeling_pix2struct.py
@@ -702,8 +702,8 @@ class Pix2StructModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
diff --git a/tests/models/pix2struct/test_processor_pix2struct.py b/tests/models/pix2struct/test_processor_pix2struct.py
index d58605d444..5bc3a10336 100644
--- a/tests/models/pix2struct/test_processor_pix2struct.py
+++ b/tests/models/pix2struct/test_processor_pix2struct.py
@@ -90,7 +90,7 @@ class Pix2StructProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_feat_extract = image_processor(image_input, return_tensors="np")
         input_processor = processor(images=image_input, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -105,7 +105,7 @@ class Pix2StructProcessorTest(ProcessorTesterMixin, unittest.TestCase):
 
         encoded_tok = tokenizer(input_str, return_token_type_ids=False, add_special_tokens=True)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_processor(self):
diff --git a/tests/models/pop2piano/test_processor_pop2piano.py b/tests/models/pop2piano/test_processor_pop2piano.py
index b8cdf63ff9..41b37621ec 100644
--- a/tests/models/pop2piano/test_processor_pop2piano.py
+++ b/tests/models/pop2piano/test_processor_pop2piano.py
@@ -155,7 +155,7 @@ class Pop2PianoProcessorTest(unittest.TestCase):
         )
         processor_outputs = processor(audio=input_speech, sampling_rate=sampling_rate, return_tensors="np")
 
-        for key in feature_extractor_outputs.keys():
+        for key in feature_extractor_outputs:
             self.assertTrue(np.allclose(feature_extractor_outputs[key], processor_outputs[key], atol=1e-4))
 
     def test_processor_batch_decode(self):
diff --git a/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py b/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py
index 0baea494cd..eaea49fb44 100644
--- a/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py
+++ b/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py
@@ -278,7 +278,7 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_image_proc = image_processor(image_input, return_tensors="pt")
         input_processor = processor(images=image_input, text="dummy", return_tensors="pt")
 
-        for key in input_image_proc.keys():
+        for key in input_image_proc:
             self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_processor(self):
diff --git a/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py
index c3f478950f..dc15d5ef86 100644
--- a/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py
+++ b/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py
@@ -108,7 +108,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_image_proc = image_processor(image_input, return_tensors="pt")
         input_processor = processor(images=image_input, text="dummy", return_tensors="pt")
 
-        for key in input_image_proc.keys():
+        for key in input_image_proc:
             self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_processor(self):
diff --git a/tests/models/qwen2_vl/test_processor_qwen2_vl.py b/tests/models/qwen2_vl/test_processor_qwen2_vl.py
index 69fae59595..09ed36beb6 100644
--- a/tests/models/qwen2_vl/test_processor_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_processor_qwen2_vl.py
@@ -112,7 +112,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_image_proc = image_processor(image_input, return_tensors="pt")
         input_processor = processor(images=image_input, text="dummy", return_tensors="pt")
 
-        for key in input_image_proc.keys():
+        for key in input_image_proc:
             self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_processor(self):
diff --git a/tests/models/rt_detr/test_modeling_rt_detr.py b/tests/models/rt_detr/test_modeling_rt_detr.py
index fad9093426..809f4acfde 100644
--- a/tests/models/rt_detr/test_modeling_rt_detr.py
+++ b/tests/models/rt_detr/test_modeling_rt_detr.py
@@ -603,7 +603,7 @@ class RTDetrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
             # Skip the check for the backbone
             for name, module in model.named_modules():
                 if module.__class__.__name__ == "RTDetrConvEncoder":
-                    backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()]
+                    backbone_params = [f"{name}.{key}" for key in module.state_dict()]
                     break
 
             for name, param in model.named_parameters():
diff --git a/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py b/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py
index 79202d3cf7..52cd5ac0c4 100644
--- a/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py
+++ b/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py
@@ -607,7 +607,7 @@ class RTDetrV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
             # Skip the check for the backbone
             for name, module in model.named_modules():
                 if module.__class__.__name__ == "RTDetrV2ConvEncoder":
-                    backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()]
+                    backbone_params = [f"{name}.{key}" for key in module.state_dict()]
                     break
 
             for name, param in model.named_parameters():
diff --git a/tests/models/rwkv/test_modeling_rwkv.py b/tests/models/rwkv/test_modeling_rwkv.py
index 4e92baf2d3..8fe4070e9c 100644
--- a/tests/models/rwkv/test_modeling_rwkv.py
+++ b/tests/models/rwkv/test_modeling_rwkv.py
@@ -238,7 +238,7 @@ class RwkvModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
         """
         if isinstance(member, torch.Tensor):
             max_value, min_value = member.max().item(), member.min().item()
-        elif isinstance(member, list) or isinstance(member, tuple):
+        elif isinstance(member, (list, tuple)):
             max_value, min_value = max(member), min(member)
 
         if not isinstance(container, list):
diff --git a/tests/models/sam/test_processor_sam.py b/tests/models/sam/test_processor_sam.py
index 852df56f99..b609515205 100644
--- a/tests/models/sam/test_processor_sam.py
+++ b/tests/models/sam/test_processor_sam.py
@@ -98,7 +98,7 @@ class SamProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_feat_extract = image_processor(image_input, return_tensors="np")
         input_processor = processor(images=image_input, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
         for image in input_feat_extract.pixel_values:
@@ -123,7 +123,7 @@ class SamProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_feat_extract = image_processor(images=image_input, segmentation_maps=mask_input, return_tensors="np")
         input_processor = processor(images=image_input, segmentation_maps=mask_input, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
         for label in input_feat_extract.labels:
diff --git a/tests/models/sam_hq/test_processor_samhq.py b/tests/models/sam_hq/test_processor_samhq.py
index 89a642ba60..36e00a9ce0 100644
--- a/tests/models/sam_hq/test_processor_samhq.py
+++ b/tests/models/sam_hq/test_processor_samhq.py
@@ -107,7 +107,7 @@ class SamHQProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_feat_extract = image_processor(image_input, return_tensors="pt")
         input_processor = processor(images=image_input, return_tensors="pt")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum().item(), input_processor[key].sum().item(), delta=1e-2)
 
         for image in input_feat_extract.pixel_values:
@@ -132,7 +132,7 @@ class SamHQProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_feat_extract = image_processor(images=image_input, segmentation_maps=mask_input, return_tensors="pt")
         input_processor = processor(images=image_input, segmentation_maps=mask_input, return_tensors="pt")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum().item(), input_processor[key].sum().item(), delta=1e-2)
 
         for label in input_feat_extract.labels:
diff --git a/tests/models/seamless_m4t/test_processor_seamless_m4t.py b/tests/models/seamless_m4t/test_processor_seamless_m4t.py
index 7beefb16bd..f963fa2098 100644
--- a/tests/models/seamless_m4t/test_processor_seamless_m4t.py
+++ b/tests/models/seamless_m4t/test_processor_seamless_m4t.py
@@ -51,9 +51,7 @@ class SeamlessM4TProcessorTest(unittest.TestCase):
         processor = SeamlessM4TProcessor.from_pretrained(self.tmpdirname)
 
         self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
-        tokenizer_instance = isinstance(processor.tokenizer, SeamlessM4TTokenizerFast) or isinstance(
-            processor.tokenizer, SeamlessM4TTokenizer
-        )
+        tokenizer_instance = isinstance(processor.tokenizer, (SeamlessM4TTokenizerFast, SeamlessM4TTokenizer))
         self.assertTrue(tokenizer_instance)
 
         self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
@@ -75,9 +73,7 @@ class SeamlessM4TProcessorTest(unittest.TestCase):
         self.assertIsInstance(processor.feature_extractor, SeamlessM4TFeatureExtractor)
         self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
 
-        tokenizer_instance = isinstance(processor.tokenizer, SeamlessM4TTokenizerFast) or isinstance(
-            processor.tokenizer, SeamlessM4TTokenizer
-        )
+        tokenizer_instance = isinstance(processor.tokenizer, (SeamlessM4TTokenizerFast, SeamlessM4TTokenizer))
         self.assertTrue(tokenizer_instance)
 
     # Copied from test.models.whisper.test_processor_whisper.WhisperProcessorTest.test_feature_extractor with Whisper->SeamlessM4T
@@ -92,7 +88,7 @@ class SeamlessM4TProcessorTest(unittest.TestCase):
         input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
         input_processor = processor(audios=raw_speech, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     # Copied from test.models.whisper.test_processor_whisper.WhisperProcessorTest.test_tokenizer with Whisper->SeamlessM4T
@@ -108,7 +104,7 @@ class SeamlessM4TProcessorTest(unittest.TestCase):
 
         encoded_tok = tokenizer(input_str)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     # Copied from test.models.whisper.test_processor_whisper.WhisperProcessorTest.test_tokenizer_decode with Whisper->SeamlessM4T
diff --git a/tests/models/siglip/test_modeling_siglip.py b/tests/models/siglip/test_modeling_siglip.py
index b5c14280bb..295c127b36 100644
--- a/tests/models/siglip/test_modeling_siglip.py
+++ b/tests/models/siglip/test_modeling_siglip.py
@@ -553,8 +553,8 @@ class SiglipModelTest(SiglipModelTesterMixin, PipelineTesterMixin, unittest.Test
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
diff --git a/tests/models/speech_to_text/test_modeling_speech_to_text.py b/tests/models/speech_to_text/test_modeling_speech_to_text.py
index 593aca1406..0ebbc6631e 100644
--- a/tests/models/speech_to_text/test_modeling_speech_to_text.py
+++ b/tests/models/speech_to_text/test_modeling_speech_to_text.py
@@ -664,8 +664,8 @@ class Speech2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
diff --git a/tests/models/speech_to_text/test_processor_speech_to_text.py b/tests/models/speech_to_text/test_processor_speech_to_text.py
index 4f2c714295..f4794cca53 100644
--- a/tests/models/speech_to_text/test_processor_speech_to_text.py
+++ b/tests/models/speech_to_text/test_processor_speech_to_text.py
@@ -115,7 +115,7 @@ class Speech2TextProcessorTest(unittest.TestCase):
         input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
         input_processor = processor(raw_speech, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -130,7 +130,7 @@ class Speech2TextProcessorTest(unittest.TestCase):
 
         encoded_tok = tokenizer(input_str)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_tokenizer_decode(self):
diff --git a/tests/models/speecht5/test_processor_speecht5.py b/tests/models/speecht5/test_processor_speecht5.py
index 75340fa4e0..ea84104ac7 100644
--- a/tests/models/speecht5/test_processor_speecht5.py
+++ b/tests/models/speecht5/test_processor_speecht5.py
@@ -122,7 +122,7 @@ class SpeechT5ProcessorTest(unittest.TestCase):
         input_feat_extract = feature_extractor(audio=raw_speech, return_tensors="np")
         input_processor = processor(audio=raw_speech, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_feature_extractor_target(self):
@@ -136,7 +136,7 @@ class SpeechT5ProcessorTest(unittest.TestCase):
         input_feat_extract = feature_extractor(audio_target=raw_speech, return_tensors="np")
         input_processor = processor(audio_target=raw_speech, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -150,7 +150,7 @@ class SpeechT5ProcessorTest(unittest.TestCase):
         encoded_processor = processor(text=input_str)
         encoded_tok = tokenizer(input_str)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_tokenizer_target(self):
@@ -164,7 +164,7 @@ class SpeechT5ProcessorTest(unittest.TestCase):
         encoded_processor = processor(text_target=input_str)
         encoded_tok = tokenizer(input_str)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_tokenizer_decode(self):
diff --git a/tests/models/tapas/test_tokenization_tapas.py b/tests/models/tapas/test_tokenization_tapas.py
index 6f3b96166d..998ec823ef 100644
--- a/tests/models/tapas/test_tokenization_tapas.py
+++ b/tests/models/tapas/test_tokenization_tapas.py
@@ -691,7 +691,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                     table, sequences, max_length=maximum_length + 10, padding="longest"
                 )
-                for key in encoded_sequences_batch_padded_1.keys():
+                for key in encoded_sequences_batch_padded_1:
                     self.assertListEqual(
                         encoded_sequences_batch_padded_1[key],
                         encoded_sequences_batch_padded_2[key],
@@ -702,7 +702,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                     table, sequences, max_length=maximum_length + 10, padding=False
                 )
-                for key in encoded_sequences_batch_padded_1.keys():
+                for key in encoded_sequences_batch_padded_1:
                     self.assertListEqual(
                         encoded_sequences_batch_padded_1[key],
                         encoded_sequences_batch_padded_2[key],
diff --git a/tests/models/trocr/test_processor_trocr.py b/tests/models/trocr/test_processor_trocr.py
index e2d64aeb0e..205f268fe6 100644
--- a/tests/models/trocr/test_processor_trocr.py
+++ b/tests/models/trocr/test_processor_trocr.py
@@ -91,7 +91,7 @@ class TrOCRProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_feat_extract = image_processor(image_input, return_tensors="np")
         input_processor = processor(images=image_input, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -103,7 +103,7 @@ class TrOCRProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         encoded_processor = processor(text=input_str)
         encoded_tok = tokenizer(input_str)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_processor_text(self):
diff --git a/tests/models/udop/test_tokenization_udop.py b/tests/models/udop/test_tokenization_udop.py
index 581cee32e2..7badff1b4b 100644
--- a/tests/models/udop/test_tokenization_udop.py
+++ b/tests/models/udop/test_tokenization_udop.py
@@ -721,7 +721,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus_boxes(
                     words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding="longest"
                 )
-                for key in encoded_sequences_batch_padded_1.keys():
+                for key in encoded_sequences_batch_padded_1:
                     self.assertListEqual(
                         encoded_sequences_batch_padded_1[key],
                         encoded_sequences_batch_padded_2[key],
@@ -734,7 +734,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus_boxes(
                     words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding=False
                 )
-                for key in encoded_sequences_batch_padded_1.keys():
+                for key in encoded_sequences_batch_padded_1:
                     self.assertListEqual(
                         encoded_sequences_batch_padded_1[key],
                         encoded_sequences_batch_padded_2[key],
@@ -1231,7 +1231,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                     add_special_tokens=True,
                 )
 
-                for key in tokens_p.keys():
+                for key in tokens_p:
                     self.assertEqual(tokens_r[key], tokens_p[key])
 
                 if "token_type_ids" in tokens_r:
@@ -1262,7 +1262,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 # encode_plus()
                 no_special_tokens = tokenizer_r.encode_plus_boxes(words, boxes=boxes, add_special_tokens=False)
                 with_special_tokens = tokenizer_r.encode_plus_boxes(words, boxes=boxes, add_special_tokens=True)
-                for key in no_special_tokens.keys():
+                for key in no_special_tokens:
                     self.assertEqual(
                         len(no_special_tokens[key]),
                         len(with_special_tokens[key]) - simple_num_special_tokens_to_add,
@@ -1273,7 +1273,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 
                 no_special_tokens = tokenizer_r.batch_encode_plus_boxes(words, boxes=boxes, add_special_tokens=False)
                 with_special_tokens = tokenizer_r.batch_encode_plus_boxes(words, boxes=boxes, add_special_tokens=True)
-                for key in no_special_tokens.keys():
+                for key in no_special_tokens:
                     for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]):
                         self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add)
 
diff --git a/tests/models/vision_text_dual_encoder/test_processor_vision_text_dual_encoder.py b/tests/models/vision_text_dual_encoder/test_processor_vision_text_dual_encoder.py
index fdae4bf8e1..814c202fdb 100644
--- a/tests/models/vision_text_dual_encoder/test_processor_vision_text_dual_encoder.py
+++ b/tests/models/vision_text_dual_encoder/test_processor_vision_text_dual_encoder.py
@@ -120,7 +120,7 @@ class VisionTextDualEncoderProcessorTest(ProcessorTesterMixin, unittest.TestCase
         input_feat_extract = image_processor(image_input, return_tensors="pt")
         input_processor = processor(images=image_input, return_tensors="pt")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -135,7 +135,7 @@ class VisionTextDualEncoderProcessorTest(ProcessorTesterMixin, unittest.TestCase
 
         encoded_tok = tokenizer(input_str)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_processor(self):
diff --git a/tests/models/visual_bert/test_modeling_visual_bert.py b/tests/models/visual_bert/test_modeling_visual_bert.py
index 8b715137a3..09c96a2467 100644
--- a/tests/models/visual_bert/test_modeling_visual_bert.py
+++ b/tests/models/visual_bert/test_modeling_visual_bert.py
@@ -318,7 +318,7 @@ class VisualBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
         inputs_dict = copy.deepcopy(inputs_dict)
         if model_class == VisualBertForMultipleChoice:
-            for key in inputs_dict.keys():
+            for key in inputs_dict:
                 value = inputs_dict[key]
                 if isinstance(value, torch.Tensor) and value.ndim > 1:
                     if key != "visual_embeds":
diff --git a/tests/models/vits/test_modeling_vits.py b/tests/models/vits/test_modeling_vits.py
index ecd1a73f33..7f8e9d6e7f 100644
--- a/tests/models/vits/test_modeling_vits.py
+++ b/tests/models/vits/test_modeling_vits.py
@@ -55,7 +55,7 @@ GENERATION_CONFIG_NAME = "generation_config.json"
 
 def _config_zero_init(config):
     configs_no_init = copy.deepcopy(config)
-    for key in configs_no_init.__dict__.keys():
+    for key in configs_no_init.__dict__:
         if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key:
             setattr(configs_no_init, key, 1e-10)
         if isinstance(getattr(configs_no_init, key, None), PretrainedConfig):
diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py
index 560a8af6d9..3b38f2d8b6 100644
--- a/tests/models/wav2vec2/test_modeling_wav2vec2.py
+++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py
@@ -1982,7 +1982,7 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
             "en": "joe keton disapproved of films and buster also had reservations about the media",
         }
 
-        for lang in LANG_MAP.keys():
+        for lang in LANG_MAP:
             assert run_model(lang) == TRANSCRIPTIONS[lang]
 
     @require_flash_attn
diff --git a/tests/models/wav2vec2/test_processor_wav2vec2.py b/tests/models/wav2vec2/test_processor_wav2vec2.py
index 4001421681..1ed5fa2982 100644
--- a/tests/models/wav2vec2/test_processor_wav2vec2.py
+++ b/tests/models/wav2vec2/test_processor_wav2vec2.py
@@ -126,7 +126,7 @@ class Wav2Vec2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
         input_processor = processor(raw_speech, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -140,7 +140,7 @@ class Wav2Vec2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
 
         encoded_tok = tokenizer(input_str)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_tokenizer_decode(self):
diff --git a/tests/models/wav2vec2_bert/test_processor_wav2vec2_bert.py b/tests/models/wav2vec2_bert/test_processor_wav2vec2_bert.py
index c269e15db3..5c4f062de7 100644
--- a/tests/models/wav2vec2_bert/test_processor_wav2vec2_bert.py
+++ b/tests/models/wav2vec2_bert/test_processor_wav2vec2_bert.py
@@ -127,7 +127,7 @@ class Wav2Vec2BertProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
         input_processor = processor(raw_speech, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -141,7 +141,7 @@ class Wav2Vec2BertProcessorTest(ProcessorTesterMixin, unittest.TestCase):
 
         encoded_tok = tokenizer(input_str)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_tokenizer_decode(self):
diff --git a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
index 66fc8665cb..97e83ff367 100644
--- a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
+++ b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
@@ -154,7 +154,7 @@ class Wav2Vec2ProcessorWithLMTest(unittest.TestCase):
         input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
         input_processor = processor(raw_speech, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_another_feature_extractor(self):
@@ -169,7 +169,7 @@ class Wav2Vec2ProcessorWithLMTest(unittest.TestCase):
         input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
         input_processor = processor(raw_speech, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
         self.assertListEqual(
@@ -199,7 +199,7 @@ class Wav2Vec2ProcessorWithLMTest(unittest.TestCase):
 
         encoded_tok = tokenizer(input_str)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def _get_dummy_logits(self, shape=(2, 10, 16), seed=77):
@@ -394,7 +394,7 @@ class Wav2Vec2ProcessorWithLMTest(unittest.TestCase):
         input_wav2vec2 = processor_wav2vec2(raw_speech, return_tensors="np")
         input_auto = processor_auto(raw_speech, return_tensors="np")
 
-        for key in input_wav2vec2.keys():
+        for key in input_wav2vec2:
             self.assertAlmostEqual(input_wav2vec2[key].sum(), input_auto[key].sum(), delta=1e-2)
 
         logits = self._get_dummy_logits()
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index d0aeecf806..aa82a8108c 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -1016,8 +1016,8 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
diff --git a/tests/models/whisper/test_processor_whisper.py b/tests/models/whisper/test_processor_whisper.py
index 86451b5412..a80e7950b2 100644
--- a/tests/models/whisper/test_processor_whisper.py
+++ b/tests/models/whisper/test_processor_whisper.py
@@ -93,7 +93,7 @@ class WhisperProcessorTest(unittest.TestCase):
         input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
         input_processor = processor(raw_speech, return_tensors="np")
 
-        for key in input_feat_extract.keys():
+        for key in input_feat_extract:
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
     def test_tokenizer(self):
@@ -108,7 +108,7 @@ class WhisperProcessorTest(unittest.TestCase):
 
         encoded_tok = tokenizer(input_str)
 
-        for key in encoded_tok.keys():
+        for key in encoded_tok:
             self.assertListEqual(encoded_tok[key], encoded_processor[key])
 
     def test_tokenizer_decode(self):
diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py
index 5c125f0797..fe6e8728be 100644
--- a/tests/models/x_clip/test_modeling_x_clip.py
+++ b/tests/models/x_clip/test_modeling_x_clip.py
@@ -638,8 +638,8 @@ class XCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
             loaded_model_state_dict = loaded_model.state_dict()
 
             non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
+            for key in loaded_model_state_dict:
+                if key not in model_state_dict:
                     non_persistent_buffers[key] = loaded_model_state_dict[key]
 
             loaded_model_state_dict = {
diff --git a/tests/models/xglm/test_modeling_xglm.py b/tests/models/xglm/test_modeling_xglm.py
index ace2f0f511..caa1074282 100644
--- a/tests/models/xglm/test_modeling_xglm.py
+++ b/tests/models/xglm/test_modeling_xglm.py
@@ -258,7 +258,7 @@ class XGLMModelTester:
     def create_and_check_xglm_weight_initialization(self, config, *args):
         model = XGLMModel(config)
         model_std = model.config.initializer_range / math.sqrt(2 * model.config.num_hidden_layers)
-        for key in model.state_dict().keys():
+        for key in model.state_dict():
             if "c_proj" in key and "weight" in key:
                 self.parent.assertLessEqual(abs(torch.std(model.state_dict()[key]) - model_std), 0.001)
                 self.parent.assertLessEqual(abs(torch.mean(model.state_dict()[key]) - 0.0), 0.01)
diff --git a/tests/peft_integration/test_peft_integration.py b/tests/peft_integration/test_peft_integration.py
index 0ce14f0412..0cf97ea80d 100644
--- a/tests/peft_integration/test_peft_integration.py
+++ b/tests/peft_integration/test_peft_integration.py
@@ -115,7 +115,7 @@ class PeftIntegrationTester(unittest.TestCase, PeftTesterMixin):
 
                 state_dict = peft_model.get_adapter_state_dict()
 
-                for key in state_dict.keys():
+                for key in state_dict:
                     self.assertTrue("lora" in key)
 
     def test_peft_save_pretrained(self):
@@ -242,7 +242,7 @@ class PeftIntegrationTester(unittest.TestCase, PeftTesterMixin):
                 self.assertTrue(_has_modules_to_save_wrapper)
                 state_dict = model.get_adapter_state_dict()
 
-                self.assertTrue("lm_head.weight" in state_dict.keys())
+                self.assertTrue("lm_head.weight" in state_dict)
 
                 logits = model(dummy_input).logits
                 loss = logits.mean()
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index a701f86f66..a6bceef116 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -568,7 +568,7 @@ class PipelineUtilsTest(unittest.TestCase):
         from transformers.pipelines import SUPPORTED_TASKS
 
         set_seed_fn = lambda: torch.manual_seed(0)  # noqa: E731
-        for task in SUPPORTED_TASKS.keys():
+        for task in SUPPORTED_TASKS:
             if task == "table-question-answering":
                 # test table in separate test due to more dependencies
                 continue
@@ -627,7 +627,7 @@ class PipelineUtilsTest(unittest.TestCase):
             model_ids = []
             revisions = []
             tasks = []
-            for translation_pair in task_dict["default"].keys():
+            for translation_pair in task_dict["default"]:
                 model_id, revision = task_dict["default"][translation_pair]["model"][framework]
 
                 model_ids.append(model_id)
diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py
index 3c0977f21c..a5ba1a861f 100644
--- a/tests/quantization/bnb/test_4bit.py
+++ b/tests/quantization/bnb/test_4bit.py
@@ -726,7 +726,7 @@ class BaseSerializationTest(unittest.TestCase):
         d1 = dict(model_1.named_parameters())
         self.assertTrue(d0.keys() == d1.keys())
 
-        for k in d0.keys():
+        for k in d0:
             self.assertTrue(d0[k].shape == d1[k].shape)
             self.assertTrue(d0[k].device.type == d1[k].device.type)
             self.assertTrue(d0[k].device == d1[k].device)
diff --git a/tests/quantization/quanto_integration/test_quanto.py b/tests/quantization/quanto_integration/test_quanto.py
index a4e0b47869..5cc0814332 100644
--- a/tests/quantization/quanto_integration/test_quanto.py
+++ b/tests/quantization/quanto_integration/test_quanto.py
@@ -251,7 +251,7 @@ class QuantoQuantizationTest(unittest.TestCase):
         d0 = dict(model1.named_parameters())
         d1 = dict(model2.named_parameters())
         self.assertTrue(d0.keys() == d1.keys())
-        for k in d0.keys():
+        for k in d0:
             self.assertTrue(d0[k].shape == d1[k].shape)
             self.assertTrue(d0[k].device.type == d1[k].device.type)
             self.assertTrue(d0[k].device == d1[k].device)
diff --git a/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py b/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py
index 2b1a19b9ab..d18484d1cd 100644
--- a/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py
+++ b/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py
@@ -136,7 +136,7 @@ class DataTrainingArguments:
     def __post_init__(self):
         if self.task_name is not None:
             self.task_name = self.task_name.lower()
-            if self.task_name not in task_to_keys.keys():
+            if self.task_name not in task_to_keys:
                 raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys()))
         elif self.train_file is None or self.validation_file is None:
             raise ValueError("Need either a GLUE task or a training/validation file.")
@@ -271,7 +271,7 @@ def main():
             else:
                 raise ValueError("Need either a GLUE task or a test file for `do_predict`.")
 
-        for key in data_files.keys():
+        for key in data_files:
             logger.info(f"load a local file for {key}: {data_files[key]}")
 
         if data_args.train_file.endswith(".csv"):
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 9454f4b4e5..d0b2e902e6 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -489,7 +489,7 @@ def _test_eager_matches_sdpa_inference(
 
 def _config_zero_init(config):
     configs_no_init = copy.deepcopy(config)
-    for key in configs_no_init.__dict__.keys():
+    for key in configs_no_init.__dict__:
         if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key:
             setattr(configs_no_init, key, 1e-10)
         if isinstance(getattr(configs_no_init, key, None), PretrainedConfig):
@@ -949,7 +949,7 @@ class ModelTesterMixin:
             state_dict = model.state_dict()
 
             def check_equal(loaded):
-                for key in state_dict.keys():
+                for key in state_dict:
                     max_diff = torch.max(
                         state_dict()[key] ^ loaded[key]
                         if isinstance(state_dict[key], torch.BoolTensor)
@@ -1482,8 +1482,8 @@ class ModelTesterMixin:
                 loaded_model_state_dict = loaded_model.state_dict()
 
                 non_persistent_buffers = {}
-                for key in loaded_model_state_dict.keys():
-                    if key not in model_state_dict.keys():
+                for key in loaded_model_state_dict:
+                    if key not in model_state_dict:
                         non_persistent_buffers[key] = loaded_model_state_dict[key]
 
                 loaded_model_state_dict = {
@@ -3431,7 +3431,7 @@ class ModelTesterMixin:
             set_seed(0)
             new_model = MyClass.from_pretrained(tmpdirname, num_labels=4, ignore_mismatched_sizes=True)
 
-            for key in new_model.state_dict().keys():
+            for key in new_model.state_dict():
                 # check weight values for weights with matched shapes are identical
                 # (i.e. correctly loaded from the checkpoint)
                 if key not in ["linear.weight", "linear.bias"]:
@@ -3632,12 +3632,12 @@ class ModelTesterMixin:
             # set eager as it will be the one supported in all models
             # we just need to test if passing 'attn_implementation' as a dict fails or not
             attn_implementation_per_subconfig = {"": "eager"}
-            for key in config.sub_configs.keys():
+            for key in config.sub_configs:
                 attn_implementation_per_subconfig[key] = "eager"
 
             config._attn_implementation = attn_implementation_per_subconfig
             model = model_class(config)
-            for key in config.sub_configs.keys():
+            for key in config.sub_configs:
                 sub_config = getattr(model.config, key)
                 self.assertTrue(sub_config._attn_implementation == "eager")
 
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index 3461721808..b70a778640 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -427,7 +427,7 @@ class TokenizerTesterMixin:
         # Switch from batch_encode_plus format:   {'input_ids': [[...], [...]], ...}
         # to the list of examples/ encode_plus format: [{'input_ids': [...], ...}, {'input_ids': [...], ...}]
         return [
-            {value: batch_encode_plus_sequences[value][i] for value in batch_encode_plus_sequences.keys()}
+            {value: batch_encode_plus_sequences[value][i] for value in batch_encode_plus_sequences}
             for i in range(len(batch_encode_plus_sequences["input_ids"]))
         ]
 
@@ -2792,7 +2792,7 @@ class TokenizerTesterMixin:
                 encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                     sequences, max_length=maximum_length + 10, padding="longest"
                 )
-                for key in encoded_sequences_batch_padded_1.keys():
+                for key in encoded_sequences_batch_padded_1:
                     self.assertListEqual(
                         encoded_sequences_batch_padded_1[key],
                         encoded_sequences_batch_padded_2[key],
@@ -2803,7 +2803,7 @@ class TokenizerTesterMixin:
                 encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                     sequences, max_length=maximum_length + 10, padding=False
                 )
-                for key in encoded_sequences_batch_padded_1.keys():
+                for key in encoded_sequences_batch_padded_1:
                     self.assertListEqual(
                         encoded_sequences_batch_padded_1[key],
                         encoded_sequences_batch_padded_2[key],
@@ -2933,11 +2933,11 @@ class TokenizerTesterMixin:
                 # Test encode_plus for pretokenized inputs
                 output = tokenizer.encode_plus(token_sequence, is_split_into_words=True, add_special_tokens=False)
                 output_sequence = tokenizer.encode_plus(sequence, add_special_tokens=False)
-                for key in output.keys():
+                for key in output:
                     self.assertEqual(output[key], output_sequence[key])
                 output = tokenizer.encode_plus(token_sequence, is_split_into_words=True, add_special_tokens=True)
                 output_sequence = tokenizer.encode_plus(sequence, add_special_tokens=True)
-                for key in output.keys():
+                for key in output:
                     self.assertEqual(output[key], output_sequence[key])
 
                 # Test batch_encode_plus for pretokenized inputs
@@ -2951,7 +2951,7 @@ class TokenizerTesterMixin:
                 output_sequence = tokenizer.batch_encode_plus(
                     sequence_batch_cleaned_up_spaces, add_special_tokens=False
                 )
-                for key in output.keys():
+                for key in output:
                     self.assertEqual(output[key], output_sequence[key])
                 output = tokenizer.batch_encode_plus(
                     token_sequence_batch, is_split_into_words=True, add_special_tokens=True
@@ -2959,7 +2959,7 @@ class TokenizerTesterMixin:
                 output_sequence = tokenizer.batch_encode_plus(
                     sequence_batch_cleaned_up_spaces, add_special_tokens=True
                 )
-                for key in output.keys():
+                for key in output:
                     self.assertEqual(output[key], output_sequence[key])
 
                 # Test encode for pretokenized inputs pairs
@@ -2979,13 +2979,13 @@ class TokenizerTesterMixin:
                     token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=False
                 )
                 output_sequence = tokenizer.encode_plus(sequence, sequence, add_special_tokens=False)
-                for key in output.keys():
+                for key in output:
                     self.assertEqual(output[key], output_sequence[key])
                 output = tokenizer.encode_plus(
                     token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=True
                 )
                 output_sequence = tokenizer.encode_plus(sequence, sequence, add_special_tokens=True)
-                for key in output.keys():
+                for key in output:
                     self.assertEqual(output[key], output_sequence[key])
 
                 # Test batch_encode_plus for pretokenized inputs pairs
@@ -3003,7 +3003,7 @@ class TokenizerTesterMixin:
                 output_sequence = tokenizer.batch_encode_plus(
                     sequence_pair_batch_cleaned_up_spaces, add_special_tokens=False
                 )
-                for key in output.keys():
+                for key in output:
                     self.assertEqual(output[key], output_sequence[key])
                 output = tokenizer.batch_encode_plus(
                     token_sequence_pair_batch, is_split_into_words=True, add_special_tokens=True
@@ -3011,7 +3011,7 @@ class TokenizerTesterMixin:
                 output_sequence = tokenizer.batch_encode_plus(
                     sequence_pair_batch_cleaned_up_spaces, add_special_tokens=True
                 )
-                for key in output.keys():
+                for key in output:
                     self.assertEqual(output[key], output_sequence[key])
 
     def test_prepare_for_model(self):
@@ -3703,14 +3703,14 @@ class TokenizerTesterMixin:
                 # Test encode_plus for pretokenized inputs
                 output_r = tokenizer_r.encode_plus(pretokenized_input_simple, **kwargs)
                 output_p = tokenizer_p.encode_plus(pretokenized_input_simple, **kwargs)
-                for key in output_p.keys():
+                for key in output_p:
                     self.assertEqual(output_p[key], output_r[key])
 
                 # Test batch_encode_plus for pretokenized inputs
                 input_batch = ([pretokenized_input_simple] * 2) + [pretokenized_input_simple + pretokenized_input_pair]
                 output_r = tokenizer_r.batch_encode_plus(input_batch, **batch_kwargs)
                 output_p = tokenizer_p.batch_encode_plus(input_batch, **batch_kwargs)
-                for key in output_p.keys():
+                for key in output_p:
                     self.assertEqual(output_p[key], output_r[key])
 
                 # Test encode for pretokenized inputs pairs
@@ -3725,7 +3725,7 @@ class TokenizerTesterMixin:
                 # Test encode_plus for pretokenized inputs
                 output_r = tokenizer_r.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs)
                 output_p = tokenizer_p.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs)
-                for key in output_p.keys():
+                for key in output_p:
                     self.assertEqual(output_p[key], output_r[key])
 
                 # Test batch_encode_plus for pretokenized inputs
@@ -3735,7 +3735,7 @@ class TokenizerTesterMixin:
                 ]
                 output_r = tokenizer_r.batch_encode_plus(input_batch_pair, **batch_kwargs)
                 output_p = tokenizer_p.batch_encode_plus(input_batch_pair, **batch_kwargs)
-                for key in output_p.keys():
+                for key in output_p:
                     self.assertEqual(output_p[key], output_r[key])
 
     def test_create_token_type_ids(self):
@@ -4125,7 +4125,7 @@ class TokenizerTesterMixin:
                     add_special_tokens=True,
                 )
 
-                for key in tokens_p.keys():
+                for key in tokens_p:
                     self.assertEqual(tokens_r[key], tokens_p[key])
 
                 if "token_type_ids" in tokens_r:
@@ -4161,7 +4161,7 @@ class TokenizerTesterMixin:
                     # encode_plus()
                     no_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=False)
                     with_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=True)
-                    for key in no_special_tokens.keys():
+                    for key in no_special_tokens:
                         self.assertEqual(
                             len(no_special_tokens[key]),
                             len(with_special_tokens[key]) - simple_num_special_tokens_to_add,
@@ -4170,7 +4170,7 @@ class TokenizerTesterMixin:
                     # # batch_encode_plus
                     no_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=False)
                     with_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=True)
-                    for key in no_special_tokens.keys():
+                    for key in no_special_tokens:
                         for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]):
                             self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add)
 
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 3c2cc0ce89..a0e2911789 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -555,7 +555,7 @@ if is_torch_available():
         output_dir=None,
         **kwargs,
     ):
-        label_names = kwargs.get("label_names", None)
+        label_names = kwargs.get("label_names")
         gradient_checkpointing = kwargs.get("gradient_checkpointing", False)
         train_dataset = RegressionDataset(length=train_len, label_names=label_names)
         eval_dataset = RegressionDataset(length=eval_len, label_names=label_names)
@@ -1340,7 +1340,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         trainer.train()
         args = TrainingArguments(tmp_dir, report_to=[])
         dict1, dict2 = args.to_dict(), trainer.args.to_dict()
-        for key in dict1.keys():
+        for key in dict1:
             # Logging dir can be slightly different as they default to something with the time.
             if key != "logging_dir":
                 self.assertEqual(dict1[key], dict2[key])
diff --git a/tests/utils/test_hf_argparser.py b/tests/utils/test_hf_argparser.py
index 27ecb84306..2923deeb61 100644
--- a/tests/utils/test_hf_argparser.py
+++ b/tests/utils/test_hf_argparser.py
@@ -154,7 +154,7 @@ class HfArgumentParserTest(unittest.TestCase):
 
             # Choices with mixed type have custom function as "type"
             # So we need to compare results directly for equality
-            if xx.get("choices", None) and yy.get("choices", None):
+            if xx.get("choices") and yy.get("choices"):
                 for expected_choice in yy["choices"] + xx["choices"]:
                     self.assertEqual(xx["type"](expected_choice), yy["type"](expected_choice))
                 del xx["type"], yy["type"]
diff --git a/tests/utils/test_import_structure.py b/tests/utils/test_import_structure.py
index d6382a5fdf..1a4588a0d3 100644
--- a/tests/utils/test_import_structure.py
+++ b/tests/utils/test_import_structure.py
@@ -71,7 +71,7 @@ class TestImportStructures(unittest.TestCase):
         }
 
         self.assertEqual(len(import_structure.keys()), len(valid_frozensets.keys()))
-        for _frozenset in valid_frozensets.keys():
+        for _frozenset in valid_frozensets:
             self.assertTrue(_frozenset in import_structure)
             self.assertListEqual(list(import_structure[_frozenset].keys()), list(valid_frozensets[_frozenset].keys()))
             for module, objects in valid_frozensets[_frozenset].items():
diff --git a/utils/add_pipeline_model_mapping_to_test.py b/utils/add_pipeline_model_mapping_to_test.py
index f036e44495..636f018eb5 100644
--- a/utils/add_pipeline_model_mapping_to_test.py
+++ b/utils/add_pipeline_model_mapping_to_test.py
@@ -37,7 +37,7 @@ from tests.test_pipeline_mixin import pipeline_test_mapping
 
 
 PIPELINE_TEST_MAPPING = {}
-for task in pipeline_test_mapping.keys():
+for task in pipeline_test_mapping:
     PIPELINE_TEST_MAPPING[task] = {"pt": None, "tf": None}
 
 
diff --git a/utils/check_copies.py b/utils/check_copies.py
index d53d1fc9ca..e48163421a 100644
--- a/utils/check_copies.py
+++ b/utils/check_copies.py
@@ -731,8 +731,8 @@ def is_copy_consistent(
 
         # Below, we change some names in `theoretical_code_blocks` and `observed_code_blocks`. These mappings map the
         # original names to the modified names: this is used to restore the original order of the code blocks.
-        name_mappings_1 = {k: k for k in theoretical_code_blocks.keys()}
-        name_mappings_2 = {k: k for k in observed_code_blocks.keys()}
+        name_mappings_1 = {k: k for k in theoretical_code_blocks}
+        name_mappings_2 = {k: k for k in observed_code_blocks}
 
         # Update code blocks' name and content:
         #   If `"# Ignore copy"` is found in a block of the observed code:
diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py
index dd10a7d9b1..a9ca4f1d56 100644
--- a/utils/check_docstrings.py
+++ b/utils/check_docstrings.py
@@ -913,7 +913,7 @@ def match_docstring_with_signature(obj: Any) -> Optional[tuple[str, str]]:
         missing = set(signature.keys()) - set(old_arguments)
         new_param_docs.extend([arguments[name] for name in missing if len(arguments[name]) > 0])
     else:
-        new_param_docs = [arguments[name] for name in signature.keys() if len(arguments[name]) > 0]
+        new_param_docs = [arguments[name] for name in signature if len(arguments[name]) > 0]
     new_doc_arg = "\n".join(new_param_docs)
 
     return old_doc_arg, new_doc_arg
diff --git a/utils/check_dummies.py b/utils/check_dummies.py
index f429e8b14a..b9757bdb02 100644
--- a/utils/check_dummies.py
+++ b/utils/check_dummies.py
@@ -199,8 +199,7 @@ def check_dummies(overwrite: bool = False):
     # Locate actual dummy modules and read their content.
     path = os.path.join(PATH_TO_TRANSFORMERS, "utils")
     dummy_file_paths = {
-        backend: os.path.join(path, f"dummy_{short_names.get(backend, backend)}_objects.py")
-        for backend in dummy_files.keys()
+        backend: os.path.join(path, f"dummy_{short_names.get(backend, backend)}_objects.py") for backend in dummy_files
     }
 
     actual_dummies = {}
@@ -212,7 +211,7 @@ def check_dummies(overwrite: bool = False):
             actual_dummies[backend] = ""
 
     # Compare actual with what they should be.
-    for backend in dummy_files.keys():
+    for backend in dummy_files:
         if dummy_files[backend] != actual_dummies[backend]:
             if overwrite:
                 print(
diff --git a/utils/check_inits.py b/utils/check_inits.py
index b392d09a89..bc211baa9e 100644
--- a/utils/check_inits.py
+++ b/utils/check_inits.py
@@ -257,7 +257,7 @@ def analyze_results(import_dict_objects: dict[str, list[str]], type_hint_objects
 
     errors = []
     # Find all errors.
-    for key in import_dict_objects.keys():
+    for key in import_dict_objects:
         # Duplicate imports in any half.
         duplicate_imports = find_duplicates(import_dict_objects[key])
         if duplicate_imports:
diff --git a/utils/check_modular_conversion.py b/utils/check_modular_conversion.py
index 15dcf418d6..dad6b1f0a2 100644
--- a/utils/check_modular_conversion.py
+++ b/utils/check_modular_conversion.py
@@ -71,7 +71,7 @@ def compare_files(modular_file_path, show_diff=True):
     # Generate the expected modeling content
     generated_modeling_content = convert_modular_file(modular_file_path)
     diff = 0
-    for file_type in generated_modeling_content.keys():
+    for file_type in generated_modeling_content:
         diff += process_file(modular_file_path, generated_modeling_content, file_type, show_diff)
     return diff
 
diff --git a/utils/create_dependency_mapping.py b/utils/create_dependency_mapping.py
index 7590fda98c..84fec10e93 100644
--- a/utils/create_dependency_mapping.py
+++ b/utils/create_dependency_mapping.py
@@ -17,7 +17,7 @@ def topological_sort(dependencies: dict) -> list[list[str]]:
     """
 
     # Nodes are the name of the models to convert (we only add those to the graph)
-    nodes = {node.rsplit("modular_", 1)[1].replace(".py", "") for node in dependencies.keys()}
+    nodes = {node.rsplit("modular_", 1)[1].replace(".py", "") for node in dependencies}
     # This will be a graph from models to convert, to models to convert that should be converted before (as they are a dependency)
     graph = {}
     name_mapping = {}
diff --git a/utils/create_dummy_models.py b/utils/create_dummy_models.py
index e151b37d52..c8e2bd4557 100644
--- a/utils/create_dummy_models.py
+++ b/utils/create_dummy_models.py
@@ -424,7 +424,7 @@ def get_tiny_config(config_class, model_class=None, **model_tester_kwargs):
     # `text_model_tester` via `text_kwargs`. The same trick is also necessary for `Flava`.
 
     if "vocab_size" in model_tester_kwargs:
-        if "text_kwargs" in inspect.signature(model_tester_class.__init__).parameters.keys():
+        if "text_kwargs" in inspect.signature(model_tester_class.__init__).parameters:
             vocab_size = model_tester_kwargs.pop("vocab_size")
             model_tester_kwargs["text_kwargs"] = {"vocab_size": vocab_size}
 
diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py
index 3b447dc7bf..ade3589868 100644
--- a/utils/modular_model_converter.py
+++ b/utils/modular_model_converter.py
@@ -57,7 +57,7 @@ def get_module_source_from_name(module_name: str) -> str:
 
 def preserve_case_replace(text, patterns: dict, default_name: str):
     # Create a regex pattern to match all variations
-    regex_pattern = "|".join(re.escape(key) for key in patterns.keys())
+    regex_pattern = "|".join(re.escape(key) for key in patterns)
     compiled_regex = re.compile(f"(?<![a-z0-9])({regex_pattern})(.|$)", re.IGNORECASE | re.DOTALL)
 
     def replace(match):
@@ -445,7 +445,7 @@ def find_all_dependencies(
             # Add the dependencies
             all_dependencies.add(current)
             all_dependencies_with_parent += [(current, parents[current])]
-            if current in dependency_mapping.keys():
+            if current in dependency_mapping:
                 # Update dependency queue
                 dependency_queue.extend(dependency_mapping[current])
                 parents.update(dict.fromkeys(dependency_mapping[current], current))
@@ -670,7 +670,7 @@ class ModuleMapper(CSTVisitor, ABC):
         the recursive mapping, i.e. `recursive_dependencies = {"test": {"bar", "foo"}, "bar": {"foo}}`.
         """
         recursive_dependencies = {}
-        for object_name in self.object_dependency_mapping.keys():
+        for object_name in self.object_dependency_mapping:
             all_dependencies = find_all_dependencies(self.object_dependency_mapping, start_entity=object_name)
             recursive_dependencies[object_name] = all_dependencies
         return recursive_dependencies
@@ -682,7 +682,7 @@ class ModuleMapper(CSTVisitor, ABC):
         new_dependencies = dependencies.copy()
         # Go through the set of dependencies
         for dep in tuple(dependencies):
-            if dep in self.object_recursive_dependency_mapping.keys():
+            if dep in self.object_recursive_dependency_mapping:
                 new_dependencies.update(self.object_recursive_dependency_mapping[dep])
         return new_dependencies
 
@@ -790,9 +790,7 @@ class ModelFileMapper(ModuleMapper):
         """
         # Add/overwrite all needed function nodes and dependencies
         self.functions.update(functions)
-        self.object_dependency_mapping.update(
-            {obj: dep for obj, dep in object_mapping.items() if obj in functions.keys()}
-        )
+        self.object_dependency_mapping.update({obj: dep for obj, dep in object_mapping.items() if obj in functions})
         # Add them to global nodes
         self.global_nodes.update(self.functions)
 
@@ -1150,7 +1148,7 @@ def get_needed_imports(body: dict[str, dict], all_imports: list[cst.CSTNode]) ->
                 import_ref_count[name] = max(ref_count, import_ref_count[name])
     # Similar imports may be redefined, and only used between their 1st and 2nd definition so if we already have
     # a ref count > 0 at any point, the imports is actually used
-    unused_imports = {name for name, count in import_ref_count.items() if count <= 0 or name in body.keys()}
+    unused_imports = {name for name, count in import_ref_count.items() if count <= 0 or name in body}
 
     imports_to_keep = []
     # We need to keep track of which names were already imported, because some import may be duplicated from multiple sources
@@ -1494,8 +1492,8 @@ class ModularFileMapper(ModuleMapper):
             final_name_mapping[file] = get_lowercase_name(final_name)
 
         # Check we are not missing imported files
-        for file in self.model_specific_modules.keys():
-            if file not in final_name_mapping.keys():
+        for file in self.model_specific_modules:
+            if file not in final_name_mapping:
                 final_name_mapping[file] = self.model_name
 
         return final_name_mapping
@@ -1614,7 +1612,7 @@ def get_class_node_and_dependencies(
         nodes_to_add = {
             dep: (relative_dependency_order[dep], modular_mapper.global_nodes[dep])
             for dep in all_dependencies_to_add
-            if dep not in file_to_update.keys()
+            if dep not in file_to_update
         }
 
     # Add the class node itself to the nodes to add
@@ -1710,7 +1708,7 @@ def convert_modular_file(modular_file):
 
 
 def save_modeling_file(modular_file, converted_file):
-    for file_type in converted_file.keys():
+    for file_type in converted_file:
         file_name_prefix = file_type.split("*")[0]
         file_name_suffix = file_type.split("*")[-1] if "*" in file_type else ""
         new_file_name = modular_file.replace("modular_", f"{file_name_prefix}_").replace(
diff --git a/utils/notification_service.py b/utils/notification_service.py
index 2848c5670c..e751794539 100644
--- a/utils/notification_service.py
+++ b/utils/notification_service.py
@@ -1199,7 +1199,7 @@ if __name__ == "__main__":
 
     unclassified_model_failures = []
 
-    for matrix_name in matrix_job_results.keys():
+    for matrix_name in matrix_job_results:
         for artifact_path_dict in available_artifacts[f"{report_name_prefix}_{matrix_name}_test_reports"].paths:
             path = artifact_path_dict["path"]
             artifact_gpu = artifact_path_dict["gpu"]
@@ -1323,10 +1323,10 @@ if __name__ == "__main__":
             "failures": {},
             "job_link": {},
         }
-        for key in additional_files.keys()
+        for key in additional_files
     }
 
-    for key in additional_results.keys():
+    for key in additional_results:
         # If a whole suite of test fails, the artifact isn't available.
         if additional_files[key] not in available_artifacts:
             additional_results[key]["error"] = True
diff --git a/utils/split_doctest_jobs.py b/utils/split_doctest_jobs.py
index 7e670840b7..fd887394de 100644
--- a/utils/split_doctest_jobs.py
+++ b/utils/split_doctest_jobs.py
@@ -69,7 +69,7 @@ if __name__ == "__main__":
         raw_test_collection_map[file_dir].append(file)
 
     refined_test_collection_map = {}
-    for file_dir in raw_test_collection_map.keys():
+    for file_dir in raw_test_collection_map:
         if file_dir in ["docs/source/en/model_doc", "docs/source/en/tasks"]:
             for file in raw_test_collection_map[file_dir]:
                 refined_test_collection_map[file] = file