From 1d30ec95c7a3656972a394d28a86efdb6ca4cdab Mon Sep 17 00:00:00 2001
From: Bhadresh Savani <bhadreshpsavani@gmail.com>
Date: Mon, 26 Apr 2021 17:24:31 +0100
Subject: [PATCH] [Examples] Fixes inconsistency around eval vs val and predict
 vs test (#11380)

* added changes for uniformity

* modified files

* corrected typo

* fixed qa scripts

* fix typos

* fixed predict typo in qa no trainer

* fixed test file

* reverted trainer changes

* reverted trainer changes in custom exmaples

* updated readme

* added changes in deepspeed test

* added changes for predict and eval
---
 examples/pytorch/README.md                    |  4 +-
 examples/pytorch/language-modeling/run_clm.py | 12 ++--
 examples/pytorch/language-modeling/run_mlm.py | 12 ++--
 examples/pytorch/language-modeling/run_plm.py | 12 ++--
 examples/pytorch/multiple-choice/run_swag.py  | 12 ++--
 examples/pytorch/question-answering/run_qa.py | 46 +++++++--------
 .../question-answering/run_qa_beam_search.py  | 44 ++++++++-------
 .../run_qa_beam_search_no_trainer.py          | 54 +++++++++---------
 .../question-answering/run_qa_no_trainer.py   | 50 ++++++++---------
 .../pytorch/question-answering/trainer_qa.py  | 14 ++---
 .../summarization/run_summarization.py        | 56 ++++++++++---------
 .../pytorch/text-classification/run_glue.py   | 42 +++++++-------
 .../pytorch/text-classification/run_xnli.py   | 42 +++++++-------
 .../pytorch/token-classification/run_ner.py   | 34 +++++------
 .../pytorch/translation/run_translation.py    | 56 ++++++++++---------
 .../run_text_classification.py                | 32 +++++------
 .../run_{{cookiecutter.example_shortcut}}.py  | 36 ++++++------
 tests/deepspeed/test_deepspeed.py             |  4 +-
 tests/extended/test_trainer_ext.py            |  2 +-
 19 files changed, 288 insertions(+), 276 deletions(-)

diff --git a/examples/pytorch/README.md b/examples/pytorch/README.md
index 541b5c7c84..7fb888b27a 100644
--- a/examples/pytorch/README.md
+++ b/examples/pytorch/README.md
@@ -50,8 +50,8 @@ For example here is how to truncate all three splits to just 50 samples each:
 ```
 examples/pytorch/token-classification/run_ner.py \
 --max_train_samples 50 \
---max_val_samples 50 \
---max_test_samples 50 \
+--max_eval_samples 50 \
+--max_predict_samples 50 \
 [...]
 ```
 
diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py
index ea4b4f0934..ad9acaf196 100755
--- a/examples/pytorch/language-modeling/run_clm.py
+++ b/examples/pytorch/language-modeling/run_clm.py
@@ -126,10 +126,10 @@ class DataTrainingArguments:
             "value if set."
         },
     )
-    max_val_samples: Optional[int] = field(
+    max_eval_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
             "value if set."
         },
     )
@@ -397,8 +397,8 @@ def main():
         if "validation" not in tokenized_datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = lm_datasets["validation"]
-        if data_args.max_val_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
+        if data_args.max_eval_samples is not None:
+            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
 
     # Initialize our Trainer
     trainer = Trainer(
@@ -439,8 +439,8 @@ def main():
 
         metrics = trainer.evaluate()
 
-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
+        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
         perplexity = math.exp(metrics["eval_loss"])
         metrics["perplexity"] = perplexity
 
diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py
index d0932e5f93..f16082ceeb 100755
--- a/examples/pytorch/language-modeling/run_mlm.py
+++ b/examples/pytorch/language-modeling/run_mlm.py
@@ -157,10 +157,10 @@ class DataTrainingArguments:
             "value if set."
         },
     )
-    max_val_samples: Optional[int] = field(
+    max_eval_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
             "value if set."
         },
     )
@@ -419,8 +419,8 @@ def main():
         if "validation" not in tokenized_datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = tokenized_datasets["validation"]
-        if data_args.max_val_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
+        if data_args.max_eval_samples is not None:
+            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
 
     # Data collator
     # This one will take care of randomly masking the tokens.
@@ -468,8 +468,8 @@ def main():
 
         metrics = trainer.evaluate()
 
-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
+        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
         perplexity = math.exp(metrics["eval_loss"])
         metrics["perplexity"] = perplexity
 
diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py
index d713614472..5a8be42bf5 100755
--- a/examples/pytorch/language-modeling/run_plm.py
+++ b/examples/pytorch/language-modeling/run_plm.py
@@ -154,10 +154,10 @@ class DataTrainingArguments:
             "value if set."
         },
     )
-    max_val_samples: Optional[int] = field(
+    max_eval_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
             "value if set."
         },
     )
@@ -397,8 +397,8 @@ def main():
         if "validation" not in tokenized_datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = tokenized_datasets["validation"]
-        if data_args.max_val_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
+        if data_args.max_eval_samples is not None:
+            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
 
     # Data collator
     data_collator = DataCollatorForPermutationLanguageModeling(
@@ -444,8 +444,8 @@ def main():
 
         metrics = trainer.evaluate()
 
-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
+        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
         perplexity = math.exp(metrics["eval_loss"])
         metrics["perplexity"] = perplexity
 
diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py
index e73e13492f..0bc0ded2d8 100755
--- a/examples/pytorch/multiple-choice/run_swag.py
+++ b/examples/pytorch/multiple-choice/run_swag.py
@@ -127,10 +127,10 @@ class DataTrainingArguments:
             "value if set."
         },
     )
-    max_val_samples: Optional[int] = field(
+    max_eval_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
             "value if set."
         },
     )
@@ -363,8 +363,8 @@ def main():
         if "validation" not in datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = datasets["validation"]
-        if data_args.max_val_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
+        if data_args.max_eval_samples is not None:
+            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
         eval_dataset = eval_dataset.map(
             preprocess_function,
             batched=True,
@@ -422,8 +422,8 @@ def main():
         logger.info("*** Evaluate ***")
 
         metrics = trainer.evaluate()
-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
+        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
 
         trainer.log_metrics("eval", metrics)
         trainer.save_metrics("eval", metrics)
diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py
index 9494569505..cd1d250c4a 100755
--- a/examples/pytorch/question-answering/run_qa.py
+++ b/examples/pytorch/question-answering/run_qa.py
@@ -133,17 +133,17 @@ class DataTrainingArguments:
             "value if set."
         },
     )
-    max_val_samples: Optional[int] = field(
+    max_eval_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
             "value if set."
         },
     )
-    max_test_samples: Optional[int] = field(
+    max_predict_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of test examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
             "value if set."
         },
     )
@@ -468,9 +468,9 @@ def main():
         if "validation" not in datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_examples = datasets["validation"]
-        if data_args.max_val_samples is not None:
+        if data_args.max_eval_samples is not None:
             # We will select sample from whole data
-            eval_examples = eval_examples.select(range(data_args.max_val_samples))
+            eval_examples = eval_examples.select(range(data_args.max_eval_samples))
         # Validation Feature Creation
         eval_dataset = eval_examples.map(
             prepare_validation_features,
@@ -479,28 +479,28 @@ def main():
             remove_columns=column_names,
             load_from_cache_file=not data_args.overwrite_cache,
         )
-        if data_args.max_val_samples is not None:
+        if data_args.max_eval_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
+            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
 
     if training_args.do_predict:
         if "test" not in datasets:
             raise ValueError("--do_predict requires a test dataset")
-        test_examples = datasets["test"]
-        if data_args.max_test_samples is not None:
+        predict_examples = datasets["test"]
+        if data_args.max_predict_samples is not None:
             # We will select sample from whole data
-            test_examples = test_examples.select(range(data_args.max_test_samples))
-        # Test Feature Creation
-        test_dataset = test_examples.map(
+            predict_examples = predict_examples.select(range(data_args.max_predict_samples))
+        # Predict Feature Creation
+        predict_dataset = predict_examples.map(
             prepare_validation_features,
             batched=True,
             num_proc=data_args.preprocessing_num_workers,
             remove_columns=column_names,
             load_from_cache_file=not data_args.overwrite_cache,
         )
-        if data_args.max_test_samples is not None:
+        if data_args.max_predict_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            test_dataset = test_dataset.select(range(data_args.max_test_samples))
+            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
 
     # Data collator
     # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
@@ -581,8 +581,8 @@ def main():
         logger.info("*** Evaluate ***")
         metrics = trainer.evaluate()
 
-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
+        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
 
         trainer.log_metrics("eval", metrics)
         trainer.save_metrics("eval", metrics)
@@ -590,14 +590,16 @@ def main():
     # Prediction
     if training_args.do_predict:
         logger.info("*** Predict ***")
-        results = trainer.predict(test_dataset, test_examples)
+        results = trainer.predict(predict_dataset, predict_examples)
         metrics = results.metrics
 
-        max_test_samples = data_args.max_test_samples if data_args.max_test_samples is not None else len(test_dataset)
-        metrics["test_samples"] = min(max_test_samples, len(test_dataset))
+        max_predict_samples = (
+            data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
+        )
+        metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))
 
-        trainer.log_metrics("test", metrics)
-        trainer.save_metrics("test", metrics)
+        trainer.log_metrics("predict", metrics)
+        trainer.save_metrics("predict", metrics)
 
     if training_args.push_to_hub:
         trainer.push_to_hub()
diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py
index ca8d620fe0..ffefee12f7 100755
--- a/examples/pytorch/question-answering/run_qa_beam_search.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search.py
@@ -132,17 +132,17 @@ class DataTrainingArguments:
             "value if set."
         },
     )
-    max_val_samples: Optional[int] = field(
+    max_eval_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
             "value if set."
         },
     )
-    max_test_samples: Optional[int] = field(
+    max_predict_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of test examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
             "value if set."
         },
     )
@@ -504,9 +504,9 @@ def main():
         if "validation" not in datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_examples = datasets["validation"]
-        if data_args.max_val_samples is not None:
+        if data_args.max_eval_samples is not None:
             # Selecting Eval Samples from Dataset
-            eval_examples = eval_examples.select(range(data_args.max_val_samples))
+            eval_examples = eval_examples.select(range(data_args.max_eval_samples))
         # Create Features from Eval Dataset
         eval_dataset = eval_examples.map(
             prepare_validation_features,
@@ -515,28 +515,28 @@ def main():
             remove_columns=column_names,
             load_from_cache_file=not data_args.overwrite_cache,
         )
-        if data_args.max_val_samples is not None:
+        if data_args.max_eval_samples is not None:
             # Selecting Samples from Dataset again since Feature Creation might increase samples size
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
+            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
 
     if training_args.do_predict:
         if "test" not in datasets:
             raise ValueError("--do_predict requires a test dataset")
-        test_examples = datasets["test"]
-        if data_args.max_test_samples is not None:
+        predict_examples = datasets["test"]
+        if data_args.max_predict_samples is not None:
             # We will select sample from whole data
-            test_examples = test_examples.select(range(data_args.max_test_samples))
+            predict_examples = predict_examples.select(range(data_args.max_predict_samples))
         # Test Feature Creation
-        test_dataset = test_examples.map(
+        predict_dataset = predict_examples.map(
             prepare_validation_features,
             batched=True,
             num_proc=data_args.preprocessing_num_workers,
             remove_columns=column_names,
             load_from_cache_file=not data_args.overwrite_cache,
         )
-        if data_args.max_test_samples is not None:
+        if data_args.max_predict_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            test_dataset = test_dataset.select(range(data_args.max_test_samples))
+            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
 
     # Data collator
     # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
@@ -620,8 +620,8 @@ def main():
         logger.info("*** Evaluate ***")
         metrics = trainer.evaluate()
 
-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
+        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
 
         trainer.log_metrics("eval", metrics)
         trainer.save_metrics("eval", metrics)
@@ -629,14 +629,16 @@ def main():
     # Prediction
     if training_args.do_predict:
         logger.info("*** Predict ***")
-        results = trainer.predict(test_dataset, test_examples)
+        results = trainer.predict(predict_dataset, predict_examples)
         metrics = results.metrics
 
-        max_test_samples = data_args.max_test_samples if data_args.max_test_samples is not None else len(test_dataset)
-        metrics["test_samples"] = min(max_test_samples, len(test_dataset))
+        max_predict_samples = (
+            data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
+        )
+        metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))
 
-        trainer.log_metrics("test", metrics)
-        trainer.save_metrics("test", metrics)
+        trainer.log_metrics("predict", metrics)
+        trainer.save_metrics("predict", metrics)
 
     if training_args.push_to_hub:
         trainer.push_to_hub()
diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
index ca0d60c0f8..f1d5a2d030 100644
--- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@@ -183,20 +183,20 @@ def parse_args():
         "value if set.",
     )
     parser.add_argument(
-        "--max_val_samples",
+        "--max_eval_samples",
         type=int,
         default=None,
-        help="For debugging purposes or quicker training, truncate the number of validation examples to this "
+        help="For debugging purposes or quicker training, truncate the number of evaluation examples to this "
         "value if set.",
     )
     parser.add_argument(
         "--overwrite_cache", type=bool, default=False, help="Overwrite the cached training and evaluation sets"
     )
     parser.add_argument(
-        "--max_test_samples",
+        "--max_predict_samples",
         type=int,
         default=None,
-        help="For debugging purposes or quicker training, truncate the number of test examples to this",
+        help="For debugging purposes or quicker training, truncate the number of prediction examples to this",
     )
 
     args = parser.parse_args()
@@ -481,9 +481,9 @@ def main():
     if "validation" not in raw_datasets:
         raise ValueError("--do_eval requires a validation dataset")
     eval_examples = raw_datasets["validation"]
-    if args.max_val_samples is not None:
+    if args.max_eval_samples is not None:
         # We will select sample from whole data
-        eval_examples = eval_examples.select(range(args.max_val_samples))
+        eval_examples = eval_examples.select(range(args.max_eval_samples))
     # Validation Feature Creation
     eval_dataset = eval_examples.map(
         prepare_validation_features,
@@ -493,28 +493,28 @@ def main():
         load_from_cache_file=not args.overwrite_cache,
     )
 
-    if args.max_val_samples is not None:
+    if args.max_eval_samples is not None:
         # During Feature creation dataset samples might increase, we will select required samples again
-        eval_dataset = eval_dataset.select(range(args.max_val_samples))
+        eval_dataset = eval_dataset.select(range(args.max_eval_samples))
 
     if args.do_predict:
         if "test" not in raw_datasets:
             raise ValueError("--do_predict requires a test dataset")
-        test_examples = raw_datasets["test"]
-        if args.max_test_samples is not None:
+        predict_examples = raw_datasets["test"]
+        if args.max_predict_samples is not None:
             # We will select sample from whole data
-            test_examples = test_examples.select(range(args.max_test_samples))
-        # Test Feature Creation
-        test_dataset = test_examples.map(
+            predict_examples = predict_examples.select(range(args.max_predict_samples))
+        # Predict Feature Creation
+        predict_dataset = predict_examples.map(
             prepare_validation_features,
             batched=True,
             num_proc=args.preprocessing_num_workers,
             remove_columns=column_names,
             load_from_cache_file=not args.overwrite_cache,
         )
-        if args.max_test_samples is not None:
+        if args.max_predict_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            test_dataset = test_dataset.select(range(args.max_test_samples))
+            predict_dataset = predict_dataset.select(range(args.max_predict_samples))
 
     # Log a few random samples from the training set:
     for index in random.sample(range(len(train_dataset)), 3):
@@ -539,9 +539,9 @@ def main():
     eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)
 
     if args.do_predict:
-        test_dataset.set_format(type="torch", columns=["attention_mask", "input_ids", "token_type_ids"])
-        test_dataloader = DataLoader(
-            test_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
+        predict_dataset.set_format(type="torch", columns=["attention_mask", "input_ids", "token_type_ids"])
+        predict_dataloader = DataLoader(
+            predict_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
         )
 
     # Post-processing:
@@ -737,7 +737,7 @@ def main():
         all_end_top_log_probs = []
         all_end_top_index = []
         all_cls_logits = []
-        for step, batch in enumerate(test_dataloader):
+        for step, batch in enumerate(predict_dataloader):
             with torch.no_grad():
                 outputs = model(**batch)
                 start_top_log_probs = outputs.start_top_log_probs
@@ -762,10 +762,10 @@ def main():
         max_len = max([x.shape[1] for x in all_end_top_log_probs])  # Get the max_length of the tensor
 
         # concatenate all numpy arrays collected above
-        start_top_log_probs_concat = create_and_fill_np_array(all_start_top_log_probs, test_dataset, max_len)
-        start_top_index_concat = create_and_fill_np_array(all_start_top_index, test_dataset, max_len)
-        end_top_log_probs_concat = create_and_fill_np_array(all_end_top_log_probs, test_dataset, max_len)
-        end_top_index_concat = create_and_fill_np_array(all_end_top_index, test_dataset, max_len)
+        start_top_log_probs_concat = create_and_fill_np_array(all_start_top_log_probs, predict_dataset, max_len)
+        start_top_index_concat = create_and_fill_np_array(all_start_top_index, predict_dataset, max_len)
+        end_top_log_probs_concat = create_and_fill_np_array(all_end_top_log_probs, predict_dataset, max_len)
+        end_top_index_concat = create_and_fill_np_array(all_end_top_index, predict_dataset, max_len)
         all_cls_logits = np.concatenate(all_cls_logits, axis=0)
 
         # delete the list of numpy arrays
@@ -774,7 +774,7 @@ def main():
         del end_top_log_probs
         del end_top_index
 
-        test_dataset.set_format(type=None, columns=list(test_dataset.features.keys()))
+        predict_dataset.set_format(type=None, columns=list(predict_dataset.features.keys()))
         outputs_numpy = (
             start_top_log_probs_concat,
             start_top_index_concat,
@@ -783,9 +783,9 @@ def main():
             cls_logits,
         )
 
-        prediction = post_processing_function(test_examples, test_dataset, outputs_numpy)
-        test_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
-        logger.info(f"Test metrics: {test_metric}")
+        prediction = post_processing_function(predict_examples, predict_dataset, outputs_numpy)
+        predict_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
+        logger.info(f"Predict metrics: {predict_metric}")
 
     if args.output_dir is not None:
         accelerator.wait_for_everyone()
diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py
index 7a8b2215be..97e2c8b431 100755
--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@@ -205,20 +205,20 @@ def parse_args():
         "value if set.",
     )
     parser.add_argument(
-        "--max_val_samples",
+        "--max_eval_samples",
         type=int,
         default=None,
-        help="For debugging purposes or quicker training, truncate the number of validation examples to this "
+        help="For debugging purposes or quicker training, truncate the number of evaluation examples to this "
         "value if set.",
     )
     parser.add_argument(
         "--overwrite_cache", type=bool, default=False, help="Overwrite the cached training and evaluation sets"
     )
     parser.add_argument(
-        "--max_test_samples",
+        "--max_predict_samples",
         type=int,
         default=None,
-        help="For debugging purposes or quicker training, truncate the number of test examples to this",
+        help="For debugging purposes or quicker training, truncate the number of prediction examples to this",
     )
     parser.add_argument(
         "--model_type",
@@ -486,9 +486,9 @@ def main():
     if "validation" not in raw_datasets:
         raise ValueError("--do_eval requires a validation dataset")
     eval_examples = raw_datasets["validation"]
-    if args.max_val_samples is not None:
+    if args.max_eval_samples is not None:
         # We will select sample from whole data
-        eval_examples = eval_examples.select(range(args.max_val_samples))
+        eval_examples = eval_examples.select(range(args.max_eval_samples))
     # Validation Feature Creation
     eval_dataset = eval_examples.map(
         prepare_validation_features,
@@ -498,28 +498,28 @@ def main():
         load_from_cache_file=not args.overwrite_cache,
     )
 
-    if args.max_val_samples is not None:
+    if args.max_eval_samples is not None:
         # During Feature creation dataset samples might increase, we will select required samples again
-        eval_dataset = eval_dataset.select(range(args.max_val_samples))
+        eval_dataset = eval_dataset.select(range(args.max_eval_samples))
 
     if args.do_predict:
         if "test" not in raw_datasets:
             raise ValueError("--do_predict requires a test dataset")
-        test_examples = raw_datasets["test"]
-        if args.max_test_samples is not None:
+        predict_examples = raw_datasets["test"]
+        if args.max_predict_samples is not None:
             # We will select sample from whole data
-            test_examples = test_examples.select(range(args.max_test_samples))
-        # Test Feature Creation
-        test_dataset = test_examples.map(
+            predict_examples = predict_examples.select(range(args.max_predict_samples))
+        # Predict Feature Creation
+        predict_dataset = predict_examples.map(
             prepare_validation_features,
             batched=True,
             num_proc=args.preprocessing_num_workers,
             remove_columns=column_names,
             load_from_cache_file=not args.overwrite_cache,
         )
-        if args.max_test_samples is not None:
+        if args.max_predict_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            test_dataset = test_dataset.select(range(args.max_test_samples))
+            predict_dataset = predict_dataset.select(range(args.max_predict_samples))
 
     # Log a few random samples from the training set:
     for index in random.sample(range(len(train_dataset)), 3):
@@ -544,9 +544,9 @@ def main():
     eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)
 
     if args.do_predict:
-        test_dataset.set_format(type="torch", columns=["attention_mask", "input_ids", "token_type_ids"])
-        test_dataloader = DataLoader(
-            test_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
+        predict_dataset.set_format(type="torch", columns=["attention_mask", "input_ids", "token_type_ids"])
+        predict_dataloader = DataLoader(
+            predict_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
         )
 
     # Post-processing:
@@ -714,7 +714,7 @@ def main():
     if args.do_predict:
         all_start_logits = []
         all_end_logits = []
-        for step, batch in enumerate(test_dataloader):
+        for step, batch in enumerate(predict_dataloader):
             with torch.no_grad():
                 outputs = model(**batch)
                 start_logits = outputs.start_logits
@@ -729,19 +729,19 @@ def main():
 
         max_len = max([x.shape[1] for x in all_start_logits])  # Get the max_length of the tensor
         # concatenate the numpy array
-        start_logits_concat = create_and_fill_np_array(all_start_logits, test_dataset, max_len)
-        end_logits_concat = create_and_fill_np_array(all_end_logits, test_dataset, max_len)
+        start_logits_concat = create_and_fill_np_array(all_start_logits, predict_dataset, max_len)
+        end_logits_concat = create_and_fill_np_array(all_end_logits, predict_dataset, max_len)
 
         # delete the list of numpy arrays
         del all_start_logits
         del all_end_logits
 
         # Now we need to add extra columns which we removed for post processing
-        test_dataset.set_format(type=None, columns=list(test_dataset.features.keys()))
+        predict_dataset.set_format(type=None, columns=list(predict_dataset.features.keys()))
         outputs_numpy = (start_logits_concat, end_logits_concat)
-        prediction = post_processing_function(test_examples, test_dataset, outputs_numpy)
-        eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
-        logger.info(f"Test metrics: {eval_metric}")
+        prediction = post_processing_function(predict_examples, predict_dataset, outputs_numpy)
+        predict_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
+        logger.info(f"Predict metrics: {predict_metric}")
 
     if args.output_dir is not None:
         accelerator.wait_for_everyone()
diff --git a/examples/pytorch/question-answering/trainer_qa.py b/examples/pytorch/question-answering/trainer_qa.py
index 41699cd1df..36e2e544a7 100644
--- a/examples/pytorch/question-answering/trainer_qa.py
+++ b/examples/pytorch/question-answering/trainer_qa.py
@@ -66,16 +66,16 @@ class QuestionAnsweringTrainer(Trainer):
         self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics)
         return metrics
 
-    def predict(self, test_dataset, test_examples, ignore_keys=None):
-        test_dataloader = self.get_test_dataloader(test_dataset)
+    def predict(self, predict_dataset, predict_examples, ignore_keys=None):
+        predict_dataloader = self.get_test_dataloader(predict_dataset)
 
         # Temporarily disable metric computation, we will do it in the loop here.
         compute_metrics = self.compute_metrics
         self.compute_metrics = None
         try:
             output = self.prediction_loop(
-                test_dataloader,
-                description="Evaluation",
+                predict_dataloader,
+                description="Prediction",
                 # No point gathering the predictions if there are no metrics, otherwise we defer to
                 # self.args.prediction_loss_only
                 prediction_loss_only=True if compute_metrics is None else None,
@@ -87,7 +87,7 @@ class QuestionAnsweringTrainer(Trainer):
         if self.post_process_function is None or self.compute_metrics is None:
             return output
 
-        eval_preds = self.post_process_function(test_examples, test_dataset, output.predictions, "test")
-        metrics = self.compute_metrics(eval_preds)
+        predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict")
+        metrics = self.compute_metrics(predictions)
 
-        return PredictionOutput(predictions=eval_preds.predictions, label_ids=eval_preds.label_ids, metrics=metrics)
+        return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics)
diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py
index 25a117010e..841d7e9b58 100755
--- a/examples/pytorch/summarization/run_summarization.py
+++ b/examples/pytorch/summarization/run_summarization.py
@@ -178,17 +178,17 @@ class DataTrainingArguments:
             "value if set."
         },
     )
-    max_val_samples: Optional[int] = field(
+    max_eval_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
             "value if set."
         },
     )
-    max_test_samples: Optional[int] = field(
+    max_predict_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of test examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
             "value if set."
         },
     )
@@ -438,8 +438,8 @@ def main():
         if "validation" not in datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = datasets["validation"]
-        if data_args.max_val_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
+        if data_args.max_eval_samples is not None:
+            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
         eval_dataset = eval_dataset.map(
             preprocess_function,
             batched=True,
@@ -452,10 +452,10 @@ def main():
         max_target_length = data_args.val_max_target_length
         if "test" not in datasets:
             raise ValueError("--do_predict requires a test dataset")
-        test_dataset = datasets["test"]
-        if data_args.max_test_samples is not None:
-            test_dataset = test_dataset.select(range(data_args.max_test_samples))
-        test_dataset = test_dataset.map(
+        predict_dataset = datasets["test"]
+        if data_args.max_predict_samples is not None:
+            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+        predict_dataset = predict_dataset.map(
             preprocess_function,
             batched=True,
             num_proc=data_args.preprocessing_num_workers,
@@ -547,37 +547,39 @@ def main():
         metrics = trainer.evaluate(
             max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, metric_key_prefix="eval"
         )
-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
+        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
 
         trainer.log_metrics("eval", metrics)
         trainer.save_metrics("eval", metrics)
 
     if training_args.do_predict:
-        logger.info("*** Test ***")
+        logger.info("*** Predict ***")
 
-        test_results = trainer.predict(
-            test_dataset,
-            metric_key_prefix="test",
+        predict_results = trainer.predict(
+            predict_dataset,
+            metric_key_prefix="predict",
             max_length=data_args.val_max_target_length,
             num_beams=data_args.num_beams,
         )
-        metrics = test_results.metrics
-        max_test_samples = data_args.max_test_samples if data_args.max_test_samples is not None else len(test_dataset)
-        metrics["test_samples"] = min(max_test_samples, len(test_dataset))
+        metrics = predict_results.metrics
+        max_predict_samples = (
+            data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
+        )
+        metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))
 
-        trainer.log_metrics("test", metrics)
-        trainer.save_metrics("test", metrics)
+        trainer.log_metrics("predict", metrics)
+        trainer.save_metrics("predict", metrics)
 
         if trainer.is_world_process_zero():
             if training_args.predict_with_generate:
-                test_preds = tokenizer.batch_decode(
-                    test_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
+                predictions = tokenizer.batch_decode(
+                    predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
                 )
-                test_preds = [pred.strip() for pred in test_preds]
-                output_test_preds_file = os.path.join(training_args.output_dir, "test_generations.txt")
-                with open(output_test_preds_file, "w") as writer:
-                    writer.write("\n".join(test_preds))
+                predictions = [pred.strip() for pred in predictions]
+                output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt")
+                with open(output_prediction_file, "w") as writer:
+                    writer.write("\n".join(predictions))
 
     if training_args.push_to_hub:
         trainer.push_to_hub()
diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py
index eff048a65a..cd8c6a94ae 100755
--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@@ -100,17 +100,17 @@ class DataTrainingArguments:
             "value if set."
         },
     )
-    max_val_samples: Optional[int] = field(
+    max_eval_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
             "value if set."
         },
     )
-    max_test_samples: Optional[int] = field(
+    max_predict_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of test examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
             "value if set."
         },
     )
@@ -390,15 +390,15 @@ def main():
         if "validation" not in datasets and "validation_matched" not in datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = datasets["validation_matched" if data_args.task_name == "mnli" else "validation"]
-        if data_args.max_val_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
+        if data_args.max_eval_samples is not None:
+            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
 
     if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None:
         if "test" not in datasets and "test_matched" not in datasets:
             raise ValueError("--do_predict requires a test dataset")
-        test_dataset = datasets["test_matched" if data_args.task_name == "mnli" else "test"]
-        if data_args.max_test_samples is not None:
-            test_dataset = test_dataset.select(range(data_args.max_test_samples))
+        predict_dataset = datasets["test_matched" if data_args.task_name == "mnli" else "test"]
+        if data_args.max_predict_samples is not None:
+            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
 
     # Log a few random samples from the training set:
     if training_args.do_train:
@@ -483,32 +483,34 @@ def main():
         for eval_dataset, task in zip(eval_datasets, tasks):
             metrics = trainer.evaluate(eval_dataset=eval_dataset)
 
-            max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-            metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
+            max_eval_samples = (
+                data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+            )
+            metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
 
             trainer.log_metrics("eval", metrics)
             trainer.save_metrics("eval", metrics)
 
     if training_args.do_predict:
-        logger.info("*** Test ***")
+        logger.info("*** Predict ***")
 
         # Loop to handle MNLI double evaluation (matched, mis-matched)
         tasks = [data_args.task_name]
-        test_datasets = [test_dataset]
+        predict_datasets = [predict_dataset]
         if data_args.task_name == "mnli":
             tasks.append("mnli-mm")
-            test_datasets.append(datasets["test_mismatched"])
+            predict_datasets.append(datasets["test_mismatched"])
 
-        for test_dataset, task in zip(test_datasets, tasks):
+        for predict_dataset, task in zip(predict_datasets, tasks):
             # Removing the `label` columns because it contains -1 and Trainer won't like that.
-            test_dataset.remove_columns_("label")
-            predictions = trainer.predict(test_dataset=test_dataset).predictions
+            predict_dataset.remove_columns_("label")
+            predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions
             predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
 
-            output_test_file = os.path.join(training_args.output_dir, f"test_results_{task}.txt")
+            output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt")
             if trainer.is_world_process_zero():
-                with open(output_test_file, "w") as writer:
-                    logger.info(f"***** Test results {task} *****")
+                with open(output_predict_file, "w") as writer:
+                    logger.info(f"***** Predict results {task} *****")
                     writer.write("index\tprediction\n")
                     for index, item in enumerate(predictions):
                         if is_regression:
diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py
index 1c74e6e7bc..c1d8522c8d 100755
--- a/examples/pytorch/text-classification/run_xnli.py
+++ b/examples/pytorch/text-classification/run_xnli.py
@@ -84,17 +84,17 @@ class DataTrainingArguments:
             "value if set."
         },
     )
-    max_val_samples: Optional[int] = field(
+    max_eval_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
             "value if set."
         },
     )
-    max_test_samples: Optional[int] = field(
+    max_predict_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of test examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
             "value if set."
         },
     )
@@ -221,8 +221,8 @@ def main():
         label_list = eval_dataset.features["label"].names
 
     if training_args.do_predict:
-        test_dataset = load_dataset("xnli", model_args.language, split="test", cache_dir=model_args.cache_dir)
-        label_list = test_dataset.features["label"].names
+        predict_dataset = load_dataset("xnli", model_args.language, split="test", cache_dir=model_args.cache_dir)
+        label_list = predict_dataset.features["label"].names
 
     # Labels
     num_labels = len(label_list)
@@ -286,8 +286,8 @@ def main():
             logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
 
     if training_args.do_eval:
-        if data_args.max_val_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
+        if data_args.max_eval_samples is not None:
+            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
         eval_dataset = eval_dataset.map(
             preprocess_function,
             batched=True,
@@ -295,9 +295,9 @@ def main():
         )
 
     if training_args.do_predict:
-        if data_args.max_test_samples is not None:
-            test_dataset = test_dataset.select(range(data_args.max_test_samples))
-        test_dataset = test_dataset.map(
+        if data_args.max_predict_samples is not None:
+            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+        predict_dataset = predict_dataset.map(
             preprocess_function,
             batched=True,
             load_from_cache_file=not data_args.overwrite_cache,
@@ -360,8 +360,8 @@ def main():
         logger.info("*** Evaluate ***")
         metrics = trainer.evaluate(eval_dataset=eval_dataset)
 
-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
+        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
 
         trainer.log_metrics("eval", metrics)
         trainer.save_metrics("eval", metrics)
@@ -369,18 +369,20 @@ def main():
     # Prediction
     if training_args.do_predict:
         logger.info("*** Predict ***")
-        predictions, labels, metrics = trainer.predict(test_dataset)
+        predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")
 
-        max_test_samples = data_args.max_test_samples if data_args.max_test_samples is not None else len(test_dataset)
-        metrics["test_samples"] = min(max_test_samples, len(test_dataset))
+        max_predict_samples = (
+            data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
+        )
+        metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))
 
-        trainer.log_metrics("test", metrics)
-        trainer.save_metrics("test", metrics)
+        trainer.log_metrics("predict", metrics)
+        trainer.save_metrics("predict", metrics)
 
         predictions = np.argmax(predictions, axis=1)
-        output_test_file = os.path.join(training_args.output_dir, "test_predictions.txt")
+        output_predict_file = os.path.join(training_args.output_dir, "predictions.txt")
         if trainer.is_world_process_zero():
-            with open(output_test_file, "w") as writer:
+            with open(output_predict_file, "w") as writer:
                 writer.write("index\tprediction\n")
                 for index, item in enumerate(predictions):
                     item = label_list[item]
diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py
index 2907a78033..6ca2c591ae 100755
--- a/examples/pytorch/token-classification/run_ner.py
+++ b/examples/pytorch/token-classification/run_ner.py
@@ -128,17 +128,17 @@ class DataTrainingArguments:
             "value if set."
         },
     )
-    max_val_samples: Optional[int] = field(
+    max_eval_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
             "value if set."
         },
     )
-    max_test_samples: Optional[int] = field(
+    max_predict_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of test examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
             "value if set."
         },
     )
@@ -363,8 +363,8 @@ def main():
         if "validation" not in datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = datasets["validation"]
-        if data_args.max_val_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
+        if data_args.max_eval_samples is not None:
+            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
         eval_dataset = eval_dataset.map(
             tokenize_and_align_labels,
             batched=True,
@@ -375,10 +375,10 @@ def main():
     if training_args.do_predict:
         if "test" not in datasets:
             raise ValueError("--do_predict requires a test dataset")
-        test_dataset = datasets["test"]
-        if data_args.max_test_samples is not None:
-            test_dataset = test_dataset.select(range(data_args.max_test_samples))
-        test_dataset = test_dataset.map(
+        predict_dataset = datasets["test"]
+        if data_args.max_predict_samples is not None:
+            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+        predict_dataset = predict_dataset.map(
             tokenize_and_align_labels,
             batched=True,
             num_proc=data_args.preprocessing_num_workers,
@@ -462,8 +462,8 @@ def main():
 
         metrics = trainer.evaluate()
 
-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
+        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
 
         trainer.log_metrics("eval", metrics)
         trainer.save_metrics("eval", metrics)
@@ -472,7 +472,7 @@ def main():
     if training_args.do_predict:
         logger.info("*** Predict ***")
 
-        predictions, labels, metrics = trainer.predict(test_dataset)
+        predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")
         predictions = np.argmax(predictions, axis=2)
 
         # Remove ignored index (special tokens)
@@ -481,13 +481,13 @@ def main():
             for prediction, label in zip(predictions, labels)
         ]
 
-        trainer.log_metrics("test", metrics)
-        trainer.save_metrics("test", metrics)
+        trainer.log_metrics("predict", metrics)
+        trainer.save_metrics("predict", metrics)
 
         # Save predictions
-        output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
+        output_predictions_file = os.path.join(training_args.output_dir, "predictions.txt")
         if trainer.is_world_process_zero():
-            with open(output_test_predictions_file, "w") as writer:
+            with open(output_predictions_file, "w") as writer:
                 for prediction in true_predictions:
                     writer.write(" ".join(prediction) + "\n")
 
diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py
index 24ab358cc3..e4fd946e71 100755
--- a/examples/pytorch/translation/run_translation.py
+++ b/examples/pytorch/translation/run_translation.py
@@ -167,17 +167,17 @@ class DataTrainingArguments:
             "value if set."
         },
     )
-    max_val_samples: Optional[int] = field(
+    max_eval_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
             "value if set."
         },
     )
-    max_test_samples: Optional[int] = field(
+    max_predict_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of test examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
             "value if set."
         },
     )
@@ -432,8 +432,8 @@ def main():
         if "validation" not in datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = datasets["validation"]
-        if data_args.max_val_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
+        if data_args.max_eval_samples is not None:
+            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
         eval_dataset = eval_dataset.map(
             preprocess_function,
             batched=True,
@@ -446,10 +446,10 @@ def main():
         max_target_length = data_args.val_max_target_length
         if "test" not in datasets:
             raise ValueError("--do_predict requires a test dataset")
-        test_dataset = datasets["test"]
-        if data_args.max_test_samples is not None:
-            test_dataset = test_dataset.select(range(data_args.max_test_samples))
-        test_dataset = test_dataset.map(
+        predict_dataset = datasets["test"]
+        if data_args.max_predict_samples is not None:
+            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+        predict_dataset = predict_dataset.map(
             preprocess_function,
             batched=True,
             num_proc=data_args.preprocessing_num_workers,
@@ -539,37 +539,39 @@ def main():
         metrics = trainer.evaluate(
             max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, metric_key_prefix="eval"
         )
-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
+        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
 
         trainer.log_metrics("eval", metrics)
         trainer.save_metrics("eval", metrics)
 
     if training_args.do_predict:
-        logger.info("*** Test ***")
+        logger.info("*** Predict ***")
 
-        test_results = trainer.predict(
-            test_dataset,
-            metric_key_prefix="test",
+        predict_results = trainer.predict(
+            predict_dataset,
+            metric_key_prefix="predict",
             max_length=data_args.val_max_target_length,
             num_beams=data_args.num_beams,
         )
-        metrics = test_results.metrics
-        max_test_samples = data_args.max_test_samples if data_args.max_test_samples is not None else len(test_dataset)
-        metrics["test_samples"] = min(max_test_samples, len(test_dataset))
+        metrics = predict_results.metrics
+        max_predict_samples = (
+            data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
+        )
+        metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))
 
-        trainer.log_metrics("test", metrics)
-        trainer.save_metrics("test", metrics)
+        trainer.log_metrics("predict", metrics)
+        trainer.save_metrics("predict", metrics)
 
         if trainer.is_world_process_zero():
             if training_args.predict_with_generate:
-                test_preds = tokenizer.batch_decode(
-                    test_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
+                predictions = tokenizer.batch_decode(
+                    predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
                 )
-                test_preds = [pred.strip() for pred in test_preds]
-                output_test_preds_file = os.path.join(training_args.output_dir, "test_generations.txt")
-                with open(output_test_preds_file, "w") as writer:
-                    writer.write("\n".join(test_preds))
+                predictions = [pred.strip() for pred in predictions]
+                output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt")
+                with open(output_prediction_file, "w") as writer:
+                    writer.write("\n".join(predictions))
 
     if training_args.push_to_hub:
         trainer.push_to_hub()
diff --git a/examples/tensorflow/text-classification/run_text_classification.py b/examples/tensorflow/text-classification/run_text_classification.py
index 3c9e260097..f725f1c930 100644
--- a/examples/tensorflow/text-classification/run_text_classification.py
+++ b/examples/tensorflow/text-classification/run_text_classification.py
@@ -164,17 +164,17 @@ class DataTrainingArguments:
             "value if set."
         },
     )
-    max_val_samples: Optional[int] = field(
+    max_eval_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
             "value if set."
         },
     )
-    max_test_samples: Optional[int] = field(
+    max_predict_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of test examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of predict examples to this "
             "value if set."
         },
     )
@@ -468,13 +468,13 @@ def main():
 
     if "validation" in datasets:
         eval_dataset = datasets["validation"]
-        if data_args.max_val_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
+        if data_args.max_eval_samples is not None:
+            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
 
     if "test" in datasets:
-        test_dataset = datasets["test"]
-        if data_args.max_test_samples is not None:
-            test_dataset = test_dataset.select(range(data_args.max_test_samples))
+        predict_dataset = datasets["test"]
+        if data_args.max_predict_samples is not None:
+            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
 
     # endregion
 
@@ -513,15 +513,15 @@ def main():
 
     # region Prediction
     if "test" in datasets:
-        logger.info("Doing predictions on test dataset...")
+        logger.info("Doing predictions on Predict dataset...")
 
-        test_dataset = DataSequence(
-            test_dataset, non_label_column_names, batch_size=training_args.per_device_eval_batch_size, labels=False
+        predict_dataset = DataSequence(
+            predict_dataset, non_label_column_names, batch_size=training_args.per_device_eval_batch_size, labels=False
         )
-        predictions = model.predict(test_dataset)["logits"]
+        predictions = model.predict(predict_dataset)["logits"]
         predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
-        output_test_file = os.path.join(training_args.output_dir, "test_results.txt")
-        with open(output_test_file, "w") as writer:
+        output_predict_file = os.path.join(training_args.output_dir, "predict_results.txt")
+        with open(output_predict_file, "w") as writer:
             writer.write("index\tprediction\n")
             for index, item in enumerate(predictions):
                 if is_regression:
@@ -529,7 +529,7 @@ def main():
                 else:
                     item = model.config.id2label[item]
                     writer.write(f"{index}\t{item}\n")
-        logger.info(f"Wrote predictions to {output_test_file}!")
+        logger.info(f"Wrote predictions to {output_predict_file}!")
     # endregion
 
 
diff --git a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py
index d40328b925..48590fe167 100755
--- a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py
+++ b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py
@@ -157,17 +157,17 @@ class DataTrainingArguments:
             "value if set."
         },
     )
-    max_val_samples: Optional[int] = field(
+    max_eval_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
             "value if set."
         },
     )
-    max_test_samples: Optional[int] = field(
+    max_predict_samples: Optional[int] = field(
         default=None,
         metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of test examples to this "
+            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
             "value if set."
         },
     )
@@ -379,8 +379,8 @@ def main():
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = datasets["validation"]
         # Selecting samples from dataset
-        if data_args.max_val_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
+        if data_args.max_eval_samples is not None:
+            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
         # tokenize validation dataset
         eval_dataset = eval_dataset.map(
             tokenize_function,
@@ -393,12 +393,12 @@ def main():
     if training_args.do_predict:
         if "test" not in datasets:
             raise ValueError("--do_predict requires a test dataset")
-        test_dataset = datasets["test"]
+        predict_dataset = datasets["test"]
         # Selecting samples from dataset
-        if data_args.max_test_samples is not None:
-            test_dataset = test_dataset.select(range(data_args.max_test_samples))
-        # tokenize test dataset
-        test_dataset = test_dataset.map(
+        if data_args.max_predict_samples is not None:
+            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+        # tokenize predict dataset
+        predict_dataset = predict_dataset.map(
             tokenize_function,
             batched=True,
             num_proc=data_args.preprocessing_num_workers,
@@ -455,8 +455,8 @@ def main():
 
         metrics = trainer.evaluate()
 
-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
+        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
 
         trainer.log_metrics("eval", metrics)
         trainer.save_metrics("eval", metrics)
@@ -464,13 +464,13 @@ def main():
     # Prediction
     if training_args.do_predict:
         logger.info("*** Predict ***")
-        predictions, labels, metrics = trainer.predict(test_dataset)
+        predictions, labels, metrics = trainer.predict(predict_dataset)
 
-        max_test_samples = data_args.max_test_samples if data_args.max_test_samples is not None else len(test_dataset)
-        metrics["test_samples"] = min(max_test_samples, len(test_dataset))
+        max_predict_samples = data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
+        metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))
 
-        trainer.log_metrics("test", metrics)
-        trainer.save_metrics("test", metrics)
+        trainer.log_metrics("predict", metrics)
+        trainer.save_metrics("predict", metrics)
         
         # write custom code for saving predictions according to task
 
diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py
index 07afadc369..2b00e75652 100644
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -578,7 +578,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
             args.extend(
                 """
             --do_eval
-            --max_val_samples 100
+            --max_eval_samples 100
             --per_device_eval_batch_size 2
             """.split()
             )
@@ -620,7 +620,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
             --do_train
             --do_eval
             --max_train_samples 10
-            --max_val_samples 10
+            --max_eval_samples 10
             --per_device_train_batch_size 5
             --per_device_eval_batch_size 5
             --num_train_epochs 1
diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py
index 563ea8f059..bae3587400 100644
--- a/tests/extended/test_trainer_ext.py
+++ b/tests/extended/test_trainer_ext.py
@@ -191,7 +191,7 @@ class TestTrainerExt(TestCasePlus):
             --output_dir {output_dir}
             --overwrite_output_dir
             --max_train_samples 8
-            --max_val_samples 8
+            --max_eval_samples 8
             --max_source_length {max_len}
             --max_target_length {max_len}
             --val_max_target_length {max_len}