From 87716a6d072b2b66415ce43086c73b04e63fe0fe Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Tue, 30 Jun 2020 11:43:43 -0400
Subject: [PATCH] Documentation for the Trainer API (#5383)

* Documentation for the Trainer API

* Address review comments

* Address comments
---
 docs/source/index.rst                |   1 +
 docs/source/main_classes/trainer.rst |  45 +++++++++++
 src/transformers/__init__.py         |   2 +-
 src/transformers/trainer.py          | 107 +++++++++++++++++++--------
 src/transformers/trainer_tf.py       |  84 ++++++++++++++++++---
 src/transformers/trainer_utils.py    |   7 +-
 src/transformers/training_args.py    |  86 ++++++++++++++++++++-
 src/transformers/training_args_tf.py |  85 +++++++++++++++++++++
 8 files changed, 369 insertions(+), 48 deletions(-)
 create mode 100644 docs/source/main_classes/trainer.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index bbd841fb85..aad68b0dd8 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -173,6 +173,7 @@ conversion utilities for the following models:
     main_classes/pipelines
     main_classes/optimizer_schedules
     main_classes/processors
+    main_classes/trainer
     model_doc/auto
     model_doc/encoderdecoder
     model_doc/bert
diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst
new file mode 100644
index 0000000000..e5687de469
--- /dev/null
+++ b/docs/source/main_classes/trainer.rst
@@ -0,0 +1,45 @@
+Trainer
+----------
+
+The :class:`~transformers.Trainer` and :class:`~transformers.TFTrainer` classes provide an API for feature-complete
+training in most standard use cases. It's used in most of the :doc:`example scripts <../examples>`.
+
+Before instantiating your :class:`~transformers.Trainer`/:class:`~transformers.TFTrainer`, create a 
+:class:`~transformers.TrainingArguments`/:class:`~transformers.TFTrainingArguments` to access all the points of
+customization during training.
+
+The API supports distributed training on multiple GPUs/TPUs, mixed precision through `NVIDIA Apex
+<https://github.com/NVIDIA/apex>`__ for PyTorch and :obj:`tf.keras.mixed_precision` for TensorFlow.
+
+``Trainer`` 
+~~~~~~~~~~~
+
+.. autoclass:: transformers.Trainer
+    :members:
+
+``TFTrainer`` 
+~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFTrainer
+    :members:
+
+``TrainingArguments``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TrainingArguments
+    :members:
+
+``TFTrainingArguments``
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFTrainingArguments
+    :members:
+
+Utilities
+~~~~~~~~~
+
+.. autoclass:: transformers.EvalPrediction
+
+.. autofunction:: transformers.set_seed
+
+.. autofunction:: transformers.torch_distributed_zero_first
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 335ac94175..4728f6ff02 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -397,7 +397,7 @@ if is_torch_available():
     )
 
     # Trainer
-    from .trainer import Trainer, set_seed, torch_distributed_zero_first, EvalPrediction
+    from .trainer import Trainer, set_seed, torch_distributed_zero_first
     from .data.data_collator import default_data_collator, DataCollator, DataCollatorForLanguageModeling
     from .data.datasets import GlueDataset, TextDataset, LineByLineTextDataset, GlueDataTrainingArguments
 
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index c586907bd5..fa40947b1a 100644
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -61,6 +61,12 @@ logger = logging.getLogger(__name__)
 
 
 def set_seed(seed: int):
+    """
+    Helper function for reproducible behavior to set the seed in ``random``, ``numpy`` and ``torch``.
+
+    Args:
+        seed (:obj:`int`): The seed to set.
+    """
     random.seed(seed)
     np.random.seed(seed)
     torch.manual_seed(seed)
@@ -72,6 +78,9 @@ def set_seed(seed: int):
 def torch_distributed_zero_first(local_rank: int):
     """
     Decorator to make all processes in distributed training wait for each local_master to do something.
+
+    Args:
+        local_rank (:obj:`int`): The rank of the local process.
     """
     if local_rank not in [-1, 0]:
         torch.distributed.barrier()
@@ -133,7 +142,31 @@ def get_tpu_sampler(dataset: Dataset):
 class Trainer:
     """
     Trainer is a simple but feature-complete training and eval loop for PyTorch,
-    optimized for Transformers.
+    optimized for 🤗 Transformers.
+
+    Args:
+        model (:class:`~transformers.PreTrainedModel`):
+            The model to train, evaluate or use for predictions.
+        args (:class:`~transformers.TrainingArguments`):
+            The arguments to tweak training.
+        data_collator (:obj:`DataCollator`, `optional`, defaults to :func:`~transformers.default_data_collator`):
+            The function to use to form a batch from a list of elements of :obj:`train_dataset` or
+            :obj:`eval_dataset`.
+        train_dataset (:obj:`Dataset`, `optional`):
+            The dataset to use for training.
+        eval_dataset (:obj:`Dataset`, `optional`):
+            The dataset to use for evaluation.
+        compute_metrics (:obj:`Callable[[EvalPrediction], Dict]`, `optional`):
+            The function that will be used to compute metrics at evaluation. Must take a
+            :class:`~transformers.EvalPrediction` and return a dictionary mapping strings to metric values.
+        prediction_loss_only (:obj:`bool`, `optional`, defaults to `False`):
+            When performing evaluation and predictions, only returns the loss.
+        tb_writer (:obj:`SummaryWriter`, `optional`):
+            Object to write to TensorBoard.
+        optimizers (:obj:`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, `optional`):
+            A tuple containing the optimizer and the scheduler to use. Will default to an instance of
+            :class:`~transformers.AdamW` on your model and a scheduler given by
+            :func:`~transformers.get_linear_schedule_with_warmup` controlled by :obj:`args`.
     """
 
     model: PreTrainedModel
@@ -160,14 +193,6 @@ class Trainer:
         tb_writer: Optional["SummaryWriter"] = None,
         optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None,
     ):
-        """
-        Trainer is a simple but feature-complete training and eval loop for PyTorch,
-        optimized for Transformers.
-
-        Args:
-            prediction_loss_only:
-                (Optional) in evaluation and prediction, only return the loss
-        """
         self.model = model.to(args.device)
         self.args = args
         self.data_collator = data_collator if data_collator is not None else default_data_collator
@@ -210,6 +235,9 @@ class Trainer:
             )
 
     def get_train_dataloader(self) -> DataLoader:
+        """
+        Returns the training :class:`~torch.utils.data.DataLoader`.
+        """
         if self.train_dataset is None:
             raise ValueError("Trainer: training requires a train_dataset.")
         if is_torch_tpu_available():
@@ -232,6 +260,13 @@ class Trainer:
         return data_loader
 
     def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader:
+        """
+        Returns the evaluation :class:`~torch.utils.data.DataLoader`.
+
+        Args:
+            eval_dataset (:obj:`Dataset`, `optional`):
+                If provided, will override `self.eval_dataset`.
+        """
         if eval_dataset is None and self.eval_dataset is None:
             raise ValueError("Trainer: evaluation requires an eval_dataset.")
 
@@ -257,6 +292,12 @@ class Trainer:
         return data_loader
 
     def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
+        """
+        Returns the test :class:`~torch.utils.data.DataLoader`.
+
+        Args:
+            test_dataset (:obj:`Dataset`): The test dataset to use.
+        """
         # We use the same batch_size as for eval.
         if is_torch_tpu_available():
             sampler = SequentialDistributedSampler(
@@ -283,9 +324,8 @@ class Trainer:
         """
         Setup the optimizer and the learning rate scheduler.
 
-        We provide a reasonable default that works well.
-        If you want to use something else, you can pass a tuple in the Trainer's init,
-        or override this method in a subclass.
+        We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
+        Trainer's init through :obj:`optimizers`, or override this method in a subclass.
         """
         if self.optimizers is not None:
             return self.optimizers
@@ -336,7 +376,7 @@ class Trainer:
 
     def num_examples(self, dataloader: DataLoader) -> int:
         """
-        Helper to get num of examples from a DataLoader, by accessing its Dataset.
+        Helper to get number of samples in a :class:`~torch.utils.data.DataLoader` by accessing its Dataset.
         """
         return len(dataloader.dataset)
 
@@ -345,9 +385,9 @@ class Trainer:
         Main training entry point.
 
         Args:
-            model_path:
-                (Optional) Local path to model if model to train has been instantiated from a local path
-                If present, we will try reloading the optimizer/scheduler states from there.
+            model_path (:obj:`str`, `optional`):
+                Local path to the model if the model to train has been instantiated from a local path. If present,
+                training will resume from the optimizer/scheduler states loaded here.
         """
         train_dataloader = self.get_train_dataloader()
         if self.args.max_steps > 0:
@@ -611,8 +651,7 @@ class Trainer:
 
     def save_model(self, output_dir: Optional[str] = None):
         """
-        Saving best-practices: if you use default names for the model,
-        you can reload it using from_pretrained().
+        Will save the model, so you can reload it using :obj:`from_pretrained()`.
 
         Will only save from the world_master process (unless in TPUs).
         """
@@ -683,22 +722,18 @@ class Trainer:
             logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
             shutil.rmtree(checkpoint)
 
-    def evaluate(
-        self, eval_dataset: Optional[Dataset] = None, prediction_loss_only: Optional[bool] = None,
-    ) -> Dict[str, float]:
+    def evaluate(self, eval_dataset: Optional[Dataset] = None) -> Dict[str, float]:
         """
-        Run evaluation and return metrics.
+        Runs evaluation and returns metrics.
 
         The calling script will be responsible for providing a method to compute metrics, as they are
-        task-dependent.
+        task-dependent (pass it to the init :obj:`compute_metrics` argument).
 
         Args:
-            eval_dataset: (Optional) Pass a dataset if you wish to override
-            the one on the instance.
+            eval_dataset (:obj:`Dataset`, `optional`):
+                Pass a dataset if you wish to override :obj:`self.eval_dataset`.
         Returns:
-            A dict containing:
-                - the eval loss
-                - the potential metrics computed from the predictions
+            A dictionary containing the evaluation loss and the potential metrics computed from the predictions.
         """
         eval_dataloader = self.get_eval_dataloader(eval_dataset)
 
@@ -714,10 +749,22 @@ class Trainer:
 
     def predict(self, test_dataset: Dataset) -> PredictionOutput:
         """
-        Run prediction and return predictions and potential metrics.
+        Runs prediction and returns predictions and potential metrics.
 
         Depending on the dataset and your use case, your test dataset may contain labels.
-        In that case, this method will also return metrics, like in evaluate().
+        In that case, this method will also return metrics, like in :obj:`evaluate()`.
+
+        Args:
+            test_dataset (:obj:`Dataset`):
+                Dataset to run the predictions on.
+        Returns:
+            `NamedTuple`:
+            predictions (:obj:`np.ndarray`):
+                The predictions on :obj:`test_dataset`.
+            label_ids (:obj:`np.ndarray`, `optional`):
+                The labels (if the dataset contained some).
+            metrics (:obj:`Dict[str, float]`, `optional`):
+                The potential dictionary of metrics (if the dataset contained labels).
         """
         test_dataloader = self.get_test_dataloader(test_dataset)
 
diff --git a/src/transformers/trainer_tf.py b/src/transformers/trainer_tf.py
index b871ef1e57..e1afbc1743 100644
--- a/src/transformers/trainer_tf.py
+++ b/src/transformers/trainer_tf.py
@@ -29,6 +29,34 @@ def set_seed(seed: int):
 
 
 class TFTrainer:
+    """
+    TFTrainer is a simple but feature-complete training and eval loop for TensorFlow,
+    optimized for 🤗 Transformers.
+
+    Args:
+        model (:class:`~transformers.TFPreTrainedModel`):
+            The model to train, evaluate or use for predictions.
+        args (:class:`~transformers.TFTrainingArguments`):
+            The arguments to tweak training.
+        train_dataset (:class:`~tf.data.Dataset`, `optional`):
+            The dataset to use for training.
+        eval_dataset (:class:`~tf.data.Dataset`, `optional`):
+            The dataset to use for evaluation.
+        compute_metrics (:obj:`Callable[[EvalPrediction], Dict]`, `optional`):
+            The function that will be used to compute metrics at evaluation. Must take a
+            :class:`~transformers.EvalPrediction` and return a dictionary mapping strings to metric values.
+        prediction_loss_only (:obj:`bool`, `optional`, defaults to `False`):
+            When performing evaluation and predictions, only returns the loss.
+        tb_writer (:obj:`tf.summary.SummaryWriter`, `optional`):
+            Object to write to TensorBoard.
+        optimizers (:obj:`Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule]`, `optional`):
+            A tuple containing the optimizer and the scheduler to use. The optimizer defaults to an instance of
+            :class:`tf.keras.optimizers.Adam` if :obj:`args.weight_decay` is 0 else an instance of
+            :class:`~transformers.AdamWeightDecay`. The scheduler will default to an instance of
+            :class:`tf.keras.optimizers.schedules.PolynomialDecay` if :obj:`args.warmup_steps` is 0 else
+            an instance of :class:`~transformers.WarmUp`.
+    """
+
     model: TFPreTrainedModel
     args: TFTrainingArguments
     train_dataset: Optional[tf.data.Dataset]
@@ -78,6 +106,9 @@ class TFTrainer:
         set_seed(self.args.seed)
 
     def get_train_tfdataset(self) -> tf.data.Dataset:
+        """
+        Returns the training :class:`~tf.data.Dataset`.
+        """
         if self.train_dataset is None:
             raise ValueError("Trainer: training requires a train_dataset.")
 
@@ -101,6 +132,13 @@ class TFTrainer:
         return self.args.strategy.experimental_distribute_dataset(ds)
 
     def get_eval_tfdataset(self, eval_dataset: Optional[tf.data.Dataset] = None) -> tf.data.Dataset:
+        """
+        Returns the evaluation :class:`~tf.data.Dataset`.
+
+        Args:
+            eval_dataset (:class:`~tf.data.Dataset`, `optional`):
+                If provided, will override `self.eval_dataset`.
+        """
         if eval_dataset is None and self.eval_dataset is None:
             raise ValueError("Trainer: evaluation requires an eval_dataset.")
 
@@ -114,6 +152,12 @@ class TFTrainer:
         return self.args.strategy.experimental_distribute_dataset(ds)
 
     def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset:
+        """
+        Returns a test :class:`~tf.data.Dataset`.
+
+        Args:
+            test_dataset (:class:`~tf.data.Dataset`): The dataset to use.
+        """
         ds = test_dataset.batch(self.args.eval_batch_size, drop_remainder=self.args.dataloader_drop_last)
 
         return self.args.strategy.experimental_distribute_dataset(ds)
@@ -124,9 +168,8 @@ class TFTrainer:
         """
         Setup the optimizer and the learning rate scheduler.
 
-        We provide a reasonable default that works well.
-        If you want to use something else, you can pass a tuple in the Trainer's init,
-        or override this method in a subclass.
+        We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
+        TFTrainer's init through :obj:`optimizers`, or override this method in a subclass.
         """
         if self.optimizers is not None:
             return self.optimizers
@@ -263,11 +306,18 @@ class TFTrainer:
 
         logger.info(output)
 
-    def evaluate(
-        self, eval_dataset: Optional[tf.data.Dataset] = None, prediction_loss_only: Optional[bool] = None
-    ) -> Dict[str, float]:
+    def evaluate(self, eval_dataset: Optional[tf.data.Dataset] = None) -> Dict[str, float]:
         """
-        Prediction/evaluation loop, shared by `evaluate()` and `predict()`.
+        Runs evaluation and returns metrics.
+
+        The calling script will be responsible for providing a method to compute metrics, as they are
+        task-dependent (pass it to the init :obj:`compute_metrics` argument).
+
+        Args:
+            eval_dataset (:class:`~tf.data.Dataset`, `optional`):
+                Pass a dataset if you wish to override :obj:`self.eval_dataset`.
+        Returns:
+            A dictionary containing the evaluation loss and the potential metrics computed from the predictions.
         """
         eval_ds = self.get_eval_tfdataset(eval_dataset)
 
@@ -478,12 +528,22 @@ class TFTrainer:
 
     def predict(self, test_dataset: tf.data.Dataset) -> PredictionOutput:
         """
-        Run prediction and return predictions and potential metrics.
+        Runs prediction and returns predictions and potential metrics.
+
         Depending on the dataset and your use case, your test dataset may contain labels.
-        In that case, this method will also return metrics, like in evaluate().
+        In that case, this method will also return metrics, like in :obj:`evaluate()`.
+
         Args:
-          test_dataset: something similar to a PT Dataset. This is just
-            temporary before to have a framework-agnostic approach for datasets.
+            test_dataset (:class:`~tf.data.Dataset`):
+                Dataset to run the predictions on.
+        Returns:
+            `NamedTuple`:
+            predictions (:obj:`np.ndarray`):
+                The predictions on :obj:`test_dataset`.
+            label_ids (:obj:`np.ndarray`, `optional`):
+                The labels (if the dataset contained some).
+            metrics (:obj:`Dict[str, float]`, `optional`):
+                The potential dictionary of metrics (if the dataset contained labels).
         """
         test_ds = self.get_test_tfdataset(test_dataset)
 
@@ -491,7 +551,7 @@ class TFTrainer:
 
     def save_model(self, output_dir: Optional[str] = None):
         """
-        Save the pretrained model.
+        Will save the model, so you can reload it using :obj:`from_pretrained()`.
         """
         output_dir = output_dir if output_dir is not None else self.args.output_dir
 
diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py
index 35458a5ff7..619adb1a1f 100644
--- a/src/transformers/trainer_utils.py
+++ b/src/transformers/trainer_utils.py
@@ -23,8 +23,11 @@ def is_wandb_available():
 
 class EvalPrediction(NamedTuple):
     """
-    Evaluation output (always contains labels), to be used
-    to compute metrics.
+    Evaluation output (always contains labels), to be used to compute metrics.
+
+    Parameters:
+        predictions (:obj:`np.ndarray`): Predictions of the model.
+        label_ids (:obj:`np.ndarray`): Targets to be matched.
     """
 
     predictions: np.ndarray
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index fc97e63d4a..c0d7a4913f 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -35,9 +35,73 @@ class TrainingArguments:
     TrainingArguments is the subset of the arguments we use in our example scripts
     **which relate to the training loop itself**.
 
-    Using `HfArgumentParser` we can turn this class
-    into argparse arguments to be able to specify them on
-    the command line.
+    Using :class:`~transformers.HfArgumentParser` we can turn this class
+    into argparse arguments to be able to specify them on the command line.
+
+    Parameters:
+        output_dir (:obj:`str`):
+            The output directory where the model predictions and checkpoints will be written.
+        overwrite_output_dir (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If :obj:`True`, overwrite the content of the output directory. Use this to continue training if
+            :obj:`output_dir` points to a checkpoint directory.
+        do_train (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to run training or not.
+        do_eval (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to run evaluation on the dev set or not.
+        do_predict (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to run predictions on the test set or not.
+        evaluate_during_training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to run evaluation during training at each logging step or not.
+        per_device_train_batch_size (:obj:`int`, `optional`, defaults to 8):
+            The batch size per GPU/TPU core/CPU for training.
+        per_device_eval_batch_size (:obj:`int`, `optional`, defaults to 8):
+            The batch size per GPU/TPU core/CPU for evaluation.
+        gradient_accumulation_steps (:obj:`int`, `optional`, defaults to 1):
+            Number of update steps to accumulate the gradients for, before performing a backward/update pass.
+        learning_rate (:obj:`float`, `optional`, defaults to 5e-5):
+            The initial learning rate for Adam.
+        weight_decay (:obj:`float`, `optional`, defaults to 0):
+            The weight decay to apply (if not zero).
+        adam_epsilon (:obj:`float`, `optional`, defaults to 1e-8):
+            Epsilon for the Adam optimizer.
+        max_grad_norm (:obj:`float`, `optional`, defaults to 1.0):
+            Maximum gradient norm (for gradient clipping).
+        num_train_epochs (:obj:`float`, `optional`, defaults to 3.0):
+            Total number of training epochs to perform.
+        max_steps (:obj:`int`, `optional`, defaults to -1):
+            If set to a positive number, the total number of training steps to perform. Overrides
+            :obj:`num_train_epochs`.
+        warmup_steps (:obj:`int`, `optional`, defaults to 0):
+            Number of steps used for a linear warmup from 0 to :obj:`learning_rate`.
+        logging_dir (:obj:`str`, `optional`):
+            Tensorboard log directory. Will default to `runs/**CURRENT_DATETIME_HOSTNAME**`.
+        logging_first_step (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to log and evaluate the first :obj:`global_step` or not.
+        logging_steps (:obj:`int`, `optional`, defaults to 500):
+            Number of update steps between two logs.
+        save_steps (:obj:`int`, `optional`, defaults to 500):
+            Number of update steps between two checkpoint saves.
+        save_total_limit (:obj:`int`, `optional`):
+            If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in
+            :obj:`output_dir`.
+        no_cuda (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to not use CUDA even when it is available or not.
+        seed (:obj:`int`, `optional`, defaults to 42):
+            Random seed for initialization.
+        fp16 (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to use 16-bit (mixed) precision training (through NVIDIA apex) instead of 32-bit training.
+        fp16_opt_level (:obj:`str`, `optional`, defaults to 'O1'):
+            For :obj:`fp16` training, apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details
+            on the `apex documentation <https://nvidia.github.io/apex/amp.html>`__.
+        local_rank (:obj:`int`, `optional`, defaults to -1):
+            During distributed training, the rank of the process.
+        tpu_num_cores (:obj:`int`, `optional`):
+            When training on TPU, the number of TPU cores (automatically passed by launcher script).
+        tpu_metrics_debug (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            When training on TPU, whether to print debug metrics or not.
+        dataloader_drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size)
+            or not.
     """
 
     output_dir: str = field(
@@ -141,6 +205,9 @@ class TrainingArguments:
 
     @property
     def train_batch_size(self) -> int:
+        """
+        The actual batch size for training (may differ from :obj:`per_gpu_train_batch_size` in distributed training).
+        """
         if self.per_gpu_train_batch_size:
             logger.warning(
                 "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future "
@@ -151,6 +218,9 @@ class TrainingArguments:
 
     @property
     def eval_batch_size(self) -> int:
+        """
+        The actual batch size for evaluation (may differ from :obj:`per_gpu_eval_batch_size` in distributed training).
+        """
         if self.per_gpu_eval_batch_size:
             logger.warning(
                 "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future "
@@ -193,11 +263,21 @@ class TrainingArguments:
     @property
     @torch_required
     def device(self) -> "torch.device":
+        """
+        The device used by this process.
+        """
         return self._setup_devices[0]
 
     @property
     @torch_required
     def n_gpu(self):
+        """
+        The number of GPUs used by this process.
+
+        Note:
+            This will only be greater than one when you have multiple GPUs available but are not using distributed
+            training. For distributed training, it will always be 1.
+        """
         return self._setup_devices[1]
 
     def to_json_string(self):
diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py
index c05acafdc7..f87c7bc994 100644
--- a/src/transformers/training_args_tf.py
+++ b/src/transformers/training_args_tf.py
@@ -14,6 +14,85 @@ if is_tf_available():
 
 @dataclass
 class TFTrainingArguments(TrainingArguments):
+    """
+    TFTrainingArguments is the subset of the arguments we use in our example scripts
+    **which relate to the training loop itself**.
+
+    Using :class:`~transformers.HfArgumentParser` we can turn this class
+    into argparse arguments to be able to specify them on the command line.
+
+    Parameters:
+        output_dir (:obj:`str`):
+            The output directory where the model predictions and checkpoints will be written.
+        overwrite_output_dir (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If :obj:`True`, overwrite the content of the output directory. Use this to continue training if
+            :obj:`output_dir` points to a checkpoint directory.
+        do_train (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to run training or not.
+        do_eval (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to run evaluation on the dev set or not.
+        do_predict (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to run predictions on the test set or not.
+        evaluate_during_training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to run evaluation during training at each logging step or not.
+        per_device_train_batch_size (:obj:`int`, `optional`, defaults to 8):
+            The batch size per GPU/TPU core/CPU for training.
+        per_device_eval_batch_size (:obj:`int`, `optional`, defaults to 8):
+            The batch size per GPU/TPU core/CPU for evaluation.
+        gradient_accumulation_steps (:obj:`int`, `optional`, defaults to 1):
+            Number of update steps to accumulate the gradients for, before performing a backward/update pass.
+        learning_rate (:obj:`float`, `optional`, defaults to 5e-5):
+            The initial learning rate for Adam.
+        weight_decay (:obj:`float`, `optional`, defaults to 0):
+            The weight decay to apply (if not zero).
+        adam_epsilon (:obj:`float`, `optional`, defaults to 1e-8):
+            Epsilon for the Adam optimizer.
+        max_grad_norm (:obj:`float`, `optional`, defaults to 1.0):
+            Maximum gradient norm (for gradient clipping).
+        num_train_epochs (:obj:`float`, `optional`, defaults to 3.0):
+            Total number of training epochs to perform.
+        max_steps (:obj:`int`, `optional`, defaults to -1):
+            If set to a positive number, the total number of training steps to perform. Overrides
+            :obj:`num_train_epochs`.
+        warmup_steps (:obj:`int`, `optional`, defaults to 0):
+            Number of steps used for a linear warmup from 0 to :obj:`learning_rate`.
+        logging_dir (:obj:`str`, `optional`):
+            Tensorboard log directory. Will default to `runs/**CURRENT_DATETIME_HOSTNAME**`.
+        logging_first_step (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to log and evaluate the first :obj:`global_step` or not.
+        logging_steps (:obj:`int`, `optional`, defaults to 500):
+            Number of update steps between two logs.
+        save_steps (:obj:`int`, `optional`, defaults to 500):
+            Number of update steps between two checkpoint saves.
+        save_total_limit (:obj:`int`, `optional`):
+            If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in
+            :obj:`output_dir`.
+        no_cuda (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to not use CUDA even when it is available or not.
+        seed (:obj:`int`, `optional`, defaults to 42):
+            Random seed for initialization.
+        fp16 (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to use 16-bit (mixed) precision training (through NVIDIA apex) instead of 32-bit training.
+        fp16_opt_level (:obj:`str`, `optional`, defaults to 'O1'):
+            For :obj:`fp16` training, apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details
+            on the `apex documentation <https://nvidia.github.io/apex/amp.html>`__.
+        local_rank (:obj:`int`, `optional`, defaults to -1):
+            During distributed training, the rank of the process.
+        tpu_num_cores (:obj:`int`, `optional`):
+            When training on TPU, the number of TPU cores (automatically passed by launcher script).
+        tpu_metrics_debug (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            When training on TPU, whether to print debug metrics or not.
+        dataloader_drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size)
+            or not.
+        tpu_name (:obj:`str`, `optional`):
+            The name of the TPU the process is running on.
+        eval_steps (:obj:`int`, `optional`, defaults to 1000):
+            Number of update steps between two evaluations.
+        debug (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to activate the trace to record computation graphs and profiling information or not.
+    """
+
     tpu_name: str = field(
         default=None, metadata={"help": "Name of TPU"},
     )
@@ -59,9 +138,15 @@ class TFTrainingArguments(TrainingArguments):
     @property
     @tf_required
     def strategy(self) -> "tf.distribute.Strategy":
+        """
+        The strategy used for distributed training.
+        """
         return self._setup_strategy
 
     @property
     @tf_required
     def n_gpu(self) -> int:
+        """
+        The number of replicas (GPUs or TPU cores) used in this training.
+        """
         return self._setup_strategy.num_replicas_in_sync