From 87716a6d072b2b66415ce43086c73b04e63fe0fe Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Tue, 30 Jun 2020 11:43:43 -0400 Subject: [PATCH] Documentation for the Trainer API (#5383) * Documentation for the Trainer API * Address review comments * Address comments --- docs/source/index.rst | 1 + docs/source/main_classes/trainer.rst | 45 +++++++++++ src/transformers/__init__.py | 2 +- src/transformers/trainer.py | 107 +++++++++++++++++++-------- src/transformers/trainer_tf.py | 84 ++++++++++++++++++--- src/transformers/trainer_utils.py | 7 +- src/transformers/training_args.py | 86 ++++++++++++++++++++- src/transformers/training_args_tf.py | 85 +++++++++++++++++++++ 8 files changed, 369 insertions(+), 48 deletions(-) create mode 100644 docs/source/main_classes/trainer.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index bbd841fb85..aad68b0dd8 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -173,6 +173,7 @@ conversion utilities for the following models: main_classes/pipelines main_classes/optimizer_schedules main_classes/processors + main_classes/trainer model_doc/auto model_doc/encoderdecoder model_doc/bert diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst new file mode 100644 index 0000000000..e5687de469 --- /dev/null +++ b/docs/source/main_classes/trainer.rst @@ -0,0 +1,45 @@ +Trainer +---------- + +The :class:`~transformers.Trainer` and :class:`~transformers.TFTrainer` classes provide an API for feature-complete +training in most standard use cases. It's used in most of the :doc:`example scripts <../examples>`. + +Before instantiating your :class:`~transformers.Trainer`/:class:`~transformers.TFTrainer`, create a +:class:`~transformers.TrainingArguments`/:class:`~transformers.TFTrainingArguments` to access all the points of +customization during training. 
+ +The API supports distributed training on multiple GPUs/TPUs, mixed precision through `NVIDIA Apex +<https://github.com/NVIDIA/apex>`__ for PyTorch and :obj:`tf.keras.mixed_precision` for TensorFlow. + +``Trainer`` +~~~~~~~~~~~ + +.. autoclass:: transformers.Trainer + :members: + +``TFTrainer`` +~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFTrainer + :members: + +``TrainingArguments`` +~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TrainingArguments + :members: + +``TFTrainingArguments`` +~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFTrainingArguments + :members: + +Utilities +~~~~~~~~~ + +.. autoclass:: transformers.EvalPrediction + +.. autofunction:: transformers.set_seed + +.. autofunction:: transformers.torch_distributed_zero_first diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 335ac94175..4728f6ff02 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -397,7 +397,7 @@ if is_torch_available(): ) # Trainer - from .trainer import Trainer, set_seed, torch_distributed_zero_first, EvalPrediction + from .trainer import Trainer, set_seed, torch_distributed_zero_first from .data.data_collator import default_data_collator, DataCollator, DataCollatorForLanguageModeling from .data.datasets import GlueDataset, TextDataset, LineByLineTextDataset, GlueDataTrainingArguments diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index c586907bd5..fa40947b1a 100644 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -61,6 +61,12 @@ logger = logging.getLogger(__name__) def set_seed(seed: int): + """ + Helper function for reproducible behavior to set the seed in ``random``, ``numpy`` and ``torch``. + + Args: + seed (:obj:`int`): The seed to set.
+ """ random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) @@ -72,6 +78,9 @@ def set_seed(seed: int): def torch_distributed_zero_first(local_rank: int): """ Decorator to make all processes in distributed training wait for each local_master to do something. + + Args: + local_rank (:obj:`int`): The rank of the local process. """ if local_rank not in [-1, 0]: torch.distributed.barrier() @@ -133,7 +142,31 @@ def get_tpu_sampler(dataset: Dataset): class Trainer: """ Trainer is a simple but feature-complete training and eval loop for PyTorch, - optimized for Transformers. + optimized for 🤗 Transformers. + + Args: + model (:class:`~transformers.PreTrainedModel`): + The model to train, evaluate or use for predictions. + args (:class:`~transformers.TrainingArguments`): + The arguments to tweak training. + data_collator (:obj:`DataCollator`, `optional`, defaults to :func:`~transformers.default_data_collator`): + The function to use to form a batch from a list of elements of :obj:`train_dataset` or + :obj:`eval_dataset`. + train_dataset (:obj:`Dataset`, `optional`): + The dataset to use for training. + eval_dataset (:obj:`Dataset`, `optional`): + The dataset to use for evaluation. + compute_metrics (:obj:`Callable[[EvalPrediction], Dict]`, `optional`): + The function that will be used to compute metrics at evaluation. Must take a + :class:`~transformers.EvalPrediction` and return a dictionary string to metric values. + prediction_loss_only (:obj:`bool`, `optional`, defaults to `False`): + When performing evaluation and predictions, only returns the loss. + tb_writer (:obj:`SummaryWriter`, `optional`): + Object to write to TensorBoard. + optimizers (:obj:`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, `optional`): + A tuple containing the optimizer and the scheduler to use.
Will default to an instance of + :class:`~transformers.AdamW` on your model and a scheduler given by + :func:`~transformers.get_linear_schedule_with_warmup` controlled by :obj:`args`. """ model: PreTrainedModel @@ -160,14 +193,6 @@ class Trainer: tb_writer: Optional["SummaryWriter"] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None, ): - """ - Trainer is a simple but feature-complete training and eval loop for PyTorch, - optimized for Transformers. - - Args: - prediction_loss_only: - (Optional) in evaluation and prediction, only return the loss - """ self.model = model.to(args.device) self.args = args self.data_collator = data_collator if data_collator is not None else default_data_collator @@ -210,6 +235,9 @@ class Trainer: ) def get_train_dataloader(self) -> DataLoader: + """ + Returns the training :class:`~torch.utils.data.DataLoader`. + """ if self.train_dataset is None: raise ValueError("Trainer: training requires a train_dataset.") if is_torch_tpu_available(): @@ -232,6 +260,13 @@ class Trainer: return data_loader def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader: + """ + Returns the evaluation :class:`~torch.utils.data.DataLoader`. + + Args: + eval_dataset (:obj:`Dataset`, `optional`): + If provided, will override `self.eval_dataset`. + """ if eval_dataset is None and self.eval_dataset is None: raise ValueError("Trainer: evaluation requires an eval_dataset.") @@ -257,6 +292,12 @@ class Trainer: return data_loader def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: + """ + Returns the test :class:`~torch.utils.data.DataLoader`. + + Args: + test_dataset (:obj:`Dataset`): The test dataset to use. + """ # We use the same batch_size as for eval. if is_torch_tpu_available(): sampler = SequentialDistributedSampler( @@ -283,9 +324,8 @@ class Trainer: """ Setup the optimizer and the learning rate scheduler. - We provide a reasonable default that works well.
- If you want to use something else, you can pass a tuple in the Trainer's init, - or override this method in a subclass. + We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the + Trainer's init through :obj:`optimizers`, or override this method in a subclass. """ if self.optimizers is not None: return self.optimizers @@ -336,7 +376,7 @@ class Trainer: def num_examples(self, dataloader: DataLoader) -> int: """ - Helper to get num of examples from a DataLoader, by accessing its Dataset. + Helper to get number of samples in a :class:`~torch.utils.data.DataLoader` by accessing its Dataset. """ return len(dataloader.dataset) @@ -345,9 +385,9 @@ class Trainer: Main training entry point. Args: - model_path: - (Optional) Local path to model if model to train has been instantiated from a local path - If present, we will try reloading the optimizer/scheduler states from there. + model_path (:obj:`str`, `optional`): + Local path to the model if the model to train has been instantiated from a local path. If present, + training will resume from the optimizer/scheduler states loaded here. """ train_dataloader = self.get_train_dataloader() if self.args.max_steps > 0: @@ -611,8 +651,7 @@ class Trainer: def save_model(self, output_dir: Optional[str] = None): """ - Saving best-practices: if you use default names for the model, - you can reload it using from_pretrained(). + Will save the model, so you can reload it using :obj:`from_pretrained()`. Will only save from the world_master process (unless in TPUs). """ @@ -683,22 +722,18 @@ class Trainer: logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint)) shutil.rmtree(checkpoint) - def evaluate( - self, eval_dataset: Optional[Dataset] = None, prediction_loss_only: Optional[bool] = None, - ) -> Dict[str, float]: + def evaluate(self, eval_dataset: Optional[Dataset] = None) -> Dict[str, float]: """ - Run evaluation and return metrics. 
+ Run evaluation and returns metrics. The calling script will be responsible for providing a method to compute metrics, as they are - task-dependent. + task-dependent (pass it to the init :obj:`compute_metrics` argument). Args: - eval_dataset: (Optional) Pass a dataset if you wish to override - the one on the instance. + eval_dataset (:obj:`Dataset`, `optional`): + Pass a dataset if you wish to override :obj:`self.eval_dataset`. Returns: - A dict containing: - - the eval loss - - the potential metrics computed from the predictions + A dictionary containing the evaluation loss and the potential metrics computed from the predictions. """ eval_dataloader = self.get_eval_dataloader(eval_dataset) @@ -714,10 +749,22 @@ class Trainer: def predict(self, test_dataset: Dataset) -> PredictionOutput: """ - Run prediction and return predictions and potential metrics. + Run prediction and returns predictions and potential metrics. Depending on the dataset and your use case, your test dataset may contain labels. - In that case, this method will also return metrics, like in evaluate(). + In that case, this method will also return metrics, like in :obj:`evaluate()`. + + Args: + test_dataset (:obj:`Dataset`): + Dataset to run the predictions on. + Returns: + `NamedTuple`: + predictions (:obj:`np.ndarray`): + The predictions on :obj:`test_dataset`. + label_ids (:obj:`np.ndarray`, `optional`): + The labels (if the dataset contained some). + metrics (:obj:`Dict[str, float]`, `optional`): + The potential dictionary of metrics (if the dataset contained labels). """ test_dataloader = self.get_test_dataloader(test_dataset) diff --git a/src/transformers/trainer_tf.py b/src/transformers/trainer_tf.py index b871ef1e57..e1afbc1743 100644 --- a/src/transformers/trainer_tf.py +++ b/src/transformers/trainer_tf.py @@ -29,6 +29,34 @@ def set_seed(seed: int): class TFTrainer: + """ + TFTrainer is a simple but feature-complete training and eval loop for TensorFlow, + optimized for 🤗 Transformers. 
+ + Args: + model (:class:`~transformers.TFPreTrainedModel`): + The model to train, evaluate or use for predictions. + args (:class:`~transformers.TFTrainingArguments`): + The arguments to tweak training. + train_dataset (:class:`~tf.data.Dataset`, `optional`): + The dataset to use for training. + eval_dataset (:class:`~tf.data.Dataset`, `optional`): + The dataset to use for evaluation. + compute_metrics (:obj:`Callable[[EvalPrediction], Dict]`, `optional`): + The function that will be used to compute metrics at evaluation. Must take a + :class:`~transformers.EvalPrediction` and return a dictionary string to metric values. + prediction_loss_only (:obj:`bool`, `optional`, defaults to `False`): + When performing evaluation and predictions, only returns the loss. + tb_writer (:obj:`tf.summary.SummaryWriter`, `optional`): + Object to write to TensorBoard. + optimizers (:obj:`Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule]`, `optional`): + A tuple containing the optimizer and the scheduler to use. The optimizer default to an instance of + :class:`tf.keras.optimizers.Adam` if :obj:`args.weight_decay_rate` is 0 else an instance of + :class:`~transformers.AdamWeightDecay`. The scheduler will default to an instance of + :class:`tf.keras.optimizers.schedules.PolynomialDecay` if :obj:`args.num_warmup_steps` is 0 else + an instance of :class:`~transformers.WarmUp`. + """ + model: TFPreTrainedModel args: TFTrainingArguments train_dataset: Optional[tf.data.Dataset] @@ -78,6 +106,9 @@ class TFTrainer: set_seed(self.args.seed) def get_train_tfdataset(self) -> tf.data.Dataset: + """ + Returns the training :class:`~tf.data.Dataset`. 
+ """ if self.train_dataset is None: raise ValueError("Trainer: training requires a train_dataset.") @@ -101,6 +132,13 @@ class TFTrainer: return self.args.strategy.experimental_distribute_dataset(ds) def get_eval_tfdataset(self, eval_dataset: Optional[tf.data.Dataset] = None) -> tf.data.Dataset: + """ + Returns the evaluation :class:`~tf.data.Dataset`. + + Args: + eval_dataset (:class:`~tf.data.Dataset`, `optional`): + If provided, will override `self.eval_dataset`. + """ if eval_dataset is None and self.eval_dataset is None: raise ValueError("Trainer: evaluation requires an eval_dataset.") @@ -114,6 +152,12 @@ class TFTrainer: return self.args.strategy.experimental_distribute_dataset(ds) def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset: + """ + Returns a test :class:`~tf.data.Dataset`. + + Args: + test_dataset (:class:`~tf.data.Dataset`): The dataset to use. + """ ds = test_dataset.batch(self.args.eval_batch_size, drop_remainder=self.args.dataloader_drop_last) return self.args.strategy.experimental_distribute_dataset(ds) @@ -124,9 +168,8 @@ class TFTrainer: """ Setup the optimizer and the learning rate scheduler. - We provide a reasonable default that works well. - If you want to use something else, you can pass a tuple in the Trainer's init, - or override this method in a subclass. + We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the + TFTrainer's init through :obj:`optimizers`, or override this method in a subclass. """ if self.optimizers is not None: return self.optimizers @@ -263,11 +306,18 @@ class TFTrainer: logger.info(output) - def evaluate( - self, eval_dataset: Optional[tf.data.Dataset] = None, prediction_loss_only: Optional[bool] = None - ) -> Dict[str, float]: + def evaluate(self, eval_dataset: Optional[tf.data.Dataset] = None) -> Dict[str, float]: """ - Prediction/evaluation loop, shared by `evaluate()` and `predict()`. + Run evaluation and returns metrics. 
+ + The calling script will be responsible for providing a method to compute metrics, as they are + task-dependent (pass it to the init :obj:`compute_metrics` argument). + + Args: + eval_dataset (:class:`~tf.data.Dataset`, `optional`): + Pass a dataset if you wish to override :obj:`self.eval_dataset`. + Returns: + A dictionary containing the evaluation loss and the potential metrics computed from the predictions. """ eval_ds = self.get_eval_tfdataset(eval_dataset) @@ -478,12 +528,22 @@ class TFTrainer: def predict(self, test_dataset: tf.data.Dataset) -> PredictionOutput: """ - Run prediction and return predictions and potential metrics. + Run prediction and returns predictions and potential metrics. + Depending on the dataset and your use case, your test dataset may contain labels. - In that case, this method will also return metrics, like in evaluate(). + In that case, this method will also return metrics, like in :obj:`evaluate()`. + Args: - test_dataset: something similar to a PT Dataset. This is just - temporary before to have a framework-agnostic approach for datasets. + test_dataset (:class:`~tf.data.Dataset`): + Dataset to run the predictions on. + Returns: + `NamedTuple`: + predictions (:obj:`np.ndarray`): + The predictions on :obj:`test_dataset`. + label_ids (:obj:`np.ndarray`, `optional`): + The labels (if the dataset contained some). + metrics (:obj:`Dict[str, float]`, `optional`): + The potential dictionary of metrics (if the dataset contained labels). """ test_ds = self.get_test_tfdataset(test_dataset) @@ -491,7 +551,7 @@ class TFTrainer: def save_model(self, output_dir: Optional[str] = None): """ - Save the pretrained model. + Will save the model, so you can reload it using :obj:`from_pretrained()`. 
""" output_dir = output_dir if output_dir is not None else self.args.output_dir diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index 35458a5ff7..619adb1a1f 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -23,8 +23,11 @@ def is_wandb_available(): class EvalPrediction(NamedTuple): """ - Evaluation output (always contains labels), to be used - to compute metrics. + Evaluation output (always contains labels), to be used to compute metrics. + + Parameters: + predictions (:obj:`np.ndarray`): Predictions of the model. + label_ids (:obj:`np.ndarray`): Targets to be matched. """ predictions: np.ndarray diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index fc97e63d4a..c0d7a4913f 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -35,9 +35,73 @@ class TrainingArguments: TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop itself**. - Using `HfArgumentParser` we can turn this class - into argparse arguments to be able to specify them on - the command line. + Using :class:`~transformers.HfArgumentParser` we can turn this class + into argparse arguments to be able to specify them on the command line. + + Parameters: + output_dir (:obj:`str`): + The output directory where the model predictions and checkpoints will be written. + overwrite_output_dir (:obj:`bool`, `optional`, defaults to :obj:`False`): + If :obj:`True`, overwrite the content of the output directory. Use this to continue training if + :obj:`output_dir` points to a checkpoint directory. + do_train (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to run training or not. + do_eval (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to run evaluation on the dev set or not. 
+ do_predict (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to run predictions on the test set or not. + evaluate_during_training (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to run evaluation during training at each logging step or not. + per_device_train_batch_size (:obj:`int`, `optional`, defaults to 8): + The batch size per GPU/TPU core/CPU for training. + per_device_eval_batch_size (:obj:`int`, `optional`, defaults to 8): + The batch size per GPU/TPU core/CPU for evaluation. + gradient_accumulation_steps (:obj:`int`, `optional`, defaults to 1): + Number of update steps to accumulate the gradients for, before performing a backward/update pass. + learning_rate (:obj:`float`, `optional`, defaults to 5e-5): + The initial learning rate for Adam. + weight_decay (:obj:`float`, `optional`, defaults to 0): + The weight decay to apply (if not zero). + adam_epsilon (:obj:`float`, `optional`, defaults to 1e-8): + Epsilon for the Adam optimizer. + max_grad_norm (:obj:`float`, `optional`, defaults to 1.0): + Maximum gradient norm (for gradient clipping). + num_train_epochs (:obj:`float`, `optional`, defaults to 3.0): + Total number of training epochs to perform. + max_steps (:obj:`int`, `optional`, defaults to -1): + If set to a positive number, the total number of training steps to perform. Overrides + :obj:`num_train_epochs`. + warmup_steps (:obj:`int`, `optional`, defaults to 0): + Number of steps used for a linear warmup from 0 to :obj:`learning_rate`. + logging_dir (:obj:`str`, `optional`): + TensorBoard log directory. Will default to `runs/**CURRENT_DATETIME_HOSTNAME**`. + logging_first_step (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to log and evaluate the first :obj:`global_step` or not. + logging_steps (:obj:`int`, `optional`, defaults to 500): + Number of update steps between two logs. + save_steps (:obj:`int`, `optional`, defaults to 500): + Number of update steps between two checkpoint saves.
+ save_total_limit (:obj:`int`, `optional`): + If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in + :obj:`output_dir`. + no_cuda (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to not use CUDA even when it is available or not. + seed (:obj:`int`, `optional`, defaults to 42): + Random seed for initialization. + fp16 (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to use 16-bit (mixed) precision training (through NVIDIA apex) instead of 32-bit training. + fp16_opt_level (:obj:`str`, `optional`, defaults to 'O1'): + For :obj:`fp16` training, apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details + on the `apex documentation <https://nvidia.github.io/apex/amp.html>`__. + local_rank (:obj:`int`, `optional`, defaults to -1): + During distributed training, the rank of the process. + tpu_num_cores (:obj:`int`, `optional`): + When training on TPU, the number of TPU cores (automatically passed by launcher script). + tpu_metrics_debug (:obj:`bool`, `optional`, defaults to :obj:`False`): + When training on TPU, whether to print debug metrics or not. + dataloader_drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) + or not. """ output_dir: str = field( @@ -141,6 +205,9 @@ class TrainingArguments: @property def train_batch_size(self) -> int: + """ + The actual batch size for training (may differ from :obj:`per_gpu_train_batch_size` in distributed training). + """ if self.per_gpu_train_batch_size: logger.warning( "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future " @@ -151,6 +218,9 @@ class TrainingArguments: @property def eval_batch_size(self) -> int: + """ + The actual batch size for evaluation (may differ from :obj:`per_gpu_eval_batch_size` in distributed training).
+ """ if self.per_gpu_eval_batch_size: logger.warning( "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future " @@ -193,11 +263,21 @@ class TrainingArguments: @property @torch_required def device(self) -> "torch.device": + """ + The device used by this process. + """ return self._setup_devices[0] @property @torch_required def n_gpu(self): + """ + The number of GPUs used by this process. + + Note: + This will only be greater than one when you have multiple GPUs available but are not using distributed + training. For distributed training, it will always be 1. + """ return self._setup_devices[1] def to_json_string(self): diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py index c05acafdc7..f87c7bc994 100644 --- a/src/transformers/training_args_tf.py +++ b/src/transformers/training_args_tf.py @@ -14,6 +14,85 @@ if is_tf_available(): @dataclass class TFTrainingArguments(TrainingArguments): + """ + TrainingArguments is the subset of the arguments we use in our example scripts + **which relate to the training loop itself**. + + Using :class:`~transformers.HfArgumentParser` we can turn this class + into argparse arguments to be able to specify them on the command line. + + Parameters: + output_dir (:obj:`str`): + The output directory where the model predictions and checkpoints will be written. + overwrite_output_dir (:obj:`bool`, `optional`, defaults to :obj:`False`): + If :obj:`True`, overwrite the content of the output directory. Use this to continue training if + :obj:`output_dir` points to a checkpoint directory. + do_train (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to run training or not. + do_eval (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to run evaluation on the dev set or not. + do_predict (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to run predictions on the test set or not. 
+ evaluate_during_training (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to run evaluation during training at each logging step or not. + per_device_train_batch_size (:obj:`int`, `optional`, defaults to 8): + The batch size per GPU/TPU core/CPU for training. + per_device_eval_batch_size (:obj:`int`, `optional`, defaults to 8): + The batch size per GPU/TPU core/CPU for evaluation. + gradient_accumulation_steps (:obj:`int`, `optional`, defaults to 1): + Number of update steps to accumulate the gradients for, before performing a backward/update pass. + learning_rate (:obj:`float`, `optional`, defaults to 5e-5): + The initial learning rate for Adam. + weight_decay (:obj:`float`, `optional`, defaults to 0): + The weight decay to apply (if not zero). + adam_epsilon (:obj:`float`, `optional`, defaults to 1e-8): + Epsilon for the Adam optimizer. + max_grad_norm (:obj:`float`, `optional`, defaults to 1.0): + Maximum gradient norm (for gradient clipping). + num_train_epochs (:obj:`float`, `optional`, defaults to 3.0): + Total number of training epochs to perform. + max_steps (:obj:`int`, `optional`, defaults to -1): + If set to a positive number, the total number of training steps to perform. Overrides + :obj:`num_train_epochs`. + warmup_steps (:obj:`int`, `optional`, defaults to 0): + Number of steps used for a linear warmup from 0 to :obj:`learning_rate`. + logging_dir (:obj:`str`, `optional`): + TensorBoard log directory. Will default to `runs/**CURRENT_DATETIME_HOSTNAME**`. + logging_first_step (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to log and evaluate the first :obj:`global_step` or not. + logging_steps (:obj:`int`, `optional`, defaults to 500): + Number of update steps between two logs. + save_steps (:obj:`int`, `optional`, defaults to 500): + Number of update steps between two checkpoint saves. + save_total_limit (:obj:`int`, `optional`): + If a value is passed, will limit the total amount of checkpoints.
Deletes the older checkpoints in + :obj:`output_dir`. + no_cuda (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to not use CUDA even when it is available or not. + seed (:obj:`int`, `optional`, defaults to 42): + Random seed for initialization. + fp16 (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to use 16-bit (mixed) precision training (through NVIDIA apex) instead of 32-bit training. + fp16_opt_level (:obj:`str`, `optional`, defaults to 'O1'): + For :obj:`fp16` training, apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details + on the `apex documentation <https://nvidia.github.io/apex/amp.html>`__. + local_rank (:obj:`int`, `optional`, defaults to -1): + During distributed training, the rank of the process. + tpu_num_cores (:obj:`int`, `optional`): + When training on TPU, the number of TPU cores (automatically passed by launcher script). + tpu_metrics_debug (:obj:`bool`, `optional`, defaults to :obj:`False`): + When training on TPU, whether to print debug metrics or not. + dataloader_drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) + or not. + tpu_name (:obj:`str`, `optional`): + The name of the TPU the process is running on. + eval_steps (:obj:`int`, `optional`, defaults to 1000): + Number of update steps before two evaluations. + debug (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to activate the trace to record computation graphs and profiling information or not. + """ + tpu_name: str = field( default=None, metadata={"help": "Name of TPU"}, ) @@ -59,9 +138,15 @@ class TFTrainingArguments(TrainingArguments): @property @tf_required def strategy(self) -> "tf.distribute.Strategy": + """ + The strategy used for distributed training. + """ return self._setup_strategy @property @tf_required def n_gpu(self) -> int: + """ + The number of replicas (GPUs or TPU cores) used in this training.
+ """ return self._setup_strategy.num_replicas_in_sync