Documentation for the Trainer API (#5383)
* Documentation for the Trainer API * Address review comments * Address comments
This commit is contained in:
@@ -173,6 +173,7 @@ conversion utilities for the following models:
|
|||||||
main_classes/pipelines
|
main_classes/pipelines
|
||||||
main_classes/optimizer_schedules
|
main_classes/optimizer_schedules
|
||||||
main_classes/processors
|
main_classes/processors
|
||||||
|
main_classes/trainer
|
||||||
model_doc/auto
|
model_doc/auto
|
||||||
model_doc/encoderdecoder
|
model_doc/encoderdecoder
|
||||||
model_doc/bert
|
model_doc/bert
|
||||||
|
|||||||
45
docs/source/main_classes/trainer.rst
Normal file
45
docs/source/main_classes/trainer.rst
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
Trainer
|
||||||
|
----------
|
||||||
|
|
||||||
|
The :class:`~transformers.Trainer` and :class:`~transformers.TFTrainer` classes provide an API for feature-complete
|
||||||
|
training in most standard use cases. It's used in most of the :doc:`example scripts <../examples>`.
|
||||||
|
|
||||||
|
Before instantiating your :class:`~transformers.Trainer`/:class:`~transformers.TFTrainer`, create a
|
||||||
|
:class:`~transformers.TrainingArguments`/:class:`~transformers.TFTrainingArguments` to access all the points of
|
||||||
|
customization during training.
|
||||||
|
|
||||||
|
The API supports distributed training on multiple GPUs/TPUs, mixed precision through `NVIDIA Apex
|
||||||
|
<https://github.com/NVIDIA/apex>`__ for PyTorch and :obj:`tf.keras.mixed_precision` for TensorFlow.
|
||||||
|
|
||||||
|
``Trainer``
|
||||||
|
~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.Trainer
|
||||||
|
:members:
|
||||||
|
|
||||||
|
``TFTrainer``
|
||||||
|
~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFTrainer
|
||||||
|
:members:
|
||||||
|
|
||||||
|
``TrainingArguments``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TrainingArguments
|
||||||
|
:members:
|
||||||
|
|
||||||
|
``TFTrainingArguments``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFTrainingArguments
|
||||||
|
:members:
|
||||||
|
|
||||||
|
Utilities
|
||||||
|
~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.EvalPrediction
|
||||||
|
|
||||||
|
.. autofunction:: transformers.set_seed
|
||||||
|
|
||||||
|
.. autofunction:: transformers.torch_distributed_zero_first
|
||||||
@@ -397,7 +397,7 @@ if is_torch_available():
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Trainer
|
# Trainer
|
||||||
from .trainer import Trainer, set_seed, torch_distributed_zero_first, EvalPrediction
|
from .trainer import Trainer, set_seed, torch_distributed_zero_first
|
||||||
from .data.data_collator import default_data_collator, DataCollator, DataCollatorForLanguageModeling
|
from .data.data_collator import default_data_collator, DataCollator, DataCollatorForLanguageModeling
|
||||||
from .data.datasets import GlueDataset, TextDataset, LineByLineTextDataset, GlueDataTrainingArguments
|
from .data.datasets import GlueDataset, TextDataset, LineByLineTextDataset, GlueDataTrainingArguments
|
||||||
|
|
||||||
|
|||||||
@@ -61,6 +61,12 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
def set_seed(seed: int):
|
def set_seed(seed: int):
|
||||||
|
"""
|
||||||
|
Helper function for reproducible behavior to set the seed in ``random``, ``numpy`` and ``torch``.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
seed (:obj:`int`): The seed to set.
|
||||||
|
"""
|
||||||
random.seed(seed)
|
random.seed(seed)
|
||||||
np.random.seed(seed)
|
np.random.seed(seed)
|
||||||
torch.manual_seed(seed)
|
torch.manual_seed(seed)
|
||||||
@@ -72,6 +78,9 @@ def set_seed(seed: int):
|
|||||||
def torch_distributed_zero_first(local_rank: int):
|
def torch_distributed_zero_first(local_rank: int):
|
||||||
"""
|
"""
|
||||||
Decorator to make all processes in distributed training wait for each local_master to do something.
|
Decorator to make all processes in distributed training wait for each local_master to do something.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
local_rank (:obj:`int`): The rank of the local process.
|
||||||
"""
|
"""
|
||||||
if local_rank not in [-1, 0]:
|
if local_rank not in [-1, 0]:
|
||||||
torch.distributed.barrier()
|
torch.distributed.barrier()
|
||||||
@@ -133,7 +142,31 @@ def get_tpu_sampler(dataset: Dataset):
|
|||||||
class Trainer:
|
class Trainer:
|
||||||
"""
|
"""
|
||||||
Trainer is a simple but feature-complete training and eval loop for PyTorch,
|
Trainer is a simple but feature-complete training and eval loop for PyTorch,
|
||||||
optimized for Transformers.
|
optimized for 🤗 Transformers.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model (:class:`~transformers.PreTrainedModel`):
|
||||||
|
The model to train, evaluate or use for predictions.
|
||||||
|
args (:class:`~transformers.TrainingArguments`):
|
||||||
|
The arguments to tweak training.
|
||||||
|
data_collator (:obj:`DataCollator`, `optional`, defaults to :func:`~transformers.default_data_collator`):
|
||||||
|
The function to use to form a batch from a list of elements of :obj:`train_dataset` or
|
||||||
|
:obj:`eval_dataset`.
|
||||||
|
train_dataset (:obj:`Dataset`, `optional`):
|
||||||
|
The dataset to use for training.
|
||||||
|
eval_dataset (:obj:`Dataset`, `optional`):
|
||||||
|
The dataset to use for evaluation.
|
||||||
|
compute_metrics (:obj:`Callable[[EvalPrediction], Dict]`, `optional`):
|
||||||
|
The function that will be used to compute metrics at evaluation. Must take a
|
||||||
|
:class:`~transformers.EvalPrediction` and return a dictionary mapping strings to metric values.
|
||||||
|
prediction_loss_only (:obj:`bool`, `optional`, defaults to `False`):
|
||||||
|
When performing evaluation and predictions, only returns the loss.
|
||||||
|
tb_writer (:obj:`SummaryWriter`, `optional`):
|
||||||
|
Object to write to TensorBoard.
|
||||||
|
optimizers (:obj:`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, `optional`):
|
||||||
|
A tuple containing the optimizer and the scheduler to use. Will default to an instance of
|
||||||
|
:class:`~transformers.AdamW` on your model and a scheduler given by
|
||||||
|
:func:`~transformers.get_linear_schedule_with_warmup` controlled by :obj:`args`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
model: PreTrainedModel
|
model: PreTrainedModel
|
||||||
@@ -160,14 +193,6 @@ class Trainer:
|
|||||||
tb_writer: Optional["SummaryWriter"] = None,
|
tb_writer: Optional["SummaryWriter"] = None,
|
||||||
optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None,
|
optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None,
|
||||||
):
|
):
|
||||||
"""
|
|
||||||
Trainer is a simple but feature-complete training and eval loop for PyTorch,
|
|
||||||
optimized for Transformers.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
prediction_loss_only:
|
|
||||||
(Optional) in evaluation and prediction, only return the loss
|
|
||||||
"""
|
|
||||||
self.model = model.to(args.device)
|
self.model = model.to(args.device)
|
||||||
self.args = args
|
self.args = args
|
||||||
self.data_collator = data_collator if data_collator is not None else default_data_collator
|
self.data_collator = data_collator if data_collator is not None else default_data_collator
|
||||||
@@ -210,6 +235,9 @@ class Trainer:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def get_train_dataloader(self) -> DataLoader:
|
def get_train_dataloader(self) -> DataLoader:
|
||||||
|
"""
|
||||||
|
Returns the training :class:`~torch.utils.data.DataLoader`.
|
||||||
|
"""
|
||||||
if self.train_dataset is None:
|
if self.train_dataset is None:
|
||||||
raise ValueError("Trainer: training requires a train_dataset.")
|
raise ValueError("Trainer: training requires a train_dataset.")
|
||||||
if is_torch_tpu_available():
|
if is_torch_tpu_available():
|
||||||
@@ -232,6 +260,13 @@ class Trainer:
|
|||||||
return data_loader
|
return data_loader
|
||||||
|
|
||||||
def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader:
|
def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader:
|
||||||
|
"""
|
||||||
|
Returns the evaluation :class:`~torch.utils.data.DataLoader`.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
eval_dataset (:obj:`Dataset`, `optional`):
|
||||||
|
If provided, will override `self.eval_dataset`.
|
||||||
|
"""
|
||||||
if eval_dataset is None and self.eval_dataset is None:
|
if eval_dataset is None and self.eval_dataset is None:
|
||||||
raise ValueError("Trainer: evaluation requires an eval_dataset.")
|
raise ValueError("Trainer: evaluation requires an eval_dataset.")
|
||||||
|
|
||||||
@@ -257,6 +292,12 @@ class Trainer:
|
|||||||
return data_loader
|
return data_loader
|
||||||
|
|
||||||
def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
|
def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
|
||||||
|
"""
|
||||||
|
Returns the test :class:`~torch.utils.data.DataLoader`.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
test_dataset (:obj:`Dataset`): The test dataset to use.
|
||||||
|
"""
|
||||||
# We use the same batch_size as for eval.
|
# We use the same batch_size as for eval.
|
||||||
if is_torch_tpu_available():
|
if is_torch_tpu_available():
|
||||||
sampler = SequentialDistributedSampler(
|
sampler = SequentialDistributedSampler(
|
||||||
@@ -283,9 +324,8 @@ class Trainer:
|
|||||||
"""
|
"""
|
||||||
Setup the optimizer and the learning rate scheduler.
|
Setup the optimizer and the learning rate scheduler.
|
||||||
|
|
||||||
We provide a reasonable default that works well.
|
We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
|
||||||
If you want to use something else, you can pass a tuple in the Trainer's init,
|
Trainer's init through :obj:`optimizers`, or override this method in a subclass.
|
||||||
or override this method in a subclass.
|
|
||||||
"""
|
"""
|
||||||
if self.optimizers is not None:
|
if self.optimizers is not None:
|
||||||
return self.optimizers
|
return self.optimizers
|
||||||
@@ -336,7 +376,7 @@ class Trainer:
|
|||||||
|
|
||||||
def num_examples(self, dataloader: DataLoader) -> int:
|
def num_examples(self, dataloader: DataLoader) -> int:
|
||||||
"""
|
"""
|
||||||
Helper to get num of examples from a DataLoader, by accessing its Dataset.
|
Helper to get number of samples in a :class:`~torch.utils.data.DataLoader` by accessing its Dataset.
|
||||||
"""
|
"""
|
||||||
return len(dataloader.dataset)
|
return len(dataloader.dataset)
|
||||||
|
|
||||||
@@ -345,9 +385,9 @@ class Trainer:
|
|||||||
Main training entry point.
|
Main training entry point.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
model_path:
|
model_path (:obj:`str`, `optional`):
|
||||||
(Optional) Local path to model if model to train has been instantiated from a local path
|
Local path to the model if the model to train has been instantiated from a local path. If present,
|
||||||
If present, we will try reloading the optimizer/scheduler states from there.
|
training will resume from the optimizer/scheduler states loaded here.
|
||||||
"""
|
"""
|
||||||
train_dataloader = self.get_train_dataloader()
|
train_dataloader = self.get_train_dataloader()
|
||||||
if self.args.max_steps > 0:
|
if self.args.max_steps > 0:
|
||||||
@@ -611,8 +651,7 @@ class Trainer:
|
|||||||
|
|
||||||
def save_model(self, output_dir: Optional[str] = None):
|
def save_model(self, output_dir: Optional[str] = None):
|
||||||
"""
|
"""
|
||||||
Saving best-practices: if you use default names for the model,
|
Will save the model, so you can reload it using :obj:`from_pretrained()`.
|
||||||
you can reload it using from_pretrained().
|
|
||||||
|
|
||||||
Will only save from the world_master process (unless in TPUs).
|
Will only save from the world_master process (unless in TPUs).
|
||||||
"""
|
"""
|
||||||
@@ -683,22 +722,18 @@ class Trainer:
|
|||||||
logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
|
logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
|
||||||
shutil.rmtree(checkpoint)
|
shutil.rmtree(checkpoint)
|
||||||
|
|
||||||
def evaluate(
|
def evaluate(self, eval_dataset: Optional[Dataset] = None) -> Dict[str, float]:
|
||||||
self, eval_dataset: Optional[Dataset] = None, prediction_loss_only: Optional[bool] = None,
|
|
||||||
) -> Dict[str, float]:
|
|
||||||
"""
|
"""
|
||||||
Run evaluation and return metrics.
|
Runs evaluation and returns metrics.
|
||||||
|
|
||||||
The calling script will be responsible for providing a method to compute metrics, as they are
|
The calling script will be responsible for providing a method to compute metrics, as they are
|
||||||
task-dependent.
|
task-dependent (pass it to the init :obj:`compute_metrics` argument).
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
eval_dataset: (Optional) Pass a dataset if you wish to override
|
eval_dataset (:obj:`Dataset`, `optional`):
|
||||||
the one on the instance.
|
Pass a dataset if you wish to override :obj:`self.eval_dataset`.
|
||||||
Returns:
|
Returns:
|
||||||
A dict containing:
|
A dictionary containing the evaluation loss and the potential metrics computed from the predictions.
|
||||||
- the eval loss
|
|
||||||
- the potential metrics computed from the predictions
|
|
||||||
"""
|
"""
|
||||||
eval_dataloader = self.get_eval_dataloader(eval_dataset)
|
eval_dataloader = self.get_eval_dataloader(eval_dataset)
|
||||||
|
|
||||||
@@ -714,10 +749,22 @@ class Trainer:
|
|||||||
|
|
||||||
def predict(self, test_dataset: Dataset) -> PredictionOutput:
|
def predict(self, test_dataset: Dataset) -> PredictionOutput:
|
||||||
"""
|
"""
|
||||||
Run prediction and return predictions and potential metrics.
|
Runs prediction and returns predictions and potential metrics.
|
||||||
|
|
||||||
Depending on the dataset and your use case, your test dataset may contain labels.
|
Depending on the dataset and your use case, your test dataset may contain labels.
|
||||||
In that case, this method will also return metrics, like in evaluate().
|
In that case, this method will also return metrics, like in :obj:`evaluate()`.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
test_dataset (:obj:`Dataset`):
|
||||||
|
Dataset to run the predictions on.
|
||||||
|
Returns:
|
||||||
|
`NamedTuple`:
|
||||||
|
predictions (:obj:`np.ndarray`):
|
||||||
|
The predictions on :obj:`test_dataset`.
|
||||||
|
label_ids (:obj:`np.ndarray`, `optional`):
|
||||||
|
The labels (if the dataset contained some).
|
||||||
|
metrics (:obj:`Dict[str, float]`, `optional`):
|
||||||
|
The potential dictionary of metrics (if the dataset contained labels).
|
||||||
"""
|
"""
|
||||||
test_dataloader = self.get_test_dataloader(test_dataset)
|
test_dataloader = self.get_test_dataloader(test_dataset)
|
||||||
|
|
||||||
|
|||||||
@@ -29,6 +29,34 @@ def set_seed(seed: int):
|
|||||||
|
|
||||||
|
|
||||||
class TFTrainer:
|
class TFTrainer:
|
||||||
|
"""
|
||||||
|
TFTrainer is a simple but feature-complete training and eval loop for TensorFlow,
|
||||||
|
optimized for 🤗 Transformers.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model (:class:`~transformers.TFPreTrainedModel`):
|
||||||
|
The model to train, evaluate or use for predictions.
|
||||||
|
args (:class:`~transformers.TFTrainingArguments`):
|
||||||
|
The arguments to tweak training.
|
||||||
|
train_dataset (:class:`~tf.data.Dataset`, `optional`):
|
||||||
|
The dataset to use for training.
|
||||||
|
eval_dataset (:class:`~tf.data.Dataset`, `optional`):
|
||||||
|
The dataset to use for evaluation.
|
||||||
|
compute_metrics (:obj:`Callable[[EvalPrediction], Dict]`, `optional`):
|
||||||
|
The function that will be used to compute metrics at evaluation. Must take a
|
||||||
|
:class:`~transformers.EvalPrediction` and return a dictionary mapping strings to metric values.
|
||||||
|
prediction_loss_only (:obj:`bool`, `optional`, defaults to `False`):
|
||||||
|
When performing evaluation and predictions, only returns the loss.
|
||||||
|
tb_writer (:obj:`tf.summary.SummaryWriter`, `optional`):
|
||||||
|
Object to write to TensorBoard.
|
||||||
|
optimizers (:obj:`Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule]`, `optional`):
|
||||||
|
A tuple containing the optimizer and the scheduler to use. The optimizer defaults to an instance of
|
||||||
|
:class:`tf.keras.optimizers.Adam` if :obj:`args.weight_decay_rate` is 0 else an instance of
|
||||||
|
:class:`~transformers.AdamWeightDecay`. The scheduler will default to an instance of
|
||||||
|
:class:`tf.keras.optimizers.schedules.PolynomialDecay` if :obj:`args.num_warmup_steps` is 0 else
|
||||||
|
an instance of :class:`~transformers.WarmUp`.
|
||||||
|
"""
|
||||||
|
|
||||||
model: TFPreTrainedModel
|
model: TFPreTrainedModel
|
||||||
args: TFTrainingArguments
|
args: TFTrainingArguments
|
||||||
train_dataset: Optional[tf.data.Dataset]
|
train_dataset: Optional[tf.data.Dataset]
|
||||||
@@ -78,6 +106,9 @@ class TFTrainer:
|
|||||||
set_seed(self.args.seed)
|
set_seed(self.args.seed)
|
||||||
|
|
||||||
def get_train_tfdataset(self) -> tf.data.Dataset:
|
def get_train_tfdataset(self) -> tf.data.Dataset:
|
||||||
|
"""
|
||||||
|
Returns the training :class:`~tf.data.Dataset`.
|
||||||
|
"""
|
||||||
if self.train_dataset is None:
|
if self.train_dataset is None:
|
||||||
raise ValueError("Trainer: training requires a train_dataset.")
|
raise ValueError("Trainer: training requires a train_dataset.")
|
||||||
|
|
||||||
@@ -101,6 +132,13 @@ class TFTrainer:
|
|||||||
return self.args.strategy.experimental_distribute_dataset(ds)
|
return self.args.strategy.experimental_distribute_dataset(ds)
|
||||||
|
|
||||||
def get_eval_tfdataset(self, eval_dataset: Optional[tf.data.Dataset] = None) -> tf.data.Dataset:
|
def get_eval_tfdataset(self, eval_dataset: Optional[tf.data.Dataset] = None) -> tf.data.Dataset:
|
||||||
|
"""
|
||||||
|
Returns the evaluation :class:`~tf.data.Dataset`.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
eval_dataset (:class:`~tf.data.Dataset`, `optional`):
|
||||||
|
If provided, will override `self.eval_dataset`.
|
||||||
|
"""
|
||||||
if eval_dataset is None and self.eval_dataset is None:
|
if eval_dataset is None and self.eval_dataset is None:
|
||||||
raise ValueError("Trainer: evaluation requires an eval_dataset.")
|
raise ValueError("Trainer: evaluation requires an eval_dataset.")
|
||||||
|
|
||||||
@@ -114,6 +152,12 @@ class TFTrainer:
|
|||||||
return self.args.strategy.experimental_distribute_dataset(ds)
|
return self.args.strategy.experimental_distribute_dataset(ds)
|
||||||
|
|
||||||
def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset:
|
def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset:
|
||||||
|
"""
|
||||||
|
Returns a test :class:`~tf.data.Dataset`.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
test_dataset (:class:`~tf.data.Dataset`): The dataset to use.
|
||||||
|
"""
|
||||||
ds = test_dataset.batch(self.args.eval_batch_size, drop_remainder=self.args.dataloader_drop_last)
|
ds = test_dataset.batch(self.args.eval_batch_size, drop_remainder=self.args.dataloader_drop_last)
|
||||||
|
|
||||||
return self.args.strategy.experimental_distribute_dataset(ds)
|
return self.args.strategy.experimental_distribute_dataset(ds)
|
||||||
@@ -124,9 +168,8 @@ class TFTrainer:
|
|||||||
"""
|
"""
|
||||||
Setup the optimizer and the learning rate scheduler.
|
Setup the optimizer and the learning rate scheduler.
|
||||||
|
|
||||||
We provide a reasonable default that works well.
|
We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
|
||||||
If you want to use something else, you can pass a tuple in the Trainer's init,
|
TFTrainer's init through :obj:`optimizers`, or override this method in a subclass.
|
||||||
or override this method in a subclass.
|
|
||||||
"""
|
"""
|
||||||
if self.optimizers is not None:
|
if self.optimizers is not None:
|
||||||
return self.optimizers
|
return self.optimizers
|
||||||
@@ -263,11 +306,18 @@ class TFTrainer:
|
|||||||
|
|
||||||
logger.info(output)
|
logger.info(output)
|
||||||
|
|
||||||
def evaluate(
|
def evaluate(self, eval_dataset: Optional[tf.data.Dataset] = None) -> Dict[str, float]:
|
||||||
self, eval_dataset: Optional[tf.data.Dataset] = None, prediction_loss_only: Optional[bool] = None
|
|
||||||
) -> Dict[str, float]:
|
|
||||||
"""
|
"""
|
||||||
Prediction/evaluation loop, shared by `evaluate()` and `predict()`.
|
Runs evaluation and returns metrics.
|
||||||
|
|
||||||
|
The calling script will be responsible for providing a method to compute metrics, as they are
|
||||||
|
task-dependent (pass it to the init :obj:`compute_metrics` argument).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
eval_dataset (:class:`~tf.data.Dataset`, `optional`):
|
||||||
|
Pass a dataset if you wish to override :obj:`self.eval_dataset`.
|
||||||
|
Returns:
|
||||||
|
A dictionary containing the evaluation loss and the potential metrics computed from the predictions.
|
||||||
"""
|
"""
|
||||||
eval_ds = self.get_eval_tfdataset(eval_dataset)
|
eval_ds = self.get_eval_tfdataset(eval_dataset)
|
||||||
|
|
||||||
@@ -478,12 +528,22 @@ class TFTrainer:
|
|||||||
|
|
||||||
def predict(self, test_dataset: tf.data.Dataset) -> PredictionOutput:
|
def predict(self, test_dataset: tf.data.Dataset) -> PredictionOutput:
|
||||||
"""
|
"""
|
||||||
Run prediction and return predictions and potential metrics.
|
Runs prediction and returns predictions and potential metrics.
|
||||||
|
|
||||||
Depending on the dataset and your use case, your test dataset may contain labels.
|
Depending on the dataset and your use case, your test dataset may contain labels.
|
||||||
In that case, this method will also return metrics, like in evaluate().
|
In that case, this method will also return metrics, like in :obj:`evaluate()`.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
test_dataset: something similar to a PT Dataset. This is just
|
test_dataset (:class:`~tf.data.Dataset`):
|
||||||
temporary before to have a framework-agnostic approach for datasets.
|
Dataset to run the predictions on.
|
||||||
|
Returns:
|
||||||
|
`NamedTuple`:
|
||||||
|
predictions (:obj:`np.ndarray`):
|
||||||
|
The predictions on :obj:`test_dataset`.
|
||||||
|
label_ids (:obj:`np.ndarray`, `optional`):
|
||||||
|
The labels (if the dataset contained some).
|
||||||
|
metrics (:obj:`Dict[str, float]`, `optional`):
|
||||||
|
The potential dictionary of metrics (if the dataset contained labels).
|
||||||
"""
|
"""
|
||||||
test_ds = self.get_test_tfdataset(test_dataset)
|
test_ds = self.get_test_tfdataset(test_dataset)
|
||||||
|
|
||||||
@@ -491,7 +551,7 @@ class TFTrainer:
|
|||||||
|
|
||||||
def save_model(self, output_dir: Optional[str] = None):
|
def save_model(self, output_dir: Optional[str] = None):
|
||||||
"""
|
"""
|
||||||
Save the pretrained model.
|
Will save the model, so you can reload it using :obj:`from_pretrained()`.
|
||||||
"""
|
"""
|
||||||
output_dir = output_dir if output_dir is not None else self.args.output_dir
|
output_dir = output_dir if output_dir is not None else self.args.output_dir
|
||||||
|
|
||||||
|
|||||||
@@ -23,8 +23,11 @@ def is_wandb_available():
|
|||||||
|
|
||||||
class EvalPrediction(NamedTuple):
|
class EvalPrediction(NamedTuple):
|
||||||
"""
|
"""
|
||||||
Evaluation output (always contains labels), to be used
|
Evaluation output (always contains labels), to be used to compute metrics.
|
||||||
to compute metrics.
|
|
||||||
|
Parameters:
|
||||||
|
predictions (:obj:`np.ndarray`): Predictions of the model.
|
||||||
|
label_ids (:obj:`np.ndarray`): Targets to be matched.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
predictions: np.ndarray
|
predictions: np.ndarray
|
||||||
|
|||||||
@@ -35,9 +35,73 @@ class TrainingArguments:
|
|||||||
TrainingArguments is the subset of the arguments we use in our example scripts
|
TrainingArguments is the subset of the arguments we use in our example scripts
|
||||||
**which relate to the training loop itself**.
|
**which relate to the training loop itself**.
|
||||||
|
|
||||||
Using `HfArgumentParser` we can turn this class
|
Using :class:`~transformers.HfArgumentParser` we can turn this class
|
||||||
into argparse arguments to be able to specify them on
|
into argparse arguments to be able to specify them on the command line.
|
||||||
the command line.
|
|
||||||
|
Parameters:
|
||||||
|
output_dir (:obj:`str`):
|
||||||
|
The output directory where the model predictions and checkpoints will be written.
|
||||||
|
overwrite_output_dir (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
If :obj:`True`, overwrite the content of the output directory. Use this to continue training if
|
||||||
|
:obj:`output_dir` points to a checkpoint directory.
|
||||||
|
do_train (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Whether to run training or not.
|
||||||
|
do_eval (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Whether to run evaluation on the dev set or not.
|
||||||
|
do_predict (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Whether to run predictions on the test set or not.
|
||||||
|
evaluate_during_training (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Whether to run evaluation during training at each logging step or not.
|
||||||
|
per_device_train_batch_size (:obj:`int`, `optional`, defaults to 8):
|
||||||
|
The batch size per GPU/TPU core/CPU for training.
|
||||||
|
per_device_eval_batch_size (:obj:`int`, `optional`, defaults to 8):
|
||||||
|
The batch size per GPU/TPU core/CPU for evaluation.
|
||||||
|
gradient_accumulation_steps (:obj:`int`, `optional`, defaults to 1):
|
||||||
|
Number of update steps to accumulate the gradients for, before performing a backward/update pass.
|
||||||
|
learning_rate (:obj:`float`, `optional`, defaults to 5e-5):
|
||||||
|
The initial learning rate for Adam.
|
||||||
|
weight_decay (:obj:`float`, `optional`, defaults to 0):
|
||||||
|
The weight decay to apply (if not zero).
|
||||||
|
adam_epsilon (:obj:`float`, `optional`, defaults to 1e-8):
|
||||||
|
Epsilon for the Adam optimizer.
|
||||||
|
max_grad_norm (:obj:`float`, `optional`, defaults to 1.0):
|
||||||
|
Maximum gradient norm (for gradient clipping).
|
||||||
|
num_train_epochs (:obj:`float`, `optional`, defaults to 3.0):
|
||||||
|
Total number of training epochs to perform.
|
||||||
|
max_steps (:obj:`int`, `optional`, defaults to -1):
|
||||||
|
If set to a positive number, the total number of training steps to perform. Overrides
|
||||||
|
:obj:`num_train_epochs`.
|
||||||
|
warmup_steps (:obj:`int`, `optional`, defaults to 0):
|
||||||
|
Number of steps used for a linear warmup from 0 to :obj:`learning_rate`.
|
||||||
|
logging_dir (:obj:`str`, `optional`):
|
||||||
|
Tensorboard log directory. Will default to `runs/**CURRENT_DATETIME_HOSTNAME**`.
|
||||||
|
logging_first_step (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Whether to log and evaluate the first :obj:`global_step` or not.
|
||||||
|
logging_steps (:obj:`int`, `optional`, defaults to 500):
|
||||||
|
Number of update steps between two logs.
|
||||||
|
save_steps (:obj:`int`, `optional`, defaults to 500):
|
||||||
|
Number of update steps between two checkpoint saves.
|
||||||
|
save_total_limit (:obj:`int`, `optional`):
|
||||||
|
If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in
|
||||||
|
:obj:`output_dir`.
|
||||||
|
no_cuda (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Whether to not use CUDA even when it is available or not.
|
||||||
|
seed (:obj:`int`, `optional`, defaults to 42):
|
||||||
|
Random seed for initialization.
|
||||||
|
fp16 (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Whether to use 16-bit (mixed) precision training (through NVIDIA apex) instead of 32-bit training.
|
||||||
|
fp16_opt_level (:obj:`str`, `optional`, defaults to 'O1'):
|
||||||
|
For :obj:`fp16` training, apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details
|
||||||
|
on the `apex documentation <https://nvidia.github.io/apex/amp.html>`__.
|
||||||
|
local_rank (:obj:`int`, `optional`, defaults to -1):
|
||||||
|
During distributed training, the rank of the process.
|
||||||
|
tpu_num_cores (:obj:`int`, `optional`):
|
||||||
|
When training on TPU, the number of TPU cores (automatically passed by launcher script).
|
||||||
|
tpu_metrics_debug (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
When training on TPU, whether to print debug metrics or not.
|
||||||
|
dataloader_drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size)
|
||||||
|
or not.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
output_dir: str = field(
|
output_dir: str = field(
|
||||||
@@ -141,6 +205,9 @@ class TrainingArguments:
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def train_batch_size(self) -> int:
|
def train_batch_size(self) -> int:
|
||||||
|
"""
|
||||||
|
The actual batch size for training (may differ from :obj:`per_device_train_batch_size` in distributed training).
|
||||||
|
"""
|
||||||
if self.per_gpu_train_batch_size:
|
if self.per_gpu_train_batch_size:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future "
|
"Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future "
|
||||||
@@ -151,6 +218,9 @@ class TrainingArguments:
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def eval_batch_size(self) -> int:
|
def eval_batch_size(self) -> int:
|
||||||
|
"""
|
||||||
|
The actual batch size for evaluation (may differ from :obj:`per_device_eval_batch_size` in distributed training).
|
||||||
|
"""
|
||||||
if self.per_gpu_eval_batch_size:
|
if self.per_gpu_eval_batch_size:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future "
|
"Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future "
|
||||||
@@ -193,11 +263,21 @@ class TrainingArguments:
|
|||||||
@property
|
@property
|
||||||
@torch_required
|
@torch_required
|
||||||
def device(self) -> "torch.device":
|
def device(self) -> "torch.device":
|
||||||
|
"""
|
||||||
|
The device used by this process.
|
||||||
|
"""
|
||||||
return self._setup_devices[0]
|
return self._setup_devices[0]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@torch_required
|
@torch_required
|
||||||
def n_gpu(self):
|
def n_gpu(self):
|
||||||
|
"""
|
||||||
|
The number of GPUs used by this process.
|
||||||
|
|
||||||
|
Note:
|
||||||
|
This will only be greater than one when you have multiple GPUs available but are not using distributed
|
||||||
|
training. For distributed training, it will always be 1.
|
||||||
|
"""
|
||||||
return self._setup_devices[1]
|
return self._setup_devices[1]
|
||||||
|
|
||||||
def to_json_string(self):
|
def to_json_string(self):
|
||||||
|
|||||||
@@ -14,6 +14,85 @@ if is_tf_available():
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class TFTrainingArguments(TrainingArguments):
|
class TFTrainingArguments(TrainingArguments):
|
||||||
|
"""
|
||||||
|
TrainingArguments is the subset of the arguments we use in our example scripts
|
||||||
|
**which relate to the training loop itself**.
|
||||||
|
|
||||||
|
Using :class:`~transformers.HfArgumentParser` we can turn this class
|
||||||
|
into argparse arguments to be able to specify them on the command line.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
output_dir (:obj:`str`):
|
||||||
|
The output directory where the model predictions and checkpoints will be written.
|
||||||
|
overwrite_output_dir (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
If :obj:`True`, overwrite the content of the output directory. Use this to continue training if
|
||||||
|
:obj:`output_dir` points to a checkpoint directory.
|
||||||
|
do_train (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Whether to run training or not.
|
||||||
|
do_eval (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Whether to run evaluation on the dev set or not.
|
||||||
|
do_predict (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Whether to run predictions on the test set or not.
|
||||||
|
evaluate_during_training (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Whether to run evaluation during training at each logging step or not.
|
||||||
|
per_device_train_batch_size (:obj:`int`, `optional`, defaults to 8):
|
||||||
|
The batch size per GPU/TPU core/CPU for training.
|
||||||
|
per_device_eval_batch_size (:obj:`int`, `optional`, defaults to 8):
|
||||||
|
The batch size per GPU/TPU core/CPU for evaluation.
|
||||||
|
gradient_accumulation_steps (:obj:`int`, `optional`, defaults to 1):
|
||||||
|
Number of update steps to accumulate the gradients for, before performing a backward/update pass.
|
||||||
|
learning_rate (:obj:`float`, `optional`, defaults to 5e-5):
|
||||||
|
The initial learning rate for Adam.
|
||||||
|
weight_decay (:obj:`float`, `optional`, defaults to 0):
|
||||||
|
The weight decay to apply (if not zero).
|
||||||
|
adam_epsilon (:obj:`float`, `optional`, defaults to 1e-8):
|
||||||
|
Epsilon for the Adam optimizer.
|
||||||
|
max_grad_norm (:obj:`float`, `optional`, defaults to 1.0):
|
||||||
|
Maximum gradient norm (for gradient clipping).
|
||||||
|
num_train_epochs (:obj:`float`, `optional`, defaults to 3.0):
|
||||||
|
Total number of training epochs to perform.
|
||||||
|
max_steps (:obj:`int`, `optional`, defaults to -1):
|
||||||
|
If set to a positive number, the total number of training steps to perform. Overrides
|
||||||
|
:obj:`num_train_epochs`.
|
||||||
|
warmup_steps (:obj:`int`, `optional`, defaults to 0):
|
||||||
|
Number of steps used for a linear warmup from 0 to :obj:`learning_rate`.
|
||||||
|
logging_dir (:obj:`str`, `optional`):
|
||||||
|
Tensorboard log directory. Will default to `runs/**CURRENT_DATETIME_HOSTNAME**`.
|
||||||
|
logging_first_step (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Whether to log and evaluate the first :obj:`global_step` or not.
|
||||||
|
logging_steps (:obj:`int`, `optional`, defaults to 500):
|
||||||
|
Number of update steps between two logs.
|
||||||
|
save_steps (:obj:`int`, `optional`, defaults to 500):
|
||||||
|
Number of update steps between two checkpoint saves.
|
||||||
|
save_total_limit (:obj:`int`, `optional`):
|
||||||
|
If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in
|
||||||
|
:obj:`output_dir`.
|
||||||
|
no_cuda (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Whether to not use CUDA even when it is available or not.
|
||||||
|
seed (:obj:`int`, `optional`, defaults to 42):
|
||||||
|
Random seed for initialization.
|
||||||
|
fp16 (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Whether to use 16-bit (mixed) precision training (through NVIDIA apex) instead of 32-bit training.
|
||||||
|
fp16_opt_level (:obj:`str`, `optional`, defaults to 'O1'):
|
||||||
|
For :obj:`fp16` training, apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details
|
||||||
|
on the `apex documentation <https://nvidia.github.io/apex/amp.html>`__.
|
||||||
|
local_rank (:obj:`int`, `optional`, defaults to -1):
|
||||||
|
During distributed training, the rank of the process.
|
||||||
|
tpu_num_cores (:obj:`int`, `optional`):
|
||||||
|
When training on TPU, the number of TPU cores (automatically passed by launcher script).
|
||||||
|
tpu_metrics_debug (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
When training on TPU, whether to print debug metrics or not.
|
||||||
|
dataloader_drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size)
|
||||||
|
or not.
|
||||||
|
tpu_name (:obj:`str`, `optional`):
|
||||||
|
The name of the TPU the process is running on.
|
||||||
|
eval_steps (:obj:`int`, `optional`, defaults to 1000):
|
||||||
|
Number of update steps between two evaluations.
|
||||||
|
debug (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Whether to activate the trace to record computation graphs and profiling information or not.
|
||||||
|
"""
|
||||||
|
|
||||||
tpu_name: str = field(
|
tpu_name: str = field(
|
||||||
default=None, metadata={"help": "Name of TPU"},
|
default=None, metadata={"help": "Name of TPU"},
|
||||||
)
|
)
|
||||||
@@ -59,9 +138,15 @@ class TFTrainingArguments(TrainingArguments):
|
|||||||
@property
|
@property
|
||||||
@tf_required
|
@tf_required
|
||||||
def strategy(self) -> "tf.distribute.Strategy":
|
def strategy(self) -> "tf.distribute.Strategy":
|
||||||
|
"""
|
||||||
|
The strategy used for distributed training.
|
||||||
|
"""
|
||||||
return self._setup_strategy
|
return self._setup_strategy
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@tf_required
|
@tf_required
|
||||||
def n_gpu(self) -> int:
|
def n_gpu(self) -> int:
|
||||||
|
"""
|
||||||
|
The number of replicas (GPUs or TPU cores) used in this training.
|
||||||
|
"""
|
||||||
return self._setup_strategy.num_replicas_in_sync
|
return self._setup_strategy.num_replicas_in_sync
|
||||||
|
|||||||
Reference in New Issue
Block a user