Seq2seq trainer (#9241)
* Add label smoothing in Trainer * Add options for scheduler and Adafactor in Trainer * Put Seq2SeqTrainer in the main lib * Apply suggestions from code review Co-authored-by: Stas Bekman <stas00@users.noreply.github.com> Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> * Address review comments and adapt scripts * Documentation * Move test not using script to tests folder Co-authored-by: Stas Bekman <stas00@users.noreply.github.com> Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
This commit is contained in:
@@ -287,8 +287,9 @@ from .trainer_callback import (
|
||||
TrainerControl,
|
||||
TrainerState,
|
||||
)
|
||||
from .trainer_utils import EvalPrediction, EvaluationStrategy, set_seed
|
||||
from .trainer_utils import EvalPrediction, EvaluationStrategy, SchedulerType, set_seed
|
||||
from .training_args import TrainingArguments
|
||||
from .training_args_seq2seq import Seq2SeqTrainingArguments
|
||||
from .training_args_tf import TFTrainingArguments
|
||||
from .utils import logging
|
||||
|
||||
@@ -682,11 +683,13 @@ if is_torch_available():
|
||||
get_cosine_with_hard_restarts_schedule_with_warmup,
|
||||
get_linear_schedule_with_warmup,
|
||||
get_polynomial_decay_schedule_with_warmup,
|
||||
get_scheduler,
|
||||
)
|
||||
|
||||
# Trainer
|
||||
from .trainer import Trainer
|
||||
from .trainer_pt_utils import torch_distributed_zero_first
|
||||
from .trainer_seq2seq import Seq2SeqTrainer
|
||||
else:
|
||||
from .utils.dummy_pt_objects import *
|
||||
|
||||
|
||||
@@ -15,12 +15,13 @@
|
||||
"""PyTorch optimization for BERT model."""
|
||||
|
||||
import math
|
||||
from typing import Callable, Iterable, Tuple
|
||||
from typing import Callable, Iterable, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
from torch.optim import Optimizer
|
||||
from torch.optim.lr_scheduler import LambdaLR
|
||||
|
||||
from .trainer_utils import SchedulerType
|
||||
from .utils import logging
|
||||
|
||||
|
||||
@@ -215,6 +216,56 @@ def get_polynomial_decay_schedule_with_warmup(
|
||||
return LambdaLR(optimizer, lr_lambda, last_epoch)
|
||||
|
||||
|
||||
TYPE_TO_SCHEDULER_FUNCTION = {
|
||||
SchedulerType.LINEAR: get_linear_schedule_with_warmup,
|
||||
SchedulerType.COSINE: get_cosine_schedule_with_warmup,
|
||||
SchedulerType.COSINE_WITH_RESTARTS: get_cosine_with_hard_restarts_schedule_with_warmup,
|
||||
SchedulerType.POLYNOMIAL: get_polynomial_decay_schedule_with_warmup,
|
||||
SchedulerType.CONSTANT: get_constant_schedule,
|
||||
SchedulerType.CONSTANT_WITH_WARMUP: get_constant_schedule_with_warmup,
|
||||
}
|
||||
|
||||
|
||||
def get_scheduler(
|
||||
name: Union[str, SchedulerType],
|
||||
optimizer: Optimizer,
|
||||
num_warmup_steps: Optional[int] = None,
|
||||
num_training_steps: Optional[int] = None,
|
||||
):
|
||||
"""
|
||||
Unified API to get any scheduler from its name.
|
||||
|
||||
Args:
|
||||
name (:obj:`str` or `:obj:`SchedulerType`):
|
||||
The name of the scheduler to use.
|
||||
optimizer (:obj:`torch.optim.Optimizer`):
|
||||
The optimizer that will be used during training.
|
||||
num_warmup_steps (:obj:`int`, `optional`):
|
||||
The number of warmup steps to do. This is not required by all schedulers (hence the argument being
|
||||
optional), the function will raise an error if it's unset and the scheduler type requires it.
|
||||
num_training_steps (:obj:`int`, `optional`):
|
||||
The number of training steps to do. This is not required by all schedulers (hence the argument being
|
||||
optional), the function will raise an error if it's unset and the scheduler type requires it.
|
||||
"""
|
||||
name = SchedulerType(name)
|
||||
schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name]
|
||||
if name == SchedulerType.CONSTANT:
|
||||
return schedule_func(optimizer)
|
||||
|
||||
# All other schedulers require `num_warmup_steps`
|
||||
if num_warmup_steps is None:
|
||||
raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")
|
||||
|
||||
if name == SchedulerType.CONSTANT_WITH_WARMUP:
|
||||
return schedule_func(optimizer, num_warmup_steps=num_warmup_steps)
|
||||
|
||||
# All other schedulers require `num_training_steps`
|
||||
if num_training_steps is None:
|
||||
raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.")
|
||||
|
||||
return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
|
||||
|
||||
|
||||
class AdamW(Optimizer):
|
||||
"""
|
||||
Implements Adam algorithm with weight decay fix as introduced in `Decoupled Weight Decay Regularization
|
||||
|
||||
@@ -57,7 +57,7 @@ from .data.data_collator import DataCollator, DataCollatorWithPadding, default_d
|
||||
from .file_utils import WEIGHTS_NAME, is_apex_available, is_datasets_available, is_in_notebook, is_torch_tpu_available
|
||||
from .modeling_utils import PreTrainedModel
|
||||
from .models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING
|
||||
from .optimization import AdamW, get_linear_schedule_with_warmup
|
||||
from .optimization import Adafactor, AdamW, get_scheduler
|
||||
from .tokenization_utils_base import PreTrainedTokenizerBase
|
||||
from .trainer_callback import (
|
||||
CallbackHandler,
|
||||
@@ -70,6 +70,7 @@ from .trainer_callback import (
|
||||
)
|
||||
from .trainer_pt_utils import (
|
||||
DistributedTensorGatherer,
|
||||
LabelSmoother,
|
||||
SequentialDistributedSampler,
|
||||
distributed_broadcast_scalars,
|
||||
distributed_concat,
|
||||
@@ -320,6 +321,12 @@ class Trainer:
|
||||
)
|
||||
self.use_apex = True
|
||||
|
||||
# Label smoothing
|
||||
if self.args.label_smoothing_factor != 0:
|
||||
self.label_smoother = LabelSmoother(epsilon=self.args.label_smoothing_factor)
|
||||
else:
|
||||
self.label_smoother = None
|
||||
|
||||
self.state = TrainerState()
|
||||
self.control = TrainerControl()
|
||||
# Internal variable for total_flos used to count as tensors (for distributed + TPU), will be sent in the
|
||||
@@ -507,24 +514,32 @@ class Trainer:
|
||||
"weight_decay": 0.0,
|
||||
},
|
||||
]
|
||||
optimizer_cls = Adafactor if self.args.adafactor else AdamW
|
||||
if self.args.adafactor:
|
||||
optimizer_cls = Adafactor
|
||||
optimizer_kwargs = {"scale_parameter": False, "relative_step": False}
|
||||
else:
|
||||
optimizer_cls = AdamW
|
||||
optimizer_kwargs = {
|
||||
"betas": (self.args.adam_beta1, self.args.adam_beta2),
|
||||
"eps": self.args.adam_epsilon,
|
||||
}
|
||||
optimizer_kwargs["lr"] = self.args.learning_rate
|
||||
if self.sharded_dpp:
|
||||
self.optimizer = OSS(
|
||||
params=optimizer_grouped_parameters,
|
||||
optim=AdamW,
|
||||
lr=self.args.learning_rate,
|
||||
betas=(self.args.adam_beta1, self.args.adam_beta2),
|
||||
eps=self.args.adam_epsilon,
|
||||
optim=optimizer_cls,
|
||||
**optimizer_kwargs,
|
||||
)
|
||||
else:
|
||||
self.optimizer = AdamW(
|
||||
optimizer_grouped_parameters,
|
||||
lr=self.args.learning_rate,
|
||||
betas=(self.args.adam_beta1, self.args.adam_beta2),
|
||||
eps=self.args.adam_epsilon,
|
||||
)
|
||||
self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
|
||||
|
||||
if self.lr_scheduler is None:
|
||||
self.lr_scheduler = get_linear_schedule_with_warmup(
|
||||
self.optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=num_training_steps
|
||||
self.lr_scheduler = get_scheduler(
|
||||
self.args.lr_scheduler_type,
|
||||
self.optimizer,
|
||||
num_warmup_steps=self.args.warmup_steps,
|
||||
num_training_steps=num_training_steps,
|
||||
)
|
||||
|
||||
def num_examples(self, dataloader: DataLoader) -> int:
|
||||
@@ -1168,8 +1183,12 @@ class Trainer:
|
||||
# TODO: this needs to be fixed and made cleaner later.
|
||||
if self.args.past_index >= 0:
|
||||
self._past = outputs[self.args.past_index]
|
||||
# We don't use .loss here since the model may return tuples instead of ModelOutput.
|
||||
return outputs["loss"] if isinstance(outputs, dict) else outputs[0]
|
||||
|
||||
if self.label_smoother is not None and "labels" in inputs:
|
||||
return self.label_smoother(outputs, inputs["labels"])
|
||||
else:
|
||||
# We don't use .loss here since the model may return tuples instead of ModelOutput.
|
||||
return outputs["loss"] if isinstance(outputs, dict) else outputs[0]
|
||||
|
||||
def is_local_process_zero(self) -> bool:
|
||||
"""
|
||||
@@ -1556,11 +1575,13 @@ class Trainer:
|
||||
else:
|
||||
outputs = model(**inputs)
|
||||
if has_labels:
|
||||
if self.label_smoother is not None and "labels" in inputs:
|
||||
loss = self.label_smoother(outputs, inputs["labels"]).mean().detach()
|
||||
else:
|
||||
loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach()
|
||||
if isinstance(outputs, dict):
|
||||
loss = outputs["loss"].mean().detach()
|
||||
logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"])
|
||||
else:
|
||||
loss = outputs[0].mean().detach()
|
||||
logits = outputs[1:]
|
||||
else:
|
||||
loss = None
|
||||
|
||||
@@ -19,6 +19,7 @@ Torch utilities for the Trainer class.
|
||||
import math
|
||||
import warnings
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
@@ -360,3 +361,32 @@ class DistributedTensorGatherer:
|
||||
if self._offsets[0] != self.process_length:
|
||||
logger.warn("Not all data has been set. Are you sure you passed all values?")
|
||||
return nested_truncate(self._storage, self.num_samples)
|
||||
|
||||
|
||||
@dataclass
|
||||
class LabelSmoother:
|
||||
"""
|
||||
Adds label-smoothing on a pre-computed output from a Transformers model.
|
||||
|
||||
Args:
|
||||
epsilon (:obj:`float`, `optional`, defaults to 0.1):
|
||||
The label smoothing factor.
|
||||
ignore_index (:obj:`int`, `optional`, defaults to -100):
|
||||
The index in the labels to ignore when computing the loss.
|
||||
"""
|
||||
|
||||
epsilon: float = 0.1
|
||||
ignore_index: int = -100
|
||||
|
||||
def __call__(self, model_output, labels):
|
||||
model_loss = model_output["loss"] if isinstance(model_output, dict) else model_output[0]
|
||||
logits = model_output["logits"] if isinstance(model_output, dict) else model_output[1]
|
||||
log_probs = -torch.nn.functional.log_softmax(logits, dim=-1)
|
||||
|
||||
# Look at the ignored index and mask the corresponding log_probs.
|
||||
padding_mask = labels.unsqueeze(-1).eq(self.ignore_index)
|
||||
log_probs.masked_fill_(padding_mask, 0.0)
|
||||
|
||||
# Take the mean over the label dimensions, then divide by the number of active elements (i.e. not-padded):
|
||||
smoothed_loss = log_probs.mean(dim=-1).sum() / (padding_mask.numel() - padding_mask.long().sum())
|
||||
return (1 - self.epsilon) * model_loss + self.epsilon * smoothed_loss
|
||||
|
||||
231
src/transformers/trainer_seq2seq.py
Normal file
231
src/transformers/trainer_seq2seq.py
Normal file
@@ -0,0 +1,231 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
from packaging import version
|
||||
from torch import nn
|
||||
from torch.utils.data import DistributedSampler, RandomSampler
|
||||
from torch.utils.data.dataset import Dataset
|
||||
|
||||
from .file_utils import is_torch_tpu_available
|
||||
from .trainer import Trainer
|
||||
from .trainer_pt_utils import get_tpu_sampler
|
||||
from .trainer_utils import PredictionOutput
|
||||
from .training_args import ParallelMode
|
||||
from .utils import logging
|
||||
|
||||
|
||||
if version.parse(torch.__version__) >= version.parse("1.6"):
|
||||
from torch.cuda.amp import autocast
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class Seq2SeqTrainer(Trainer):
|
||||
def _get_train_sampler(self) -> Optional[torch.utils.data.sampler.Sampler]:
|
||||
if isinstance(self.train_dataset, torch.utils.data.IterableDataset):
|
||||
return None
|
||||
elif is_torch_tpu_available():
|
||||
return get_tpu_sampler(self.train_dataset)
|
||||
else:
|
||||
if self.args.sortish_sampler:
|
||||
self.train_dataset.make_sortish_sampler(
|
||||
self.args.per_device_train_batch_size,
|
||||
distributed=(self.args.parallel_mode == ParallelMode.DISTRIBUTED),
|
||||
)
|
||||
|
||||
return (
|
||||
RandomSampler(self.train_dataset)
|
||||
if self.args.local_rank == -1
|
||||
else DistributedSampler(self.train_dataset)
|
||||
)
|
||||
|
||||
def evaluate(
|
||||
self,
|
||||
eval_dataset: Optional[Dataset] = None,
|
||||
ignore_keys: Optional[List[str]] = None,
|
||||
metric_key_prefix: str = "eval",
|
||||
max_length: Optional[int] = None,
|
||||
num_beams: Optional[int] = None,
|
||||
) -> Dict[str, float]:
|
||||
"""
|
||||
Run evaluation and returns metrics.
|
||||
|
||||
The calling script will be responsible for providing a method to compute metrics, as they are task-dependent
|
||||
(pass it to the init :obj:`compute_metrics` argument).
|
||||
|
||||
You can also subclass and override this method to inject custom behavior.
|
||||
|
||||
Args:
|
||||
eval_dataset (:obj:`Dataset`, `optional`):
|
||||
Pass a dataset if you wish to override :obj:`self.eval_dataset`. If it is an :obj:`datasets.Dataset`,
|
||||
columns not accepted by the ``model.forward()`` method are automatically removed. It must implement the
|
||||
:obj:`__len__` method.
|
||||
ignore_keys (:obj:`List[str]`, `optional`):
|
||||
A list of keys in the output of your model (if it is a dictionary) that should be ignored when
|
||||
gathering predictions.
|
||||
metric_key_prefix (:obj:`str`, `optional`, defaults to :obj:`"eval"`):
|
||||
An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
|
||||
"eval_bleu" if the prefix is ``"eval"`` (default)
|
||||
max_length (:obj:`int`, `optional`):
|
||||
The maximum target length to use when predicting with the generate method.
|
||||
num_beams (:obj:`int`, `optional`):
|
||||
Number of beams for beam search that will be used when predicting with the generate method. 1 means no
|
||||
beam search.
|
||||
|
||||
Returns:
|
||||
A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
|
||||
dictionary also contains the epoch number which comes from the training state.
|
||||
"""
|
||||
self._max_length = max_length
|
||||
self._num_beams = num_beams
|
||||
return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)
|
||||
|
||||
def predict(
|
||||
self,
|
||||
test_dataset: Dataset,
|
||||
ignore_keys: Optional[List[str]] = None,
|
||||
metric_key_prefix: str = "eval",
|
||||
max_length: Optional[int] = None,
|
||||
num_beams: Optional[int] = None,
|
||||
) -> PredictionOutput:
|
||||
"""
|
||||
Run prediction and returns predictions and potential metrics.
|
||||
|
||||
Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method
|
||||
will also return metrics, like in :obj:`evaluate()`.
|
||||
|
||||
Args:
|
||||
test_dataset (:obj:`Dataset`):
|
||||
Dataset to run the predictions on. If it is an :obj:`datasets.Dataset`, columns not accepted by the
|
||||
``model.forward()`` method are automatically removed. Has to implement the method :obj:`__len__`
|
||||
ignore_keys (:obj:`List[str]`, `optional`):
|
||||
A list of keys in the output of your model (if it is a dictionary) that should be ignored when
|
||||
gathering predictions.
|
||||
metric_key_prefix (:obj:`str`, `optional`, defaults to :obj:`"eval"`):
|
||||
An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
|
||||
"eval_bleu" if the prefix is ``"eval"`` (default)
|
||||
max_length (:obj:`int`, `optional`):
|
||||
The maximum target length to use when predicting with the generate method.
|
||||
num_beams (:obj:`int`, `optional`):
|
||||
Number of beams for beam search that will be used when predicting with the generate method. 1 means no
|
||||
beam search.
|
||||
|
||||
.. note::
|
||||
|
||||
If your predictions or labels have different sequence lengths (for instance because you're doing dynamic
|
||||
padding in a token classification task) the predictions will be padded (on the right) to allow for
|
||||
concatenation into one array. The padding index is -100.
|
||||
|
||||
Returns: `NamedTuple` A namedtuple with the following keys:
|
||||
|
||||
- predictions (:obj:`np.ndarray`): The predictions on :obj:`test_dataset`.
|
||||
- label_ids (:obj:`np.ndarray`, `optional`): The labels (if the dataset contained some).
|
||||
- metrics (:obj:`Dict[str, float]`, `optional`): The potential dictionary of metrics (if the dataset
|
||||
contained labels).
|
||||
"""
|
||||
self._max_length = max_length
|
||||
self._num_beams = num_beams
|
||||
return super().predict(test_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)
|
||||
|
||||
def prediction_step(
|
||||
self,
|
||||
model: nn.Module,
|
||||
inputs: Dict[str, Union[torch.Tensor, Any]],
|
||||
prediction_loss_only: bool,
|
||||
ignore_keys: Optional[List[str]] = None,
|
||||
) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
|
||||
"""
|
||||
Perform an evaluation step on :obj:`model` using obj:`inputs`.
|
||||
|
||||
Subclass and override to inject custom behavior.
|
||||
|
||||
Args:
|
||||
model (:obj:`nn.Module`):
|
||||
The model to evaluate.
|
||||
inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
|
||||
The inputs and targets of the model.
|
||||
|
||||
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
|
||||
argument :obj:`labels`. Check your model's documentation for all accepted arguments.
|
||||
prediction_loss_only (:obj:`bool`):
|
||||
Whether or not to return the loss only.
|
||||
|
||||
Return:
|
||||
Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
|
||||
labels (each being optional).
|
||||
"""
|
||||
|
||||
if not self.args.predict_with_generate or prediction_loss_only:
|
||||
return super()(self, model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys)
|
||||
|
||||
has_labels = "labels" in inputs
|
||||
inputs = self._prepare_inputs(inputs)
|
||||
|
||||
gen_kwargs = {
|
||||
"max_length": self._max_length if self._max_length is not None else self.model.config.max_length,
|
||||
"num_beams": self._num_beams if self._num_beams is not None else self.model.config.num_beams,
|
||||
}
|
||||
|
||||
generated_tokens = self.model.generate(
|
||||
inputs["input_ids"],
|
||||
attention_mask=inputs["attention_mask"],
|
||||
**gen_kwargs,
|
||||
)
|
||||
# in case the batch is shorter than max length, the output should be padded
|
||||
if generated_tokens.shape[-1] < gen_kwargs["max_length"]:
|
||||
generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])
|
||||
|
||||
with torch.no_grad():
|
||||
if self.use_amp:
|
||||
with autocast():
|
||||
outputs = model(**inputs)
|
||||
else:
|
||||
outputs = model(**inputs)
|
||||
if has_labels:
|
||||
if self.label_smoother is not None:
|
||||
loss = self.label_smoother(outputs, inputs["labels"]).mean().detach()
|
||||
else:
|
||||
loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach()
|
||||
else:
|
||||
loss = None
|
||||
|
||||
if self.args.prediction_loss_only:
|
||||
return (loss, None, None)
|
||||
|
||||
labels = inputs["labels"]
|
||||
if labels.shape[-1] < gen_kwargs["max_length"]:
|
||||
labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"])
|
||||
|
||||
return (loss, generated_tokens, labels)
|
||||
|
||||
def _pad_tensors_to_max_len(self, tensor, max_length):
|
||||
if self.tokenizer is None:
|
||||
raise ValueError(
|
||||
f"Tensor need to be padded to `max_length={max_length}` but no tokenzier was passed when creating "
|
||||
"this `Trainer`. Make sure to create your `Trainer` with the appropriate tokenizer."
|
||||
)
|
||||
# If PAD token is not defined at least EOS token has to be defined
|
||||
pad_token_id = (
|
||||
self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
|
||||
)
|
||||
|
||||
padded_tensor = pad_token_id * torch.ones(
|
||||
(tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device
|
||||
)
|
||||
padded_tensor[:, : tensor.shape[-1]] = tensor
|
||||
return padded_tensor
|
||||
@@ -201,3 +201,12 @@ def speed_metrics(split, start_time, num_samples=None):
|
||||
samples_per_second = 1 / (runtime / num_samples)
|
||||
result[f"{split}_samples_per_second"] = round(samples_per_second, 3)
|
||||
return result
|
||||
|
||||
|
||||
class SchedulerType(ExplicitEnum):
|
||||
LINEAR = "linear"
|
||||
COSINE = "cosine"
|
||||
COSINE_WITH_RESTARTS = "cosine_with_restarts"
|
||||
POLYNOMIAL = "polynomial"
|
||||
CONSTANT = "constant"
|
||||
CONSTANT_WITH_WARMUP = "constant_with_warmup"
|
||||
|
||||
@@ -19,7 +19,7 @@ from enum import Enum
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from .file_utils import cached_property, is_torch_available, is_torch_tpu_available, torch_required
|
||||
from .trainer_utils import EvaluationStrategy
|
||||
from .trainer_utils import EvaluationStrategy, SchedulerType
|
||||
from .utils import logging
|
||||
|
||||
|
||||
@@ -121,6 +121,9 @@ class TrainingArguments:
|
||||
max_steps (:obj:`int`, `optional`, defaults to -1):
|
||||
If set to a positive number, the total number of training steps to perform. Overrides
|
||||
:obj:`num_train_epochs`.
|
||||
lr_scheduler_type (:obj:`str` or :class:`~transformers.SchedulerType`, `optional`, defaults to :obj:`"linear"`):
|
||||
The scheduler type to use. See the documentation of :class:`~transformers.SchedulerType` for all possible
|
||||
values.
|
||||
warmup_steps (:obj:`int`, `optional`, defaults to 0):
|
||||
Number of steps used for a linear warmup from 0 to :obj:`learning_rate`.
|
||||
logging_dir (:obj:`str`, `optional`):
|
||||
@@ -217,6 +220,13 @@ class TrainingArguments:
|
||||
sharded_ddp (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Use Sharded DDP training from `FairScale <https://github.com/facebookresearch/fairscale>`__ (in distributed
|
||||
training only). This is an experimental feature.
|
||||
label_smoothing_factor (:obj:`float`, `optional`, defaults to 0.0):
|
||||
The label smoothing factor to use. Zero means no label smoothing, otherwise the underlying onehot-encoded
|
||||
labels are changed from 0s and 1s to :obj:`label_smoothing_factor/num_labels` and :obj:`1 -
|
||||
label_smoothing_factor + label_smoothing_factor/num_labels` respectively.
|
||||
adafactor (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not to use the :class:`~transformers.Adafactor` optimizer instead of
|
||||
:class:`~transformers.AdamW`.
|
||||
"""
|
||||
|
||||
output_dir: str = field(
|
||||
@@ -246,7 +256,7 @@ class TrainingArguments:
|
||||
)
|
||||
evaluation_strategy: EvaluationStrategy = field(
|
||||
default="no",
|
||||
metadata={"help": "Run evaluation during training at each logging step."},
|
||||
metadata={"help": "The evaluation strategy to use."},
|
||||
)
|
||||
prediction_loss_only: bool = field(
|
||||
default=False,
|
||||
@@ -296,6 +306,10 @@ class TrainingArguments:
|
||||
default=-1,
|
||||
metadata={"help": "If > 0: set total number of training steps to perform. Override num_train_epochs."},
|
||||
)
|
||||
lr_scheduler_type: SchedulerType = field(
|
||||
default="linear",
|
||||
metadata={"help": "The scheduler type to use."},
|
||||
)
|
||||
warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
|
||||
|
||||
logging_dir: Optional[str] = field(default_factory=default_logdir, metadata={"help": "Tensorboard log dir."})
|
||||
@@ -392,11 +406,16 @@ class TrainingArguments:
|
||||
default=False,
|
||||
metadata={"help": "Whether or not to use sharded DDP training (in distributed training only)."},
|
||||
)
|
||||
label_smoothing_factor: float = field(
|
||||
default=0.0, metadata={"help": "The label smoothing epsilon to apply (zero means no label smoothing)."}
|
||||
)
|
||||
adafactor: bool = field(default=False, metadata={"help": "Whether or not to replace Adam by Adafactor."})
|
||||
|
||||
def __post_init__(self):
|
||||
if self.disable_tqdm is None:
|
||||
self.disable_tqdm = logger.getEffectiveLevel() > logging.WARN
|
||||
self.evaluation_strategy = EvaluationStrategy(self.evaluation_strategy)
|
||||
self.lr_scheduler_type = SchedulerType(self.lr_scheduler_type)
|
||||
if self.do_eval is False and self.evaluation_strategy != EvaluationStrategy.NO:
|
||||
self.do_eval = True
|
||||
if self.eval_steps is None:
|
||||
|
||||
42
src/transformers/training_args_seq2seq.py
Normal file
42
src/transformers/training_args_seq2seq.py
Normal file
@@ -0,0 +1,42 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from .file_utils import add_start_docstrings
|
||||
from .training_args import TrainingArguments
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
@add_start_docstrings(TrainingArguments.__doc__)
|
||||
class Seq2SeqTrainingArguments(TrainingArguments):
|
||||
"""
|
||||
sortish_sampler (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether to use a `sortish sampler` or not. Only possible if the underlying datasets are `Seq2SeqDataset` for
|
||||
now but will become generally available in the near future.
|
||||
|
||||
It sorts the inputs according to lengths in order to minimize the padding size, with a bit of randomness for
|
||||
the training set.
|
||||
predict_with_generate (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether to use generate to calculate generative metrics (ROUGE, BLEU).
|
||||
"""
|
||||
|
||||
sortish_sampler: bool = field(default=False, metadata={"help": "Whether to use SortishSampler or not."})
|
||||
predict_with_generate: bool = field(
|
||||
default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."}
|
||||
)
|
||||
@@ -2279,6 +2279,10 @@ def get_polynomial_decay_schedule_with_warmup(*args, **kwargs):
|
||||
requires_pytorch(get_polynomial_decay_schedule_with_warmup)
|
||||
|
||||
|
||||
def get_scheduler(*args, **kwargs):
|
||||
requires_pytorch(get_scheduler)
|
||||
|
||||
|
||||
class Trainer:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
@@ -2286,3 +2290,8 @@ class Trainer:
|
||||
|
||||
def torch_distributed_zero_first(*args, **kwargs):
|
||||
requires_pytorch(torch_distributed_zero_first)
|
||||
|
||||
|
||||
class Seq2SeqTrainer:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
|
||||
Reference in New Issue
Block a user