Seq2seq trainer (#9241)
* Add label smoothing in Trainer * Add options for scheduler and Adafactor in Trainer * Put Seq2SeqTrainer in the main lib * Apply suggestions from code review Co-authored-by: Stas Bekman <stas00@users.noreply.github.com> Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> * Address review comments and adapt scripts * Documentation * Move test not using script to tests folder Co-authored-by: Stas Bekman <stas00@users.noreply.github.com> Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
This commit is contained in:
@@ -19,6 +19,7 @@ Torch utilities for the Trainer class.
|
||||
import math
|
||||
import warnings
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
@@ -360,3 +361,32 @@ class DistributedTensorGatherer:
|
||||
if self._offsets[0] != self.process_length:
|
||||
logger.warn("Not all data has been set. Are you sure you passed all values?")
|
||||
return nested_truncate(self._storage, self.num_samples)
|
||||
|
||||
|
||||
@dataclass
|
||||
class LabelSmoother:
|
||||
"""
|
||||
Adds label-smoothing on a pre-computed output from a Transformers model.
|
||||
|
||||
Args:
|
||||
epsilon (:obj:`float`, `optional`, defaults to 0.1):
|
||||
The label smoothing factor.
|
||||
ignore_index (:obj:`int`, `optional`, defaults to -100):
|
||||
The index in the labels to ignore when computing the loss.
|
||||
"""
|
||||
|
||||
epsilon: float = 0.1
|
||||
ignore_index: int = -100
|
||||
|
||||
def __call__(self, model_output, labels):
|
||||
model_loss = model_output["loss"] if isinstance(model_output, dict) else model_output[0]
|
||||
logits = model_output["logits"] if isinstance(model_output, dict) else model_output[1]
|
||||
log_probs = -torch.nn.functional.log_softmax(logits, dim=-1)
|
||||
|
||||
# Look at the ignored index and mask the corresponding log_probs.
|
||||
padding_mask = labels.unsqueeze(-1).eq(self.ignore_index)
|
||||
log_probs.masked_fill_(padding_mask, 0.0)
|
||||
|
||||
# Take the mean over the label dimensions, then divide by the number of active elements (i.e. not-padded):
|
||||
smoothed_loss = log_probs.mean(dim=-1).sum() / (padding_mask.numel() - padding_mask.long().sum())
|
||||
return (1 - self.epsilon) * model_loss + self.epsilon * smoothed_loss
|
||||
|
||||
Reference in New Issue
Block a user