From 02d09c8fcc6bda2c345c84cec53289abbe7532ac Mon Sep 17 00:00:00 2001 From: "Jin Young (Daniel) Sohn" Date: Mon, 31 Aug 2020 08:35:51 -0700 Subject: [PATCH] Only access loss tensor every logging_steps (#6802) * Only access loss tensor every logging_steps * tensor.item() was being called every step. This must not be done for XLA:TPU tensors, as it is terrible for performance: it causes TPU<>CPU communication at each step. On RoBERTa MLM, for example, this change reduces step time by 30%; the gain should be larger for models/tasks with smaller step times. * Train batch size was not correct in case a user uses the `per_gpu_train_batch_size` flag * Avg reduce loss across eval shards * Fix style (#6803) * t5 model should make decoder_attention_mask (#6800) * [s2s] Test hub configs in self-scheduled CI (#6809) * [s2s] round runtime in run_eval (#6798) * Pegasus finetune script: add --adafactor (#6811) * [bart] rename self-attention -> attention (#6708) * [tests] fix typos in inputs (#6818) * Fixed open in colab link (#6825) * Add model card for singbert lite. Update widget for singbert and singbert-large. (#6827) * BR_BERTo model card (#6793) * clearly indicate shuffle=False (#6312) * Clarify shuffle * clarify shuffle Co-authored-by: Kevin Canwen Xu * [s2s README] Add more dataset download instructions (#6737) * Style * Patch logging issue * Set default logging level to `WARNING` instead of `INFO` * TF Flaubert w/ pre-norm (#6841) * Dataset and DataCollator for BERT Next Sentence Prediction (NSP) task (#6644) * add datacollator and dataset for next sentence prediction task * bug fix (numbers of special tokens & truncate sequences) * bug fix (+ dict inputs support for data collator) * add padding for nsp data collator; renamed cached files to avoid conflict. 
* add test for nsp data collator * Style Co-authored-by: Lysandre Debut Co-authored-by: Lysandre * Fix in Adafactor docstrings (#6845) * Fix resuming training for Windows (#6847) * Only access loss tensor every logging_steps * tensor.item() was being called every step. This must not be done for XLA:TPU tensors, as it is terrible for performance: it causes TPU<>CPU communication at each step. On RoBERTa MLM, for example, this change reduces step time by 30%; the gain should be larger for models/tasks with smaller step times. * Train batch size was not correct in case a user uses the `per_gpu_train_batch_size` flag * Avg reduce loss across eval shards * comments Co-authored-by: Sam Shleifer Co-authored-by: Stas Bekman Co-authored-by: Thomas Ashish Cherian <6967017+PandaWhoCodes@users.noreply.github.com> Co-authored-by: Zane Lim Co-authored-by: Rodolfo De Nadai Co-authored-by: xujiaze13 <37360975+xujiaze13@users.noreply.github.com> Co-authored-by: Kevin Canwen Xu Co-authored-by: Lysandre Co-authored-by: Lysandre Debut Co-authored-by: Huang Lianzhe Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/trainer.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index f49beb6928..74b00e7d1d 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -658,8 +658,8 @@ class Trainer: self.global_step = 0 logger.info(" Starting fine-tuning.") - tr_loss = 0.0 - logging_loss = 0.0 + tr_loss = torch.tensor(0.0).to(self.args.device) + logging_loss_scalar = 0.0 model.zero_grad() disable_tqdm = self.args.disable_tqdm or not self.is_local_process_zero() train_pbar = trange(epochs_trained, int(np.ceil(num_train_epochs)), desc="Epoch", disable=disable_tqdm) @@ -720,14 +720,15 @@ class Trainer: self.global_step == 1 and self.args.logging_first_step ): logs: Dict[str, float] = {} - logs["loss"] = (tr_loss - logging_loss) / self.args.logging_steps + 
tr_loss_scalar = tr_loss.item() + logs["loss"] = (tr_loss_scalar - logging_loss_scalar) / self.args.logging_steps # backward compatibility for pytorch schedulers logs["learning_rate"] = ( self.lr_scheduler.get_last_lr()[0] if version.parse(torch.__version__) >= version.parse("1.4") else self.lr_scheduler.get_lr()[0] ) - logging_loss = tr_loss + logging_loss_scalar = tr_loss_scalar self.log(logs) @@ -773,8 +774,6 @@ class Trainer: break epoch_pbar.close() train_pbar.update(1) - if self.args.max_steps > 0 and self.global_step >= self.args.max_steps: - break if self.args.tpu_metrics_debug or self.args.debug: if is_torch_tpu_available(): # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) @@ -784,6 +783,8 @@ class Trainer: "You enabled PyTorch/XLA debug metrics but you don't have a TPU " "configured. Check your training configuration if this is unexpected." ) + if self.args.max_steps > 0 and self.global_step >= self.args.max_steps: + break train_pbar.close() if self.tb_writer: @@ -793,7 +794,7 @@ class Trainer: delattr(self, "_past") logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") - return TrainOutput(self.global_step, tr_loss / self.global_step) + return TrainOutput(self.global_step, tr_loss.item() / self.global_step) def hyperparameter_search( self, @@ -973,7 +974,7 @@ class Trainer: return inputs - def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> float: + def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor: """ Perform a training step on a batch of inputs. @@ -989,7 +990,7 @@ class Trainer: argument :obj:`labels`. Check your model's documentation for all accepted arguments. Return: - :obj:`float`: The training loss on this batch. + :obj:`torch.Tensor`: The tensor with training loss on this batch. 
""" if hasattr(self, "_training_step"): warnings.warn( @@ -1027,7 +1028,7 @@ class Trainer: else: loss.backward() - return loss.item() + return loss def is_local_master(self) -> bool: """ @@ -1276,6 +1277,10 @@ class Trainer: preds = xm.mesh_reduce("eval_preds", preds, torch.cat) if label_ids is not None: label_ids = xm.mesh_reduce("eval_label_ids", label_ids, torch.cat) + if eval_losses is not None: + eval_losses = xm.mesh_reduce("eval_losses", torch.tensor(eval_losses), torch.cat).tolist() + if samples_count is not None: + samples_count = sum(xm.mesh_reduce("samples_count", torch.tensor([samples_count]), torch.cat).tolist()) # Finally, turn the aggregated tensors into numpy arrays. if preds is not None: