From 96d833b211a35bc48c3f9174042a796bb110a66b Mon Sep 17 00:00:00 2001 From: Matt Date: Mon, 4 Jul 2022 17:26:19 +0100 Subject: [PATCH] Return scalar losses instead of per-sample means (#18013) * Return scalar losses instead of per-sample means * Make loss shape (1,) instead of scalar * Allow scalar losses in test_loss_computation * Allow scalar losses in test_loss_computation * Allow scalar losses in test_loss_computation * Remove XLA loss function for RAG --- src/transformers/modeling_tf_utils.py | 11 ++-- .../models/albert/modeling_tf_albert.py | 8 +-- .../models/bert/modeling_tf_bert.py | 9 ++- .../models/led/modeling_tf_led.py | 5 +- .../models/rag/modeling_tf_rag.py | 55 +++++++------------ tests/models/xlnet/test_modeling_tf_xlnet.py | 6 +- tests/test_modeling_tf_common.py | 8 +-- 7 files changed, 39 insertions(+), 63 deletions(-) diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 7ba6ed9b38..c79c7d3d54 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -206,11 +206,9 @@ class TFCausalLanguageModelingLoss: unmasked_loss = loss_fn(tf.nn.relu(labels), logits) # make sure only labels that are not equal to -100 affect the loss loss_mask = tf.cast(labels != -100, dtype=unmasked_loss.dtype) - # Avoid division by zero later - loss_denominator = tf.math.maximum(tf.cast(1, loss_mask.dtype), tf.reduce_sum(loss_mask, axis=1)) masked_loss = unmasked_loss * loss_mask - reduced_masked_loss = tf.reduce_sum(masked_loss, axis=1) / loss_denominator - return reduced_masked_loss + reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask) + return tf.reshape(reduced_masked_loss, (1,)) class TFQuestionAnsweringLoss: @@ -266,11 +264,10 @@ class TFTokenClassificationLoss: # are taken into account as loss loss_mask = tf.cast(labels >= 0, dtype=unmasked_loss.dtype) # Avoid possible division by zero later - loss_denominator = tf.math.maximum(tf.cast(1, loss_mask.dtype), tf.reduce_sum(loss_mask, axis=1)) # Masked positions will have a loss of NaN because -100 and -1 are not valid labels masked_loss = unmasked_loss * loss_mask - reduced_masked_loss = tf.reduce_sum(masked_loss, axis=1) / loss_denominator - return reduced_masked_loss + reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask) + return tf.reshape(reduced_masked_loss, (1,)) class TFSequenceClassificationLoss: diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py index bfd79dc11b..b07ddf4762 100644 --- a/src/transformers/models/albert/modeling_tf_albert.py +++ b/src/transformers/models/albert/modeling_tf_albert.py @@ -118,20 +118,18 @@ class TFAlbertPreTrainingLoss: # make sure only labels that are not equal to -100 # are taken into account for the loss computation lm_loss_mask = tf.cast(labels["labels"] != -100, dtype=unmasked_lm_losses.dtype) - # Avoid division by zero later - lm_loss_denominator = tf.math.maximum(tf.cast(1, lm_loss_mask.dtype), tf.reduce_sum(lm_loss_mask, axis=1)) masked_lm_losses = unmasked_lm_losses * lm_loss_mask - reduced_masked_lm_loss = tf.reduce_sum(masked_lm_losses, axis=1) / lm_loss_denominator + reduced_masked_lm_loss = tf.reduce_sum(masked_lm_losses) / tf.reduce_sum(lm_loss_mask) sop_logits = tf.reshape(logits[1], (-1, 2)) # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway unmasked_sop_loss = loss_fn(y_true=tf.nn.relu(labels["sentence_order_label"]), y_pred=sop_logits) sop_loss_mask = tf.cast(labels["sentence_order_label"] != -100, dtype=unmasked_sop_loss.dtype) - # No reduction because this already has shape (num_samples,) masked_sop_loss = unmasked_sop_loss * sop_loss_mask + reduced_masked_sop_loss = tf.reduce_sum(masked_sop_loss) / tf.reduce_sum(sop_loss_mask) - return reduced_masked_lm_loss + masked_sop_loss + return tf.reshape(reduced_masked_lm_loss + reduced_masked_sop_loss, (1,)) class TFAlbertEmbeddings(tf.keras.layers.Layer): diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 2f56410abf..aad730dc11 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -130,18 +130,17 @@ class TFBertPreTrainingLoss: # make sure only labels that are not equal to -100 # are taken into account for the loss computation lm_loss_mask = tf.cast(labels["labels"] != -100, dtype=unmasked_lm_losses.dtype) - # Avoid potential division by zero later - lm_loss_denominator = tf.math.maximum(tf.cast(1, lm_loss_mask.dtype), tf.reduce_sum(lm_loss_mask, axis=1)) masked_lm_losses = unmasked_lm_losses * lm_loss_mask - reduced_masked_lm_loss = tf.reduce_sum(masked_lm_losses, axis=1) / lm_loss_denominator + reduced_masked_lm_loss = tf.reduce_sum(masked_lm_losses) / tf.reduce_sum(lm_loss_mask) # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway unmasked_ns_loss = loss_fn(y_true=tf.nn.relu(labels["next_sentence_label"]), y_pred=logits[1]) ns_loss_mask = tf.cast(labels["next_sentence_label"] != -100, dtype=unmasked_ns_loss.dtype) - # Just zero out samples where label is -100, no reduction masked_ns_loss = unmasked_ns_loss * ns_loss_mask - return reduced_masked_lm_loss + masked_ns_loss + reduced_masked_ns_loss = tf.reduce_sum(masked_ns_loss) / tf.reduce_sum(ns_loss_mask) + + return tf.reshape(reduced_masked_lm_loss + reduced_masked_ns_loss, (1,)) class TFBertEmbeddings(tf.keras.layers.Layer): diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py index db1e8682ee..846ba06e0e 100644 --- a/src/transformers/models/led/modeling_tf_led.py +++ b/src/transformers/models/led/modeling_tf_led.py @@ -2518,7 +2518,6 @@ class TFLEDForConditionalGeneration(TFLEDPreTrainedModel): unmasked_loss = loss_fn(tf.nn.relu(labels), logits) # make sure only non-padding labels affect the loss loss_mask = tf.cast(labels != self.config.pad_token_id, dtype=unmasked_loss.dtype) - loss_denominator = tf.math.maximum(tf.cast(1, loss_mask.dtype), tf.reduce_sum(loss_mask, axis=1)) masked_loss = unmasked_loss * loss_mask - reduced_masked_loss = tf.reduce_sum(masked_loss, axis=1) / loss_denominator - return reduced_masked_loss + reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask) + return tf.reshape(reduced_masked_loss, (1,)) diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py index d73b047b67..26482026ba 100644 --- a/src/transformers/models/rag/modeling_tf_rag.py +++ b/src/transformers/models/rag/modeling_tf_rag.py @@ -1333,46 +1333,29 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss # Adopted modeling_tf_bart + add smooth_loss to match with pytorch version def hf_compute_loss(self, labels, y_pred, smooth_epsilon=0.0, from_logits=True, reduce_loss=False): """CrossEntropyLoss that ignores pad tokens""" - if self.config.tf_legacy_loss: - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, - reduction=tf.keras.losses.Reduction.SUM, - ) - - if from_logits is False: # convert to logits - eps = 1e-9 - y_pred = tf.clip_by_value(y_pred, clip_value_min=eps, clip_value_max=1 - eps) - y_pred = tf.math.log(y_pred) - - logits = y_pred - melted_labels = tf.reshape(labels, (-1,)) - active_loss = tf.not_equal(melted_labels, self.config.generator.pad_token_id) - - reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, logits.shape[2])), active_loss) - labels = tf.boolean_mask(melted_labels, active_loss) - nll_loss = loss_fn(labels, reduced_logits) - - smooth_loss = -tf.reduce_sum(reduced_logits, axis=-1) - smooth_loss = tf.reduce_sum(smooth_loss) # sum and squeeze like torch - eps_i = smooth_epsilon / reduced_logits.shape[-1] - - loss = (1.0 - smooth_epsilon) * nll_loss + eps_i * smooth_loss - - return loss - + # Matt: As written, this loss is not XLA-compatible, but it's doing some very weird things + # and I don't feel comfortable converting it. loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=from_logits, - reduction=tf.keras.losses.Reduction.NONE, + from_logits=True, + reduction=tf.keras.losses.Reduction.SUM, ) - unmasked_loss = loss_fn(labels, y_pred) - loss_mask = labels != self.config.generator.pad_token_id - nll_loss = tf.reduce_sum(unmasked_loss * loss_mask) + if from_logits is False: # convert to logits + eps = 1e-9 + y_pred = tf.clip_by_value(y_pred, clip_value_min=eps, clip_value_max=1 - eps) + y_pred = tf.math.log(y_pred) - # Matt: This makes no sense to me, but I'm just copying the old loss in XLA-compatible form - smooth_loss = -tf.reduce_sum(y_pred * tf.expand_dims(labels, -1), axis=-1) - smooth_loss = tf.reduce_sum(smooth_loss) - eps_i = smooth_epsilon / y_pred.shape[-1] + logits = y_pred + melted_labels = tf.reshape(labels, (-1,)) + active_loss = tf.not_equal(melted_labels, self.config.generator.pad_token_id) + + reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, logits.shape[2])), active_loss) + labels = tf.boolean_mask(melted_labels, active_loss) + nll_loss = loss_fn(labels, reduced_logits) + + smooth_loss = -tf.reduce_sum(reduced_logits, axis=-1) + smooth_loss = tf.reduce_sum(smooth_loss) # sum and squeeze like torch + eps_i = smooth_epsilon / reduced_logits.shape[-1] loss = (1.0 - smooth_epsilon) * nll_loss + eps_i * smooth_loss diff --git a/tests/models/xlnet/test_modeling_tf_xlnet.py b/tests/models/xlnet/test_modeling_tf_xlnet.py index 59e9b47932..bc8f31006b 100644 --- a/tests/models/xlnet/test_modeling_tf_xlnet.py +++ b/tests/models/xlnet/test_modeling_tf_xlnet.py @@ -417,12 +417,12 @@ class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase): input_ids = prepared_for_class.pop(input_name) loss = model(input_ids, **prepared_for_class)[0] - self.assertEqual(loss.shape.as_list(), expected_loss_size) + self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1]) # Test that model correctly compute the loss with a dict prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) loss = model(prepared_for_class)[0] - self.assertEqual(loss.shape.as_list(), expected_loss_size) + self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1]) # Test that model correctly compute the loss with a tuple prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) @@ -453,7 +453,7 @@ class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase): # Send to model loss = model(tuple_input[:-1])[0] - self.assertEqual(loss.shape.as_list(), expected_loss_size) + self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1]) @require_tf diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index ee8958e649..87516228f2 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -1294,7 +1294,7 @@ class TFModelTesterMixin: model_input = prepared_for_class.pop(input_name) loss = model(model_input, **prepared_for_class)[0] - self.assertEqual(loss.shape.as_list(), expected_loss_size) + self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1]) # Test that model correctly compute the loss when we mask some positions prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) @@ -1307,13 +1307,13 @@ class TFModelTesterMixin: labels[0] = -100 prepared_for_class["labels"] = tf.convert_to_tensor(labels) loss = model(model_input, **prepared_for_class)[0] - self.assertEqual(loss.shape.as_list(), expected_loss_size) + self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1]) self.assertTrue(not np.any(np.isnan(loss.numpy()))) # Test that model correctly compute the loss with a dict prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) loss = model(prepared_for_class)[0] - self.assertEqual(loss.shape.as_list(), expected_loss_size) + self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1]) # Test that model correctly compute the loss with a tuple prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) @@ -1344,7 +1344,7 @@ class TFModelTesterMixin: # Send to model loss = model(tuple_input[:-1])[0] - self.assertEqual(loss.shape.as_list(), expected_loss_size) + self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1]) def test_keras_fit(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()