Return scalar losses instead of per-sample means (#18013)
* Return scalar losses instead of per-sample means
* Make loss shape (1,) instead of scalar
* Allow scalar losses in test_loss_computation
* Allow scalar losses in test_loss_computation
* Allow scalar losses in test_loss_computation
* Remove XLA loss function for RAG
This commit is contained in:
@@ -206,11 +206,9 @@ class TFCausalLanguageModelingLoss:
|
|||||||
unmasked_loss = loss_fn(tf.nn.relu(labels), logits)
|
unmasked_loss = loss_fn(tf.nn.relu(labels), logits)
|
||||||
# make sure only labels that are not equal to -100 affect the loss
|
# make sure only labels that are not equal to -100 affect the loss
|
||||||
loss_mask = tf.cast(labels != -100, dtype=unmasked_loss.dtype)
|
loss_mask = tf.cast(labels != -100, dtype=unmasked_loss.dtype)
|
||||||
# Avoid division by zero later
|
|
||||||
loss_denominator = tf.math.maximum(tf.cast(1, loss_mask.dtype), tf.reduce_sum(loss_mask, axis=1))
|
|
||||||
masked_loss = unmasked_loss * loss_mask
|
masked_loss = unmasked_loss * loss_mask
|
||||||
reduced_masked_loss = tf.reduce_sum(masked_loss, axis=1) / loss_denominator
|
reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask)
|
||||||
return reduced_masked_loss
|
return tf.reshape(reduced_masked_loss, (1,))
|
||||||
|
|
||||||
|
|
||||||
class TFQuestionAnsweringLoss:
|
class TFQuestionAnsweringLoss:
|
||||||
@@ -266,11 +264,10 @@ class TFTokenClassificationLoss:
|
|||||||
# are taken into account as loss
|
# are taken into account as loss
|
||||||
loss_mask = tf.cast(labels >= 0, dtype=unmasked_loss.dtype)
|
loss_mask = tf.cast(labels >= 0, dtype=unmasked_loss.dtype)
|
||||||
# Avoid possible division by zero later
|
# Avoid possible division by zero later
|
||||||
loss_denominator = tf.math.maximum(tf.cast(1, loss_mask.dtype), tf.reduce_sum(loss_mask, axis=1))
|
|
||||||
# Masked positions will have a loss of NaN because -100 and -1 are not valid labels
|
# Masked positions will have a loss of NaN because -100 and -1 are not valid labels
|
||||||
masked_loss = unmasked_loss * loss_mask
|
masked_loss = unmasked_loss * loss_mask
|
||||||
reduced_masked_loss = tf.reduce_sum(masked_loss, axis=1) / loss_denominator
|
reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask)
|
||||||
return reduced_masked_loss
|
return tf.reshape(reduced_masked_loss, (1,))
|
||||||
|
|
||||||
|
|
||||||
class TFSequenceClassificationLoss:
|
class TFSequenceClassificationLoss:
|
||||||
|
|||||||
@@ -118,20 +118,18 @@ class TFAlbertPreTrainingLoss:
|
|||||||
# make sure only labels that are not equal to -100
|
# make sure only labels that are not equal to -100
|
||||||
# are taken into account for the loss computation
|
# are taken into account for the loss computation
|
||||||
lm_loss_mask = tf.cast(labels["labels"] != -100, dtype=unmasked_lm_losses.dtype)
|
lm_loss_mask = tf.cast(labels["labels"] != -100, dtype=unmasked_lm_losses.dtype)
|
||||||
# Avoid division by zero later
|
|
||||||
lm_loss_denominator = tf.math.maximum(tf.cast(1, lm_loss_mask.dtype), tf.reduce_sum(lm_loss_mask, axis=1))
|
|
||||||
masked_lm_losses = unmasked_lm_losses * lm_loss_mask
|
masked_lm_losses = unmasked_lm_losses * lm_loss_mask
|
||||||
reduced_masked_lm_loss = tf.reduce_sum(masked_lm_losses, axis=1) / lm_loss_denominator
|
reduced_masked_lm_loss = tf.reduce_sum(masked_lm_losses) / tf.reduce_sum(lm_loss_mask)
|
||||||
|
|
||||||
sop_logits = tf.reshape(logits[1], (-1, 2))
|
sop_logits = tf.reshape(logits[1], (-1, 2))
|
||||||
# Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway
|
# Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway
|
||||||
unmasked_sop_loss = loss_fn(y_true=tf.nn.relu(labels["sentence_order_label"]), y_pred=sop_logits)
|
unmasked_sop_loss = loss_fn(y_true=tf.nn.relu(labels["sentence_order_label"]), y_pred=sop_logits)
|
||||||
sop_loss_mask = tf.cast(labels["sentence_order_label"] != -100, dtype=unmasked_sop_loss.dtype)
|
sop_loss_mask = tf.cast(labels["sentence_order_label"] != -100, dtype=unmasked_sop_loss.dtype)
|
||||||
|
|
||||||
# No reduction because this already has shape (num_samples,)
|
|
||||||
masked_sop_loss = unmasked_sop_loss * sop_loss_mask
|
masked_sop_loss = unmasked_sop_loss * sop_loss_mask
|
||||||
|
reduced_masked_sop_loss = tf.reduce_sum(masked_sop_loss) / tf.reduce_sum(sop_loss_mask)
|
||||||
|
|
||||||
return reduced_masked_lm_loss + masked_sop_loss
|
return tf.reshape(reduced_masked_lm_loss + reduced_masked_sop_loss, (1,))
|
||||||
|
|
||||||
|
|
||||||
class TFAlbertEmbeddings(tf.keras.layers.Layer):
|
class TFAlbertEmbeddings(tf.keras.layers.Layer):
|
||||||
|
|||||||
@@ -130,18 +130,17 @@ class TFBertPreTrainingLoss:
|
|||||||
# make sure only labels that are not equal to -100
|
# make sure only labels that are not equal to -100
|
||||||
# are taken into account for the loss computation
|
# are taken into account for the loss computation
|
||||||
lm_loss_mask = tf.cast(labels["labels"] != -100, dtype=unmasked_lm_losses.dtype)
|
lm_loss_mask = tf.cast(labels["labels"] != -100, dtype=unmasked_lm_losses.dtype)
|
||||||
# Avoid potential division by zero later
|
|
||||||
lm_loss_denominator = tf.math.maximum(tf.cast(1, lm_loss_mask.dtype), tf.reduce_sum(lm_loss_mask, axis=1))
|
|
||||||
masked_lm_losses = unmasked_lm_losses * lm_loss_mask
|
masked_lm_losses = unmasked_lm_losses * lm_loss_mask
|
||||||
reduced_masked_lm_loss = tf.reduce_sum(masked_lm_losses, axis=1) / lm_loss_denominator
|
reduced_masked_lm_loss = tf.reduce_sum(masked_lm_losses) / tf.reduce_sum(lm_loss_mask)
|
||||||
|
|
||||||
# Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway
|
# Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway
|
||||||
unmasked_ns_loss = loss_fn(y_true=tf.nn.relu(labels["next_sentence_label"]), y_pred=logits[1])
|
unmasked_ns_loss = loss_fn(y_true=tf.nn.relu(labels["next_sentence_label"]), y_pred=logits[1])
|
||||||
ns_loss_mask = tf.cast(labels["next_sentence_label"] != -100, dtype=unmasked_ns_loss.dtype)
|
ns_loss_mask = tf.cast(labels["next_sentence_label"] != -100, dtype=unmasked_ns_loss.dtype)
|
||||||
# Just zero out samples where label is -100, no reduction
|
|
||||||
masked_ns_loss = unmasked_ns_loss * ns_loss_mask
|
masked_ns_loss = unmasked_ns_loss * ns_loss_mask
|
||||||
|
|
||||||
return reduced_masked_lm_loss + masked_ns_loss
|
reduced_masked_ns_loss = tf.reduce_sum(masked_ns_loss) / tf.reduce_sum(ns_loss_mask)
|
||||||
|
|
||||||
|
return tf.reshape(reduced_masked_lm_loss + reduced_masked_ns_loss, (1,))
|
||||||
|
|
||||||
|
|
||||||
class TFBertEmbeddings(tf.keras.layers.Layer):
|
class TFBertEmbeddings(tf.keras.layers.Layer):
|
||||||
|
|||||||
@@ -2518,7 +2518,6 @@ class TFLEDForConditionalGeneration(TFLEDPreTrainedModel):
|
|||||||
unmasked_loss = loss_fn(tf.nn.relu(labels), logits)
|
unmasked_loss = loss_fn(tf.nn.relu(labels), logits)
|
||||||
# make sure only non-padding labels affect the loss
|
# make sure only non-padding labels affect the loss
|
||||||
loss_mask = tf.cast(labels != self.config.pad_token_id, dtype=unmasked_loss.dtype)
|
loss_mask = tf.cast(labels != self.config.pad_token_id, dtype=unmasked_loss.dtype)
|
||||||
loss_denominator = tf.math.maximum(tf.cast(1, loss_mask.dtype), tf.reduce_sum(loss_mask, axis=1))
|
|
||||||
masked_loss = unmasked_loss * loss_mask
|
masked_loss = unmasked_loss * loss_mask
|
||||||
reduced_masked_loss = tf.reduce_sum(masked_loss, axis=1) / loss_denominator
|
reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask)
|
||||||
return reduced_masked_loss
|
return tf.reshape(reduced_masked_loss, (1,))
|
||||||
|
|||||||
@@ -1333,46 +1333,29 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss
|
|||||||
# Adopted modeling_tf_bart + add smooth_loss to match with pytorch version
|
# Adopted modeling_tf_bart + add smooth_loss to match with pytorch version
|
||||||
def hf_compute_loss(self, labels, y_pred, smooth_epsilon=0.0, from_logits=True, reduce_loss=False):
|
def hf_compute_loss(self, labels, y_pred, smooth_epsilon=0.0, from_logits=True, reduce_loss=False):
|
||||||
"""CrossEntropyLoss that ignores pad tokens"""
|
"""CrossEntropyLoss that ignores pad tokens"""
|
||||||
if self.config.tf_legacy_loss:
|
# Matt: As written, this loss is not XLA-compatible, but it's doing some very weird things
|
||||||
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
|
# and I don't feel comfortable converting it.
|
||||||
from_logits=True,
|
|
||||||
reduction=tf.keras.losses.Reduction.SUM,
|
|
||||||
)
|
|
||||||
|
|
||||||
if from_logits is False: # convert to logits
|
|
||||||
eps = 1e-9
|
|
||||||
y_pred = tf.clip_by_value(y_pred, clip_value_min=eps, clip_value_max=1 - eps)
|
|
||||||
y_pred = tf.math.log(y_pred)
|
|
||||||
|
|
||||||
logits = y_pred
|
|
||||||
melted_labels = tf.reshape(labels, (-1,))
|
|
||||||
active_loss = tf.not_equal(melted_labels, self.config.generator.pad_token_id)
|
|
||||||
|
|
||||||
reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, logits.shape[2])), active_loss)
|
|
||||||
labels = tf.boolean_mask(melted_labels, active_loss)
|
|
||||||
nll_loss = loss_fn(labels, reduced_logits)
|
|
||||||
|
|
||||||
smooth_loss = -tf.reduce_sum(reduced_logits, axis=-1)
|
|
||||||
smooth_loss = tf.reduce_sum(smooth_loss) # sum and squeeze like torch
|
|
||||||
eps_i = smooth_epsilon / reduced_logits.shape[-1]
|
|
||||||
|
|
||||||
loss = (1.0 - smooth_epsilon) * nll_loss + eps_i * smooth_loss
|
|
||||||
|
|
||||||
return loss
|
|
||||||
|
|
||||||
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
|
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
|
||||||
from_logits=from_logits,
|
from_logits=True,
|
||||||
reduction=tf.keras.losses.Reduction.NONE,
|
reduction=tf.keras.losses.Reduction.SUM,
|
||||||
)
|
)
|
||||||
|
|
||||||
unmasked_loss = loss_fn(labels, y_pred)
|
if from_logits is False: # convert to logits
|
||||||
loss_mask = labels != self.config.generator.pad_token_id
|
eps = 1e-9
|
||||||
nll_loss = tf.reduce_sum(unmasked_loss * loss_mask)
|
y_pred = tf.clip_by_value(y_pred, clip_value_min=eps, clip_value_max=1 - eps)
|
||||||
|
y_pred = tf.math.log(y_pred)
|
||||||
|
|
||||||
# Matt: This makes no sense to me, but I'm just copying the old loss in XLA-compatible form
|
logits = y_pred
|
||||||
smooth_loss = -tf.reduce_sum(y_pred * tf.expand_dims(labels, -1), axis=-1)
|
melted_labels = tf.reshape(labels, (-1,))
|
||||||
smooth_loss = tf.reduce_sum(smooth_loss)
|
active_loss = tf.not_equal(melted_labels, self.config.generator.pad_token_id)
|
||||||
eps_i = smooth_epsilon / y_pred.shape[-1]
|
|
||||||
|
reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, logits.shape[2])), active_loss)
|
||||||
|
labels = tf.boolean_mask(melted_labels, active_loss)
|
||||||
|
nll_loss = loss_fn(labels, reduced_logits)
|
||||||
|
|
||||||
|
smooth_loss = -tf.reduce_sum(reduced_logits, axis=-1)
|
||||||
|
smooth_loss = tf.reduce_sum(smooth_loss) # sum and squeeze like torch
|
||||||
|
eps_i = smooth_epsilon / reduced_logits.shape[-1]
|
||||||
|
|
||||||
loss = (1.0 - smooth_epsilon) * nll_loss + eps_i * smooth_loss
|
loss = (1.0 - smooth_epsilon) * nll_loss + eps_i * smooth_loss
|
||||||
|
|
||||||
|
|||||||
@@ -417,12 +417,12 @@ class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase):
|
|||||||
input_ids = prepared_for_class.pop(input_name)
|
input_ids = prepared_for_class.pop(input_name)
|
||||||
|
|
||||||
loss = model(input_ids, **prepared_for_class)[0]
|
loss = model(input_ids, **prepared_for_class)[0]
|
||||||
self.assertEqual(loss.shape.as_list(), expected_loss_size)
|
self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
|
||||||
|
|
||||||
# Test that model correctly compute the loss with a dict
|
# Test that model correctly compute the loss with a dict
|
||||||
prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
|
prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
|
||||||
loss = model(prepared_for_class)[0]
|
loss = model(prepared_for_class)[0]
|
||||||
self.assertEqual(loss.shape.as_list(), expected_loss_size)
|
self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
|
||||||
|
|
||||||
# Test that model correctly compute the loss with a tuple
|
# Test that model correctly compute the loss with a tuple
|
||||||
prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
|
prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
|
||||||
@@ -453,7 +453,7 @@ class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase):
|
|||||||
# Send to model
|
# Send to model
|
||||||
loss = model(tuple_input[:-1])[0]
|
loss = model(tuple_input[:-1])[0]
|
||||||
|
|
||||||
self.assertEqual(loss.shape.as_list(), expected_loss_size)
|
self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
|
||||||
|
|
||||||
|
|
||||||
@require_tf
|
@require_tf
|
||||||
|
|||||||
@@ -1294,7 +1294,7 @@ class TFModelTesterMixin:
|
|||||||
model_input = prepared_for_class.pop(input_name)
|
model_input = prepared_for_class.pop(input_name)
|
||||||
|
|
||||||
loss = model(model_input, **prepared_for_class)[0]
|
loss = model(model_input, **prepared_for_class)[0]
|
||||||
self.assertEqual(loss.shape.as_list(), expected_loss_size)
|
self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
|
||||||
|
|
||||||
# Test that model correctly compute the loss when we mask some positions
|
# Test that model correctly compute the loss when we mask some positions
|
||||||
prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
|
prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
|
||||||
@@ -1307,13 +1307,13 @@ class TFModelTesterMixin:
|
|||||||
labels[0] = -100
|
labels[0] = -100
|
||||||
prepared_for_class["labels"] = tf.convert_to_tensor(labels)
|
prepared_for_class["labels"] = tf.convert_to_tensor(labels)
|
||||||
loss = model(model_input, **prepared_for_class)[0]
|
loss = model(model_input, **prepared_for_class)[0]
|
||||||
self.assertEqual(loss.shape.as_list(), expected_loss_size)
|
self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
|
||||||
self.assertTrue(not np.any(np.isnan(loss.numpy())))
|
self.assertTrue(not np.any(np.isnan(loss.numpy())))
|
||||||
|
|
||||||
# Test that model correctly compute the loss with a dict
|
# Test that model correctly compute the loss with a dict
|
||||||
prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
|
prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
|
||||||
loss = model(prepared_for_class)[0]
|
loss = model(prepared_for_class)[0]
|
||||||
self.assertEqual(loss.shape.as_list(), expected_loss_size)
|
self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
|
||||||
|
|
||||||
# Test that model correctly compute the loss with a tuple
|
# Test that model correctly compute the loss with a tuple
|
||||||
prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
|
prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
|
||||||
@@ -1344,7 +1344,7 @@ class TFModelTesterMixin:
|
|||||||
# Send to model
|
# Send to model
|
||||||
loss = model(tuple_input[:-1])[0]
|
loss = model(tuple_input[:-1])[0]
|
||||||
|
|
||||||
self.assertEqual(loss.shape.as_list(), expected_loss_size)
|
self.assertTrue(loss.shape.as_list() == expected_loss_size or loss.shape.as_list() == [1])
|
||||||
|
|
||||||
def test_keras_fit(self):
|
def test_keras_fit(self):
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|||||||
Reference in New Issue
Block a user