From 4d2de5f63c1c31af59d53351686b1f4dfa20ecf8 Mon Sep 17 00:00:00 2001 From: Damiano Amatruda Date: Tue, 18 Feb 2025 15:37:48 +0100 Subject: [PATCH] Fix XGLM loss computation (PyTorch and TensorFlow) (#35878) * Fix XGLM loss computation (PyTorch and TensorFlow) * Update expected output string in XGLM sample test This updates the expected output string of test_xglm_sample for torch 2.0 to the correct one and removes the one for torch 1.13.1 + cu116 (transformers moved to torch 2.0 with PR #35358). * Update expected output IDs in XGLM generation test --- .../models/xglm/modeling_tf_xglm.py | 2 +- src/transformers/models/xglm/modeling_xglm.py | 29 --------------- tests/models/xglm/test_modeling_tf_xglm.py | 19 ++++++++++ tests/models/xglm/test_modeling_xglm.py | 37 ++++++++++++++----- 4 files changed, 48 insertions(+), 39 deletions(-) diff --git a/src/transformers/models/xglm/modeling_tf_xglm.py b/src/transformers/models/xglm/modeling_tf_xglm.py index 3c1d1afb9a..dc9f82900a 100644 --- a/src/transformers/models/xglm/modeling_tf_xglm.py +++ b/src/transformers/models/xglm/modeling_tf_xglm.py @@ -969,7 +969,7 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss): if labels is not None: # shift labels to the left and cut last logit token labels = tf.concat( - [labels[:, 1:], tf.fill((labels.shape[0], 1), tf.cast(self.config.pad_token_id, labels.dtype))], + [labels[:, 1:], tf.fill((labels.shape[0], 1), tf.cast(-100, labels.dtype))], axis=-1, ) loss = self.hf_compute_loss(labels, lm_logits) diff --git a/src/transformers/models/xglm/modeling_xglm.py b/src/transformers/models/xglm/modeling_xglm.py index 190f50ca40..0ca0fa77fc 100755 --- a/src/transformers/models/xglm/modeling_xglm.py +++ b/src/transformers/models/xglm/modeling_xglm.py @@ -691,33 +691,6 @@ class XGLMModel(XGLMPreTrainedModel): ) -def xglm_cross_entropy_loss( - logits, - labels, - num_items_in_batch: int = None, - ignore_index: int = -100, - pad_token_id: int = -100, - vocab_size: int = None, -): - """ - Loss function for XGLM that takes into account `num_items_in_batch` - """ - shift_labels = labels.new_zeros(labels.shape) - shift_labels[:, :-1] = labels[:, 1:].clone() - shift_labels[:, -1] = pad_token_id - # move labels to correct device to enable model parallelism - labels = labels.float().to(logits.device) - - logits = logits.view(-1, vocab_size).float() - shift_labels = shift_labels.view(-1) - - reduction = "sum" if num_items_in_batch is not None else "mean" - loss = nn.functional.cross_entropy(logits, shift_labels, ignore_index=ignore_index, reduction=reduction) - if reduction == "sum": - loss = loss / num_items_in_batch - return loss - - @add_start_docstrings( """ The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input @@ -737,8 +710,6 @@ class XGLMForCausalLM(XGLMPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - self._loss_function = xglm_cross_entropy_loss - def get_input_embeddings(self): return self.model.embed_tokens diff --git a/tests/models/xglm/test_modeling_tf_xglm.py b/tests/models/xglm/test_modeling_tf_xglm.py index e651d27423..88e961fca7 100644 --- a/tests/models/xglm/test_modeling_tf_xglm.py +++ b/tests/models/xglm/test_modeling_tf_xglm.py @@ -238,3 +238,22 @@ class TFXGLMModelLanguageGenerationTest(unittest.TestCase): ] self.assertListEqual(expected_output_sentence, batch_out_sentence) self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) + + def test_loss_with_padding(self): + tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M") + model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M") + + tokenizer.padding_side = "right" + + sequence = "Sequence" + + tokenized_non_padded = tokenizer(sequence, return_tensors="tf") + labels_non_padded = tokenized_non_padded.input_ids + loss_non_padded = model(tokenized_non_padded, labels=labels_non_padded).loss + + tokenized_padded = tokenizer(sequence, padding="max_length", max_length=16, return_tensors="tf") + labels_padded = tokenized_padded.input_ids + labels_padded = tf.where(labels_padded == tokenizer.pad_token_id, -100, labels_padded) + loss_padded = model(tokenized_padded, labels=labels_padded).loss + + tf.debugging.assert_near(loss_non_padded, loss_padded, atol=1e-3) diff --git a/tests/models/xglm/test_modeling_xglm.py b/tests/models/xglm/test_modeling_xglm.py index e321aaf643..92210d0461 100644 --- a/tests/models/xglm/test_modeling_xglm.py +++ b/tests/models/xglm/test_modeling_xglm.py @@ -356,7 +356,7 @@ class XGLMModelLanguageGenerationTest(unittest.TestCase): model.to(torch_device) input_ids = torch.tensor([[2, 268, 9865]], dtype=torch.long, device=torch_device) # The dog # The dog is a very friendly dog. He is very affectionate and loves to play with other - expected_output_ids = [2, 268, 9865, 67, 11, 1988, 57252, 9865, 5, 984, 67, 1988, 213838, 1658, 53, 70446, 33, 6657, 278, 1581] # fmt: skip + expected_output_ids = [2, 268, 9865, 67, 11, 1988, 57252, 9865, 5, 984, 67, 1988, 213838, 1658, 53, 70446, 33, 6657, 278, 1581, 72616, 5, 984] # fmt: skip output_ids = model.generate(input_ids, do_sample=False, num_beams=1) if verify_outputs: self.assertListEqual(output_ids[0].tolist(), expected_output_ids) @@ -423,14 +423,11 @@ class XGLMModelLanguageGenerationTest(unittest.TestCase): output_ids = model.generate(input_ids, do_sample=True, num_beams=1) output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) - EXPECTED_OUTPUT_STRS = [ - # TODO: remove this once we move to torch 2.0 - # torch 1.13.1 + cu116 - "Today is a nice day and the sun is shining. A nice day with warm rainy", - # torch 2.0 + cu117 - "Today is a nice day and the water is still cold. We just stopped off for some fresh", - ] - self.assertIn(output_str, EXPECTED_OUTPUT_STRS) + EXPECTED_OUTPUT_STR = ( + "Today is a nice day and the water is still cold. We just stopped off for some fresh coffee. This place" + " looks like a" + ) + self.assertEqual(output_str, EXPECTED_OUTPUT_STR) @require_torch_accelerator @require_torch_fp16 @@ -451,3 +448,25 @@ class XGLMModelLanguageGenerationTest(unittest.TestCase): self.assertFalse( torch.isnan(outputs.logits[0]).any().item() ) # the first logits could contain NaNs if it fails + + def test_loss_with_padding(self): + tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M") + model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M") + model.to(torch_device) + + tokenizer.padding_side = "right" + + sequence = "Sequence" + + tokenized_non_padded = tokenizer(sequence, return_tensors="pt") + tokenized_non_padded.to(torch_device) + labels_non_padded = tokenized_non_padded.input_ids.clone() + loss_non_padded = model(**tokenized_non_padded, labels=labels_non_padded).loss + + tokenized_padded = tokenizer(sequence, padding="max_length", max_length=16, return_tensors="pt") + tokenized_padded.to(torch_device) + labels_padded = tokenized_padded.input_ids.clone() + labels_padded[labels_padded == tokenizer.pad_token_id] = -100 + loss_padded = model(**tokenized_padded, labels=labels_padded).loss + + torch.testing.assert_close(loss_non_padded, loss_padded, rtol=1e-3, atol=1e-3)