From 4d2de5f63c1c31af59d53351686b1f4dfa20ecf8 Mon Sep 17 00:00:00 2001
From: Damiano Amatruda <damiano.amatruda@outlook.com>
Date: Tue, 18 Feb 2025 15:37:48 +0100
Subject: [PATCH] Fix XGLM loss computation (PyTorch and TensorFlow) (#35878)

* Fix XGLM loss computation (PyTorch and TensorFlow)

* Update expected output string in XGLM sample test

This updates the expected output string of test_xglm_sample for torch
2.0 to the correct one and removes the one for torch 1.13.1 + cu116
(transformers moved to torch 2.0 with PR #35358).

* Update expected output IDs in XGLM generation test
---
 .../models/xglm/modeling_tf_xglm.py           |  2 +-
 src/transformers/models/xglm/modeling_xglm.py | 29 ---------------
 tests/models/xglm/test_modeling_tf_xglm.py    | 19 ++++++++++
 tests/models/xglm/test_modeling_xglm.py       | 37 ++++++++++++++-----
 4 files changed, 48 insertions(+), 39 deletions(-)

diff --git a/src/transformers/models/xglm/modeling_tf_xglm.py b/src/transformers/models/xglm/modeling_tf_xglm.py
index 3c1d1afb9a..dc9f82900a 100644
--- a/src/transformers/models/xglm/modeling_tf_xglm.py
+++ b/src/transformers/models/xglm/modeling_tf_xglm.py
@@ -969,7 +969,7 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
         if labels is not None:
             # shift labels to the left and cut last logit token
             labels = tf.concat(
-                [labels[:, 1:], tf.fill((labels.shape[0], 1), tf.cast(self.config.pad_token_id, labels.dtype))],
+                [labels[:, 1:], tf.fill((labels.shape[0], 1), tf.cast(-100, labels.dtype))],
                 axis=-1,
             )
             loss = self.hf_compute_loss(labels, lm_logits)
diff --git a/src/transformers/models/xglm/modeling_xglm.py b/src/transformers/models/xglm/modeling_xglm.py
index 190f50ca40..0ca0fa77fc 100755
--- a/src/transformers/models/xglm/modeling_xglm.py
+++ b/src/transformers/models/xglm/modeling_xglm.py
@@ -691,33 +691,6 @@ class XGLMModel(XGLMPreTrainedModel):
         )
 
 
-def xglm_cross_entropy_loss(
-    logits,
-    labels,
-    num_items_in_batch: int = None,
-    ignore_index: int = -100,
-    pad_token_id: int = -100,
-    vocab_size: int = None,
-):
-    """
-    Loss function for XGLM that takes into account `num_items_in_batch`
-    """
-    shift_labels = labels.new_zeros(labels.shape)
-    shift_labels[:, :-1] = labels[:, 1:].clone()
-    shift_labels[:, -1] = pad_token_id
-    # move labels to correct device to enable model parallelism
-    labels = labels.float().to(logits.device)
-
-    logits = logits.view(-1, vocab_size).float()
-    shift_labels = shift_labels.view(-1)
-
-    reduction = "sum" if num_items_in_batch is not None else "mean"
-    loss = nn.functional.cross_entropy(logits, shift_labels, ignore_index=ignore_index, reduction=reduction)
-    if reduction == "sum":
-        loss = loss / num_items_in_batch
-    return loss
-
-
 @add_start_docstrings(
     """
     The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
@@ -737,8 +710,6 @@ class XGLMForCausalLM(XGLMPreTrainedModel, GenerationMixin):
         # Initialize weights and apply final processing
         self.post_init()
 
-        self._loss_function = xglm_cross_entropy_loss
-
     def get_input_embeddings(self):
         return self.model.embed_tokens
 
diff --git a/tests/models/xglm/test_modeling_tf_xglm.py b/tests/models/xglm/test_modeling_tf_xglm.py
index e651d27423..88e961fca7 100644
--- a/tests/models/xglm/test_modeling_tf_xglm.py
+++ b/tests/models/xglm/test_modeling_tf_xglm.py
@@ -238,3 +238,22 @@ class TFXGLMModelLanguageGenerationTest(unittest.TestCase):
         ]
         self.assertListEqual(expected_output_sentence, batch_out_sentence)
         self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
+
+    def test_loss_with_padding(self):
+        tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
+        model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M")
+
+        tokenizer.padding_side = "right"
+
+        sequence = "Sequence"
+
+        tokenized_non_padded = tokenizer(sequence, return_tensors="tf")
+        labels_non_padded = tokenized_non_padded.input_ids
+        loss_non_padded = model(tokenized_non_padded, labels=labels_non_padded).loss
+
+        tokenized_padded = tokenizer(sequence, padding="max_length", max_length=16, return_tensors="tf")
+        labels_padded = tokenized_padded.input_ids
+        labels_padded = tf.where(labels_padded == tokenizer.pad_token_id, -100, labels_padded)
+        loss_padded = model(tokenized_padded, labels=labels_padded).loss
+
+        tf.debugging.assert_near(loss_non_padded, loss_padded, atol=1e-3)
diff --git a/tests/models/xglm/test_modeling_xglm.py b/tests/models/xglm/test_modeling_xglm.py
index e321aaf643..92210d0461 100644
--- a/tests/models/xglm/test_modeling_xglm.py
+++ b/tests/models/xglm/test_modeling_xglm.py
@@ -356,7 +356,7 @@ class XGLMModelLanguageGenerationTest(unittest.TestCase):
         model.to(torch_device)
         input_ids = torch.tensor([[2, 268, 9865]], dtype=torch.long, device=torch_device)  # The dog
         # </s> The dog is a very friendly dog. He is very affectionate and loves to play with other
-        expected_output_ids = [2, 268, 9865, 67, 11, 1988, 57252, 9865, 5, 984, 67, 1988, 213838, 1658, 53, 70446, 33, 6657, 278, 1581]  # fmt: skip
+        expected_output_ids = [2, 268, 9865, 67, 11, 1988, 57252, 9865, 5, 984, 67, 1988, 213838, 1658, 53, 70446, 33, 6657, 278, 1581, 72616, 5, 984]  # fmt: skip
         output_ids = model.generate(input_ids, do_sample=False, num_beams=1)
         if verify_outputs:
             self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
@@ -423,14 +423,11 @@ class XGLMModelLanguageGenerationTest(unittest.TestCase):
         output_ids = model.generate(input_ids, do_sample=True, num_beams=1)
         output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
-        EXPECTED_OUTPUT_STRS = [
-            # TODO: remove this once we move to torch 2.0
-            # torch 1.13.1 + cu116
-            "Today is a nice day and the sun is shining. A nice day with warm rainy",
-            # torch 2.0 + cu117
-            "Today is a nice day and the water is still cold. We just stopped off for some fresh",
-        ]
-        self.assertIn(output_str, EXPECTED_OUTPUT_STRS)
+        EXPECTED_OUTPUT_STR = (
+            "Today is a nice day and the water is still cold. We just stopped off for some fresh coffee. This place"
+            " looks like a"
+        )
+        self.assertEqual(output_str, EXPECTED_OUTPUT_STR)
 
     @require_torch_accelerator
     @require_torch_fp16
@@ -451,3 +448,25 @@ class XGLMModelLanguageGenerationTest(unittest.TestCase):
             self.assertFalse(
                 torch.isnan(outputs.logits[0]).any().item()
             )  # the first logits could contain NaNs if it fails
+
+    def test_loss_with_padding(self):
+        tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
+        model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")
+        model.to(torch_device)
+
+        tokenizer.padding_side = "right"
+
+        sequence = "Sequence"
+
+        tokenized_non_padded = tokenizer(sequence, return_tensors="pt")
+        tokenized_non_padded.to(torch_device)
+        labels_non_padded = tokenized_non_padded.input_ids.clone()
+        loss_non_padded = model(**tokenized_non_padded, labels=labels_non_padded).loss
+
+        tokenized_padded = tokenizer(sequence, padding="max_length", max_length=16, return_tensors="pt")
+        tokenized_padded.to(torch_device)
+        labels_padded = tokenized_padded.input_ids.clone()
+        labels_padded[labels_padded == tokenizer.pad_token_id] = -100
+        loss_padded = model(**tokenized_padded, labels=labels_padded).loss
+
+        torch.testing.assert_close(loss_non_padded, loss_padded, rtol=1e-3, atol=1e-3)