Fix XGLM loss computation (PyTorch and TensorFlow) (#35878)

* Fix XGLM loss computation (PyTorch and TensorFlow) * Update expected output string in XGLM sample test This updates the expected output string of test_xglm_sample for torch 2.0 to the correct one and removes the one for torch 1.13.1 + cu116 (transformers moved to torch 2.0 with PR #35358). * Update expected output IDs in XGLM generation test
2025-02-18 15:37:48 +01:00
parent c3ba53303b
commit 4d2de5f63c
4 changed files with 48 additions and 39 deletions
--- a/src/transformers/models/xglm/modeling_tf_xglm.py
+++ b/src/transformers/models/xglm/modeling_tf_xglm.py
@@ -969,7 +969,7 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
        if labels is not None:
            # shift labels to the left and cut last logit token
            labels = tf.concat(
-                [labels[:, 1:], tf.fill((labels.shape[0], 1), tf.cast(self.config.pad_token_id, labels.dtype))],
+                [labels[:, 1:], tf.fill((labels.shape[0], 1), tf.cast(-100, labels.dtype))],
                axis=-1,
            )
            loss = self.hf_compute_loss(labels, lm_logits)
--- a/src/transformers/models/xglm/modeling_xglm.py
+++ b/src/transformers/models/xglm/modeling_xglm.py
@@ -691,33 +691,6 @@ class XGLMModel(XGLMPreTrainedModel):
        )


-def xglm_cross_entropy_loss(
-    logits,
-    labels,
-    num_items_in_batch: int = None,
-    ignore_index: int = -100,
-    pad_token_id: int = -100,
-    vocab_size: int = None,
-):
-    """
-    Loss function for XGLM that takes into account `num_items_in_batch`
-    """
-    shift_labels = labels.new_zeros(labels.shape)
-    shift_labels[:, :-1] = labels[:, 1:].clone()
-    shift_labels[:, -1] = pad_token_id
-    # move labels to correct device to enable model parallelism
-    labels = labels.float().to(logits.device)
-
-    logits = logits.view(-1, vocab_size).float()
-    shift_labels = shift_labels.view(-1)
-
-    reduction = "sum" if num_items_in_batch is not None else "mean"
-    loss = nn.functional.cross_entropy(logits, shift_labels, ignore_index=ignore_index, reduction=reduction)
-    if reduction == "sum":
-        loss = loss / num_items_in_batch
-    return loss
-
-
@add_start_docstrings(
    """
    The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
@@ -737,8 +710,6 @@ class XGLMForCausalLM(XGLMPreTrainedModel, GenerationMixin):
        # Initialize weights and apply final processing
        self.post_init()

-        self._loss_function = xglm_cross_entropy_loss
-
    def get_input_embeddings(self):
        return self.model.embed_tokens

--- a/tests/models/xglm/test_modeling_tf_xglm.py
+++ b/tests/models/xglm/test_modeling_tf_xglm.py
@@ -238,3 +238,22 @@ class TFXGLMModelLanguageGenerationTest(unittest.TestCase):
        ]
        self.assertListEqual(expected_output_sentence, batch_out_sentence)
        self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
+
+    def test_loss_with_padding(self):
+        tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
+        model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M")
+
+        tokenizer.padding_side = "right"
+
+        sequence = "Sequence"
+
+        tokenized_non_padded = tokenizer(sequence, return_tensors="tf")
+        labels_non_padded = tokenized_non_padded.input_ids
+        loss_non_padded = model(tokenized_non_padded, labels=labels_non_padded).loss
+
+        tokenized_padded = tokenizer(sequence, padding="max_length", max_length=16, return_tensors="tf")
+        labels_padded = tokenized_padded.input_ids
+        labels_padded = tf.where(labels_padded == tokenizer.pad_token_id, -100, labels_padded)
+        loss_padded = model(tokenized_padded, labels=labels_padded).loss
+
+        tf.debugging.assert_near(loss_non_padded, loss_padded, atol=1e-3)
--- a/tests/models/xglm/test_modeling_xglm.py
+++ b/tests/models/xglm/test_modeling_xglm.py
@@ -356,7 +356,7 @@ class XGLMModelLanguageGenerationTest(unittest.TestCase):
        model.to(torch_device)
        input_ids = torch.tensor([[2, 268, 9865]], dtype=torch.long, device=torch_device)  # The dog
        # </s> The dog is a very friendly dog. He is very affectionate and loves to play with other
-        expected_output_ids = [2, 268, 9865, 67, 11, 1988, 57252, 9865, 5, 984, 67, 1988, 213838, 1658, 53, 70446, 33, 6657, 278, 1581]  # fmt: skip
+        expected_output_ids = [2, 268, 9865, 67, 11, 1988, 57252, 9865, 5, 984, 67, 1988, 213838, 1658, 53, 70446, 33, 6657, 278, 1581, 72616, 5, 984]  # fmt: skip
        output_ids = model.generate(input_ids, do_sample=False, num_beams=1)
        if verify_outputs:
            self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
@@ -423,14 +423,11 @@ class XGLMModelLanguageGenerationTest(unittest.TestCase):
        output_ids = model.generate(input_ids, do_sample=True, num_beams=1)
        output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)

-        EXPECTED_OUTPUT_STRS = [
-            # TODO: remove this once we move to torch 2.0
-            # torch 1.13.1 + cu116
-            "Today is a nice day and the sun is shining. A nice day with warm rainy",
-            # torch 2.0 + cu117
-            "Today is a nice day and the water is still cold. We just stopped off for some fresh",
-        ]
-        self.assertIn(output_str, EXPECTED_OUTPUT_STRS)
+        EXPECTED_OUTPUT_STR = (
+            "Today is a nice day and the water is still cold. We just stopped off for some fresh coffee. This place"
+            " looks like a"
+        )
+        self.assertEqual(output_str, EXPECTED_OUTPUT_STR)

    @require_torch_accelerator
    @require_torch_fp16
@@ -451,3 +448,25 @@ class XGLMModelLanguageGenerationTest(unittest.TestCase):
            self.assertFalse(
                torch.isnan(outputs.logits[0]).any().item()
            )  # the first logits could contain NaNs if it fails
+
+    def test_loss_with_padding(self):
+        tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
+        model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")
+        model.to(torch_device)
+
+        tokenizer.padding_side = "right"
+
+        sequence = "Sequence"
+
+        tokenized_non_padded = tokenizer(sequence, return_tensors="pt")
+        tokenized_non_padded.to(torch_device)
+        labels_non_padded = tokenized_non_padded.input_ids.clone()
+        loss_non_padded = model(**tokenized_non_padded, labels=labels_non_padded).loss
+
+        tokenized_padded = tokenizer(sequence, padding="max_length", max_length=16, return_tensors="pt")
+        tokenized_padded.to(torch_device)
+        labels_padded = tokenized_padded.input_ids.clone()
+        labels_padded[labels_padded == tokenizer.pad_token_id] = -100
+        loss_padded = model(**tokenized_padded, labels=labels_padded).loss
+
+        torch.testing.assert_close(loss_non_padded, loss_padded, rtol=1e-3, atol=1e-3)