Fix XGLM loss computation (PyTorch and TensorFlow) (#35878)

* Fix XGLM loss computation (PyTorch and TensorFlow)

* Update expected output string in XGLM sample test

This updates the expected output string of test_xglm_sample for torch
2.0 to the correct one and removes the one for torch 1.13.1 + cu116
(transformers moved to torch 2.0 with PR #35358).

* Update expected output IDs in XGLM generation test
This commit is contained in:
Damiano Amatruda
2025-02-18 15:37:48 +01:00
committed by GitHub
parent c3ba53303b
commit 4d2de5f63c
4 changed files with 48 additions and 39 deletions

View File

@@ -969,7 +969,7 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
if labels is not None:
# shift labels to the left and cut last logit token
labels = tf.concat(
[labels[:, 1:], tf.fill((labels.shape[0], 1), tf.cast(self.config.pad_token_id, labels.dtype))],
[labels[:, 1:], tf.fill((labels.shape[0], 1), tf.cast(-100, labels.dtype))],
axis=-1,
)
loss = self.hf_compute_loss(labels, lm_logits)

View File

@@ -691,33 +691,6 @@ class XGLMModel(XGLMPreTrainedModel):
)
def xglm_cross_entropy_loss(
    logits,
    labels,
    num_items_in_batch: int = None,
    ignore_index: int = -100,
    pad_token_id: int = -100,
    vocab_size: int = None,
):
    """
    Loss function for XGLM that takes into account `num_items_in_batch`.

    The labels are shifted one position to the left so that the logits at position `i` are scored against the
    token at position `i + 1`. The last position has no next token and is filled with `ignore_index`, so it never
    contributes to the loss.

    Args:
        logits: Prediction scores whose last dimension is the vocabulary.
        labels: Target token ids, indexed as `[batch, sequence]`. Positions equal to `ignore_index` are skipped.
        num_items_in_batch: When given, the loss is summed and divided by this value instead of being averaged
            over the non-ignored positions.
        ignore_index: Label value that is excluded from the loss.
        pad_token_id: Accepted for backward compatibility only and no longer used. Filling the last shifted
            position with a real pad token id made that position count towards the loss.
        vocab_size: Size of the last dimension of `logits`. Inferred from `logits` when `None`.

    Returns:
        The scalar cross-entropy loss.
    """
    # Start from a tensor full of `ignore_index` so the trailing position is masked out of the loss.
    shift_labels = labels.new_full(labels.shape, ignore_index)
    shift_labels[:, :-1] = labels[:, 1:]
    # move the shifted labels to the logits device to enable model parallelism
    shift_labels = shift_labels.view(-1).to(logits.device)

    if vocab_size is None:
        vocab_size = logits.shape[-1]
    # Upcast to float32 for a numerically stable loss when the model runs in half precision.
    logits = logits.reshape(-1, vocab_size).float()

    reduction = "sum" if num_items_in_batch is not None else "mean"
    loss = nn.functional.cross_entropy(logits, shift_labels, ignore_index=ignore_index, reduction=reduction)
    if reduction == "sum":
        loss = loss / num_items_in_batch
    return loss
@add_start_docstrings(
"""
The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
@@ -737,8 +710,6 @@ class XGLMForCausalLM(XGLMPreTrainedModel, GenerationMixin):
# Initialize weights and apply final processing
self.post_init()
self._loss_function = xglm_cross_entropy_loss
def get_input_embeddings(self):
return self.model.embed_tokens

View File

@@ -238,3 +238,22 @@ class TFXGLMModelLanguageGenerationTest(unittest.TestCase):
]
self.assertListEqual(expected_output_sentence, batch_out_sentence)
self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
def test_loss_with_padding(self):
    """Check that right-padding with `-100` labels yields the same loss as the unpadded sequence."""
    text = "Sequence"
    tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
    tokenizer.padding_side = "right"
    model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M")

    # Reference loss: the bare sequence, using its own input ids as labels.
    plain_inputs = tokenizer(text, return_tensors="tf")
    plain_loss = model(plain_inputs, labels=plain_inputs.input_ids).loss

    # Same sequence padded to 16 tokens; pad positions are replaced by -100 in the labels.
    padded_inputs = tokenizer(text, padding="max_length", max_length=16, return_tensors="tf")
    padded_ids = padded_inputs.input_ids
    masked_labels = tf.where(padded_ids == tokenizer.pad_token_id, -100, padded_ids)
    padded_loss = model(padded_inputs, labels=masked_labels).loss

    tf.debugging.assert_near(plain_loss, padded_loss, atol=1e-3)

View File

@@ -356,7 +356,7 @@ class XGLMModelLanguageGenerationTest(unittest.TestCase):
model.to(torch_device)
input_ids = torch.tensor([[2, 268, 9865]], dtype=torch.long, device=torch_device) # The dog
# </s> The dog is a very friendly dog. He is very affectionate and loves to play with other
expected_output_ids = [2, 268, 9865, 67, 11, 1988, 57252, 9865, 5, 984, 67, 1988, 213838, 1658, 53, 70446, 33, 6657, 278, 1581] # fmt: skip
expected_output_ids = [2, 268, 9865, 67, 11, 1988, 57252, 9865, 5, 984, 67, 1988, 213838, 1658, 53, 70446, 33, 6657, 278, 1581, 72616, 5, 984] # fmt: skip
output_ids = model.generate(input_ids, do_sample=False, num_beams=1)
if verify_outputs:
self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
@@ -423,14 +423,11 @@ class XGLMModelLanguageGenerationTest(unittest.TestCase):
output_ids = model.generate(input_ids, do_sample=True, num_beams=1)
output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)
EXPECTED_OUTPUT_STRS = [
# TODO: remove this once we move to torch 2.0
# torch 1.13.1 + cu116
"Today is a nice day and the sun is shining. A nice day with warm rainy",
# torch 2.0 + cu117
"Today is a nice day and the water is still cold. We just stopped off for some fresh",
]
self.assertIn(output_str, EXPECTED_OUTPUT_STRS)
EXPECTED_OUTPUT_STR = (
"Today is a nice day and the water is still cold. We just stopped off for some fresh coffee. This place"
" looks like a"
)
self.assertEqual(output_str, EXPECTED_OUTPUT_STR)
@require_torch_accelerator
@require_torch_fp16
@@ -451,3 +448,25 @@ class XGLMModelLanguageGenerationTest(unittest.TestCase):
self.assertFalse(
torch.isnan(outputs.logits[0]).any().item()
) # the first logits could contain NaNs if it fails
def test_loss_with_padding(self):
    """Check that right-padding with `-100` labels yields the same loss as the unpadded sequence."""
    text = "Sequence"
    tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
    tokenizer.padding_side = "right"
    model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")
    model.to(torch_device)

    # Reference loss: the bare sequence, using a copy of its input ids as labels.
    plain_inputs = tokenizer(text, return_tensors="pt")
    plain_inputs.to(torch_device)
    plain_labels = plain_inputs.input_ids.clone()
    plain_loss = model(**plain_inputs, labels=plain_labels).loss

    # Same sequence padded to 16 tokens; pad positions are replaced by -100 in the labels.
    padded_inputs = tokenizer(text, padding="max_length", max_length=16, return_tensors="pt")
    padded_inputs.to(torch_device)
    padded_ids = padded_inputs.input_ids
    masked_labels = padded_ids.masked_fill(padded_ids == tokenizer.pad_token_id, -100)
    padded_loss = model(**padded_inputs, labels=masked_labels).loss

    torch.testing.assert_close(plain_loss, padded_loss, rtol=1e-3, atol=1e-3)