From dc05dd539fbf73b9a365c0b79475b0266f50e478 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Tue, 1 Feb 2022 12:04:07 +0100 Subject: [PATCH] Fix TF Causal LM models' returned logits (#15256) * Fix TF Causal LM models' returned logits * Fix expected shape in the tests Co-authored-by: ydshieh --- src/transformers/models/bert/modeling_tf_bert.py | 4 ++-- src/transformers/models/ctrl/modeling_tf_ctrl.py | 4 ++-- src/transformers/models/gpt2/modeling_tf_gpt2.py | 4 ++-- src/transformers/models/openai/modeling_tf_openai.py | 4 ++-- src/transformers/models/rembert/modeling_tf_rembert.py | 4 ++-- src/transformers/models/roberta/modeling_tf_roberta.py | 4 ++-- src/transformers/models/roformer/modeling_tf_roformer.py | 4 ++-- .../modeling_tf_{{cookiecutter.lowercase_modelname}}.py | 4 ++-- tests/test_modeling_tf_encoder_decoder.py | 2 +- tests/test_modeling_tf_vision_encoder_decoder.py | 2 +- 10 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 8aab074064..7d7d431c7e 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -1542,9 +1542,9 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): if inputs["labels"] is not None: # shift labels to the left and cut last logit token - logits = logits[:, :-1] + shifted_logits = logits[:, :-1] labels = inputs["labels"][:, 1:] - loss = self.hf_compute_loss(labels=labels, logits=logits) + loss = self.hf_compute_loss(labels=labels, logits=shifted_logits) if not inputs["return_dict"]: output = (logits,) + outputs[2:] diff --git a/src/transformers/models/ctrl/modeling_tf_ctrl.py b/src/transformers/models/ctrl/modeling_tf_ctrl.py index ff7529e8a4..acfce53c8a 100644 --- a/src/transformers/models/ctrl/modeling_tf_ctrl.py +++ b/src/transformers/models/ctrl/modeling_tf_ctrl.py @@ -735,9 +735,9 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss): loss = None if inputs["labels"] is not None: # shift labels to the left and cut last logit token - logits = logits[:, :-1] + shifted_logits = logits[:, :-1] labels = inputs["labels"][:, 1:] - loss = self.hf_compute_loss(labels, logits) + loss = self.hf_compute_loss(labels, shifted_logits) if not inputs["return_dict"]: output = (logits,) + transformer_outputs[1:] diff --git a/src/transformers/models/gpt2/modeling_tf_gpt2.py b/src/transformers/models/gpt2/modeling_tf_gpt2.py index b0c9f46744..ab32cc0e83 100644 --- a/src/transformers/models/gpt2/modeling_tf_gpt2.py +++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py @@ -949,9 +949,9 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss): loss = None if inputs["labels"] is not None: # shift labels to the left and cut last logit token - logits = logits[:, :-1] + shifted_logits = logits[:, :-1] labels = inputs["labels"][:, 1:] - loss = self.hf_compute_loss(labels, logits) + loss = self.hf_compute_loss(labels, shifted_logits) if not inputs["return_dict"]: output = (logits,) + transformer_outputs[1:] diff --git a/src/transformers/models/openai/modeling_tf_openai.py b/src/transformers/models/openai/modeling_tf_openai.py index 1b05a1268a..a924fb4023 100644 --- a/src/transformers/models/openai/modeling_tf_openai.py +++ b/src/transformers/models/openai/modeling_tf_openai.py @@ -656,9 +656,9 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin loss = None if inputs["labels"] is not None: # shift labels to the left and cut last logit token - logits = logits[:, :-1] + shifted_logits = logits[:, :-1] labels = inputs["labels"][:, 1:] - loss = self.hf_compute_loss(labels, logits) + loss = self.hf_compute_loss(labels, shifted_logits) if not inputs["return_dict"]: output = (logits,) + transformer_outputs[1:] diff --git a/src/transformers/models/rembert/modeling_tf_rembert.py b/src/transformers/models/rembert/modeling_tf_rembert.py index fbd632b448..16accc1b94 100644 --- a/src/transformers/models/rembert/modeling_tf_rembert.py +++ b/src/transformers/models/rembert/modeling_tf_rembert.py @@ -1275,9 +1275,9 @@ class TFRemBertForCausalLM(TFRemBertPreTrainedModel, TFCausalLanguageModelingLos if inputs["labels"] is not None: # shift labels to the left and cut last logit token - logits = logits[:, :-1] + shifted_logits = logits[:, :-1] labels = inputs["labels"][:, 1:] - loss = self.hf_compute_loss(labels=labels, logits=logits) + loss = self.hf_compute_loss(labels=labels, logits=shifted_logits) if not inputs["return_dict"]: output = (logits,) + outputs[2:] diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py index 04e61e91c8..9aeb0a1eef 100644 --- a/src/transformers/models/roberta/modeling_tf_roberta.py +++ b/src/transformers/models/roberta/modeling_tf_roberta.py @@ -1310,9 +1310,9 @@ class TFRobertaForCausalLM(TFRobertaPreTrainedModel, TFCausalLanguageModelingLos if inputs["labels"] is not None: # shift labels to the left and cut last logit token - logits = logits[:, :-1] + shifted_logits = logits[:, :-1] labels = inputs["labels"][:, 1:] - loss = self.hf_compute_loss(labels=labels, logits=logits) + loss = self.hf_compute_loss(labels=labels, logits=shifted_logits) if not inputs["return_dict"]: output = (logits,) + outputs[2:] diff --git a/src/transformers/models/roformer/modeling_tf_roformer.py b/src/transformers/models/roformer/modeling_tf_roformer.py index 487e609066..57a40a2905 100644 --- a/src/transformers/models/roformer/modeling_tf_roformer.py +++ b/src/transformers/models/roformer/modeling_tf_roformer.py @@ -1035,9 +1035,9 @@ class TFRoFormerForCausalLM(TFRoFormerPreTrainedModel, TFCausalLanguageModelingL if inputs["labels"] is not None: # shift labels to the left and cut last logit token - logits = logits[:, :-1] + shifted_logits = logits[:, :-1] labels = inputs["labels"][:, 1:] - loss = self.hf_compute_loss(labels=labels, logits=logits) + loss = self.hf_compute_loss(labels=labels, logits=shifted_logits) if not inputs["return_dict"]: output = (logits,) + outputs[2:] diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py index d2720be7d6..37b62d5772 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py @@ -1262,9 +1262,9 @@ class TF{{cookiecutter.camelcase_modelname}}ForCausalLM(TF{{cookiecutter.camelca if inputs["labels"] is not None: # shift labels to the left and cut last logit token - logits = logits[:, :-1] + shifted_logits = logits[:, :-1] labels = inputs["labels"][:, 1:] - loss = self.hf_compute_loss(labels=labels, logits=logits) + loss = self.hf_compute_loss(labels=labels, logits=shifted_logits) if not inputs["return_dict"]: output = (logits,) + outputs[2:] diff --git a/tests/test_modeling_tf_encoder_decoder.py b/tests/test_modeling_tf_encoder_decoder.py index 61a57f6405..96f2b81554 100644 --- a/tests/test_modeling_tf_encoder_decoder.py +++ b/tests/test_modeling_tf_encoder_decoder.py @@ -240,7 +240,7 @@ class TFEncoderDecoderMixin: assert "loss" in outputs_encoder_decoder batch_size, seq_len = decoder_input_ids.shape - expected_shape = (batch_size, seq_len - 1, decoder_config.vocab_size) + expected_shape = (batch_size, seq_len, decoder_config.vocab_size) self.assertEqual(outputs_encoder_decoder["logits"].shape, expected_shape) self.assertEqual( outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,)) diff --git a/tests/test_modeling_tf_vision_encoder_decoder.py b/tests/test_modeling_tf_vision_encoder_decoder.py index 8af144b176..ec041786a8 100644 --- a/tests/test_modeling_tf_vision_encoder_decoder.py +++ b/tests/test_modeling_tf_vision_encoder_decoder.py @@ -231,7 +231,7 @@ class TFVisionEncoderDecoderMixin: self.assertIn("loss", outputs_encoder_decoder) batch_size, seq_len = decoder_input_ids.shape - expected_shape = (batch_size, seq_len - 1, decoder_config.vocab_size) + expected_shape = (batch_size, seq_len, decoder_config.vocab_size) self.assertEqual(outputs_encoder_decoder["logits"].shape, expected_shape) self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape[0], pixel_values.shape[0]) self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape[-1], config.hidden_size)