From 84ea427f460ffc8d2ddc08a341ccda076c24fc1f Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 17 Dec 2021 20:05:22 +0100
Subject: [PATCH] [ImageGPT] Deprecate pixel_values input name to input_ids
 (#14801)

* [ImageGPT] Deprecate pixel_values input name to input_ids

* up

* Apply suggestions from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* correct

* finish

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
---
 .../models/imagegpt/modeling_imagegpt.py      | 109 +++++++++++++-----
 tests/test_generation_utils.py                |  15 ++-
 tests/test_modeling_imagegpt.py               |   2 +-
 3 files changed, 93 insertions(+), 33 deletions(-)

diff --git a/src/transformers/models/imagegpt/modeling_imagegpt.py b/src/transformers/models/imagegpt/modeling_imagegpt.py
index 4652774d07..4c90e8148c 100755
--- a/src/transformers/models/imagegpt/modeling_imagegpt.py
+++ b/src/transformers/models/imagegpt/modeling_imagegpt.py
@@ -16,6 +16,7 @@
 
 import math
 import os
+import warnings
 from typing import Tuple
 
 import torch
@@ -550,22 +551,22 @@ IMAGEGPT_START_DOCSTRING = r"""
 
 IMAGEGPT_INPUTS_DOCSTRING = r"""
     Args:
-        pixel_values (:obj:`torch.LongTensor` of shape :obj:`(batch_size, pixel_values_length)`):
-            :obj:`pixel_values_length` = ``sequence_length`` if :obj:`past_key_values` is ``None`` else
+        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`):
+            :obj:`input_ids_length` = ``sequence_length`` if :obj:`past_key_values` is ``None`` else
             ``past_key_values[0][0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input
             sequence tokens in the vocabulary.
 
-            If :obj:`past_key_values` is used, only ``pixel_values`` that do not have their past calculated should be
-            passed as ``pixel_values``.
+            If :obj:`past_key_values` is used, only ``input_ids`` that do not have their past calculated should be
+            passed as ``input_ids``.
 
             Indices can be obtained using :class:`~transformers.ImageGPTFeatureExtractor`. See
             :meth:`transformers.ImageGPTFeatureExtractor.__call__` for details.
 
         past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers`):
             Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
-            :obj:`past_key_values` output below). Can be used to speed up sequential decoding. The ``pixel_values``
-            which have their past given to this model should not be passed as ``pixel_values`` as they have already
-            been computed.
+            :obj:`past_key_values` output below). Can be used to speed up sequential decoding. The ``input_ids`` which
+            have their past given to this model should not be passed as ``input_ids`` as they have already been
+            computed.
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
 
@@ -573,7 +574,7 @@ IMAGEGPT_INPUTS_DOCSTRING = r"""
             - 0 for tokens that are **masked**.
 
             `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, pixel_values_length)`, `optional`):
+        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
             1]``:
 
@@ -593,9 +594,9 @@ IMAGEGPT_INPUTS_DOCSTRING = r"""
             - 0 indicates the head is **masked**.
 
         inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
-            Optionally, instead of passing :obj:`pixel_values` you can choose to directly pass an embedded
-            representation. This is useful if you want more control over how to convert :obj:`pixel_values` indices
-            into associated vectors than the model's internal embedding lookup matrix.
+            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+            vectors than the model's internal embedding lookup matrix.
 
             If :obj:`past_key_values` is used, optionally only the last :obj:`inputs_embeds` have to be input (see
             :obj:`past_key_values`).
@@ -656,7 +657,7 @@ class ImageGPTModel(ImageGPTPreTrainedModel):
     @replace_return_docstrings(output_type=BaseModelOutputWithPastAndCrossAttentions, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
-        pixel_values=None,
+        input_ids=None,
         past_key_values=None,
         attention_mask=None,
         token_type_ids=None,
@@ -669,11 +670,12 @@ class ImageGPTModel(ImageGPTPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
-            ``labels = pixel_values`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
+            ``labels = input_ids``. Indices are selected in ``[-100, 0, ..., config.vocab_size]``. All labels set to
             ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
 
         Returns:
@@ -695,6 +697,20 @@ class ImageGPTModel(ImageGPTPreTrainedModel):
             >>> last_hidden_states = outputs.last_hidden_state
         """
 
+        if "pixel_values" in kwargs:
+            warnings.warn(
+                "The `pixel_values` argument is deprecated and will be removed in a future version, use `input_ids` instead.",
+                FutureWarning,
+            )
+
+            if input_ids is not None:
+                raise ValueError(
+                    "You cannot pass both `pixel_values` and `input_ids`. "
+                    "Please make sure to only pass `input_ids`."
+                )
+
+            input_ids = kwargs.pop("pixel_values")
+
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -702,19 +718,19 @@ class ImageGPTModel(ImageGPTPreTrainedModel):
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        if pixel_values is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both pixel_values and inputs_embeds at the same time")
-        elif pixel_values is not None:
-            input_shape = pixel_values.size()
-            pixel_values = pixel_values.view(-1, input_shape[-1])
-            batch_size = pixel_values.shape[0]
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+            batch_size = input_ids.shape[0]
         elif inputs_embeds is not None:
             input_shape = inputs_embeds.size()[:-1]
             batch_size = inputs_embeds.shape[0]
         else:
-            raise ValueError("You have to specify either pixel_values or inputs_embeds")
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
 
-        device = pixel_values.device if pixel_values is not None else inputs_embeds.device
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
 
         if token_type_ids is not None:
             token_type_ids = token_type_ids.view(-1, input_shape[-1])
@@ -768,7 +784,7 @@ class ImageGPTModel(ImageGPTPreTrainedModel):
         head_mask = self.get_head_mask(head_mask, self.config.n_layer)
 
         if inputs_embeds is None:
-            inputs_embeds = self.wte(pixel_values)
+            inputs_embeds = self.wte(input_ids)
         position_embeds = self.wpe(position_ids)
         hidden_states = inputs_embeds + position_embeds
 
@@ -901,11 +917,11 @@ class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel):
     def set_output_embeddings(self, new_embeddings):
         self.lm_head = new_embeddings
 
-    def prepare_inputs_for_generation(self, pixel_values, past=None, **kwargs):
+    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
         token_type_ids = kwargs.get("token_type_ids", None)
         # only last token for inputs_ids if past is defined in kwargs
         if past:
-            pixel_values = pixel_values[:, -1].unsqueeze(-1)
+            input_ids = input_ids[:, -1].unsqueeze(-1)
             if token_type_ids is not None:
                 token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
 
@@ -921,7 +937,7 @@ class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel):
         else:
             position_ids = None
         return {
-            "pixel_values": pixel_values,
+            "input_ids": input_ids,
             "past_key_values": past,
             "use_cache": kwargs.get("use_cache"),
             "position_ids": position_ids,
@@ -933,7 +949,7 @@ class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel):
     @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
-        pixel_values=None,
+        input_ids=None,
         past_key_values=None,
         attention_mask=None,
         token_type_ids=None,
@@ -947,11 +963,12 @@ class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
-            ``labels = pixel_values`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
+            ``labels = input_ids``. Indices are selected in ``[-100, 0, ..., config.vocab_size]``. All labels set to
             ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
 
         Returns:
@@ -972,7 +989,7 @@ class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel):
             >>> batch_size = 8
             >>> context = torch.full((batch_size, 1), model.config.vocab_size - 1) #initialize with SOS token
             >>> context = torch.tensor(context).to(device)
-            >>> output = model.generate(pixel_values=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40)
+            >>> output = model.generate(input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40)
 
             >>> clusters = feature_extractor.clusters
             >>> n_px = feature_extractor.size
@@ -986,10 +1003,24 @@ class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel):
             ...    ax.imshow(img)
         """
 
+        if "pixel_values" in kwargs:
+            warnings.warn(
+                "The `pixel_values` argument is deprecated and will be removed in a future version, use `input_ids` instead.",
+                FutureWarning,
+            )
+
+            if input_ids is not None:
+                raise ValueError(
+                    "You cannot pass both `pixel_values` and `input_ids`. "
+                    "Please make sure to only pass `input_ids`."
+                )
+
+            input_ids = kwargs.pop("pixel_values")
+
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         transformer_outputs = self.transformer(
-            pixel_values,
+            input_ids,
             past_key_values=past_key_values,
             attention_mask=attention_mask,
             token_type_ids=token_type_ids,
@@ -1066,7 +1097,7 @@ class ImageGPTForImageClassification(ImageGPTPreTrainedModel):
     @replace_return_docstrings(output_type=SequenceClassifierOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
-        pixel_values=None,
+        input_ids=None,
         past_key_values=None,
         attention_mask=None,
         token_type_ids=None,
@@ -1078,6 +1109,7 @@ class ImageGPTForImageClassification(ImageGPTPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
@@ -1103,10 +1135,25 @@ class ImageGPTForImageClassification(ImageGPTPreTrainedModel):
             >>> outputs = model(**inputs)
             >>> logits = outputs.logits
         """
+
+        if "pixel_values" in kwargs:
+            warnings.warn(
+                "The `pixel_values` argument is deprecated and will be removed in a future version, use `input_ids` instead.",
+                FutureWarning,
+            )
+
+            if input_ids is not None:
+                raise ValueError(
+                    "You cannot pass both `pixel_values` and `input_ids`. "
+                    "Please make sure to only pass `input_ids`."
+                )
+
+            input_ids = kwargs.pop("pixel_values")
+
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         transformer_outputs = self.transformer(
-            pixel_values,
+            input_ids,
             past_key_values=past_key_values,
             attention_mask=attention_mask,
             token_type_ids=token_type_ids,
diff --git a/tests/test_generation_utils.py b/tests/test_generation_utils.py
index 2a72840d2a..2a3a037c32 100644
--- a/tests/test_generation_utils.py
+++ b/tests/test_generation_utils.py
@@ -20,7 +20,7 @@ import unittest
 from transformers import is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device
 
-from .test_modeling_common import floats_tensor
+from .test_modeling_common import floats_tensor, ids_tensor
 
 
 if is_torch_available():
@@ -31,6 +31,7 @@ if is_torch_available():
         BartTokenizer,
         GPT2LMHeadModel,
         GPT2Tokenizer,
+        ImageGPTForCausalImageModeling,
         Speech2TextForConditionalGeneration,
         SpeechEncoderDecoderModel,
         VisionEncoderDecoderModel,
@@ -1766,6 +1767,18 @@ class GenerationIntegrationTests(unittest.TestCase):
         self.assertListEqual(output_sequences.tolist(), output_sequences_kwargs.tolist())
         self.assertEqual(output_sequences.shape, (1, 15))
 
+    def test_generate_non_nlp_input_ids_as_kwarg(self):
+        model = ImageGPTForCausalImageModeling.from_pretrained(
+            "hf-internal-testing/tiny-random-imagegpt", max_length=10
+        ).to(torch_device)
+        input_ids = ids_tensor((3, 5), vocab_size=10)
+
+        output_sequences_kwargs = model.generate(input_ids=input_ids).cpu()
+        output_sequences = model.generate(input_ids).cpu()
+
+        self.assertListEqual(output_sequences.tolist(), output_sequences_kwargs.tolist())
+        self.assertEqual(output_sequences.shape, (3, 10))
+
     def test_generate_input_ids_as_encoder_kwarg(self):
         article = """Justin Timberlake and Jessica Biel, welcome to parenthood."""
         tokenizer = BartTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart")
diff --git a/tests/test_modeling_imagegpt.py b/tests/test_modeling_imagegpt.py
index 85526800dc..2a59d4c1bd 100644
--- a/tests/test_modeling_imagegpt.py
+++ b/tests/test_modeling_imagegpt.py
@@ -314,7 +314,7 @@ class ImageGPTModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCa
             # signature.parameters is an OrderedDict => so arg_names order is deterministic
             arg_names = [*signature.parameters.keys()]
 
-            expected_arg_names = ["pixel_values"]
+            expected_arg_names = ["input_ids"]
             self.assertListEqual(arg_names[:1], expected_arg_names)
 
     def test_resize_tokens_embeddings(self):