reopen: llava-next fails to consider padding_side during Training (#32679)

restore #32386
2024-08-15 19:44:19 +09:00
parent 8820fe8b8c
commit e840127370
4 changed files with 78 additions and 23 deletions
--- a/src/transformers/models/llava_next/modeling_llava_next.py
+++ b/src/transformers/models/llava_next/modeling_llava_next.py
@@ -512,6 +512,19 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel):
        image_token_index = image_token_index if image_token_index is not None else self.config.image_token_index
        ignore_index = ignore_index if ignore_index is not None else self.config.ignore_index

+        if self.training and self.padding_side == "left":
+            logger.warning_once(
+                "Padding side is set to 'left' but the model is in training mode. For training "
+                "it is recommended to set `model.padding_side='right' and `processor.tokenizer.padding_side='right'`. "
+                "If that's intended, ignore this warning"
+            )
+        if not self.training and self.padding_side == "right":
+            logger.warning_once(
+                "Padding side is set to 'right' but the model is in inference mode. For correct "
+                "generation results, please set `model.padding_side='left'` and `processor.tokenizer.padding_side='left'`. "
+                "If that's intended, ignore this warning"
+            )
+
        with torch.no_grad():
            # ! in llava 1.6, number of patches is variable
            num_images = feature_lens.size(0)
@@ -522,18 +535,14 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel):
            _left_padding = torch.any(attention_mask[:, 0] == 0)
            _right_padding = torch.any(attention_mask[:, -1] == 0)

-            left_padding = True if not self.training else False
-            if batch_size > 1 and not self.training:
-                if _left_padding and not _right_padding:
-                    left_padding = True
-                elif not _left_padding and _right_padding:
-                    left_padding = False
-                elif not _left_padding and not _right_padding:
-                    # both side is 1, so cannot tell
-                    left_padding = self.padding_side == "left"
-                else:
-                    # invalid attention_mask
+            left_padding = self.padding_side == "left"
+            if batch_size > 1:
+                if _left_padding and _right_padding:
                    raise ValueError(f"both side of attention_mask has zero, invalid. {attention_mask}")
+                elif _right_padding and left_padding:
+                    left_padding = False
+                elif _left_padding and not left_padding:
+                    left_padding = True

            # Whether to turn off right padding
            # 1. Create a mask to know where special image tokens are
--- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py
+++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py
@@ -454,6 +454,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel):
        self.vocab_size = model_embeds.num_embeddings
        return model_embeds

+    # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration._merge_input_ids_with_image_features
    def _merge_input_ids_with_image_features(
        self,
        image_features,
@@ -557,6 +558,19 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel):
        image_token_index = image_token_index if image_token_index is not None else self.config.image_token_index
        ignore_index = ignore_index if ignore_index is not None else self.config.ignore_index

+        if self.training and self.padding_side == "left":
+            logger.warning_once(
+                "Padding side is set to 'left' but the model is in training mode. For training "
+                "it is recommended to set `model.padding_side='right' and `processor.tokenizer.padding_side='right'`. "
+                "If that's intended, ignore this warning"
+            )
+        if not self.training and self.padding_side == "right":
+            logger.warning_once(
+                "Padding side is set to 'right' but the model is in inference mode. For correct "
+                "generation results, please set `model.padding_side='left'` and `processor.tokenizer.padding_side='left'`. "
+                "If that's intended, ignore this warning"
+            )
+
        with torch.no_grad():
            # ! in llava 1.6, number of patches is variable
            num_images = feature_lens.size(0)
@@ -567,18 +581,14 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel):
            _left_padding = torch.any(attention_mask[:, 0] == 0)
            _right_padding = torch.any(attention_mask[:, -1] == 0)

-            left_padding = True if not self.training else False
-            if batch_size > 1 and not self.training:
-                if _left_padding and not _right_padding:
-                    left_padding = True
-                elif not _left_padding and _right_padding:
-                    left_padding = False
-                elif not _left_padding and not _right_padding:
-                    # both side is 1, so cannot tell
-                    left_padding = self.padding_side == "left"
-                else:
-                    # invalid attention_mask
+            left_padding = self.padding_side == "left"
+            if batch_size > 1:
+                if _left_padding and _right_padding:
                    raise ValueError(f"both side of attention_mask has zero, invalid. {attention_mask}")
+                elif _right_padding and left_padding:
+                    left_padding = False
+                elif _left_padding and not left_padding:
+                    left_padding = True

            # Whether to turn off right padding
            # 1. Create a mask to know where special image tokens are
--- a/tests/models/llava_next/test_modeling_llava_next.py
+++ b/tests/models/llava_next/test_modeling_llava_next.py
@@ -549,6 +549,24 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
            output_train = model(**inputs_batched, output_hidden_states=True)
        self.assertTrue((output_train.hidden_states[0][0, -1414:, ...] == 0).all().item())

+        with self.assertLogs("transformers", level="WARNING") as logs:
+            model.padding_side = "left"
+            model.train()
+            model(**inputs_batched, output_hidden_states=True)
+
+            self.assertIn(
+                "Padding side is set to 'left' but the model is in training mode. For training", logs.output[0]
+            )
+
+        with self.assertLogs("transformers", level="WARNING") as logs:
+            model.padding_side = "right"
+            model.eval()
+            model(**inputs_batched, output_hidden_states=True)
+
+            self.assertIn(
+                "Padding side is set to 'right' but the model is in inference mode. For correct", logs.output[0]
+            )
+
    @slow
    @require_bitsandbytes
    def test_expansion_in_processing(self):
--- a/tests/models/llava_next_video/test_modeling_llava_next_video.py
+++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Testing suite for the PyTorch Llava-NeXT model."""
+"""Testing suite for the PyTorch Llava-NeXT-Video model."""

 import gc
 import unittest
@@ -511,6 +511,24 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
            output_train = model(**inputs_batched, output_hidden_states=True)
        self.assertTrue((output_train.hidden_states[0][0, -1482:, ...] == 0).all().item())

+        with self.assertLogs("transformers", level="WARNING") as logs:
+            model.padding_side = "left"
+            model.train()
+            model(**inputs_batched, output_hidden_states=True)
+
+            self.assertIn(
+                "Padding side is set to 'left' but the model is in training mode. For training", logs.output[0]
+            )
+
+        with self.assertLogs("transformers", level="WARNING") as logs:
+            model.padding_side = "right"
+            model.eval()
+            model(**inputs_batched, output_hidden_states=True)
+
+            self.assertIn(
+                "Padding side is set to 'right' but the model is in inference mode. For correct", logs.output[0]
+            )
+
    @slow
    @require_bitsandbytes
    def test_expansion_in_processing(self):