reopen: llava-next fails to consider padding_side during training (#32679)
Restores the change from #32386.
This commit is contained in:
@@ -512,6 +512,19 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel):
|
||||
image_token_index = image_token_index if image_token_index is not None else self.config.image_token_index
|
||||
ignore_index = ignore_index if ignore_index is not None else self.config.ignore_index
|
||||
|
||||
if self.training and self.padding_side == "left":
|
||||
logger.warning_once(
|
||||
"Padding side is set to 'left' but the model is in training mode. For training "
|
||||
"it is recommended to set `model.padding_side='right' and `processor.tokenizer.padding_side='right'`. "
|
||||
"If that's intended, ignore this warning"
|
||||
)
|
||||
if not self.training and self.padding_side == "right":
|
||||
logger.warning_once(
|
||||
"Padding side is set to 'right' but the model is in inference mode. For correct "
|
||||
"generation results, please set `model.padding_side='left'` and `processor.tokenizer.padding_side='left'`. "
|
||||
"If that's intended, ignore this warning"
|
||||
)
|
||||
|
||||
with torch.no_grad():
|
||||
# ! in llava 1.6, number of patches is variable
|
||||
num_images = feature_lens.size(0)
|
||||
@@ -522,18 +535,14 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel):
|
||||
_left_padding = torch.any(attention_mask[:, 0] == 0)
|
||||
_right_padding = torch.any(attention_mask[:, -1] == 0)
|
||||
|
||||
left_padding = True if not self.training else False
|
||||
if batch_size > 1 and not self.training:
|
||||
if _left_padding and not _right_padding:
|
||||
left_padding = True
|
||||
elif not _left_padding and _right_padding:
|
||||
left_padding = False
|
||||
elif not _left_padding and not _right_padding:
|
||||
# both side is 1, so cannot tell
|
||||
left_padding = self.padding_side == "left"
|
||||
else:
|
||||
# invalid attention_mask
|
||||
if batch_size > 1:
|
||||
if _left_padding and _right_padding:
|
||||
raise ValueError(f"both side of attention_mask has zero, invalid. {attention_mask}")
|
||||
elif _right_padding and left_padding:
|
||||
left_padding = False
|
||||
elif _left_padding and not left_padding:
|
||||
left_padding = True
|
||||
|
||||
# Whether to turn off right padding
|
||||
# 1. Create a mask to know where special image tokens are
|
||||
|
||||
@@ -454,6 +454,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel):
|
||||
self.vocab_size = model_embeds.num_embeddings
|
||||
return model_embeds
|
||||
|
||||
# Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration._merge_input_ids_with_image_features
|
||||
def _merge_input_ids_with_image_features(
|
||||
self,
|
||||
image_features,
|
||||
@@ -557,6 +558,19 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel):
|
||||
image_token_index = image_token_index if image_token_index is not None else self.config.image_token_index
|
||||
ignore_index = ignore_index if ignore_index is not None else self.config.ignore_index
|
||||
|
||||
if self.training and self.padding_side == "left":
|
||||
logger.warning_once(
|
||||
"Padding side is set to 'left' but the model is in training mode. For training "
|
||||
"it is recommended to set `model.padding_side='right' and `processor.tokenizer.padding_side='right'`. "
|
||||
"If that's intended, ignore this warning"
|
||||
)
|
||||
if not self.training and self.padding_side == "right":
|
||||
logger.warning_once(
|
||||
"Padding side is set to 'right' but the model is in inference mode. For correct "
|
||||
"generation results, please set `model.padding_side='left'` and `processor.tokenizer.padding_side='left'`. "
|
||||
"If that's intended, ignore this warning"
|
||||
)
|
||||
|
||||
with torch.no_grad():
|
||||
# ! in llava 1.6, number of patches is variable
|
||||
num_images = feature_lens.size(0)
|
||||
@@ -567,18 +581,14 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel):
|
||||
_left_padding = torch.any(attention_mask[:, 0] == 0)
|
||||
_right_padding = torch.any(attention_mask[:, -1] == 0)
|
||||
|
||||
left_padding = True if not self.training else False
|
||||
if batch_size > 1 and not self.training:
|
||||
if _left_padding and not _right_padding:
|
||||
left_padding = True
|
||||
elif not _left_padding and _right_padding:
|
||||
left_padding = False
|
||||
elif not _left_padding and not _right_padding:
|
||||
# both side is 1, so cannot tell
|
||||
left_padding = self.padding_side == "left"
|
||||
else:
|
||||
# invalid attention_mask
|
||||
if batch_size > 1:
|
||||
if _left_padding and _right_padding:
|
||||
raise ValueError(f"both side of attention_mask has zero, invalid. {attention_mask}")
|
||||
elif _right_padding and left_padding:
|
||||
left_padding = False
|
||||
elif _left_padding and not left_padding:
|
||||
left_padding = True
|
||||
|
||||
# Whether to turn off right padding
|
||||
# 1. Create a mask to know where special image tokens are
|
||||
|
||||
@@ -549,6 +549,24 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
output_train = model(**inputs_batched, output_hidden_states=True)
|
||||
self.assertTrue((output_train.hidden_states[0][0, -1414:, ...] == 0).all().item())
|
||||
|
||||
with self.assertLogs("transformers", level="WARNING") as logs:
|
||||
model.padding_side = "left"
|
||||
model.train()
|
||||
model(**inputs_batched, output_hidden_states=True)
|
||||
|
||||
self.assertIn(
|
||||
"Padding side is set to 'left' but the model is in training mode. For training", logs.output[0]
|
||||
)
|
||||
|
||||
with self.assertLogs("transformers", level="WARNING") as logs:
|
||||
model.padding_side = "right"
|
||||
model.eval()
|
||||
model(**inputs_batched, output_hidden_states=True)
|
||||
|
||||
self.assertIn(
|
||||
"Padding side is set to 'right' but the model is in inference mode. For correct", logs.output[0]
|
||||
)
|
||||
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_expansion_in_processing(self):
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Testing suite for the PyTorch Llava-NeXT model."""
|
||||
"""Testing suite for the PyTorch Llava-NeXT-Video model."""
|
||||
|
||||
import gc
|
||||
import unittest
|
||||
@@ -511,6 +511,24 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
output_train = model(**inputs_batched, output_hidden_states=True)
|
||||
self.assertTrue((output_train.hidden_states[0][0, -1482:, ...] == 0).all().item())
|
||||
|
||||
with self.assertLogs("transformers", level="WARNING") as logs:
|
||||
model.padding_side = "left"
|
||||
model.train()
|
||||
model(**inputs_batched, output_hidden_states=True)
|
||||
|
||||
self.assertIn(
|
||||
"Padding side is set to 'left' but the model is in training mode. For training", logs.output[0]
|
||||
)
|
||||
|
||||
with self.assertLogs("transformers", level="WARNING") as logs:
|
||||
model.padding_side = "right"
|
||||
model.eval()
|
||||
model(**inputs_batched, output_hidden_states=True)
|
||||
|
||||
self.assertIn(
|
||||
"Padding side is set to 'right' but the model is in inference mode. For correct", logs.output[0]
|
||||
)
|
||||
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_expansion_in_processing(self):
|
||||
|
||||
Reference in New Issue
Block a user