reopen: llava-next fails to consider padding_side during training (#32679)
Restores #32386.
This commit is contained in:
@@ -512,6 +512,19 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel):
|
|||||||
image_token_index = image_token_index if image_token_index is not None else self.config.image_token_index
|
image_token_index = image_token_index if image_token_index is not None else self.config.image_token_index
|
||||||
ignore_index = ignore_index if ignore_index is not None else self.config.ignore_index
|
ignore_index = ignore_index if ignore_index is not None else self.config.ignore_index
|
||||||
|
|
||||||
|
if self.training and self.padding_side == "left":
|
||||||
|
logger.warning_once(
|
||||||
|
"Padding side is set to 'left' but the model is in training mode. For training "
|
||||||
|
"it is recommended to set `model.padding_side='right' and `processor.tokenizer.padding_side='right'`. "
|
||||||
|
"If that's intended, ignore this warning"
|
||||||
|
)
|
||||||
|
if not self.training and self.padding_side == "right":
|
||||||
|
logger.warning_once(
|
||||||
|
"Padding side is set to 'right' but the model is in inference mode. For correct "
|
||||||
|
"generation results, please set `model.padding_side='left'` and `processor.tokenizer.padding_side='left'`. "
|
||||||
|
"If that's intended, ignore this warning"
|
||||||
|
)
|
||||||
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
# ! in llava 1.6, number of patches is variable
|
# ! in llava 1.6, number of patches is variable
|
||||||
num_images = feature_lens.size(0)
|
num_images = feature_lens.size(0)
|
||||||
@@ -522,18 +535,14 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel):
|
|||||||
_left_padding = torch.any(attention_mask[:, 0] == 0)
|
_left_padding = torch.any(attention_mask[:, 0] == 0)
|
||||||
_right_padding = torch.any(attention_mask[:, -1] == 0)
|
_right_padding = torch.any(attention_mask[:, -1] == 0)
|
||||||
|
|
||||||
left_padding = True if not self.training else False
|
left_padding = self.padding_side == "left"
|
||||||
if batch_size > 1 and not self.training:
|
if batch_size > 1:
|
||||||
if _left_padding and not _right_padding:
|
if _left_padding and _right_padding:
|
||||||
left_padding = True
|
|
||||||
elif not _left_padding and _right_padding:
|
|
||||||
left_padding = False
|
|
||||||
elif not _left_padding and not _right_padding:
|
|
||||||
# both side is 1, so cannot tell
|
|
||||||
left_padding = self.padding_side == "left"
|
|
||||||
else:
|
|
||||||
# invalid attention_mask
|
|
||||||
raise ValueError(f"both side of attention_mask has zero, invalid. {attention_mask}")
|
raise ValueError(f"both side of attention_mask has zero, invalid. {attention_mask}")
|
||||||
|
elif _right_padding and left_padding:
|
||||||
|
left_padding = False
|
||||||
|
elif _left_padding and not left_padding:
|
||||||
|
left_padding = True
|
||||||
|
|
||||||
# Whether to turn off right padding
|
# Whether to turn off right padding
|
||||||
# 1. Create a mask to know where special image tokens are
|
# 1. Create a mask to know where special image tokens are
|
||||||
|
|||||||
@@ -454,6 +454,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel):
|
|||||||
self.vocab_size = model_embeds.num_embeddings
|
self.vocab_size = model_embeds.num_embeddings
|
||||||
return model_embeds
|
return model_embeds
|
||||||
|
|
||||||
|
# Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration._merge_input_ids_with_image_features
|
||||||
def _merge_input_ids_with_image_features(
|
def _merge_input_ids_with_image_features(
|
||||||
self,
|
self,
|
||||||
image_features,
|
image_features,
|
||||||
@@ -557,6 +558,19 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel):
|
|||||||
image_token_index = image_token_index if image_token_index is not None else self.config.image_token_index
|
image_token_index = image_token_index if image_token_index is not None else self.config.image_token_index
|
||||||
ignore_index = ignore_index if ignore_index is not None else self.config.ignore_index
|
ignore_index = ignore_index if ignore_index is not None else self.config.ignore_index
|
||||||
|
|
||||||
|
if self.training and self.padding_side == "left":
|
||||||
|
logger.warning_once(
|
||||||
|
"Padding side is set to 'left' but the model is in training mode. For training "
|
||||||
|
"it is recommended to set `model.padding_side='right' and `processor.tokenizer.padding_side='right'`. "
|
||||||
|
"If that's intended, ignore this warning"
|
||||||
|
)
|
||||||
|
if not self.training and self.padding_side == "right":
|
||||||
|
logger.warning_once(
|
||||||
|
"Padding side is set to 'right' but the model is in inference mode. For correct "
|
||||||
|
"generation results, please set `model.padding_side='left'` and `processor.tokenizer.padding_side='left'`. "
|
||||||
|
"If that's intended, ignore this warning"
|
||||||
|
)
|
||||||
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
# ! in llava 1.6, number of patches is variable
|
# ! in llava 1.6, number of patches is variable
|
||||||
num_images = feature_lens.size(0)
|
num_images = feature_lens.size(0)
|
||||||
@@ -567,18 +581,14 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel):
|
|||||||
_left_padding = torch.any(attention_mask[:, 0] == 0)
|
_left_padding = torch.any(attention_mask[:, 0] == 0)
|
||||||
_right_padding = torch.any(attention_mask[:, -1] == 0)
|
_right_padding = torch.any(attention_mask[:, -1] == 0)
|
||||||
|
|
||||||
left_padding = True if not self.training else False
|
left_padding = self.padding_side == "left"
|
||||||
if batch_size > 1 and not self.training:
|
if batch_size > 1:
|
||||||
if _left_padding and not _right_padding:
|
if _left_padding and _right_padding:
|
||||||
left_padding = True
|
|
||||||
elif not _left_padding and _right_padding:
|
|
||||||
left_padding = False
|
|
||||||
elif not _left_padding and not _right_padding:
|
|
||||||
# both side is 1, so cannot tell
|
|
||||||
left_padding = self.padding_side == "left"
|
|
||||||
else:
|
|
||||||
# invalid attention_mask
|
|
||||||
raise ValueError(f"both side of attention_mask has zero, invalid. {attention_mask}")
|
raise ValueError(f"both side of attention_mask has zero, invalid. {attention_mask}")
|
||||||
|
elif _right_padding and left_padding:
|
||||||
|
left_padding = False
|
||||||
|
elif _left_padding and not left_padding:
|
||||||
|
left_padding = True
|
||||||
|
|
||||||
# Whether to turn off right padding
|
# Whether to turn off right padding
|
||||||
# 1. Create a mask to know where special image tokens are
|
# 1. Create a mask to know where special image tokens are
|
||||||
|
|||||||
@@ -549,6 +549,24 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
output_train = model(**inputs_batched, output_hidden_states=True)
|
output_train = model(**inputs_batched, output_hidden_states=True)
|
||||||
self.assertTrue((output_train.hidden_states[0][0, -1414:, ...] == 0).all().item())
|
self.assertTrue((output_train.hidden_states[0][0, -1414:, ...] == 0).all().item())
|
||||||
|
|
||||||
|
with self.assertLogs("transformers", level="WARNING") as logs:
|
||||||
|
model.padding_side = "left"
|
||||||
|
model.train()
|
||||||
|
model(**inputs_batched, output_hidden_states=True)
|
||||||
|
|
||||||
|
self.assertIn(
|
||||||
|
"Padding side is set to 'left' but the model is in training mode. For training", logs.output[0]
|
||||||
|
)
|
||||||
|
|
||||||
|
with self.assertLogs("transformers", level="WARNING") as logs:
|
||||||
|
model.padding_side = "right"
|
||||||
|
model.eval()
|
||||||
|
model(**inputs_batched, output_hidden_states=True)
|
||||||
|
|
||||||
|
self.assertIn(
|
||||||
|
"Padding side is set to 'right' but the model is in inference mode. For correct", logs.output[0]
|
||||||
|
)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
@require_bitsandbytes
|
@require_bitsandbytes
|
||||||
def test_expansion_in_processing(self):
|
def test_expansion_in_processing(self):
|
||||||
|
|||||||
@@ -12,7 +12,7 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""Testing suite for the PyTorch Llava-NeXT model."""
|
"""Testing suite for the PyTorch Llava-NeXT-Video model."""
|
||||||
|
|
||||||
import gc
|
import gc
|
||||||
import unittest
|
import unittest
|
||||||
@@ -511,6 +511,24 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
output_train = model(**inputs_batched, output_hidden_states=True)
|
output_train = model(**inputs_batched, output_hidden_states=True)
|
||||||
self.assertTrue((output_train.hidden_states[0][0, -1482:, ...] == 0).all().item())
|
self.assertTrue((output_train.hidden_states[0][0, -1482:, ...] == 0).all().item())
|
||||||
|
|
||||||
|
with self.assertLogs("transformers", level="WARNING") as logs:
|
||||||
|
model.padding_side = "left"
|
||||||
|
model.train()
|
||||||
|
model(**inputs_batched, output_hidden_states=True)
|
||||||
|
|
||||||
|
self.assertIn(
|
||||||
|
"Padding side is set to 'left' but the model is in training mode. For training", logs.output[0]
|
||||||
|
)
|
||||||
|
|
||||||
|
with self.assertLogs("transformers", level="WARNING") as logs:
|
||||||
|
model.padding_side = "right"
|
||||||
|
model.eval()
|
||||||
|
model(**inputs_batched, output_hidden_states=True)
|
||||||
|
|
||||||
|
self.assertIn(
|
||||||
|
"Padding side is set to 'right' but the model is in inference mode. For correct", logs.output[0]
|
||||||
|
)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
@require_bitsandbytes
|
@require_bitsandbytes
|
||||||
def test_expansion_in_processing(self):
|
def test_expansion_in_processing(self):
|
||||||
|
|||||||
Reference in New Issue
Block a user