reopen: llava-next fails to consider padding_side during Training (#32679)

restore #32386
This commit is contained in:
jp
2024-08-15 19:44:19 +09:00
committed by GitHub
parent 8820fe8b8c
commit e840127370
4 changed files with 78 additions and 23 deletions

View File

@@ -512,6 +512,19 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel):
image_token_index = image_token_index if image_token_index is not None else self.config.image_token_index
ignore_index = ignore_index if ignore_index is not None else self.config.ignore_index
if self.training and self.padding_side == "left":
logger.warning_once(
"Padding side is set to 'left' but the model is in training mode. For training "
"it is recommended to set `model.padding_side='right' and `processor.tokenizer.padding_side='right'`. "
"If that's intended, ignore this warning"
)
if not self.training and self.padding_side == "right":
logger.warning_once(
"Padding side is set to 'right' but the model is in inference mode. For correct "
"generation results, please set `model.padding_side='left'` and `processor.tokenizer.padding_side='left'`. "
"If that's intended, ignore this warning"
)
with torch.no_grad():
# ! in llava 1.6, number of patches is variable
num_images = feature_lens.size(0)
@@ -522,18 +535,14 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel):
_left_padding = torch.any(attention_mask[:, 0] == 0)
_right_padding = torch.any(attention_mask[:, -1] == 0)
left_padding = True if not self.training else False
if batch_size > 1 and not self.training:
if _left_padding and not _right_padding:
left_padding = True
elif not _left_padding and _right_padding:
left_padding = False
elif not _left_padding and not _right_padding:
# both side is 1, so cannot tell
left_padding = self.padding_side == "left"
else:
# invalid attention_mask
left_padding = self.padding_side == "left"
if batch_size > 1:
if _left_padding and _right_padding:
raise ValueError(f"both side of attention_mask has zero, invalid. {attention_mask}")
elif _right_padding and left_padding:
left_padding = False
elif _left_padding and not left_padding:
left_padding = True
# Whether to turn off right padding
# 1. Create a mask to know where special image tokens are

View File

@@ -454,6 +454,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel):
self.vocab_size = model_embeds.num_embeddings
return model_embeds
# Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration._merge_input_ids_with_image_features
def _merge_input_ids_with_image_features(
self,
image_features,
@@ -557,6 +558,19 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel):
image_token_index = image_token_index if image_token_index is not None else self.config.image_token_index
ignore_index = ignore_index if ignore_index is not None else self.config.ignore_index
if self.training and self.padding_side == "left":
logger.warning_once(
"Padding side is set to 'left' but the model is in training mode. For training "
"it is recommended to set `model.padding_side='right' and `processor.tokenizer.padding_side='right'`. "
"If that's intended, ignore this warning"
)
if not self.training and self.padding_side == "right":
logger.warning_once(
"Padding side is set to 'right' but the model is in inference mode. For correct "
"generation results, please set `model.padding_side='left'` and `processor.tokenizer.padding_side='left'`. "
"If that's intended, ignore this warning"
)
with torch.no_grad():
# ! in llava 1.6, number of patches is variable
num_images = feature_lens.size(0)
@@ -567,18 +581,14 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel):
_left_padding = torch.any(attention_mask[:, 0] == 0)
_right_padding = torch.any(attention_mask[:, -1] == 0)
left_padding = True if not self.training else False
if batch_size > 1 and not self.training:
if _left_padding and not _right_padding:
left_padding = True
elif not _left_padding and _right_padding:
left_padding = False
elif not _left_padding and not _right_padding:
# both side is 1, so cannot tell
left_padding = self.padding_side == "left"
else:
# invalid attention_mask
left_padding = self.padding_side == "left"
if batch_size > 1:
if _left_padding and _right_padding:
raise ValueError(f"both side of attention_mask has zero, invalid. {attention_mask}")
elif _right_padding and left_padding:
left_padding = False
elif _left_padding and not left_padding:
left_padding = True
# Whether to turn off right padding
# 1. Create a mask to know where special image tokens are

View File

@@ -549,6 +549,24 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
output_train = model(**inputs_batched, output_hidden_states=True)
self.assertTrue((output_train.hidden_states[0][0, -1414:, ...] == 0).all().item())
with self.assertLogs("transformers", level="WARNING") as logs:
model.padding_side = "left"
model.train()
model(**inputs_batched, output_hidden_states=True)
self.assertIn(
"Padding side is set to 'left' but the model is in training mode. For training", logs.output[0]
)
with self.assertLogs("transformers", level="WARNING") as logs:
model.padding_side = "right"
model.eval()
model(**inputs_batched, output_hidden_states=True)
self.assertIn(
"Padding side is set to 'right' but the model is in inference mode. For correct", logs.output[0]
)
@slow
@require_bitsandbytes
def test_expansion_in_processing(self):

View File

@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Llava-NeXT model."""
"""Testing suite for the PyTorch Llava-NeXT-Video model."""
import gc
import unittest
@@ -511,6 +511,24 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
output_train = model(**inputs_batched, output_hidden_states=True)
self.assertTrue((output_train.hidden_states[0][0, -1482:, ...] == 0).all().item())
with self.assertLogs("transformers", level="WARNING") as logs:
model.padding_side = "left"
model.train()
model(**inputs_batched, output_hidden_states=True)
self.assertIn(
"Padding side is set to 'left' but the model is in training mode. For training", logs.output[0]
)
with self.assertLogs("transformers", level="WARNING") as logs:
model.padding_side = "right"
model.eval()
model(**inputs_batched, output_hidden_states=True)
self.assertIn(
"Padding side is set to 'right' but the model is in inference mode. For correct", logs.output[0]
)
@slow
@require_bitsandbytes
def test_expansion_in_processing(self):