[bug] fix llava processor to calculate unpadding size correctly (#37988)

* fix llava processor to calculate unpad size correctly

* repo consistency

* Revert "repo consistency" & "setUp in llava family"

This reverts commit 26a50af8db5b15bb6b700db3d53342fe69579d8e.

* add edge case test for padding & unpadding

* compute unpadding size from original size

* make test config explicit

* Revert "compute unpadding size from original size"

This reverts commit 752cd27ad9710ab056c17a9986760c4651975540.

* Revert "add edge case test for padding & unpadding"

This reverts commit ccbd094d69c3f8f6a259159164284f60ba835bce.

* revert unpad logic

* remove irrelevant tests

* model test

* remove processor from model test

---------

Co-authored-by: jaycha <jaycha@ncsoft.com>
This commit is contained in:
youngrok cha
2025-05-13 22:49:09 +09:00
committed by GitHub
parent 67b3d45eb6
commit a5cc7a67d7
11 changed files with 158 additions and 64 deletions

View File

@@ -50,7 +50,7 @@ from ...test_modeling_common import (
if is_torch_available():
import torch
from transformers.models.llava_next.modeling_llava_next import image_size_to_num_patches, unpad_image
from transformers.models.llava_next.modeling_llava_next import image_size_to_num_patches
if is_vision_available():
@@ -298,18 +298,27 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
image_sizes = torch.cat([image_sizes, image_sizes], dim=0)
_ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes)
def test_unpad_image(self):
original_size = (400, 400)
def test_odd_sized_image(self):
# prepare model configuration
config = self.model_tester.get_config()
# Test case width is padded
pixel_values = floats_tensor([3, 400, 601])
unpadded_tensor = unpad_image(pixel_values, original_size)
self.assertEqual(unpadded_tensor.shape[1:], original_size)
# prepare input
num_image_tokens = 24
pixel_values = floats_tensor([1, 5, 3, config.vision_config.image_size, config.vision_config.image_size])
input_ids = ids_tensor([1, 64], config.text_config.vocab_size - 2) + 2
input_ids[:, :num_image_tokens] = config.image_token_index
attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
inputs_dict = {
"pixel_values": pixel_values,
"image_sizes": torch.tensor([[13, 16]]), # odd-sized image
"input_ids": input_ids,
"attention_mask": attention_mask,
}
# Test case height is padded
pixel_values = floats_tensor([3, 503, 400])
unpadded_tensor = unpad_image(pixel_values, original_size)
self.assertEqual(unpadded_tensor.shape[1:], original_size)
# forward with odd-sized image input
for model_class in self.all_model_classes:
model = model_class(config).to(torch_device)
model(**inputs_dict)
@parameterized.expand(
[

View File

@@ -11,13 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import shutil
import tempfile
import unittest
import torch
from transformers import AutoProcessor, LlamaTokenizerFast, LlavaNextProcessor
from transformers import LlamaTokenizerFast, LlavaNextProcessor
from transformers.testing_utils import (
require_vision,
)
@@ -52,6 +54,10 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase):
def get_image_processor(self, **kwargs):
return LlavaNextProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
@classmethod
def tearDownClass(cls):
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
@staticmethod
def prepare_processor_dict():
return {
@@ -73,13 +79,16 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.assertTrue(processor_loaded.chat_template == processor_dict.get("chat_template", None))
def test_image_token_filling(self):
processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-vicuna-7b-hf")
processor = self.processor_class.from_pretrained(self.tmpdirname)
processor.patch_size = 14
processor.vision_feature_select_strategy = "default"
processor.image_processor.crop_size = {"height": 336, "width": 336}
processor.image_processor.size = {"shortest_edge": 336}
processor.image_processor.image_grid_pinpoints = [[672, 336]]
# Important to check with a non-square image
image = torch.randint(0, 2, (3, 500, 316))
expected_image_tokens = 1526
image_token_index = 32000
image = torch.randint(0, 2, (3, 503, 316))
expected_image_tokens = 1525
image_token_index = processor.image_token_id
messages = [
{