From 91455c182542f3d57adde70bfac9a24fe7a0dc0e Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Mon, 24 Mar 2025 13:19:26 -0400 Subject: [PATCH] Fix processor kwargs qwen2 vl (#36890) * Fix qwen2_vl and qwen2_5_vl processors custom images kwargs * change version warning --- .../models/auto/image_processing_auto.py | 4 ++-- .../models/qwen2_5_vl/modular_qwen2_5_vl.py | 7 ++++++- .../models/qwen2_5_vl/processing_qwen2_5_vl.py | 13 +++++++++++-- .../models/qwen2_vl/image_processing_qwen2_vl.py | 2 +- .../qwen2_vl/image_processing_qwen2_vl_fast.py | 2 +- .../models/qwen2_vl/processing_qwen2_vl.py | 13 +++++++++++-- src/transformers/processing_utils.py | 4 ++-- .../qwen2_5_vl/test_processor_qwen2_5_vl.py | 16 ++++++++++++++++ tests/models/qwen2_vl/test_processor_qwen2_vl.py | 16 ++++++++++++++++ 9 files changed, 66 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 2c9b6a266d..7cd47bc060 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -493,7 +493,7 @@ class AutoImageProcessor: image_processor_auto_map = config.auto_map["AutoImageProcessor"] image_processor_class = None - # TODO: @yoni, change logic in v4.50 (when use_fast set to True by default) + # TODO: @yoni, change logic in v4.52 (when use_fast set to True by default) if image_processor_type is not None: # if use_fast is not set and the processor was saved with a fast processor, we use it, otherwise we use the slow processor. if use_fast is None: @@ -501,7 +501,7 @@ class AutoImageProcessor: if not use_fast: logger.warning_once( "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. " - "`use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. 
" + "`use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. " "This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`." ) # Update class name to reflect the use_fast option. If class is not found, we fall back to the slow version. diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py index f4397cf122..a77344c976 100644 --- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py @@ -41,7 +41,7 @@ from transformers.models.qwen2_vl.modeling_qwen2_vl import ( VisionRotaryEmbedding, VisionSdpaAttention, ) -from transformers.models.qwen2_vl.processing_qwen2_vl import Qwen2VLProcessor +from transformers.models.qwen2_vl.processing_qwen2_vl import Qwen2VLImagesKwargs, Qwen2VLProcessor from ...activations import ACT2FN from ...configuration_utils import PretrainedConfig @@ -816,7 +816,12 @@ class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): fps: Union[List[float], float] +class Qwen2_5_VLImagesKwargs(Qwen2VLImagesKwargs): + pass + + class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: Qwen2_5_VLImagesKwargs videos_kwargs: Qwen2_5_VLVideosProcessorKwargs _defaults = { "text_kwargs": { diff --git a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py index 6357debbe2..e07642a1bf 100644 --- a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py @@ -23,11 +23,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import List, Union +from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, VideoInput -from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -35,7 +35,16 @@ class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): fps: Union[List[float], float] +class Qwen2_5_VLImagesKwargs(ImagesKwargs): + min_pixels: Optional[int] + max_pixels: Optional[int] + patch_size: Optional[int] + temporal_patch_size: Optional[int] + merge_size: Optional[int] + + class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: Qwen2_5_VLImagesKwargs videos_kwargs: Qwen2_5_VLVideosProcessorKwargs _defaults = { "text_kwargs": { diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py index d115b1d062..671cd86170 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py @@ -384,7 +384,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor): raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.") min_pixels = size["shortest_edge"] else: - size = self.size + size = {**self.size} # backward compatibility: override size with min_pixels and max_pixels if they are provided if min_pixels is not None: size["shortest_edge"] = min_pixels diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py index 1e99125806..8d92cb0845 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py @@ -339,7 +339,7 @@ class 
Qwen2VLImageProcessorFast(BaseImageProcessorFast): raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.") min_pixels = size["shortest_edge"] else: - size = self.size + size = {**self.size} # backward compatibility: override size with min_pixels and max_pixels if they are provided if min_pixels is not None: size["shortest_edge"] = min_pixels diff --git a/src/transformers/models/qwen2_vl/processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/processing_qwen2_vl.py index 90720ad586..06b0adb0fb 100644 --- a/src/transformers/models/qwen2_vl/processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/processing_qwen2_vl.py @@ -21,11 +21,11 @@ Processor class for Qwen2-VL. """ -from typing import List, Union +from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, VideoInput -from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging @@ -33,7 +33,16 @@ from ...utils import logging logger = logging.get_logger(__name__) +class Qwen2VLImagesKwargs(ImagesKwargs): + min_pixels: Optional[int] + max_pixels: Optional[int] + patch_size: Optional[int] + temporal_patch_size: Optional[int] + merge_size: Optional[int] + + class Qwen2VLProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: Qwen2VLImagesKwargs _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 17632a502e..556ce3d522 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -1111,12 +1111,12 @@ class ProcessorMixin(PushToHubMixin): if isinstance(class_name, tuple): classes = tuple(cls.get_possibly_dynamic_module(n) if n is not None else None for n in class_name) if attribute_name == 
"image_processor": - # TODO: @yoni, change logic in v4.50 (when use_fast set to True by default) + # TODO: @yoni, change logic in v4.52 (when use_fast set to True by default) use_fast = kwargs.get("use_fast", None) if use_fast is None: logger.warning_once( "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. " - "`use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. " + "`use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. " "This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`." ) else: diff --git a/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py index 481e206a71..65361f016f 100644 --- a/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py +++ b/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py @@ -310,3 +310,19 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase): ) self.assertTrue(self.videos_input_name in out_dict_with_video) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 71280) + + def test_kwargs_overrides_custom_image_processor_kwargs(self): + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component("image_processor") + processor_components["tokenizer"] = self.get_component("tokenizer") + processor_kwargs = self.prepare_processor_dict() + + processor = self.processor_class(**processor_components, **processor_kwargs, use_fast=True) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs() + image_input = self.prepare_image_inputs() + inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt") + self.assertEqual(inputs[self.images_input_name].shape[0], 612) + inputs = processor(text=input_str, images=image_input, 
return_tensors="pt") + self.assertEqual(inputs[self.images_input_name].shape[0], 800) diff --git a/tests/models/qwen2_vl/test_processor_qwen2_vl.py b/tests/models/qwen2_vl/test_processor_qwen2_vl.py index 59546485c5..64540eed04 100644 --- a/tests/models/qwen2_vl/test_processor_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_processor_qwen2_vl.py @@ -307,3 +307,19 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase): ) self.assertTrue(self.videos_input_name in out_dict_with_video) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 71280) + + def test_kwargs_overrides_custom_image_processor_kwargs(self): + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component("image_processor") + processor_components["tokenizer"] = self.get_component("tokenizer") + processor_kwargs = self.prepare_processor_dict() + + processor = self.processor_class(**processor_components, **processor_kwargs, use_fast=True) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs() + image_input = self.prepare_image_inputs() + inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertEqual(inputs[self.images_input_name].shape[0], 800) + inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt") + self.assertEqual(inputs[self.images_input_name].shape[0], 612)