Release: v4.52.4

[qwen-vl] Look for vocab size in text config (#38372 )
fix qwen
2025-05-30 11:14:55 +02:00 · 2025-05-30 11:05:43 +02:00 · 2025-05-28 09:31:32 +02:00 · 2025-05-28 09:31:32 +02:00 · 2025-05-28 09:31:32 +02:00 · 2025-05-28 09:31:32 +02:00
15 changed files with 143 additions and 41 deletions
--- a/examples/3D_parallel.py
+++ b/examples/3D_parallel.py
@@ -1,3 +1,16 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """:
 This script is used to test training a model using Tensor Parallelism and Data Parallelism.

--- a/examples/pytorch/3d_parallel_checks.py
+++ b/examples/pytorch/3d_parallel_checks.py
@@ -1,3 +1,16 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """:
 This script is used to test training a model using Tensor Parallelism and Data Parallelism.

--- a/setup.py
+++ b/setup.py
@@ -125,7 +125,7 @@ _deps = [
    "jaxlib>=0.4.1,<=0.4.13",
    "jieba",
    "jinja2>=3.1.0",
-    "kenlm@git+https://github.com/ydshieh/kenlm@78f664fb3dafe1468d868d71faf19534530698d5",
+    "kenlm",
    # Keras pin - this is to make sure Keras 3 doesn't destroy us. Remove or change when we have proper support.
    "keras>2.9,<2.16",
    "keras-nlp>=0.3.1,<0.14.0",  # keras-nlp 0.14 doesn't support keras 2, see pin on keras.
@@ -315,7 +315,7 @@ extras["audio"] = deps_list(
    "librosa",
    "pyctcdecode",
    "phonemizer",
-    "kenlm@git+https://github.com/ydshieh/kenlm@78f664fb3dafe1468d868d71faf19534530698d5",
+    "kenlm",
 )
 # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
 extras["speech"] = deps_list("torchaudio") + extras["audio"]
@@ -451,7 +451,7 @@ install_requires = [

 setup(
    name="transformers",
-    version="4.52.0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="4.52.4",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
    author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
    author_email="transformers@huggingface.co",
    description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",
--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@@ -18,7 +18,7 @@
 # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
 # in the namespace without actually importing anything (and especially none of the backends).

-__version__ = "4.52.0"
+__version__ = "4.52.4"

 from pathlib import Path
 from typing import TYPE_CHECKING
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -32,7 +32,7 @@ deps = {
    "jaxlib": "jaxlib>=0.4.1,<=0.4.13",
    "jieba": "jieba",
    "jinja2": "jinja2>=3.1.0",
-    "kenlm@git+https://github.com/ydshieh/kenlm@78f664fb3dafe1468d868d71faf19534530698d5": "kenlm@git+https://github.com/ydshieh/kenlm@78f664fb3dafe1468d868d71faf19534530698d5",
+    "kenlm": "kenlm",
    "keras": "keras>2.9,<2.16",
    "keras-nlp": "keras-nlp>=0.3.1,<0.14.0",
    "kernels": "kernels>=0.4.4,<0.5",
--- a/src/transformers/integrations/tensor_parallel.py
+++ b/src/transformers/integrations/tensor_parallel.py
@@ -52,6 +52,7 @@ def initialize_tensor_parallelism(tp_plan, tp_size=None):

    # Detect the accelerator on the machine. If no accelerator is available, it returns CPU.
    device_type = torch._C._get_accelerator().type
+    current_device = getattr(torch, device_type)
    if not torch.distributed.is_initialized():
        try:
            rank = int(os.environ["RANK"])
@@ -73,6 +74,9 @@ def initialize_tensor_parallelism(tp_plan, tp_size=None):
                "We tried to initialize torch.distributed for you, but it failed. Make "
                "sure you init torch distributed in your script to use `tp_plan='auto'`."
            ) from e
+
+    if device_type != "cpu":
+        current_device.set_device(int(os.environ["LOCAL_RANK"]))
    index = current_device.current_device() if device_type != "cpu" else None
    tp_device = torch.device(device_type, index)

@@ -729,23 +733,24 @@ class ParallelInterface(MutableMapping):

    # Class instance object, so that a call to `register` can be reflected into all other files correctly, even if
    # a new instance is created (in order to locally override a given function)
-    _global_mapping = {
-        "colwise": ColwiseParallel(),
-        "rowwise": RowwiseParallel(),
-        "colwise_rep": ColwiseParallel(output_layouts=Replicate()),
-        "rowwise_rep": RowwiseParallel(input_layouts=Replicate()),
-        "local_colwise": ColwiseParallel(use_dtensor=False),
-        "local_rowwise": RowwiseParallel(use_dtensor=False),
-        "local": IsolatedParallel(),
-        "gather": GatherParallel(),
-        "local_packed_rowwise": PackedRowwiseParallel(use_dtensor=False),
-        "sequence_parallel": SequenceParallel(),
-        "replicate": ReplicateParallel(),
-    }

    def __init__(self):
        self._local_mapping = {}

+        ParallelInterface._global_mapping = {
+            "colwise": ColwiseParallel(),
+            "rowwise": RowwiseParallel(),
+            "colwise_rep": ColwiseParallel(output_layouts=Replicate()),
+            "rowwise_rep": RowwiseParallel(input_layouts=Replicate()),
+            "local_colwise": ColwiseParallel(use_dtensor=False),
+            "local_rowwise": RowwiseParallel(use_dtensor=False),
+            "local": IsolatedParallel(),
+            "gather": GatherParallel(),
+            "local_packed_rowwise": PackedRowwiseParallel(use_dtensor=False),
+            "sequence_parallel": SequenceParallel(),
+            "replicate": ReplicateParallel(),
+        }
+
    def __getitem__(self, key):
        # First check if instance has a local override
        if key in self._local_mapping:
@@ -775,7 +780,11 @@ class ParallelInterface(MutableMapping):


 # Global AttentionInterface shared by all models which do not need to overwrite any of the existing ones
-ALL_PARALLEL_STYLES: ParallelInterface = ParallelInterface()
+
+if is_torch_greater_or_equal("2.5") and _torch_distributed_available:
+    ALL_PARALLEL_STYLES: ParallelInterface = ParallelInterface()
+else:
+    ALL_PARALLEL_STYLES = None


 def convert_local_tensor_to_dtensor(
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -319,7 +319,8 @@ def get_torch_context_manager_or_global_device():
    is not "cpu". This is used to infer the correct device to load the model on, in case `device_map` is not provided.
    """
    device_in_context = torch.tensor([]).device
-    default_device = torch.get_default_device()
+    # `get_default_device` was only introduced in torch>=2.3 - use cpu otherwise to align the behavior
+    default_device = torch.get_default_device() if is_torch_greater_or_equal("2.3") else torch.device("cpu")
    # This case means no context manager was used -> we still check if the default that was potentially set is not cpu
    if device_in_context == default_device:
        if default_device != torch.device("cpu"):
@@ -3532,7 +3533,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
            for key, value in state_dict.items():
                for pattern, replacement in reverse_key_mapping.items():
                    replacement = replacement.lstrip("^")  # strip off un-needed chars and patterns
-                    replacement = re.sub(r"\(.*?\)", "", pattern)
+                    replacement = re.sub(r"\(.*\)", "", replacement)
                    key, n_replace = re.subn(pattern, replacement, key)
                    # Early exit of the loop
                    if n_replace > 0:
@@ -4177,13 +4178,14 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi

        # We need to correctly dispatch the model on the current process device. The easiest way for this is to use a simple
        # `device_map` pointing to the correct device
-        if device_mesh is None:
-            tp_plan, device_map, device_mesh = initialize_tensor_parallelism(tp_plan, tp_size=None)
-        else:
-            # TODO: make device_mesh support multiple dimensions
-            if device_mesh.ndim == 1:
-                raise ValueError("device_mesh must be 1 dimensional and will be used for TP")
-            device_map = torch.device(device_mesh.device_type, int(os.environ["LOCAL_RANK"]))
+        if tp_plan is not None:
+            if device_mesh is None and tp_plan is not None:
+                tp_plan, device_map, device_mesh = initialize_tensor_parallelism(tp_plan, tp_size=None)
+            else:
+                # TODO: make device_mesh support multiple dimensions
+                if device_mesh.ndim == 1:
+                    raise ValueError("device_mesh must be 1 dimensional and will be used for TP")
+                device_map = torch.device(device_mesh.device_type, int(os.environ["LOCAL_RANK"]))

        if use_auth_token is not None:
            warnings.warn(
--- a/src/transformers/models/opt/modeling_opt.py
+++ b/src/transformers/models/opt/modeling_opt.py
@@ -154,7 +154,11 @@ class OPTAttention(nn.Module):
        """Input shape: Batch x Time x Channel"""
        bsz, tgt_len, _ = hidden_states.size()

-        # get query proj
+        # Scaling is susceptible to floating-point imprecision, which can
+        # lead to different results (this varies from model to model,
+        # e.g. whisper is one such case). We therefore keep the
+        # original order of scaling to follow the original implementation
+        # and enforce no scaling (1.0) in the attention call below.
        query_states = self.q_proj(hidden_states) * self.scaling
        query_states = query_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

@@ -187,7 +191,7 @@ class OPTAttention(nn.Module):
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
-            scaling=self.scaling,
+            scaling=1.0,
            **kwargs,
        )

--- a/src/transformers/models/paligemma/processing_paligemma.py
+++ b/src/transformers/models/paligemma/processing_paligemma.py
@@ -310,7 +310,8 @@ class PaliGemmaProcessor(ProcessorMixin):
        return_data = {**inputs, "pixel_values": pixel_values}

        if return_token_type_ids:
-            labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100)
+            labels = np.array(inputs["input_ids"])
+            labels[np.array(inputs["token_type_ids"]) == 0] = -100
            return_data.update({"labels": labels})
        return BatchFeature(data=return_data, tensor_type=return_tensors)

--- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -1797,7 +1797,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
    def __init__(self, config):
        super().__init__(config)
        self.model = Qwen2_5_VLModel(config)
-        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)

        self.post_init()

--- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
@@ -1673,7 +1673,7 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin):
    def __init__(self, config):
        super().__init__(config)
        self.model = Qwen2VLModel(config)
-        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)

        self.post_init()

--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -2072,10 +2072,7 @@ class _LazyModule(ModuleType):
        try:
            return importlib.import_module("." + module_name, self.__name__)
        except Exception as e:
-            raise RuntimeError(
-                f"Failed to import {self.__name__}.{module_name} because of the following error (look up to see its"
-                f" traceback):\n{e}"
-            ) from e
+            raise e

    def __reduce__(self):
        return (self.__class__, (self._name, self.__file__, self._import_structure))
--- a/src/transformers/video_utils.py
+++ b/src/transformers/video_utils.py
@@ -696,11 +696,13 @@ def group_videos_by_shape(
    grouped_videos_index = {}
    for i, video in enumerate(videos):
        shape = video.shape[-2::]
+        num_frames = video.shape[-4]  # video format BTCHW
+        shape = (num_frames, *shape)
        if shape not in grouped_videos:
            grouped_videos[shape] = []
        grouped_videos[shape].append(video)
        grouped_videos_index[i] = (shape, len(grouped_videos[shape]) - 1)
-    # stack videos with the same shape
+    # stack videos with the same size and number of frames
    grouped_videos = {shape: torch.stack(videos, dim=0) for shape, videos in grouped_videos.items()}
    return grouped_videos, grouped_videos_index

--- a/tests/models/paligemma/test_processor_paligemma.py
+++ b/tests/models/paligemma/test_processor_paligemma.py
@@ -62,6 +62,20 @@ class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        )
        self.assertEqual(len(inputs["input_ids"][0]), 112)

+    @require_torch
+    def test_call_with_suffix(self):
+        input_str = "lower newer"
+        suffix = "upper older longer string"
+        image_input = self.prepare_image_inputs()
+        processor = self.get_processor()
+        inputs = processor(text=input_str, images=image_input, suffix=suffix)
+        self.assertIn("labels", inputs)  # passing a suffix must produce labels (no `return_tensors`)
+        self.assertEqual(len(inputs["labels"][0]), len(inputs["input_ids"][0]))
+
+        inputs = processor(text=input_str, images=image_input, suffix=suffix, return_tensors="pt")
+        self.assertIn("labels", inputs)  # same check when `return_tensors="pt"` is requested
+        self.assertEqual(len(inputs["labels"][0]), len(inputs["input_ids"][0]))
+
    def test_text_with_image_tokens(self):
        image_processor = self.get_component("image_processor")
        tokenizer = self.get_component("tokenizer")
--- a/tests/utils/test_video_utils.py
+++ b/tests/utils/test_video_utils.py
@@ -30,7 +30,7 @@ from transformers.testing_utils import (
    require_torchvision,
    require_vision,
 )
-from transformers.video_utils import make_batched_videos
+from transformers.video_utils import group_videos_by_shape, make_batched_videos, reorder_videos


 if is_torch_available():
@@ -43,9 +43,9 @@ if is_vision_available():
    from transformers.video_utils import VideoMetadata, load_video


-def get_random_video(height, width, return_torch=False):
+def get_random_video(height, width, num_frames=8, return_torch=False):
    random_frame = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
-    video = np.array(([random_frame] * 8))
+    video = np.array(([random_frame] * num_frames))
    if return_torch:
        # move channel first
        return torch.from_numpy(video).permute(0, 3, 1, 2)
@@ -189,6 +189,53 @@ class BaseVideoProcessorTester(unittest.TestCase):
        rgb_video = video_processor.convert_to_rgb(torch.cat([video, video[:, :1]], dim=1))
        self.assertEqual(rgb_video.shape, (8, 3, 20, 20))

+    def test_group_and_reorder_videos(self):
+        """Tests that videos are grouped by frame size and number of frames, and can be reordered afterwards"""
+        video_1 = get_random_video(20, 20, num_frames=3, return_torch=True)
+        video_2 = get_random_video(20, 20, num_frames=5, return_torch=True)
+
+        # Group two videos of same size but different number of frames
+        grouped_videos, grouped_videos_index = group_videos_by_shape([video_1, video_2])
+        self.assertEqual(len(grouped_videos), 2)
+
+        regrouped_videos = reorder_videos(grouped_videos, grouped_videos_index)
+        self.assertEqual(len(regrouped_videos), 2)
+        self.assertEqual(video_1.shape, regrouped_videos[0].shape)
+
+        # Group two videos of different size but same number of frames
+        video_3 = get_random_video(15, 20, num_frames=3, return_torch=True)
+        grouped_videos, grouped_videos_index = group_videos_by_shape([video_1, video_3])
+        self.assertEqual(len(grouped_videos), 2)
+
+        regrouped_videos = reorder_videos(grouped_videos, grouped_videos_index)
+        self.assertEqual(len(regrouped_videos), 2)
+        self.assertEqual(video_1.shape, regrouped_videos[0].shape)
+
+        # Group all three videos where some have same size or same frame count
+        # But since none have frames and sizes identical, we'll have 3 groups
+        grouped_videos, grouped_videos_index = group_videos_by_shape([video_1, video_2, video_3])
+        self.assertEqual(len(grouped_videos), 3)
+
+        regrouped_videos = reorder_videos(grouped_videos, grouped_videos_index)
+        self.assertEqual(len(regrouped_videos), 3)
+        self.assertEqual(video_1.shape, regrouped_videos[0].shape)
+
+        # Group if we had some videos with identical shapes
+        grouped_videos, grouped_videos_index = group_videos_by_shape([video_1, video_1, video_3])
+        self.assertEqual(len(grouped_videos), 2)
+
+        regrouped_videos = reorder_videos(grouped_videos, grouped_videos_index)
+        self.assertEqual(len(regrouped_videos), 3)
+        self.assertEqual(video_1.shape, regrouped_videos[0].shape)
+
+        # Group if we had all videos with identical shapes
+        grouped_videos, grouped_videos_index = group_videos_by_shape([video_1, video_1, video_1])
+        self.assertEqual(len(grouped_videos), 1)
+
+        regrouped_videos = reorder_videos(grouped_videos, grouped_videos_index)
+        self.assertEqual(len(regrouped_videos), 3)
+        self.assertEqual(video_1.shape, regrouped_videos[0].shape)
+

@require_vision
@require_av
Author	SHA1	Message	Date
Lysandre	51f94ea06d	Release: v4.52.4 Some checks failed Release - Conda / build_and_package (push) Has been cancelled Details Secret Leaks / trufflehog (push) Has been cancelled Details	2025-05-30 11:14:55 +02:00
Raushan Turganbay	cdf04ff738	[qwen-vl] Look for vocab size in text config (#38372 ) fix qwen	2025-05-30 11:05:43 +02:00
hoshi-hiyouga	2842b82c61	Fix convert to original state dict for VLMs (#38385 ) * fix convert to original state dict * fix * lint * Update modeling_utils.py	2025-05-28 09:31:32 +02:00
Raushan Turganbay	24c6d5b082	[video utils] group and reorder by number of frames (#38374 ) fix	2025-05-28 09:31:32 +02:00
Raushan Turganbay	222af35ca9	[paligemma] fix processor with suffix (#38365 ) fix pg processor	2025-05-28 09:31:32 +02:00
Cyril Vallez	7c34e2c3cb	Protect `get_default_device` for torch<2.3 (#38376 ) * Update modeling_utils.py * CIs	2025-05-28 09:31:32 +02:00
Anton Vlasjuk	66d32abcbf	[`OPT`] Fix attention scaling (#38290 ) * fix opt attention scaling * add comment to why we do this	2025-05-28 09:31:31 +02:00
Arthur Zucker	f4fc42216c	v 4.52.3 Some checks failed Release - Conda / build_and_package (push) Has been cancelled Details Secret Leaks / trufflehog (push) Has been cancelled Details	2025-05-22 16:29:44 +02:00
Marc Sun	48459c97d7	Fix tp error when torch distributed is already initialized (#38294 ) fix tp error	2025-05-22 16:29:24 +02:00
Arthur	597e159145	Protect ParallelInterface (#38262 ) Co-authored-by: Lysandre <hi@lysand.re> Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>	2025-05-22 16:29:16 +02:00
Cyril Vallez	237c7c356c	update kenlm Some checks failed Release - Conda / build_and_package (push) Has been cancelled Details Secret Leaks / trufflehog (push) Has been cancelled Details	2025-05-21 15:26:05 +02:00
Lysandre Debut	55f6c7fd2c	Clearer error on import failure (#38257 ) Clearer error	2025-05-21 15:12:00 +02:00
Arthur	723563d8fd	tp plan should not be NONE (#38255 ) * accept custom device_mesh * fix device_map * assert that num_heads % tp_size == 0 * todo. * ReplicateParallel * handle tied weights * handle dtensor in save_pretrained with safe_serialization * tp test works * doesnt work * fix shard_and_distribute_module's rank should be local_rank * tp=4 is correct * dp+tp is broken * todo allreduce with dtensors on another dim is annoying * workaround to sync dp grads when using dtensors * loading a checkpoint works * wandb and compare losses with different tp/dp * cleaning * cleaning * . * . * logs * CP2 DP2 no mask works after commenting attn_mask and is_causal from scaled_dot_product_attention * DP=2 TP=2 now works even with tied embeddings * model.parameters() and model.module.parameters() are empty.. * reformat sanity_check_tensor_sync * set atol=1e-4 for CP to pass * try populate _parameters from named_modules * refactors TP2 DP2 works CP2 DP2 works * is_causal=True and pack sequences, no attn mask, and preshuffle dataset * fix packing * CP=4 doesn't work * fix labels and position_ids for CP * DP CP works with transformers 🥳🥳🥳 * refactor * add example cp * fixup * revert sdpa changes * example cleared * add CP, DP to the mesh init * nit * clean * use `ALL_PARALLEL_STYLES` * style * FSDP works * log on 1 rank * . * fix? * FSDP1 also has .parameters() bug * reported gradnorm when using FSDP1 is wrong, but loss is correct so it's okay * . * style and fixup * move stuff around * fix tests * style * let's make it a check * add missing licences * warning should be an info * tp plan should not be NONE * test all * god damn it * test all --------- Co-authored-by: nouamanetazi <nouamane98@gmail.com>	2025-05-21 15:11:54 +02:00
Cyril Vallez	9df95ec4fc	Relase: 4.52.2	2025-05-21 15:05:33 +02:00
Lysandre	945727948c	Release: v4.52.1 Some checks failed Release - Conda / build_and_package (push) Has been cancelled Details Secret Leaks / trufflehog (push) Has been cancelled Details	2025-05-20 22:45:10 +02:00
Lysandre Debut	eaa301673a	Revert parallelism temporarily (#38240 ) * Revert "Protect ParallelInterface" This reverts commit `cb513e35f9`. * Revert "parallelism goes brrr (#37877)" This reverts commit `1c2f36b480`. * Empty commit	2025-05-20 22:43:54 +02:00
Lysandre	b5f494632c	Protect ParallelInterface	2025-05-20 18:26:11 +02:00