v4.53.3

refactor: remove set_tracer_provider and set_meter_provider calls (#39422)
style
2025-07-22 09:23:05 +02:00 · 2025-07-22 09:20:46 +02:00 · 2025-07-11 14:10:09 +02:00 · 2025-07-11 14:09:53 +02:00 · 2025-07-11 13:34:41 +02:00 · 2025-07-11 13:13:11 +02:00
19 changed files with 220 additions and 104 deletions
--- a/setup.py
+++ b/setup.py
@@ -457,7 +457,7 @@ install_requires = [

 setup(
    name="transformers",
-    version="4.53.1",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="4.53.3",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
    author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
    author_email="transformers@huggingface.co",
    description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -18,7 +18,7 @@
 # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
 # in the namespace without actually importing anything (and especially none of the backends).

-__version__ = "4.53.1"
+__version__ = "4.53.3"

 from pathlib import Path
 from typing import TYPE_CHECKING
--- a/src/transformers/generation/continuous_batching.py
+++ b/src/transformers/generation/continuous_batching.py
@@ -122,6 +122,11 @@ class RequestState:
        is_eos = token_id == self.eos_token_id and self.eos_token_id != -1
        is_max_len = self.generated_len() >= self.max_new_tokens

+        # Only add the token if we're not finishing due to max length
+        # (EOS tokens should still be added to the output)
+        if not (is_max_len and not is_eos):
+            self.static_outputs.extend([token_id])
+
        if is_eos or is_max_len:
            self.status = RequestStatus.FINISHED
            return True
@@ -1011,7 +1016,6 @@ class ContinuousBatchProcessor:
                self.metrics.record_ttft_metric(state.created_time, state.request_id)
                state.status = RequestStatus.DECODING
                token = out_tokens[self.logits_indices[i]]
-                state.static_outputs.extend([token])
                state.prompt_ids = [token]
                if state.update_with_token(token):
                    self.metrics.record_request_completion(state.created_time, state.request_id)
--- a/src/transformers/masking_utils.py
+++ b/src/transformers/masking_utils.py
@@ -599,7 +599,7 @@ class AttentionMaskInterface(GeneralInterface):
 ALL_MASK_ATTENTION_FUNCTIONS: AttentionMaskInterface = AttentionMaskInterface()


-def find_packed_sequence_indices(position_ids: torch.Tensor) -> Optional[torch.Tensor]:
+def find_packed_sequence_indices(position_ids: torch.Tensor) -> torch.Tensor:
    """
    Find the indices of the sequence to which each new query token in the sequence belongs when using packed
    tensor format (i.e. several sequences packed in the same batch dimension).
@@ -713,7 +713,7 @@ def create_causal_mask(
    attention_mask: Optional[torch.Tensor],
    cache_position: torch.Tensor,
    past_key_values: Optional[Cache],
-    position_ids: Optional[torch.Tensor],
+    position_ids: Optional[torch.Tensor] = None,
    or_mask_function: Optional[Callable] = None,
    and_mask_function: Optional[Callable] = None,
 ) -> Optional[Union[torch.Tensor, BlockMask]]:
@@ -802,7 +802,7 @@ def create_sliding_window_causal_mask(
    attention_mask: Optional[torch.Tensor],
    cache_position: torch.Tensor,
    past_key_values: Optional[Cache],
-    position_ids: Optional[torch.Tensor],
+    position_ids: Optional[torch.Tensor] = None,
    or_mask_function: Optional[Callable] = None,
    and_mask_function: Optional[Callable] = None,
 ) -> Optional[Union[torch.Tensor, BlockMask]]:
@@ -897,7 +897,7 @@ def create_chunked_causal_mask(
    attention_mask: Optional[torch.Tensor],
    cache_position: torch.Tensor,
    past_key_values: Optional[Cache],
-    position_ids: Optional[torch.Tensor],
+    position_ids: Optional[torch.Tensor] = None,
    or_mask_function: Optional[Callable] = None,
    and_mask_function: Optional[Callable] = None,
 ) -> Optional[Union[torch.Tensor, BlockMask]]:
@@ -1006,7 +1006,7 @@ def create_masks_for_generate(
    attention_mask: Optional[torch.Tensor],
    cache_position: torch.Tensor,
    past_key_values: Optional[Cache],
-    position_ids: Optional[torch.Tensor],
+    position_ids: Optional[torch.Tensor] = None,
    or_mask_function: Optional[Callable] = None,
    and_mask_function: Optional[Callable] = None,
    **kwargs,
--- a/src/transformers/modeling_flash_attention_utils.py
+++ b/src/transformers/modeling_flash_attention_utils.py
@@ -103,6 +103,16 @@ if is_flash_attn_2_available():
    from flash_attn.bert_padding import unpad_input as unpad_input_fa2
    from flash_attn.layers.rotary import apply_rotary_emb

+    HAS_FA2 = True
+    FA_VERSION = 2
+elif is_torch_npu_available():
+    # patch functions in package `flash-attn` when using flash-attention on Ascend NPU.
+    from .integrations.npu_flash_attention import npu_apply_rotary_emb as apply_rotary_emb  # noqa: F401
+    from .integrations.npu_flash_attention import npu_flash_attn_func as flash_attn_2_func
+    from .integrations.npu_flash_attention import npu_flash_attn_varlen_func as flash_attn_2_varlen_func
+    from .integrations.npu_flash_attention import pad_input as pad_input_fa2
+    from .integrations.npu_flash_attention import unpad_input as unpad_input_fa2
+
    HAS_FA2 = True
    FA_VERSION = 2
 else:
@@ -136,22 +146,6 @@ if FA_VERSION:
    unpad_input = globals()[f"unpad_input_fa{FA_VERSION}"]
    pad_input = globals()[f"pad_input_fa{FA_VERSION}"]

-# patch functions in package `flash-attn` when using flash-attention on Ascend NPU.
-if is_torch_npu_available():
-    from .integrations.npu_flash_attention import (
-        npu_apply_rotary_emb as apply_rotary_emb,  # noqa: F401
-    )
-    from .integrations.npu_flash_attention import (
-        npu_flash_attn_func as flash_attn_func,
-    )
-    from .integrations.npu_flash_attention import (
-        npu_flash_attn_varlen_func as flash_attn_varlen_func,
-    )
-    from .integrations.npu_flash_attention import (
-        pad_input,
-        unpad_input,
-    )
-

 _flash_supports_window_size = False

--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -573,6 +573,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
                "GemmaTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
+        ("smollm3", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
        ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
        ("speech_to_text_2", ("Speech2Text2Tokenizer", None)),
        ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)),
--- a/src/transformers/models/cohere2/configuration_cohere2.py
+++ b/src/transformers/models/cohere2/configuration_cohere2.py
@@ -19,6 +19,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import warnings
+
 from ...configuration_utils import PretrainedConfig, layer_type_validation
 from ...modeling_rope_utils import rope_config_validation

@@ -216,14 +218,29 @@ class Cohere2Config(PretrainedConfig):
            **kwargs,
        )

+        # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
+        self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 4)
+
        if self.layer_types is None:
            # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
-            sliding_window_pattern = getattr(self, "sliding_window_pattern", 4)
+            self._sliding_window_pattern = getattr(self, "sliding_window_pattern", 4)
            self.layer_types = [
-                "sliding_attention" if bool((i + 1) % sliding_window_pattern) else "full_attention"
+                "sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types)

+    @property
+    def sliding_window_pattern(self):
+        warnings.warn(
+            "The `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.",
+            FutureWarning,
+        )
+        return self._sliding_window_pattern
+
+    @sliding_window_pattern.setter
+    def sliding_window_pattern(self, value):
+        self._sliding_window_pattern = value
+

 __all__ = ["Cohere2Config"]
--- a/src/transformers/models/cohere2/modular_cohere2.py
+++ b/src/transformers/models/cohere2/modular_cohere2.py
@@ -13,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import warnings
 from typing import Callable, Optional

 import torch
@@ -238,15 +239,30 @@ class Cohere2Config(PretrainedConfig):
            **kwargs,
        )

+        # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
+        self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 4)
+
        if self.layer_types is None:
            # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
-            sliding_window_pattern = getattr(self, "sliding_window_pattern", 4)
+            self._sliding_window_pattern = getattr(self, "sliding_window_pattern", 4)
            self.layer_types = [
-                "sliding_attention" if bool((i + 1) % sliding_window_pattern) else "full_attention"
+                "sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types)

+    @property
+    def sliding_window_pattern(self):
+        warnings.warn(
+            "The `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.",
+            FutureWarning,
+        )
+        return self._sliding_window_pattern
+
+    @sliding_window_pattern.setter
+    def sliding_window_pattern(self, value):
+        self._sliding_window_pattern = value
+

 class Cohere2RotaryEmbedding(CohereRotaryEmbedding):
    pass
--- a/src/transformers/models/gemma3/configuration_gemma3.py
+++ b/src/transformers/models/gemma3/configuration_gemma3.py
@@ -19,6 +19,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import warnings
 from typing import Any, Optional, Union

 from ...configuration_utils import PretrainedConfig, layer_type_validation
@@ -145,10 +146,6 @@ class Gemma3TextConfig(PretrainedConfig):
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
-        rope_local_base_freq (float, *optional*, defaults to 10000.0):
-            The base period of the RoPE embeddings for local attention.
-        sliding_window_pattern (`int`, *optional*, defaults to 6):
-            Pattern for the sliding window attention.
    """

    model_type = "gemma3_text"
@@ -230,15 +227,28 @@ class Gemma3TextConfig(PretrainedConfig):
        self.rope_scaling = rope_scaling
        rope_config_validation(self)

+        # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
+        self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 6)
+
        if self.layer_types is None:
-            # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
-            sliding_window_pattern = getattr(self, "sliding_window_pattern", 6)
            self.layer_types = [
-                "sliding_attention" if bool((i + 1) % sliding_window_pattern) else "full_attention"
+                "sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types)

+    @property
+    def sliding_window_pattern(self):
+        warnings.warn(
+            "The `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.",
+            FutureWarning,
+        )
+        return self._sliding_window_pattern
+
+    @sliding_window_pattern.setter
+    def sliding_window_pattern(self, value):
+        self._sliding_window_pattern = value
+

 class Gemma3Config(PretrainedConfig):
    r"""
--- a/src/transformers/models/gemma3/modular_gemma3.py
+++ b/src/transformers/models/gemma3/modular_gemma3.py
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import copy
+import warnings
 from collections.abc import Callable
 from typing import Any, Optional, Union

@@ -171,10 +172,6 @@ class Gemma3TextConfig(Gemma2Config, PretrainedConfig):
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
-        rope_local_base_freq (float, *optional*, defaults to 10000.0):
-            The base period of the RoPE embeddings for local attention.
-        sliding_window_pattern (`int`, *optional*, defaults to 6):
-            Pattern for the sliding window attention.
    """

    model_type = "gemma3_text"
@@ -241,15 +238,28 @@ class Gemma3TextConfig(Gemma2Config, PretrainedConfig):
        self.rope_scaling = rope_scaling
        rope_config_validation(self)

+        # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
+        self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 6)
+
        if self.layer_types is None:
-            # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
-            sliding_window_pattern = getattr(self, "sliding_window_pattern", 6)
            self.layer_types = [
-                "sliding_attention" if bool((i + 1) % sliding_window_pattern) else "full_attention"
+                "sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types)

+    @property
+    def sliding_window_pattern(self):
+        warnings.warn(
+            "The `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.",
+            FutureWarning,
+        )
+        return self._sliding_window_pattern
+
+    @sliding_window_pattern.setter
+    def sliding_window_pattern(self, value):
+        self._sliding_window_pattern = value
+

 class Gemma3Config(PretrainedConfig):
    r"""
--- a/src/transformers/models/glm4v/image_processing_glm4v_fast.py
+++ b/src/transformers/models/glm4v/image_processing_glm4v_fast.py
@@ -121,6 +121,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
        do_convert_rgb: bool,
        input_data_format: Optional[Union[str, ChannelDimension]],
        device: Optional[Union[str, torch.device]],
+        disable_grouping: Optional[bool],
    ):
        """
        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
@@ -173,7 +174,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
        resized_height, resized_width = height, width

        # Group images by size for batched resizing
-        grouped_images, grouped_images_index = group_images_by_shape(images)
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
        resized_images_grouped = {}
        for shape, stacked_images in grouped_images.items():
            if do_resize:
@@ -191,7 +192,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
        # Group images by size for further processing
        # Needed in case do_resize is False, or resize returns images with different sizes
-        grouped_images, grouped_images_index = group_images_by_shape(resized_images)
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
        processed_images_grouped = {}
        for shape, stacked_images in grouped_images.items():
            # Fused rescale and normalize
@@ -249,6 +250,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        device: Optional["torch.device"] = None,
+        disable_grouping: Optional[bool] = False,
        **kwargs,
    ):
        r"""
@@ -323,6 +325,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                    device=device,
+                    disable_grouping=disable_grouping,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(image_grid_thw)
@@ -351,11 +354,11 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):

        factor = patch_size * merge_size
        resized_height, resized_width = smart_resize(
-            t=self.temporal_patch_size,
+            num_frames=self.temporal_patch_size,
            height=height,
            width=width,
+            temporal_factor=self.temporal_patch_size,
            factor=factor,
-            t_factor=self.temporal_patch_size,
        )
        grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
        return grid_h * grid_w
--- a/src/transformers/models/glm4v/modeling_glm4v.py
+++ b/src/transformers/models/glm4v/modeling_glm4v.py
@@ -952,7 +952,7 @@ class Glm4vTextModel(Glm4vPreTrainedModel):
 @auto_docstring
 class Glm4vModel(Glm4vPreTrainedModel):
    base_model_prefix = ""
-    _checkpoint_conversion_mapping = None
+    _checkpoint_conversion_mapping = {}
    config_class = Glm4vConfig
    _no_split_modules = ["Glm4vTextDecoderLayer", "Glm4vVisionBlock"]

@@ -1053,7 +1053,8 @@ class Glm4vModel(Glm4vPreTrainedModel):
                dtype=input_ids.dtype,
                device=input_ids.device,
            )
-
+            image_index, video_index = 0, 0
+            video_group_index = 0
            attention_mask = attention_mask.to(total_input_ids.device)
            for i, input_ids in enumerate(total_input_ids):
                input_ids = input_ids[attention_mask[i] == 1]
@@ -1083,8 +1084,6 @@ class Glm4vModel(Glm4vPreTrainedModel):

                llm_pos_ids_list = []
                video_frame_num = 1
-                image_index, video_index = 0, 0
-
                for modality_type, start_idx, end_idx in input_type_group:
                    st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0

@@ -1125,12 +1124,14 @@ class Glm4vModel(Glm4vPreTrainedModel):
                            t_index = torch.tensor(t_idx).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()

                            h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(1, -1, llm_grid_w).flatten()
-
                            w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(1, llm_grid_h, -1).flatten()
-
                            llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx)

-                        video_index += 1
+                        video_group_index += 1
+
+                        if video_group_index >= video_grid_thw[video_index][0]:
+                            video_index += 1
+                            video_group_index = 0

                        video_frame_num += 1

@@ -1179,7 +1180,13 @@ class Glm4vModel(Glm4vPreTrainedModel):
                The temporal, height and width of feature shape of each video in LLM.
        """
        pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
-        video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
+        # reshape video_grid_thw -> [b, 3] -> [1, h, w] * frames
+        temp_frames_hw = []
+        for t, h, w in video_grid_thw:
+            repeated_row = torch.tensor([1, h.item(), w.item()]).unsqueeze(0).repeat(t, 1)
+            temp_frames_hw.append(repeated_row)
+        flattened_video_grid_thw = torch.cat(temp_frames_hw, dim=0)
+        video_embeds = self.visual(pixel_values_videos, grid_thw=flattened_video_grid_thw)
        split_sizes = (video_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
        video_embeds = torch.split(video_embeds, split_sizes)
        return video_embeds
@@ -1379,7 +1386,7 @@ class Glm4vCausalLMOutputWithPast(ModelOutput):


 class Glm4vForConditionalGeneration(Glm4vPreTrainedModel, GenerationMixin):
-    _checkpoint_conversion_mapping = None
+    _checkpoint_conversion_mapping = {}
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
--- a/src/transformers/models/glm4v/modular_glm4v.py
+++ b/src/transformers/models/glm4v/modular_glm4v.py
@@ -1004,7 +1004,7 @@ class Glm4vTextModel(Qwen2_5_VLTextModel):


 class Glm4vModel(Qwen2_5_VLModel):
-    _checkpoint_conversion_mapping = None
+    _checkpoint_conversion_mapping = {}
    _no_split_modules = ["Glm4vTextDecoderLayer", "Glm4vVisionBlock"]

    def __init__(self, config):
@@ -1087,7 +1087,8 @@ class Glm4vModel(Qwen2_5_VLModel):
                dtype=input_ids.dtype,
                device=input_ids.device,
            )
-
+            image_index, video_index = 0, 0
+            video_group_index = 0
            attention_mask = attention_mask.to(total_input_ids.device)
            for i, input_ids in enumerate(total_input_ids):
                input_ids = input_ids[attention_mask[i] == 1]
@@ -1117,8 +1118,6 @@ class Glm4vModel(Qwen2_5_VLModel):

                llm_pos_ids_list = []
                video_frame_num = 1
-                image_index, video_index = 0, 0
-
                for modality_type, start_idx, end_idx in input_type_group:
                    st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0

@@ -1159,12 +1158,14 @@ class Glm4vModel(Qwen2_5_VLModel):
                            t_index = torch.tensor(t_idx).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()

                            h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(1, -1, llm_grid_w).flatten()
-
                            w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(1, llm_grid_h, -1).flatten()
-
                            llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx)

-                        video_index += 1
+                        video_group_index += 1
+
+                        if video_group_index >= video_grid_thw[video_index][0]:
+                            video_index += 1
+                            video_group_index = 0

                        video_frame_num += 1

@@ -1200,6 +1201,30 @@ class Glm4vModel(Qwen2_5_VLModel):

            return position_ids, mrope_position_deltas

+    def get_video_features(
+        self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
+    ):
+        """
+        Encodes videos into continuous embeddings that can be forwarded to the language model.
+
+        Args:
+            pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+                The tensors corresponding to the input videos.
+            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+                The temporal, height and width of feature shape of each video in LLM.
+        """
+        pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
+        # reshape video_grid_thw -> [b, 3] -> [1, h, w] * frames
+        temp_frames_hw = []
+        for t, h, w in video_grid_thw:
+            repeated_row = torch.tensor([1, h.item(), w.item()]).unsqueeze(0).repeat(t, 1)
+            temp_frames_hw.append(repeated_row)
+        flattened_video_grid_thw = torch.cat(temp_frames_hw, dim=0)
+        video_embeds = self.visual(pixel_values_videos, grid_thw=flattened_video_grid_thw)
+        split_sizes = (video_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
+        video_embeds = torch.split(video_embeds, split_sizes)
+        return video_embeds
+
    @auto_docstring
    @can_return_tuple
    def forward(
@@ -1353,7 +1378,7 @@ class Glm4vCausalLMOutputWithPast(Qwen2_5_VLCausalLMOutputWithPast):


 class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
-    _checkpoint_conversion_mapping = None
+    _checkpoint_conversion_mapping = {}

    def forward(
        self,
@@ -1661,32 +1686,38 @@ class Glm4vProcessor(Qwen2_5_VLProcessor):
            video_index = 0
            for i in range(len(text)):
                while self.video_token in text[i]:
-                    num_frames = len(video_grid_thw)
+                    num_frames = video_grid_thw[video_index][0]
                    video_structure = ""

                    if hasattr(timestamps, "tolist"):
                        timestamps_list = timestamps.tolist()[0]
                    else:
                        timestamps_list = timestamps[0] if isinstance(timestamps[0], list) else timestamps
+
                    unique_timestamps = []
                    for idx in range(0, len(timestamps_list)):
                        unique_timestamps.append(timestamps_list[idx])
+
                    selected_timestamps = unique_timestamps[:num_frames]
                    while len(selected_timestamps) < num_frames:
                        selected_timestamps.append(selected_timestamps[-1] if selected_timestamps else 0)
+
                    for frame_idx in range(num_frames):
                        timestamp_sec = selected_timestamps[frame_idx]
                        frame_structure = f"<|begin_of_image|>{self.image_token}<|end_of_image|>{timestamp_sec}"
                        video_structure += frame_structure
+
                    text[i] = text[i].replace(self.video_token, video_structure, 1)
+                    num_image_tokens = (
+                        video_grid_thw[video_index].prod() // merge_length // video_grid_thw[video_index][0]
+                    )
+                    for frame_idx in range(num_frames):
+                        if self.image_token in text[i]:
+                            text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
+
                    video_index += 1

-                for frame_idx in range(len(video_grid_thw)):
-                    if self.image_token in text[i]:
-                        num_image_tokens = video_grid_thw[frame_idx].prod() // merge_length
-                        text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
                text[i] = text[i].replace("<|placeholder|>", self.image_token)
-
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
        self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
--- a/src/transformers/models/glm4v/processing_glm4v.py
+++ b/src/transformers/models/glm4v/processing_glm4v.py
@@ -167,32 +167,38 @@ class Glm4vProcessor(ProcessorMixin):
            video_index = 0
            for i in range(len(text)):
                while self.video_token in text[i]:
-                    num_frames = len(video_grid_thw)
+                    num_frames = video_grid_thw[video_index][0]
                    video_structure = ""

                    if hasattr(timestamps, "tolist"):
                        timestamps_list = timestamps.tolist()[0]
                    else:
                        timestamps_list = timestamps[0] if isinstance(timestamps[0], list) else timestamps
+
                    unique_timestamps = []
                    for idx in range(0, len(timestamps_list)):
                        unique_timestamps.append(timestamps_list[idx])
+
                    selected_timestamps = unique_timestamps[:num_frames]
                    while len(selected_timestamps) < num_frames:
                        selected_timestamps.append(selected_timestamps[-1] if selected_timestamps else 0)
+
                    for frame_idx in range(num_frames):
                        timestamp_sec = selected_timestamps[frame_idx]
                        frame_structure = f"<|begin_of_image|>{self.image_token}<|end_of_image|>{timestamp_sec}"
                        video_structure += frame_structure
+
                    text[i] = text[i].replace(self.video_token, video_structure, 1)
+                    num_image_tokens = (
+                        video_grid_thw[video_index].prod() // merge_length // video_grid_thw[video_index][0]
+                    )
+                    for frame_idx in range(num_frames):
+                        if self.image_token in text[i]:
+                            text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
+
                    video_index += 1

-                for frame_idx in range(len(video_grid_thw)):
-                    if self.image_token in text[i]:
-                        num_image_tokens = video_grid_thw[frame_idx].prod() // merge_length
-                        text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
                text[i] = text[i].replace("<|placeholder|>", self.image_token)
-
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
        self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
--- a/src/transformers/models/glm4v/video_processing_glm4v.py
+++ b/src/transformers/models/glm4v/video_processing_glm4v.py
@@ -246,10 +246,6 @@ class Glm4vVideoProcessor(BaseVideoProcessor):
        processed_grids = reorder_videos(processed_grids, grouped_videos_index)
        pixel_values_videos = torch.cat(processed_videos, dim=0)
        video_grid_thw = torch.tensor(processed_grids)
-        total_frames = video_grid_thw[0][0].item()
-        h = video_grid_thw[0][1].item()
-        w = video_grid_thw[0][2].item()
-        video_grid_thw = [[1, h, w] for _ in range(total_frames)]
        data = {
            "pixel_values_videos": pixel_values_videos,
            "video_grid_thw": video_grid_thw,
--- a/src/transformers/utils/metrics.py
+++ b/src/transformers/utils/metrics.py
@@ -20,27 +20,9 @@ class RequestStatus(Enum):


 try:
-    from opentelemetry import metrics, trace
-    from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
-    from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
-    from opentelemetry.sdk.metrics import MeterProvider
-    from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
-    from opentelemetry.sdk.resources import Resource
-    from opentelemetry.sdk.trace import TracerProvider
-    from opentelemetry.sdk.trace.export import BatchSpanProcessor
+    from opentelemetry import metrics
    from opentelemetry.trace import Status, StatusCode, get_tracer

-    resource = Resource.create({"service.name": "transformers"})
-
-    metrics_exporter = PeriodicExportingMetricReader(OTLPMetricExporter(), export_interval_millis=1000)
-    meter_provider = MeterProvider(resource=resource, metric_readers=[metrics_exporter])
-    metrics.set_meter_provider(meter_provider)
-
-    trace_exporter = OTLPSpanExporter()
-    tracer_provider = TracerProvider(resource=resource)
-    tracer_provider.add_span_processor(BatchSpanProcessor(trace_exporter))
-    trace.set_tracer_provider(tracer_provider)
-
    _has_opentelemetry = True
 except ImportError:
    _has_opentelemetry = False
--- a/tests/models/glm4v/test_modeling_glm4v.py
+++ b/tests/models/glm4v/test_modeling_glm4v.py
@@ -254,10 +254,6 @@ class Glm4vModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase)
    def test_generate_from_inputs_embeds_with_static_cache(self):
        pass

-    # The multimodal base model embeds will not match ids, due to pixel values. We can't change base test
-    # because in some models `pixel_values` are required. Will be fixed when we add support for merging `embeds+pixels`
-    # TODO: @raushan
-
    def test_inputs_embeds(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@@ -380,6 +376,44 @@ class Glm4vIntegrationTest(unittest.TestCase):
            EXPECTED_DECODED_TEXT,
        )

+    @slow
+    def test_small_model_integration_test_with_video(self):
+        processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking", max_image_size={"longest_edge": 50176})
+        model = Glm4vForConditionalGeneration.from_pretrained(
+            "THUDM/GLM-4.1V-9B-Thinking", torch_dtype=torch.float16, device_map="auto"
+        )
+        questions = ["Describe this video."] * 2
+        video_urls = [
+            "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
+        ] * 2
+        messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "video",
+                            "video": video_url,
+                        },
+                        {"type": "text", "text": question},
+                    ],
+                }
+            ]
+            for question, video_url in zip(questions, video_urls)
+        ]
+        inputs = processor.apply_chat_template(
+            messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt", padding=True
+        ).to(torch_device)
+        output = model.generate(**inputs, max_new_tokens=30)
+        EXPECTED_DECODED_TEXT = [
+            "\n012345Describe this video.\n<think>Got it, let's analyze the video. First, the scene is a room with a wooden floor, maybe a traditional Japanese room with tatami",
+            "\n012345Describe this video.\n<think>Got it, let's analyze the video. First, the scene is a room with a wooden floor, maybe a traditional Japanese room with tatami"
+        ]  # fmt: skip
+        self.assertEqual(
+            processor.batch_decode(output, skip_special_tokens=True),
+            EXPECTED_DECODED_TEXT,
+        )
+
    @slow
    def test_small_model_integration_test_expand(self):
        model = Glm4vForConditionalGeneration.from_pretrained(
--- a/tests/models/glm4v/test_video_processing_glm4v.py
+++ b/tests/models/glm4v/test_video_processing_glm4v.py
@@ -228,7 +228,7 @@ class Glm4vVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
            self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)

-    @unittest.skip("Skip for now, the test needs adjustment fo GLM-4.1V")
+    @unittest.skip("Skip for now, the test needs adjustment for GLM-4.1V")
    def test_call_numpy_4_channels(self):
        for video_processing_class in self.video_processor_list:
            # Test that can process videos which have an arbitrary number of channels
--- a/tests/utils/test_masking_utils.py
+++ b/tests/utils/test_masking_utils.py
@@ -22,7 +22,7 @@ if is_torch_available():
    from torch.nn.attention.flex_attention import create_block_mask

    from transformers import LlamaConfig
-    from transformers.masking_utils import create_causal_mask
+    from transformers.masking_utils import create_causal_mask, find_packed_sequence_indices


 # fmt: off
@@ -130,3 +130,8 @@ class MaskTest(unittest.TestCase):

        # We compare the str representations, as the BlockMask objects themselves cannot easily be compared
        self.assertEqual(causal_mask.to_string(), EXPECTED_BLOCK_MASK.to_string())
+
+    def test_find_packed_sequence_indices(self):
+        position_ids = torch.tensor([[0, 1, 2, 3, 0, 1, 0, 1, 2, 3], [0, 1, 2, 3, 4, 5, 0, 1, 2, 3]])
+        EXPECTED_SEQUENCE_INDICES = torch.tensor([[0, 0, 0, 0, 1, 1, 2, 2, 2, 2], [0, 0, 0, 0, 0, 0, 1, 1, 1, 1]])
+        self.assertTrue((find_packed_sequence_indices(position_ids) == EXPECTED_SEQUENCE_INDICES).all())
Author	SHA1	Message	Date
Kashif Rasul	a5923d4de7	v4.53.3 Some checks failed Release - Conda / build_and_package (push) Has been cancelled Details Secret Leaks / trufflehog (push) Has been cancelled Details	2025-07-22 09:23:05 +02:00
Luc Georges	d04a942545	refactor: remove `set_tracer_provider` and `set_meter_provider` calls (#39422 )	2025-07-22 09:20:46 +02:00
Cyril Vallez	37f8b0b535	style Some checks failed Release - Conda / build_and_package (push) Has been cancelled Details Secret Leaks / trufflehog (push) Has been cancelled Details	2025-07-11 14:10:09 +02:00
Cyril Vallez	d15e157d60	finally fix glm4v accordingly	2025-07-11 14:09:53 +02:00
Cyril Vallez	9873b2f917	fix attr post conflict	2025-07-11 13:34:41 +02:00
Cyril Vallez	d6f31553c0	Fix conflict	2025-07-11 13:13:11 +02:00
Cyril Vallez	cbac426388	Add a default value for `position_ids` in masking_utils (#39310 ) * set default * Update masking_utils.py * add small test	2025-07-11 13:09:43 +02:00
Kingsley	c122145839	fix Glm4v batch videos forward (#39172 ) * changes for video * update modular * change get_video_features * update video token replacement * update modular * add test and fix typo * lint * fix order * lint * fix * remove dependency * lint * lint * remove todo * resize video for test * lint.. * fix test * new a processor for video_test * fix test	2025-07-11 13:09:05 +02:00
Raushan Turganbay	9da3f7d072	[sliding window] revert and deprecate (#39301 ) * bring back and deprecate * oops --------- Co-authored-by: Cyril Vallez <cyril.vallez@huggingface.co>	2025-07-11 13:07:56 +02:00
Joao Gante	b2fbc02270	[smollm3] add tokenizer mapping for `smollm3` (#39271 ) add tok mapping to smollm3	2025-07-11 13:07:28 +02:00
Kashif Rasul	c30a6b7bd7	[pagged-attention] fix off-by-1 error in pagged attention generation (#39258 ) * fix off-by-1 error in pagged attention generation * formatting * use update_with_token	2025-07-11 13:06:28 +02:00
Joonchen Liau	be7d1a9da0	Fix errors when use verl to train GLM4.1v model (#39199 ) * Fix errors when use verl to train GLM4.1v model * Support glm4v load from AutoModelForVision2Seq * Set glm4v model _checkpoint_conversion_mapping attr from None to {} * Update modeling_auto.py	2025-07-11 13:05:05 +02:00
Zhen	6023ca8abd	[bugfix] fix flash attention 2 unavailable error on Ascend NPU (#39166 ) [bugfix] fix flash attention 2 error on Ascend NPU	2025-07-11 13:04:17 +02:00
Yuxuan Zhang	e7e78b2a0d	Fix some bug for finetune and batch infer For GLM-4.1V (#39090 ) * update * 1	2025-07-11 13:01:12 +02:00
Cyril Vallez	4a5d5c490f	Release: v4.53.2	2025-07-11 12:48:40 +02:00