Compare commits

...

15 Commits

Author SHA1 Message Date
Kashif Rasul
a5923d4de7 v4.53.3
Some checks failed
Release - Conda / build_and_package (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
2025-07-22 09:23:05 +02:00
Luc Georges
d04a942545 refactor: remove set_tracer_provider and set_meter_provider calls (#39422) 2025-07-22 09:20:46 +02:00
Cyril Vallez
37f8b0b535 style
Some checks failed
Release - Conda / build_and_package (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
2025-07-11 14:10:09 +02:00
Cyril Vallez
d15e157d60 finally fix glm4v accordingly 2025-07-11 14:09:53 +02:00
Cyril Vallez
9873b2f917 fix attr post conflict 2025-07-11 13:34:41 +02:00
Cyril Vallez
d6f31553c0 Fix conflict 2025-07-11 13:13:11 +02:00
Cyril Vallez
cbac426388 Add a default value for position_ids in masking_utils (#39310)
* set default

* Update masking_utils.py

* add small test
2025-07-11 13:09:43 +02:00
Kingsley
c122145839 fix Glm4v batch videos forward (#39172)
* changes for video

* update modular

* change get_video_features

* update video token replacement

* update modular

* add test and fix typo

* lint

* fix order

* lint

* fix

* remove dependency

* lint

* lint

* remove todo

* resize video for test

* lint..

* fix test

* new a processor for video_test

* fix test
2025-07-11 13:09:05 +02:00
Raushan Turganbay
9da3f7d072 [sliding window] revert and deprecate (#39301)
* bring back and deprecate

* oops

---------

Co-authored-by: Cyril Vallez <cyril.vallez@huggingface.co>
2025-07-11 13:07:56 +02:00
Joao Gante
b2fbc02270 [smollm3] add tokenizer mapping for smollm3 (#39271)
add tok mapping to smollm3
2025-07-11 13:07:28 +02:00
Kashif Rasul
c30a6b7bd7 [pagged-attention] fix off-by-1 error in pagged attention generation (#39258)
* fix off-by-1 error in pagged attention generation

* formatting

* use update_with_token
2025-07-11 13:06:28 +02:00
Joonchen Liau
be7d1a9da0 Fix errors when use verl to train GLM4.1v model (#39199)
* Fix errors when use verl to train GLM4.1v model

* Support glm4v load from AutoModelForVision2Seq
* Set glm4v model _checkpoint_conversion_mapping attr from None to {}

* Update modeling_auto.py
2025-07-11 13:05:05 +02:00
Zhen
6023ca8abd [bugfix] fix flash attention 2 unavailable error on Ascend NPU (#39166)
[bugfix] fix flash attention 2 error on Ascend NPU
2025-07-11 13:04:17 +02:00
Yuxuan Zhang
e7e78b2a0d Fix some bug for finetune and batch infer For GLM-4.1V (#39090)
* update

* 1
2025-07-11 13:01:12 +02:00
Cyril Vallez
4a5d5c490f Release: v4.53.2 2025-07-11 12:48:40 +02:00
19 changed files with 220 additions and 104 deletions

View File

@@ -457,7 +457,7 @@ install_requires = [
setup(
name="transformers",
version="4.53.1", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
version="4.53.3", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
author_email="transformers@huggingface.co",
description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",

View File

@@ -18,7 +18,7 @@
# to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
# in the namespace without actually importing anything (and especially none of the backends).
__version__ = "4.53.1"
__version__ = "4.53.3"
from pathlib import Path
from typing import TYPE_CHECKING

View File

@@ -122,6 +122,11 @@ class RequestState:
is_eos = token_id == self.eos_token_id and self.eos_token_id != -1
is_max_len = self.generated_len() >= self.max_new_tokens
# Only add the token if we're not finishing due to max length
# (EOS tokens should still be added to the output)
if not (is_max_len and not is_eos):
self.static_outputs.extend([token_id])
if is_eos or is_max_len:
self.status = RequestStatus.FINISHED
return True
@@ -1011,7 +1016,6 @@ class ContinuousBatchProcessor:
self.metrics.record_ttft_metric(state.created_time, state.request_id)
state.status = RequestStatus.DECODING
token = out_tokens[self.logits_indices[i]]
state.static_outputs.extend([token])
state.prompt_ids = [token]
if state.update_with_token(token):
self.metrics.record_request_completion(state.created_time, state.request_id)

View File

@@ -599,7 +599,7 @@ class AttentionMaskInterface(GeneralInterface):
ALL_MASK_ATTENTION_FUNCTIONS: AttentionMaskInterface = AttentionMaskInterface()
def find_packed_sequence_indices(position_ids: torch.Tensor) -> Optional[torch.Tensor]:
def find_packed_sequence_indices(position_ids: torch.Tensor) -> torch.Tensor:
"""
Find the indices of the sequence to which each new query token in the sequence belongs when using packed
tensor format (i.e. several sequences packed in the same batch dimension).
@@ -713,7 +713,7 @@ def create_causal_mask(
attention_mask: Optional[torch.Tensor],
cache_position: torch.Tensor,
past_key_values: Optional[Cache],
position_ids: Optional[torch.Tensor],
position_ids: Optional[torch.Tensor] = None,
or_mask_function: Optional[Callable] = None,
and_mask_function: Optional[Callable] = None,
) -> Optional[Union[torch.Tensor, BlockMask]]:
@@ -802,7 +802,7 @@ def create_sliding_window_causal_mask(
attention_mask: Optional[torch.Tensor],
cache_position: torch.Tensor,
past_key_values: Optional[Cache],
position_ids: Optional[torch.Tensor],
position_ids: Optional[torch.Tensor] = None,
or_mask_function: Optional[Callable] = None,
and_mask_function: Optional[Callable] = None,
) -> Optional[Union[torch.Tensor, BlockMask]]:
@@ -897,7 +897,7 @@ def create_chunked_causal_mask(
attention_mask: Optional[torch.Tensor],
cache_position: torch.Tensor,
past_key_values: Optional[Cache],
position_ids: Optional[torch.Tensor],
position_ids: Optional[torch.Tensor] = None,
or_mask_function: Optional[Callable] = None,
and_mask_function: Optional[Callable] = None,
) -> Optional[Union[torch.Tensor, BlockMask]]:
@@ -1006,7 +1006,7 @@ def create_masks_for_generate(
attention_mask: Optional[torch.Tensor],
cache_position: torch.Tensor,
past_key_values: Optional[Cache],
position_ids: Optional[torch.Tensor],
position_ids: Optional[torch.Tensor] = None,
or_mask_function: Optional[Callable] = None,
and_mask_function: Optional[Callable] = None,
**kwargs,

View File

@@ -103,6 +103,16 @@ if is_flash_attn_2_available():
from flash_attn.bert_padding import unpad_input as unpad_input_fa2
from flash_attn.layers.rotary import apply_rotary_emb
HAS_FA2 = True
FA_VERSION = 2
elif is_torch_npu_available():
# patch functions in package `flash-attn` when using flash-attention on Ascend NPU.
from .integrations.npu_flash_attention import npu_apply_rotary_emb as apply_rotary_emb # noqa: F401
from .integrations.npu_flash_attention import npu_flash_attn_func as flash_attn_2_func
from .integrations.npu_flash_attention import npu_flash_attn_varlen_func as flash_attn_2_varlen_func
from .integrations.npu_flash_attention import pad_input as pad_input_fa2
from .integrations.npu_flash_attention import unpad_input as unpad_input_fa2
HAS_FA2 = True
FA_VERSION = 2
else:
@@ -136,22 +146,6 @@ if FA_VERSION:
unpad_input = globals()[f"unpad_input_fa{FA_VERSION}"]
pad_input = globals()[f"pad_input_fa{FA_VERSION}"]
# patch functions in package `flash-attn` when using flash-attention on Ascend NPU.
if is_torch_npu_available():
from .integrations.npu_flash_attention import (
npu_apply_rotary_emb as apply_rotary_emb, # noqa: F401
)
from .integrations.npu_flash_attention import (
npu_flash_attn_func as flash_attn_func,
)
from .integrations.npu_flash_attention import (
npu_flash_attn_varlen_func as flash_attn_varlen_func,
)
from .integrations.npu_flash_attention import (
pad_input,
unpad_input,
)
_flash_supports_window_size = False

View File

@@ -573,6 +573,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
"GemmaTokenizerFast" if is_tokenizers_available() else None,
),
),
("smollm3", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
("speech_to_text_2", ("Speech2Text2Tokenizer", None)),
("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)),

View File

@@ -19,6 +19,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from ...configuration_utils import PretrainedConfig, layer_type_validation
from ...modeling_rope_utils import rope_config_validation
@@ -216,14 +218,29 @@ class Cohere2Config(PretrainedConfig):
**kwargs,
)
# BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 4)
if self.layer_types is None:
# BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
sliding_window_pattern = getattr(self, "sliding_window_pattern", 4)
self._sliding_window_pattern = getattr(self, "sliding_window_pattern", 4)
self.layer_types = [
"sliding_attention" if bool((i + 1) % sliding_window_pattern) else "full_attention"
"sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention"
for i in range(self.num_hidden_layers)
]
layer_type_validation(self.layer_types)
@property
def sliding_window_pattern(self):
warnings.warn(
"The `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.",
FutureWarning,
)
return self._sliding_window_pattern
@sliding_window_pattern.setter
def sliding_window_pattern(self, value):
self._sliding_window_pattern = value
__all__ = ["Cohere2Config"]

View File

@@ -13,6 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from typing import Callable, Optional
import torch
@@ -238,15 +239,30 @@ class Cohere2Config(PretrainedConfig):
**kwargs,
)
# BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 4)
if self.layer_types is None:
# BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
sliding_window_pattern = getattr(self, "sliding_window_pattern", 4)
self._sliding_window_pattern = getattr(self, "sliding_window_pattern", 4)
self.layer_types = [
"sliding_attention" if bool((i + 1) % sliding_window_pattern) else "full_attention"
"sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention"
for i in range(self.num_hidden_layers)
]
layer_type_validation(self.layer_types)
@property
def sliding_window_pattern(self):
warnings.warn(
"The `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.",
FutureWarning,
)
return self._sliding_window_pattern
@sliding_window_pattern.setter
def sliding_window_pattern(self, value):
self._sliding_window_pattern = value
class Cohere2RotaryEmbedding(CohereRotaryEmbedding):
pass

View File

@@ -19,6 +19,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from typing import Any, Optional, Union
from ...configuration_utils import PretrainedConfig, layer_type_validation
@@ -145,10 +146,6 @@ class Gemma3TextConfig(PretrainedConfig):
>>> # Accessing the model configuration
>>> configuration = model.config
```
rope_local_base_freq (float, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings for local attention.
sliding_window_pattern (`int`, *optional*, defaults to 6):
Pattern for the sliding window attention.
"""
model_type = "gemma3_text"
@@ -230,15 +227,28 @@ class Gemma3TextConfig(PretrainedConfig):
self.rope_scaling = rope_scaling
rope_config_validation(self)
# BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 6)
if self.layer_types is None:
# BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
sliding_window_pattern = getattr(self, "sliding_window_pattern", 6)
self.layer_types = [
"sliding_attention" if bool((i + 1) % sliding_window_pattern) else "full_attention"
"sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention"
for i in range(self.num_hidden_layers)
]
layer_type_validation(self.layer_types)
@property
def sliding_window_pattern(self):
warnings.warn(
"The `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.",
FutureWarning,
)
return self._sliding_window_pattern
@sliding_window_pattern.setter
def sliding_window_pattern(self, value):
self._sliding_window_pattern = value
class Gemma3Config(PretrainedConfig):
r"""

View File

@@ -14,6 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import warnings
from collections.abc import Callable
from typing import Any, Optional, Union
@@ -171,10 +172,6 @@ class Gemma3TextConfig(Gemma2Config, PretrainedConfig):
>>> # Accessing the model configuration
>>> configuration = model.config
```
rope_local_base_freq (float, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings for local attention.
sliding_window_pattern (`int`, *optional*, defaults to 6):
Pattern for the sliding window attention.
"""
model_type = "gemma3_text"
@@ -241,15 +238,28 @@ class Gemma3TextConfig(Gemma2Config, PretrainedConfig):
self.rope_scaling = rope_scaling
rope_config_validation(self)
# BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 6)
if self.layer_types is None:
# BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
sliding_window_pattern = getattr(self, "sliding_window_pattern", 6)
self.layer_types = [
"sliding_attention" if bool((i + 1) % sliding_window_pattern) else "full_attention"
"sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention"
for i in range(self.num_hidden_layers)
]
layer_type_validation(self.layer_types)
@property
def sliding_window_pattern(self):
warnings.warn(
"The `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.",
FutureWarning,
)
return self._sliding_window_pattern
@sliding_window_pattern.setter
def sliding_window_pattern(self, value):
self._sliding_window_pattern = value
class Gemma3Config(PretrainedConfig):
r"""

View File

@@ -121,6 +121,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
do_convert_rgb: bool,
input_data_format: Optional[Union[str, ChannelDimension]],
device: Optional[Union[str, torch.device]],
disable_grouping: Optional[bool],
):
"""
Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
@@ -173,7 +174,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
resized_height, resized_width = height, width
# Group images by size for batched resizing
grouped_images, grouped_images_index = group_images_by_shape(images)
grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
resized_images_grouped = {}
for shape, stacked_images in grouped_images.items():
if do_resize:
@@ -191,7 +192,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
resized_images = reorder_images(resized_images_grouped, grouped_images_index)
# Group images by size for further processing
# Needed in case do_resize is False, or resize returns images with different sizes
grouped_images, grouped_images_index = group_images_by_shape(resized_images)
grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
processed_images_grouped = {}
for shape, stacked_images in grouped_images.items():
# Fused rescale and normalize
@@ -249,6 +250,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
device: Optional["torch.device"] = None,
disable_grouping: Optional[bool] = False,
**kwargs,
):
r"""
@@ -323,6 +325,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
do_convert_rgb=do_convert_rgb,
input_data_format=input_data_format,
device=device,
disable_grouping=disable_grouping,
)
pixel_values.extend(patches)
vision_grid_thws.append(image_grid_thw)
@@ -351,11 +354,11 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
factor = patch_size * merge_size
resized_height, resized_width = smart_resize(
t=self.temporal_patch_size,
num_frames=self.temporal_patch_size,
height=height,
width=width,
temporal_factor=self.temporal_patch_size,
factor=factor,
t_factor=self.temporal_patch_size,
)
grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
return grid_h * grid_w

View File

@@ -952,7 +952,7 @@ class Glm4vTextModel(Glm4vPreTrainedModel):
@auto_docstring
class Glm4vModel(Glm4vPreTrainedModel):
base_model_prefix = ""
_checkpoint_conversion_mapping = None
_checkpoint_conversion_mapping = {}
config_class = Glm4vConfig
_no_split_modules = ["Glm4vTextDecoderLayer", "Glm4vVisionBlock"]
@@ -1053,7 +1053,8 @@ class Glm4vModel(Glm4vPreTrainedModel):
dtype=input_ids.dtype,
device=input_ids.device,
)
image_index, video_index = 0, 0
video_group_index = 0
attention_mask = attention_mask.to(total_input_ids.device)
for i, input_ids in enumerate(total_input_ids):
input_ids = input_ids[attention_mask[i] == 1]
@@ -1083,8 +1084,6 @@ class Glm4vModel(Glm4vPreTrainedModel):
llm_pos_ids_list = []
video_frame_num = 1
image_index, video_index = 0, 0
for modality_type, start_idx, end_idx in input_type_group:
st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
@@ -1125,12 +1124,14 @@ class Glm4vModel(Glm4vPreTrainedModel):
t_index = torch.tensor(t_idx).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(1, -1, llm_grid_w).flatten()
w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(1, llm_grid_h, -1).flatten()
llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx)
video_index += 1
video_group_index += 1
if video_group_index >= video_grid_thw[video_index][0]:
video_index += 1
video_group_index = 0
video_frame_num += 1
@@ -1179,7 +1180,13 @@ class Glm4vModel(Glm4vPreTrainedModel):
The temporal, height and width of feature shape of each video in LLM.
"""
pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
# reshape video_grid_thw -> [b, 3] -> [1, h, w] * frames
temp_frames_hw = []
for t, h, w in video_grid_thw:
repeated_row = torch.tensor([1, h.item(), w.item()]).unsqueeze(0).repeat(t, 1)
temp_frames_hw.append(repeated_row)
flattened_video_grid_thw = torch.cat(temp_frames_hw, dim=0)
video_embeds = self.visual(pixel_values_videos, grid_thw=flattened_video_grid_thw)
split_sizes = (video_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
video_embeds = torch.split(video_embeds, split_sizes)
return video_embeds
@@ -1379,7 +1386,7 @@ class Glm4vCausalLMOutputWithPast(ModelOutput):
class Glm4vForConditionalGeneration(Glm4vPreTrainedModel, GenerationMixin):
_checkpoint_conversion_mapping = None
_checkpoint_conversion_mapping = {}
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):

View File

@@ -1004,7 +1004,7 @@ class Glm4vTextModel(Qwen2_5_VLTextModel):
class Glm4vModel(Qwen2_5_VLModel):
_checkpoint_conversion_mapping = None
_checkpoint_conversion_mapping = {}
_no_split_modules = ["Glm4vTextDecoderLayer", "Glm4vVisionBlock"]
def __init__(self, config):
@@ -1087,7 +1087,8 @@ class Glm4vModel(Qwen2_5_VLModel):
dtype=input_ids.dtype,
device=input_ids.device,
)
image_index, video_index = 0, 0
video_group_index = 0
attention_mask = attention_mask.to(total_input_ids.device)
for i, input_ids in enumerate(total_input_ids):
input_ids = input_ids[attention_mask[i] == 1]
@@ -1117,8 +1118,6 @@ class Glm4vModel(Qwen2_5_VLModel):
llm_pos_ids_list = []
video_frame_num = 1
image_index, video_index = 0, 0
for modality_type, start_idx, end_idx in input_type_group:
st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
@@ -1159,12 +1158,14 @@ class Glm4vModel(Qwen2_5_VLModel):
t_index = torch.tensor(t_idx).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(1, -1, llm_grid_w).flatten()
w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(1, llm_grid_h, -1).flatten()
llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx)
video_index += 1
video_group_index += 1
if video_group_index >= video_grid_thw[video_index][0]:
video_index += 1
video_group_index = 0
video_frame_num += 1
@@ -1200,6 +1201,30 @@ class Glm4vModel(Qwen2_5_VLModel):
return position_ids, mrope_position_deltas
def get_video_features(
self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
):
"""
Encodes videos into continuous embeddings that can be forwarded to the language model.
Args:
pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
The tensors corresponding to the input videos.
video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
The temporal, height and width of feature shape of each video in LLM.
"""
pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
# reshape video_grid_thw -> [b, 3] -> [1, h, w] * frames
temp_frames_hw = []
for t, h, w in video_grid_thw:
repeated_row = torch.tensor([1, h.item(), w.item()]).unsqueeze(0).repeat(t, 1)
temp_frames_hw.append(repeated_row)
flattened_video_grid_thw = torch.cat(temp_frames_hw, dim=0)
video_embeds = self.visual(pixel_values_videos, grid_thw=flattened_video_grid_thw)
split_sizes = (video_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
video_embeds = torch.split(video_embeds, split_sizes)
return video_embeds
@auto_docstring
@can_return_tuple
def forward(
@@ -1353,7 +1378,7 @@ class Glm4vCausalLMOutputWithPast(Qwen2_5_VLCausalLMOutputWithPast):
class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
_checkpoint_conversion_mapping = None
_checkpoint_conversion_mapping = {}
def forward(
self,
@@ -1661,32 +1686,38 @@ class Glm4vProcessor(Qwen2_5_VLProcessor):
video_index = 0
for i in range(len(text)):
while self.video_token in text[i]:
num_frames = len(video_grid_thw)
num_frames = video_grid_thw[video_index][0]
video_structure = ""
if hasattr(timestamps, "tolist"):
timestamps_list = timestamps.tolist()[0]
else:
timestamps_list = timestamps[0] if isinstance(timestamps[0], list) else timestamps
unique_timestamps = []
for idx in range(0, len(timestamps_list)):
unique_timestamps.append(timestamps_list[idx])
selected_timestamps = unique_timestamps[:num_frames]
while len(selected_timestamps) < num_frames:
selected_timestamps.append(selected_timestamps[-1] if selected_timestamps else 0)
for frame_idx in range(num_frames):
timestamp_sec = selected_timestamps[frame_idx]
frame_structure = f"<|begin_of_image|>{self.image_token}<|end_of_image|>{timestamp_sec}"
video_structure += frame_structure
text[i] = text[i].replace(self.video_token, video_structure, 1)
num_image_tokens = (
video_grid_thw[video_index].prod() // merge_length // video_grid_thw[video_index][0]
)
for frame_idx in range(num_frames):
if self.image_token in text[i]:
text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
video_index += 1
for frame_idx in range(len(video_grid_thw)):
if self.image_token in text[i]:
num_image_tokens = video_grid_thw[frame_idx].prod() // merge_length
text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
text[i] = text[i].replace("<|placeholder|>", self.image_token)
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])

View File

@@ -167,32 +167,38 @@ class Glm4vProcessor(ProcessorMixin):
video_index = 0
for i in range(len(text)):
while self.video_token in text[i]:
num_frames = len(video_grid_thw)
num_frames = video_grid_thw[video_index][0]
video_structure = ""
if hasattr(timestamps, "tolist"):
timestamps_list = timestamps.tolist()[0]
else:
timestamps_list = timestamps[0] if isinstance(timestamps[0], list) else timestamps
unique_timestamps = []
for idx in range(0, len(timestamps_list)):
unique_timestamps.append(timestamps_list[idx])
selected_timestamps = unique_timestamps[:num_frames]
while len(selected_timestamps) < num_frames:
selected_timestamps.append(selected_timestamps[-1] if selected_timestamps else 0)
for frame_idx in range(num_frames):
timestamp_sec = selected_timestamps[frame_idx]
frame_structure = f"<|begin_of_image|>{self.image_token}<|end_of_image|>{timestamp_sec}"
video_structure += frame_structure
text[i] = text[i].replace(self.video_token, video_structure, 1)
num_image_tokens = (
video_grid_thw[video_index].prod() // merge_length // video_grid_thw[video_index][0]
)
for frame_idx in range(num_frames):
if self.image_token in text[i]:
text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
video_index += 1
for frame_idx in range(len(video_grid_thw)):
if self.image_token in text[i]:
num_image_tokens = video_grid_thw[frame_idx].prod() // merge_length
text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
text[i] = text[i].replace("<|placeholder|>", self.image_token)
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])

View File

@@ -246,10 +246,6 @@ class Glm4vVideoProcessor(BaseVideoProcessor):
processed_grids = reorder_videos(processed_grids, grouped_videos_index)
pixel_values_videos = torch.cat(processed_videos, dim=0)
video_grid_thw = torch.tensor(processed_grids)
total_frames = video_grid_thw[0][0].item()
h = video_grid_thw[0][1].item()
w = video_grid_thw[0][2].item()
video_grid_thw = [[1, h, w] for _ in range(total_frames)]
data = {
"pixel_values_videos": pixel_values_videos,
"video_grid_thw": video_grid_thw,

View File

@@ -20,27 +20,9 @@ class RequestStatus(Enum):
try:
from opentelemetry import metrics, trace
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry import metrics
from opentelemetry.trace import Status, StatusCode, get_tracer
resource = Resource.create({"service.name": "transformers"})
metrics_exporter = PeriodicExportingMetricReader(OTLPMetricExporter(), export_interval_millis=1000)
meter_provider = MeterProvider(resource=resource, metric_readers=[metrics_exporter])
metrics.set_meter_provider(meter_provider)
trace_exporter = OTLPSpanExporter()
tracer_provider = TracerProvider(resource=resource)
tracer_provider.add_span_processor(BatchSpanProcessor(trace_exporter))
trace.set_tracer_provider(tracer_provider)
_has_opentelemetry = True
except ImportError:
_has_opentelemetry = False

View File

@@ -254,10 +254,6 @@ class Glm4vModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase)
def test_generate_from_inputs_embeds_with_static_cache(self):
pass
# The multimodal base model embeds will not match ids, due to pixel values. We can't change base test
# because in some models `pixel_values` are required. Will be fixed when we add support for merging `embeds+pixels`
# TODO: @raushan
def test_inputs_embeds(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -380,6 +376,44 @@ class Glm4vIntegrationTest(unittest.TestCase):
EXPECTED_DECODED_TEXT,
)
@slow
def test_small_model_integration_test_with_video(self):
processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking", max_image_size={"longest_edge": 50176})
model = Glm4vForConditionalGeneration.from_pretrained(
"THUDM/GLM-4.1V-9B-Thinking", torch_dtype=torch.float16, device_map="auto"
)
questions = ["Describe this video."] * 2
video_urls = [
"https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
] * 2
messages = [
[
{
"role": "user",
"content": [
{
"type": "video",
"video": video_url,
},
{"type": "text", "text": question},
],
}
]
for question, video_url in zip(questions, video_urls)
]
inputs = processor.apply_chat_template(
messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt", padding=True
).to(torch_device)
output = model.generate(**inputs, max_new_tokens=30)
EXPECTED_DECODED_TEXT = [
"\n012345Describe this video.\n<think>Got it, let's analyze the video. First, the scene is a room with a wooden floor, maybe a traditional Japanese room with tatami",
"\n012345Describe this video.\n<think>Got it, let's analyze the video. First, the scene is a room with a wooden floor, maybe a traditional Japanese room with tatami"
] # fmt: skip
self.assertEqual(
processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
@slow
def test_small_model_integration_test_expand(self):
model = Glm4vForConditionalGeneration.from_pretrained(

View File

@@ -228,7 +228,7 @@ class Glm4vVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
@unittest.skip("Skip for now, the test needs adjustment fo GLM-4.1V")
@unittest.skip("Skip for now, the test needs adjustment for GLM-4.1V")
def test_call_numpy_4_channels(self):
for video_processing_class in self.video_processor_list:
# Test that can process videos which have an arbitrary number of channels

View File

@@ -22,7 +22,7 @@ if is_torch_available():
from torch.nn.attention.flex_attention import create_block_mask
from transformers import LlamaConfig
from transformers.masking_utils import create_causal_mask
from transformers.masking_utils import create_causal_mask, find_packed_sequence_indices
# fmt: off
@@ -130,3 +130,8 @@ class MaskTest(unittest.TestCase):
# We compare the str representations, as the BlockMask objects themselves cannot easily be compared
self.assertEqual(causal_mask.to_string(), EXPECTED_BLOCK_MASK.to_string())
def test_find_packed_sequence_indices(self):
position_ids = torch.tensor([[0, 1, 2, 3, 0, 1, 0, 1, 2, 3], [0, 1, 2, 3, 4, 5, 0, 1, 2, 3]])
EXPECTED_SEQUENCE_INDICES = torch.tensor([[0, 0, 0, 0, 1, 1, 2, 2, 2, 2], [0, 0, 0, 0, 0, 0, 1, 1, 1, 1]])
self.assertTrue((find_packed_sequence_indices(position_ids) == EXPECTED_SEQUENCE_INDICES).all())