Compare commits
15 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a5923d4de7 | ||
|
|
d04a942545 | ||
|
|
37f8b0b535 | ||
|
|
d15e157d60 | ||
|
|
9873b2f917 | ||
|
|
d6f31553c0 | ||
|
|
cbac426388 | ||
|
|
c122145839 | ||
|
|
9da3f7d072 | ||
|
|
b2fbc02270 | ||
|
|
c30a6b7bd7 | ||
|
|
be7d1a9da0 | ||
|
|
6023ca8abd | ||
|
|
e7e78b2a0d | ||
|
|
4a5d5c490f |
2
setup.py
2
setup.py
@@ -457,7 +457,7 @@ install_requires = [
|
||||
|
||||
setup(
|
||||
name="transformers",
|
||||
version="4.53.1", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
|
||||
version="4.53.3", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
|
||||
author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
|
||||
author_email="transformers@huggingface.co",
|
||||
description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
# to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
|
||||
# in the namespace without actually importing anything (and especially none of the backends).
|
||||
|
||||
__version__ = "4.53.1"
|
||||
__version__ = "4.53.3"
|
||||
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
@@ -122,6 +122,11 @@ class RequestState:
|
||||
is_eos = token_id == self.eos_token_id and self.eos_token_id != -1
|
||||
is_max_len = self.generated_len() >= self.max_new_tokens
|
||||
|
||||
# Only add the token if we're not finishing due to max length
|
||||
# (EOS tokens should still be added to the output)
|
||||
if not (is_max_len and not is_eos):
|
||||
self.static_outputs.extend([token_id])
|
||||
|
||||
if is_eos or is_max_len:
|
||||
self.status = RequestStatus.FINISHED
|
||||
return True
|
||||
@@ -1011,7 +1016,6 @@ class ContinuousBatchProcessor:
|
||||
self.metrics.record_ttft_metric(state.created_time, state.request_id)
|
||||
state.status = RequestStatus.DECODING
|
||||
token = out_tokens[self.logits_indices[i]]
|
||||
state.static_outputs.extend([token])
|
||||
state.prompt_ids = [token]
|
||||
if state.update_with_token(token):
|
||||
self.metrics.record_request_completion(state.created_time, state.request_id)
|
||||
|
||||
@@ -599,7 +599,7 @@ class AttentionMaskInterface(GeneralInterface):
|
||||
ALL_MASK_ATTENTION_FUNCTIONS: AttentionMaskInterface = AttentionMaskInterface()
|
||||
|
||||
|
||||
def find_packed_sequence_indices(position_ids: torch.Tensor) -> Optional[torch.Tensor]:
|
||||
def find_packed_sequence_indices(position_ids: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Find the indices of the sequence to which each new query token in the sequence belongs when using packed
|
||||
tensor format (i.e. several sequences packed in the same batch dimension).
|
||||
@@ -713,7 +713,7 @@ def create_causal_mask(
|
||||
attention_mask: Optional[torch.Tensor],
|
||||
cache_position: torch.Tensor,
|
||||
past_key_values: Optional[Cache],
|
||||
position_ids: Optional[torch.Tensor],
|
||||
position_ids: Optional[torch.Tensor] = None,
|
||||
or_mask_function: Optional[Callable] = None,
|
||||
and_mask_function: Optional[Callable] = None,
|
||||
) -> Optional[Union[torch.Tensor, BlockMask]]:
|
||||
@@ -802,7 +802,7 @@ def create_sliding_window_causal_mask(
|
||||
attention_mask: Optional[torch.Tensor],
|
||||
cache_position: torch.Tensor,
|
||||
past_key_values: Optional[Cache],
|
||||
position_ids: Optional[torch.Tensor],
|
||||
position_ids: Optional[torch.Tensor] = None,
|
||||
or_mask_function: Optional[Callable] = None,
|
||||
and_mask_function: Optional[Callable] = None,
|
||||
) -> Optional[Union[torch.Tensor, BlockMask]]:
|
||||
@@ -897,7 +897,7 @@ def create_chunked_causal_mask(
|
||||
attention_mask: Optional[torch.Tensor],
|
||||
cache_position: torch.Tensor,
|
||||
past_key_values: Optional[Cache],
|
||||
position_ids: Optional[torch.Tensor],
|
||||
position_ids: Optional[torch.Tensor] = None,
|
||||
or_mask_function: Optional[Callable] = None,
|
||||
and_mask_function: Optional[Callable] = None,
|
||||
) -> Optional[Union[torch.Tensor, BlockMask]]:
|
||||
@@ -1006,7 +1006,7 @@ def create_masks_for_generate(
|
||||
attention_mask: Optional[torch.Tensor],
|
||||
cache_position: torch.Tensor,
|
||||
past_key_values: Optional[Cache],
|
||||
position_ids: Optional[torch.Tensor],
|
||||
position_ids: Optional[torch.Tensor] = None,
|
||||
or_mask_function: Optional[Callable] = None,
|
||||
and_mask_function: Optional[Callable] = None,
|
||||
**kwargs,
|
||||
|
||||
@@ -103,6 +103,16 @@ if is_flash_attn_2_available():
|
||||
from flash_attn.bert_padding import unpad_input as unpad_input_fa2
|
||||
from flash_attn.layers.rotary import apply_rotary_emb
|
||||
|
||||
HAS_FA2 = True
|
||||
FA_VERSION = 2
|
||||
elif is_torch_npu_available():
|
||||
# patch functions in package `flash-attn` when using flash-attention on Ascend NPU.
|
||||
from .integrations.npu_flash_attention import npu_apply_rotary_emb as apply_rotary_emb # noqa: F401
|
||||
from .integrations.npu_flash_attention import npu_flash_attn_func as flash_attn_2_func
|
||||
from .integrations.npu_flash_attention import npu_flash_attn_varlen_func as flash_attn_2_varlen_func
|
||||
from .integrations.npu_flash_attention import pad_input as pad_input_fa2
|
||||
from .integrations.npu_flash_attention import unpad_input as unpad_input_fa2
|
||||
|
||||
HAS_FA2 = True
|
||||
FA_VERSION = 2
|
||||
else:
|
||||
@@ -136,22 +146,6 @@ if FA_VERSION:
|
||||
unpad_input = globals()[f"unpad_input_fa{FA_VERSION}"]
|
||||
pad_input = globals()[f"pad_input_fa{FA_VERSION}"]
|
||||
|
||||
# patch functions in package `flash-attn` when using flash-attention on Ascend NPU.
|
||||
if is_torch_npu_available():
|
||||
from .integrations.npu_flash_attention import (
|
||||
npu_apply_rotary_emb as apply_rotary_emb, # noqa: F401
|
||||
)
|
||||
from .integrations.npu_flash_attention import (
|
||||
npu_flash_attn_func as flash_attn_func,
|
||||
)
|
||||
from .integrations.npu_flash_attention import (
|
||||
npu_flash_attn_varlen_func as flash_attn_varlen_func,
|
||||
)
|
||||
from .integrations.npu_flash_attention import (
|
||||
pad_input,
|
||||
unpad_input,
|
||||
)
|
||||
|
||||
|
||||
_flash_supports_window_size = False
|
||||
|
||||
|
||||
@@ -573,6 +573,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
|
||||
"GemmaTokenizerFast" if is_tokenizers_available() else None,
|
||||
),
|
||||
),
|
||||
("smollm3", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
|
||||
("speech_to_text_2", ("Speech2Text2Tokenizer", None)),
|
||||
("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)),
|
||||
|
||||
@@ -19,6 +19,8 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import warnings
|
||||
|
||||
from ...configuration_utils import PretrainedConfig, layer_type_validation
|
||||
from ...modeling_rope_utils import rope_config_validation
|
||||
|
||||
@@ -216,14 +218,29 @@ class Cohere2Config(PretrainedConfig):
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
|
||||
self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 4)
|
||||
|
||||
if self.layer_types is None:
|
||||
# BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
|
||||
sliding_window_pattern = getattr(self, "sliding_window_pattern", 4)
|
||||
self._sliding_window_pattern = getattr(self, "sliding_window_pattern", 4)
|
||||
self.layer_types = [
|
||||
"sliding_attention" if bool((i + 1) % sliding_window_pattern) else "full_attention"
|
||||
"sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention"
|
||||
for i in range(self.num_hidden_layers)
|
||||
]
|
||||
layer_type_validation(self.layer_types)
|
||||
|
||||
@property
|
||||
def sliding_window_pattern(self):
|
||||
warnings.warn(
|
||||
"The `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.",
|
||||
FutureWarning,
|
||||
)
|
||||
return self._sliding_window_pattern
|
||||
|
||||
@sliding_window_pattern.setter
|
||||
def sliding_window_pattern(self, value):
|
||||
self._sliding_window_pattern = value
|
||||
|
||||
|
||||
__all__ = ["Cohere2Config"]
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import warnings
|
||||
from typing import Callable, Optional
|
||||
|
||||
import torch
|
||||
@@ -238,15 +239,30 @@ class Cohere2Config(PretrainedConfig):
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
|
||||
self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 4)
|
||||
|
||||
if self.layer_types is None:
|
||||
# BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
|
||||
sliding_window_pattern = getattr(self, "sliding_window_pattern", 4)
|
||||
self._sliding_window_pattern = getattr(self, "sliding_window_pattern", 4)
|
||||
self.layer_types = [
|
||||
"sliding_attention" if bool((i + 1) % sliding_window_pattern) else "full_attention"
|
||||
"sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention"
|
||||
for i in range(self.num_hidden_layers)
|
||||
]
|
||||
layer_type_validation(self.layer_types)
|
||||
|
||||
@property
|
||||
def sliding_window_pattern(self):
|
||||
warnings.warn(
|
||||
"The `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.",
|
||||
FutureWarning,
|
||||
)
|
||||
return self._sliding_window_pattern
|
||||
|
||||
@sliding_window_pattern.setter
|
||||
def sliding_window_pattern(self, value):
|
||||
self._sliding_window_pattern = value
|
||||
|
||||
|
||||
class Cohere2RotaryEmbedding(CohereRotaryEmbedding):
|
||||
pass
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import warnings
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from ...configuration_utils import PretrainedConfig, layer_type_validation
|
||||
@@ -145,10 +146,6 @@ class Gemma3TextConfig(PretrainedConfig):
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```
|
||||
rope_local_base_freq (float, *optional*, defaults to 10000.0):
|
||||
The base period of the RoPE embeddings for local attention.
|
||||
sliding_window_pattern (`int`, *optional*, defaults to 6):
|
||||
Pattern for the sliding window attention.
|
||||
"""
|
||||
|
||||
model_type = "gemma3_text"
|
||||
@@ -230,15 +227,28 @@ class Gemma3TextConfig(PretrainedConfig):
|
||||
self.rope_scaling = rope_scaling
|
||||
rope_config_validation(self)
|
||||
|
||||
# BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
|
||||
self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 6)
|
||||
|
||||
if self.layer_types is None:
|
||||
# BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
|
||||
sliding_window_pattern = getattr(self, "sliding_window_pattern", 6)
|
||||
self.layer_types = [
|
||||
"sliding_attention" if bool((i + 1) % sliding_window_pattern) else "full_attention"
|
||||
"sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention"
|
||||
for i in range(self.num_hidden_layers)
|
||||
]
|
||||
layer_type_validation(self.layer_types)
|
||||
|
||||
@property
|
||||
def sliding_window_pattern(self):
|
||||
warnings.warn(
|
||||
"The `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.",
|
||||
FutureWarning,
|
||||
)
|
||||
return self._sliding_window_pattern
|
||||
|
||||
@sliding_window_pattern.setter
|
||||
def sliding_window_pattern(self, value):
|
||||
self._sliding_window_pattern = value
|
||||
|
||||
|
||||
class Gemma3Config(PretrainedConfig):
|
||||
r"""
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import copy
|
||||
import warnings
|
||||
from collections.abc import Callable
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
@@ -171,10 +172,6 @@ class Gemma3TextConfig(Gemma2Config, PretrainedConfig):
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```
|
||||
rope_local_base_freq (float, *optional*, defaults to 10000.0):
|
||||
The base period of the RoPE embeddings for local attention.
|
||||
sliding_window_pattern (`int`, *optional*, defaults to 6):
|
||||
Pattern for the sliding window attention.
|
||||
"""
|
||||
|
||||
model_type = "gemma3_text"
|
||||
@@ -241,15 +238,28 @@ class Gemma3TextConfig(Gemma2Config, PretrainedConfig):
|
||||
self.rope_scaling = rope_scaling
|
||||
rope_config_validation(self)
|
||||
|
||||
# BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
|
||||
self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 6)
|
||||
|
||||
if self.layer_types is None:
|
||||
# BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
|
||||
sliding_window_pattern = getattr(self, "sliding_window_pattern", 6)
|
||||
self.layer_types = [
|
||||
"sliding_attention" if bool((i + 1) % sliding_window_pattern) else "full_attention"
|
||||
"sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention"
|
||||
for i in range(self.num_hidden_layers)
|
||||
]
|
||||
layer_type_validation(self.layer_types)
|
||||
|
||||
@property
|
||||
def sliding_window_pattern(self):
|
||||
warnings.warn(
|
||||
"The `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.",
|
||||
FutureWarning,
|
||||
)
|
||||
return self._sliding_window_pattern
|
||||
|
||||
@sliding_window_pattern.setter
|
||||
def sliding_window_pattern(self, value):
|
||||
self._sliding_window_pattern = value
|
||||
|
||||
|
||||
class Gemma3Config(PretrainedConfig):
|
||||
r"""
|
||||
|
||||
@@ -121,6 +121,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
|
||||
do_convert_rgb: bool,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]],
|
||||
device: Optional[Union[str, torch.device]],
|
||||
disable_grouping: Optional[bool],
|
||||
):
|
||||
"""
|
||||
Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
|
||||
@@ -173,7 +174,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
|
||||
resized_height, resized_width = height, width
|
||||
|
||||
# Group images by size for batched resizing
|
||||
grouped_images, grouped_images_index = group_images_by_shape(images)
|
||||
grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
|
||||
resized_images_grouped = {}
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
if do_resize:
|
||||
@@ -191,7 +192,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
|
||||
resized_images = reorder_images(resized_images_grouped, grouped_images_index)
|
||||
# Group images by size for further processing
|
||||
# Needed in case do_resize is False, or resize returns images with different sizes
|
||||
grouped_images, grouped_images_index = group_images_by_shape(resized_images)
|
||||
grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
|
||||
processed_images_grouped = {}
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
# Fused rescale and normalize
|
||||
@@ -249,6 +250,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
|
||||
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
device: Optional["torch.device"] = None,
|
||||
disable_grouping: Optional[bool] = False,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
@@ -323,6 +325,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
|
||||
do_convert_rgb=do_convert_rgb,
|
||||
input_data_format=input_data_format,
|
||||
device=device,
|
||||
disable_grouping=disable_grouping,
|
||||
)
|
||||
pixel_values.extend(patches)
|
||||
vision_grid_thws.append(image_grid_thw)
|
||||
@@ -351,11 +354,11 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
|
||||
|
||||
factor = patch_size * merge_size
|
||||
resized_height, resized_width = smart_resize(
|
||||
t=self.temporal_patch_size,
|
||||
num_frames=self.temporal_patch_size,
|
||||
height=height,
|
||||
width=width,
|
||||
temporal_factor=self.temporal_patch_size,
|
||||
factor=factor,
|
||||
t_factor=self.temporal_patch_size,
|
||||
)
|
||||
grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
|
||||
return grid_h * grid_w
|
||||
|
||||
@@ -952,7 +952,7 @@ class Glm4vTextModel(Glm4vPreTrainedModel):
|
||||
@auto_docstring
|
||||
class Glm4vModel(Glm4vPreTrainedModel):
|
||||
base_model_prefix = ""
|
||||
_checkpoint_conversion_mapping = None
|
||||
_checkpoint_conversion_mapping = {}
|
||||
config_class = Glm4vConfig
|
||||
_no_split_modules = ["Glm4vTextDecoderLayer", "Glm4vVisionBlock"]
|
||||
|
||||
@@ -1053,7 +1053,8 @@ class Glm4vModel(Glm4vPreTrainedModel):
|
||||
dtype=input_ids.dtype,
|
||||
device=input_ids.device,
|
||||
)
|
||||
|
||||
image_index, video_index = 0, 0
|
||||
video_group_index = 0
|
||||
attention_mask = attention_mask.to(total_input_ids.device)
|
||||
for i, input_ids in enumerate(total_input_ids):
|
||||
input_ids = input_ids[attention_mask[i] == 1]
|
||||
@@ -1083,8 +1084,6 @@ class Glm4vModel(Glm4vPreTrainedModel):
|
||||
|
||||
llm_pos_ids_list = []
|
||||
video_frame_num = 1
|
||||
image_index, video_index = 0, 0
|
||||
|
||||
for modality_type, start_idx, end_idx in input_type_group:
|
||||
st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
|
||||
|
||||
@@ -1125,12 +1124,14 @@ class Glm4vModel(Glm4vPreTrainedModel):
|
||||
t_index = torch.tensor(t_idx).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
|
||||
|
||||
h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(1, -1, llm_grid_w).flatten()
|
||||
|
||||
w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(1, llm_grid_h, -1).flatten()
|
||||
|
||||
llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx)
|
||||
|
||||
video_index += 1
|
||||
video_group_index += 1
|
||||
|
||||
if video_group_index >= video_grid_thw[video_index][0]:
|
||||
video_index += 1
|
||||
video_group_index = 0
|
||||
|
||||
video_frame_num += 1
|
||||
|
||||
@@ -1179,7 +1180,13 @@ class Glm4vModel(Glm4vPreTrainedModel):
|
||||
The temporal, height and width of feature shape of each video in LLM.
|
||||
"""
|
||||
pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
|
||||
video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
|
||||
# reshape video_grid_thw -> [b, 3] -> [1, h, w] * frames
|
||||
temp_frames_hw = []
|
||||
for t, h, w in video_grid_thw:
|
||||
repeated_row = torch.tensor([1, h.item(), w.item()]).unsqueeze(0).repeat(t, 1)
|
||||
temp_frames_hw.append(repeated_row)
|
||||
flattened_video_grid_thw = torch.cat(temp_frames_hw, dim=0)
|
||||
video_embeds = self.visual(pixel_values_videos, grid_thw=flattened_video_grid_thw)
|
||||
split_sizes = (video_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
|
||||
video_embeds = torch.split(video_embeds, split_sizes)
|
||||
return video_embeds
|
||||
@@ -1379,7 +1386,7 @@ class Glm4vCausalLMOutputWithPast(ModelOutput):
|
||||
|
||||
|
||||
class Glm4vForConditionalGeneration(Glm4vPreTrainedModel, GenerationMixin):
|
||||
_checkpoint_conversion_mapping = None
|
||||
_checkpoint_conversion_mapping = {}
|
||||
_tied_weights_keys = ["lm_head.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
|
||||
@@ -1004,7 +1004,7 @@ class Glm4vTextModel(Qwen2_5_VLTextModel):
|
||||
|
||||
|
||||
class Glm4vModel(Qwen2_5_VLModel):
|
||||
_checkpoint_conversion_mapping = None
|
||||
_checkpoint_conversion_mapping = {}
|
||||
_no_split_modules = ["Glm4vTextDecoderLayer", "Glm4vVisionBlock"]
|
||||
|
||||
def __init__(self, config):
|
||||
@@ -1087,7 +1087,8 @@ class Glm4vModel(Qwen2_5_VLModel):
|
||||
dtype=input_ids.dtype,
|
||||
device=input_ids.device,
|
||||
)
|
||||
|
||||
image_index, video_index = 0, 0
|
||||
video_group_index = 0
|
||||
attention_mask = attention_mask.to(total_input_ids.device)
|
||||
for i, input_ids in enumerate(total_input_ids):
|
||||
input_ids = input_ids[attention_mask[i] == 1]
|
||||
@@ -1117,8 +1118,6 @@ class Glm4vModel(Qwen2_5_VLModel):
|
||||
|
||||
llm_pos_ids_list = []
|
||||
video_frame_num = 1
|
||||
image_index, video_index = 0, 0
|
||||
|
||||
for modality_type, start_idx, end_idx in input_type_group:
|
||||
st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
|
||||
|
||||
@@ -1159,12 +1158,14 @@ class Glm4vModel(Qwen2_5_VLModel):
|
||||
t_index = torch.tensor(t_idx).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
|
||||
|
||||
h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(1, -1, llm_grid_w).flatten()
|
||||
|
||||
w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(1, llm_grid_h, -1).flatten()
|
||||
|
||||
llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx)
|
||||
|
||||
video_index += 1
|
||||
video_group_index += 1
|
||||
|
||||
if video_group_index >= video_grid_thw[video_index][0]:
|
||||
video_index += 1
|
||||
video_group_index = 0
|
||||
|
||||
video_frame_num += 1
|
||||
|
||||
@@ -1200,6 +1201,30 @@ class Glm4vModel(Qwen2_5_VLModel):
|
||||
|
||||
return position_ids, mrope_position_deltas
|
||||
|
||||
def get_video_features(
|
||||
self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
|
||||
):
|
||||
"""
|
||||
Encodes videos into continuous embeddings that can be forwarded to the language model.
|
||||
|
||||
Args:
|
||||
pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
|
||||
The tensors corresponding to the input videos.
|
||||
video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
|
||||
The temporal, height and width of feature shape of each video in LLM.
|
||||
"""
|
||||
pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
|
||||
# reshape video_grid_thw -> [b, 3] -> [1, h, w] * frames
|
||||
temp_frames_hw = []
|
||||
for t, h, w in video_grid_thw:
|
||||
repeated_row = torch.tensor([1, h.item(), w.item()]).unsqueeze(0).repeat(t, 1)
|
||||
temp_frames_hw.append(repeated_row)
|
||||
flattened_video_grid_thw = torch.cat(temp_frames_hw, dim=0)
|
||||
video_embeds = self.visual(pixel_values_videos, grid_thw=flattened_video_grid_thw)
|
||||
split_sizes = (video_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
|
||||
video_embeds = torch.split(video_embeds, split_sizes)
|
||||
return video_embeds
|
||||
|
||||
@auto_docstring
|
||||
@can_return_tuple
|
||||
def forward(
|
||||
@@ -1353,7 +1378,7 @@ class Glm4vCausalLMOutputWithPast(Qwen2_5_VLCausalLMOutputWithPast):
|
||||
|
||||
|
||||
class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
|
||||
_checkpoint_conversion_mapping = None
|
||||
_checkpoint_conversion_mapping = {}
|
||||
|
||||
def forward(
|
||||
self,
|
||||
@@ -1661,32 +1686,38 @@ class Glm4vProcessor(Qwen2_5_VLProcessor):
|
||||
video_index = 0
|
||||
for i in range(len(text)):
|
||||
while self.video_token in text[i]:
|
||||
num_frames = len(video_grid_thw)
|
||||
num_frames = video_grid_thw[video_index][0]
|
||||
video_structure = ""
|
||||
|
||||
if hasattr(timestamps, "tolist"):
|
||||
timestamps_list = timestamps.tolist()[0]
|
||||
else:
|
||||
timestamps_list = timestamps[0] if isinstance(timestamps[0], list) else timestamps
|
||||
|
||||
unique_timestamps = []
|
||||
for idx in range(0, len(timestamps_list)):
|
||||
unique_timestamps.append(timestamps_list[idx])
|
||||
|
||||
selected_timestamps = unique_timestamps[:num_frames]
|
||||
while len(selected_timestamps) < num_frames:
|
||||
selected_timestamps.append(selected_timestamps[-1] if selected_timestamps else 0)
|
||||
|
||||
for frame_idx in range(num_frames):
|
||||
timestamp_sec = selected_timestamps[frame_idx]
|
||||
frame_structure = f"<|begin_of_image|>{self.image_token}<|end_of_image|>{timestamp_sec}"
|
||||
video_structure += frame_structure
|
||||
|
||||
text[i] = text[i].replace(self.video_token, video_structure, 1)
|
||||
num_image_tokens = (
|
||||
video_grid_thw[video_index].prod() // merge_length // video_grid_thw[video_index][0]
|
||||
)
|
||||
for frame_idx in range(num_frames):
|
||||
if self.image_token in text[i]:
|
||||
text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
|
||||
|
||||
video_index += 1
|
||||
|
||||
for frame_idx in range(len(video_grid_thw)):
|
||||
if self.image_token in text[i]:
|
||||
num_image_tokens = video_grid_thw[frame_idx].prod() // merge_length
|
||||
text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
|
||||
text[i] = text[i].replace("<|placeholder|>", self.image_token)
|
||||
|
||||
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
||||
self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
|
||||
|
||||
@@ -167,32 +167,38 @@ class Glm4vProcessor(ProcessorMixin):
|
||||
video_index = 0
|
||||
for i in range(len(text)):
|
||||
while self.video_token in text[i]:
|
||||
num_frames = len(video_grid_thw)
|
||||
num_frames = video_grid_thw[video_index][0]
|
||||
video_structure = ""
|
||||
|
||||
if hasattr(timestamps, "tolist"):
|
||||
timestamps_list = timestamps.tolist()[0]
|
||||
else:
|
||||
timestamps_list = timestamps[0] if isinstance(timestamps[0], list) else timestamps
|
||||
|
||||
unique_timestamps = []
|
||||
for idx in range(0, len(timestamps_list)):
|
||||
unique_timestamps.append(timestamps_list[idx])
|
||||
|
||||
selected_timestamps = unique_timestamps[:num_frames]
|
||||
while len(selected_timestamps) < num_frames:
|
||||
selected_timestamps.append(selected_timestamps[-1] if selected_timestamps else 0)
|
||||
|
||||
for frame_idx in range(num_frames):
|
||||
timestamp_sec = selected_timestamps[frame_idx]
|
||||
frame_structure = f"<|begin_of_image|>{self.image_token}<|end_of_image|>{timestamp_sec}"
|
||||
video_structure += frame_structure
|
||||
|
||||
text[i] = text[i].replace(self.video_token, video_structure, 1)
|
||||
num_image_tokens = (
|
||||
video_grid_thw[video_index].prod() // merge_length // video_grid_thw[video_index][0]
|
||||
)
|
||||
for frame_idx in range(num_frames):
|
||||
if self.image_token in text[i]:
|
||||
text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
|
||||
|
||||
video_index += 1
|
||||
|
||||
for frame_idx in range(len(video_grid_thw)):
|
||||
if self.image_token in text[i]:
|
||||
num_image_tokens = video_grid_thw[frame_idx].prod() // merge_length
|
||||
text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
|
||||
text[i] = text[i].replace("<|placeholder|>", self.image_token)
|
||||
|
||||
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
||||
self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
|
||||
|
||||
@@ -246,10 +246,6 @@ class Glm4vVideoProcessor(BaseVideoProcessor):
|
||||
processed_grids = reorder_videos(processed_grids, grouped_videos_index)
|
||||
pixel_values_videos = torch.cat(processed_videos, dim=0)
|
||||
video_grid_thw = torch.tensor(processed_grids)
|
||||
total_frames = video_grid_thw[0][0].item()
|
||||
h = video_grid_thw[0][1].item()
|
||||
w = video_grid_thw[0][2].item()
|
||||
video_grid_thw = [[1, h, w] for _ in range(total_frames)]
|
||||
data = {
|
||||
"pixel_values_videos": pixel_values_videos,
|
||||
"video_grid_thw": video_grid_thw,
|
||||
|
||||
@@ -20,27 +20,9 @@ class RequestStatus(Enum):
|
||||
|
||||
|
||||
try:
|
||||
from opentelemetry import metrics, trace
|
||||
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
|
||||
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
|
||||
from opentelemetry.sdk.metrics import MeterProvider
|
||||
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
|
||||
from opentelemetry.sdk.resources import Resource
|
||||
from opentelemetry.sdk.trace import TracerProvider
|
||||
from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
||||
from opentelemetry import metrics
|
||||
from opentelemetry.trace import Status, StatusCode, get_tracer
|
||||
|
||||
resource = Resource.create({"service.name": "transformers"})
|
||||
|
||||
metrics_exporter = PeriodicExportingMetricReader(OTLPMetricExporter(), export_interval_millis=1000)
|
||||
meter_provider = MeterProvider(resource=resource, metric_readers=[metrics_exporter])
|
||||
metrics.set_meter_provider(meter_provider)
|
||||
|
||||
trace_exporter = OTLPSpanExporter()
|
||||
tracer_provider = TracerProvider(resource=resource)
|
||||
tracer_provider.add_span_processor(BatchSpanProcessor(trace_exporter))
|
||||
trace.set_tracer_provider(tracer_provider)
|
||||
|
||||
_has_opentelemetry = True
|
||||
except ImportError:
|
||||
_has_opentelemetry = False
|
||||
|
||||
@@ -254,10 +254,6 @@ class Glm4vModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase)
|
||||
def test_generate_from_inputs_embeds_with_static_cache(self):
|
||||
pass
|
||||
|
||||
# The multimodal base model embeds will not match ids, due to pixel values. We can't change base test
|
||||
# because in some models `pixel_values` are required. Will be fixed when we add support for merging `embeds+pixels`
|
||||
# TODO: @raushan
|
||||
|
||||
def test_inputs_embeds(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
@@ -380,6 +376,44 @@ class Glm4vIntegrationTest(unittest.TestCase):
|
||||
EXPECTED_DECODED_TEXT,
|
||||
)
|
||||
|
||||
@slow
|
||||
def test_small_model_integration_test_with_video(self):
|
||||
processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking", max_image_size={"longest_edge": 50176})
|
||||
model = Glm4vForConditionalGeneration.from_pretrained(
|
||||
"THUDM/GLM-4.1V-9B-Thinking", torch_dtype=torch.float16, device_map="auto"
|
||||
)
|
||||
questions = ["Describe this video."] * 2
|
||||
video_urls = [
|
||||
"https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
|
||||
] * 2
|
||||
messages = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "video",
|
||||
"video": video_url,
|
||||
},
|
||||
{"type": "text", "text": question},
|
||||
],
|
||||
}
|
||||
]
|
||||
for question, video_url in zip(questions, video_urls)
|
||||
]
|
||||
inputs = processor.apply_chat_template(
|
||||
messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt", padding=True
|
||||
).to(torch_device)
|
||||
output = model.generate(**inputs, max_new_tokens=30)
|
||||
EXPECTED_DECODED_TEXT = [
|
||||
"\n012345Describe this video.\n<think>Got it, let's analyze the video. First, the scene is a room with a wooden floor, maybe a traditional Japanese room with tatami",
|
||||
"\n012345Describe this video.\n<think>Got it, let's analyze the video. First, the scene is a room with a wooden floor, maybe a traditional Japanese room with tatami"
|
||||
] # fmt: skip
|
||||
self.assertEqual(
|
||||
processor.batch_decode(output, skip_special_tokens=True),
|
||||
EXPECTED_DECODED_TEXT,
|
||||
)
|
||||
|
||||
@slow
|
||||
def test_small_model_integration_test_expand(self):
|
||||
model = Glm4vForConditionalGeneration.from_pretrained(
|
||||
|
||||
@@ -228,7 +228,7 @@ class Glm4vVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
|
||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
|
||||
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
@unittest.skip("Skip for now, the test needs adjustment fo GLM-4.1V")
|
||||
@unittest.skip("Skip for now, the test needs adjustment for GLM-4.1V")
|
||||
def test_call_numpy_4_channels(self):
|
||||
for video_processing_class in self.video_processor_list:
|
||||
# Test that can process videos which have an arbitrary number of channels
|
||||
|
||||
@@ -22,7 +22,7 @@ if is_torch_available():
|
||||
from torch.nn.attention.flex_attention import create_block_mask
|
||||
|
||||
from transformers import LlamaConfig
|
||||
from transformers.masking_utils import create_causal_mask
|
||||
from transformers.masking_utils import create_causal_mask, find_packed_sequence_indices
|
||||
|
||||
|
||||
# fmt: off
|
||||
@@ -130,3 +130,8 @@ class MaskTest(unittest.TestCase):
|
||||
|
||||
# We compare the str representations, as the BlockMask objects themselves cannot easily be compared
|
||||
self.assertEqual(causal_mask.to_string(), EXPECTED_BLOCK_MASK.to_string())
|
||||
|
||||
def test_find_packed_sequence_indices(self):
|
||||
position_ids = torch.tensor([[0, 1, 2, 3, 0, 1, 0, 1, 2, 3], [0, 1, 2, 3, 4, 5, 0, 1, 2, 3]])
|
||||
EXPECTED_SEQUENCE_INDICES = torch.tensor([[0, 0, 0, 0, 1, 1, 2, 2, 2, 2], [0, 0, 0, 0, 0, 0, 1, 1, 1, 1]])
|
||||
self.assertTrue((find_packed_sequence_indices(position_ids) == EXPECTED_SEQUENCE_INDICES).all())
|
||||
|
||||
Reference in New Issue
Block a user