GLM-4.5V Model Support (#39805)
* init * update * uupdate * ruff * t patch is 2 defalut not 1 * draft * back * back1 * update * config update * update using glm-41 format * add self.rope_scaling = config.rope_scaling * update config * update * remove the processor * update * fix tests * update * for test * update * update 2126 * self.rope_scaling is missing in GLM4MOE lets add it * update * update * Update modular_glm4v_moe.py * change config * update apply_multimodal_rotary_pos_emb * format * update * Delete 3-rollout_qas_thinking_answers.py * use right name * update with place holder * update * use right rotary * Update image_processing_glm4v_fast.py * rope_config_validation needs to rewrite the entire config file in modular * update * changed name * update * Update modeling_glm4v_moe.py * _init_weights shoud be add in Glm4vMoePreTrainedModel * remove use_qk_norm * Update modular_glm4v_moe.py * remove use_qk_norm as it is not use * fix style * deprecations are not needed on new models * fix merge issues --------- Co-authored-by: raushan <raushan@huggingface.co> Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> Co-authored-by: Arthur <arthur.zucker@gmail.com>
This commit is contained in:
@@ -1009,6 +1009,8 @@
|
||||
title: GIT
|
||||
- local: model_doc/glm4v
|
||||
title: glm4v
|
||||
- local: model_doc/glm4v_moe
|
||||
title: glm4v_moe
|
||||
- local: model_doc/got_ocr2
|
||||
title: GOT-OCR2
|
||||
- local: model_doc/granitevision
|
||||
|
||||
64
docs/source/en/model_doc/glm4v_moe.md
Normal file
64
docs/source/en/model_doc/glm4v_moe.md
Normal file
@@ -0,0 +1,64 @@
|
||||
<!--Copyright 2025 The ZhipuAI Inc. and The HuggingFace Inc. team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
|
||||
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white"> </div>
|
||||
</div>
|
||||
|
||||
# Glm4vMoe
|
||||
|
||||
## Overview
|
||||
|
||||
The Glm4vMoe model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
|
||||
<INSERT SHORT SUMMARY HERE>
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*<INSERT PAPER ABSTRACT HERE>*
|
||||
|
||||
Tips:
|
||||
|
||||
<INSERT TIPS ABOUT MODEL HERE>
|
||||
|
||||
This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
|
||||
The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
|
||||
|
||||
|
||||
## Glm4vMoeConfig
|
||||
|
||||
[[autodoc]] Glm4vMoeConfig
|
||||
|
||||
## Glm4vMoeTextConfig
|
||||
|
||||
[[autodoc]] Glm4vMoeTextConfig
|
||||
|
||||
## Glm4vMoeTextModel
|
||||
|
||||
[[autodoc]] Glm4vMoeTextModel
|
||||
- forward
|
||||
|
||||
## Glm4vMoeModel
|
||||
|
||||
[[autodoc]] Glm4vMoeModel
|
||||
- forward
|
||||
|
||||
## Glm4vMoeForConditionalGeneration
|
||||
|
||||
[[autodoc]] Glm4vMoeForConditionalGeneration
|
||||
- forward
|
||||
@@ -163,6 +163,8 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
|
||||
("glm4", "Glm4Config"),
|
||||
("glm4_moe", "Glm4MoeConfig"),
|
||||
("glm4v", "Glm4vConfig"),
|
||||
("glm4v_moe", "Glm4vMoeConfig"),
|
||||
("glm4v_moe_text", "Glm4vMoeTextConfig"),
|
||||
("glm4v_text", "Glm4vTextConfig"),
|
||||
("glpn", "GLPNConfig"),
|
||||
("got_ocr2", "GotOcr2Config"),
|
||||
@@ -569,6 +571,8 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
("glm4", "GLM4"),
|
||||
("glm4_moe", "Glm4MoE"),
|
||||
("glm4v", "GLM4V"),
|
||||
("glm4v_moe", "GLM4VMOE"),
|
||||
("glm4v_moe_text", "GLM4VMOE"),
|
||||
("glm4v_text", "GLM4V"),
|
||||
("glpn", "GLPN"),
|
||||
("got_ocr2", "GOT-OCR2"),
|
||||
@@ -900,6 +904,7 @@ SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict[str, str](
|
||||
("gemma3n_text", "gemma3n"),
|
||||
("gemma3n_vision", "gemma3n"),
|
||||
("glm4v_text", "glm4v"),
|
||||
("glm4v_moe_text", "glm4v_moe"),
|
||||
("idefics3_vision", "idefics3"),
|
||||
("siglip_vision_model", "siglip"),
|
||||
("aimv2_vision_model", "aimv2"),
|
||||
|
||||
@@ -165,6 +165,8 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("glm4", "Glm4Model"),
|
||||
("glm4_moe", "Glm4MoeModel"),
|
||||
("glm4v", "Glm4vModel"),
|
||||
("glm4v_moe", "Glm4vMoeModel"),
|
||||
("glm4v_moe_text", "Glm4vMoeTextModel"),
|
||||
("glm4v_text", "Glm4vTextModel"),
|
||||
("glpn", "GLPNModel"),
|
||||
("got_ocr2", "GotOcr2Model"),
|
||||
@@ -970,6 +972,7 @@ MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
|
||||
("gemma3n", "Gemma3nForConditionalGeneration"),
|
||||
("git", "GitForCausalLM"),
|
||||
("glm4v", "Glm4vForConditionalGeneration"),
|
||||
("glm4v_moe", "Glm4vMoeForConditionalGeneration"),
|
||||
("got_ocr2", "GotOcr2ForConditionalGeneration"),
|
||||
("idefics", "IdeficsForVisionText2Text"),
|
||||
("idefics2", "Idefics2ForConditionalGeneration"),
|
||||
|
||||
@@ -74,6 +74,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
|
||||
("gemma3n", "Gemma3nProcessor"),
|
||||
("git", "GitProcessor"),
|
||||
("glm4v", "Glm4vProcessor"),
|
||||
("glm4v_moe", "Glm4vProcessor"),
|
||||
("got_ocr2", "GotOcr2Processor"),
|
||||
("granite_speech", "GraniteSpeechProcessor"),
|
||||
("grounding-dino", "GroundingDinoProcessor"),
|
||||
|
||||
@@ -294,6 +294,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
|
||||
("glm4", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("glm4_moe", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("glm4v", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("glm4v_moe", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)),
|
||||
("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
|
||||
("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
|
||||
|
||||
@@ -135,6 +135,7 @@ class Glm4MoeAttention(nn.Module):
|
||||
self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
|
||||
self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
|
||||
self.scaling = self.head_dim**-0.5
|
||||
self.rope_scaling = config.rope_scaling
|
||||
self.attention_dropout = config.attention_dropout
|
||||
self.is_causal = True
|
||||
|
||||
|
||||
@@ -263,6 +263,7 @@ class Glm4MoeAttention(CohereAttention, nn.Module):
|
||||
self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
|
||||
self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
|
||||
self.scaling = self.head_dim**-0.5
|
||||
self.rope_scaling = config.rope_scaling
|
||||
self.attention_dropout = config.attention_dropout
|
||||
self.is_causal = True
|
||||
|
||||
|
||||
@@ -94,7 +94,7 @@ class Glm4vVisionConfig(PretrainedConfig):
|
||||
patch_size=14,
|
||||
rms_norm_eps=1e-05,
|
||||
spatial_merge_size=2,
|
||||
temporal_patch_size=1,
|
||||
temporal_patch_size=2,
|
||||
out_hidden_size=4096,
|
||||
intermediate_size=13696,
|
||||
initializer_range=0.02,
|
||||
|
||||
@@ -22,8 +22,6 @@ from ...image_processing_utils import (
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
from ...image_utils import (
|
||||
OPENAI_CLIP_MEAN,
|
||||
@@ -47,7 +45,6 @@ from .image_processing_glm4v import smart_resize
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
if is_torchvision_available():
|
||||
if is_torchvision_v2_available():
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
@@ -112,48 +109,44 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
|
||||
Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
|
||||
"""
|
||||
|
||||
# Group images by size for batched resizing
|
||||
grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
|
||||
resized_images_grouped = {}
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
height, width = stacked_images.shape[-2:]
|
||||
processed_images = []
|
||||
processed_grids = []
|
||||
|
||||
all_target_sizes = []
|
||||
for image in images:
|
||||
height, width = image.shape[-2:]
|
||||
resized_height, resized_width = smart_resize(
|
||||
num_frames=temporal_patch_size,
|
||||
height=height,
|
||||
width=width,
|
||||
temporal_factor=temporal_patch_size,
|
||||
factor=patch_size * merge_size,
|
||||
)
|
||||
all_target_sizes.append((resized_height, resized_width))
|
||||
|
||||
target_height = max([s[0] for s in all_target_sizes])
|
||||
target_width = max([s[1] for s in all_target_sizes])
|
||||
|
||||
for image in images:
|
||||
if do_resize:
|
||||
resized_height, resized_width = smart_resize(
|
||||
num_frames=temporal_patch_size,
|
||||
height=height,
|
||||
width=width,
|
||||
temporal_factor=temporal_patch_size,
|
||||
factor=patch_size * merge_size,
|
||||
)
|
||||
stacked_images = self.resize(
|
||||
stacked_images,
|
||||
size=SizeDict(height=resized_height, width=resized_width),
|
||||
image = self.resize(
|
||||
image,
|
||||
size=SizeDict(height=target_height, width=target_width),
|
||||
interpolation=interpolation,
|
||||
)
|
||||
resized_images_grouped[shape] = stacked_images
|
||||
resized_images = reorder_images(resized_images_grouped, grouped_images_index)
|
||||
# Group images by size for further processing
|
||||
# Needed in case do_resize is False, or resize returns images with different sizes
|
||||
grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
|
||||
processed_images_grouped = {}
|
||||
processed_grids = {}
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
resized_height, resized_width = stacked_images.shape[-2:]
|
||||
# Fused rescale and normalize
|
||||
stacked_images = self.rescale_and_normalize(
|
||||
stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
|
||||
)
|
||||
# add a temporal dimension
|
||||
patches = stacked_images.unsqueeze(1)
|
||||
if patches.shape[1] % temporal_patch_size != 0:
|
||||
repeats = patches[:, -1:].repeat(1, temporal_patch_size - 1, 1, 1, 1)
|
||||
patches = torch.cat([patches, repeats], dim=1)
|
||||
batch_size, grid_t, channel = patches.shape[:3]
|
||||
grid_t = grid_t // temporal_patch_size
|
||||
grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
|
||||
|
||||
image = self.rescale_and_normalize(
|
||||
image.unsqueeze(0), do_rescale, rescale_factor, do_normalize, image_mean, image_std
|
||||
).squeeze(0)
|
||||
|
||||
patches = image.unsqueeze(0)
|
||||
if patches.shape[0] % temporal_patch_size != 0:
|
||||
repeats = patches[-1:].repeat(temporal_patch_size - (patches.shape[0] % temporal_patch_size), 1, 1, 1)
|
||||
patches = torch.cat([patches, repeats], dim=0)
|
||||
channel = patches.shape[1]
|
||||
grid_t = patches.shape[0] // temporal_patch_size
|
||||
grid_h, grid_w = target_height // patch_size, target_width // patch_size
|
||||
patches = patches.view(
|
||||
batch_size,
|
||||
grid_t,
|
||||
temporal_patch_size,
|
||||
channel,
|
||||
@@ -164,18 +157,14 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
|
||||
merge_size,
|
||||
patch_size,
|
||||
)
|
||||
patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
|
||||
patches = patches.permute(0, 3, 6, 4, 7, 2, 1, 5, 8)
|
||||
flatten_patches = patches.reshape(
|
||||
batch_size,
|
||||
grid_t * grid_h * grid_w,
|
||||
channel * temporal_patch_size * patch_size * patch_size,
|
||||
)
|
||||
processed_images.append(flatten_patches)
|
||||
processed_grids.append([grid_t, grid_h, grid_w])
|
||||
|
||||
processed_images_grouped[shape] = flatten_patches
|
||||
processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size
|
||||
|
||||
processed_images = reorder_images(processed_images_grouped, grouped_images_index)
|
||||
processed_grids = reorder_images(processed_grids, grouped_images_index)
|
||||
pixel_values = torch.stack(processed_images, dim=0)
|
||||
image_grid_thw = torch.tensor(processed_grids)
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
|
||||
from ...utils.deprecation import deprecate_kwarg
|
||||
from ...utils.generic import check_model_inputs
|
||||
from .configuration_glm4v import Glm4vConfig, Glm4vTextConfig, Glm4vVisionConfig
|
||||
|
||||
|
||||
@@ -399,130 +399,6 @@ class Glm4vVisionBlock(GradientCheckpointingLayer):
|
||||
return hidden_states
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class Glm4vPreTrainedModel(PreTrainedModel):
|
||||
config: Glm4vConfig
|
||||
base_model_prefix = "model"
|
||||
supports_gradient_checkpointing = True
|
||||
_no_split_modules = ["Glm4vTextDecoderLayer", "Glm4vVisionBlock"]
|
||||
_skip_keys_device_placement = "past_key_values"
|
||||
_supports_flash_attn = True
|
||||
_supports_sdpa = True
|
||||
|
||||
_can_compile_fullgraph = True
|
||||
_supports_attention_backend = True
|
||||
|
||||
|
||||
class Glm4vVisionModel(Glm4vPreTrainedModel):
|
||||
config: Glm4vVisionConfig
|
||||
_no_split_modules = ["Glm4vVisionBlock"]
|
||||
|
||||
def __init__(self, config) -> None:
|
||||
super().__init__(config)
|
||||
self.spatial_merge_size = config.spatial_merge_size
|
||||
self.patch_size = config.patch_size
|
||||
|
||||
self.embeddings = Glm4vVisionEmbeddings(config)
|
||||
self.patch_embed = Glm4vVisionPatchEmbed(config)
|
||||
|
||||
head_dim = config.hidden_size // config.num_heads
|
||||
self.rotary_pos_emb = Glm4vVisionRotaryEmbedding(head_dim // 2)
|
||||
|
||||
self.blocks = nn.ModuleList([Glm4vVisionBlock(config) for _ in range(config.depth)])
|
||||
self.merger = Glm4vVisionPatchMerger(
|
||||
dim=config.out_hidden_size, context_dim=config.intermediate_size, hidden_act=config.hidden_act
|
||||
)
|
||||
|
||||
self.post_conv_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
self.downsample = nn.Conv2d(
|
||||
in_channels=config.hidden_size,
|
||||
out_channels=config.out_hidden_size,
|
||||
kernel_size=config.spatial_merge_size,
|
||||
stride=config.spatial_merge_size,
|
||||
)
|
||||
self.post_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
self.post_init()
|
||||
|
||||
def rot_pos_emb(self, grid_thw):
|
||||
pos_ids = []
|
||||
for t, h, w in grid_thw:
|
||||
hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
|
||||
hpos_ids = hpos_ids.reshape(
|
||||
h // self.spatial_merge_size,
|
||||
self.spatial_merge_size,
|
||||
w // self.spatial_merge_size,
|
||||
self.spatial_merge_size,
|
||||
)
|
||||
hpos_ids = hpos_ids.permute(0, 2, 1, 3)
|
||||
hpos_ids = hpos_ids.flatten()
|
||||
|
||||
wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
|
||||
wpos_ids = wpos_ids.reshape(
|
||||
h // self.spatial_merge_size,
|
||||
self.spatial_merge_size,
|
||||
w // self.spatial_merge_size,
|
||||
self.spatial_merge_size,
|
||||
)
|
||||
wpos_ids = wpos_ids.permute(0, 2, 1, 3)
|
||||
wpos_ids = wpos_ids.flatten()
|
||||
pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
|
||||
pos_ids = torch.cat(pos_ids, dim=0)
|
||||
max_grid_size = grid_thw[:, 1:].max()
|
||||
rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
|
||||
rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
|
||||
return rotary_pos_emb, pos_ids
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Args:
|
||||
hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
|
||||
The final hidden states of the model.
|
||||
grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
|
||||
The temporal, height and width of feature shape of each image in LLM.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`: hidden_states.
|
||||
"""
|
||||
hidden_states = self.patch_embed(hidden_states)
|
||||
hidden_states = self.post_conv_layernorm(hidden_states)
|
||||
|
||||
rotary_pos_emb, image_type_ids = self.rot_pos_emb(grid_thw)
|
||||
emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
|
||||
position_embeddings = (emb.cos(), emb.sin())
|
||||
|
||||
cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
|
||||
dim=0,
|
||||
# Select dtype based on the following factors:
|
||||
# - FA2 requires that cu_seqlens_q must have dtype int32
|
||||
# - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
|
||||
# See https://github.com/huggingface/transformers/pull/34852 for more information
|
||||
dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
|
||||
)
|
||||
cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
|
||||
seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
|
||||
hidden_states = self.embeddings(hidden_states, seqlens, grid_thw, image_type_ids[:, 0], image_type_ids[:, 1])
|
||||
|
||||
for blk in self.blocks:
|
||||
hidden_states = blk(
|
||||
hidden_states,
|
||||
cu_seqlens=cu_seqlens,
|
||||
position_embeddings=position_embeddings,
|
||||
)
|
||||
|
||||
hidden_states = self.post_layernorm(hidden_states)
|
||||
|
||||
hidden_states = hidden_states.view(
|
||||
-1, self.spatial_merge_size, self.spatial_merge_size, hidden_states.shape[-1]
|
||||
)
|
||||
hidden_states = hidden_states.permute(0, 3, 1, 2)
|
||||
hidden_states = self.downsample(hidden_states).view(-1, self.config.out_hidden_size)
|
||||
|
||||
hidden_states = self.merger(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class Glm4vTextRotaryEmbedding(nn.Module):
|
||||
inv_freq: torch.Tensor # fix linting for `register_buffer`
|
||||
|
||||
@@ -651,7 +527,6 @@ class Glm4vTextAttention(nn.Module):
|
||||
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
|
||||
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
|
||||
|
||||
@deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -659,8 +534,6 @@ class Glm4vTextAttention(nn.Module):
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
output_attentions: bool = False,
|
||||
use_cache: bool = False,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
**kwargs: Unpack[FlashAttentionKwargs],
|
||||
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
|
||||
@@ -700,7 +573,7 @@ class Glm4vTextAttention(nn.Module):
|
||||
|
||||
attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
|
||||
attn_output = self.o_proj(attn_output)
|
||||
return attn_output, attn_weights, past_key_values
|
||||
return attn_output, attn_weights
|
||||
|
||||
|
||||
class Glm4vTextMLP(nn.Module):
|
||||
@@ -732,7 +605,6 @@ class Glm4vTextDecoderLayer(GradientCheckpointingLayer):
|
||||
self.post_self_attn_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
self.post_mlp_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
|
||||
@deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -750,7 +622,7 @@ class Glm4vTextDecoderLayer(GradientCheckpointingLayer):
|
||||
hidden_states = self.input_layernorm(hidden_states)
|
||||
|
||||
# Self Attention
|
||||
hidden_states, self_attn_weights, present_key_value = self.self_attn(
|
||||
hidden_states, _ = self.self_attn(
|
||||
hidden_states=hidden_states,
|
||||
position_embeddings=position_embeddings,
|
||||
attention_mask=attention_mask,
|
||||
@@ -772,15 +644,7 @@ class Glm4vTextDecoderLayer(GradientCheckpointingLayer):
|
||||
hidden_states = self.post_mlp_layernorm(hidden_states)
|
||||
hidden_states = residual + hidden_states
|
||||
|
||||
outputs = (hidden_states,)
|
||||
|
||||
if output_attentions:
|
||||
outputs += (self_attn_weights,)
|
||||
|
||||
if use_cache:
|
||||
outputs += (present_key_value,)
|
||||
|
||||
return outputs
|
||||
return hidden_states
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -808,6 +672,134 @@ class Glm4vModelOutputWithPast(ModelOutput):
|
||||
rope_deltas: Optional[torch.LongTensor] = None
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class Glm4vPreTrainedModel(PreTrainedModel):
|
||||
config: Glm4vConfig
|
||||
base_model_prefix = "model"
|
||||
supports_gradient_checkpointing = True
|
||||
_no_split_modules = ["Glm4vTextDecoderLayer", "Glm4vVisionBlock"]
|
||||
_skip_keys_device_placement = "past_key_values"
|
||||
_supports_flash_attn = True
|
||||
_supports_sdpa = True
|
||||
|
||||
_can_compile_fullgraph = True
|
||||
_supports_attention_backend = True
|
||||
_can_record_outputs = {
|
||||
"hidden_states": Glm4vTextDecoderLayer,
|
||||
"attentions": Glm4vTextAttention,
|
||||
}
|
||||
|
||||
|
||||
class Glm4vVisionModel(Glm4vPreTrainedModel):
|
||||
config: Glm4vVisionConfig
|
||||
_no_split_modules = ["Glm4vVisionBlock"]
|
||||
|
||||
def __init__(self, config) -> None:
|
||||
super().__init__(config)
|
||||
self.spatial_merge_size = config.spatial_merge_size
|
||||
self.patch_size = config.patch_size
|
||||
|
||||
self.embeddings = Glm4vVisionEmbeddings(config)
|
||||
self.patch_embed = Glm4vVisionPatchEmbed(config)
|
||||
|
||||
head_dim = config.hidden_size // config.num_heads
|
||||
self.rotary_pos_emb = Glm4vVisionRotaryEmbedding(head_dim // 2)
|
||||
|
||||
self.blocks = nn.ModuleList([Glm4vVisionBlock(config) for _ in range(config.depth)])
|
||||
self.merger = Glm4vVisionPatchMerger(
|
||||
dim=config.out_hidden_size, context_dim=config.intermediate_size, hidden_act=config.hidden_act
|
||||
)
|
||||
|
||||
self.post_conv_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
self.downsample = nn.Conv2d(
|
||||
in_channels=config.hidden_size,
|
||||
out_channels=config.out_hidden_size,
|
||||
kernel_size=config.spatial_merge_size,
|
||||
stride=config.spatial_merge_size,
|
||||
)
|
||||
self.post_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
self.post_init()
|
||||
|
||||
def rot_pos_emb(self, grid_thw):
|
||||
pos_ids = []
|
||||
for t, h, w in grid_thw:
|
||||
hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
|
||||
hpos_ids = hpos_ids.reshape(
|
||||
h // self.spatial_merge_size,
|
||||
self.spatial_merge_size,
|
||||
w // self.spatial_merge_size,
|
||||
self.spatial_merge_size,
|
||||
)
|
||||
hpos_ids = hpos_ids.permute(0, 2, 1, 3)
|
||||
hpos_ids = hpos_ids.flatten()
|
||||
|
||||
wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
|
||||
wpos_ids = wpos_ids.reshape(
|
||||
h // self.spatial_merge_size,
|
||||
self.spatial_merge_size,
|
||||
w // self.spatial_merge_size,
|
||||
self.spatial_merge_size,
|
||||
)
|
||||
wpos_ids = wpos_ids.permute(0, 2, 1, 3)
|
||||
wpos_ids = wpos_ids.flatten()
|
||||
pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
|
||||
pos_ids = torch.cat(pos_ids, dim=0)
|
||||
max_grid_size = grid_thw[:, 1:].max()
|
||||
rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
|
||||
rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
|
||||
return rotary_pos_emb, pos_ids
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Args:
|
||||
hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
|
||||
The final hidden states of the model.
|
||||
grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
|
||||
The temporal, height and width of feature shape of each image in LLM.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`: hidden_states.
|
||||
"""
|
||||
hidden_states = self.patch_embed(hidden_states)
|
||||
hidden_states = self.post_conv_layernorm(hidden_states)
|
||||
|
||||
rotary_pos_emb, image_type_ids = self.rot_pos_emb(grid_thw)
|
||||
emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
|
||||
position_embeddings = (emb.cos(), emb.sin())
|
||||
|
||||
cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
|
||||
dim=0,
|
||||
# Select dtype based on the following factors:
|
||||
# - FA2 requires that cu_seqlens_q must have dtype int32
|
||||
# - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
|
||||
# See https://github.com/huggingface/transformers/pull/34852 for more information
|
||||
dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
|
||||
)
|
||||
cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
|
||||
seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
|
||||
hidden_states = self.embeddings(hidden_states, seqlens, grid_thw, image_type_ids[:, 0], image_type_ids[:, 1])
|
||||
|
||||
for blk in self.blocks:
|
||||
hidden_states = blk(
|
||||
hidden_states,
|
||||
cu_seqlens=cu_seqlens,
|
||||
position_embeddings=position_embeddings,
|
||||
)
|
||||
|
||||
hidden_states = self.post_layernorm(hidden_states)
|
||||
|
||||
hidden_states = hidden_states.view(
|
||||
-1, self.spatial_merge_size, self.spatial_merge_size, hidden_states.shape[-1]
|
||||
)
|
||||
hidden_states = hidden_states.permute(0, 3, 1, 2)
|
||||
hidden_states = self.downsample(hidden_states).view(-1, self.config.out_hidden_size)
|
||||
|
||||
hidden_states = self.merger(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class Glm4vTextModel(Glm4vPreTrainedModel):
|
||||
config: Glm4vTextConfig
|
||||
@@ -829,7 +821,7 @@ class Glm4vTextModel(Glm4vPreTrainedModel):
|
||||
self.post_init()
|
||||
|
||||
@auto_docstring
|
||||
@can_return_tuple
|
||||
@check_model_inputs
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
@@ -838,27 +830,12 @@ class Glm4vTextModel(Glm4vPreTrainedModel):
|
||||
past_key_values: Optional[list[torch.FloatTensor]] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
**kwargs: Unpack[FlashAttentionKwargs],
|
||||
) -> Union[tuple, BaseModelOutputWithPast]:
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
||||
|
||||
if (input_ids is None) ^ (inputs_embeds is not None):
|
||||
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
if use_cache:
|
||||
logger.warning_once(
|
||||
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
|
||||
)
|
||||
use_cache = False
|
||||
|
||||
# torch.jit.trace() doesn't support cache objects in the output
|
||||
if use_cache and past_key_values is None and not torch.jit.is_tracing():
|
||||
past_key_values = DynamicCache()
|
||||
@@ -892,42 +869,23 @@ class Glm4vTextModel(Glm4vPreTrainedModel):
|
||||
# create position embeddings to be shared across the decoder layers
|
||||
position_embeddings = self.rotary_emb(hidden_states, position_ids)
|
||||
|
||||
# decoder layers
|
||||
all_hidden_states = () if output_hidden_states else None
|
||||
all_self_attns = () if output_attentions else None
|
||||
|
||||
for decoder_layer in self.layers:
|
||||
if output_hidden_states:
|
||||
all_hidden_states += (hidden_states,)
|
||||
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
position_embeddings=position_embeddings,
|
||||
attention_mask=causal_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
if output_attentions:
|
||||
all_self_attns += (layer_outputs[1],)
|
||||
hidden_states = layer_outputs
|
||||
|
||||
hidden_states = self.norm(hidden_states)
|
||||
|
||||
# add hidden states from the last decoder layer
|
||||
if output_hidden_states:
|
||||
all_hidden_states += (hidden_states,)
|
||||
|
||||
return BaseModelOutputWithPast(
|
||||
last_hidden_state=hidden_states,
|
||||
past_key_values=past_key_values if use_cache else None,
|
||||
hidden_states=all_hidden_states,
|
||||
attentions=all_self_attns,
|
||||
past_key_values=past_key_values,
|
||||
)
|
||||
|
||||
|
||||
@@ -1210,8 +1168,9 @@ class Glm4vModel(Glm4vPreTrainedModel):
|
||||
)
|
||||
special_video_mask = special_video_mask.all(-1)
|
||||
else:
|
||||
# GLM-4.1V and GLM-4.5V special_video_mask is special_image_mask
|
||||
special_image_mask = input_ids == self.config.image_token_id
|
||||
special_video_mask = input_ids == self.config.video_token_id
|
||||
special_video_mask = input_ids == self.config.image_token_id
|
||||
|
||||
n_image_tokens = special_image_mask.sum()
|
||||
special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
|
||||
@@ -1238,9 +1197,6 @@ class Glm4vModel(Glm4vPreTrainedModel):
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[list[torch.FloatTensor]] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
pixel_values: Optional[torch.Tensor] = None,
|
||||
pixel_values_videos: Optional[torch.FloatTensor] = None,
|
||||
image_grid_thw: Optional[torch.LongTensor] = None,
|
||||
@@ -1257,12 +1213,6 @@ class Glm4vModel(Glm4vPreTrainedModel):
|
||||
rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
|
||||
The rope index difference between sequence length and multimodal rope.
|
||||
"""
|
||||
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
|
||||
if (input_ids is None) ^ (inputs_embeds is not None):
|
||||
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
|
||||
|
||||
@@ -1333,10 +1283,6 @@ class Glm4vModel(Glm4vPreTrainedModel):
|
||||
attention_mask=attention_mask,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=True,
|
||||
cache_position=cache_position,
|
||||
**kwargs,
|
||||
)
|
||||
@@ -1430,10 +1376,6 @@ class Glm4vForConditionalGeneration(Glm4vPreTrainedModel, GenerationMixin):
|
||||
past_key_values: Optional[list[torch.FloatTensor]] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
labels: Optional[torch.LongTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
pixel_values: Optional[torch.Tensor] = None,
|
||||
pixel_values_videos: Optional[torch.FloatTensor] = None,
|
||||
image_grid_thw: Optional[torch.LongTensor] = None,
|
||||
@@ -1485,12 +1427,6 @@ class Glm4vForConditionalGeneration(Glm4vPreTrainedModel, GenerationMixin):
|
||||
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
||||
"The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
|
||||
```"""
|
||||
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
|
||||
outputs = self.model(
|
||||
input_ids=input_ids,
|
||||
pixel_values=pixel_values,
|
||||
@@ -1501,9 +1437,6 @@ class Glm4vForConditionalGeneration(Glm4vPreTrainedModel, GenerationMixin):
|
||||
attention_mask=attention_mask,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
cache_position=cache_position,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@@ -19,7 +19,6 @@ import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.utils.checkpoint
|
||||
from torch.nn import LayerNorm
|
||||
|
||||
from ...activations import ACT2FN
|
||||
@@ -36,7 +35,7 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
|
||||
from ...processing_utils import ImagesKwargs, Unpack
|
||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
|
||||
from ...utils.deprecation import deprecate_kwarg
|
||||
from ...utils.generic import check_model_inputs
|
||||
from ...video_utils import VideoInput
|
||||
from ..glm4.modeling_glm4 import Glm4MLP, Glm4RMSNorm, eager_attention_forward
|
||||
from ..qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLConfig
|
||||
@@ -136,7 +135,7 @@ class Glm4vVisionConfig(PretrainedConfig):
|
||||
patch_size=14,
|
||||
rms_norm_eps=1e-05,
|
||||
spatial_merge_size=2,
|
||||
temporal_patch_size=1,
|
||||
temporal_patch_size=2,
|
||||
out_hidden_size=4096,
|
||||
intermediate_size=13696,
|
||||
initializer_range=0.02,
|
||||
@@ -523,8 +522,216 @@ class Glm4vVisionBlock(Qwen2_5_VLVisionBlock):
|
||||
self.mlp = Glm4VisionMlp(config, bias=False)
|
||||
|
||||
|
||||
class Glm4vTextRotaryEmbedding(Qwen2_5_VLRotaryEmbedding):
|
||||
pass
|
||||
|
||||
|
||||
def rotate_half_llm(x):
|
||||
"""Rotates half the hidden dims of the input."""
|
||||
x1 = x[..., 0::2]
|
||||
x2 = x[..., 1::2]
|
||||
return torch.stack((-x2, x1), dim=-1).flatten(-2)
|
||||
|
||||
|
||||
def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
|
||||
"""Applies Rotary Position Embedding with Multimodal Sections to the query and key tensors (https://qwenlm.github.io/blog/qwen2-vl/).
|
||||
|
||||
Explanation:
|
||||
Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding
|
||||
sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For
|
||||
vision embedding part, we apply rotary position embedding on temporal, height and width dimension separately.
|
||||
Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding.
|
||||
For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal,
|
||||
height and width) of text embedding is always the same, so the text embedding rotary position embedding has no
|
||||
difference with modern LLMs.
|
||||
|
||||
Args:
|
||||
q (`torch.Tensor`): The query tensor.
|
||||
k (`torch.Tensor`): The key tensor.
|
||||
cos (`torch.Tensor`): The cosine part of the rotary embedding.
|
||||
sin (`torch.Tensor`): The sine part of the rotary embedding.
|
||||
mrope_section(`List(int)`):
|
||||
Multimodal rope section is for channel dimension of temporal, height and width in rope calculation.
|
||||
unsqueeze_dim (`int`, *optional*, defaults to 1):
|
||||
The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
|
||||
sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
|
||||
that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
|
||||
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
|
||||
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
|
||||
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
|
||||
Returns:
|
||||
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
|
||||
"""
|
||||
mrope_section = mrope_section * 2
|
||||
cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(
|
||||
unsqueeze_dim
|
||||
)
|
||||
sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(
|
||||
unsqueeze_dim
|
||||
)
|
||||
|
||||
# Interleave them instead of usual shape
|
||||
cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1)
|
||||
sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1)
|
||||
|
||||
# Keep half or full tensor for later concatenation
|
||||
rotary_dim = cos.shape[-1]
|
||||
q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
|
||||
k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
|
||||
|
||||
# Apply rotary embeddings on the first half or full tensor
|
||||
q_embed = (q_rot * cos) + (rotate_half_llm(q_rot) * sin)
|
||||
k_embed = (k_rot * cos) + (rotate_half_llm(k_rot) * sin)
|
||||
|
||||
# Concatenate back to full shape
|
||||
q_embed = torch.cat([q_embed, q_pass], dim=-1)
|
||||
k_embed = torch.cat([k_embed, k_pass], dim=-1)
|
||||
|
||||
return q_embed, k_embed
|
||||
|
||||
|
||||
class Glm4vTextAttention(nn.Module):
|
||||
"""
|
||||
Multi-headed attention from 'Attention Is All You Need' paper.
|
||||
and "Generating Long Sequences with Sparse Transformers".
|
||||
"""
|
||||
|
||||
def __init__(self, config: Glm4vTextConfig, layer_idx: Optional[int] = None):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.layer_idx = layer_idx
|
||||
|
||||
self.hidden_size = config.hidden_size
|
||||
self.num_heads = config.num_attention_heads
|
||||
self.head_dim = self.hidden_size // self.num_heads
|
||||
self.num_key_value_heads = config.num_key_value_heads
|
||||
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
|
||||
self.is_causal = True
|
||||
self.attention_dropout = config.attention_dropout
|
||||
self.rope_scaling = config.rope_scaling
|
||||
self.scaling = self.head_dim**-0.5
|
||||
|
||||
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
|
||||
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
|
||||
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
|
||||
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
position_embeddings: tuple[torch.Tensor, torch.Tensor],
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
**kwargs: Unpack[FlashAttentionKwargs],
|
||||
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
|
||||
bsz, q_len, _ = hidden_states.size()
|
||||
|
||||
query_states = self.q_proj(hidden_states)
|
||||
key_states = self.k_proj(hidden_states)
|
||||
value_states = self.v_proj(hidden_states)
|
||||
|
||||
query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
|
||||
key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
|
||||
value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
|
||||
|
||||
cos, sin = position_embeddings
|
||||
query_states, key_states = apply_multimodal_rotary_pos_emb( # diff with Llama
|
||||
query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
|
||||
)
|
||||
|
||||
if past_key_values is not None:
|
||||
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
|
||||
key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
||||
|
||||
attention_interface: Callable = eager_attention_forward
|
||||
if self.config._attn_implementation != "eager":
|
||||
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||
|
||||
attn_output, attn_weights = attention_interface(
|
||||
self,
|
||||
query_states,
|
||||
key_states,
|
||||
value_states,
|
||||
attention_mask,
|
||||
dropout=0.0 if not self.training else self.attention_dropout,
|
||||
scaling=self.scaling,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
|
||||
attn_output = self.o_proj(attn_output)
|
||||
return attn_output, attn_weights
|
||||
|
||||
|
||||
class Glm4vTextMLP(Glm4MLP):
|
||||
pass
|
||||
|
||||
|
||||
class Glm4vTextDecoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: Glm4vTextConfig, layer_idx: int):
|
||||
super().__init__()
|
||||
self.hidden_size = config.hidden_size
|
||||
self.self_attn = Glm4vTextAttention(config, layer_idx)
|
||||
self.mlp = Glm4vTextMLP(config)
|
||||
self.input_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
self.post_attention_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
self.post_self_attn_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
self.post_mlp_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
position_embeddings: tuple[torch.Tensor, torch.Tensor],
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[tuple[torch.Tensor]] = None,
|
||||
output_attentions: Optional[bool] = False,
|
||||
use_cache: Optional[bool] = False,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
**kwargs,
|
||||
) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
|
||||
residual = hidden_states
|
||||
|
||||
hidden_states = self.input_layernorm(hidden_states)
|
||||
|
||||
# Self Attention
|
||||
hidden_states, _ = self.self_attn(
|
||||
hidden_states=hidden_states,
|
||||
position_embeddings=position_embeddings,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
hidden_states = self.post_self_attn_layernorm(hidden_states)
|
||||
hidden_states = residual + hidden_states
|
||||
|
||||
# Fully Connected
|
||||
residual = hidden_states
|
||||
hidden_states = self.post_attention_layernorm(hidden_states)
|
||||
hidden_states = self.mlp(hidden_states)
|
||||
hidden_states = self.post_mlp_layernorm(hidden_states)
|
||||
hidden_states = residual + hidden_states
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class Glm4vModelOutputWithPast(Qwen2_5_VLModelOutputWithPast):
|
||||
pass
|
||||
|
||||
|
||||
class Glm4vPreTrainedModel(Qwen2_5_VLPreTrainedModel):
|
||||
_no_split_modules = ["Glm4vTextDecoderLayer", "Glm4vVisionBlock"]
|
||||
_can_record_outputs = {
|
||||
"hidden_states": Glm4vTextDecoderLayer,
|
||||
"attentions": Glm4vTextAttention,
|
||||
}
|
||||
|
||||
|
||||
class Glm4vVisionModel(Glm4vPreTrainedModel):
|
||||
@@ -637,222 +844,6 @@ class Glm4vVisionModel(Glm4vPreTrainedModel):
|
||||
return hidden_states
|
||||
|
||||
|
||||
class Glm4vTextRotaryEmbedding(Qwen2_5_VLRotaryEmbedding):
|
||||
pass
|
||||
|
||||
|
||||
def rotate_half_llm(x):
|
||||
"""Rotates half the hidden dims of the input."""
|
||||
x1 = x[..., 0::2]
|
||||
x2 = x[..., 1::2]
|
||||
return torch.stack((-x2, x1), dim=-1).flatten(-2)
|
||||
|
||||
|
||||
def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
|
||||
"""Applies Rotary Position Embedding with Multimodal Sections to the query and key tensors (https://qwenlm.github.io/blog/qwen2-vl/).
|
||||
|
||||
Explanation:
|
||||
Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding
|
||||
sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For
|
||||
vision embedding part, we apply rotary position embedding on temporal, height and width dimension separately.
|
||||
Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding.
|
||||
For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal,
|
||||
height and width) of text embedding is always the same, so the text embedding rotary position embedding has no
|
||||
difference with modern LLMs.
|
||||
|
||||
Args:
|
||||
q (`torch.Tensor`): The query tensor.
|
||||
k (`torch.Tensor`): The key tensor.
|
||||
cos (`torch.Tensor`): The cosine part of the rotary embedding.
|
||||
sin (`torch.Tensor`): The sine part of the rotary embedding.
|
||||
mrope_section(`List(int)`):
|
||||
Multimodal rope section is for channel dimension of temporal, height and width in rope calculation.
|
||||
unsqueeze_dim (`int`, *optional*, defaults to 1):
|
||||
The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
|
||||
sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
|
||||
that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
|
||||
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
|
||||
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
|
||||
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
|
||||
Returns:
|
||||
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
|
||||
"""
|
||||
mrope_section = mrope_section * 2
|
||||
cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(
|
||||
unsqueeze_dim
|
||||
)
|
||||
sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(
|
||||
unsqueeze_dim
|
||||
)
|
||||
|
||||
# Interleave them instead of usual shape
|
||||
cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1)
|
||||
sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1)
|
||||
|
||||
# Keep half or full tensor for later concatenation
|
||||
rotary_dim = cos.shape[-1]
|
||||
q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
|
||||
k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
|
||||
|
||||
# Apply rotary embeddings on the first half or full tensor
|
||||
q_embed = (q_rot * cos) + (rotate_half_llm(q_rot) * sin)
|
||||
k_embed = (k_rot * cos) + (rotate_half_llm(k_rot) * sin)
|
||||
|
||||
# Concatenate back to full shape
|
||||
q_embed = torch.cat([q_embed, q_pass], dim=-1)
|
||||
k_embed = torch.cat([k_embed, k_pass], dim=-1)
|
||||
|
||||
return q_embed, k_embed
|
||||
|
||||
|
||||
class Glm4vTextAttention(nn.Module):
|
||||
"""
|
||||
Multi-headed attention from 'Attention Is All You Need' paper.
|
||||
and "Generating Long Sequences with Sparse Transformers".
|
||||
"""
|
||||
|
||||
def __init__(self, config: Glm4vTextConfig, layer_idx: Optional[int] = None):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.layer_idx = layer_idx
|
||||
|
||||
self.hidden_size = config.hidden_size
|
||||
self.num_heads = config.num_attention_heads
|
||||
self.head_dim = self.hidden_size // self.num_heads
|
||||
self.num_key_value_heads = config.num_key_value_heads
|
||||
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
|
||||
self.is_causal = True
|
||||
self.attention_dropout = config.attention_dropout
|
||||
self.rope_scaling = config.rope_scaling
|
||||
self.scaling = self.head_dim**-0.5
|
||||
|
||||
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
|
||||
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
|
||||
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
|
||||
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
|
||||
|
||||
@deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
position_embeddings: tuple[torch.Tensor, torch.Tensor],
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
output_attentions: bool = False,
|
||||
use_cache: bool = False,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
**kwargs: Unpack[FlashAttentionKwargs],
|
||||
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
|
||||
bsz, q_len, _ = hidden_states.size()
|
||||
|
||||
query_states = self.q_proj(hidden_states)
|
||||
key_states = self.k_proj(hidden_states)
|
||||
value_states = self.v_proj(hidden_states)
|
||||
|
||||
query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
|
||||
key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
|
||||
value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
|
||||
|
||||
cos, sin = position_embeddings
|
||||
query_states, key_states = apply_multimodal_rotary_pos_emb( # diff with Llama
|
||||
query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
|
||||
)
|
||||
|
||||
if past_key_values is not None:
|
||||
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
|
||||
key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
||||
|
||||
attention_interface: Callable = eager_attention_forward
|
||||
if self.config._attn_implementation != "eager":
|
||||
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||
|
||||
attn_output, attn_weights = attention_interface(
|
||||
self,
|
||||
query_states,
|
||||
key_states,
|
||||
value_states,
|
||||
attention_mask,
|
||||
dropout=0.0 if not self.training else self.attention_dropout,
|
||||
scaling=self.scaling,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
|
||||
attn_output = self.o_proj(attn_output)
|
||||
return attn_output, attn_weights, past_key_values
|
||||
|
||||
|
||||
class Glm4vTextMLP(Glm4MLP):
|
||||
pass
|
||||
|
||||
|
||||
class Glm4vTextDecoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: Glm4vTextConfig, layer_idx: int):
|
||||
super().__init__()
|
||||
self.hidden_size = config.hidden_size
|
||||
self.self_attn = Glm4vTextAttention(config, layer_idx)
|
||||
self.mlp = Glm4vTextMLP(config)
|
||||
self.input_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
self.post_attention_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
self.post_self_attn_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
self.post_mlp_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
|
||||
@deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
position_embeddings: tuple[torch.Tensor, torch.Tensor],
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[tuple[torch.Tensor]] = None,
|
||||
output_attentions: Optional[bool] = False,
|
||||
use_cache: Optional[bool] = False,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
**kwargs,
|
||||
) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
|
||||
residual = hidden_states
|
||||
|
||||
hidden_states = self.input_layernorm(hidden_states)
|
||||
|
||||
# Self Attention
|
||||
hidden_states, self_attn_weights, present_key_value = self.self_attn(
|
||||
hidden_states=hidden_states,
|
||||
position_embeddings=position_embeddings,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
hidden_states = self.post_self_attn_layernorm(hidden_states)
|
||||
hidden_states = residual + hidden_states
|
||||
|
||||
# Fully Connected
|
||||
residual = hidden_states
|
||||
hidden_states = self.post_attention_layernorm(hidden_states)
|
||||
hidden_states = self.mlp(hidden_states)
|
||||
hidden_states = self.post_mlp_layernorm(hidden_states)
|
||||
hidden_states = residual + hidden_states
|
||||
|
||||
outputs = (hidden_states,)
|
||||
|
||||
if output_attentions:
|
||||
outputs += (self_attn_weights,)
|
||||
|
||||
if use_cache:
|
||||
outputs += (present_key_value,)
|
||||
|
||||
return outputs
|
||||
|
||||
|
||||
class Glm4vModelOutputWithPast(Qwen2_5_VLModelOutputWithPast):
|
||||
pass
|
||||
|
||||
|
||||
class Glm4vTextModel(Qwen2_5_VLTextModel):
|
||||
def __init__(self, config: Glm4vTextConfig):
|
||||
super().__init__(config)
|
||||
@@ -865,7 +856,7 @@ class Glm4vTextModel(Qwen2_5_VLTextModel):
|
||||
del self.has_sliding_layers
|
||||
|
||||
@auto_docstring
|
||||
@can_return_tuple
|
||||
@check_model_inputs
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
@@ -874,27 +865,12 @@ class Glm4vTextModel(Qwen2_5_VLTextModel):
|
||||
past_key_values: Optional[list[torch.FloatTensor]] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
**kwargs: Unpack[FlashAttentionKwargs],
|
||||
) -> Union[tuple, BaseModelOutputWithPast]:
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
||||
|
||||
if (input_ids is None) ^ (inputs_embeds is not None):
|
||||
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
if use_cache:
|
||||
logger.warning_once(
|
||||
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
|
||||
)
|
||||
use_cache = False
|
||||
|
||||
# torch.jit.trace() doesn't support cache objects in the output
|
||||
if use_cache and past_key_values is None and not torch.jit.is_tracing():
|
||||
past_key_values = DynamicCache()
|
||||
@@ -928,42 +904,23 @@ class Glm4vTextModel(Qwen2_5_VLTextModel):
|
||||
# create position embeddings to be shared across the decoder layers
|
||||
position_embeddings = self.rotary_emb(hidden_states, position_ids)
|
||||
|
||||
# decoder layers
|
||||
all_hidden_states = () if output_hidden_states else None
|
||||
all_self_attns = () if output_attentions else None
|
||||
|
||||
for decoder_layer in self.layers:
|
||||
if output_hidden_states:
|
||||
all_hidden_states += (hidden_states,)
|
||||
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
position_embeddings=position_embeddings,
|
||||
attention_mask=causal_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
if output_attentions:
|
||||
all_self_attns += (layer_outputs[1],)
|
||||
hidden_states = layer_outputs
|
||||
|
||||
hidden_states = self.norm(hidden_states)
|
||||
|
||||
# add hidden states from the last decoder layer
|
||||
if output_hidden_states:
|
||||
all_hidden_states += (hidden_states,)
|
||||
|
||||
return BaseModelOutputWithPast(
|
||||
last_hidden_state=hidden_states,
|
||||
past_key_values=past_key_values if use_cache else None,
|
||||
hidden_states=all_hidden_states,
|
||||
attentions=all_self_attns,
|
||||
past_key_values=past_key_values,
|
||||
)
|
||||
|
||||
|
||||
@@ -1189,6 +1146,47 @@ class Glm4vModel(Qwen2_5_VLModel):
|
||||
video_embeds = torch.split(video_embeds, split_sizes)
|
||||
return video_embeds
|
||||
|
||||
def get_placeholder_mask(
|
||||
self,
|
||||
input_ids: torch.LongTensor,
|
||||
inputs_embeds: torch.FloatTensor,
|
||||
image_features: torch.FloatTensor = None,
|
||||
video_features: torch.FloatTensor = None,
|
||||
):
|
||||
"""
|
||||
Obtains multimodal placeholdr mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
|
||||
equal to the length of multimodal features. If the lengths are different, an error is raised.
|
||||
"""
|
||||
if input_ids is None:
|
||||
special_image_mask = inputs_embeds == self.get_input_embeddings()(
|
||||
torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
|
||||
)
|
||||
special_image_mask = special_image_mask.all(-1)
|
||||
special_video_mask = inputs_embeds == self.get_input_embeddings()(
|
||||
torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
|
||||
)
|
||||
special_video_mask = special_video_mask.all(-1)
|
||||
else:
|
||||
# GLM-4.1V and GLM-4.5V special_video_mask is special_image_mask
|
||||
special_image_mask = input_ids == self.config.image_token_id
|
||||
special_video_mask = input_ids == self.config.image_token_id
|
||||
|
||||
n_image_tokens = special_image_mask.sum()
|
||||
special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
|
||||
if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel():
|
||||
raise ValueError(
|
||||
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}"
|
||||
)
|
||||
|
||||
n_video_tokens = special_video_mask.sum()
|
||||
special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
|
||||
if video_features is not None and inputs_embeds[special_video_mask].numel() != video_features.numel():
|
||||
raise ValueError(
|
||||
f"Videos features and video tokens do not match: tokens: {n_video_tokens}, features {video_features.shape[0]}"
|
||||
)
|
||||
|
||||
return special_image_mask, special_video_mask
|
||||
|
||||
@auto_docstring
|
||||
@can_return_tuple
|
||||
def forward(
|
||||
@@ -1198,9 +1196,6 @@ class Glm4vModel(Qwen2_5_VLModel):
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[list[torch.FloatTensor]] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
pixel_values: Optional[torch.Tensor] = None,
|
||||
pixel_values_videos: Optional[torch.FloatTensor] = None,
|
||||
image_grid_thw: Optional[torch.LongTensor] = None,
|
||||
@@ -1217,12 +1212,6 @@ class Glm4vModel(Qwen2_5_VLModel):
|
||||
rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
|
||||
The rope index difference between sequence length and multimodal rope.
|
||||
"""
|
||||
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
|
||||
if (input_ids is None) ^ (inputs_embeds is not None):
|
||||
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
|
||||
|
||||
@@ -1293,10 +1282,6 @@ class Glm4vModel(Qwen2_5_VLModel):
|
||||
attention_mask=attention_mask,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=True,
|
||||
cache_position=cache_position,
|
||||
**kwargs,
|
||||
)
|
||||
@@ -1325,10 +1310,6 @@ class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
|
||||
past_key_values: Optional[list[torch.FloatTensor]] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
labels: Optional[torch.LongTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
pixel_values: Optional[torch.Tensor] = None,
|
||||
pixel_values_videos: Optional[torch.FloatTensor] = None,
|
||||
image_grid_thw: Optional[torch.LongTensor] = None,
|
||||
@@ -1380,12 +1361,6 @@ class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
|
||||
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
||||
"The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
|
||||
```"""
|
||||
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
|
||||
outputs = self.model(
|
||||
input_ids=input_ids,
|
||||
pixel_values=pixel_values,
|
||||
@@ -1396,9 +1371,6 @@ class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
|
||||
attention_mask=attention_mask,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
cache_position=cache_position,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
27
src/transformers/models/glm4v_moe/__init__.py
Normal file
27
src/transformers/models/glm4v_moe/__init__.py
Normal file
@@ -0,0 +1,27 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from ...utils import _LazyModule
|
||||
from ...utils.import_utils import define_import_structure
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .configuration_glm4v_moe import *
|
||||
from .modeling_glm4v_moe import *
|
||||
else:
|
||||
import sys
|
||||
|
||||
_file = globals()["__file__"]
|
||||
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
|
||||
388
src/transformers/models/glm4v_moe/configuration_glm4v_moe.py
Normal file
388
src/transformers/models/glm4v_moe/configuration_glm4v_moe.py
Normal file
@@ -0,0 +1,388 @@
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# This file was automatically generated from src/transformers/models/glm4v_moe/modular_glm4v_moe.py.
|
||||
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
||||
# the file from the modular. If any change should be done, please apply the change to the
|
||||
# modular_glm4v_moe.py file directly. One of our CI enforces this.
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# coding=utf-8
|
||||
# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...modeling_rope_utils import rope_config_validation
|
||||
|
||||
|
||||
class Glm4vMoeVisionConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`Glm4vMoeVisionModel`]. It is used to instantiate an Glm4vMoeVisionModel
|
||||
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield
|
||||
a similar configuration to that of
|
||||
GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking).
|
||||
|
||||
Args:
|
||||
hidden_size (`int`, *optional*, defaults to 1536):
|
||||
Dimensionality of the encoder layers and the pooler layer.
|
||||
depth (`int`, *optional*, defaults to 24):
|
||||
Number of layers (depth) in the model.
|
||||
attention_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to add a bias to the queries, keys and values.
|
||||
intermediate_size (`int`, *optional*, defaults to 13696):
|
||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"selu"`):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||
`"relu"`, `"selu"` and `"gelu_new"` are supported.
|
||||
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
Dropout probability for attention weights.
|
||||
projection_dropout (`float`, *optional*, defaults to 0.0):
|
||||
Dropout probability for the projection layer.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
image_size (`int` or `list[int]`, *optional*, defaults to `[336, 336]`):
|
||||
The size (resolution) of each image.
|
||||
patch_size (`int`, *optional*, defaults to `14`):
|
||||
The size (resolution) of each patch.
|
||||
num_channels (`int`, *optional*, defaults to 3):
|
||||
The number of input channels.
|
||||
out_hidden_size (`int`, *optional*, defaults to 4096):
|
||||
The output hidden size of the vision model.
|
||||
rms_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the rms normalization layers.
|
||||
spatial_merge_size (`int`, *optional*, defaults to 2):
|
||||
The size used for merging spatial dimensions.
|
||||
temporal_patch_size (`int`, *optional*, defaults to 2):
|
||||
The size used for patches along the temporal dimension.
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from transformers import Glm4vMoeVisionConfig, Glm4vMoeVisionModel
|
||||
|
||||
>>> # Initializing a Glm4vMoeVisionConfig GLM-4.1V-9B style configuration
|
||||
>>> configuration = Glm4vMoeVisionConfig()
|
||||
|
||||
>>> # Initializing a model (with random weights) from the GLM-4.1V-9B configuration
|
||||
>>> model = Glm4vMoeVisionModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "glm4v_moe"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
depth=24,
|
||||
hidden_size=1536,
|
||||
hidden_act="silu",
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
num_heads=12,
|
||||
in_channels=3,
|
||||
image_size=336,
|
||||
patch_size=14,
|
||||
rms_norm_eps=1e-05,
|
||||
spatial_merge_size=2,
|
||||
temporal_patch_size=2,
|
||||
out_hidden_size=4096,
|
||||
intermediate_size=13696,
|
||||
initializer_range=0.02,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self.depth = depth
|
||||
self.hidden_size = hidden_size
|
||||
self.hidden_act = hidden_act
|
||||
self.num_heads = num_heads
|
||||
self.in_channels = in_channels
|
||||
self.image_size = image_size
|
||||
self.patch_size = patch_size
|
||||
self.spatial_merge_size = spatial_merge_size
|
||||
self.temporal_patch_size = temporal_patch_size
|
||||
self.out_hidden_size = out_hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
|
||||
|
||||
class Glm4vMoeTextConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`Glm4vMoeModel`]. It is used to instantiate a
|
||||
GLM-4.5V model according to the specified arguments, defining the model architecture. Instantiating a
|
||||
configuration with the defaults will yield a similar configuration to that of
|
||||
GLM-4.5V [THUDM/GLM-4.5V](https://huggingface.co/THUDM/GLM-4.5V).
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 151424):
|
||||
Vocabulary size of the Glm4vMoe model. Defines the number of different tokens that can be represented by the
|
||||
`inputs_ids` passed when calling [`Glm4vMoeModel`]
|
||||
hidden_size (`int`, *optional*, defaults to 4096):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 10944):
|
||||
Dimension of the MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 46):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 96):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position.
|
||||
num_key_value_heads (`int`, *optional*, defaults to 8):
|
||||
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||
by meanpooling all the original heads within that group. For more details checkout [this
|
||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||
The non-linear activation function (function or string) in the decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 65536):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
rms_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the rms normalization layers.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||
relevant if `config.is_decoder=True`.
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether the model's input and output word embeddings should be tied.
|
||||
rope_theta (`float`, *optional*, defaults to 10000.0):
|
||||
The base period of the RoPE embeddings.
|
||||
rope_scaling (`Dict`, *optional*):
|
||||
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
|
||||
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
|
||||
accordingly.
|
||||
Expected contents:
|
||||
`rope_type` (`str`):
|
||||
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
|
||||
'llama3'], with 'default' being the original RoPE implementation.
|
||||
`factor` (`float`, *optional*):
|
||||
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
|
||||
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
|
||||
original maximum pre-trained length.
|
||||
`original_max_position_embeddings` (`int`, *optional*):
|
||||
Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
|
||||
pretraining.
|
||||
`attention_factor` (`float`, *optional*):
|
||||
Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
|
||||
computation. If unspecified, it defaults to value recommended by the implementation, using the
|
||||
`factor` field to infer the suggested value.
|
||||
attention_bias (`bool`, defaults to `True`, *optional*, defaults to `True`):
|
||||
Whether to use a bias in the query, key, value and output projection layers during self-attention.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
moe_intermediate_size (`int`, *optional*, defaults to 1408):
|
||||
Intermediate size of the routed expert.
|
||||
num_experts_per_tok (`int`, *optional*, defaults to 8):
|
||||
number of experts per token.
|
||||
n_shared_experts (`int`, *optional*, defaults to 1):
|
||||
Number of shared experts.
|
||||
n_routed_experts (`int`, *optional*, defaults to 128):
|
||||
Number of routed experts.
|
||||
routed_scaling_factor (`float`, *optional*, defaults to 1.0):
|
||||
Scaling factor or routed experts.
|
||||
n_group (`int`, *optional*, defaults to 1):
|
||||
Number of groups for routed experts.
|
||||
topk_group (`int`, *optional*, defaults to 1):
|
||||
Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
|
||||
first_k_dense_replace (`int`, *optional*, defaults to 1):
|
||||
Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
|
||||
\--k dense layers--/
|
||||
norm_topk_prob (`bool`, *optional*, defaults to `True`):
|
||||
Whether to normalize the topk probabilities.
|
||||
|
||||
```python
|
||||
>>> from transformers import Glm4vMoeTextModel, Glm4vMoeConfig
|
||||
|
||||
>>> # Initializing a GLM-4.5V style configuration
|
||||
>>> configuration = Glm4vMoeConfig()
|
||||
|
||||
>>> # Initializing a model from the GLM-4.5V style configuration
|
||||
>>> model = Glm4vMoeTextModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "Glm4vMoe_text"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
# Default tensor parallel plan for base model `Glm4vMoe`
|
||||
base_model_tp_plan = {
|
||||
"layers.*.self_attn.q_proj": "colwise",
|
||||
"layers.*.self_attn.k_proj": "colwise",
|
||||
"layers.*.self_attn.v_proj": "colwise",
|
||||
"layers.*.self_attn.o_proj": "rowwise",
|
||||
"layers.*.mlp.gate_up_proj": "colwise_rep", # we need to replicate here due to the `chunk` operation
|
||||
"layers.*.mlp.down_proj": "rowwise_rep", # we need to replicate here due to the `chunk` operation
|
||||
}
|
||||
base_model_pp_plan = {
|
||||
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
|
||||
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
|
||||
"norm": (["hidden_states"], ["hidden_states"]),
|
||||
}
|
||||
base_config_key = "text_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=151424,
|
||||
hidden_size=4096,
|
||||
intermediate_size=10944,
|
||||
num_hidden_layers=46,
|
||||
num_attention_heads=96,
|
||||
partial_rotary_factor=0.5,
|
||||
num_key_value_heads=8,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=65536,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-5,
|
||||
use_cache=True,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=10000.0,
|
||||
rope_scaling=None,
|
||||
attention_bias=True,
|
||||
attention_dropout=0.0,
|
||||
moe_intermediate_size=1408,
|
||||
num_experts_per_tok=8,
|
||||
n_shared_experts=1,
|
||||
n_routed_experts=128,
|
||||
routed_scaling_factor=1.0,
|
||||
n_group=1,
|
||||
topk_group=1,
|
||||
first_k_dense_replace=1,
|
||||
norm_topk_prob=True,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.partial_rotary_factor = partial_rotary_factor
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
# Validate the correctness of rotary position embeddings parameters
|
||||
# BC: if there is a 'type' field, move it to 'rope_type'.
|
||||
if self.rope_scaling is not None and "type" in self.rope_scaling:
|
||||
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
|
||||
rope_config_validation(self, ignore_keys={"mrope_section"})
|
||||
|
||||
# MoE arguments
|
||||
self.moe_intermediate_size = moe_intermediate_size
|
||||
self.num_experts_per_tok = num_experts_per_tok
|
||||
self.n_group = n_group
|
||||
self.topk_group = topk_group
|
||||
self.n_shared_experts = n_shared_experts
|
||||
self.n_routed_experts = n_routed_experts
|
||||
self.routed_scaling_factor = routed_scaling_factor
|
||||
self.first_k_dense_replace = first_k_dense_replace
|
||||
self.norm_topk_prob = norm_topk_prob
|
||||
|
||||
|
||||
class Glm4vMoeConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`Glm4vMoeModel`]. It is used to instantiate a
|
||||
GLM-4.5V model according to the specified arguments, defining the model architecture. Instantiating a
|
||||
configuration with the defaults will yield a similar configuration to that of
|
||||
GLM-4.5V [zai_org/GLM-4.5V](https://huggingface.co/zai_org/GLM-4.5V).
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Glm4vMoeTextConfig`):
|
||||
The config object or dictionary of the text backbone.
|
||||
vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Glm4vMoeVisionConfig`):
|
||||
The config object or dictionary of the vision backbone.
|
||||
image_token_id (`int`, *optional*, defaults to 151363):
|
||||
The image token index to encode the image prompt.
|
||||
video_token_id (`int`, *optional*, defaults to 151364):
|
||||
The video token index to encode the image prompt.
|
||||
image_start_token_id (`int`, *optional*, defaults to 151339):
|
||||
The image start token index to encode the start of image.
|
||||
image_end_token_id (`int`, *optional*, defaults to 151340):
|
||||
The image end token index to encode the end of image.
|
||||
video_start_token_id (`int`, *optional*, defaults to 151341):
|
||||
The video start token index to encode the start of video.
|
||||
video_end_token_id (`int`, *optional*, defaults to 151342):
|
||||
The video end token index to encode the end of video.
|
||||
|
||||
```python
|
||||
>>> from transformers import Glm4vMoeForConditionalGeneration, Glm4vMoeConfig
|
||||
|
||||
>>> # Initializing a GLM-4.5V style configuration
|
||||
>>> configuration = Glm4vMoeConfig()
|
||||
|
||||
>>> # Initializing a model from the GLM-4.5V style configuration
|
||||
>>> model = Glm4vMoeForConditionalGeneration(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "glm4v_moe"
|
||||
sub_configs = {"vision_config": Glm4vMoeVisionConfig, "text_config": Glm4vMoeTextConfig}
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
text_config=None,
|
||||
vision_config=None,
|
||||
image_token_id=151363,
|
||||
video_token_id=151364,
|
||||
image_start_token_id=151339,
|
||||
image_end_token_id=151340,
|
||||
video_start_token_id=151341,
|
||||
video_end_token_id=151342,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
if isinstance(vision_config, dict):
|
||||
self.vision_config = self.sub_configs["vision_config"](**vision_config)
|
||||
elif vision_config is None:
|
||||
self.vision_config = self.sub_configs["vision_config"]()
|
||||
|
||||
if isinstance(text_config, dict):
|
||||
self.text_config = self.sub_configs["text_config"](**text_config)
|
||||
elif text_config is None:
|
||||
# For BC use all kwargs to init `TextConfig`
|
||||
self.text_config = self.sub_configs["text_config"](**kwargs)
|
||||
|
||||
self.image_token_id = image_token_id
|
||||
self.video_token_id = video_token_id
|
||||
self.video_start_token_id = video_start_token_id
|
||||
self.video_end_token_id = video_end_token_id
|
||||
self.image_start_token_id = image_start_token_id
|
||||
self.image_end_token_id = image_end_token_id
|
||||
|
||||
|
||||
__all__ = ["Glm4vMoeConfig", "Glm4vMoeTextConfig"]
|
||||
1769
src/transformers/models/glm4v_moe/modeling_glm4v_moe.py
Normal file
1769
src/transformers/models/glm4v_moe/modeling_glm4v_moe.py
Normal file
File diff suppressed because it is too large
Load Diff
461
src/transformers/models/glm4v_moe/modular_glm4v_moe.py
Normal file
461
src/transformers/models/glm4v_moe/modular_glm4v_moe.py
Normal file
@@ -0,0 +1,461 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import Callable, Optional
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from ...cache_utils import Cache
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_rope_utils import rope_config_validation
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import logging
|
||||
from ..glm4.modeling_glm4 import Glm4Attention
|
||||
from ..glm4_moe.configuration_glm4_moe import Glm4MoeConfig
|
||||
from ..glm4_moe.modeling_glm4_moe import (
|
||||
Glm4MoeDecoderLayer,
|
||||
Glm4MoeMLP,
|
||||
Glm4MoeMoE,
|
||||
Glm4MoePreTrainedModel,
|
||||
Glm4MoeRMSNorm,
|
||||
Glm4MoeTopkRouter,
|
||||
eager_attention_forward,
|
||||
)
|
||||
from ..glm4v.configuration_glm4v import Glm4vConfig, Glm4vVisionConfig
|
||||
from ..glm4v.modeling_glm4v import (
|
||||
Glm4vForConditionalGeneration,
|
||||
rotate_half,
|
||||
)
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class Glm4vMoeVisionConfig(Glm4vVisionConfig):
|
||||
pass
|
||||
|
||||
|
||||
class Glm4vMoeTextConfig(Glm4MoeConfig, nn.Module):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`Glm4vMoeModel`]. It is used to instantiate a
|
||||
GLM-4.5V model according to the specified arguments, defining the model architecture. Instantiating a
|
||||
configuration with the defaults will yield a similar configuration to that of
|
||||
GLM-4.5V [THUDM/GLM-4.5V](https://huggingface.co/THUDM/GLM-4.5V).
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 151424):
|
||||
Vocabulary size of the Glm4vMoe model. Defines the number of different tokens that can be represented by the
|
||||
`inputs_ids` passed when calling [`Glm4vMoeModel`]
|
||||
hidden_size (`int`, *optional*, defaults to 4096):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 10944):
|
||||
Dimension of the MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 46):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 96):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position.
|
||||
num_key_value_heads (`int`, *optional*, defaults to 8):
|
||||
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||
by meanpooling all the original heads within that group. For more details checkout [this
|
||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||
The non-linear activation function (function or string) in the decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 65536):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
rms_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the rms normalization layers.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||
relevant if `config.is_decoder=True`.
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether the model's input and output word embeddings should be tied.
|
||||
rope_theta (`float`, *optional*, defaults to 10000.0):
|
||||
The base period of the RoPE embeddings.
|
||||
rope_scaling (`Dict`, *optional*):
|
||||
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
|
||||
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
|
||||
accordingly.
|
||||
Expected contents:
|
||||
`rope_type` (`str`):
|
||||
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
|
||||
'llama3'], with 'default' being the original RoPE implementation.
|
||||
`factor` (`float`, *optional*):
|
||||
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
|
||||
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
|
||||
original maximum pre-trained length.
|
||||
`original_max_position_embeddings` (`int`, *optional*):
|
||||
Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
|
||||
pretraining.
|
||||
`attention_factor` (`float`, *optional*):
|
||||
Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
|
||||
computation. If unspecified, it defaults to value recommended by the implementation, using the
|
||||
`factor` field to infer the suggested value.
|
||||
attention_bias (`bool`, defaults to `True`, *optional*, defaults to `True`):
|
||||
Whether to use a bias in the query, key, value and output projection layers during self-attention.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
moe_intermediate_size (`int`, *optional*, defaults to 1408):
|
||||
Intermediate size of the routed expert.
|
||||
num_experts_per_tok (`int`, *optional*, defaults to 8):
|
||||
number of experts per token.
|
||||
n_shared_experts (`int`, *optional*, defaults to 1):
|
||||
Number of shared experts.
|
||||
n_routed_experts (`int`, *optional*, defaults to 128):
|
||||
Number of routed experts.
|
||||
routed_scaling_factor (`float`, *optional*, defaults to 1.0):
|
||||
Scaling factor or routed experts.
|
||||
n_group (`int`, *optional*, defaults to 1):
|
||||
Number of groups for routed experts.
|
||||
topk_group (`int`, *optional*, defaults to 1):
|
||||
Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
|
||||
first_k_dense_replace (`int`, *optional*, defaults to 1):
|
||||
Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
|
||||
\--k dense layers--/
|
||||
norm_topk_prob (`bool`, *optional*, defaults to `True`):
|
||||
Whether to normalize the topk probabilities.
|
||||
|
||||
```python
|
||||
>>> from transformers import Glm4vMoeTextModel, Glm4vMoeConfig
|
||||
|
||||
>>> # Initializing a GLM-4.5V style configuration
|
||||
>>> configuration = Glm4vMoeConfig()
|
||||
|
||||
>>> # Initializing a model from the GLM-4.5V style configuration
|
||||
>>> model = Glm4vMoeTextModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "Glm4vMoe_text"
|
||||
base_config_key = "text_config"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
# Default tensor parallel plan for base model `Glm4vMoe`
|
||||
base_model_tp_plan = {
|
||||
"layers.*.self_attn.q_proj": "colwise",
|
||||
"layers.*.self_attn.k_proj": "colwise",
|
||||
"layers.*.self_attn.v_proj": "colwise",
|
||||
"layers.*.self_attn.o_proj": "rowwise",
|
||||
"layers.*.mlp.gate_up_proj": "colwise_rep", # we need to replicate here due to the `chunk` operation
|
||||
"layers.*.mlp.down_proj": "rowwise_rep", # we need to replicate here due to the `chunk` operation
|
||||
}
|
||||
base_model_pp_plan = {
|
||||
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
|
||||
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
|
||||
"norm": (["hidden_states"], ["hidden_states"]),
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=151424,
|
||||
hidden_size=4096,
|
||||
intermediate_size=10944,
|
||||
num_hidden_layers=46,
|
||||
num_attention_heads=96,
|
||||
partial_rotary_factor=0.5,
|
||||
num_key_value_heads=8,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=65536,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-5,
|
||||
use_cache=True,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=10000.0,
|
||||
rope_scaling=None,
|
||||
attention_bias=True,
|
||||
attention_dropout=0.0,
|
||||
moe_intermediate_size=1408,
|
||||
num_experts_per_tok=8,
|
||||
n_shared_experts=1,
|
||||
n_routed_experts=128,
|
||||
routed_scaling_factor=1.0,
|
||||
n_group=1,
|
||||
topk_group=1,
|
||||
first_k_dense_replace=1,
|
||||
norm_topk_prob=True,
|
||||
**kwargs,
|
||||
):
|
||||
nn.Module().__init__(
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.partial_rotary_factor = partial_rotary_factor
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
# Validate the correctness of rotary position embeddings parameters
|
||||
# BC: if there is a 'type' field, move it to 'rope_type'.
|
||||
if self.rope_scaling is not None and "type" in self.rope_scaling:
|
||||
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
|
||||
rope_config_validation(self, ignore_keys={"mrope_section"})
|
||||
|
||||
# MoE arguments
|
||||
self.moe_intermediate_size = moe_intermediate_size
|
||||
self.num_experts_per_tok = num_experts_per_tok
|
||||
self.n_group = n_group
|
||||
self.topk_group = topk_group
|
||||
self.n_shared_experts = n_shared_experts
|
||||
self.n_routed_experts = n_routed_experts
|
||||
self.routed_scaling_factor = routed_scaling_factor
|
||||
self.first_k_dense_replace = first_k_dense_replace
|
||||
self.norm_topk_prob = norm_topk_prob
|
||||
|
||||
|
||||
class Glm4vMoeConfig(Glm4vConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`Glm4vMoeModel`]. It is used to instantiate a
|
||||
GLM-4.5V model according to the specified arguments, defining the model architecture. Instantiating a
|
||||
configuration with the defaults will yield a similar configuration to that of
|
||||
GLM-4.5V [zai_org/GLM-4.5V](https://huggingface.co/zai_org/GLM-4.5V).
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Glm4vMoeTextConfig`):
|
||||
The config object or dictionary of the text backbone.
|
||||
vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Glm4vMoeVisionConfig`):
|
||||
The config object or dictionary of the vision backbone.
|
||||
image_token_id (`int`, *optional*, defaults to 151363):
|
||||
The image token index to encode the image prompt.
|
||||
video_token_id (`int`, *optional*, defaults to 151364):
|
||||
The video token index to encode the image prompt.
|
||||
image_start_token_id (`int`, *optional*, defaults to 151339):
|
||||
The image start token index to encode the start of image.
|
||||
image_end_token_id (`int`, *optional*, defaults to 151340):
|
||||
The image end token index to encode the end of image.
|
||||
video_start_token_id (`int`, *optional*, defaults to 151341):
|
||||
The video start token index to encode the start of video.
|
||||
video_end_token_id (`int`, *optional*, defaults to 151342):
|
||||
The video end token index to encode the end of video.
|
||||
|
||||
```python
|
||||
>>> from transformers import Glm4vMoeForConditionalGeneration, Glm4vMoeConfig
|
||||
|
||||
>>> # Initializing a GLM-4.5V style configuration
|
||||
>>> configuration = Glm4vMoeConfig()
|
||||
|
||||
>>> # Initializing a model from the GLM-4.5V style configuration
|
||||
>>> model = Glm4vMoeForConditionalGeneration(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
text_config=None,
|
||||
vision_config=None,
|
||||
image_token_id=151363,
|
||||
video_token_id=151364,
|
||||
image_start_token_id=151339,
|
||||
image_end_token_id=151340,
|
||||
video_start_token_id=151341,
|
||||
video_end_token_id=151342,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
|
||||
class Glm4vMoeRMSNorm(Glm4MoeRMSNorm):
|
||||
pass
|
||||
|
||||
|
||||
def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
|
||||
"""Applies Rotary Position Embedding with Multimodal Sections to the query and key tensors (https://qwenlm.github.io/blog/qwen2-vl/).
|
||||
|
||||
Explanation:
|
||||
Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding
|
||||
sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For
|
||||
vision embedding part, we apply rotary position embedding on temporal, height and width dimension separately.
|
||||
Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding.
|
||||
For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal,
|
||||
height and width) of text embedding is always the same, so the text embedding rotary position embedding has no
|
||||
difference with modern LLMs.
|
||||
|
||||
Args:
|
||||
q (`torch.Tensor`): The query tensor.
|
||||
k (`torch.Tensor`): The key tensor.
|
||||
cos (`torch.Tensor`): The cosine part of the rotary embedding.
|
||||
sin (`torch.Tensor`): The sine part of the rotary embedding.
|
||||
mrope_section(`List(int)`):
|
||||
Multimodal rope section is for channel dimension of temporal, height and width in rope calculation.
|
||||
unsqueeze_dim (`int`, *optional*, defaults to 1):
|
||||
The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
|
||||
sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
|
||||
that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
|
||||
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
|
||||
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
|
||||
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
|
||||
Returns:
|
||||
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
|
||||
"""
|
||||
mrope_section = mrope_section * 2
|
||||
cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(
|
||||
unsqueeze_dim
|
||||
)
|
||||
sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(
|
||||
unsqueeze_dim
|
||||
)
|
||||
|
||||
# Keep half or full tensor for later concatenation
|
||||
rotary_dim = cos.shape[-1]
|
||||
q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
|
||||
k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
|
||||
|
||||
# Apply rotary embeddings on the first half or full tensor
|
||||
q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
|
||||
k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)
|
||||
|
||||
# Concatenate back to full shape
|
||||
q_embed = torch.cat([q_embed, q_pass], dim=-1)
|
||||
k_embed = torch.cat([k_embed, k_pass], dim=-1)
|
||||
|
||||
return q_embed, k_embed
|
||||
|
||||
|
||||
class Glm4vMoeTextAttention(Glm4Attention):
|
||||
def __init__(self, config: Glm4vMoeTextConfig, layer_idx: Optional[int] = None):
|
||||
super().__init__(config, layer_idx)
|
||||
self.rope_scaling = config.rope_scaling
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
position_embeddings: tuple[torch.Tensor, torch.Tensor],
|
||||
attention_mask: Optional[torch.Tensor],
|
||||
past_key_values: Optional[Cache] = None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
**kwargs: Unpack[FlashAttentionKwargs],
|
||||
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
|
||||
input_shape = hidden_states.shape[:-1]
|
||||
hidden_shape = (*input_shape, -1, self.head_dim)
|
||||
|
||||
query_states = self.q_proj(hidden_states).view(hidden_shape)
|
||||
key_states = self.k_proj(hidden_states).view(hidden_shape)
|
||||
value_states = self.v_proj(hidden_states).view(hidden_shape)
|
||||
|
||||
query_states = query_states.transpose(1, 2)
|
||||
key_states = key_states.transpose(1, 2)
|
||||
value_states = value_states.transpose(1, 2)
|
||||
|
||||
cos, sin = position_embeddings
|
||||
query_states, key_states = apply_multimodal_rotary_pos_emb( # diff with Llama
|
||||
query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
|
||||
)
|
||||
|
||||
if past_key_values is not None:
|
||||
# sin and cos are specific to RoPE models; position_ids needed for the static cache
|
||||
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
|
||||
key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
||||
|
||||
attention_interface: Callable = eager_attention_forward
|
||||
if self.config._attn_implementation != "eager":
|
||||
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||
|
||||
attn_output, attn_weights = attention_interface(
|
||||
self,
|
||||
query_states,
|
||||
key_states,
|
||||
value_states,
|
||||
attention_mask,
|
||||
dropout=0.0 if not self.training else self.attention_dropout,
|
||||
scaling=self.scaling,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
attn_output = attn_output.reshape(*input_shape, -1).contiguous()
|
||||
attn_output = self.o_proj(attn_output)
|
||||
return attn_output, attn_weights
|
||||
|
||||
|
||||
class Glm4vMoeTextTopkRouter(Glm4MoeTopkRouter, nn.Module):
|
||||
def __init__(self, config: Glm4vMoeTextConfig):
|
||||
super().__init__(config)
|
||||
|
||||
|
||||
class Glm4vMoeTextMoE(Glm4MoeMoE):
|
||||
def __init__(self, config: Glm4vMoeTextConfig):
|
||||
super().__init__(config)
|
||||
self.config = config
|
||||
self.experts = nn.ModuleList(
|
||||
[
|
||||
Glm4vMoeTextMLP(config, intermediate_size=config.moe_intermediate_size)
|
||||
for _ in range(config.n_routed_experts)
|
||||
]
|
||||
)
|
||||
self.gate = Glm4vMoeTextTopkRouter(config)
|
||||
self.shared_experts = Glm4vMoeTextMLP(
|
||||
config=config, intermediate_size=config.moe_intermediate_size * config.n_shared_experts
|
||||
)
|
||||
|
||||
|
||||
class Glm4vMoeTextMLP(Glm4MoeMLP):
|
||||
pass
|
||||
|
||||
|
||||
class Glm4vMoeTextDecoderLayer(Glm4MoeDecoderLayer):
|
||||
def __init__(self, config: Glm4vMoeTextConfig, layer_idx: int):
|
||||
super().__init__(config, layer_idx)
|
||||
|
||||
|
||||
class Glm4vMoePreTrainedModel(Glm4MoePreTrainedModel):
|
||||
config: Glm4vMoeConfig
|
||||
base_model_prefix = ""
|
||||
_no_split_modules = ["Glm4vMoeTextDecoderLayer", "Glm4vMoeVisionBlock"]
|
||||
_skip_keys_device_placement = "past_key_values"
|
||||
|
||||
_can_record_outputs = {
|
||||
"hidden_states": Glm4vMoeTextDecoderLayer,
|
||||
"attentions": Glm4vMoeTextAttention,
|
||||
}
|
||||
|
||||
|
||||
class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
|
||||
pass
|
||||
|
||||
|
||||
__all__ = [
|
||||
"Glm4vMoeConfig",
|
||||
"Glm4vMoeTextConfig",
|
||||
"Glm4vMoeForConditionalGeneration",
|
||||
"Glm4vMoeModel", # noqa: F822
|
||||
"Glm4vMoePreTrainedModel",
|
||||
"Glm4vMoeTextModel", # noqa: F822
|
||||
]
|
||||
@@ -107,9 +107,9 @@ class Glm4MoeIntegrationTest(unittest.TestCase):
|
||||
]
|
||||
|
||||
prompts = ["[gMASK]<sop>hello", "[gMASK]<sop>tell me"]
|
||||
tokenizer = AutoTokenizer.from_pretrained("THUDM/GLM-4.5")
|
||||
tokenizer = AutoTokenizer.from_pretrained("zai-org/GLM-4.5")
|
||||
model = Glm4MoeForCausalLM.from_pretrained(
|
||||
"THUDM/GLM-4.5", device_map=torch_device, torch_dtype=torch.bfloat16
|
||||
"zai-org/GLM-4.5", device_map=torch_device, torch_dtype=torch.bfloat16
|
||||
)
|
||||
inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
|
||||
|
||||
|
||||
0
tests/models/glm4v_moe/__init__.py
Normal file
0
tests/models/glm4v_moe/__init__.py
Normal file
568
tests/models/glm4v_moe/test_modeling_glm4v_moe.py
Normal file
568
tests/models/glm4v_moe/test_modeling_glm4v_moe.py
Normal file
@@ -0,0 +1,568 @@
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Testing suite for the PyTorch GLM-4.1V model."""
|
||||
|
||||
import copy
|
||||
import gc
|
||||
import unittest
|
||||
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
Glm4vMoeConfig,
|
||||
Glm4vMoeForConditionalGeneration,
|
||||
Glm4vMoeModel,
|
||||
is_torch_available,
|
||||
)
|
||||
from transformers.testing_utils import (
|
||||
require_flash_attn,
|
||||
require_torch,
|
||||
require_torch_gpu,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
|
||||
from ...generation.test_utils import GenerationTesterMixin
|
||||
from ...test_configuration_common import ConfigTester
|
||||
from ...test_modeling_common import (
|
||||
ModelTesterMixin,
|
||||
floats_tensor,
|
||||
ids_tensor,
|
||||
)
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
class Glm4vMoeVisionText2TextModelTester:
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=3,
|
||||
seq_length=7,
|
||||
num_channels=3,
|
||||
ignore_index=-100,
|
||||
image_size=112,
|
||||
video_start_token_id=3,
|
||||
video_end_token_id=4,
|
||||
image_start_token_id=5,
|
||||
image_end_token_id=6,
|
||||
image_token_id=7,
|
||||
video_token_id=8,
|
||||
is_training=True,
|
||||
text_config={
|
||||
"vocab_size": 99,
|
||||
"hidden_size": 16,
|
||||
"intermediate_size": 22,
|
||||
"num_hidden_layers": 2,
|
||||
"num_attention_heads": 2,
|
||||
"num_key_value_heads": 1,
|
||||
"output_channels": 64,
|
||||
"hidden_act": "silu",
|
||||
"max_position_embeddings": 512,
|
||||
"rope_scaling": {"type": "default", "mrope_section": [1, 1]},
|
||||
"rope_theta": 10000,
|
||||
"tie_word_embeddings": True,
|
||||
"bos_token_id": 0,
|
||||
"eos_token_id": 0,
|
||||
"pad_token_id": 0,
|
||||
"n_routed_experts": 8,
|
||||
"n_shared_experts": 1,
|
||||
"n_group": 1,
|
||||
"topk_group": 1,
|
||||
"num_experts_per_tok": 8,
|
||||
},
|
||||
vision_config={
|
||||
"depth": 2,
|
||||
"hidden_act": "silu",
|
||||
"hidden_size": 48,
|
||||
"out_hidden_size": 16,
|
||||
"intermediate_size": 22,
|
||||
"patch_size": 14,
|
||||
"spatial_merge_size": 1,
|
||||
"temporal_patch_size": 2,
|
||||
},
|
||||
):
|
||||
self.parent = parent
|
||||
self.ignore_index = ignore_index
|
||||
self.bos_token_id = text_config["bos_token_id"]
|
||||
self.eos_token_id = text_config["eos_token_id"]
|
||||
self.pad_token_id = text_config["pad_token_id"]
|
||||
self.video_start_token_id = video_start_token_id
|
||||
self.video_end_token_id = video_end_token_id
|
||||
self.image_start_token_id = image_start_token_id
|
||||
self.image_end_token_id = image_end_token_id
|
||||
self.image_token_id = image_token_id
|
||||
self.video_token_id = video_token_id
|
||||
self.text_config = text_config
|
||||
self.vision_config = vision_config
|
||||
self.batch_size = batch_size
|
||||
self.num_channels = num_channels
|
||||
self.image_size = image_size
|
||||
self.is_training = is_training
|
||||
self.hidden_size = text_config["hidden_size"]
|
||||
self.num_hidden_layers = text_config["num_hidden_layers"]
|
||||
self.num_attention_heads = text_config["num_attention_heads"]
|
||||
self.vocab_size = text_config["vocab_size"]
|
||||
self.num_image_tokens = 64
|
||||
self.seq_length = seq_length + self.num_image_tokens
|
||||
self.n_routed_experts = text_config["n_routed_experts"]
|
||||
self.n_shared_experts = text_config["n_shared_experts"]
|
||||
self.num_experts_per_tok = text_config["num_experts_per_tok"]
|
||||
self.n_group = text_config["n_group"]
|
||||
self.topk_group = text_config["topk_group"]
|
||||
|
||||
def get_config(self):
|
||||
return Glm4vMoeConfig(
|
||||
text_config=self.text_config,
|
||||
vision_config=self.vision_config,
|
||||
image_token_id=self.image_token_id,
|
||||
video_token_id=self.video_token_id,
|
||||
video_start_token_id=self.video_start_token_id,
|
||||
video_end_token_id=self.video_end_token_id,
|
||||
image_start_token_id=self.image_start_token_id,
|
||||
image_end_token_id=self.image_end_token_id,
|
||||
)
|
||||
|
||||
def prepare_config_and_inputs(self):
|
||||
config = self.get_config()
|
||||
patch_size = config.vision_config.patch_size
|
||||
temporal_patch_size = config.vision_config.temporal_patch_size
|
||||
pixel_values = floats_tensor(
|
||||
[
|
||||
self.batch_size * (self.image_size**2) // (patch_size**2),
|
||||
self.num_channels * (patch_size**2) * temporal_patch_size,
|
||||
]
|
||||
)
|
||||
|
||||
return config, pixel_values
|
||||
|
||||
def prepare_config_and_inputs_for_common(self):
|
||||
config_and_inputs = self.prepare_config_and_inputs()
|
||||
config, pixel_values = config_and_inputs
|
||||
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
|
||||
|
||||
input_ids[input_ids == self.video_token_id] = self.pad_token_id
|
||||
input_ids[input_ids == self.image_token_id] = self.pad_token_id
|
||||
input_ids[input_ids == self.video_start_token_id] = self.pad_token_id
|
||||
input_ids[input_ids == self.image_start_token_id] = self.pad_token_id
|
||||
input_ids[input_ids == self.video_end_token_id] = self.pad_token_id
|
||||
input_ids[input_ids == self.image_end_token_id] = self.pad_token_id
|
||||
|
||||
input_ids[:, 0] = self.image_start_token_id
|
||||
input_ids[:, 1 : 1 + self.num_image_tokens] = self.image_token_id
|
||||
input_ids[:, 1 + self.num_image_tokens] = self.image_end_token_id
|
||||
patch_size = config.vision_config.patch_size
|
||||
patches_per_side = self.image_size // patch_size
|
||||
|
||||
inputs_dict = {
|
||||
"pixel_values": pixel_values,
|
||||
"image_grid_thw": torch.tensor([[1, patches_per_side, patches_per_side]] * self.batch_size),
|
||||
"input_ids": input_ids,
|
||||
"attention_mask": attention_mask,
|
||||
}
|
||||
return config, inputs_dict
|
||||
|
||||
|
||||
@require_torch
|
||||
class Glm4vMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (Glm4vMoeModel, Glm4vMoeForConditionalGeneration) if is_torch_available() else ()
|
||||
test_pruning = False
|
||||
test_head_masking = False
|
||||
test_torchscript = False
|
||||
model_split_percents = [0.7, 0.9] # model too big to split at 0.5
|
||||
_is_composite = True
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = Glm4vMoeVisionText2TextModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=Glm4vMoeConfig, has_text_modality=False)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
# Glm4vMoe has images shaped as (bs*patch_len, dim) so we can't slice to batches in generate
|
||||
def prepare_config_and_inputs_for_generate(self, batch_size=2):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
# We don't want a few model inputs in our model input dictionary for generation tests
|
||||
input_keys_to_ignore = [
|
||||
# we don't want to mask attention heads
|
||||
"head_mask",
|
||||
"decoder_head_mask",
|
||||
"cross_attn_head_mask",
|
||||
# we don't want encoder-decoder models to start from filled decoder ids
|
||||
"decoder_input_ids",
|
||||
"decoder_attention_mask",
|
||||
# we'll set cache use in each test differently
|
||||
"use_cache",
|
||||
# Ignore labels if it is in the input dict
|
||||
"labels",
|
||||
# model-specific exceptions should overload/overwrite this function
|
||||
]
|
||||
|
||||
# The diff from the general `prepare_config_and_inputs_for_generate` lies here
|
||||
patch_size = config.vision_config.patch_size
|
||||
filtered_image_length = batch_size * (self.model_tester.image_size**2) // (patch_size**2)
|
||||
filtered_inputs_dict = {
|
||||
k: v[:batch_size, ...] if isinstance(v, torch.Tensor) else v
|
||||
for k, v in inputs_dict.items()
|
||||
if k not in input_keys_to_ignore
|
||||
}
|
||||
filtered_inputs_dict["pixel_values"] = inputs_dict["pixel_values"][:filtered_image_length]
|
||||
|
||||
# It is important set `eos_token_id` to `None` to avoid early stopping (would break for length-based checks)
|
||||
text_gen_config = config.get_text_config(decoder=True)
|
||||
if text_gen_config.eos_token_id is not None and text_gen_config.pad_token_id is None:
|
||||
text_gen_config.pad_token_id = (
|
||||
text_gen_config.eos_token_id
|
||||
if isinstance(text_gen_config.eos_token_id, int)
|
||||
else text_gen_config.eos_token_id[0]
|
||||
)
|
||||
text_gen_config.eos_token_id = None
|
||||
text_gen_config.forced_eos_token_id = None
|
||||
|
||||
return config, filtered_inputs_dict
|
||||
|
||||
@unittest.skip(reason="No available kernels - not supported")
|
||||
def test_sdpa_can_dispatch_on_flash(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(reason="Size mismatch")
|
||||
def test_multi_gpu_data_parallel_forward(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("GLM4's moe is not compatible `token_indices, weight_indices = torch.where(mask)`.")
|
||||
def test_generate_compilation_all_outputs(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("Error with compilation")
|
||||
def test_generate_from_inputs_embeds_with_static_cache(self):
|
||||
pass
|
||||
|
||||
def test_inputs_embeds(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
|
||||
|
||||
input_ids = inputs["input_ids"]
|
||||
del inputs["input_ids"]
|
||||
del inputs["pixel_values"]
|
||||
del inputs["image_grid_thw"]
|
||||
|
||||
wte = model.get_input_embeddings()
|
||||
inputs["inputs_embeds"] = wte(input_ids)
|
||||
with torch.no_grad():
|
||||
model(**inputs)[0]
|
||||
|
||||
def test_inputs_embeds_matches_input_ids(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
inputs = self._prepare_for_class(inputs_dict, model_class)
|
||||
input_ids = inputs["input_ids"]
|
||||
del inputs["input_ids"]
|
||||
del inputs["pixel_values"]
|
||||
del inputs["image_grid_thw"]
|
||||
|
||||
inputs_embeds = model.get_input_embeddings()(input_ids)
|
||||
|
||||
with torch.no_grad():
|
||||
out_ids = model(input_ids=input_ids, **inputs)[0]
|
||||
out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
|
||||
torch.testing.assert_close(out_embeds, out_ids)
|
||||
|
||||
|
||||
@require_torch
|
||||
class Glm4vMoeIntegrationTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.processor = AutoProcessor.from_pretrained("zai-org/GLM-4.5V")
|
||||
self.message = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image",
|
||||
"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
|
||||
},
|
||||
{"type": "text", "text": "What kind of dog is this?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
self.message2 = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image",
|
||||
"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png",
|
||||
},
|
||||
{"type": "text", "text": "What kind of dog is this?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
def tearDown(self):
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
@slow
|
||||
def test_small_model_integration_test(self):
|
||||
model = Glm4vMoeForConditionalGeneration.from_pretrained(
|
||||
"zai-org/GLM-4.5V", torch_dtype="auto", device_map="auto"
|
||||
)
|
||||
|
||||
inputs = self.processor.apply_chat_template(
|
||||
self.message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
|
||||
)
|
||||
expected_input_ids = [151331, 151333, 151336, 198, 151339, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343] # fmt: skip
|
||||
assert expected_input_ids == inputs.input_ids[0].tolist()[:17]
|
||||
|
||||
expected_pixel_slice = torch.tensor(
|
||||
[
|
||||
[-0.0988, -0.0842, -0.0842],
|
||||
[-0.5660, -0.5514, -0.4200],
|
||||
[-0.0259, -0.0259, -0.0259],
|
||||
[-0.1280, -0.0988, -0.2010],
|
||||
[-0.4638, -0.5806, -0.6974],
|
||||
[-1.2083, -1.2229, -1.2083],
|
||||
],
|
||||
dtype=torch.float32,
|
||||
device="cpu",
|
||||
)
|
||||
assert torch.allclose(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=3e-3)
|
||||
|
||||
# verify generation
|
||||
inputs = inputs.to(torch_device)
|
||||
|
||||
output = model.generate(**inputs, max_new_tokens=30)
|
||||
EXPECTED_DECODED_TEXT = "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
|
||||
self.assertEqual(
|
||||
self.processor.decode(output[0], skip_special_tokens=True),
|
||||
EXPECTED_DECODED_TEXT,
|
||||
)
|
||||
|
||||
@slow
|
||||
def test_small_model_integration_test_batch(self):
|
||||
model = Glm4vMoeForConditionalGeneration.from_pretrained(
|
||||
"zai-org/GLM-4.5V", torch_dtype="auto", device_map="auto"
|
||||
)
|
||||
batch_messages = [self.message] * 2
|
||||
inputs = self.processor.apply_chat_template(
|
||||
batch_messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
|
||||
).to(torch_device)
|
||||
|
||||
# it should not matter whether two images are the same size or not
|
||||
output = model.generate(**inputs, max_new_tokens=30)
|
||||
|
||||
EXPECTED_DECODED_TEXT = [
|
||||
"\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
|
||||
"\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
|
||||
] # fmt: skip
|
||||
self.assertEqual(
|
||||
self.processor.batch_decode(output, skip_special_tokens=True),
|
||||
EXPECTED_DECODED_TEXT,
|
||||
)
|
||||
|
||||
@slow
|
||||
def test_small_model_integration_test_with_video(self):
|
||||
processor = AutoProcessor.from_pretrained("zai-org/GLM-4.5V", max_image_size={"longest_edge": 50176})
|
||||
model = Glm4vMoeForConditionalGeneration.from_pretrained(
|
||||
"zai-org/GLM-4.5V", torch_dtype=torch.float16, device_map="auto"
|
||||
)
|
||||
questions = ["Describe this video."] * 2
|
||||
video_urls = [
|
||||
"https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
|
||||
] * 2
|
||||
messages = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "video",
|
||||
"video": video_url,
|
||||
},
|
||||
{"type": "text", "text": question},
|
||||
],
|
||||
}
|
||||
]
|
||||
for question, video_url in zip(questions, video_urls)
|
||||
]
|
||||
inputs = processor.apply_chat_template(
|
||||
messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt", padding=True
|
||||
).to(torch_device)
|
||||
output = model.generate(**inputs, max_new_tokens=30)
|
||||
EXPECTED_DECODED_TEXT = [
|
||||
"\n012345Describe this video.\n<think>Got it, let's analyze the video. First, the scene is a room with a wooden floor, maybe a traditional Japanese room with tatami",
|
||||
"\n012345Describe this video.\n<think>Got it, let's analyze the video. First, the scene is a room with a wooden floor, maybe a traditional Japanese room with tatami"
|
||||
] # fmt: skip
|
||||
self.assertEqual(
|
||||
processor.batch_decode(output, skip_special_tokens=True),
|
||||
EXPECTED_DECODED_TEXT,
|
||||
)
|
||||
|
||||
@slow
|
||||
def test_small_model_integration_test_expand(self):
|
||||
model = Glm4vMoeForConditionalGeneration.from_pretrained(
|
||||
"zai-org/GLM-4.5V", torch_dtype="auto", device_map="auto"
|
||||
)
|
||||
inputs = self.processor.apply_chat_template(
|
||||
self.message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
|
||||
).to(torch_device)
|
||||
|
||||
output = model.generate(**inputs, max_new_tokens=30, do_sample=False, num_beams=2, num_return_sequences=2)
|
||||
|
||||
EXPECTED_DECODED_TEXT = [
|
||||
"\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
|
||||
"\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat, specifically"
|
||||
] # fmt: skip
|
||||
self.assertEqual(
|
||||
self.processor.batch_decode(output, skip_special_tokens=True),
|
||||
EXPECTED_DECODED_TEXT,
|
||||
)
|
||||
|
||||
@slow
|
||||
def test_small_model_integration_test_batch_wo_image(self):
|
||||
model = Glm4vMoeForConditionalGeneration.from_pretrained(
|
||||
"zai-org/GLM-4.5V", torch_dtype="auto", device_map="auto"
|
||||
)
|
||||
message_wo_image = [
|
||||
{"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
|
||||
]
|
||||
batched_messages = [self.message, message_wo_image]
|
||||
inputs = self.processor.apply_chat_template(
|
||||
batched_messages,
|
||||
tokenize=True,
|
||||
add_generation_prompt=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
padding=True,
|
||||
).to(torch_device)
|
||||
|
||||
# it should not matter whether two images are the same size or not
|
||||
output = model.generate(**inputs, max_new_tokens=30)
|
||||
|
||||
EXPECTED_DECODED_TEXT = [
|
||||
"\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
|
||||
'\nWho are you?\n<think>Got it, the user is asking "Who are you?" I need to respond appropriately. First, I should clarify that I\'m an AI assistant'
|
||||
] # fmt: skip
|
||||
self.assertEqual(
|
||||
self.processor.batch_decode(output, skip_special_tokens=True),
|
||||
EXPECTED_DECODED_TEXT,
|
||||
)
|
||||
|
||||
@slow
|
||||
def test_small_model_integration_test_batch_different_resolutions(self):
|
||||
model = Glm4vMoeForConditionalGeneration.from_pretrained(
|
||||
"zai-org/GLM-4.5V", torch_dtype="auto", device_map="auto"
|
||||
)
|
||||
batched_messages = [self.message, self.message2]
|
||||
inputs = self.processor.apply_chat_template(
|
||||
batched_messages,
|
||||
tokenize=True,
|
||||
add_generation_prompt=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
padding=True,
|
||||
).to(torch_device)
|
||||
|
||||
# it should not matter whether two images are the same size or not
|
||||
output = model.generate(**inputs, max_new_tokens=30)
|
||||
|
||||
EXPECTED_DECODED_TEXT = [
|
||||
"\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
|
||||
"\nWhat kind of dog is this?\n<think>Got it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
|
||||
] # fmt: skip
|
||||
self.assertEqual(
|
||||
self.processor.batch_decode(output, skip_special_tokens=True),
|
||||
EXPECTED_DECODED_TEXT,
|
||||
)
|
||||
|
||||
@slow
|
||||
@require_flash_attn
|
||||
@require_torch_gpu
|
||||
def test_small_model_integration_test_batch_flashatt2(self):
|
||||
model = Glm4vMoeForConditionalGeneration.from_pretrained(
|
||||
"zai-org/GLM-4.5V",
|
||||
torch_dtype=torch.bfloat16,
|
||||
attn_implementation="flash_attention_2",
|
||||
device_map="auto",
|
||||
)
|
||||
batched_messages = [self.message, self.message2]
|
||||
inputs = self.processor.apply_chat_template(
|
||||
batched_messages,
|
||||
tokenize=True,
|
||||
add_generation_prompt=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
padding=True,
|
||||
).to(torch_device)
|
||||
|
||||
# it should not matter whether two images are the same size or not
|
||||
output = model.generate(**inputs, max_new_tokens=30)
|
||||
|
||||
EXPECTED_DECODED_TEXT = [
|
||||
"\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture has a stocky build, thick fur, and a face that's",
|
||||
"\nWhat kind of dog is this?\n<think>Got it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
|
||||
] # fmt: skip
|
||||
self.assertEqual(
|
||||
self.processor.batch_decode(output, skip_special_tokens=True),
|
||||
EXPECTED_DECODED_TEXT,
|
||||
)
|
||||
|
||||
@slow
|
||||
@require_flash_attn
|
||||
@require_torch_gpu
|
||||
def test_small_model_integration_test_batch_wo_image_flashatt2(self):
|
||||
model = Glm4vMoeForConditionalGeneration.from_pretrained(
|
||||
"zai-org/GLM-4.5V",
|
||||
torch_dtype=torch.bfloat16,
|
||||
attn_implementation="flash_attention_2",
|
||||
device_map="auto",
|
||||
)
|
||||
message_wo_image = [
|
||||
{"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
|
||||
]
|
||||
batched_messages = [self.message, message_wo_image]
|
||||
inputs = self.processor.apply_chat_template(
|
||||
batched_messages,
|
||||
tokenize=True,
|
||||
add_generation_prompt=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
padding=True,
|
||||
).to(torch_device)
|
||||
|
||||
# it should not matter whether two images are the same size or not
|
||||
output = model.generate(**inputs, max_new_tokens=30)
|
||||
|
||||
EXPECTED_DECODED_TEXT = [
|
||||
"\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
|
||||
'\nWho are you?\n<think>Got it, let\'s look at the question. The user is asking "Who are you?" which is a common question when someone meets an AI'
|
||||
] # fmt: skip
|
||||
|
||||
self.assertEqual(
|
||||
self.processor.batch_decode(output, skip_special_tokens=True),
|
||||
EXPECTED_DECODED_TEXT,
|
||||
)
|
||||
@@ -92,6 +92,7 @@ PRIVATE_MODELS = [
|
||||
"Phi4MultimodalAudioModel",
|
||||
"Phi4MultimodalVisionModel",
|
||||
"Glm4vVisionModel",
|
||||
"Glm4vMoeVisionModel",
|
||||
"EvollaSaProtPreTrainedModel",
|
||||
]
|
||||
|
||||
@@ -158,6 +159,7 @@ IGNORE_NON_TESTED = (
|
||||
"Emu3VQVAE", # Building part of bigger (tested) model
|
||||
"Emu3TextModel", # Building part of bigger (tested) model
|
||||
"Glm4vTextModel", # Building part of bigger (tested) model
|
||||
"Glm4vMoeTextModel", # Building part of bigger (tested) model
|
||||
"Qwen2VLTextModel", # Building part of bigger (tested) model
|
||||
"Qwen2_5_VLTextModel", # Building part of bigger (tested) model
|
||||
"InternVLVisionModel", # Building part of bigger (tested) model
|
||||
|
||||
Reference in New Issue
Block a user