* add changed

* Revert "add changed"

This reverts commit 0a0166a1fe80556115a49fbf0c2132de0f4f85c9.

* update with NEW MODEL class called GLM4

* update

* Update glm4.md

* Name

* style

* fix copies

* fixup test

---------

Co-authored-by: Yuxuan Zhang <2448370773@qq.com>
This commit is contained in:
Arthur
2025-04-09 14:02:04 +02:00
committed by Lysandre
parent 28c9541c1c
commit 5c076fb4d5
15 changed files with 1911 additions and 0 deletions

View File

@@ -461,6 +461,8 @@
title: Gemma2
- local: model_doc/glm
title: GLM
- local: model_doc/glm4
title: glm4
- local: model_doc/openai-gpt
title: GPT
- local: model_doc/gpt_neo

View File

@@ -0,0 +1,45 @@
<!--Copyright 2025 The GLM & ZhipuAI team and The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Glm4
## Overview
To be released with the official model launch.
## Glm4Config
[[autodoc]] Glm4Config
## Glm4Model
[[autodoc]] Glm4Model
- forward
## Glm4ForCausalLM
[[autodoc]] Glm4ForCausalLM
- forward
## Glm4ForSequenceClassification
[[autodoc]] Glm4ForSequenceClassification
- forward
## Glm4ForTokenClassification
[[autodoc]] Glm4ForTokenClassification
- forward

View File

@@ -482,6 +482,7 @@ _import_structure = {
"GitVisionConfig",
],
"models.glm": ["GlmConfig"],
"models.glm4": ["Glm4Config"],
"models.glpn": ["GLPNConfig"],
"models.got_ocr2": [
"GotOcr2Config",
@@ -2526,6 +2527,15 @@ else:
"Llama4PreTrainedModel",
]
)
_import_structure["models.glm4"].extend(
[
"Glm4ForCausalLM",
"Glm4ForSequenceClassification",
"Glm4ForTokenClassification",
"Glm4Model",
"Glm4PreTrainedModel",
]
)
_import_structure["models.glpn"].extend(
[
"GLPNForDepthEstimation",
@@ -5742,6 +5752,7 @@ if TYPE_CHECKING:
GitVisionConfig,
)
from .models.glm import GlmConfig
from .models.glm4 import Glm4Config
from .models.glpn import GLPNConfig
from .models.got_ocr2 import GotOcr2Config, GotOcr2Processor, GotOcr2VisionConfig
from .models.gpt2 import (
@@ -7624,6 +7635,13 @@ if TYPE_CHECKING:
GlmModel,
GlmPreTrainedModel,
)
from .models.glm4 import (
Glm4ForCausalLM,
Glm4ForSequenceClassification,
Glm4ForTokenClassification,
Glm4Model,
Glm4PreTrainedModel,
)
from .models.glpn import (
GLPNForDepthEstimation,
GLPNModel,

View File

@@ -110,6 +110,7 @@ from . import (
gemma3,
git,
glm,
glm4,
glpn,
got_ocr2,
gpt2,

View File

@@ -129,6 +129,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("gemma3_text", "Gemma3TextConfig"),
("git", "GitConfig"),
("glm", "GlmConfig"),
("glm4", "Glm4Config"),
("glpn", "GLPNConfig"),
("got_ocr2", "GotOcr2Config"),
("gpt-sw3", "GPT2Config"),
@@ -476,6 +477,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
("gemma3_text", "Gemma3ForCausalLM"),
("git", "GIT"),
("glm", "GLM"),
("glm4", "glm4"),
("glpn", "GLPN"),
("got_ocr2", "GOT-OCR2"),
("gpt-sw3", "GPT-Sw3"),

View File

@@ -122,6 +122,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("gemma3_text", "Gemma3TextModel"),
("git", "GitModel"),
("glm", "GlmModel"),
("glm4", "Glm4Model"),
("glpn", "GLPNModel"),
("got_ocr2", "GotOcr2ForConditionalGeneration"),
("gpt-sw3", "GPT2Model"),
@@ -532,6 +533,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("gemma3_text", "Gemma3ForCausalLM"),
("git", "GitForCausalLM"),
("glm", "GlmForCausalLM"),
("glm4", "Glm4ForCausalLM"),
("got_ocr2", "GotOcr2ForConditionalGeneration"),
("gpt-sw3", "GPT2LMHeadModel"),
("gpt2", "GPT2LMHeadModel"),
@@ -1035,6 +1037,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("gemma", "GemmaForSequenceClassification"),
("gemma2", "Gemma2ForSequenceClassification"),
("glm", "GlmForSequenceClassification"),
("glm4", "Glm4ForSequenceClassification"),
("gpt-sw3", "GPT2ForSequenceClassification"),
("gpt2", "GPT2ForSequenceClassification"),
("gpt_bigcode", "GPTBigCodeForSequenceClassification"),
@@ -1236,6 +1239,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("gemma", "GemmaForTokenClassification"),
("gemma2", "Gemma2ForTokenClassification"),
("glm", "GlmForTokenClassification"),
("glm4", "Glm4ForTokenClassification"),
("gpt-sw3", "GPT2ForTokenClassification"),
("gpt2", "GPT2ForTokenClassification"),
("gpt_bigcode", "GPTBigCodeForTokenClassification"),

View File

@@ -238,6 +238,7 @@ else:
),
("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
("glm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("glm4", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)),
("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),

View File

@@ -0,0 +1,27 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_glm4 import *
from .modeling_glm4 import *
else:
import sys
_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

View File

@@ -0,0 +1,152 @@
# coding=utf-8
# Copyright 2025 The GLM4 & ZhipuAI team and HuggingFace Inc. team. All rights reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ...configuration_utils import PretrainedConfig
class Glm4Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Glm4Model`]. It is used to instantiate an Glm4
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Glm4-4-9b-chat.
e.g. [THUDM/glm-4-0414-9b-chat-chat](https://huggingface.co/THUDM/glm-4-0414-9b-chat-chat)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 151552):
Vocabulary size of the Glm4 model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`Glm4Model`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 13696):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 40):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*, defaults to 2):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details checkout [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
`num_attention_heads`.
partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position.
head_dim (`int`, *optional*, defaults to 128):
The attention head dimension.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The legacy activation function. It is overwritten by the `hidden_activation`.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
max_position_embeddings (`int`, *optional*, defaults to 131072):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1.5625e-07):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
pad_token_id (`int`, *optional*, defaults to 151329):
Padding token id.
eos_token_id (`int` | `list`, *optional*, defaults to `[151329, 151336, 151338]`):
End of stream token id.
bos_token_id (`int`, *optional*):
Beginning of stream token id.
attention_bias (`bool`, defaults to `False`, *optional*, defaults to `True`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
```python
>>> from transformers import Glm4Model, Glm4Config
>>> # Initializing a Glm4 glm4-4-9b-chat style configuration
>>> configuration = Glm4Config()
>>> # Initializing a model from the glm4-4-9b-chat style configuration
>>> model = Glm4Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "glm4"
keys_to_ignore_at_inference = ["past_key_values"]
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_up_proj": "colwise_rep", # we need to replicate here due to the `chunk` operation
"layers.*.mlp.down_proj": "rowwise_rep", # we need to replicate here due to the `chunk` operation
}
base_model_pp_plan = {
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
"norm": (["hidden_states"], ["hidden_states"]),
}
def __init__(
self,
vocab_size=151552,
hidden_size=4096,
intermediate_size=13696,
num_hidden_layers=40,
num_attention_heads=32,
num_key_value_heads=2,
partial_rotary_factor=0.5,
head_dim=128,
hidden_act="silu",
attention_dropout=0.0,
max_position_embeddings=131072,
initializer_range=0.02,
rms_norm_eps=0.00000015625,
use_cache=True,
tie_word_embeddings=False,
rope_theta=10000.0,
pad_token_id=151329,
eos_token_id=[151329, 151336, 151338],
bos_token_id=None,
attention_bias=True,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.partial_rotary_factor = partial_rotary_factor
self.head_dim = head_dim
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
__all__ = ["Glm4Config"]

View File

@@ -0,0 +1,199 @@
import argparse
import json
import os
import re
import torch
from safetensors.torch import load_file
from tokenizers import processors
from transformers import Glm4Config, Glm4ForCausalLM, PreTrainedTokenizerFast
# fmt: off
# `None` means we drop the key
STATE_DICT_MAPPING = {
# CausalLM keys
r"transformer.output_layer.weight": r"lm_head.weight",
# Model keys
r"transformer.embedding.word_embeddings.weight": r"model.embed_tokens.weight",
r"transformer.rotary_pos_emb.inv_freq": None,
r"transformer.encoder.final_layernorm.weight": r"model.norm.weight",
# Layers keys
r"transformer.encoder.layers.(\d+).input_layernorm.weight": r"model.layers.\1.input_layernorm.weight",
# Sandwich keys
r"transformer.encoder.layers.(\d+).post_mlp_layernorm.weight": r"model.layers.\1.post_mlp_layernorm.weight",
r"transformer.encoder.layers.(\d+).post_self_attn_layernorm.weight": r"model.layers.\1.post_self_attn_layernorm.weight",
r"transformer.encoder.layers.(\d+).post_attention_layernorm.weight": r"model.layers.\1.post_attention_layernorm.weight",
# Attention keys
r"transformer.encoder.layers.(\d+).self_attention.dense.weight": r"model.layers.\1.self_attn.o_proj.weight",
# qkv_proj will later be split in q|k|v|_proj
r"transformer.encoder.layers.(\d+).self_attention.query_key_value.(weight|bias)": r"model.layers.\1.self_attn.qkv_proj.\2",
# MLP keys
r"transformer.encoder.layers.(\d+).mlp.dense_h_to_4h.weight": r"model.layers.\1.mlp.gate_up_proj.weight",
r"transformer.encoder.layers.(\d+).mlp.dense_4h_to_h.weight": r"model.layers.\1.mlp.down_proj.weight",
}
# fmt: on
def load_weights(input_dir: str):
safetensor_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".safetensors")]
bin_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".bin")]
all_weights = {}
if safetensor_files:
safetensor_files = sorted(safetensor_files, key=lambda x: int(x.rsplit("-", 3)[1]))
for file in safetensor_files:
tensors = load_file(file)
all_weights.update(tensors)
return all_weights
elif bin_files:
bin_files = sorted(bin_files, key=lambda x: int(x.rsplit("-", 3)[1]))
for file in bin_files:
tensors = torch.load(file, map_location="cpu")
all_weights.update(tensors)
return all_weights
else:
raise ValueError("No .safetensors or .bin files found in the specified directory.")
def map_old_key_to_new(old_key):
for pattern, replacement in STATE_DICT_MAPPING.items():
if replacement is None:
if re.fullmatch(pattern, old_key):
return None
else:
new_key, n_replace = re.subn(pattern, replacement, old_key)
# Early exit of the loop
if n_replace > 0:
return new_key
raise ValueError(f"Key: {old_key} could not be mapped (check the mapping).")
def convert_state_dict(original_state_dict: dict, config: Glm4Config):
new_dict = {}
head_dim = config.hidden_size // config.num_attention_heads
query_size = config.num_attention_heads * head_dim
kv_size = config.num_key_value_heads * head_dim
for old_key, value in original_state_dict.items():
new_key = map_old_key_to_new(old_key)
if new_key is None:
continue
if "qkv_proj." in new_key:
q_proj, k_proj, v_proj = (
value[:query_size, ...],
value[query_size : query_size + kv_size, ...],
value[query_size + kv_size :, ...],
)
new_dict[new_key.replace("qkv_proj.", "q_proj.")] = q_proj
new_dict[new_key.replace("qkv_proj.", "k_proj.")] = k_proj
new_dict[new_key.replace("qkv_proj.", "v_proj.")] = v_proj
else:
new_dict[new_key] = value
return new_dict
def convert_config(original_config: dict):
key_mapping = {
"vocab_size": "padded_vocab_size",
"intermediate_size": "ffn_hidden_size",
"num_hidden_layers": "num_layers",
"max_position_embeddings": "seq_length",
"rms_norm_eps": "layernorm_epsilon",
"head_dim": "kv_channels",
"attention_bias": "add_qkv_bias",
}
similar_keys_to_keep = [
"num_attention_heads",
"hidden_size",
"attention_dropout",
"use_cache",
"eos_token_id",
"pad_token_id",
"tie_word_embeddings",
]
new_config_kwargs = {k: original_config[v] for k, v in key_mapping.items()}
new_config_kwargs.update({k: v for k, v in original_config.items() if k in similar_keys_to_keep})
new_config_kwargs["num_key_value_heads"] = (
new_config_kwargs["num_attention_heads"]
if not original_config["multi_query_attention"]
else original_config["multi_query_group_num"]
)
new_config_kwargs["rope_theta"] = 10000.0 * getattr(original_config, "rope_ratio", 1)
new_config = Glm4Config(**new_config_kwargs)
return new_config
def convert_glm4_tokenizer(input_dir, use_post_processor=False):
fast_tok = PreTrainedTokenizerFast.from_pretrained(input_dir, model_input_names=["input_ids", "attention_mask"])
if use_post_processor:
fast_tok._tokenizer.post_processor = processors.Sequence(
[
processors.ByteLevel(trim_offsets=False),
processors.TemplateProcessing(
single="[gMASK]:0 <sop>:0 $A:0",
pair="[gMASK]:0 <sop>:0 $A:0 $B:1",
special_tokens=[("[gMASK]", 151331), ("<sop>", 151333)],
),
],
)
else:
fast_tok._tokenizer.post_processor = processors.Sequence(
[processors.ByteLevel(trim_offsets=False)],
)
return fast_tok
def convert_glm4_model(input_dir, output_dir, use_post_processor=False):
# Load and convert config
with open(os.path.join(input_dir, "config.json")) as f:
original_config = json.load(f)
config = convert_config(original_config)
config.save_pretrained(output_dir)
# Load and convert weights
original_state_dict = load_weights(input_dir)
new_dict = convert_state_dict(original_state_dict, config)
with torch.device("meta"):
model = Glm4ForCausalLM(config)
model.load_state_dict(new_dict, strict=True, assign=True)
model.save_pretrained(output_dir)
# Load and convert tokenizer
tokenizer = convert_glm4_tokenizer(input_dir, use_post_processor)
tokenizer.save_pretrained(output_dir)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"input_dir",
type=str,
help="Location of the local folder copied from the Hub.",
)
parser.add_argument(
"output_dir",
type=str,
help="Location to write HF model and tokenizer",
)
parser.add_argument(
"--use_post_processor",
action="store_true",
help="Whether to apply post processor with special tokens",
)
args = parser.parse_args()
convert_glm4_model(args.input_dir, args.output_dir, args.use_post_processor)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,164 @@
# coding=utf-8
# Copyright 2025 The GLM4 & ZhipuAI team and HuggingFace Inc. team. All rights reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Tuple, Union
import torch.nn as nn
import torch.utils.checkpoint
from ...cache_utils import Cache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import CausalLMOutputWithPast
from ...processing_utils import Unpack
from ...utils import LossKwargs, logging
from ..glm.modeling_glm import (
GlmAttention,
GlmForCausalLM,
GlmForSequenceClassification,
GlmForTokenClassification,
)
from ..phi3.modeling_phi3 import Phi3MLP
from .configuration_glm4 import Glm4Config
from .modeling_glm4 import Glm4RMSNorm
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "THUDM/GLM-4-9B-Chat-0414"
class Glm4MLP(Phi3MLP):
pass
class Glm4DecoderLayer(nn.Module):
def __init__(self, config: Glm4Config, layer_idx: int):
super().__init__()
self.hidden_size = config.hidden_size
self.self_attn = Glm4Attention(config=config, layer_idx=layer_idx)
self.mlp = Glm4MLP(config)
self.input_layernorm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_attention_layernorm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_self_attn_layernorm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_mlp_layernorm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
**kwargs: Unpack[FlashAttentionKwargs],
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
# Self Attention
hidden_states, self_attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
position_embeddings=position_embeddings,
**kwargs,
)
hidden_states = self.post_self_attn_layernorm(hidden_states)
hidden_states = residual + hidden_states
# Fully Connected
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = self.post_mlp_layernorm(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
return outputs
class Glm4Attention(GlmAttention):
pass
class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
class Glm4ForCausalLM(GlmForCausalLM):
def forward(
self,
**super_kwargs: Unpack[KwargsForCausalLM],
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
logits_to_keep (`int` or `torch.Tensor`, *optional*):
If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
`input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
This is useful when using packed tensor format (single dimension for batch and sequence length).
Returns:
Example:
```python
>>> from transformers import AutoTokenizer, Glm4ForCausalLM
>>> model = Glm4ForCausalLM.from_pretrained("THUDM/GLM-4-9B-Chat-0414")
>>> tokenizer = AutoTokenizer.from_pretrained("THUDM/GLM-4-9B-Chat-0414")
>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")
>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```"""
return super().forward(**super_kwargs)
class Glm4ForSequenceClassification(GlmForSequenceClassification):
pass
class Glm4ForTokenClassification(GlmForTokenClassification):
pass
__all__ = [
"Glm4PreTrainedModel", # noqa: F822
"Glm4Model", # noqa: F822
"Glm4ForCausalLM",
"Glm4ForSequenceClassification",
"Glm4ForTokenClassification",
]

View File

@@ -4740,6 +4740,41 @@ class GlmPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
class Glm4ForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class Glm4ForSequenceClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class Glm4ForTokenClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class Glm4Model(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class Glm4PreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class GLPNForDepthEstimation(metaclass=DummyObject):
_backends = ["torch"]

View File

View File

@@ -0,0 +1,205 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Glm4 model."""
import unittest
import pytest
from transformers import AutoModelForCausalLM, AutoTokenizer, Glm4Config, is_torch_available
from transformers.testing_utils import (
require_flash_attn,
require_torch,
require_torch_large_gpu,
require_torch_sdpa,
slow,
torch_device,
)
from ...models.gemma.test_modeling_gemma import GemmaModelTest, GemmaModelTester
from ...test_configuration_common import ConfigTester
if is_torch_available():
import torch
from transformers import (
Glm4ForCausalLM,
Glm4ForSequenceClassification,
Glm4ForTokenClassification,
Glm4Model,
)
class Glm4ModelTester(GemmaModelTester):
if is_torch_available():
config_class = Glm4Config
model_class = Glm4Model
for_causal_lm_class = Glm4ForCausalLM
for_sequence_class = Glm4ForSequenceClassification
for_token_class = Glm4ForTokenClassification
@require_torch
class Glm4ModelTest(GemmaModelTest, unittest.TestCase):
all_model_classes = (
(Glm4Model, Glm4ForCausalLM, Glm4ForSequenceClassification, Glm4ForTokenClassification)
if is_torch_available()
else ()
)
pipeline_model_mapping = (
{
"feature-extraction": Glm4Model,
"text-classification": Glm4ForSequenceClassification,
"token-classification": Glm4ForTokenClassification,
"text-generation": Glm4ForCausalLM,
"zero-shot": Glm4ForSequenceClassification,
}
if is_torch_available()
else {}
)
test_headmasking = False
test_pruning = False
_is_stateful = True
model_split_percents = [0.5, 0.6]
def setUp(self):
self.model_tester = Glm4ModelTester(self)
self.config_tester = ConfigTester(self, config_class=Glm4Config, hidden_size=37)
@slow
@require_torch_large_gpu
class Glm4IntegrationTest(unittest.TestCase):
input_text = ["Hello I am doing", "Hi today"]
model_id = "THUDM/glm-4-0414-9b-chat"
revision = "refs/pr/15"
# This variable is used to determine which CUDA device are we using for our runners (A10 or T4)
# Depending on the hardware we get different logits / generations
cuda_compute_capability_major_version = None
@classmethod
def setUpClass(cls):
if is_torch_available() and torch.cuda.is_available():
# 8 is for A100 / A10 and 7 for T4
cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0]
def test_model_9b_fp16(self):
EXPECTED_TEXTS = [
"Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the",
"Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.",
]
model = AutoModelForCausalLM.from_pretrained(
self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, revision=self.revision
).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
self.assertEqual(output_text, EXPECTED_TEXTS)
def test_model_9b_bf16(self):
EXPECTED_TEXTS = [
"Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the",
"Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.",
]
model = AutoModelForCausalLM.from_pretrained(
self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, revision=self.revision
).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
self.assertEqual(output_text, EXPECTED_TEXTS)
def test_model_9b_eager(self):
EXPECTED_TEXTS = [
"Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the",
"Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.",
]
model = AutoModelForCausalLM.from_pretrained(
self.model_id,
low_cpu_mem_usage=True,
torch_dtype=torch.bfloat16,
attn_implementation="eager",
revision=self.revision,
)
model.to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
self.assertEqual(output_text, EXPECTED_TEXTS)
@require_torch_sdpa
def test_model_9b_sdpa(self):
EXPECTED_TEXTS = [
"Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the",
"Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.",
]
model = AutoModelForCausalLM.from_pretrained(
self.model_id,
low_cpu_mem_usage=True,
torch_dtype=torch.bfloat16,
attn_implementation="sdpa",
revision=self.revision,
)
model.to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
self.assertEqual(output_text, EXPECTED_TEXTS)
@require_flash_attn
@pytest.mark.flash_attn_test
def test_model_9b_flash_attn(self):
EXPECTED_TEXTS = [
"Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the",
"Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.",
]
model = AutoModelForCausalLM.from_pretrained(
self.model_id,
low_cpu_mem_usage=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
revision=self.revision,
)
model.to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
self.assertEqual(output_text, EXPECTED_TEXTS)