Add glm4 (#37388)

* add changed * Revert "add changed" This reverts commit 0a0166a1fe80556115a49fbf0c2132de0f4f85c9. * update with NEW MODEL class called GLM4 * update * Update glm4.md * Name * style * fix copies * fixup test --------- Co-authored-by: Yuxuan Zhang <2448370773@qq.com>
2025-04-09 14:02:04 +02:00
parent 28c9541c1c
commit 5c076fb4d5
15 changed files with 1911 additions and 0 deletions
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -461,6 +461,8 @@
        title: Gemma2
      - local: model_doc/glm
        title: GLM
+      - local: model_doc/glm4
+        title: glm4
      - local: model_doc/openai-gpt
        title: GPT
      - local: model_doc/gpt_neo
--- a/docs/source/en/model_doc/glm4.md
+++ b/docs/source/en/model_doc/glm4.md
@@ -0,0 +1,45 @@
+<!--Copyright 2025 The GLM & ZhipuAI team and The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Glm4
+
+## Overview
+
+To be released with the official model launch.
+
+## Glm4Config
+
+[[autodoc]] Glm4Config
+
+## Glm4Model
+
+[[autodoc]] Glm4Model
+    - forward
+
+## Glm4ForCausalLM
+
+[[autodoc]] Glm4ForCausalLM
+    - forward
+
+## Glm4ForSequenceClassification
+
+[[autodoc]] Glm4ForSequenceClassification
+    - forward
+
+## Glm4ForTokenClassification
+
+[[autodoc]] Glm4ForTokenClassification
+    - forward
--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@@ -482,6 +482,7 @@ _import_structure = {
        "GitVisionConfig",
    ],
    "models.glm": ["GlmConfig"],
+    "models.glm4": ["Glm4Config"],
    "models.glpn": ["GLPNConfig"],
    "models.got_ocr2": [
        "GotOcr2Config",
@@ -2526,6 +2527,15 @@ else:
            "Llama4PreTrainedModel",
        ]
    )
+    _import_structure["models.glm4"].extend(
+        [
+            "Glm4ForCausalLM",
+            "Glm4ForSequenceClassification",
+            "Glm4ForTokenClassification",
+            "Glm4Model",
+            "Glm4PreTrainedModel",
+        ]
+    )
    _import_structure["models.glpn"].extend(
        [
            "GLPNForDepthEstimation",
@@ -5742,6 +5752,7 @@ if TYPE_CHECKING:
        GitVisionConfig,
    )
    from .models.glm import GlmConfig
+    from .models.glm4 import Glm4Config
    from .models.glpn import GLPNConfig
    from .models.got_ocr2 import GotOcr2Config, GotOcr2Processor, GotOcr2VisionConfig
    from .models.gpt2 import (
@@ -7624,6 +7635,13 @@ if TYPE_CHECKING:
            GlmModel,
            GlmPreTrainedModel,
        )
+        from .models.glm4 import (
+            Glm4ForCausalLM,
+            Glm4ForSequenceClassification,
+            Glm4ForTokenClassification,
+            Glm4Model,
+            Glm4PreTrainedModel,
+        )
        from .models.glpn import (
            GLPNForDepthEstimation,
            GLPNModel,
--- a/src/transformers/models/init.py
+++ b/src/transformers/models/init.py
@@ -110,6 +110,7 @@ from . import (
    gemma3,
    git,
    glm,
+    glm4,
    glpn,
    got_ocr2,
    gpt2,
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -129,6 +129,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
        ("gemma3_text", "Gemma3TextConfig"),
        ("git", "GitConfig"),
        ("glm", "GlmConfig"),
+        ("glm4", "Glm4Config"),
        ("glpn", "GLPNConfig"),
        ("got_ocr2", "GotOcr2Config"),
        ("gpt-sw3", "GPT2Config"),
@@ -476,6 +477,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
        ("gemma3_text", "Gemma3ForCausalLM"),
        ("git", "GIT"),
        ("glm", "GLM"),
+        ("glm4", "glm4"),
        ("glpn", "GLPN"),
        ("got_ocr2", "GOT-OCR2"),
        ("gpt-sw3", "GPT-Sw3"),
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -122,6 +122,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
        ("gemma3_text", "Gemma3TextModel"),
        ("git", "GitModel"),
        ("glm", "GlmModel"),
+        ("glm4", "Glm4Model"),
        ("glpn", "GLPNModel"),
        ("got_ocr2", "GotOcr2ForConditionalGeneration"),
        ("gpt-sw3", "GPT2Model"),
@@ -532,6 +533,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
        ("gemma3_text", "Gemma3ForCausalLM"),
        ("git", "GitForCausalLM"),
        ("glm", "GlmForCausalLM"),
+        ("glm4", "Glm4ForCausalLM"),
        ("got_ocr2", "GotOcr2ForConditionalGeneration"),
        ("gpt-sw3", "GPT2LMHeadModel"),
        ("gpt2", "GPT2LMHeadModel"),
@@ -1035,6 +1037,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
        ("gemma", "GemmaForSequenceClassification"),
        ("gemma2", "Gemma2ForSequenceClassification"),
        ("glm", "GlmForSequenceClassification"),
+        ("glm4", "Glm4ForSequenceClassification"),
        ("gpt-sw3", "GPT2ForSequenceClassification"),
        ("gpt2", "GPT2ForSequenceClassification"),
        ("gpt_bigcode", "GPTBigCodeForSequenceClassification"),
@@ -1236,6 +1239,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
        ("gemma", "GemmaForTokenClassification"),
        ("gemma2", "Gemma2ForTokenClassification"),
        ("glm", "GlmForTokenClassification"),
+        ("glm4", "Glm4ForTokenClassification"),
        ("gpt-sw3", "GPT2ForTokenClassification"),
        ("gpt2", "GPT2ForTokenClassification"),
        ("gpt_bigcode", "GPTBigCodeForTokenClassification"),
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -238,6 +238,7 @@ else:
            ),
            ("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("glm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
+            ("glm4", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
            ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)),
            ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
--- a/src/transformers/models/glm4/init.py
+++ b/src/transformers/models/glm4/init.py
@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    # Static type checkers get the real symbols through eager star-imports.
+    from .configuration_glm4 import *
+    from .modeling_glm4 import *
+else:
+    import sys
+
+    # At runtime this module object is replaced in `sys.modules` by a `_LazyModule` built from
+    # `define_import_structure(_file)` (presumably so submodules are only imported on first attribute access).
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
--- a/src/transformers/models/glm4/configuration_glm4.py
+++ b/src/transformers/models/glm4/configuration_glm4.py
@@ -0,0 +1,152 @@
+# coding=utf-8
+# Copyright 2025 The GLM4 & ZhipuAI team and HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PretrainedConfig
+
+
+class Glm4Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Glm4Model`]. It is used to instantiate an Glm4
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Glm4-4-9b-chat.
+    e.g. [THUDM/glm-4-0414-9b-chat-chat](https://huggingface.co/THUDM/glm-4-0414-9b-chat-chat)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151552):
+            Vocabulary size of the Glm4 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Glm4Model`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 13696):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 40):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 2):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position.
+        head_dim (`int`, *optional*, defaults to 128):
+            The attention head dimension.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The legacy activation function. It is overwritten by the `hidden_activation`.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 131072):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1.5625e-07):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        pad_token_id (`int`, *optional*, defaults to 151329):
+            Padding token id.
+        eos_token_id (`int` | `list`, *optional*, defaults to `[151329, 151336, 151338]`):
+            End of stream token id.
+        bos_token_id (`int`, *optional*):
+            Beginning of stream token id.
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `True`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+    ```python
+    >>> from transformers import Glm4Model, Glm4Config
+    >>> # Initializing a Glm4 glm4-4-9b-chat style configuration
+    >>> configuration = Glm4Config()
+    >>> # Initializing a model from the glm4-4-9b-chat style configuration
+    >>> model = Glm4Model(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "glm4"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_up_proj": "colwise_rep",  # we need to replicate here due to the `chunk` operation
+        "layers.*.mlp.down_proj": "rowwise_rep",  # we need to replicate here due to the `chunk` operation
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=151552,
+        hidden_size=4096,
+        intermediate_size=13696,
+        num_hidden_layers=40,
+        num_attention_heads=32,
+        num_key_value_heads=2,
+        partial_rotary_factor=0.5,
+        head_dim=128,
+        hidden_act="silu",
+        attention_dropout=0.0,
+        max_position_embeddings=131072,
+        initializer_range=0.02,
+        rms_norm_eps=0.00000015625,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        pad_token_id=151329,
+        eos_token_id=[151329, 151336, 151338],
+        bos_token_id=None,
+        attention_bias=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.partial_rotary_factor = partial_rotary_factor
+        self.head_dim = head_dim
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["Glm4Config"]
--- a/src/transformers/models/glm4/convert_glm4_weights_to_hf.py
+++ b/src/transformers/models/glm4/convert_glm4_weights_to_hf.py
@@ -0,0 +1,199 @@
+import argparse
+import json
+import os
+import re
+
+import torch
+from safetensors.torch import load_file
+from tokenizers import processors
+
+from transformers import Glm4Config, Glm4ForCausalLM, PreTrainedTokenizerFast
+
+
+# fmt: off
+# `None` means we drop the key
+# Ordered table of regex pattern -> replacement template for parameter names. `map_old_key_to_new` walks it in
+# insertion order and returns on the first pattern that matches; capture groups (layer index, weight|bias) are
+# re-inserted through `\1` / `\2`.
+# NOTE(review): the `.` characters in the patterns are unescaped regex wildcards, not literal dots.
+STATE_DICT_MAPPING = {
+    # CausalLM keys
+    r"transformer.output_layer.weight":                                               r"lm_head.weight",
+
+    # Model keys
+    r"transformer.embedding.word_embeddings.weight":                                  r"model.embed_tokens.weight",
+    r"transformer.rotary_pos_emb.inv_freq":                                           None,
+    r"transformer.encoder.final_layernorm.weight":                                    r"model.norm.weight",
+
+    # Layers keys
+    r"transformer.encoder.layers.(\d+).input_layernorm.weight":                       r"model.layers.\1.input_layernorm.weight",
+
+    # Sandwich keys
+    r"transformer.encoder.layers.(\d+).post_mlp_layernorm.weight":                    r"model.layers.\1.post_mlp_layernorm.weight",
+    r"transformer.encoder.layers.(\d+).post_self_attn_layernorm.weight":              r"model.layers.\1.post_self_attn_layernorm.weight",
+
+    r"transformer.encoder.layers.(\d+).post_attention_layernorm.weight":              r"model.layers.\1.post_attention_layernorm.weight",
+
+    # Attention keys
+    r"transformer.encoder.layers.(\d+).self_attention.dense.weight":                  r"model.layers.\1.self_attn.o_proj.weight",
+    # qkv_proj will later be split in q|k|v|_proj
+    r"transformer.encoder.layers.(\d+).self_attention.query_key_value.(weight|bias)": r"model.layers.\1.self_attn.qkv_proj.\2",
+
+    # MLP keys
+    r"transformer.encoder.layers.(\d+).mlp.dense_h_to_4h.weight":                     r"model.layers.\1.mlp.gate_up_proj.weight",
+    r"transformer.encoder.layers.(\d+).mlp.dense_4h_to_h.weight":                     r"model.layers.\1.mlp.down_proj.weight",
+}
+# fmt: on
+
+
+def load_weights(input_dir: str):
+    safetensor_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".safetensors")]
+    bin_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".bin")]
+
+    all_weights = {}
+
+    if safetensor_files:
+        safetensor_files = sorted(safetensor_files, key=lambda x: int(x.rsplit("-", 3)[1]))
+        for file in safetensor_files:
+            tensors = load_file(file)
+            all_weights.update(tensors)
+        return all_weights
+
+    elif bin_files:
+        bin_files = sorted(bin_files, key=lambda x: int(x.rsplit("-", 3)[1]))
+        for file in bin_files:
+            tensors = torch.load(file, map_location="cpu")
+            all_weights.update(tensors)
+        return all_weights
+
+    else:
+        raise ValueError("No .safetensors or .bin files found in the specified directory.")
+
+
+def map_old_key_to_new(old_key):
+    for pattern, replacement in STATE_DICT_MAPPING.items():
+        if replacement is None:
+            if re.fullmatch(pattern, old_key):
+                return None
+        else:
+            new_key, n_replace = re.subn(pattern, replacement, old_key)
+            # Early exit of the loop
+            if n_replace > 0:
+                return new_key
+
+    raise ValueError(f"Key: {old_key} could not be mapped (check the mapping).")
+
+
+def convert_state_dict(original_state_dict: dict, config: Glm4Config):
+    new_dict = {}
+
+    head_dim = config.hidden_size // config.num_attention_heads
+    query_size = config.num_attention_heads * head_dim
+    kv_size = config.num_key_value_heads * head_dim
+
+    for old_key, value in original_state_dict.items():
+        new_key = map_old_key_to_new(old_key)
+        if new_key is None:
+            continue
+
+        if "qkv_proj." in new_key:
+            q_proj, k_proj, v_proj = (
+                value[:query_size, ...],
+                value[query_size : query_size + kv_size, ...],
+                value[query_size + kv_size :, ...],
+            )
+            new_dict[new_key.replace("qkv_proj.", "q_proj.")] = q_proj
+            new_dict[new_key.replace("qkv_proj.", "k_proj.")] = k_proj
+            new_dict[new_key.replace("qkv_proj.", "v_proj.")] = v_proj
+        else:
+            new_dict[new_key] = value
+    return new_dict
+
+
+def convert_config(original_config: dict):
+    key_mapping = {
+        "vocab_size": "padded_vocab_size",
+        "intermediate_size": "ffn_hidden_size",
+        "num_hidden_layers": "num_layers",
+        "max_position_embeddings": "seq_length",
+        "rms_norm_eps": "layernorm_epsilon",
+        "head_dim": "kv_channels",
+        "attention_bias": "add_qkv_bias",
+    }
+    similar_keys_to_keep = [
+        "num_attention_heads",
+        "hidden_size",
+        "attention_dropout",
+        "use_cache",
+        "eos_token_id",
+        "pad_token_id",
+        "tie_word_embeddings",
+    ]
+    new_config_kwargs = {k: original_config[v] for k, v in key_mapping.items()}
+    new_config_kwargs.update({k: v for k, v in original_config.items() if k in similar_keys_to_keep})
+    new_config_kwargs["num_key_value_heads"] = (
+        new_config_kwargs["num_attention_heads"]
+        if not original_config["multi_query_attention"]
+        else original_config["multi_query_group_num"]
+    )
+    new_config_kwargs["rope_theta"] = 10000.0 * getattr(original_config, "rope_ratio", 1)
+
+    new_config = Glm4Config(**new_config_kwargs)
+    return new_config
+
+
+def convert_glm4_tokenizer(input_dir, use_post_processor=False):
+    fast_tok = PreTrainedTokenizerFast.from_pretrained(input_dir, model_input_names=["input_ids", "attention_mask"])
+    if use_post_processor:
+        fast_tok._tokenizer.post_processor = processors.Sequence(
+            [
+                processors.ByteLevel(trim_offsets=False),
+                processors.TemplateProcessing(
+                    single="[gMASK]:0 <sop>:0 $A:0",
+                    pair="[gMASK]:0 <sop>:0 $A:0 $B:1",
+                    special_tokens=[("[gMASK]", 151331), ("<sop>", 151333)],
+                ),
+            ],
+        )
+    else:
+        fast_tok._tokenizer.post_processor = processors.Sequence(
+            [processors.ByteLevel(trim_offsets=False)],
+        )
+    return fast_tok
+
+
+def convert_glm4_model(input_dir, output_dir, use_post_processor=False):
+    # Load and convert config
+    with open(os.path.join(input_dir, "config.json")) as f:
+        original_config = json.load(f)
+    config = convert_config(original_config)
+    config.save_pretrained(output_dir)
+
+    # Load and convert weights
+    original_state_dict = load_weights(input_dir)
+    new_dict = convert_state_dict(original_state_dict, config)
+    with torch.device("meta"):
+        model = Glm4ForCausalLM(config)
+    model.load_state_dict(new_dict, strict=True, assign=True)
+    model.save_pretrained(output_dir)
+
+    # Load and convert tokenizer
+    tokenizer = convert_glm4_tokenizer(input_dir, use_post_processor)
+    tokenizer.save_pretrained(output_dir)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "input_dir",
+        type=str,
+        help="Location of the local folder copied from the Hub.",
+    )
+    parser.add_argument(
+        "output_dir",
+        type=str,
+        help="Location to write HF model and tokenizer",
+    )
+    parser.add_argument(
+        "--use_post_processor",
+        action="store_true",
+        help="Whether to apply post processor with special tokens",
+    )
+    args = parser.parse_args()
+    convert_glm4_model(args.input_dir, args.output_dir, args.use_post_processor)
--- a/src/transformers/models/glm4/modeling_glm4.py
+++ b/src/transformers/models/glm4/modeling_glm4.py
--- a/src/transformers/models/glm4/modular_glm4.py
+++ b/src/transformers/models/glm4/modular_glm4.py
@@ -0,0 +1,164 @@
+# coding=utf-8
+# Copyright 2025 The GLM4 & ZhipuAI team and HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Tuple, Union
+
+import torch.nn as nn
+import torch.utils.checkpoint
+
+from ...cache_utils import Cache
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_outputs import CausalLMOutputWithPast
+from ...processing_utils import Unpack
+from ...utils import LossKwargs, logging
+from ..glm.modeling_glm import (
+    GlmAttention,
+    GlmForCausalLM,
+    GlmForSequenceClassification,
+    GlmForTokenClassification,
+)
+from ..phi3.modeling_phi3 import Phi3MLP
+from .configuration_glm4 import Glm4Config
+from .modeling_glm4 import Glm4RMSNorm
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "THUDM/GLM-4-9B-Chat-0414"
+
+
+# GLM-4 uses the Phi3 MLP implementation unchanged; the subclass only provides the Glm4-prefixed name.
+class Glm4MLP(Phi3MLP):
+    pass
+
+
+class Glm4DecoderLayer(nn.Module):
+    """
+    One GLM-4 decoder block: self-attention followed by an MLP, each wrapped in a residual connection.
+
+    Unlike a plain pre-norm block, each sub-layer is normalized both before it runs (`input_layernorm`,
+    `post_attention_layernorm`) and after it runs, before the residual add (`post_self_attn_layernorm`,
+    `post_mlp_layernorm`).
+    """
+
+    def __init__(self, config: Glm4Config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Glm4Attention(config=config, layer_idx=layer_idx)
+
+        self.mlp = Glm4MLP(config)
+        # Norm applied to the block input, before self-attention.
+        self.input_layernorm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        # Norm applied before the MLP (i.e. after the attention residual add).
+        self.post_attention_layernorm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        # Norms applied to each sub-layer's output before it is added back to the residual.
+        self.post_self_attn_layernorm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_mlp_layernorm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Run the block on `hidden_states`.
+
+        All arguments other than `hidden_states` are forwarded unchanged to the self-attention module.
+
+        Returns:
+            A tuple whose first element is the updated hidden states; the attention weights returned by the
+            attention module are appended as a second element when `output_attentions` is truthy.
+        """
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+
+        # Normalize the attention output before adding it back to the residual stream.
+        hidden_states = self.post_self_attn_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        # Same pattern for the MLP output: normalize first, then residual add.
+        hidden_states = self.post_mlp_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        return outputs
+
+
+# Attention is inherited from GLM without modification; only the class name changes.
+class Glm4Attention(GlmAttention):
+    pass
+
+
+# Combined keyword-argument type (flash-attention kwargs + loss kwargs) used to annotate `Glm4ForCausalLM.forward`.
+class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
+
+
+class Glm4ForCausalLM(GlmForCausalLM):
+    def forward(
+        self,
+        **super_kwargs: Unpack[KwargsForCausalLM],
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+            logits_to_keep (`int` or `torch.Tensor`, *optional*):
+                If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
+                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+                If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
+                This is useful when using packed tensor format (single dimension for batch and sequence length).
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, Glm4ForCausalLM
+
+        >>> model = Glm4ForCausalLM.from_pretrained("THUDM/GLM-4-9B-Chat-0414")
+        >>> tokenizer = AutoTokenizer.from_pretrained("THUDM/GLM-4-9B-Chat-0414")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        # No GLM-4 specific logic here: the override carries the docstring above and delegates the whole
+        # computation, with the caller's keyword arguments untouched, to `GlmForCausalLM.forward`.
+        return super().forward(**super_kwargs)
+
+
+# Sequence-classification head inherited from GLM without modification; only the class name changes.
+class Glm4ForSequenceClassification(GlmForSequenceClassification):
+    pass
+
+
+# Token-classification head inherited from GLM without modification; only the class name changes.
+class Glm4ForTokenClassification(GlmForTokenClassification):
+    pass
+
+
+# Public names of the glm4 modeling module. The first two entries are not defined in this file, hence the
+# F822 (undefined name in `__all__`) suppressions — presumably they only exist in the derived
+# `modeling_glm4.py`; TODO confirm.
+__all__ = [
+    "Glm4PreTrainedModel",  # noqa: F822
+    "Glm4Model",  # noqa: F822
+    "Glm4ForCausalLM",
+    "Glm4ForSequenceClassification",
+    "Glm4ForTokenClassification",
+]
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -4740,6 +4740,41 @@ class GlmPreTrainedModel(metaclass=DummyObject):
        requires_backends(self, ["torch"])


+# Dummy stand-in for `Glm4ForCausalLM`: constructing it calls `requires_backends` for the "torch" backend
+# (presumably raising a helpful error when torch is not installed — see `requires_backends`).
+class Glm4ForCausalLM(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+# Dummy stand-in for `Glm4ForSequenceClassification`: constructing it calls `requires_backends` for the
+# "torch" backend (presumably raising a helpful error when torch is not installed — see `requires_backends`).
+class Glm4ForSequenceClassification(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+# Dummy stand-in for `Glm4ForTokenClassification`: constructing it calls `requires_backends` for the
+# "torch" backend (presumably raising a helpful error when torch is not installed — see `requires_backends`).
+class Glm4ForTokenClassification(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+# Dummy stand-in for `Glm4Model`: constructing it calls `requires_backends` for the "torch" backend
+# (presumably raising a helpful error when torch is not installed — see `requires_backends`).
+class Glm4Model(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+# Dummy stand-in for `Glm4PreTrainedModel`: constructing it calls `requires_backends` for the "torch" backend
+# (presumably raising a helpful error when torch is not installed — see `requires_backends`).
+class Glm4PreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class GLPNForDepthEstimation(metaclass=DummyObject):
    _backends = ["torch"]

--- a/tests/models/glm4/init.py
+++ b/tests/models/glm4/init.py
--- a/tests/models/glm4/test_modeling_glm4.py
+++ b/tests/models/glm4/test_modeling_glm4.py
@@ -0,0 +1,205 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch Glm4 model."""
+
+import unittest
+
+import pytest
+
+from transformers import AutoModelForCausalLM, AutoTokenizer, Glm4Config, is_torch_available
+from transformers.testing_utils import (
+    require_flash_attn,
+    require_torch,
+    require_torch_large_gpu,
+    require_torch_sdpa,
+    slow,
+    torch_device,
+)
+
+from ...models.gemma.test_modeling_gemma import GemmaModelTest, GemmaModelTester
+from ...test_configuration_common import ConfigTester
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import (
+        Glm4ForCausalLM,
+        Glm4ForSequenceClassification,
+        Glm4ForTokenClassification,
+        Glm4Model,
+    )
+
+
+class Glm4ModelTester(GemmaModelTester):  # Gemma tester with the Glm4 config/model classes plugged in
+    if is_torch_available():  # the Glm4 model classes are only imported when torch is available
+        config_class = Glm4Config
+        model_class = Glm4Model
+        for_causal_lm_class = Glm4ForCausalLM
+        for_sequence_class = Glm4ForSequenceClassification
+        for_token_class = Glm4ForTokenClassification
+
+
+@require_torch
+class Glm4ModelTest(GemmaModelTest, unittest.TestCase):
+    # Reuses the test methods of GemmaModelTest, pointed at the Glm4 classes declared below.
+    # NOTE(review): these flags are presumably read by the inherited tests -- confirm in the base class.
+    model_split_percents = [0.5, 0.6]
+    _is_stateful = True
+    test_pruning = False
+    test_headmasking = False
+
+    # The Glm4 model classes are only imported when torch is available, hence the empty fallbacks.
+    if is_torch_available():
+        all_model_classes = (
+            Glm4Model,
+            Glm4ForCausalLM,
+            Glm4ForSequenceClassification,
+            Glm4ForTokenClassification,
+        )
+        pipeline_model_mapping = {
+            "feature-extraction": Glm4Model,
+            "text-classification": Glm4ForSequenceClassification,
+            "token-classification": Glm4ForTokenClassification,
+            "text-generation": Glm4ForCausalLM,
+            "zero-shot": Glm4ForSequenceClassification,
+        }
+    else:
+        all_model_classes = ()
+        pipeline_model_mapping = {}
+
+    def setUp(self):
+        # The model tester is attached before the config tester is built, as in the usual setup order.
+        tester = Glm4ModelTester(self)
+        self.model_tester = tester
+        self.config_tester = ConfigTester(self, config_class=Glm4Config, hidden_size=37)
+
+
+@slow
+@require_torch_large_gpu
+class Glm4IntegrationTest(unittest.TestCase):
+    """Greedy-generation regression tests against the `THUDM/glm-4-0414-9b-chat` checkpoint.
+
+    Each test loads the checkpoint with a different dtype / attention implementation, generates 20 new
+    tokens greedily for `input_text`, and compares the decoded batch with `EXPECTED_TEXTS`.
+    """
+
+    input_text = ["Hello I am doing", "Hi today"]
+    model_id = "THUDM/glm-4-0414-9b-chat"
+    revision = "refs/pr/15"
+    # Every load configuration exercised below is expected to decode to exactly these strings.
+    EXPECTED_TEXTS = [
+        "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the",
+        "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.",
+    ]
+    # Major CUDA compute capability of the runner (8 for A100 / A10, 7 for T4); stays None without CUDA.
+    # Different hardware can yield different logits / generations.
+    # NOTE(review): no test in this class currently reads this value.
+    cuda_compute_capability_major_version = None
+
+    @classmethod
+    def setUpClass(cls):
+        """Record the CUDA compute capability major version when a CUDA device is available."""
+        if is_torch_available() and torch.cuda.is_available():
+            cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0]
+
+    def _check_generation(self, **load_kwargs):
+        """Load the checkpoint, generate greedily for `input_text` and assert on the decoded text.
+
+        Args:
+            **load_kwargs: extra keyword arguments forwarded to `AutoModelForCausalLM.from_pretrained`
+                (the tests pass `torch_dtype` and, optionally, `attn_implementation`).
+
+        Raises:
+            AssertionError: if the decoded generations differ from `EXPECTED_TEXTS`.
+        """
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_id, low_cpu_mem_usage=True, revision=self.revision, **load_kwargs
+        ).to(torch_device)
+
+        tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision)
+        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
+
+        # do_sample=False keeps decoding greedy so the output can be compared verbatim.
+        output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
+        output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
+
+        self.assertEqual(output_text, self.EXPECTED_TEXTS)
+
+    def test_model_9b_fp16(self):
+        # float16 weights, attention implementation left to the library default.
+        self._check_generation(torch_dtype=torch.float16)
+
+    def test_model_9b_bf16(self):
+        # bfloat16 weights, attention implementation left to the library default.
+        self._check_generation(torch_dtype=torch.bfloat16)
+
+    def test_model_9b_eager(self):
+        self._check_generation(torch_dtype=torch.bfloat16, attn_implementation="eager")
+
+    @require_torch_sdpa
+    def test_model_9b_sdpa(self):
+        self._check_generation(torch_dtype=torch.bfloat16, attn_implementation="sdpa")
+
+    @require_flash_attn
+    @pytest.mark.flash_attn_test
+    def test_model_9b_flash_attn(self):
+        self._check_generation(torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2")