Add glm4 (#37388)
* add changed * Revert "add changed" This reverts commit 0a0166a1fe80556115a49fbf0c2132de0f4f85c9. * update with NEW MODEL class called GLM4 * update * Update glm4.md * Name * style * fix copies * fixup test --------- Co-authored-by: Yuxuan Zhang <2448370773@qq.com>
This commit is contained in:
@@ -461,6 +461,8 @@
|
||||
title: Gemma2
|
||||
- local: model_doc/glm
|
||||
title: GLM
|
||||
- local: model_doc/glm4
|
||||
title: glm4
|
||||
- local: model_doc/openai-gpt
|
||||
title: GPT
|
||||
- local: model_doc/gpt_neo
|
||||
|
||||
45
docs/source/en/model_doc/glm4.md
Normal file
45
docs/source/en/model_doc/glm4.md
Normal file
@@ -0,0 +1,45 @@
|
||||
<!--Copyright 2025 The GLM & ZhipuAI team and The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Glm4
|
||||
|
||||
## Overview
|
||||
|
||||
To be released with the official model launch.
|
||||
|
||||
## Glm4Config
|
||||
|
||||
[[autodoc]] Glm4Config
|
||||
|
||||
## Glm4Model
|
||||
|
||||
[[autodoc]] Glm4Model
|
||||
- forward
|
||||
|
||||
## Glm4ForCausalLM
|
||||
|
||||
[[autodoc]] Glm4ForCausalLM
|
||||
- forward
|
||||
|
||||
## Glm4ForSequenceClassification
|
||||
|
||||
[[autodoc]] Glm4ForSequenceClassification
|
||||
- forward
|
||||
|
||||
## Glm4ForTokenClassification
|
||||
|
||||
[[autodoc]] Glm4ForTokenClassification
|
||||
- forward
|
||||
@@ -482,6 +482,7 @@ _import_structure = {
|
||||
"GitVisionConfig",
|
||||
],
|
||||
"models.glm": ["GlmConfig"],
|
||||
"models.glm4": ["Glm4Config"],
|
||||
"models.glpn": ["GLPNConfig"],
|
||||
"models.got_ocr2": [
|
||||
"GotOcr2Config",
|
||||
@@ -2526,6 +2527,15 @@ else:
|
||||
"Llama4PreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.glm4"].extend(
|
||||
[
|
||||
"Glm4ForCausalLM",
|
||||
"Glm4ForSequenceClassification",
|
||||
"Glm4ForTokenClassification",
|
||||
"Glm4Model",
|
||||
"Glm4PreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.glpn"].extend(
|
||||
[
|
||||
"GLPNForDepthEstimation",
|
||||
@@ -5742,6 +5752,7 @@ if TYPE_CHECKING:
|
||||
GitVisionConfig,
|
||||
)
|
||||
from .models.glm import GlmConfig
|
||||
from .models.glm4 import Glm4Config
|
||||
from .models.glpn import GLPNConfig
|
||||
from .models.got_ocr2 import GotOcr2Config, GotOcr2Processor, GotOcr2VisionConfig
|
||||
from .models.gpt2 import (
|
||||
@@ -7624,6 +7635,13 @@ if TYPE_CHECKING:
|
||||
GlmModel,
|
||||
GlmPreTrainedModel,
|
||||
)
|
||||
from .models.glm4 import (
|
||||
Glm4ForCausalLM,
|
||||
Glm4ForSequenceClassification,
|
||||
Glm4ForTokenClassification,
|
||||
Glm4Model,
|
||||
Glm4PreTrainedModel,
|
||||
)
|
||||
from .models.glpn import (
|
||||
GLPNForDepthEstimation,
|
||||
GLPNModel,
|
||||
|
||||
@@ -110,6 +110,7 @@ from . import (
|
||||
gemma3,
|
||||
git,
|
||||
glm,
|
||||
glm4,
|
||||
glpn,
|
||||
got_ocr2,
|
||||
gpt2,
|
||||
|
||||
@@ -129,6 +129,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
|
||||
("gemma3_text", "Gemma3TextConfig"),
|
||||
("git", "GitConfig"),
|
||||
("glm", "GlmConfig"),
|
||||
("glm4", "Glm4Config"),
|
||||
("glpn", "GLPNConfig"),
|
||||
("got_ocr2", "GotOcr2Config"),
|
||||
("gpt-sw3", "GPT2Config"),
|
||||
@@ -476,6 +477,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
|
||||
("gemma3_text", "Gemma3ForCausalLM"),
|
||||
("git", "GIT"),
|
||||
("glm", "GLM"),
|
||||
("glm4", "glm4"),
|
||||
("glpn", "GLPN"),
|
||||
("got_ocr2", "GOT-OCR2"),
|
||||
("gpt-sw3", "GPT-Sw3"),
|
||||
|
||||
@@ -122,6 +122,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("gemma3_text", "Gemma3TextModel"),
|
||||
("git", "GitModel"),
|
||||
("glm", "GlmModel"),
|
||||
("glm4", "Glm4Model"),
|
||||
("glpn", "GLPNModel"),
|
||||
("got_ocr2", "GotOcr2ForConditionalGeneration"),
|
||||
("gpt-sw3", "GPT2Model"),
|
||||
@@ -532,6 +533,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
|
||||
("gemma3_text", "Gemma3ForCausalLM"),
|
||||
("git", "GitForCausalLM"),
|
||||
("glm", "GlmForCausalLM"),
|
||||
("glm4", "Glm4ForCausalLM"),
|
||||
("got_ocr2", "GotOcr2ForConditionalGeneration"),
|
||||
("gpt-sw3", "GPT2LMHeadModel"),
|
||||
("gpt2", "GPT2LMHeadModel"),
|
||||
@@ -1035,6 +1037,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
|
||||
("gemma", "GemmaForSequenceClassification"),
|
||||
("gemma2", "Gemma2ForSequenceClassification"),
|
||||
("glm", "GlmForSequenceClassification"),
|
||||
("glm4", "Glm4ForSequenceClassification"),
|
||||
("gpt-sw3", "GPT2ForSequenceClassification"),
|
||||
("gpt2", "GPT2ForSequenceClassification"),
|
||||
("gpt_bigcode", "GPTBigCodeForSequenceClassification"),
|
||||
@@ -1236,6 +1239,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
|
||||
("gemma", "GemmaForTokenClassification"),
|
||||
("gemma2", "Gemma2ForTokenClassification"),
|
||||
("glm", "GlmForTokenClassification"),
|
||||
("glm4", "Glm4ForTokenClassification"),
|
||||
("gpt-sw3", "GPT2ForTokenClassification"),
|
||||
("gpt2", "GPT2ForTokenClassification"),
|
||||
("gpt_bigcode", "GPTBigCodeForTokenClassification"),
|
||||
|
||||
@@ -238,6 +238,7 @@ else:
|
||||
),
|
||||
("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("glm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("glm4", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)),
|
||||
("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
|
||||
("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
|
||||
|
||||
27
src/transformers/models/glm4/__init__.py
Normal file
27
src/transformers/models/glm4/__init__.py
Normal file
@@ -0,0 +1,27 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from ...utils import _LazyModule
|
||||
from ...utils.import_utils import define_import_structure
|
||||
|
||||
|
||||
if TYPE_CHECKING:
    # Only evaluated by static type checkers (`TYPE_CHECKING` is False at runtime):
    # import the real symbols so they can be resolved for analysis.
    from .configuration_glm4 import *
    from .modeling_glm4 import *
else:
    import sys

    _file = globals()["__file__"]
    # At runtime, replace this module's entry in `sys.modules` with a `_LazyModule`
    # built from the import structure that `define_import_structure` derives from
    # this file's path.
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
|
||||
152
src/transformers/models/glm4/configuration_glm4.py
Normal file
152
src/transformers/models/glm4/configuration_glm4.py
Normal file
@@ -0,0 +1,152 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 The GLM4 & ZhipuAI team and HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class Glm4Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Glm4Model`]. It is used to instantiate a Glm4
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the GLM-4-9B-Chat-0414.
    e.g. [THUDM/GLM-4-9B-Chat-0414](https://huggingface.co/THUDM/GLM-4-9B-Chat-0414)
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 151552):
            Vocabulary size of the Glm4 model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`Glm4Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 13696):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 40):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 2):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf).
        partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position.
        head_dim (`int`, *optional*, defaults to 128):
            The attention head dimension.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) recorded in the configuration.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1.5625e-07):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        pad_token_id (`int`, *optional*, defaults to 151329):
            Padding token id.
        eos_token_id (`int` | `list`, *optional*, defaults to `[151329, 151336, 151338]`):
            End of stream token id.
        bos_token_id (`int`, *optional*):
            Beginning of stream token id.
        attention_bias (`bool`, *optional*, defaults to `True`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
    ```python
    >>> from transformers import Glm4Model, Glm4Config
    >>> # Initializing a Glm4 GLM-4-9B-Chat-0414 style configuration
    >>> configuration = Glm4Config()
    >>> # Initializing a model from the GLM-4-9B-Chat-0414 style configuration
    >>> model = Glm4Model(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "glm4"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Per-module tensor-parallel sharding styles, keyed by module-name pattern.
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_up_proj": "colwise_rep",  # we need to replicate here due to the `chunk` operation
        "layers.*.mlp.down_proj": "rowwise_rep",  # we need to replicate here due to the `chunk` operation
    }
    # Per-stage (input names, output names) pairs for pipeline parallelism.
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=151552,
        hidden_size=4096,
        intermediate_size=13696,
        num_hidden_layers=40,
        num_attention_heads=32,
        num_key_value_heads=2,
        partial_rotary_factor=0.5,
        head_dim=128,
        hidden_act="silu",
        attention_dropout=0.0,
        max_position_embeddings=131072,
        initializer_range=0.02,
        rms_norm_eps=0.00000015625,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        pad_token_id=151329,
        eos_token_id=[151329, 151336, 151338],
        bos_token_id=None,
        attention_bias=True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.partial_rotary_factor = partial_rotary_factor
        self.head_dim = head_dim
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        # The signature default is a single list object shared by every call. Copy
        # list values so that mutating one config's `eos_token_id` in place can never
        # leak into the default (or into the caller's own list).
        if isinstance(eos_token_id, list):
            eos_token_id = list(eos_token_id)

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
|
||||
|
||||
|
||||
# Names exported by `from .configuration_glm4 import *`.
__all__ = ["Glm4Config"]
|
||||
199
src/transformers/models/glm4/convert_glm4_weights_to_hf.py
Normal file
199
src/transformers/models/glm4/convert_glm4_weights_to_hf.py
Normal file
@@ -0,0 +1,199 @@
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
import torch
|
||||
from safetensors.torch import load_file
|
||||
from tokenizers import processors
|
||||
|
||||
from transformers import Glm4Config, Glm4ForCausalLM, PreTrainedTokenizerFast
|
||||
|
||||
|
||||
# fmt: off
# Maps original checkpoint key patterns (regular expressions) to converted key
# templates. `map_old_key_to_new` tries the entries in order; `(\d+)` captures the
# layer index and `\1` / `\2` re-insert the captured groups in the new name.
# `None` means we drop the key
STATE_DICT_MAPPING = {
    # CausalLM keys
    r"transformer.output_layer.weight": r"lm_head.weight",

    # Model keys
    r"transformer.embedding.word_embeddings.weight": r"model.embed_tokens.weight",
    r"transformer.rotary_pos_emb.inv_freq": None,
    r"transformer.encoder.final_layernorm.weight": r"model.norm.weight",

    # Layers keys
    r"transformer.encoder.layers.(\d+).input_layernorm.weight": r"model.layers.\1.input_layernorm.weight",

    # Sandwich keys
    r"transformer.encoder.layers.(\d+).post_mlp_layernorm.weight": r"model.layers.\1.post_mlp_layernorm.weight",
    r"transformer.encoder.layers.(\d+).post_self_attn_layernorm.weight": r"model.layers.\1.post_self_attn_layernorm.weight",

    r"transformer.encoder.layers.(\d+).post_attention_layernorm.weight": r"model.layers.\1.post_attention_layernorm.weight",

    # Attention keys
    r"transformer.encoder.layers.(\d+).self_attention.dense.weight": r"model.layers.\1.self_attn.o_proj.weight",
    # qkv_proj will later be split in q|k|v|_proj
    r"transformer.encoder.layers.(\d+).self_attention.query_key_value.(weight|bias)": r"model.layers.\1.self_attn.qkv_proj.\2",

    # MLP keys
    r"transformer.encoder.layers.(\d+).mlp.dense_h_to_4h.weight": r"model.layers.\1.mlp.gate_up_proj.weight",
    r"transformer.encoder.layers.(\d+).mlp.dense_4h_to_h.weight": r"model.layers.\1.mlp.down_proj.weight",
}
# fmt: on
|
||||
|
||||
|
||||
def load_weights(input_dir: str):
    """Load every checkpoint file found in `input_dir` into one flat state dict.

    `.safetensors` files take precedence; `.bin` files are only read when no
    `.safetensors` file is present. Files are merged in shard order, so a key that
    appears in several files keeps the value from the last one.

    Args:
        input_dir: Directory containing the original checkpoint files.

    Returns:
        A dict mapping parameter names to tensors.

    Raises:
        ValueError: If the directory holds neither `.safetensors` nor `.bin` files.
    """

    def _shard_order(path):
        # Sort on the `-<index>-of-<total>` part of the *file name* only, so dashes in
        # the directory path cannot be mistaken for a shard index. Unsharded files
        # (e.g. `model.safetensors`) have no index and sort first instead of crashing.
        name = os.path.basename(path)
        match = re.search(r"-(\d+)-of-\d+", name)
        return (int(match.group(1)) if match else 0, name)

    entries = os.listdir(input_dir)
    safetensor_files = [os.path.join(input_dir, x) for x in entries if x.endswith(".safetensors")]
    bin_files = [os.path.join(input_dir, x) for x in entries if x.endswith(".bin")]

    if safetensor_files:
        files, loader = safetensor_files, load_file
    elif bin_files:
        files = bin_files

        def loader(file):
            # NOTE(review): `torch.load` unpickles the file; only run this on trusted checkpoints.
            return torch.load(file, map_location="cpu")

    else:
        raise ValueError("No .safetensors or .bin files found in the specified directory.")

    all_weights = {}
    for file in sorted(files, key=_shard_order):
        all_weights.update(loader(file))
    return all_weights
|
||||
|
||||
|
||||
def map_old_key_to_new(old_key):
    """Translate one original checkpoint key through `STATE_DICT_MAPPING`.

    Entries are tried in mapping order and the first one that applies wins.

    Returns:
        The converted key, or `None` when the key fully matches a pattern whose
        mapping value is `None` (meaning the key is dropped).

    Raises:
        ValueError: If no entry of the mapping applies to `old_key`.
    """
    for regex, target in STATE_DICT_MAPPING.items():
        if target is None:
            # Drop-entries must match the whole key.
            if re.fullmatch(regex, old_key) is not None:
                return None
            continue

        rewritten, hits = re.subn(regex, target, old_key)
        if hits:
            # First substitution that changes anything ends the search.
            return rewritten

    raise ValueError(f"Key: {old_key} could not be mapped (check the mapping).")
|
||||
|
||||
|
||||
def convert_state_dict(original_state_dict: dict, config: Glm4Config):
    """Rename the original checkpoint's keys and split the fused QKV tensors.

    Args:
        original_state_dict: Mapping of original parameter names to tensors.
        config: Converted configuration; supplies the head counts and head size used
            to slice the fused `query_key_value` tensors.

    Returns:
        A new dict keyed by the converted names. Keys that `map_old_key_to_new`
        maps to `None` are omitted.
    """
    new_dict = {}

    # Prefer the explicit per-head size stored on the config (`convert_config` fills
    # it from the original `kv_channels`); fall back to the derived value only when
    # it is missing, since the two are not guaranteed to be equal.
    head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
    query_size = config.num_attention_heads * head_dim
    kv_size = config.num_key_value_heads * head_dim

    for old_key, value in original_state_dict.items():
        new_key = map_old_key_to_new(old_key)
        if new_key is None:
            continue

        if "qkv_proj." in new_key:
            # The fused tensor is sliced along its first dimension as [query | key | value].
            q_proj, k_proj, v_proj = (
                value[:query_size, ...],
                value[query_size : query_size + kv_size, ...],
                value[query_size + kv_size :, ...],
            )
            new_dict[new_key.replace("qkv_proj.", "q_proj.")] = q_proj
            new_dict[new_key.replace("qkv_proj.", "k_proj.")] = k_proj
            new_dict[new_key.replace("qkv_proj.", "v_proj.")] = v_proj
        else:
            new_dict[new_key] = value
    return new_dict
|
||||
|
||||
|
||||
def convert_config(original_config: dict):
    """Build a `Glm4Config` from the original checkpoint's config dict.

    Args:
        original_config: The original configuration as a plain dict.

    Returns:
        A `Glm4Config` populated from the renamed and shared keys.

    Raises:
        KeyError: If a key listed in `key_mapping`, `multi_query_attention`, or
            (when multi-query attention is enabled) `multi_query_group_num` is missing.
    """
    # new name -> original name
    key_mapping = {
        "vocab_size": "padded_vocab_size",
        "intermediate_size": "ffn_hidden_size",
        "num_hidden_layers": "num_layers",
        "max_position_embeddings": "seq_length",
        "rms_norm_eps": "layernorm_epsilon",
        "head_dim": "kv_channels",
        "attention_bias": "add_qkv_bias",
    }
    # Keys carried over under the same name, when present.
    similar_keys_to_keep = [
        "num_attention_heads",
        "hidden_size",
        "attention_dropout",
        "use_cache",
        "eos_token_id",
        "pad_token_id",
        "tie_word_embeddings",
    ]
    new_config_kwargs = {k: original_config[v] for k, v in key_mapping.items()}
    new_config_kwargs.update({k: v for k, v in original_config.items() if k in similar_keys_to_keep})
    new_config_kwargs["num_key_value_heads"] = (
        new_config_kwargs["num_attention_heads"]
        if not original_config["multi_query_attention"]
        else original_config["multi_query_group_num"]
    )
    # `original_config` is a dict, so the optional `rope_ratio` must be read with
    # `.get`; `getattr` on a dict never sees its keys and always returned the default.
    new_config_kwargs["rope_theta"] = 10000.0 * original_config.get("rope_ratio", 1)

    new_config = Glm4Config(**new_config_kwargs)
    return new_config
|
||||
|
||||
|
||||
def convert_glm4_tokenizer(input_dir, use_post_processor=False):
    """Load the fast tokenizer from `input_dir` and install its post-processor.

    Args:
        input_dir: Folder passed to `PreTrainedTokenizerFast.from_pretrained`.
        use_post_processor: When true, a template step that prepends `[gMASK]` and
            `<sop>` is added after the byte-level step.

    Returns:
        The tokenizer with its `post_processor` replaced.
    """
    tokenizer = PreTrainedTokenizerFast.from_pretrained(
        input_dir, model_input_names=["input_ids", "attention_mask"]
    )

    # The byte-level step is always present; the template step is optional.
    steps = [processors.ByteLevel(trim_offsets=False)]
    if use_post_processor:
        steps.append(
            processors.TemplateProcessing(
                single="[gMASK]:0 <sop>:0 $A:0",
                pair="[gMASK]:0 <sop>:0 $A:0 $B:1",
                special_tokens=[("[gMASK]", 151331), ("<sop>", 151333)],
            )
        )

    tokenizer._tokenizer.post_processor = processors.Sequence(steps)
    return tokenizer
|
||||
|
||||
|
||||
def convert_glm4_model(input_dir, output_dir, use_post_processor=False):
    """Convert an original checkpoint folder and write the result to `output_dir`.

    Converts and saves, in order: the config, the model weights, and the tokenizer.

    Args:
        input_dir: Folder holding the original `config.json`, weight files and tokenizer.
        output_dir: Destination folder for the converted artifacts.
        use_post_processor: Forwarded to `convert_glm4_tokenizer`.
    """
    # Load and convert config
    # JSON is UTF-8; be explicit so the platform's locale encoding is never used.
    with open(os.path.join(input_dir, "config.json"), encoding="utf-8") as f:
        original_config = json.load(f)
    config = convert_config(original_config)
    config.save_pretrained(output_dir)

    # Load and convert weights
    original_state_dict = load_weights(input_dir)
    new_dict = convert_state_dict(original_state_dict, config)
    # Build the model on the meta device, then attach the converted tensors directly
    # (`assign=True`) rather than copying them into freshly allocated parameters.
    with torch.device("meta"):
        model = Glm4ForCausalLM(config)
    model.load_state_dict(new_dict, strict=True, assign=True)
    model.save_pretrained(output_dir)

    # Load and convert tokenizer
    tokenizer = convert_glm4_tokenizer(input_dir, use_post_processor)
    tokenizer.save_pretrained(output_dir)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: two positional folders plus one optional flag.
    cli = argparse.ArgumentParser()
    cli.add_argument("input_dir", type=str, help="Location of the local folder copied from the Hub.")
    cli.add_argument("output_dir", type=str, help="Location to write HF model and tokenizer")
    cli.add_argument(
        "--use_post_processor", action="store_true", help="Whether to apply post processor with special tokens"
    )

    options = cli.parse_args()
    convert_glm4_model(options.input_dir, options.output_dir, options.use_post_processor)
|
||||
1056
src/transformers/models/glm4/modeling_glm4.py
Normal file
1056
src/transformers/models/glm4/modeling_glm4.py
Normal file
File diff suppressed because it is too large
Load Diff
164
src/transformers/models/glm4/modular_glm4.py
Normal file
164
src/transformers/models/glm4/modular_glm4.py
Normal file
@@ -0,0 +1,164 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 The GLM4 & ZhipuAI team and HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import Optional, Tuple, Union
|
||||
|
||||
import torch.nn as nn
|
||||
import torch.utils.checkpoint
|
||||
|
||||
from ...cache_utils import Cache
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_outputs import CausalLMOutputWithPast
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import LossKwargs, logging
|
||||
from ..glm.modeling_glm import (
|
||||
GlmAttention,
|
||||
GlmForCausalLM,
|
||||
GlmForSequenceClassification,
|
||||
GlmForTokenClassification,
|
||||
)
|
||||
from ..phi3.modeling_phi3 import Phi3MLP
|
||||
from .configuration_glm4 import Glm4Config
|
||||
from .modeling_glm4 import Glm4RMSNorm
|
||||
|
||||
|
||||
# Module-level logger named after this module.
logger = logging.get_logger(__name__)

# Checkpoint identifier; the same id is used in the `Glm4ForCausalLM.forward` docstring example.
_CHECKPOINT_FOR_DOC = "THUDM/GLM-4-9B-Chat-0414"
|
||||
|
||||
|
||||
# Glm4 MLP: inherits `Phi3MLP` unchanged; nothing is overridden here.
class Glm4MLP(Phi3MLP):
    pass
|
||||
|
||||
|
||||
class Glm4DecoderLayer(nn.Module):
    """One decoder layer: self-attention and MLP, each wrapped by a norm before and after.

    Both sub-blocks follow the same pattern: normalize the input, run the sub-block,
    normalize its output, then add the result to the sub-block's input (residual).
    """

    def __init__(self, config: Glm4Config, layer_idx: int):
        super().__init__()
        width, eps = config.hidden_size, config.rms_norm_eps

        self.hidden_size = width
        self.self_attn = Glm4Attention(config=config, layer_idx=layer_idx)

        self.mlp = Glm4MLP(config)
        # Norms applied before attention / before the MLP ...
        self.input_layernorm = Glm4RMSNorm(width, eps=eps)
        self.post_attention_layernorm = Glm4RMSNorm(width, eps=eps)
        # ... and norms applied to each sub-block's output before the residual add.
        self.post_self_attn_layernorm = Glm4RMSNorm(width, eps=eps)
        self.post_mlp_layernorm = Glm4RMSNorm(width, eps=eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """Run the layer on `hidden_states`.

        All arguments other than `hidden_states` and `output_attentions` are passed
        through to the self-attention module untouched.

        Returns:
            `(hidden_states,)`, or `(hidden_states, self_attn_weights)` when
            `output_attentions` is truthy.
        """
        # Self Attention
        attn_input = self.input_layernorm(hidden_states)
        attn_output, self_attn_weights = self.self_attn(
            hidden_states=attn_input,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = hidden_states + self.post_self_attn_layernorm(attn_output)

        # Fully Connected
        mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
        hidden_states = hidden_states + self.post_mlp_layernorm(mlp_output)

        if output_attentions:
            return (hidden_states, self_attn_weights)
        return (hidden_states,)
|
||||
|
||||
|
||||
# Glm4 attention: inherits `GlmAttention` unchanged; nothing is overridden here.
class Glm4Attention(GlmAttention):
    pass
|
||||
|
||||
|
||||
# Combined keyword-argument type (flash-attention + loss kwargs) used to annotate
# the `**super_kwargs` of `Glm4ForCausalLM.forward`.
class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
|
||||
|
||||
|
||||
# Glm4 causal LM: reuses `GlmForCausalLM`; `forward` is redefined only to carry a
# Glm4-specific docstring and otherwise delegates straight to the parent.
class Glm4ForCausalLM(GlmForCausalLM):
    def forward(
        self,
        **super_kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            logits_to_keep (`int` or `torch.Tensor`, *optional*):
                If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
                If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
                This is useful when using packed tensor format (single dimension for batch and sequence length).

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Glm4ForCausalLM

        >>> model = Glm4ForCausalLM.from_pretrained("THUDM/GLM-4-9B-Chat-0414")
        >>> tokenizer = AutoTokenizer.from_pretrained("THUDM/GLM-4-9B-Chat-0414")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        # Pure pass-through: every keyword argument goes to the parent implementation.
        return super().forward(**super_kwargs)
|
||||
|
||||
|
||||
# Glm4 sequence classifier: inherits `GlmForSequenceClassification` unchanged.
class Glm4ForSequenceClassification(GlmForSequenceClassification):
    pass
|
||||
|
||||
|
||||
# Glm4 token classifier: inherits `GlmForTokenClassification` unchanged.
class Glm4ForTokenClassification(GlmForTokenClassification):
    pass
|
||||
|
||||
|
||||
# Public names of the Glm4 modeling module. The first two are not defined in this
# file, hence the `noqa: F822` markers.
# NOTE(review): presumably they are provided by the generated modeling file — confirm.
__all__ = [
    "Glm4PreTrainedModel",  # noqa: F822
    "Glm4Model",  # noqa: F822
    "Glm4ForCausalLM",
    "Glm4ForSequenceClassification",
    "Glm4ForTokenClassification",
]
|
||||
@@ -4740,6 +4740,41 @@ class GlmPreTrainedModel(metaclass=DummyObject):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
# Placeholder for `Glm4ForCausalLM`: instantiation only calls `requires_backends`
# for the "torch" backend.
# NOTE(review): presumably that raises when torch is not installed — confirm.
class Glm4ForCausalLM(metaclass=DummyObject):
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
# Placeholder for `Glm4ForSequenceClassification`: instantiation only calls
# `requires_backends` for the "torch" backend.
# NOTE(review): presumably that raises when torch is not installed — confirm.
class Glm4ForSequenceClassification(metaclass=DummyObject):
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
# Placeholder for `Glm4ForTokenClassification`: instantiation only calls
# `requires_backends` for the "torch" backend.
# NOTE(review): presumably that raises when torch is not installed — confirm.
class Glm4ForTokenClassification(metaclass=DummyObject):
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
# Placeholder for `Glm4Model`: instantiation only calls `requires_backends` for the
# "torch" backend.
# NOTE(review): presumably that raises when torch is not installed — confirm.
class Glm4Model(metaclass=DummyObject):
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
# Placeholder for `Glm4PreTrainedModel`: instantiation only calls
# `requires_backends` for the "torch" backend.
# NOTE(review): presumably that raises when torch is not installed — confirm.
class Glm4PreTrainedModel(metaclass=DummyObject):
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class GLPNForDepthEstimation(metaclass=DummyObject):
|
||||
_backends = ["torch"]
|
||||
|
||||
|
||||
0
tests/models/glm4/__init__.py
Normal file
0
tests/models/glm4/__init__.py
Normal file
205
tests/models/glm4/test_modeling_glm4.py
Normal file
205
tests/models/glm4/test_modeling_glm4.py
Normal file
@@ -0,0 +1,205 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Testing suite for the PyTorch Glm4 model."""
|
||||
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, Glm4Config, is_torch_available
|
||||
from transformers.testing_utils import (
|
||||
require_flash_attn,
|
||||
require_torch,
|
||||
require_torch_large_gpu,
|
||||
require_torch_sdpa,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
|
||||
from ...models.gemma.test_modeling_gemma import GemmaModelTest, GemmaModelTester
|
||||
from ...test_configuration_common import ConfigTester
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
from transformers import (
|
||||
Glm4ForCausalLM,
|
||||
Glm4ForSequenceClassification,
|
||||
Glm4ForTokenClassification,
|
||||
Glm4Model,
|
||||
)
|
||||
|
||||
|
||||
class Glm4ModelTester(GemmaModelTester):
    """Gemma model tester re-pointed at the Glm4 config and model classes.

    All behaviour is inherited from ``GemmaModelTester``; this subclass only
    overrides the class attributes naming the config and model classes.
    """

    # The Glm4 model classes are imported only when torch is available, so the
    # overrides are guarded the same way to keep this module importable
    # without torch.
    if is_torch_available():
        config_class = Glm4Config
        model_class = Glm4Model
        for_causal_lm_class = Glm4ForCausalLM
        for_sequence_class = Glm4ForSequenceClassification
        for_token_class = Glm4ForTokenClassification
|
||||
|
||||
|
||||
@require_torch
class Glm4ModelTest(GemmaModelTest, unittest.TestCase):
    """Runs the test suite inherited from ``GemmaModelTest`` against the Glm4 classes."""

    # Model classes under test; falls back to an empty tuple without torch,
    # because the Glm4 names are only imported when torch is available.
    all_model_classes = (
        (Glm4Model, Glm4ForCausalLM, Glm4ForSequenceClassification, Glm4ForTokenClassification)
        if is_torch_available()
        else ()
    )
    # Pipeline task name -> Glm4 class used for that task; empty without torch.
    pipeline_model_mapping = (
        {
            "feature-extraction": Glm4Model,
            "text-classification": Glm4ForSequenceClassification,
            "token-classification": Glm4ForTokenClassification,
            "text-generation": Glm4ForCausalLM,
            "zero-shot": Glm4ForSequenceClassification,
        }
        if is_torch_available()
        else {}
    )
    # NOTE(review): the flags below are not read in this file; presumably they
    # are consumed by the inherited test suite (e.g. to switch off the
    # head-masking and pruning tests) -- confirm against the parent classes.
    test_headmasking = False
    test_pruning = False
    _is_stateful = True
    model_split_percents = [0.5, 0.6]

    def setUp(self):
        """Create the Glm4 model tester and the config tester for each test."""
        self.model_tester = Glm4ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=Glm4Config, hidden_size=37)
|
||||
|
||||
|
||||
@slow
@require_torch_large_gpu
class Glm4IntegrationTest(unittest.TestCase):
    """Slow generation checks against the ``THUDM/glm-4-0414-9b-chat`` checkpoint.

    Each test loads the checkpoint with a different dtype or attention
    implementation, greedily generates 20 new tokens for ``input_text`` and
    compares the decoded strings with a fixed reference.
    """

    input_text = ["Hello I am doing", "Hi today"]
    model_id = "THUDM/glm-4-0414-9b-chat"
    revision = "refs/pr/15"
    # Major CUDA compute capability of the runner (8 for A100 / A10, 7 for T4).
    # Different hardware can yield different logits / generations.
    cuda_compute_capability_major_version = None

    @classmethod
    def setUpClass(cls):
        """Record the major CUDA compute capability when a CUDA device is present."""
        if not (is_torch_available() and torch.cuda.is_available()):
            return
        capability = torch.cuda.get_device_capability()
        cls.cuda_compute_capability_major_version = capability[0]

    def _check_greedy_generation(self, **load_options):
        """Load the checkpoint with ``load_options`` and verify its greedy output.

        Args:
            **load_options: Extra keyword arguments forwarded to
                ``AutoModelForCausalLM.from_pretrained`` (such as
                ``torch_dtype`` or ``attn_implementation``).
        """
        reference_texts = [
            "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the",
            "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.",
        ]

        model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            low_cpu_mem_usage=True,
            revision=self.revision,
            **load_options,
        )
        model.to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision)
        encoded = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)

        # do_sample=False keeps decoding greedy so the output can be compared verbatim.
        generated_ids = model.generate(**encoded, max_new_tokens=20, do_sample=False)
        decoded_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        self.assertEqual(decoded_texts, reference_texts)

    def test_model_9b_fp16(self):
        """Greedy generation with float16 weights."""
        self._check_greedy_generation(torch_dtype=torch.float16)

    def test_model_9b_bf16(self):
        """Greedy generation with bfloat16 weights."""
        self._check_greedy_generation(torch_dtype=torch.bfloat16)

    def test_model_9b_eager(self):
        """Greedy generation with bfloat16 weights and eager attention."""
        self._check_greedy_generation(torch_dtype=torch.bfloat16, attn_implementation="eager")

    @require_torch_sdpa
    def test_model_9b_sdpa(self):
        """Greedy generation with bfloat16 weights and SDPA attention."""
        self._check_greedy_generation(torch_dtype=torch.bfloat16, attn_implementation="sdpa")

    @require_flash_attn
    @pytest.mark.flash_attn_test
    def test_model_9b_flash_attn(self):
        """Greedy generation with bfloat16 weights and FlashAttention-2."""
        self._check_greedy_generation(torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2")
|
||||
Reference in New Issue
Block a user