GLM-4.1V Model support (#38431)
* 20250508 Model Architecture * Update modeling_glm4v.py * Update modeling_glm4v.py * Update modeling_glm4v.py * update 1447 * 0526 * update * format * problem * update * update with only image embed diff * Final * upload * update * 1 * upload with ruff * update * update * work * 1 * 1 * update with new note * 2 * Update convert_glm4v_mgt_weights_to_hf.py * Update tokenization_auto.py * update with new format * remove rmsnrom * draft with videos * draft * update * update * fix for review problem * try to remove min_pixel * update * for test * remove timestamps * remove item * update with remove * change * update 2200 * update * Delete app.py * format * update * Update test_video_processing_glm4v.py * 1 * 2 * use new name * Update test_video_processing_glm4v.py * remove docs * change * update for image processors update * 2108 * 2128 * Update modular_glm4v.py * 1 * update some * update * rename * 1 * remove tests output * 2 * add configuration * update * Update test_video_processing_glm4v.py * fix simple forward tests * update with modular * 1 * fix more tests * fix generation test * fix beam search and init * modular changed * fix beam search in case of single-image/video. Fails if multiple visuals per text * update processor * update test * pass * fix beam search * update * param correct * Update convert_glm4v_mgt_weights_to_hf.py * 1 * Update test_modeling_glm4v.py * 4 * 2 * 2123 video process * 2 * revert * 1 * 2 * revert processing * update preprocesor * changed * 1 * update * update * 6 * update * update * update * Delete tmp.txt * config * Update video_processing_glm4v.py * apply modular correctly * move functions * fix order * update the longest_edge * style * simplify a lot * fix random order of classes * skip integration tests * correctly fix the tests * fix TP plan --------- Co-authored-by: raushan <raushan@huggingface.co> Co-authored-by: Cyril Vallez <cyril.vallez@huggingface.co> Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>
This commit is contained in:
@@ -955,6 +955,8 @@
|
|||||||
title: Gemma3
|
title: Gemma3
|
||||||
- local: model_doc/git
|
- local: model_doc/git
|
||||||
title: GIT
|
title: GIT
|
||||||
|
- local: model_doc/glm4v
|
||||||
|
title: glm4v
|
||||||
- local: model_doc/got_ocr2
|
- local: model_doc/got_ocr2
|
||||||
title: GOT-OCR2
|
title: GOT-OCR2
|
||||||
- local: model_doc/granitevision
|
- local: model_doc/granitevision
|
||||||
|
|||||||
180
docs/source/en/model_doc/glm4v.md
Normal file
180
docs/source/en/model_doc/glm4v.md
Normal file
@@ -0,0 +1,180 @@
|
|||||||
|
<!--Copyright 2025 The ZhipuAI Inc. and The HuggingFace Inc. team. All rights reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||||
|
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||||
|
specific language governing permissions and limitations under the License.
|
||||||
|
|
||||||
|
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||||
|
rendered properly in your Markdown viewer.
|
||||||
|
|
||||||
|
-->
|
||||||
|
|
||||||
|
<div style="float: right;">
|
||||||
|
<div class="flex flex-wrap space-x-1">
|
||||||
|
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||||
|
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
|
||||||
|
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white"> </div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
# GLM-4.1V
|
||||||
|
|
||||||
|
The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.
|
||||||
|
|
||||||
|
<hfoptions id="usage">
|
||||||
|
<hfoption id="Pipeline">
|
||||||
|
|
||||||
|
```py
|
||||||
|
import torch
|
||||||
|
from transformers import pipeline
|
||||||
|
pipe = pipeline(
|
||||||
|
task="image-text-to-text",
|
||||||
|
model="THUDM/GLM-4.1V-9B-Thinking",
|
||||||
|
device=0,
|
||||||
|
torch_dtype=torch.bfloat16
|
||||||
|
)
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "image",
|
||||||
|
"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
|
||||||
|
},
|
||||||
|
{ "type": "text", "text": "Describe this image."},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
pipe(text=messages,max_new_tokens=20, return_full_text=False)
|
||||||
|
```
|
||||||
|
</hfoption>
|
||||||
|
<hfoption id="AutoModel">
|
||||||
|
|
||||||
|
```py
|
||||||
|
import torch
|
||||||
|
from transformers import Glm4vForConditionalGeneration, AutoProcessor
|
||||||
|
|
||||||
|
model = Glm4vForConditionalGeneration.from_pretrained(
|
||||||
|
"THUDM/GLM-4.1V-9B-Thinking",
|
||||||
|
torch_dtype=torch.bfloat16,
|
||||||
|
device_map="auto",
|
||||||
|
attn_implementation="sdpa"
|
||||||
|
)
|
||||||
|
processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
"role":"user",
|
||||||
|
"content":[
|
||||||
|
{
|
||||||
|
"type":"image",
|
||||||
|
"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type":"text",
|
||||||
|
"text":"Describe this image."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
|
||||||
|
inputs = processor.apply_chat_template(
|
||||||
|
messages,
|
||||||
|
add_generation_prompt=True,
|
||||||
|
tokenize=True,
|
||||||
|
return_dict=True,
|
||||||
|
return_tensors="pt"
|
||||||
|
).to("cuda")
|
||||||
|
|
||||||
|
generated_ids = model.generate(**inputs, max_new_tokens=128)
|
||||||
|
generated_ids_trimmed = [
|
||||||
|
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
|
||||||
|
]
|
||||||
|
output_text = processor.batch_decode(
|
||||||
|
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
||||||
|
)
|
||||||
|
print(output_text)
|
||||||
|
```
|
||||||
|
</hfoption>
|
||||||
|
</hfoptions>
|
||||||
|
|
||||||
|
Using GLM-4.1V with video input is similar to using it with image input.
|
||||||
|
The model can process video data and generate text based on the content of the video.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from transformers import AutoProcessor, Glm4vForConditionalGeneration
|
||||||
|
import torch
|
||||||
|
|
||||||
|
processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")
|
||||||
|
model = Glm4vForConditionalGeneration.from_pretrained(
|
||||||
|
pretrained_model_name_or_path="THUDM/GLM-4.1V-9B-Thinking",
|
||||||
|
torch_dtype=torch.bfloat16,
|
||||||
|
device_map="cuda:0"
|
||||||
|
)
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "video",
|
||||||
|
"url": "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "describe this video",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
inputs = processor.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt", padding=True).to("cuda:0")
|
||||||
|
generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=True, temperature=1.0)
|
||||||
|
output_text = processor.decode(generated_ids[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True)
|
||||||
|
print(output_text)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Glm4vConfig
|
||||||
|
|
||||||
|
[[autodoc]] Glm4vConfig
|
||||||
|
|
||||||
|
## Glm4vTextConfig
|
||||||
|
|
||||||
|
[[autodoc]] Glm4vTextConfig
|
||||||
|
|
||||||
|
## Glm4vImageProcessor
|
||||||
|
|
||||||
|
[[autodoc]] Glm4vImageProcessor
|
||||||
|
- preprocess
|
||||||
|
|
||||||
|
## Glm4vVideoProcessor
|
||||||
|
|
||||||
|
[[autodoc]] Glm4vVideoProcessor
|
||||||
|
- preprocess
|
||||||
|
|
||||||
|
## Glm4vImageProcessorFast
|
||||||
|
|
||||||
|
[[autodoc]] Glm4vImageProcessorFast
|
||||||
|
- preprocess
|
||||||
|
|
||||||
|
## Glm4vProcessor
|
||||||
|
|
||||||
|
[[autodoc]] Glm4vProcessor
|
||||||
|
|
||||||
|
## Glm4vTextModel
|
||||||
|
|
||||||
|
[[autodoc]] Glm4vTextModel
|
||||||
|
- forward
|
||||||
|
|
||||||
|
## Glm4vModel
|
||||||
|
|
||||||
|
[[autodoc]] Glm4vModel
|
||||||
|
- forward
|
||||||
|
|
||||||
|
## Glm4vForConditionalGeneration
|
||||||
|
|
||||||
|
[[autodoc]] Glm4vForConditionalGeneration
|
||||||
|
- forward
|
||||||
@@ -141,6 +141,8 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
|
|||||||
("git", "GitConfig"),
|
("git", "GitConfig"),
|
||||||
("glm", "GlmConfig"),
|
("glm", "GlmConfig"),
|
||||||
("glm4", "Glm4Config"),
|
("glm4", "Glm4Config"),
|
||||||
|
("glm4v", "Glm4vConfig"),
|
||||||
|
("glm4v_text", "Glm4vTextConfig"),
|
||||||
("glpn", "GLPNConfig"),
|
("glpn", "GLPNConfig"),
|
||||||
("got_ocr2", "GotOcr2Config"),
|
("got_ocr2", "GotOcr2Config"),
|
||||||
("gpt-sw3", "GPT2Config"),
|
("gpt-sw3", "GPT2Config"),
|
||||||
@@ -512,7 +514,9 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
|||||||
("gemma3_text", "Gemma3ForCausalLM"),
|
("gemma3_text", "Gemma3ForCausalLM"),
|
||||||
("git", "GIT"),
|
("git", "GIT"),
|
||||||
("glm", "GLM"),
|
("glm", "GLM"),
|
||||||
("glm4", "glm4"),
|
("glm4", "GLM4"),
|
||||||
|
("glm4v", "GLM4V"),
|
||||||
|
("glm4v_text", "GLM4V"),
|
||||||
("glpn", "GLPN"),
|
("glpn", "GLPN"),
|
||||||
("got_ocr2", "GOT-OCR2"),
|
("got_ocr2", "GOT-OCR2"),
|
||||||
("gpt-sw3", "GPT-Sw3"),
|
("gpt-sw3", "GPT-Sw3"),
|
||||||
@@ -827,6 +831,7 @@ SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict[str, str](
|
|||||||
("clip_text_model", "clip"),
|
("clip_text_model", "clip"),
|
||||||
("aria_text", "aria"),
|
("aria_text", "aria"),
|
||||||
("gemma3_text", "gemma3"),
|
("gemma3_text", "gemma3"),
|
||||||
|
("glm4v_text", "glm4v"),
|
||||||
("idefics3_vision", "idefics3"),
|
("idefics3_vision", "idefics3"),
|
||||||
("siglip_vision_model", "siglip"),
|
("siglip_vision_model", "siglip"),
|
||||||
("smolvlm_vision", "smolvlm"),
|
("smolvlm_vision", "smolvlm"),
|
||||||
|
|||||||
@@ -89,6 +89,7 @@ else:
|
|||||||
("fuyu", ("FuyuImageProcessor",)),
|
("fuyu", ("FuyuImageProcessor",)),
|
||||||
("gemma3", ("Gemma3ImageProcessor", "Gemma3ImageProcessorFast")),
|
("gemma3", ("Gemma3ImageProcessor", "Gemma3ImageProcessorFast")),
|
||||||
("git", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
|
("git", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
|
||||||
|
("glm4v", ("Glm4vImageProcessor", "Glm4vImageProcessorFast")),
|
||||||
("glpn", ("GLPNImageProcessor",)),
|
("glpn", ("GLPNImageProcessor",)),
|
||||||
("got_ocr2", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")),
|
("got_ocr2", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")),
|
||||||
("grounding-dino", ("GroundingDinoImageProcessor", "GroundingDinoImageProcessorFast")),
|
("grounding-dino", ("GroundingDinoImageProcessor", "GroundingDinoImageProcessorFast")),
|
||||||
|
|||||||
@@ -133,6 +133,8 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
|||||||
("git", "GitModel"),
|
("git", "GitModel"),
|
||||||
("glm", "GlmModel"),
|
("glm", "GlmModel"),
|
||||||
("glm4", "Glm4Model"),
|
("glm4", "Glm4Model"),
|
||||||
|
("glm4v", "Glm4vModel"),
|
||||||
|
("glm4v_text", "Glm4vTextModel"),
|
||||||
("glpn", "GLPNModel"),
|
("glpn", "GLPNModel"),
|
||||||
("got_ocr2", "GotOcr2Model"),
|
("got_ocr2", "GotOcr2Model"),
|
||||||
("gpt-sw3", "GPT2Model"),
|
("gpt-sw3", "GPT2Model"),
|
||||||
@@ -896,6 +898,7 @@ MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
|
|||||||
("fuyu", "FuyuForCausalLM"),
|
("fuyu", "FuyuForCausalLM"),
|
||||||
("gemma3", "Gemma3ForConditionalGeneration"),
|
("gemma3", "Gemma3ForConditionalGeneration"),
|
||||||
("git", "GitForCausalLM"),
|
("git", "GitForCausalLM"),
|
||||||
|
("glm4v", "Glm4vForConditionalGeneration"),
|
||||||
("got_ocr2", "GotOcr2ForConditionalGeneration"),
|
("got_ocr2", "GotOcr2ForConditionalGeneration"),
|
||||||
("idefics", "IdeficsForVisionText2Text"),
|
("idefics", "IdeficsForVisionText2Text"),
|
||||||
("idefics2", "Idefics2ForConditionalGeneration"),
|
("idefics2", "Idefics2ForConditionalGeneration"),
|
||||||
|
|||||||
@@ -66,6 +66,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
|
|||||||
("fuyu", "FuyuProcessor"),
|
("fuyu", "FuyuProcessor"),
|
||||||
("gemma3", "Gemma3Processor"),
|
("gemma3", "Gemma3Processor"),
|
||||||
("git", "GitProcessor"),
|
("git", "GitProcessor"),
|
||||||
|
("glm4v", "Glm4vProcessor"),
|
||||||
("got_ocr2", "GotOcr2Processor"),
|
("got_ocr2", "GotOcr2Processor"),
|
||||||
("granite_speech", "GraniteSpeechProcessor"),
|
("granite_speech", "GraniteSpeechProcessor"),
|
||||||
("grounding-dino", "GroundingDinoProcessor"),
|
("grounding-dino", "GroundingDinoProcessor"),
|
||||||
|
|||||||
@@ -238,6 +238,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
|
|||||||
("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
|
("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
|
||||||
("glm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
|
("glm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
|
||||||
("glm4", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
|
("glm4", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
|
||||||
|
("glm4v", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
|
||||||
("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)),
|
("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)),
|
||||||
("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
|
("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
|
||||||
("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
|
("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
|
||||||
|
|||||||
@@ -46,6 +46,7 @@ if TYPE_CHECKING:
|
|||||||
else:
|
else:
|
||||||
VIDEO_PROCESSOR_MAPPING_NAMES = OrderedDict(
|
VIDEO_PROCESSOR_MAPPING_NAMES = OrderedDict(
|
||||||
[
|
[
|
||||||
|
("glm4v", "Glm4vVideoProcessor"),
|
||||||
("instructblip", "InstructBlipVideoVideoProcessor"),
|
("instructblip", "InstructBlipVideoVideoProcessor"),
|
||||||
("instructblipvideo", "InstructBlipVideoVideoProcessor"),
|
("instructblipvideo", "InstructBlipVideoVideoProcessor"),
|
||||||
("internvl", "InternVLVideoProcessor"),
|
("internvl", "InternVLVideoProcessor"),
|
||||||
|
|||||||
28
src/transformers/models/glm4v/__init__.py
Normal file
28
src/transformers/models/glm4v/__init__.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
from ...utils import _LazyModule
|
||||||
|
from ...utils.import_utils import define_import_structure
|
||||||
|
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from .configuration_glm4v import *
|
||||||
|
from .modeling_glm4v import *
|
||||||
|
from .processing_glm4v import *
|
||||||
|
else:
|
||||||
|
import sys
|
||||||
|
|
||||||
|
_file = globals()["__file__"]
|
||||||
|
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
|
||||||
354
src/transformers/models/glm4v/configuration_glm4v.py
Normal file
354
src/transformers/models/glm4v/configuration_glm4v.py
Normal file
@@ -0,0 +1,354 @@
|
|||||||
|
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||||
|
# This file was automatically generated from src/transformers/models/glm4v/modular_glm4v.py.
|
||||||
|
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
||||||
|
# the file from the modular. If any change should be done, please apply the change to the
|
||||||
|
# modular_glm4v.py file directly. One of our CI enforces this.
|
||||||
|
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
from ...configuration_utils import PretrainedConfig
|
||||||
|
from ...modeling_rope_utils import rope_config_validation
|
||||||
|
|
||||||
|
|
||||||
|
class Glm4vVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Glm4vVisionModel`]. It is used to instantiate a
    Glm4vVisionModel model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of
    GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking).

    Args:
        depth (`int`, *optional*, defaults to 24):
            Number of layers (depth) in the model.
        hidden_size (`int`, *optional*, defaults to 1536):
            Dimensionality of the encoder layers and the pooler layer.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the encoder and pooler.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the queries, keys and values.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for attention weights.
        num_heads (`int`, *optional*, defaults to 12):
            Number of attention heads (presumably per vision layer; the consuming model code is not part of this
            file).
        in_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int` or `list[int]`, *optional*, defaults to 336):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        spatial_merge_size (`int`, *optional*, defaults to 2):
            The size used for merging spatial dimensions.
        temporal_patch_size (`int`, *optional*, defaults to 1):
            The size used for patches along the temporal dimension.
        out_hidden_size (`int`, *optional*, defaults to 4096):
            The output hidden size of the vision model.
        intermediate_size (`int`, *optional*, defaults to 13696):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        kwargs (*optional*):
            Any additional keyword arguments are forwarded unchanged to [`PretrainedConfig`]. Note that
            `hidden_dropout_prob`, `projection_dropout` and `num_channels` are *not* explicit parameters of this
            class; the number of input channels is configured through `in_channels`.

    Example:

    ```python
    >>> from transformers import Glm4vVisionConfig, Glm4vVisionModel

    >>> # Initializing a Glm4vVisionConfig GLM-4.1V-9B style configuration
    >>> configuration = Glm4vVisionConfig()

    >>> # Initializing a model (with random weights) from the GLM-4.1V-9B configuration
    >>> model = Glm4vVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "glm4v"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=24,
        hidden_size=1536,
        hidden_act="silu",
        attention_bias=False,
        attention_dropout=0.0,
        num_heads=12,
        in_channels=3,
        image_size=336,
        patch_size=14,
        rms_norm_eps=1e-05,
        spatial_merge_size=2,
        temporal_patch_size=1,
        out_hidden_size=4096,
        intermediate_size=13696,
        initializer_range=0.02,
        **kwargs,
    ):
        # Everything that is not an explicit vision hyper-parameter is handled by the base class.
        super().__init__(**kwargs)

        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.image_size = image_size
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.out_hidden_size = out_hidden_size
        self.intermediate_size = intermediate_size
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
|
||||||
|
|
||||||
|
|
||||||
|
class Glm4vTextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Glm4vTextModel`]. It is used to instantiate the
    text backbone of a GLM-4.1V model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of
    GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 151552):
            Vocabulary size of the Glm4v model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Glm4vModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 13696):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 40):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 2):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If `None` is passed explicitly, it falls back to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly. The dictionary is copied before use, so the object passed in is never modified. A legacy
            `type` key is mirrored to `rope_type`, and the `mrope_section` key is ignored by the validation.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
        image_token_id (`int`, *optional*):
            Token index used as placeholder for image embeddings.
        video_token_id (`int`, *optional*):
            Token index used as placeholder for video embeddings.

    ```python
    >>> from transformers import Glm4vTextModel, Glm4vTextConfig

    >>> # Initializing a GLM-4.1V style text configuration
    >>> configuration = Glm4vTextConfig()

    >>> # Initializing a model from the GLM-4.1V style text configuration
    >>> model = Glm4vTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "glm4v_text"
    base_config_key = "text_config"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Default tensor parallel plan for base model `Glm4v`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_up_proj": "colwise_rep",  # we need to replicate here due to the `chunk` operation
        "layers.*.mlp.down_proj": "rowwise_rep",  # we need to replicate here due to the `chunk` operation
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=151552,
        hidden_size=4096,
        intermediate_size=13696,
        num_hidden_layers=40,
        num_attention_heads=32,
        num_key_value_heads=2,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        attention_dropout=0.0,
        rope_scaling=None,
        image_token_id=None,
        video_token_id=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        # Shallow-copy the mapping so the BC key migration below never writes into the caller's dict.
        self.rope_scaling = dict(rope_scaling) if rope_scaling is not None else None

        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self, ignore_keys={"mrope_section"})
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id

        # The base-class initializer runs last, after all attributes above are set (same order as before).
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class Glm4vConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Glm4vModel`]. It is used to instantiate a
    GLM-4.1V model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of
    GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Glm4vTextConfig`):
            The config object or dictionary of the text backbone.
        vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Glm4vVisionConfig`):
            The config object or dictionary of the vision backbone.
        image_token_id (`int`, *optional*, defaults to 151343):
            The image token index to encode the image prompt.
        video_token_id (`int`, *optional*, defaults to 151344):
            The video token index to encode the video prompt.
        image_start_token_id (`int`, *optional*, defaults to 151339):
            The image start token index to encode the start of image.
        image_end_token_id (`int`, *optional*, defaults to 151340):
            The image end token index to encode the end of image.
        video_start_token_id (`int`, *optional*, defaults to 151341):
            The video start token index to encode the start of video.
        video_end_token_id (`int`, *optional*, defaults to 151342):
            The video end token index to encode the end of video.

    ```python
    >>> from transformers import Glm4vForConditionalGeneration, Glm4vConfig

    >>> # Initializing a GLM-4.1V style configuration
    >>> configuration = Glm4vConfig()

    >>> # Initializing a model from the GLM-4.1V style configuration
    >>> model = Glm4vForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "glm4v"
    sub_configs = {"vision_config": Glm4vVisionConfig, "text_config": Glm4vTextConfig}
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        image_token_id=151343,
        video_token_id=151344,
        image_start_token_id=151339,
        image_end_token_id=151340,
        video_start_token_id=151341,
        video_end_token_id=151342,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # The vision backbone may be given as a dict, left out (defaults), or passed as a ready config object.
        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()
        else:
            # An already-instantiated config is stored as-is; previously this case left the attribute unset.
            self.vision_config = vision_config

        if isinstance(text_config, dict):
            self.text_config = self.sub_configs["text_config"](**text_config)
        elif text_config is None:
            # For BC use all kwargs to init `TextConfig`
            self.text_config = self.sub_configs["text_config"](**kwargs)
        else:
            # Same as for the vision backbone: keep a caller-provided config object.
            self.text_config = text_config

        # Special token ids marking image/video placeholders and their start/end delimiters.
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.video_start_token_id = video_start_token_id
        self.video_end_token_id = video_end_token_id
        self.image_start_token_id = image_start_token_id
        self.image_end_token_id = image_end_token_id
|
||||||
|
|
||||||
|
|
||||||
|
# Names exported by `from <module> import *`; only the composite config and the text config are listed here.
__all__ = ["Glm4vConfig", "Glm4vTextConfig"]
|
||||||
645
src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py
Normal file
645
src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py
Normal file
@@ -0,0 +1,645 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2025 The HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import pickle
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Callable, Optional
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from safetensors.torch import save_file
|
||||||
|
|
||||||
|
|
||||||
|
# Avoid Using Megatron Lib
|
||||||
|
class UnpicklerWrapper(pickle.Unpickler):
    """Unpickler that resolves selected modules to a placeholder class.

    Any global whose module name starts with `megatron`, `glm` or `__main__` is replaced by a fresh do-nothing
    class, so a checkpoint referencing those modules can be unpickled without importing them. Every other global
    is resolved by the regular `pickle.Unpickler` lookup.
    """

    def find_class(self, mod_name, name):
        """Return the object to use for the pickled global `mod_name.name`."""

        class DummyClass:
            # Accepts and discards any constructor arguments.
            def __init__(self, *args, **kwargs):
                pass

        stubbed_prefixes = ("megatron", "glm", "__main__")
        if not mod_name.startswith(stubbed_prefixes):
            return super().find_class(mod_name, name)
        return DummyClass
|
||||||
|
|
||||||
|
|
||||||
|
# Monkey-patch the stdlib `pickle` module so that code which looks up `pickle.Unpickler` (presumably the
# `torch.load(..., pickle_module=pickle)` calls in this script) goes through `UnpicklerWrapper`.
# NOTE(review): only the listed module prefixes are stubbed; unpickling can still execute code from any other
# module referenced by the file, so only convert checkpoints from a trusted source.
pickle.Unpickler = UnpicklerWrapper
|
||||||
|
|
||||||
|
|
||||||
|
def dict_access_multi(a_dict, keys):
    """Index into a nested container by applying each entry of `keys` in order.

    With an empty `keys` the container itself is returned. A missing key raises whatever the underlying
    `[]` lookup raises.
    """
    node = a_dict
    for key in keys:
        node = node[key]
    return node
|
||||||
|
|
||||||
|
|
||||||
|
def merge_qkv(
    sd_list,
    original_tp,
    num_attention_heads,
    multi_query_group_num,
    attention_dim,
    multi_query_attention,
    interleaved_qkv,
):
    """Combine per-rank fused QKV tensors into full query, key and value tensors.

    Args:
        sd_list: One fused QKV tensor (weight or bias) per tensor-parallel rank, stacked along dim 0.
        original_tp: Tensor-parallel degree used to size each rank's q/k/v slice.
        num_attention_heads: Total number of query heads.
        multi_query_group_num: Total number of key/value heads.
        attention_dim: Size of one head.
        multi_query_attention: If true, each rank is split with explicit q/k/v sizes; otherwise into three
            equal chunks.
        interleaved_qkv: If false, the first `attention_dim // 2` entries of every q and k head are reordered
            (see below) before returning.

    Returns:
        A `(q, k, v)` tuple, except when `interleaved_qkv` is set and `multi_query_attention` is not: then the
        shards are simply concatenated along dim 0 and that single tensor is returned.
    """
    if interleaved_qkv and not multi_query_attention:
        return torch.cat(sd_list, dim=0)

    q_parts, k_parts, v_parts = [], [], []
    for shard in sd_list:
        if multi_query_attention:
            q_rows = num_attention_heads * attention_dim // original_tp
            kv_rows = multi_query_group_num * attention_dim // original_tp
            q_part, k_part, v_part = shard.split([q_rows, kv_rows, kv_rows], dim=0)
        else:
            q_part, k_part, v_part = shard.chunk(dim=0, chunks=3)
        q_parts.append(q_part.clone())
        k_parts.append(k_part.clone())
        v_parts.append(v_part.clone())

    q = torch.cat(q_parts, dim=0)
    k = torch.cat(k_parts, dim=0)
    v = torch.cat(v_parts, dim=0)

    if interleaved_qkv:
        return q, k, v

    # Index pattern [0, h, 1, h + 1, ...] with h = rotary_dim // 2: it interleaves the two halves of the
    # leading `rotary_dim` entries of a head.
    rotary_dim = attention_dim // 2
    half_rot = rotary_dim // 2
    perm_rot = torch.stack((torch.arange(0, half_rot), torch.arange(half_rot, rotary_dim)), dim=1).reshape(-1)

    # 2-D inputs are handled as (head, head_dim, columns); anything else as (head, head_dim).
    is_matrix = q.dim() == 2

    def _reorder_heads(tensor, num_heads):
        if is_matrix:
            per_head = tensor.view(num_heads, attention_dim, -1)
            per_head[:, :rotary_dim, :] = per_head[:, perm_rot, :]
            return per_head.reshape(-1, tensor.size(-1))
        per_head = tensor.view(num_heads, attention_dim)
        per_head[:, :rotary_dim] = per_head[:, perm_rot]
        return per_head.reshape(-1)

    q = _reorder_heads(q, num_attention_heads)
    k = _reorder_heads(k, multi_query_group_num)
    return q, k, v
|
||||||
|
|
||||||
|
|
||||||
|
def merge_glu(sd_list):
    """Merge per-rank fused two-part tensors into one fused tensor.

    Each shard is cut in two along dim 0. The result is all first halves (in shard order) followed by all
    second halves, concatenated along dim 0.
    """
    halves = [shard.chunk(dim=0, chunks=2) for shard in sd_list]
    first_halves = [pair[0].clone() for pair in halves]
    second_halves = [pair[1].clone() for pair in halves]
    return torch.cat(first_halves + second_halves, dim=0)
|
||||||
|
|
||||||
|
|
||||||
|
def merge_glu_vit(sd_list, original_tp=None):
    """Merge per-rank fused two-part tensors and return the two parts separately.

    Each shard is cut in two along dim 0; the first halves are concatenated into `gate_proj` and the second
    halves into `up_proj`. `original_tp` is accepted but not used.

    Returns:
        The `(gate_proj, up_proj)` tuple.
    """
    gate_parts, up_parts = [], []
    for shard in sd_list:
        gate_parts.append(shard.chunk(dim=0, chunks=2)[0].clone())
    for shard in sd_list:
        up_parts.append(shard.chunk(dim=0, chunks=2)[1].clone())
    return torch.cat(gate_parts, dim=0), torch.cat(up_parts, dim=0)
|
||||||
|
|
||||||
|
|
||||||
|
def split_glu(sd, cnt, idx):
    """Extract slice `idx` (out of `cnt`) from both halves of a fused two-part tensor.

    `sd` is cut in two along dim 0, each half is cut into `cnt` chunks, and chunk `idx` of the first half is
    concatenated with chunk `idx` of the second half.
    """
    pieces = []
    for half_index in (0, 1):
        half = sd.chunk(dim=0, chunks=2)[half_index]
        pieces.append(half.chunk(cnt, dim=0)[idx].clone())
    return torch.cat(pieces, dim=0)
|
||||||
|
|
||||||
|
|
||||||
|
def merge_qkv_vit(sd_list, original_tp=None):
    """Merge per-rank fused QKV tensors into one fused QKV tensor.

    Each shard is cut into three equal chunks along dim 0. The result stacks all first chunks, then all second
    chunks, then all third chunks (each group in shard order) along dim 0. `original_tp` is accepted but not
    used.
    """
    buckets = ([], [], [])
    for shard in sd_list:
        q_part, k_part, v_part = shard.chunk(dim=0, chunks=3)
        for bucket, part in zip(buckets, (q_part, k_part, v_part)):
            bucket.append(part.clone().contiguous())
    return torch.cat([torch.cat(bucket, dim=0) for bucket in buckets], dim=0)
|
||||||
|
|
||||||
|
|
||||||
|
def merge_tensors_vit(
    tp_sd: list[dict],
    keys: list[str],
    original_tp: int,
    target_tp: int,
    slice_dim: Optional[int] = None,
    merge_fn: Optional[Callable] = None,
):
    """Gather one tensor from the first `original_tp // target_tp` rank state dicts and merge it.

    Args:
        tp_sd: Per-rank (possibly nested) state dicts.
        keys: Path of keys locating the tensor inside each rank's dict.
        original_tp: Source tensor-parallel degree.
        target_tp: Target tensor-parallel degree.
        slice_dim: If given, the gathered tensors are concatenated along this dim.
        merge_fn: Used when `slice_dim` is `None`; called as `merge_fn(tensors, original_tp)`.

    Returns:
        The concatenated tensor, or whatever `merge_fn` returns.
    """
    shards_per_target = original_tp // target_tp
    sd_list = []
    for rank in range(shards_per_target):
        sd_list.append(dict_access_multi(tp_sd[rank], keys))

    if slice_dim is None:
        assert merge_fn is not None
        return merge_fn(sd_list, original_tp)
    return torch.cat(sd_list, dim=slice_dim)
|
||||||
|
|
||||||
|
|
||||||
|
def merge_tensors(
    tp_sd,
    keys,
    original_tp,
    target_tp,
    current_tp,
    slice_dim=None,
    merge_fn=None,
):
    """Gather one tensor from the source ranks feeding target rank `current_tp` and merge it.

    `original_tp // target_tp` consecutive source ranks map to each target rank; the group for `current_tp`
    starts at index `(original_tp // target_tp) * current_tp` in `tp_sd`.

    Args:
        tp_sd: Per-rank (possibly nested) state dicts.
        keys: Path of keys locating the tensor inside each rank's dict.
        original_tp: Source tensor-parallel degree.
        target_tp: Target tensor-parallel degree.
        current_tp: Index of the target rank being assembled.
        slice_dim: If given, the gathered tensors are concatenated along this dim.
        merge_fn: Used when `slice_dim` is `None`; called as `merge_fn(tensors)`.

    Returns:
        The concatenated tensor, or whatever `merge_fn` returns.
    """
    shards_per_target = original_tp // target_tp
    first_rank = shards_per_target * current_tp
    sd_list = []
    for step in range(shards_per_target):
        sd_list.append(dict_access_multi(tp_sd[step + first_rank], keys))

    if slice_dim is None:
        assert merge_fn is not None
        return merge_fn(sd_list)
    return torch.cat(sd_list, dim=slice_dim)
|
||||||
|
|
||||||
|
|
||||||
|
def save_sharded_model(state_dict, output_path, max_shard_size_gb=5, num_layers=40, vision_num_layers=24):
    """Write `state_dict` as layer-aligned safetensors shards plus a `model.safetensors.index.json`.

    Parameters are grouped per text decoder layer, per vision block and a final "others" group. Groups are packed
    into shards in that order and a group is never split across two shards, so a single group larger than the
    limit yields an oversized shard.

    Args:
        state_dict: Mapping from parameter name to tensor.
        output_path: Directory to write into; created if missing.
        max_shard_size_gb: Soft upper bound for one shard, in GiB.
        num_layers: Number of `model.language_model.layers.<i>.` groups to build.
        vision_num_layers: Number of `model.visual.blocks.<i>.` groups to build.

    Returns:
        The number of shard files written.
    """
    os.makedirs(output_path, exist_ok=True)

    # The group order follows the requested layer counts. It used to be hard-coded to 40/24, which raised
    # KeyError for smaller models and silently dropped the extra layers of larger ones.
    layer_order = (
        [f"layer_{i}" for i in range(num_layers)]
        + [f"visual_layer_{i}" for i in range(vision_num_layers)]
        + ["others"]
    )
    layered_dict = {group: {} for group in layer_order}

    # Single pass over the parameters instead of rescanning the whole dict once per layer.
    text_layer_re = re.compile(r"model\.language_model\.layers\.(0|[1-9]\d*)\.")
    vision_layer_re = re.compile(r"model\.visual\.blocks\.(0|[1-9]\d*)\.")
    for key, value in state_dict.items():
        group = "others"
        text_match = text_layer_re.search(key)
        vision_match = vision_layer_re.search(key)
        if text_match is not None and int(text_match.group(1)) < num_layers:
            group = f"layer_{int(text_match.group(1))}"
        elif vision_match is not None and int(vision_match.group(1)) < vision_num_layers:
            group = f"visual_layer_{int(vision_match.group(1))}"
        layered_dict[group][key] = value

    # Calculate sizes and create shards by layer
    total_size = 0
    shards = []
    current_shard = {}
    current_shard_size = 0
    max_shard_size_bytes = max_shard_size_gb * 1024 * 1024 * 1024

    for layer_key in layer_order:
        layer_weights = layered_dict[layer_key]
        layer_size = sum(param.numel() * param.element_size() for param in layer_weights.values())
        # Start a new shard when this group would overflow the current, non-empty one.
        if current_shard_size + layer_size > max_shard_size_bytes and current_shard:
            shards.append(current_shard)
            current_shard = {}
            current_shard_size = 0
        current_shard.update(layer_weights)
        current_shard_size += layer_size
        total_size += layer_size

    if current_shard:
        shards.append(current_shard)

    index_dict = {"metadata": {"total_size": total_size}, "weight_map": {}}

    for i, shard in enumerate(shards):
        shard_filename = f"model-{i + 1:05d}-of-{len(shards):05d}.safetensors"
        shard_path = os.path.join(output_path, shard_filename)

        for param_name in shard:
            index_dict["weight_map"][param_name] = shard_filename

        save_file(shard, shard_path, metadata={"format": "pt"})
        print(f"Saved shard {i + 1}/{len(shards)}: {shard_filename}")
        print(f"  Shard size: {sum(p.numel() * p.element_size() for p in shard.values()) / (1024**3):.2f} GB")
        print(f"  Keys in shard: {len(shard)}")

    index_path = os.path.join(output_path, "model.safetensors.index.json")
    with open(index_path, "w") as f:
        json.dump(index_dict, f, indent=2)

    return len(shards)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_megatron_shard(weight_path):
    """Load one `model_optim_rng.pt` file on CPU and check that it contains a `model` entry."""
    # NOTE(review): this unpickles the checkpoint through the `pickle` module; only load trusted files.
    shard = torch.load(weight_path, map_location="cpu", pickle_module=pickle)
    if "model" not in shard:
        raise ValueError(f"'model' key not found in {weight_path}")
    return shard


def _build_glm4v_hf_config(model_config, num_key_value_heads, vision_depth):
    """Translate the source configuration dict into the dict written to `config.json`.

    `num_key_value_heads` and `vision_depth` are passed in already resolved so that the exported config always
    describes the tensors that were actually merged.
    """
    hf_config = {
        "architectures": ["Glm4vForConditionalGeneration"],
        "model_type": "glm4v",
        "attention_bias": model_config.get("add_qkv_bias", True),
        "attention_dropout": 0.0,
        "pad_token_id": model_config.get("pad_token_id", 151329),
        "eos_token_id": model_config.get("eos_token_id", [151329, 151336, 151338]),
        "image_start_token_id": model_config.get("image_start_token_id", 151339),
        "image_end_token_id": model_config.get("image_end_token_id", 151340),
        "video_start_token_id": model_config.get("video_start_token_id", 151341),
        "video_end_token_id": model_config.get("video_end_token_id", 151342),
        "image_token_id": model_config.get("image_token_id", 151343),
        "video_token_id": model_config.get("video_token_id", 151344),
        "hidden_act": model_config.get("hidden_act", "silu"),
        "hidden_size": model_config.get("hidden_size", 4096),
        "initializer_range": 0.02,
        "intermediate_size": model_config.get("ffn_hidden_size", 13696),
        "max_position_embeddings": model_config.get("seq_length", 32768),
        "num_attention_heads": model_config.get("num_attention_heads", 32),
        "num_hidden_layers": model_config.get("num_layers", 40),
        "num_key_value_heads": num_key_value_heads,
        "rms_norm_eps": model_config.get("layernorm_epsilon", 1e-05),
        "rope_theta": model_config.get("rotary_base", 10000.0),
        "tie_word_embeddings": False,
        "torch_dtype": model_config.get("torch_dtype", "bfloat16"),
        "transformers_version": "4.53.0dev",
        "use_cache": model_config.get("use_cache", True),
        "vocab_size": model_config.get("vocab_size", 151552),
        "partial_rotary_factor": 0.5,
    }

    if "vision_config" in model_config:
        source_vision = model_config["vision_config"]
        hf_config["vision_config"] = {
            "hidden_size": source_vision.get("hidden_size", 1536),
            "depth": vision_depth,
            "num_heads": source_vision.get("num_attention_heads", 12),
            "attention_bias": source_vision.get("attention_bias", False),
            # Read from the top-level config, as in the original script.
            "intermediate_size": model_config.get("ffn_hidden_size", 13696),
            "hidden_act": source_vision.get("hidden_act", "silu"),
            "hidden_dropout_prob": source_vision.get("hidden_dropout_prob", 0.0),
            "initializer_range": 0.02,
            "image_size": source_vision.get("image_size", 336),
            "patch_size": source_vision.get("patch_size", 14),
            "out_hidden_size": model_config.get("hidden_size", 4096),
            "rms_norm_eps": source_vision.get("layernorm_epsilon", 1e-05),
            "spatial_merge_size": source_vision.get("downsample_ratio", 2),
            "temporal_patch_size": source_vision.get("t_patch", 2),
        }

    if "rope_scaling" in model_config:
        hf_config["rope_scaling"] = model_config["rope_scaling"]

    return hf_config


def merge_tp_weights(model_path, output_path, vllm_config_path=None):
    """Merge a tensor-parallel Megatron GLM-4.1V checkpoint into a sharded HuggingFace checkpoint.

    The tensor-parallel degree is detected from the `mp_rank_XX` sub-directories of `model_path`. All rank
    shards are loaded on CPU, the text decoder, vision tower and vision merger weights are merged and renamed,
    converted to bfloat16 and written to `output_path` together with a `config.json`.

    Args:
        model_path: Directory containing the `mp_rank_XX` (or `mp_rank_XX_YYY`) checkpoint folders.
        output_path: Directory receiving the safetensors shards, the weight index and `config.json`.
        vllm_config_path: JSON file with the model hyper-parameters. When `None`, the built-in defaults are
            used for every value.

    Returns:
        `None`. Nothing is written when the detected tensor-parallel degree is 1 or lower.

    Raises:
        ValueError: If a rank checkpoint has no `model` entry.
    """
    tp_size = 0
    for item in Path(model_path).iterdir():
        if not item.is_dir():
            continue
        match = re.match(r"mp_rank_(\d{2})", item.name)
        if match:
            tp_size = max(tp_size, int(match.group(1)) + 1)

    print(f"Detected tensor parallel degree TP={tp_size}")

    if tp_size <= 1:
        print("Model is already at TP=1, no need to merge")
        return

    # The config path is optional on the command line, so `None` must not be handed to `open()`.
    if vllm_config_path is None:
        print("No vLLM configuration file given, falling back to the built-in default hyper-parameters")
        model_config = {}
    else:
        print(f"Loading vLLM configuration file: {vllm_config_path}")
        with open(vllm_config_path, "r") as f:
            model_config = json.load(f)

    source_vision_config = model_config.get("vision_config", {})
    num_layers = model_config.get("num_layers", 40)
    # Resolve each value once, accepting both spellings the original script read in different places, so that
    # the merged weights and the exported config cannot disagree.
    vision_num_layers = source_vision_config.get("num_hidden_layers", source_vision_config.get("num_layers", 24))
    num_heads = model_config.get("num_attention_heads", 32)
    num_kv_heads = model_config.get("num_query_groups", model_config.get("multi_query_group_num", 2))
    hidden_size = model_config.get("hidden_size", 4096)
    head_dim = model_config.get("attention_dim", hidden_size // num_heads)

    print(
        f"Model parameters: num_layers={num_layers}, vision_num_layers={vision_num_layers}, "
        f"num_heads={num_heads}, multi_query_group_num={num_kv_heads}, hidden_size={hidden_size}"
    )

    print("Merging tensor parallel weights...")
    original_pp_enabled = os.path.exists(Path(model_path) / "mp_rank_00_000")
    original_tp, original_pp = tp_size, 1
    target_tp = 1
    print(f"TP and PP INFO: original_tp: {original_tp}, original_pp:{original_pp}, target_tp: {target_tp}")

    # Every rank file is loaded exactly once. The script used to read each file twice and kept both copies
    # in memory although the first copy was only used for an emptiness check.
    mgt_sd = []
    for i in range(original_pp):
        stage_shards = []
        for j in range(original_tp):
            print(f"Loading TP shard {j}...")
            rank_dir = f"mp_rank_{j:02d}_{i:03d}" if original_pp_enabled else f"mp_rank_{j:02d}"
            stage_shards.append(_load_megatron_shard(Path(model_path) / rank_dir / "model_optim_rng.pt"))
        mgt_sd.append(stage_shards)

    interleaved_qkv = False
    multi_query_attention = True
    num_attention_heads = num_heads
    multi_query_group_num = num_kv_heads
    attention_dim = head_dim
    complete_state_dict = {}
    keys = ["model"]
    rank = 0

    def _merge_text(tp_sd, name, **merge_kwargs):
        # Merge one text-model tensor across all ranks of a pipeline stage.
        return merge_tensors(
            tp_sd=tp_sd,
            keys=keys + [name],
            original_tp=original_tp,
            target_tp=target_tp,
            current_tp=0,
            **merge_kwargs,
        )

    def _merge_vision(name, **merge_kwargs):
        # Merge one vision tensor across all ranks of the first pipeline stage.
        return merge_tensors_vit(
            tp_sd=mgt_sd[0],
            keys=keys + [name],
            original_tp=original_tp,
            target_tp=target_tp,
            **merge_kwargs,
        )

    def _merge_text_qkv(sd_list):
        return merge_qkv(
            sd_list,
            original_tp,
            num_attention_heads,
            multi_query_group_num,
            attention_dim,
            multi_query_attention,
            interleaved_qkv,
        )

    # LLM
    for pp in range(original_pp):
        layer_i = 0
        mgt_encoder_tp_0 = dict_access_multi(mgt_sd[pp][rank], keys)

        # Walk the decoder layers until the next layer's input norm is missing from the checkpoint.
        while f"decoder.layers.{layer_i}.self_attention.linear_qkv.layer_norm_weight" in mgt_encoder_tp_0:
            src = f"decoder.layers.{layer_i}"
            dst = f"model.language_model.layers.{layer_i}"

            # Norm weights are taken from rank 0 without merging.
            complete_state_dict.update(
                {
                    f"{dst}.input_layernorm.weight": mgt_encoder_tp_0[
                        f"{src}.self_attention.linear_qkv.layer_norm_weight"
                    ],
                    f"{dst}.post_attention_layernorm.weight": mgt_encoder_tp_0[
                        f"{src}.mlp.linear_fc1.layer_norm_weight"
                    ],
                    f"{dst}.post_self_attn_layernorm.weight": mgt_encoder_tp_0[
                        f"{src}.post_self_attn_layernorm.weight"
                    ],
                    f"{dst}.post_mlp_layernorm.weight": mgt_encoder_tp_0[f"{src}.post_mlp_layernorm.weight"],
                }
            )

            q, k, v = _merge_text(mgt_sd[pp], f"{src}.self_attention.linear_qkv.weight", merge_fn=_merge_text_qkv)
            complete_state_dict[f"{dst}.self_attn.q_proj.weight"] = q.clone()
            complete_state_dict[f"{dst}.self_attn.k_proj.weight"] = k.clone()
            complete_state_dict[f"{dst}.self_attn.v_proj.weight"] = v.clone()

            if f"{src}.self_attention.linear_qkv.bias" in mgt_encoder_tp_0:
                q_bias, k_bias, v_bias = _merge_text(
                    mgt_sd[pp], f"{src}.self_attention.linear_qkv.bias", merge_fn=_merge_text_qkv
                )
                complete_state_dict[f"{dst}.self_attn.q_proj.bias"] = q_bias.clone()
                complete_state_dict[f"{dst}.self_attn.k_proj.bias"] = k_bias.clone()
                complete_state_dict[f"{dst}.self_attn.v_proj.bias"] = v_bias.clone()

            o_proj = _merge_text(mgt_sd[pp], f"{src}.self_attention.linear_proj.weight", slice_dim=1)
            complete_state_dict[f"{dst}.self_attn.o_proj.weight"] = o_proj.clone()

            # MLP - Use gate_up_proj
            complete_state_dict[f"{dst}.mlp.gate_up_proj.weight"] = _merge_text(
                mgt_sd[pp], f"{src}.mlp.linear_fc1.weight", merge_fn=merge_glu
            ).clone()
            complete_state_dict[f"{dst}.mlp.down_proj.weight"] = _merge_text(
                mgt_sd[pp], f"{src}.mlp.linear_fc2.weight", slice_dim=1
            )
            layer_i += 1

    # Embedding model, LM head, and final norm
    embed_tokens = _merge_text(mgt_sd[0], "embedding.word_embeddings.weight", slice_dim=0)
    complete_state_dict["model.language_model.embed_tokens.weight"] = embed_tokens.clone()
    lm_head = _merge_text(mgt_sd[-1], "output_layer.weight", slice_dim=0)
    complete_state_dict["lm_head.weight"] = lm_head.clone()
    complete_state_dict["model.language_model.norm.weight"] = mgt_sd[-1][rank]["model"][
        "decoder.final_layernorm.weight"
    ].clone()
    mgt_encoder_tp_0 = dict_access_multi(mgt_sd[0][0], keys)

    # VLM
    for layer_i in range(vision_num_layers):
        src = f"vision_model.transformer.layers.{layer_i}"
        dst = f"model.visual.blocks.{layer_i}"

        complete_state_dict[f"{dst}.norm1.weight"] = mgt_encoder_tp_0[f"{src}.input_layernorm.weight"]
        complete_state_dict[f"{dst}.norm2.weight"] = mgt_encoder_tp_0[f"{src}.pre_mlp_layernorm.weight"]

        qkv_weight = _merge_vision(f"{src}.self_attention.linear_qkv.weight", merge_fn=merge_qkv_vit)
        complete_state_dict[f"{dst}.attn.qkv.weight"] = qkv_weight.clone()

        proj_weight = _merge_vision(f"{src}.self_attention.linear_proj.weight", slice_dim=1)
        complete_state_dict[f"{dst}.attn.proj.weight"] = proj_weight.clone()

        gate_proj_weight, up_proj_weight = _merge_vision(f"{src}.mlp.linear_fc1.weight", merge_fn=merge_glu_vit)
        complete_state_dict[f"{dst}.mlp.gate_proj.weight"] = gate_proj_weight.clone()
        complete_state_dict[f"{dst}.mlp.up_proj.weight"] = up_proj_weight.clone()

        down_proj_weight = _merge_vision(f"{src}.mlp.linear_fc2.weight", slice_dim=1)
        complete_state_dict[f"{dst}.mlp.down_proj.weight"] = down_proj_weight.clone()

    # Tensors below are copied from rank 0 of the first stage without merging.
    for hf_name, mgt_name in (
        ("model.visual.downsample.weight", "vision_model.downsample.weight"),
        ("model.visual.downsample.bias", "vision_model.downsample.bias"),
    ):
        complete_state_dict[hf_name] = mgt_encoder_tp_0[mgt_name].clone().contiguous()

    # Merger
    gate_proj, up_proj = _merge_vision("vision_projection.encoder.linear_fc1.weight", merge_fn=merge_glu_vit)
    down_proj = _merge_vision("vision_projection.encoder.linear_fc2.weight", slice_dim=1)
    proj = _merge_vision("vision_projection.encoder.linear_fc_extra.weight", slice_dim=0)

    complete_state_dict["model.visual.merger.gate_proj.weight"] = gate_proj.clone().contiguous()
    complete_state_dict["model.visual.merger.up_proj.weight"] = up_proj.clone().contiguous()
    complete_state_dict["model.visual.merger.down_proj.weight"] = down_proj.clone().contiguous()
    complete_state_dict["model.visual.merger.proj.weight"] = proj.clone().contiguous()

    for hf_name, mgt_name in (
        ("model.visual.merger.post_projection_norm.weight", "vision_projection.encoder.layer_norm.weight"),
        ("model.visual.merger.post_projection_norm.bias", "vision_projection.encoder.layer_norm.bias"),
        ("model.visual.embeddings.position_embedding.weight", "vision_model.position_embeddings.weight"),
        ("model.visual.patch_embed.proj.weight", "vision_model.conv3d.weight"),
        ("model.visual.patch_embed.proj.bias", "vision_model.conv3d.bias"),
    ):
        complete_state_dict[hf_name] = mgt_encoder_tp_0[mgt_name].clone().contiguous()

    # Additional vision norm layers are copied only when the checkpoint has them.
    for hf_name, mgt_name in (
        ("model.visual.post_conv_layernorm.weight", "vision_model.post_conv_layernorm.weight"),
        ("model.visual.post_layernorm.weight", "vision_model.post_layernorm.weight"),
    ):
        if mgt_name in mgt_encoder_tp_0:
            complete_state_dict[hf_name] = mgt_encoder_tp_0[mgt_name].clone().contiguous()

    print(f"Total keys in state dict: {len(complete_state_dict)}")

    for key, value in complete_state_dict.items():
        if isinstance(value, torch.Tensor):
            complete_state_dict[key] = value.to(torch.bfloat16)
    print("Converted all tensors to bfloat16")

    # Save Model weight
    save_sharded_model(
        complete_state_dict,
        output_path=output_path,
        max_shard_size_gb=5,
        num_layers=num_layers,
        vision_num_layers=vision_num_layers,
    )

    hf_config = _build_glm4v_hf_config(model_config, num_kv_heads, vision_num_layers)
    config_path = os.path.join(output_path, "config.json")
    with open(config_path, "w") as f:
        json.dump(hf_config, f, indent=2)

    print(f"Conversion complete! Model saved to {output_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """
    Build the command-line interface of the conversion script and parse `sys.argv`.

    Returns:
        `argparse.Namespace`: holds `model_path` and `output_path` (both required) and
        `config_path` (optional, `None` when the flag is not given).
    """
    cli = argparse.ArgumentParser(description="Convert Megatron model to HuggingFace format")

    # Required locations: where the checkpoint is read from and where the result is written.
    cli.add_argument("--model_path", type=str, required=True, help="Path to Megatron model directory")
    cli.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="Output path for HuggingFace model directory",
    )

    # Optional configuration file used when creating the HuggingFace config.
    cli.add_argument(
        "--config_path",
        type=str,
        help="Path to vLLM configuration file for creating HuggingFace config",
    )

    namespace = cli.parse_args()
    return namespace
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: parse the command-line options, then run the conversion with the
    # input directory, the output directory and the (possibly None) configuration path.
    args = parse_args()
    merge_tp_weights(args.model_path, args.output_path, args.config_path)
|
||||||
467
src/transformers/models/glm4v/image_processing_glm4v.py
Normal file
467
src/transformers/models/glm4v/image_processing_glm4v.py
Normal file
@@ -0,0 +1,467 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Image processor class for GLM-4.1V."""
|
||||||
|
|
||||||
|
import math
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from ...image_processing_utils import BaseImageProcessor, BatchFeature
|
||||||
|
from ...image_transforms import (
|
||||||
|
convert_to_rgb,
|
||||||
|
resize,
|
||||||
|
to_channel_dimension_format,
|
||||||
|
)
|
||||||
|
from ...image_utils import (
|
||||||
|
OPENAI_CLIP_MEAN,
|
||||||
|
OPENAI_CLIP_STD,
|
||||||
|
ChannelDimension,
|
||||||
|
ImageInput,
|
||||||
|
PILImageResampling,
|
||||||
|
get_image_size,
|
||||||
|
infer_channel_dimension_format,
|
||||||
|
is_scaled_image,
|
||||||
|
make_flat_list_of_images,
|
||||||
|
make_list_of_images,
|
||||||
|
to_numpy_array,
|
||||||
|
valid_images,
|
||||||
|
validate_preprocess_arguments,
|
||||||
|
)
|
||||||
|
from ...utils import TensorType, logging
|
||||||
|
from ...video_utils import VideoInput
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def smart_resize(
    num_frames: int,
    height: int,
    width: int,
    temporal_factor: int = 2,
    factor: int = 28,
    min_pixels: int = 112 * 112,
    max_pixels: int = 14 * 14 * 2 * 2 * 2 * 6144,
) -> tuple[int, int]:
    """
    Compute a target `(height, width)` for resizing.

    Both returned values are multiples of `factor`. The product
    `frames * height * width` is steered toward the `[min_pixels, max_pixels]` range while
    approximately preserving the aspect ratio.

    Args:
        num_frames (`int`):
            Number of frames of the input (use `temporal_factor` for a single image).
        height (`int`):
            Original height.
        width (`int`):
            Original width.
        temporal_factor (`int`, *optional*, defaults to 2):
            The frame count is rounded to a multiple of this value before the budget check.
        factor (`int`, *optional*, defaults to 28):
            The returned height and width are multiples of this value.
        min_pixels (`int`, *optional*, defaults to `112 * 112`):
            Lower bound for `frames * height * width`.
        max_pixels (`int`, *optional*, defaults to `14 * 14 * 2 * 2 * 2 * 6144`):
            Upper bound for `frames * height * width`.

    Returns:
        `tuple[int, int]`: The target `(height, width)`.

    Raises:
        ValueError: If `num_frames < temporal_factor`, if `height` or `width` is below `factor`,
            or if the aspect ratio exceeds 200.
    """
    if num_frames < temporal_factor:
        raise ValueError(f"t:{num_frames} must be larger than temporal_factor:{temporal_factor}")
    if height < factor or width < factor:
        raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
    elif max(height, width) / min(height, width) > 200:
        raise ValueError(
            f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
        )

    # Snap every dimension to the nearest multiple of its granularity.
    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor
    t_bar = round(num_frames / temporal_factor) * temporal_factor

    if t_bar * h_bar * w_bar > max_pixels:
        # Too large: shrink both sides by the same ratio and round down.
        beta = math.sqrt((num_frames * height * width) / max_pixels)
        # Flooring can reach 0 for a very thin image under a tight budget; never return a
        # side smaller than one `factor` cell, otherwise the resize target would be empty.
        h_bar = max(factor, math.floor(height / beta / factor) * factor)
        w_bar = max(factor, math.floor(width / beta / factor) * factor)
    elif t_bar * h_bar * w_bar < min_pixels:
        # Too small: grow both sides by the same ratio and round up.
        beta = math.sqrt(min_pixels / (num_frames * height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor

    return h_bar, w_bar
|
||||||
|
|
||||||
|
|
||||||
|
class Glm4vImageProcessor(BaseImageProcessor):
    r"""
    Constructs a GLM-4V image processor that dynamically resizes images based on the original images.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions.
        size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000}`):
            Pixel budget used when resizing. Must contain both keys:
            - `"shortest_edge"`: lower bound passed to `smart_resize` as `min_pixels`.
            - `"longest_edge"`: upper bound passed to `smart_resize` as `max_pixels`.
            Can be overridden by the `size` parameter in the `preprocess` method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use when resizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        patch_size (`int`, *optional*, defaults to 14):
            The spatial patch size of the vision encoder.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            The temporal patch size of the vision encoder.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size of the vision encoder to llm encoder.
    """

    model_input_names = ["pixel_values", "image_grid_thw"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Optional[dict[str, int]] = None,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        do_convert_rgb: bool = True,
        patch_size: int = 14,
        temporal_patch_size: int = 2,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        # Only fall back to the default budget when no size was given; a valid
        # caller-supplied size must be kept as-is.
        if size is None:
            size = {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000}
        elif "shortest_edge" not in size or "longest_edge" not in size:
            raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
        self.size = size

        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD

        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.merge_size = merge_size
        self.do_convert_rgb = do_convert_rgb

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        do_resize: Optional[bool] = None,
        size: Optional[dict[str, int]] = None,
        resample: PILImageResampling = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        patch_size: Optional[int] = None,
        temporal_patch_size: Optional[int] = None,
        merge_size: Optional[int] = None,
        do_convert_rgb: Optional[bool] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Preprocess one image (or a list of same-sized frames) into flattened patches.

        Args:
            images (`ImageInput`):
                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Pixel budget for resizing. `shortest_edge` and `longest_edge` keys must be present. When `None`,
                the default bounds of `smart_resize` are used.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Scale factor to use if rescaling the image.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
            patch_size (`int`, *optional*, defaults to `self.patch_size`):
                The spatial patch size of the vision encoder.
            temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
                The temporal patch size of the vision encoder.
            merge_size (`int`, *optional*, defaults to `self.merge_size`):
                The merge size of the vision encoder to llm encoder.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `tuple`: `(flatten_patches, (grid_t, grid_h, grid_w))` where `flatten_patches` has shape
            `(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size)`.
        """
        images = make_list_of_images(images)

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if do_rescale and is_scaled_image(images[0]):
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = height, width
        if do_resize:
            # The target size only depends on the first image's size, so compute it once.
            # Forward the configured pixel budget; without a `size` keep smart_resize's defaults.
            pixel_bounds = {}
            if size is not None:
                pixel_bounds = {"min_pixels": size["shortest_edge"], "max_pixels": size["longest_edge"]}
            resized_height, resized_width = smart_resize(
                num_frames=temporal_patch_size,
                height=height,
                width=width,
                temporal_factor=temporal_patch_size,
                factor=patch_size * merge_size,
                **pixel_bounds,
            )

        processed_images = []
        for image in images:
            if do_resize:
                image = resize(
                    image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
                )

            if do_rescale:
                image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)

            if do_normalize:
                image = self.normalize(
                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
                )

            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
            processed_images.append(image)

        patches = np.array(processed_images)
        if data_format == ChannelDimension.LAST:
            # The patch layout below is computed on (frames, channels, height, width).
            patches = patches.transpose(0, 3, 1, 2)
        if patches.shape[0] % temporal_patch_size != 0:
            # Pad the frame axis by repeating the last frame until it is a multiple of temporal_patch_size.
            repeats = np.repeat(
                patches[-1][np.newaxis], temporal_patch_size - (patches.shape[0] % temporal_patch_size), axis=0
            )
            patches = np.concatenate([patches, repeats], axis=0)
        channel = patches.shape[1]
        grid_t = patches.shape[0] // temporal_patch_size
        grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
        patches = patches.reshape(
            grid_t,
            temporal_patch_size,
            channel,
            grid_h // merge_size,
            merge_size,
            patch_size,
            grid_w // merge_size,
            merge_size,
            patch_size,
        )
        # Reorder so that patches belonging to the same merge window are contiguous.
        patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
        flatten_patches = patches.reshape(
            grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size
        )

        return flatten_patches, (grid_t, grid_h, grid_w)

    def preprocess(
        self,
        images: ImageInput,
        videos: VideoInput = None,
        do_resize: Optional[bool] = None,
        size: Optional[dict[str, int]] = None,
        resample: PILImageResampling = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        patch_size: Optional[int] = None,
        temporal_patch_size: Optional[int] = None,
        merge_size: Optional[int] = None,
        do_convert_rgb: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            videos (`VideoInput`):
                Accepted for signature compatibility; this method does not read it.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Pixel budget for resizing. Must contain `"shortest_edge"` (minimum) and `"longest_edge"` (maximum).
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            patch_size (`int`, *optional*, defaults to `self.patch_size`):
                The spatial patch size of the vision encoder.
            temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
                The temporal patch size of the vision encoder.
            merge_size (`int`, *optional*, defaults to `self.merge_size`):
                The merge size of the vision encoder to llm encoder.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `BatchFeature`: with `pixel_values` and `image_grid_thw` when `images` is not `None`.
        """
        # Resolve the size first so that a caller-supplied value is honoured and the
        # instance default is used otherwise; only then validate the keys.
        size = size if size is not None else self.size
        if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
            raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")

        do_resize = do_resize if do_resize is not None else self.do_resize

        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        patch_size = patch_size if patch_size is not None else self.patch_size
        temporal_patch_size = temporal_patch_size if temporal_patch_size is not None else self.temporal_patch_size
        merge_size = merge_size if merge_size is not None else self.merge_size
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        if images is not None:
            images = make_flat_list_of_images(images)

        if images is not None and not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        validate_preprocess_arguments(
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_resize=do_resize,
            size=size,
            resample=resample,
        )

        data = {}
        if images is not None:
            pixel_values, vision_grid_thws = [], []
            # Each image is patchified on its own because images may differ in size.
            for image in images:
                patches, image_grid_thw = self._preprocess(
                    image,
                    do_resize=do_resize,
                    size=size,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    patch_size=patch_size,
                    temporal_patch_size=temporal_patch_size,
                    merge_size=merge_size,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(image_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data.update({"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws})

        return BatchFeature(data=data, tensor_type=return_tensors)

    def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
        """
        A utility that returns number of image patches for a given image size.

        Args:
            height (`int`):
                Height of the input image.
            width (`int`):
                Width of the input image.
            images_kwargs (`dict`, *optional*)
                Any kwargs to override defaults of the image processor. The keys `patch_size`,
                `merge_size` and `size` are read.
        Returns:
            `int`: Number of image patches per image.
        """
        # `images_kwargs` defaults to None; treat that as "no overrides".
        images_kwargs = images_kwargs or {}
        patch_size = images_kwargs.get("patch_size", None) or self.patch_size
        merge_size = images_kwargs.get("merge_size", None) or self.merge_size
        size = images_kwargs.get("size", None) or self.size

        factor = patch_size * merge_size
        # Keyword names must match the `smart_resize` signature (`num_frames`, `temporal_factor`).
        resized_height, resized_width = smart_resize(
            num_frames=self.temporal_patch_size,
            height=height,
            width=width,
            temporal_factor=self.temporal_patch_size,
            factor=factor,
            min_pixels=size["shortest_edge"],
            max_pixels=size["longest_edge"],
        )
        grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
        return grid_h * grid_w
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Glm4vImageProcessor"]
|
||||||
364
src/transformers/models/glm4v/image_processing_glm4v_fast.py
Normal file
364
src/transformers/models/glm4v/image_processing_glm4v_fast.py
Normal file
@@ -0,0 +1,364 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Fast Image processor class for GLM-4.1V."""
|
||||||
|
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
from ...image_processing_utils import (
|
||||||
|
BatchFeature,
|
||||||
|
)
|
||||||
|
from ...image_processing_utils_fast import (
|
||||||
|
BaseImageProcessorFast,
|
||||||
|
DefaultFastImageProcessorKwargs,
|
||||||
|
group_images_by_shape,
|
||||||
|
reorder_images,
|
||||||
|
)
|
||||||
|
from ...image_utils import (
|
||||||
|
OPENAI_CLIP_MEAN,
|
||||||
|
OPENAI_CLIP_STD,
|
||||||
|
ChannelDimension,
|
||||||
|
ImageInput,
|
||||||
|
PILImageResampling,
|
||||||
|
SizeDict,
|
||||||
|
get_image_size,
|
||||||
|
make_flat_list_of_images,
|
||||||
|
valid_images,
|
||||||
|
)
|
||||||
|
from ...processing_utils import Unpack
|
||||||
|
from ...utils import (
|
||||||
|
TensorType,
|
||||||
|
auto_docstring,
|
||||||
|
is_torch_available,
|
||||||
|
is_torchvision_available,
|
||||||
|
is_torchvision_v2_available,
|
||||||
|
logging,
|
||||||
|
)
|
||||||
|
from ...video_utils import VideoInput
|
||||||
|
from .image_processing_glm4v import smart_resize
|
||||||
|
|
||||||
|
|
||||||
|
if is_torch_available():
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
if is_torchvision_available():
|
||||||
|
from ...image_utils import pil_torch_interpolation_mapping
|
||||||
|
|
||||||
|
if is_torchvision_v2_available():
|
||||||
|
from torchvision.transforms.v2 import functional as F
|
||||||
|
else:
|
||||||
|
from torchvision.transforms import functional as F
|
||||||
|
|
||||||
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Keyword arguments accepted by the fast GLM-4V image processor in addition to those
# declared by `DefaultFastImageProcessorKwargs`.
# NOTE(review): the docstring below is left untouched because `auto_docstring` may parse
# it to document these arguments — confirm before reformatting it.
class Glm4vFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    """
    patch_size (`int`, *optional*, defaults to 14):
        The spatial patch size of the vision encoder.
    temporal_patch_size (`int`, *optional*, defaults to 2):
        The temporal patch size of the vision encoder.
    merge_size (`int`, *optional*, defaults to 2):
        The merge size of the vision encoder to llm encoder.
    """

    # Spatial patch size of the vision encoder.
    patch_size: Optional[int]
    # Temporal patch size of the vision encoder.
    temporal_patch_size: Optional[int]
    # Merge size between the vision encoder and the LLM.
    merge_size: Optional[int]
|
||||||
|
|
||||||
|
|
||||||
|
@auto_docstring
|
||||||
|
class Glm4vImageProcessorFast(BaseImageProcessorFast):
|
||||||
|
do_resize = True
|
||||||
|
resample = PILImageResampling.BICUBIC
|
||||||
|
size = {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000}
|
||||||
|
do_rescale = True
|
||||||
|
do_normalize = True
|
||||||
|
image_mean = OPENAI_CLIP_MEAN
|
||||||
|
image_std = OPENAI_CLIP_STD
|
||||||
|
do_convert_rgb = True
|
||||||
|
patch_size = 14
|
||||||
|
temporal_patch_size = 2
|
||||||
|
merge_size = 2
|
||||||
|
valid_kwargs = Glm4vFastImageProcessorKwargs
|
||||||
|
model_input_names = ["pixel_values", "image_grid_thw"]
|
||||||
|
|
||||||
|
def __init__(self, **kwargs: Unpack[Glm4vFastImageProcessorKwargs]):
|
||||||
|
size = kwargs.pop("size", None)
|
||||||
|
if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
|
||||||
|
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
|
||||||
|
else:
|
||||||
|
size = self.size
|
||||||
|
|
||||||
|
super().__init__(size=size, **kwargs)
|
||||||
|
|
||||||
|
def _preprocess(
    self,
    images: list["torch.Tensor"],
    do_resize: bool,
    size: SizeDict,
    interpolation: Optional["F.InterpolationMode"],
    do_rescale: bool,
    rescale_factor: float,
    do_normalize: bool,
    image_mean: Optional[Union[float, list[float]]],
    image_std: Optional[Union[float, list[float]]],
    patch_size: int,
    temporal_patch_size: int,
    merge_size: int,
    do_convert_rgb: bool,
    input_data_format: Optional[Union[str, ChannelDimension]],
    device: Optional[Union[str, torch.device]],
):
    """
    Resize, rescale and normalize the given images, then cut them into flattened patches.

    Args:
        images (`ImageInput`):
            Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
        do_resize (`bool`):
            Whether to resize the image.
        size (`SizeDict`):
            Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present.
        interpolation (`InterpolationMode`):
            Resampling filter to use if resizing the image.
        do_rescale (`bool`):
            Whether to rescale the image.
        rescale_factor (`float`):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`):
            Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
        image_std (`float` or `List[float]`):
            Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
        patch_size (`int`):
            The spatial patch size of the vision encoder.
        temporal_patch_size (`int`):
            The temporal patch size of the vision encoder.
        merge_size (`int`):
            The merge size of the vision encoder to llm encoder.
        do_convert_rgb (`bool`):
            Whether to convert the image to RGB.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format for the input image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        device (`torch.device`, *optional*):
            The device to process the images on. If unset, the device is inferred from the input images.

    Returns:
        `tuple`: The flattened patches, of shape
        `(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size)`, and the
        `(grid_t, grid_h, grid_w)` grid they were cut from.
    """
    images = self._prepare_input_images(
        images=images,
        do_convert_rgb=do_convert_rgb,
        input_data_format=input_data_format,
        device=device,
    )

    # Used as the output size when `do_resize` is False.
    height, width = get_image_size(images[0], channel_dim=ChannelDimension.FIRST)
    resized_height, resized_width = height, width

    # Group images by size for batched resizing
    grouped_images, grouped_images_index = group_images_by_shape(images)
    resized_images_grouped = {}
    for shape, stacked_images in grouped_images.items():
        if do_resize:
            # Derive the target size from this group's own dimensions, not from the first image.
            height, width = stacked_images.shape[-2:]
            # NOTE(review): `size` is not forwarded to `smart_resize` here, so the helper's own pixel
            # limits apply -- confirm whether `size` should bound the resize.
            resized_height, resized_width = smart_resize(
                num_frames=temporal_patch_size,
                height=height,
                width=width,
                temporal_factor=temporal_patch_size,
                factor=patch_size * merge_size,
            )
            stacked_images = F.resize(
                stacked_images, size=(resized_height, resized_width), interpolation=interpolation
            )
        resized_images_grouped[shape] = stacked_images
    resized_images = reorder_images(resized_images_grouped, grouped_images_index)

    # Group images by size for further processing
    # Needed in case do_resize is False, or resize returns images with different sizes
    grouped_images, grouped_images_index = group_images_by_shape(resized_images)
    processed_images_grouped = {}
    for shape, stacked_images in grouped_images.items():
        # Fused rescale and normalize
        stacked_images = self.rescale_and_normalize(
            stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
        )
        processed_images_grouped[shape] = stacked_images

    processed_images = reorder_images(processed_images_grouped, grouped_images_index)
    patches = torch.stack(processed_images, dim=0)

    # Pad the temporal axis with copies of the last frame up to the next multiple of
    # `temporal_patch_size`, so that the `view` below is valid for any frame count.
    remainder = patches.shape[0] % temporal_patch_size
    if remainder != 0:
        repeats = patches[-1].unsqueeze(0).repeat(temporal_patch_size - remainder, 1, 1, 1)
        patches = torch.cat([patches, repeats], dim=0)

    channel = patches.shape[1]
    grid_t = patches.shape[0] // temporal_patch_size
    grid_h, grid_w = resized_height // patch_size, resized_width // patch_size

    # Split time, height and width into (grid, patch) factors, with the spatial grid further split
    # into merge blocks of `merge_size`.
    patches = patches.view(
        grid_t,
        temporal_patch_size,
        channel,
        grid_h // merge_size,
        merge_size,
        patch_size,
        grid_w // merge_size,
        merge_size,
        patch_size,
    )
    # Bring the grid dimensions to the front and the per-patch content to the back.
    patches = patches.permute(0, 3, 6, 4, 7, 2, 1, 5, 8)
    flatten_patches = patches.reshape(
        grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size
    )

    return flatten_patches, (grid_t, grid_h, grid_w)
|
||||||
|
|
||||||
|
@auto_docstring
def preprocess(
    self,
    images: ImageInput,
    videos: VideoInput = None,
    do_resize: Optional[bool] = None,
    size: Optional[dict[str, int]] = None,
    resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]] = None,
    do_rescale: Optional[bool] = None,
    rescale_factor: Optional[float] = None,
    do_normalize: Optional[bool] = None,
    image_mean: Optional[Union[float, list[float]]] = None,
    image_std: Optional[Union[float, list[float]]] = None,
    patch_size: Optional[int] = None,
    temporal_patch_size: Optional[int] = None,
    merge_size: Optional[int] = None,
    do_convert_rgb: Optional[bool] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
    device: Optional["torch.device"] = None,
    **kwargs,
):
    r"""
    patch_size (`int`, *optional*, defaults to 14):
        The spatial patch size of the vision encoder.
    temporal_patch_size (`int`, *optional*, defaults to 2):
        The temporal patch size of the vision encoder.
    merge_size (`int`, *optional*, defaults to 2):
        The merge size of the vision encoder to llm encoder.
    """

    # Every option left as `None` falls back to the processor-level default.
    # NOTE: `videos` and `**kwargs` are accepted but not used by this method.
    do_resize = do_resize if do_resize is not None else self.do_resize
    size = size if size is not None else self.size
    resample = resample if resample is not None else self.resample
    do_rescale = do_rescale if do_rescale is not None else self.do_rescale
    rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
    do_normalize = do_normalize if do_normalize is not None else self.do_normalize
    image_mean = image_mean if image_mean is not None else self.image_mean
    image_std = image_std if image_std is not None else self.image_std
    patch_size = patch_size if patch_size is not None else self.patch_size
    temporal_patch_size = temporal_patch_size if temporal_patch_size is not None else self.temporal_patch_size
    merge_size = merge_size if merge_size is not None else self.merge_size
    do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

    # Make hashable for cache
    size = SizeDict(**size) if size is not None else None
    image_mean = tuple(image_mean) if image_mean is not None else None
    image_std = tuple(image_std) if image_std is not None else None

    self._validate_preprocess_kwargs(
        do_rescale=do_rescale,
        rescale_factor=rescale_factor,
        do_normalize=do_normalize,
        image_mean=image_mean,
        image_std=image_std,
        do_resize=do_resize,
        size=size,
        resample=resample,
        return_tensors=return_tensors,
        data_format=data_format,
    )
    # Translate a PIL resampling enum (or its int value) into the torchvision interpolation mode.
    interpolation = (
        pil_torch_interpolation_mapping[resample] if isinstance(resample, (PILImageResampling, int)) else resample
    )

    if images is not None:
        images = make_flat_list_of_images(images)

    if images is not None and not valid_images(images):
        raise ValueError(
            "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
            "torch.Tensor, tf.Tensor or jax.ndarray."
        )

    data = {}
    if images is not None:
        patches_per_image, vision_grid_thws = [], []
        # Each image is processed on its own, so every image gets its own (t, h, w) grid.
        for image in images:
            patches, image_grid_thw = self._preprocess(
                image,
                do_resize=do_resize,
                size=size,
                interpolation=interpolation,
                do_rescale=do_rescale,
                rescale_factor=rescale_factor,
                do_normalize=do_normalize,
                image_mean=image_mean,
                image_std=image_std,
                patch_size=patch_size,
                temporal_patch_size=temporal_patch_size,
                merge_size=merge_size,
                do_convert_rgb=do_convert_rgb,
                input_data_format=input_data_format,
                device=device,
            )
            patches_per_image.append(patches)
            vision_grid_thws.append(image_grid_thw)
        # Concatenate along the patch axis in one call instead of unbinding every patch row into a
        # Python list and stacking the rows again; the resulting tensor is the same.
        pixel_values = torch.cat(patches_per_image, dim=0)
        vision_grid_thws = torch.tensor(vision_grid_thws)
        data.update({"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws})

    return BatchFeature(data=data, tensor_type=return_tensors)
|
||||||
|
|
||||||
|
def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
    """
    A utility that returns number of image patches for a given image size.

    Args:
        height (`int`):
            Height of the input image.
        width (`int`):
            Width of the input image.
        images_kwargs (`dict`, *optional*)
            Any kwargs to override defaults of the image processor.
    Returns:
        `int`: Number of image patches per image.
    """
    # `images_kwargs` is optional; treat a missing value as "no overrides".
    images_kwargs = images_kwargs or {}
    patch_size = images_kwargs.get("patch_size", None) or self.patch_size
    merge_size = images_kwargs.get("merge_size", None) or self.merge_size

    factor = patch_size * merge_size
    # Use the same keyword names as the `smart_resize` call in `_preprocess`.
    resized_height, resized_width = smart_resize(
        num_frames=self.temporal_patch_size,
        height=height,
        width=width,
        temporal_factor=self.temporal_patch_size,
        factor=factor,
    )
    grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
    return grid_h * grid_w
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Glm4vImageProcessorFast"]
|
||||||
1667
src/transformers/models/glm4v/modeling_glm4v.py
Normal file
1667
src/transformers/models/glm4v/modeling_glm4v.py
Normal file
File diff suppressed because it is too large
Load Diff
1733
src/transformers/models/glm4v/modular_glm4v.py
Normal file
1733
src/transformers/models/glm4v/modular_glm4v.py
Normal file
File diff suppressed because it is too large
Load Diff
289
src/transformers/models/glm4v/processing_glm4v.py
Normal file
289
src/transformers/models/glm4v/processing_glm4v.py
Normal file
@@ -0,0 +1,289 @@
|
|||||||
|
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||||
|
# This file was automatically generated from src/transformers/models/glm4v/modular_glm4v.py.
|
||||||
|
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
||||||
|
# the file from the modular. If any change should be done, please apply the change to the
|
||||||
|
# modular_glm4v.py file directly. One of our CI enforces this.
|
||||||
|
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
from ...feature_extraction_utils import BatchFeature
|
||||||
|
from ...image_utils import ImageInput
|
||||||
|
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
|
||||||
|
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||||
|
from ...video_utils import VideoInput
|
||||||
|
|
||||||
|
|
||||||
|
class Glm4vVideosProcessorKwargs(VideosKwargs, total=False):
    # Video keyword arguments accepted by the GLM-4V processor, in addition to those of `VideosKwargs`.
    # `fps` may be a single value or a list of values.
    fps: Union[list[float], float]
|
||||||
|
|
||||||
|
|
||||||
|
class Glm4vImagesKwargs(ImagesKwargs):
    # Image keyword arguments accepted by the GLM-4V processor, in addition to those of `ImagesKwargs`.
    patch_size: Optional[int]
    temporal_patch_size: Optional[int]
    merge_size: Optional[int]
|
||||||
|
|
||||||
|
|
||||||
|
class Glm4vProcessorKwargs(ProcessingKwargs, total=False):
    # Per-modality keyword argument groups, merged by `Glm4vProcessor.__call__` via `_merge_kwargs`.
    images_kwargs: Glm4vImagesKwargs
    videos_kwargs: Glm4vVideosProcessorKwargs
    # Only the text group declares a default here: padding is disabled.
    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
    }
|
||||||
|
|
||||||
|
|
||||||
|
class Glm4vProcessor(ProcessorMixin):
    r"""
    Constructs a GLM-4V processor which wraps a GLM-4V image processor, a GLM-4V video processor and a GLM-4
    tokenizer into a single processor.
    See the [`~Glm4vProcessor.__call__`] and [`~Glm4vProcessor.decode`] for more information.
    Args:
        image_processor ([`Glm4vImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`PreTrainedTokenizerFast`], *optional*):
            The tokenizer is a required input.
        video_processor ([`Glm4vVideoProcessor`], *optional*):
            The video processor is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    """

    # Names of the sub-processors, in the order they are passed to `ProcessorMixin.__init__`.
    attributes = ["image_processor", "tokenizer", "video_processor"]

    image_processor_class = "AutoImageProcessor"
    video_processor_class = "AutoVideoProcessor"

    tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast")
|
||||||
|
|
||||||
|
def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
    """
    Register the sub-processors and resolve the image/video placeholder tokens and their ids.

    The tokenizer's own `image_token` / `video_token` are used when it defines them, otherwise the
    literal `<|image|>` / `<|video|>` tokens. Token ids come from the tokenizer attribute when it is
    set to a truthy value, and are looked up from the token string otherwise.
    """
    super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)

    self.image_token = getattr(tokenizer, "image_token", "<|image|>")
    self.video_token = getattr(tokenizer, "video_token", "<|video|>")

    image_id = getattr(tokenizer, "image_token_id", None)
    if not image_id:
        image_id = tokenizer.convert_tokens_to_ids(self.image_token)
    self.image_token_id = image_id

    video_id = getattr(tokenizer, "video_token_id", None)
    if not video_id:
        video_id = tokenizer.convert_tokens_to_ids(self.video_token)
    self.video_token_id = video_id
|
||||||
|
|
||||||
|
def __call__(
    self,
    images: ImageInput = None,
    text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
    videos: VideoInput = None,
    **kwargs: Unpack[Glm4vProcessorKwargs],
) -> BatchFeature:
    """
    Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
    and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
    the text.

    Args:
        images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
            The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
            tensor. Both channels-first and channels-last formats are supported.
        text (`str`, `List[str]`, `List[List[str]]`):
            The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
            (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
            `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
        videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
            The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
            tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
        return_tensors (`str` or [`~utils.TensorType`], *optional*):
            If set, will return tensors of a particular framework. Acceptable values are:
            - `'tf'`: Return TensorFlow `tf.constant` objects.
            - `'pt'`: Return PyTorch `torch.Tensor` objects.
            - `'np'`: Return NumPy `np.ndarray` objects.
            - `'jax'`: Return JAX `jnp.ndarray` objects.

    Returns:
        [`BatchFeature`]: A [`BatchFeature`] with the following fields:

        - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
        - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
          `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
          `None`).
        - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
        - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
        - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
    """
    output_kwargs = self._merge_kwargs(
        Glm4vProcessorKwargs,
        tokenizer_init_kwargs=self.tokenizer.init_kwargs,
        **kwargs,
    )
    if images is not None:
        image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
        image_grid_thw = image_inputs["image_grid_thw"]
    else:
        image_inputs = {}
        image_grid_thw = None

    if videos is not None:
        videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
        # `timestamps` is only used to build the prompt below; popping it keeps it out of the returned features.
        timestamps = videos_inputs.pop("timestamps")
        video_grid_thw = videos_inputs["video_grid_thw"]
    else:
        videos_inputs = {}
        timestamps = []
        video_grid_thw = None

    if not isinstance(text, list):
        text = [text]

    text = text.copy()  # below lines change text in-place
    if image_grid_thw is not None:
        merge_length = self.image_processor.merge_size**2
        # `index` walks through the image grids in the order the image tokens appear across all texts.
        index = 0
        for i in range(len(text)):
            while self.image_token in text[i]:
                num_image_tokens = image_grid_thw[index].prod() // merge_length
                # Expand into a temporary placeholder so the `while` condition stops matching the
                # tokens that were already expanded.
                text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
                index += 1
            text[i] = text[i].replace("<|placeholder|>", self.image_token)

    if video_grid_thw is not None:
        merge_length = self.video_processor.merge_size**2
        # NOTE(review): `video_index` is incremented below but never read.
        video_index = 0
        for i in range(len(text)):
            while self.video_token in text[i]:
                # NOTE(review): every entry of `video_grid_thw` is treated as one frame of the current
                # video, and only the first row of `timestamps` is used -- this looks like it assumes a
                # single video per call; confirm.
                num_frames = len(video_grid_thw)
                video_structure = ""

                if hasattr(timestamps, "tolist"):
                    timestamps_list = timestamps.tolist()[0]
                else:
                    timestamps_list = timestamps[0] if isinstance(timestamps[0], list) else timestamps
                # Plain copy of the timestamps (no de-duplication happens here, despite the name).
                unique_timestamps = []
                for idx in range(0, len(timestamps_list)):
                    unique_timestamps.append(timestamps_list[idx])
                selected_timestamps = unique_timestamps[:num_frames]
                # Pad with the last timestamp (or 0 when there is none) up to one timestamp per frame.
                while len(selected_timestamps) < num_frames:
                    selected_timestamps.append(selected_timestamps[-1] if selected_timestamps else 0)
                # Each frame becomes an image segment followed by its timestamp.
                for frame_idx in range(num_frames):
                    timestamp_sec = selected_timestamps[frame_idx]
                    frame_structure = f"<|begin_of_image|>{self.image_token}<|end_of_image|>{timestamp_sec}"
                    video_structure += frame_structure
                text[i] = text[i].replace(self.video_token, video_structure, 1)
                video_index += 1

            # Expand the per-frame image tokens, one grid entry at a time.
            # NOTE(review): this matches the first image token found in the text, so it would also hit
            # image tokens expanded by the image branch above -- confirm mixed image/video prompts.
            for frame_idx in range(len(video_grid_thw)):
                if self.image_token in text[i]:
                    num_image_tokens = video_grid_thw[frame_idx].prod() // merge_length
                    text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)

            text[i] = text[i].replace("<|placeholder|>", self.image_token)

    # `return_tensors` is applied once to the combined output instead of by the tokenizer.
    return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
    text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
    self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])

    return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
|
||||||
|
|
||||||
|
def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
    """
    Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
    Args:
        image_sizes (`list[list[int]]`, *optional*):
            The input sizes formatted as (height, width) per each image.
        video_sizes (`list[list[int]]`, *optional*):
            The input sizes formatted as (num_frames, height, width) per each video.
    Returns:
        `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
        input modalities, along with other useful data.
    """

    vision_data = {}
    if image_sizes is not None:
        # Copy the defaults so per-call overrides never leak into the shared class-level dict.
        images_kwargs = dict(Glm4vProcessorKwargs._defaults.get("images_kwargs", {}))
        images_kwargs.update(kwargs)
        merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size

        num_image_patches = [
            self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
            for image_size in image_sizes
        ]
        num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
        vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})

    if video_sizes is not None:
        videos_kwargs = dict(Glm4vProcessorKwargs._defaults.get("videos_kwargs", {}))
        videos_kwargs.update(kwargs)
        # Resolve the merge size here too, so a video-only call does not depend on the image branch.
        video_merge_size = videos_kwargs.get("merge_size", None) or self.video_processor.merge_size

        num_video_patches = [
            self.video_processor.get_number_of_video_patches(*video_size, videos_kwargs)
            for video_size in video_sizes
        ]
        num_video_tokens = [(num_patches // video_merge_size**2) for num_patches in num_video_patches]
        vision_data["num_video_tokens"] = num_video_tokens

    return MultiModalData(**vision_data)
|
||||||
|
|
||||||
|
def batch_decode(self, *args, **kwargs):
    """
    This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
    refer to the docstring of this method for more information.
    """
    return self.tokenizer.batch_decode(*args, **kwargs)
|
||||||
|
|
||||||
|
def decode(self, *args, **kwargs):
    """
    This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
    the docstring of this method for more information.
    """
    return self.tokenizer.decode(*args, **kwargs)
|
||||||
|
|
||||||
|
def post_process_image_text_to_text(
    self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
):
    """
    Post-process the output of the model to decode the text.

    Args:
        generated_outputs (`torch.Tensor` or `np.ndarray`):
            The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
            or `(sequence_length,)`.
        skip_special_tokens (`bool`, *optional*, defaults to `True`):
            Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
        **kwargs:
            Additional arguments to be passed to the tokenizer's `batch_decode` method.

    Returns:
        `list[str]`: The decoded text.
    """
    return self.tokenizer.batch_decode(
        generated_outputs,
        skip_special_tokens=skip_special_tokens,
        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
        **kwargs,
    )
|
||||||
|
|
||||||
|
@property
def model_input_names(self):
    """
    Input names of the tokenizer followed by those of the image processor, without duplicates and in
    first-seen order, with `"second_per_grid_ts"` appended at the end.
    """
    combined = self.tokenizer.model_input_names + self.image_processor.model_input_names
    # `dict.fromkeys` drops duplicates while keeping the first-seen order.
    deduplicated = list(dict.fromkeys(combined))
    deduplicated.append("second_per_grid_ts")
    return deduplicated
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Glm4vProcessor"]
|
||||||
262
src/transformers/models/glm4v/video_processing_glm4v.py
Normal file
262
src/transformers/models/glm4v/video_processing_glm4v.py
Normal file
@@ -0,0 +1,262 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""video processor class for GLM-4.1V."""
|
||||||
|
|
||||||
|
import math
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from ...image_processing_utils import (
|
||||||
|
BatchFeature,
|
||||||
|
)
|
||||||
|
from ...image_utils import (
|
||||||
|
OPENAI_CLIP_MEAN,
|
||||||
|
OPENAI_CLIP_STD,
|
||||||
|
ChannelDimension,
|
||||||
|
SizeDict,
|
||||||
|
get_image_size,
|
||||||
|
)
|
||||||
|
from ...processing_utils import Unpack, VideosKwargs
|
||||||
|
from ...utils import (
|
||||||
|
TensorType,
|
||||||
|
add_start_docstrings,
|
||||||
|
is_torch_available,
|
||||||
|
is_vision_available,
|
||||||
|
)
|
||||||
|
from .image_processing_glm4v import smart_resize
|
||||||
|
|
||||||
|
|
||||||
|
if is_torch_available():
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from ...utils.import_utils import requires
|
||||||
|
from ...video_processing_utils import (
|
||||||
|
BASE_VIDEO_PROCESSOR_DOCSTRING,
|
||||||
|
BaseVideoProcessor,
|
||||||
|
)
|
||||||
|
from ...video_utils import VideoMetadata, group_videos_by_shape, reorder_videos
|
||||||
|
|
||||||
|
|
||||||
|
if is_vision_available():
|
||||||
|
from ...image_utils import PILImageResampling
|
||||||
|
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
|
||||||
|
class Glm4vVideoProcessorInitKwargs(VideosKwargs):
    # Keyword arguments accepted by the GLM-4V video processor, in addition to those of `VideosKwargs`.
    # Every field defaults to `None`.
    max_image_size: dict[str, int] = None
    patch_size: Optional[int] = None
    temporal_patch_size: Optional[int] = None
    merge_size: Optional[int] = None
    image_mean: Optional[list[float]] = None
    image_std: Optional[list[float]] = None
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings(
    "Constructs a fast GLM-4V video processor that dynamically resizes videos based on the original videos.",
    BASE_VIDEO_PROCESSOR_DOCSTRING,
    """
        patch_size (`int`, *optional*, defaults to 14):
            The spatial patch size of the vision encoder.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            The temporal patch size of the vision encoder.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size of the vision encoder to llm encoder.
    """,
)
@requires(backends=("torchvision",))
class Glm4vVideoProcessor(BaseVideoProcessor):
    resample = PILImageResampling.BICUBIC
    size = {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 2 * 30000}
    # Pixel budget handed to `smart_resize` as `max_pixels` in `_preprocess`.
    max_image_size = {"longest_edge": 28 * 28 * 2 * 30000}
    image_mean = OPENAI_CLIP_MEAN
    image_std = OPENAI_CLIP_STD
    do_resize = True
    do_rescale = True
    do_normalize = True
    do_convert_rgb = True
    do_sample_frames = True
    patch_size = 14
    temporal_patch_size = 2
    # Clips longer than this many seconds are sampled uniformly instead of at `fps`.
    max_duration = 300
    merge_size = 2
    valid_kwargs = Glm4vVideoProcessorInitKwargs
    num_frames = 16
    # Target sampling rate (frames per second) used by `sample_frames`.
    fps = 2

    model_input_names = ["pixel_values_videos", "video_grid_thw"]

    def __init__(self, **kwargs: Unpack[Glm4vVideoProcessorInitKwargs]):
        super().__init__(**kwargs)

    @staticmethod
    def _get_metadata_value(metadata, name, default=None):
        """
        Read `name` from `metadata`, which may be an attribute-style object or a plain `dict`.

        A missing entry and an entry explicitly set to `None` both yield `default`.
        """
        if isinstance(metadata, dict):
            value = metadata.get(name)
        else:
            value = getattr(metadata, name, None)
        return default if value is None else value

    def sample_frames(
        self,
        video: torch.Tensor,
        metadata: Union[VideoMetadata, dict],
    ):
        """
        Pick the frames of `video` that are fed to the model.

        Args:
            video (`torch.Tensor`):
                Decoded video, indexed by frame along its first dimension.
            metadata (`VideoMetadata` or `dict`):
                Source of `fps`, `total_num_frames` and `duration`. Missing (or `None`) values fall back to
                `2.0`, `video.shape[0]` and a duration derived from the frame count, respectively.

        Returns:
            `tuple`: the sampled frames and, for every second sampled frame, the whole second it belongs to.
        """
        total_frames = video.shape[0]
        video_fps = self._get_metadata_value(metadata, "fps", 2.0)
        meta_frames = self._get_metadata_value(metadata, "total_num_frames", total_frames)
        max_frame_idx = meta_frames - 1
        duration = self._get_metadata_value(metadata, "duration", None)
        if duration is None:
            duration = round(max_frame_idx / video_fps) + 1

        if duration <= self.max_duration:
            # Sample at `self.fps`; keep at least one frame so very short clips do not produce an empty video.
            n = max(1, int(math.floor(duration * self.fps)))
            frame_indices = [min(max_frame_idx, int(math.ceil(i * video_fps / self.fps))) for i in range(n)]
        else:
            # Long clip: cap the number of samples and spread them uniformly over the whole duration.
            num_samples = int(self.max_duration * self.fps)
            if num_samples >= meta_frames:
                frame_indices = list(range(meta_frames))
            else:
                target_seconds = np.linspace(0, duration, num_samples, endpoint=True)
                frame_indices = [min(max_frame_idx, int(math.ceil(t * video_fps))) for t in target_seconds]

        # Drop duplicated indices while keeping their first-seen order.
        uniq = list(dict.fromkeys(frame_indices))

        # Keep an even number of frames by repeating the last one.
        if len(uniq) & 1:
            uniq.append(uniq[-1])

        frame_indices = uniq
        sampled_video = video[frame_indices]
        full_second_idxs = [int(idx / video_fps) for idx in frame_indices]
        second_idxs = full_second_idxs[::2]  # mrope
        return sampled_video, second_idxs

    def _preprocess(
        self,
        videos: list[torch.Tensor],
        video_metadata: Optional[Union[list[VideoMetadata], list[dict]]] = None,
        do_convert_rgb: bool = True,
        do_resize: bool = True,
        size: SizeDict = None,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255.0,
        do_normalize: bool = True,
        do_sample_frames: bool = True,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        patch_size: Optional[int] = None,
        temporal_patch_size: Optional[int] = None,
        merge_size: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ):
        """
        Sample, resize, normalize and patchify `videos`.

        Returns a `BatchFeature` holding `pixel_values_videos`, `video_grid_thw` and `timestamps`.

        Raises:
            ValueError: if frame sampling is requested but no metadata was supplied.
            AssertionError: if `do_sample_frames` is `False`.
        """
        timestamps_list = []
        if do_sample_frames:
            if video_metadata is None or (isinstance(video_metadata, list) and video_metadata[0] is None):
                raise ValueError(
                    "Frame sampling is enabled but no video metadata was found. "
                    "Please pass in `VideoMetadata` object per each input video or set `do_sample_frames=False`"
                )
            processed_videos = []
            for video, metadata in zip(videos, video_metadata):
                video, timestamps = self.sample_frames(video, metadata)
                timestamps_list.append(timestamps)
                processed_videos.append(video)
        else:
            raise AssertionError("Must set `do_sample_frames=True` to sample frames from GLM-4.1V Model.")

        grouped_videos, grouped_videos_index = group_videos_by_shape(processed_videos)
        resized_videos_grouped = {}

        for shape, stacked_videos in grouped_videos.items():
            B, T, C, H, W = stacked_videos.shape
            num_frames, height, width = T, H, W
            if do_resize:
                # NOTE(review): the pixel budget comes from `self.max_image_size`; the `size` argument is not
                # consulted here -- confirm this is intended.
                resized_height, resized_width = smart_resize(
                    num_frames=num_frames,
                    height=height,
                    width=width,
                    temporal_factor=temporal_patch_size,
                    factor=patch_size * merge_size,
                    max_pixels=self.max_image_size["longest_edge"],
                )
                # Fold frames into the batch axis so every frame is interpolated as a 4D image batch.
                stacked_videos = stacked_videos.view(B * T, C, H, W)
                stacked_videos = F.interpolate(
                    stacked_videos, size=(resized_height, resized_width), mode="bicubic", align_corners=False
                )
                stacked_videos = stacked_videos.view(B, T, C, resized_height, resized_width)
            resized_videos_grouped[shape] = stacked_videos
        resized_videos = reorder_videos(resized_videos_grouped, grouped_videos_index)

        # Group videos by size for further processing
        # Needed in case do_resize is False, or resize returns videos with different sizes
        grouped_videos, grouped_videos_index = group_videos_by_shape(resized_videos)
        processed_videos_grouped = {}
        processed_grids = {}
        for shape, stacked_videos in grouped_videos.items():
            resized_height, resized_width = get_image_size(stacked_videos[0], channel_dim=ChannelDimension.FIRST)

            # Fused rescale and normalize
            stacked_videos = self.rescale_and_normalize(
                stacked_videos, do_rescale, rescale_factor, do_normalize, image_mean, image_std
            )
            patches = stacked_videos

            # Make the number of frames divisible by `temporal_patch_size` by repeating the last frame.
            # Pad by exactly the missing amount so this also holds for `temporal_patch_size > 2`.
            remainder = patches.shape[1] % temporal_patch_size
            if remainder != 0:
                repeats = patches[:, -1:].repeat(1, temporal_patch_size - remainder, 1, 1, 1)
                patches = torch.cat([patches, repeats], dim=1)
            batch_size, grid_t, channel = patches.shape[:3]
            grid_t = grid_t // temporal_patch_size
            grid_h, grid_w = resized_height // patch_size, resized_width // patch_size

            patches = patches.view(
                batch_size,
                grid_t,
                temporal_patch_size,
                channel,
                grid_h // merge_size,
                merge_size,
                patch_size,
                grid_w // merge_size,
                merge_size,
                patch_size,
            )
            patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
            flatten_patches = patches.reshape(
                batch_size,
                grid_t * grid_h * grid_w,
                channel * temporal_patch_size * patch_size * patch_size,
            )

            processed_videos_grouped[shape] = flatten_patches
            processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size

        processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
        processed_grids = reorder_videos(processed_grids, grouped_videos_index)
        pixel_values_videos = torch.cat(processed_videos, dim=0)
        video_grid_thw = torch.tensor(processed_grids)
        # The grid is re-expressed as one `[1, h, w]` row per temporal patch.
        # NOTE(review): only the first video's grid is read here -- presumably a single video per call; verify
        # before relying on this with batches of differently shaped videos.
        total_frames = video_grid_thw[0][0].item()
        h = video_grid_thw[0][1].item()
        w = video_grid_thw[0][2].item()
        video_grid_thw = [[1, h, w] for _ in range(total_frames)]
        data = {
            "pixel_values_videos": pixel_values_videos,
            "video_grid_thw": video_grid_thw,
            "timestamps": timestamps_list,
        }

        return BatchFeature(data=data, tensor_type=return_tensors)
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Glm4vVideoProcessor"]
|
||||||
0
tests/models/glm4v/__init__.py
Normal file
0
tests/models/glm4v/__init__.py
Normal file
512
tests/models/glm4v/test_modeling_glm4v.py
Normal file
512
tests/models/glm4v/test_modeling_glm4v.py
Normal file
@@ -0,0 +1,512 @@
|
|||||||
|
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Testing suite for the PyTorch GLM-4.1V model."""
|
||||||
|
|
||||||
|
import copy
|
||||||
|
import gc
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from parameterized import parameterized
|
||||||
|
|
||||||
|
from transformers import (
|
||||||
|
AutoProcessor,
|
||||||
|
Glm4vConfig,
|
||||||
|
Glm4vForConditionalGeneration,
|
||||||
|
Glm4vModel,
|
||||||
|
is_torch_available,
|
||||||
|
is_vision_available,
|
||||||
|
)
|
||||||
|
from transformers.testing_utils import (
|
||||||
|
require_flash_attn,
|
||||||
|
require_torch,
|
||||||
|
require_torch_gpu,
|
||||||
|
slow,
|
||||||
|
torch_device,
|
||||||
|
)
|
||||||
|
|
||||||
|
from ...generation.test_utils import GenerationTesterMixin
|
||||||
|
from ...test_configuration_common import ConfigTester
|
||||||
|
from ...test_modeling_common import (
|
||||||
|
ModelTesterMixin,
|
||||||
|
floats_tensor,
|
||||||
|
ids_tensor,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if is_torch_available():
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
if is_vision_available():
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
|
||||||
|
class Glm4vVisionText2TextModelTester:
    """Builds a tiny `Glm4vConfig` plus matching dummy inputs for the common model tests."""

    def __init__(
        self,
        parent,
        batch_size=3,
        seq_length=7,
        num_channels=3,
        ignore_index=-100,
        image_size=112,
        video_start_token_id=3,
        video_end_token_id=4,
        image_start_token_id=5,
        image_end_token_id=6,
        image_token_id=7,
        video_token_id=8,
        is_training=True,
        text_config=None,
        vision_config=None,
    ):
        """
        Args:
            parent: the owning test case.
            seq_length (`int`): number of text tokens; `num_image_tokens` is added on top of it.
            text_config (`dict`, *optional*): text model settings; a fresh default dict is built when omitted.
            vision_config (`dict`, *optional*): vision model settings; a fresh default dict is built when omitted.

        The remaining arguments are stored on the instance unchanged.
        """
        # Defaults are created per instance so that mutating one tester's config cannot leak into another.
        if text_config is None:
            text_config = {
                "vocab_size": 99,
                "hidden_size": 32,
                "intermediate_size": 37,
                "num_hidden_layers": 4,
                "num_attention_heads": 4,
                "num_key_value_heads": 2,
                "output_channels": 64,
                "hidden_act": "silu",
                "max_position_embeddings": 512,
                "rope_scaling": {"type": "default", "mrope_section": [2, 1, 1]},
                "max_window_layers": 3,
                "rope_theta": 10000,
                "tie_word_embeddings": True,
                "bos_token_id": 0,
                "eos_token_id": 0,
                "pad_token_id": 0,
            }
        if vision_config is None:
            vision_config = {
                "depth": 2,
                "embed_dim": 32,
                "hidden_act": "silu",
                "hidden_size": 32,
                "mlp_ratio": 4,
                "num_heads": 4,
                "patch_size": 14,
                "spatial_merge_size": 1,
                "temporal_patch_size": 2,
            }

        self.parent = parent
        self.ignore_index = ignore_index
        self.bos_token_id = text_config["bos_token_id"]
        self.eos_token_id = text_config["eos_token_id"]
        self.pad_token_id = text_config["pad_token_id"]
        self.video_start_token_id = video_start_token_id
        self.video_end_token_id = video_end_token_id
        self.image_start_token_id = image_start_token_id
        self.image_end_token_id = image_end_token_id
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.text_config = text_config
        self.vision_config = vision_config
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.image_size = image_size
        self.is_training = is_training
        self.hidden_size = text_config["hidden_size"]
        self.num_hidden_layers = text_config["num_hidden_layers"]
        self.num_attention_heads = text_config["num_attention_heads"]
        self.vocab_size = text_config["vocab_size"]
        self.num_image_tokens = 64
        self.seq_length = seq_length + self.num_image_tokens

    def get_config(self):
        """Return a `Glm4vConfig` assembled from the stored sub-configs and special token ids."""
        return Glm4vConfig(
            text_config=self.text_config,
            vision_config=self.vision_config,
            image_token_id=self.image_token_id,
            video_token_id=self.video_token_id,
            video_start_token_id=self.video_start_token_id,
            video_end_token_id=self.video_end_token_id,
            image_start_token_id=self.image_start_token_id,
            image_end_token_id=self.image_end_token_id,
        )

    def prepare_config_and_inputs(self):
        """Return the config and a random `pixel_values` tensor of flattened patches for the whole batch."""
        config = self.get_config()
        patch_size = config.vision_config.patch_size
        temporal_patch_size = config.vision_config.temporal_patch_size
        pixel_values = floats_tensor(
            [
                self.batch_size * (self.image_size**2) // (patch_size**2),
                self.num_channels * (patch_size**2) * temporal_patch_size,
            ]
        )

        return config, pixel_values

    def prepare_config_and_inputs_for_common(self):
        """Return the config and an inputs dict where each sequence starts with one image placeholder span."""
        config_and_inputs = self.prepare_config_and_inputs()
        config, pixel_values = config_and_inputs
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)

        # Random ids must not collide with any special multimodal token, so overwrite those with padding.
        for special_token_id in (
            self.video_token_id,
            self.image_token_id,
            self.video_start_token_id,
            self.image_start_token_id,
            self.video_end_token_id,
            self.image_end_token_id,
        ):
            input_ids[input_ids == special_token_id] = self.pad_token_id

        # Layout per row: <image_start> <image> * num_image_tokens <image_end> ...text...
        input_ids[:, 0] = self.image_start_token_id
        input_ids[:, 1 : 1 + self.num_image_tokens] = self.image_token_id
        input_ids[:, 1 + self.num_image_tokens] = self.image_end_token_id
        patch_size = config.vision_config.patch_size
        patches_per_side = self.image_size // patch_size

        inputs_dict = {
            "pixel_values": pixel_values,
            "image_grid_thw": torch.tensor([[1, patches_per_side, patches_per_side]] * self.batch_size),
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        return config, inputs_dict
|
||||||
|
|
||||||
|
|
||||||
|
@require_torch
class Glm4vModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    # Common modeling and generation tests for GLM-4.1V, with overrides where the flattened
    # `pixel_values` layout does not fit the generic helpers.
    all_model_classes = (Glm4vModel, Glm4vForConditionalGeneration) if is_torch_available() else ()
    test_pruning = False
    test_head_masking = False
    _is_composite = True

    def setUp(self):
        self.model_tester = Glm4vVisionText2TextModelTester(self)
        self.config_tester = ConfigTester(self, config_class=Glm4vConfig, has_text_modality=False)

    def test_config(self):
        self.config_tester.run_common_tests()

    # GLM4V has images shaped as (bs*patch_len, dim) so we can't slice to batches in generate
    def prepare_config_and_inputs_for_generate(self, batch_size=2):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        # We don't want a few model inputs in our model input dictionary for generation tests
        input_keys_to_ignore = [
            # we don't want to mask attention heads
            "head_mask",
            "decoder_head_mask",
            "cross_attn_head_mask",
            # we don't want encoder-decoder models to start from filled decoder ids
            "decoder_input_ids",
            "decoder_attention_mask",
            # we'll set cache use in each test differently
            "use_cache",
            # Ignore labels if it is in the input dict
            "labels",
            # model-specific exceptions should overload/overwrite this function
        ]

        # The diff from the general `prepare_config_and_inputs_for_generate` lies here:
        # `pixel_values` is sliced by number of patches instead of by batch index.
        patch_size = config.vision_config.patch_size
        filtered_image_length = batch_size * (self.model_tester.image_size**2) // (patch_size**2)
        filtered_inputs_dict = {
            k: v[:batch_size, ...] if isinstance(v, torch.Tensor) else v
            for k, v in inputs_dict.items()
            if k not in input_keys_to_ignore
        }
        filtered_inputs_dict["pixel_values"] = inputs_dict["pixel_values"][:filtered_image_length]

        # It is important to set `eos_token_id` to `None` to avoid early stopping (would break length-based checks)
        text_gen_config = config.get_text_config(decoder=True)
        if text_gen_config.eos_token_id is not None and text_gen_config.pad_token_id is None:
            text_gen_config.pad_token_id = (
                text_gen_config.eos_token_id
                if isinstance(text_gen_config.eos_token_id, int)
                else text_gen_config.eos_token_id[0]
            )
        text_gen_config.eos_token_id = None
        text_gen_config.forced_eos_token_id = None

        return config, filtered_inputs_dict

    @unittest.skip(reason="No available kernels - not supported")
    def test_sdpa_can_dispatch_on_flash(self):
        pass

    # The signature accepts the two values of each `parameterized.expand` tuple, so the test
    # stays callable if the skip is ever lifted.
    @parameterized.expand([("greedy", 1), ("beam search", 2)])
    @unittest.skip("Cannot generate from inputs embeds with pixel values")
    def test_generate_from_inputs_embeds(self, _, num_beams):
        pass

    @unittest.skip(reason="Size mismatch")
    def test_multi_gpu_data_parallel_forward(self):
        pass

    @unittest.skip(reason="We cannot configure to output a smaller model.")
    def test_model_is_small(self):
        pass

    @unittest.skip("Cannot generate from inputs embeds with pixel values")
    def test_generate_from_inputs_embeds_with_static_cache(self):
        pass

    # The multimodal base model embeds will not match ids, due to pixel values. We can't change base test
    # because in some models `pixel_values` are required. Will be fixed when we add support for merging `embeds+pixels`
    # TODO: @raushan

    def test_inputs_embeds(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))

            # Feed text embeddings only: ids and all image inputs are removed.
            input_ids = inputs["input_ids"]
            del inputs["input_ids"]
            del inputs["pixel_values"]
            del inputs["image_grid_thw"]

            wte = model.get_input_embeddings()
            inputs["inputs_embeds"] = wte(input_ids)
            # Smoke test: the forward pass only has to run, its output is not inspected.
            with torch.no_grad():
                model(**inputs)[0]

    def test_inputs_embeds_matches_input_ids(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            inputs = self._prepare_for_class(inputs_dict, model_class)
            input_ids = inputs["input_ids"]
            del inputs["input_ids"]
            del inputs["pixel_values"]
            del inputs["image_grid_thw"]

            inputs_embeds = model.get_input_embeddings()(input_ids)

            # Without image inputs, passing ids and passing their embeddings must give the same output.
            with torch.no_grad():
                out_ids = model(input_ids=input_ids, **inputs)[0]
                out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
            torch.testing.assert_close(out_embeds, out_ids)
|
||||||
|
|
||||||
|
|
||||||
|
@unittest.skip("Model checkpoint not yet released")
@require_torch
class Glm4vIntegrationTest(unittest.TestCase):
    # Single source of truth for the checkpoint so the processor and the model always match.
    model_id = "THUDM/GLM-4.1V-9B-Thinking"

    # NOTE(review): the image URL, the expected token ids and the expected generations below look carried
    # over from the Qwen2-VL tests (the reference text mentions "Qwen" / "Alibaba Cloud") -- presumably
    # they must be regenerated with the real checkpoint before this class is un-skipped.

    def setUp(self):
        self.processor = AutoProcessor.from_pretrained(self.model_id)
        self.messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "What kind of dog is this?"},
                ],
            }
        ]
        url = "https://qianwen-res.oss-accelerate-overseas.aliyuncs.com/Qwen2-VL/demo_small.jpg"
        self.image = Image.open(requests.get(url, stream=True).raw)

    def tearDown(self):
        # Release memory between tests; each test loads the full model.
        gc.collect()
        torch.cuda.empty_cache()

    @slow
    def test_small_model_integration_test(self):
        model = Glm4vForConditionalGeneration.from_pretrained(self.model_id, torch_dtype="auto", device_map="auto")

        text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
        inputs = self.processor(text=[text], images=[self.image], return_tensors="pt")

        expected_input_ids = [151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 151652, 151655, 151655]  # fmt: skip
        assert expected_input_ids == inputs.input_ids[0].tolist()[:17]

        expected_pixel_slice = torch.tensor(
            [
                [0.8792, 0.8792, 0.9084],
                [1.1858, 1.1858, 1.2296],
                [1.2004, 1.2004, 1.2150],
                [1.4340, 1.4340, 1.4194],
                [1.3902, 1.4048, 1.4194],
                [1.5216, 1.5362, 1.5362],
            ],
            dtype=torch.float32,
            device="cpu",
        )
        assert torch.allclose(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=3e-3)

        # verify generation
        inputs = inputs.to(torch_device)

        output = model.generate(**inputs, max_new_tokens=30)
        EXPECTED_DECODED_TEXT = "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices"

        self.assertEqual(
            self.processor.decode(output[0], skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    @slow
    def test_small_model_integration_test_batch(self):
        model = Glm4vForConditionalGeneration.from_pretrained(self.model_id, torch_dtype="auto", device_map="auto")
        text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
        inputs = self.processor(text=[text, text], images=[self.image, self.image], return_tensors="pt").to(
            torch_device
        )

        # it should not matter whether two images are the same size or not
        output = model.generate(**inputs, max_new_tokens=30)

        EXPECTED_DECODED_TEXT = [
            'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
            'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
        ]  # fmt: skip
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    @slow
    def test_small_model_integration_test_expand(self):
        model = Glm4vForConditionalGeneration.from_pretrained(self.model_id, torch_dtype="auto", device_map="auto")
        text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
        inputs = self.processor(text=[text], images=[self.image], return_tensors="pt").to(torch_device)

        output = model.generate(**inputs, max_new_tokens=30, num_return_sequences=3)

        EXPECTED_DECODED_TEXT = [
            'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
            'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
            'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
        ]  # fmt: skip
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    @slow
    def test_small_model_integration_test_batch_wo_image(self):
        model = Glm4vForConditionalGeneration.from_pretrained(self.model_id, torch_dtype="auto", device_map="auto")
        text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
        messages2 = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Who are you?"},
        ]
        text2 = self.processor.apply_chat_template(messages2, tokenize=False, add_generation_prompt=True)
        inputs = self.processor(text=[text, text2], images=[self.image], padding=True, return_tensors="pt").to(
            torch_device
        )

        # it should not matter whether two images are the same size or not
        output = model.generate(**inputs, max_new_tokens=30)

        EXPECTED_DECODED_TEXT = [
            'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
            'system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\nI am a large language model created by Alibaba Cloud. I am called Qwen.'
        ]  # fmt: skip
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    @slow
    def test_small_model_integration_test_batch_different_resolutions(self):
        model = Glm4vForConditionalGeneration.from_pretrained(self.model_id, torch_dtype="auto", device_map="auto")
        text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
        text2 = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
        image2 = self.image.resize((224, 224))
        inputs = self.processor(text=[text, text2], images=[self.image, image2], padding=True, return_tensors="pt").to(
            torch_device
        )

        # it should not matter whether two images are the same size or not
        output = model.generate(**inputs, max_new_tokens=30)

        EXPECTED_DECODED_TEXT = [
            'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
            'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets'
        ]  # fmt: skip
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    @slow
    @require_flash_attn
    @require_torch_gpu
    def test_small_model_integration_test_batch_flashatt2(self):
        model = Glm4vForConditionalGeneration.from_pretrained(
            self.model_id,
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2",
            device_map="auto",
        )
        text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
        inputs = self.processor(text=[text, text], images=[self.image, self.image], return_tensors="pt").to(
            torch_device
        )

        # it should not matter whether two images are the same size or not
        output = model.generate(**inputs, max_new_tokens=30)

        EXPECTED_DECODED_TEXT = [
            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
        ]
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    @slow
    @require_flash_attn
    @require_torch_gpu
    def test_small_model_integration_test_batch_wo_image_flashatt2(self):
        model = Glm4vForConditionalGeneration.from_pretrained(
            self.model_id,
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2",
            device_map="auto",
        )
        text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
        messages2 = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Who are you?"},
        ]
        text2 = self.processor.apply_chat_template(messages2, tokenize=False, add_generation_prompt=True)
        inputs = self.processor(text=[text, text2], images=[self.image], padding=True, return_tensors="pt").to(
            torch_device
        )

        # it should not matter whether two images are the same size or not
        output = model.generate(**inputs, max_new_tokens=30)

        EXPECTED_DECODED_TEXT = [
            'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
            'system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\nI am a large language model created by Alibaba Cloud. I am called Qwen.'
        ]  # fmt: skip

        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )
|
||||||
330
tests/models/glm4v/test_video_processing_glm4v.py
Normal file
330
tests/models/glm4v/test_video_processing_glm4v.py
Normal file
@@ -0,0 +1,330 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2025 HuggingFace Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD
|
||||||
|
from transformers.testing_utils import require_torch, require_vision
|
||||||
|
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||||
|
|
||||||
|
from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
|
||||||
|
|
||||||
|
|
||||||
|
if is_torch_available():
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
if is_vision_available():
|
||||||
|
if is_torchvision_available():
|
||||||
|
from transformers import Glm4vVideoProcessor
|
||||||
|
from transformers.models.glm4v.video_processing_glm4v import smart_resize
|
||||||
|
|
||||||
|
|
||||||
|
class Glm4vVideoProcessingTester:
    """Holds the configuration and input/expectation builders used by the GLM-4V video processor tests."""

    def __init__(
        self,
        parent,
        batch_size=5,
        num_frames=8,
        num_channels=3,
        min_resolution=30,
        max_resolution=80,
        temporal_patch_size=2,
        patch_size=14,
        merge_size=2,
        do_resize=True,
        size=None,
        do_normalize=True,
        image_mean=IMAGENET_STANDARD_MEAN,
        image_std=IMAGENET_STANDARD_STD,
        do_convert_rgb=True,
    ):
        # `size` defaults to None so the dict default is not shared between instances.
        size = size if size is not None else {"longest_edge": 20}
        self.parent = parent
        self.batch_size = batch_size
        self.num_frames = num_frames
        self.num_channels = num_channels
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.do_resize = do_resize
        self.size = size
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std
        self.do_convert_rgb = do_convert_rgb
        self.temporal_patch_size = temporal_patch_size
        self.patch_size = patch_size
        self.merge_size = merge_size

    def prepare_video_processor_dict(self):
        """Return the keyword arguments used to build the video processor under test."""
        return {
            "do_resize": self.do_resize,
            "size": self.size,
            "do_normalize": self.do_normalize,
            "image_mean": self.image_mean,
            "image_std": self.image_std,
            "do_convert_rgb": self.do_convert_rgb,
            "do_sample_frames": True,
        }

    def prepare_video_metadata(self, videos):
        """Build one metadata dict (``fps``, ``duration``, ``total_frames``) per video.

        The frame count is the list length for list inputs, the first axis for 4-D array-likes,
        1 for other array-likes, and ``self.num_frames`` otherwise. ``fps`` is fixed at 2.
        """
        video_metadata = []
        for video in videos:
            if isinstance(video, list):
                num_frames = len(video)
            elif hasattr(video, "shape"):
                # 4-D inputs are treated as frame-major, i.e. the first axis counts frames.
                num_frames = video.shape[0] if len(video.shape) == 4 else 1
            else:
                num_frames = self.num_frames

            video_metadata.append(
                {
                    "fps": 2,
                    "duration": num_frames / 2,
                    "total_frames": num_frames,
                }
            )
        return video_metadata

    def expected_output_video_shape(self, videos):
        """Return ``[seq_len, hidden_dim]`` expected from processing ``videos``.

        ``seq_len`` sums ``grid_t * grid_h * grid_w`` over all videos, where the spatial grid comes
        from ``smart_resize`` and ``grid_t`` is derived from ``self.num_frames`` (not from each
        video's own frame count).
        """
        grid_t = self.num_frames // self.temporal_patch_size
        hidden_dim = self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size
        seq_len = 0
        for video in videos:
            if isinstance(video, list) and isinstance(video[0], Image.Image):
                video = np.stack([np.array(frame) for frame in video])
            elif not hasattr(video, "shape"):
                video = np.array(video)

            ndim = len(video.shape)
            if ndim == 4:
                # NOTE(review): the first three axes are read as (frames, height, width); that holds
                # for channels-last inputs -- confirm this is intended for channels-first tensors.
                t, height, width = video.shape[:3]
            elif ndim == 3:
                height, width = video.shape[:2]
                t = 1
            else:
                # Anything else falls back to the tester defaults.
                t, height, width = self.num_frames, self.min_resolution, self.min_resolution

            resized_height, resized_width = smart_resize(
                t,
                height,
                width,
                factor=self.patch_size * self.merge_size,
            )
            grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
            seq_len += grid_t * grid_h * grid_w
        return [seq_len, hidden_dim]

    def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
        """Create a batch of videos with this tester's settings via the shared ``prepare_video_inputs`` helper."""
        return prepare_video_inputs(
            batch_size=self.batch_size,
            num_frames=self.num_frames,
            num_channels=self.num_channels,
            min_resolution=self.min_resolution,
            max_resolution=self.max_resolution,
            equal_resolution=equal_resolution,
            return_tensors=return_tensors,
        )
|
||||||
|
|
||||||
|
|
||||||
|
@require_torch
@require_vision
class Glm4vVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
    """Shape and configuration tests for ``Glm4vVideoProcessor``."""

    fast_video_processing_class = Glm4vVideoProcessor if is_torchvision_available() else None
    input_name = "pixel_values_videos"

    def setUp(self):
        super().setUp()
        # A fresh tester per test keeps attribute tweaks made by one test from leaking into others.
        self.video_processor_tester = Glm4vVideoProcessingTester(self)

    @property
    def video_processor_dict(self):
        return self.video_processor_tester.prepare_video_processor_dict()

    def _assert_single_and_batched_shapes(self, video_processing, call_inputs, reference_inputs, video_metadata):
        """Process the first video alone, then the whole batch, and check both output shapes.

        ``call_inputs`` is what gets passed to the processor; ``reference_inputs`` is what the tester
        uses to compute the expected shape (they differ only for the nested-list test).
        """
        tester = self.video_processor_tester

        # Unbatched: a single video with its single metadata entry.
        encoded_videos = video_processing(
            call_inputs[0], video_metadata=[video_metadata[0]], return_tensors="pt"
        )[self.input_name]
        expected_output_video_shape = tester.expected_output_video_shape([reference_inputs[0]])
        self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)

        # Batched: every video at once.
        encoded_videos = video_processing(call_inputs, video_metadata=video_metadata, return_tensors="pt")[
            self.input_name
        ]
        expected_output_video_shape = tester.expected_output_video_shape(reference_inputs)
        self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)

    def test_video_processor_from_dict_with_kwargs(self):
        video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict)
        self.assertEqual(video_processor.size, {"longest_edge": 20})

        video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42)
        self.assertEqual(video_processor.size, {"height": 42, "width": 42})

    def test_call_pil(self):
        for video_processing_class in self.video_processor_list:
            video_processing = video_processing_class(**self.video_processor_dict)
            video_inputs = self.video_processor_tester.prepare_video_inputs(
                equal_resolution=False, return_tensors="pil"
            )

            for video in video_inputs:
                self.assertIsInstance(video[0], Image.Image)

            video_metadata = self.video_processor_tester.prepare_video_metadata(video_inputs)
            self._assert_single_and_batched_shapes(video_processing, video_inputs, video_inputs, video_metadata)

    def test_call_numpy(self):
        for video_processing_class in self.video_processor_list:
            video_processing = video_processing_class(**self.video_processor_dict)
            video_inputs = self.video_processor_tester.prepare_video_inputs(
                equal_resolution=False, return_tensors="np"
            )

            video_metadata = self.video_processor_tester.prepare_video_metadata(video_inputs)
            self._assert_single_and_batched_shapes(video_processing, video_inputs, video_inputs, video_metadata)

    def test_call_pytorch(self):
        for video_processing_class in self.video_processor_list:
            video_processing = video_processing_class(**self.video_processor_dict)
            # NOTE(review): this passes "pt" while test_call_sample_frames passes "torch" to the same
            # helper -- confirm which spelling the shared helper actually recognises.
            video_inputs = self.video_processor_tester.prepare_video_inputs(
                equal_resolution=False, return_tensors="pt"
            )
            video_metadata = self.video_processor_tester.prepare_video_metadata(video_inputs)
            self._assert_single_and_batched_shapes(video_processing, video_inputs, video_inputs, video_metadata)

    @unittest.skip("Skip for now, the test needs adjustment for GLM-4.1V")
    def test_call_numpy_4_channels(self):
        for video_processing_class in self.video_processor_list:
            # Test that can process videos which have an arbitrary number of channels
            # Initialize video_processing
            video_processor = video_processing_class(**self.video_processor_dict)

            # create random numpy tensors
            self.video_processor_tester.num_channels = 4
            video_inputs = self.video_processor_tester.prepare_video_inputs(
                equal_resolution=False, return_tensors="np"
            )

            # Test not batched input
            encoded_videos = video_processor(
                video_inputs[0],
                return_tensors="pt",
                input_data_format="channels_last",
                image_mean=0,
                image_std=1,
            )[self.input_name]
            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
            self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)

            # Test batched
            encoded_videos = video_processor(
                video_inputs,
                return_tensors="pt",
                input_data_format="channels_last",
                image_mean=0,
                image_std=1,
            )[self.input_name]
            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
            self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)

    def test_nested_input(self):
        """Tests that the processor can work with nested list where each video is a list of arrays"""
        for video_processing_class in self.video_processor_list:
            video_processing = video_processing_class(**self.video_processor_dict)
            video_inputs = self.video_processor_tester.prepare_video_inputs(
                equal_resolution=False, return_tensors="np"
            )

            video_inputs_nested = [list(video) for video in video_inputs]
            video_metadata = self.video_processor_tester.prepare_video_metadata(video_inputs)

            # The processor receives the nested lists; expected shapes come from the original arrays.
            self._assert_single_and_batched_shapes(
                video_processing, video_inputs_nested, video_inputs, video_metadata
            )

    def test_call_sample_frames(self):
        for video_processing_class in self.video_processor_list:
            video_processor_dict = self.video_processor_dict.copy()
            video_processing = video_processing_class(**video_processor_dict)

            tester = self.video_processor_tester
            prev_num_frames = tester.num_frames
            prev_min_resolution = getattr(tester, "min_resolution", None)
            prev_max_resolution = getattr(tester, "max_resolution", None)
            tester.num_frames = 8
            tester.min_resolution = 56
            tester.max_resolution = 112

            try:
                video_inputs = tester.prepare_video_inputs(
                    equal_resolution=False,
                    return_tensors="torch",
                )

                # NOTE(review): the key here is "total_num_frames" whereas
                # Glm4vVideoProcessingTester.prepare_video_metadata emits "total_frames" -- confirm
                # which key the processor reads.
                metadata = [[{"total_num_frames": 8, "fps": 4}]]
                batched_metadata = metadata * len(video_inputs)

                encoded_videos = video_processing(video_inputs[0], return_tensors="pt", video_metadata=metadata)[
                    self.input_name
                ]
                encoded_videos_batched = video_processing(
                    video_inputs, return_tensors="pt", video_metadata=batched_metadata
                )[self.input_name]

                self.assertIsNotNone(encoded_videos)
                self.assertIsNotNone(encoded_videos_batched)
                self.assertEqual(len(encoded_videos.shape), 2)
                self.assertEqual(len(encoded_videos_batched.shape), 2)

                # With frame sampling enabled, calling without metadata is expected to be rejected.
                with self.assertRaises(ValueError):
                    video_processing(video_inputs[0], return_tensors="pt")[self.input_name]
            finally:
                # Restore the tester even when an assertion above fails, so later loop iterations
                # start from the original settings.
                tester.num_frames = prev_num_frames
                if prev_min_resolution is not None:
                    tester.min_resolution = prev_min_resolution
                if prev_max_resolution is not None:
                    tester.max_resolution = prev_max_resolution
|
||||||
@@ -91,6 +91,7 @@ PRIVATE_MODELS = [
|
|||||||
"AriaTextModel",
|
"AriaTextModel",
|
||||||
"Phi4MultimodalAudioModel",
|
"Phi4MultimodalAudioModel",
|
||||||
"Phi4MultimodalVisionModel",
|
"Phi4MultimodalVisionModel",
|
||||||
|
"Glm4vVisionModel",
|
||||||
]
|
]
|
||||||
|
|
||||||
# Update this list for models that are not tested with a comment explaining the reason it should not be.
|
# Update this list for models that are not tested with a comment explaining the reason it should not be.
|
||||||
@@ -155,6 +156,7 @@ IGNORE_NON_TESTED = (
|
|||||||
"Llama4VisionModel", # Building part of bigger (tested) model. # TODO: add tests
|
"Llama4VisionModel", # Building part of bigger (tested) model. # TODO: add tests
|
||||||
"Emu3VQVAE", # Building part of bigger (tested) model
|
"Emu3VQVAE", # Building part of bigger (tested) model
|
||||||
"Emu3TextModel", # Building part of bigger (tested) model
|
"Emu3TextModel", # Building part of bigger (tested) model
|
||||||
|
"Glm4vTextModel", # Building part of bigger (tested) model
|
||||||
"Qwen2VLTextModel", # Building part of bigger (tested) model
|
"Qwen2VLTextModel", # Building part of bigger (tested) model
|
||||||
"Qwen2_5_VLTextModel", # Building part of bigger (tested) model
|
"Qwen2_5_VLTextModel", # Building part of bigger (tested) model
|
||||||
"InternVLVisionModel", # Building part of bigger (tested) model
|
"InternVLVisionModel", # Building part of bigger (tested) model
|
||||||
|
|||||||
Reference in New Issue
Block a user