fix Glm4v batch videos forward (#39172)

* changes for video

* update modular

* change get_video_features

* update video token replacement

* update modular

* add test and fix typo

* lint

* fix order

* lint

* fix

* remove dependency

* lint

* lint

* remove todo

* resize video for test

* lint..

* fix test

* create a new processor for video_test

* fix test
This commit is contained in:
Kingsley
2025-07-10 16:44:28 +08:00
committed by GitHub
parent bc161d5d06
commit 520b9dcb42
6 changed files with 127 additions and 23 deletions

View File

@@ -13,6 +13,7 @@
# limitations under the License.
"""Testing suite for the PyTorch GLM-4.1V model."""
import copy
import gc
import unittest
@@ -236,7 +237,26 @@ class Glm4vModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase)
def test_generate_from_inputs_embeds_with_static_cache(self):
pass
# RoPE index doesn't match when using embeddings
def test_inputs_embeds(self):
    # Smoke test: every model class must accept `inputs_embeds` in place of
    # `input_ids` once the image-specific inputs have been removed.
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_model_classes:
        model = model_class(config).to(torch_device).eval()

        # Work on a deep copy so the shared `inputs_dict` is left untouched
        # for the remaining model classes.
        model_inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
        token_ids = model_inputs.pop("input_ids")
        for image_key in ("pixel_values", "image_grid_thw"):
            del model_inputs[image_key]

        # Embed the token ids with the model's own input embedding layer.
        embedding_layer = model.get_input_embeddings()
        model_inputs["inputs_embeds"] = embedding_layer(token_ids)

        with torch.no_grad():
            # Only checks that the forward pass runs and yields a first output.
            _ = model(**model_inputs)[0]
def test_inputs_embeds_matches_input_ids(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -350,6 +370,44 @@ class Glm4vIntegrationTest(unittest.TestCase):
EXPECTED_DECODED_TEXT,
)
@slow
def test_small_model_integration_test_with_video(self):
    # Batched video generation: the same clip and prompt are submitted twice
    # and both decoded continuations are compared against a fixed reference.
    checkpoint = "THUDM/GLM-4.1V-9B-Thinking"
    batch_size = 2
    prompt = "Describe this video."
    clip_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"

    # A dedicated processor is built here with an explicit `max_image_size`.
    processor = AutoProcessor.from_pretrained(checkpoint, max_image_size={"longest_edge": 50176})
    model = Glm4vForConditionalGeneration.from_pretrained(
        checkpoint, torch_dtype=torch.float16, device_map="auto"
    )

    # One single-turn conversation per batch entry: a video followed by the text prompt.
    conversations = []
    for _ in range(batch_size):
        user_turn = {
            "role": "user",
            "content": [
                {"type": "video", "video": clip_url},
                {"type": "text", "text": prompt},
            ],
        }
        conversations.append([user_turn])

    batch = processor.apply_chat_template(
        conversations,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
        padding=True,
    )
    batch = batch.to(torch_device)

    generated = model.generate(**batch, max_new_tokens=30)
    decoded = processor.batch_decode(generated, skip_special_tokens=True)

    EXPECTED_DECODED_TEXT = [
        "\n012345Describe this video.\n<think>Got it, let's analyze the video. First, the scene is a room with a wooden floor, maybe a traditional Japanese room with tatami",
        "\n012345Describe this video.\n<think>Got it, let's analyze the video. First, the scene is a room with a wooden floor, maybe a traditional Japanese room with tatami"
    ]  # fmt: skip
    self.assertEqual(decoded, EXPECTED_DECODED_TEXT)
@slow
def test_small_model_integration_test_expand(self):
model = Glm4vForConditionalGeneration.from_pretrained(

View File

@@ -228,7 +228,7 @@ class Glm4vVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
@unittest.skip("Skip for now, the test needs adjustment fo GLM-4.1V")
@unittest.skip("Skip for now, the test needs adjustment for GLM-4.1V")
def test_call_numpy_4_channels(self):
for video_processing_class in self.video_processor_list:
# Test that can process videos which have an arbitrary number of channels