Compare commits

...

7 Commits

Author SHA1 Message Date
Lysandre
51f94ea06d Release: v4.52.4
Some checks failed
Release - Conda / build_and_package (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
2025-05-30 11:14:55 +02:00
Raushan Turganbay
cdf04ff738 [qwen-vl] Look for vocab size in text config (#38372)
fix qwen
2025-05-30 11:05:43 +02:00
hoshi-hiyouga
2842b82c61 Fix convert to original state dict for VLMs (#38385)
* fix convert to original state dict

* fix

* lint

* Update modeling_utils.py
2025-05-28 09:31:32 +02:00
Raushan Turganbay
24c6d5b082 [video utils] group and reorder by number of frames (#38374)
fix
2025-05-28 09:31:32 +02:00
Raushan Turganbay
222af35ca9 [paligemma] fix processor with suffix (#38365)
fix pg processor
2025-05-28 09:31:32 +02:00
Cyril Vallez
7c34e2c3cb Protect get_default_device for torch<2.3 (#38376)
* Update modeling_utils.py

* CIs
2025-05-28 09:31:32 +02:00
Anton Vlasjuk
66d32abcbf [OPT] Fix attention scaling (#38290)
* fix opt attention scaling

* add comment to why we do this
2025-05-28 09:31:31 +02:00
10 changed files with 82 additions and 13 deletions

View File

@@ -451,7 +451,7 @@ install_requires = [
setup(
name="transformers",
version="4.52.3", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
version="4.52.4", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
author_email="transformers@huggingface.co",
description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",

View File

@@ -18,7 +18,7 @@
# to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
# in the namespace without actually importing anything (and especially none of the backends).
__version__ = "4.52.3"
__version__ = "4.52.4"
from pathlib import Path
from typing import TYPE_CHECKING

View File

@@ -319,7 +319,8 @@ def get_torch_context_manager_or_global_device():
is not "cpu". This is used to infer the correct device to load the model on, in case `device_map` is not provided.
"""
device_in_context = torch.tensor([]).device
default_device = torch.get_default_device()
# `get_default_device` was only introduced in torch>=2.3 - use cpu otherwise to align the behavior
default_device = torch.get_default_device() if is_torch_greater_or_equal("2.3") else torch.device("cpu")
# This case means no context manager was used -> we still check if the default that was potentially set is not cpu
if device_in_context == default_device:
if default_device != torch.device("cpu"):
@@ -3532,7 +3533,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
for key, value in state_dict.items():
for pattern, replacement in reverse_key_mapping.items():
replacement = replacement.lstrip("^") # strip off un-needed chars and patterns
replacement = re.sub(r"\(.*?\)", "", pattern)
replacement = re.sub(r"\(.*\)", "", replacement)
key, n_replace = re.subn(pattern, replacement, key)
# Early exit of the loop
if n_replace > 0:

View File

@@ -154,7 +154,11 @@ class OPTAttention(nn.Module):
"""Input shape: Batch x Time x Channel"""
bsz, tgt_len, _ = hidden_states.size()
# get query proj
# Scaling is susceptible to floating point arithmetic imprecisions
# which can lead to different results (this varies from model
# to model, e.g. whisper is one such case). We therefore keep the
# original order of scaling to follow the original implementation
# and enforce no scaling (1.0) in the attention call below.
query_states = self.q_proj(hidden_states) * self.scaling
query_states = query_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
@@ -187,7 +191,7 @@ class OPTAttention(nn.Module):
value_states,
attention_mask,
dropout=0.0 if not self.training else self.dropout,
scaling=self.scaling,
scaling=1.0,
**kwargs,
)

View File

@@ -310,7 +310,8 @@ class PaliGemmaProcessor(ProcessorMixin):
return_data = {**inputs, "pixel_values": pixel_values}
if return_token_type_ids:
labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100)
labels = np.array(inputs["input_ids"])
labels[np.array(inputs["token_type_ids"]) == 0] = -100
return_data.update({"labels": labels})
return BatchFeature(data=return_data, tensor_type=return_tensors)

View File

@@ -1797,7 +1797,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
def __init__(self, config):
super().__init__(config)
self.model = Qwen2_5_VLModel(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
self.post_init()

View File

@@ -1673,7 +1673,7 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin):
def __init__(self, config):
super().__init__(config)
self.model = Qwen2VLModel(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
self.post_init()

View File

@@ -696,11 +696,13 @@ def group_videos_by_shape(
grouped_videos_index = {}
for i, video in enumerate(videos):
shape = video.shape[-2::]
num_frames = video.shape[-4] # video format BTCHW
shape = (num_frames, *shape)
if shape not in grouped_videos:
grouped_videos[shape] = []
grouped_videos[shape].append(video)
grouped_videos_index[i] = (shape, len(grouped_videos[shape]) - 1)
# stack videos with the same shape
# stack videos with the same size and number of frames
grouped_videos = {shape: torch.stack(videos, dim=0) for shape, videos in grouped_videos.items()}
return grouped_videos, grouped_videos_index

View File

@@ -62,6 +62,20 @@ class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
)
self.assertEqual(len(inputs["input_ids"][0]), 112)
@require_torch
def test_call_with_suffix(self):
    """Check that passing `suffix` yields `labels` whose length matches `input_ids`.

    The processor is exercised twice: once without `return_tensors` and once
    with `return_tensors="pt"`, so both output paths are covered.
    """
    input_str = "lower newer"
    suffix = "upper older longer string"
    image_input = self.prepare_image_inputs()
    processor = self.get_processor()

    # The first call deliberately omits `return_tensors` (rather than passing None)
    # so that the processor's own default is what gets tested.
    for extra_kwargs in ({}, {"return_tensors": "pt"}):
        with self.subTest(**extra_kwargs):
            inputs = processor(text=input_str, images=image_input, suffix=suffix, **extra_kwargs)
            # assertIn reports the missing key and the container on failure.
            self.assertIn("labels", inputs)
            self.assertEqual(len(inputs["labels"][0]), len(inputs["input_ids"][0]))
def test_text_with_image_tokens(self):
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")

View File

@@ -30,7 +30,7 @@ from transformers.testing_utils import (
require_torchvision,
require_vision,
)
from transformers.video_utils import make_batched_videos
from transformers.video_utils import group_videos_by_shape, make_batched_videos, reorder_videos
if is_torch_available():
@@ -43,9 +43,9 @@ if is_vision_available():
from transformers.video_utils import VideoMetadata, load_video
def get_random_video(height, width, return_torch=False):
def get_random_video(height, width, num_frames=8, return_torch=False):
random_frame = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
video = np.array(([random_frame] * 8))
video = np.array(([random_frame] * num_frames))
if return_torch:
# move channel first
return torch.from_numpy(video).permute(0, 3, 1, 2)
@@ -189,6 +189,53 @@ class BaseVideoProcessorTester(unittest.TestCase):
rgb_video = video_processor.convert_to_rgb(torch.cat([video, video[:, :1]], dim=1))
self.assertEqual(rgb_video.shape, (8, 3, 20, 20))
def test_group_and_reorder_videos(self):
    """Tests that videos can be grouped by frame size and number of frames, then reordered.

    Each case groups a list of videos, checks the number of distinct groups, and
    then checks that `reorder_videos` gives back one video per input with the
    same shape at the same position.
    """
    video_1 = get_random_video(20, 20, num_frames=3, return_torch=True)
    video_2 = get_random_video(20, 20, num_frames=5, return_torch=True)
    video_3 = get_random_video(15, 20, num_frames=3, return_torch=True)

    # (description, input videos, expected number of groups)
    cases = [
        ("same size, different number of frames", [video_1, video_2], 2),
        ("different size, same number of frames", [video_1, video_3], 2),
        # Some share a size or a frame count, but no pair shares both -> 3 groups
        ("no two videos share both size and frame count", [video_1, video_2, video_3], 3),
        ("some videos with identical shapes", [video_1, video_1, video_3], 2),
        ("all videos with identical shapes", [video_1, video_1, video_1], 1),
    ]

    for description, videos, expected_num_groups in cases:
        with self.subTest(description):
            grouped_videos, grouped_videos_index = group_videos_by_shape(videos)
            self.assertEqual(len(grouped_videos), expected_num_groups)

            regrouped_videos = reorder_videos(grouped_videos, grouped_videos_index)
            # Compare against the number of *inputs*, not the number of groups.
            # `assertTrue(len(x), n)` would treat `n` as the failure message and
            # pass for any non-empty list, so an explicit equality check is needed.
            self.assertEqual(len(regrouped_videos), len(videos))
            # Every position must hold a video of the original shape, not just the first.
            for original, regrouped in zip(videos, regrouped_videos):
                self.assertEqual(original.shape, regrouped.shape)
@require_vision
@require_av