Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
51f94ea06d | ||
|
|
cdf04ff738 | ||
|
|
2842b82c61 | ||
|
|
24c6d5b082 | ||
|
|
222af35ca9 | ||
|
|
7c34e2c3cb | ||
|
|
66d32abcbf |
2
setup.py
2
setup.py
@@ -451,7 +451,7 @@ install_requires = [
|
||||
|
||||
setup(
|
||||
name="transformers",
|
||||
version="4.52.3", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
|
||||
version="4.52.4", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
|
||||
author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
|
||||
author_email="transformers@huggingface.co",
|
||||
description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
# to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
|
||||
# in the namespace without actually importing anything (and especially none of the backends).
|
||||
|
||||
__version__ = "4.52.3"
|
||||
__version__ = "4.52.4"
|
||||
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
@@ -319,7 +319,8 @@ def get_torch_context_manager_or_global_device():
|
||||
is not "cpu". This is used to infer the correct device to load the model on, in case `device_map` is not provided.
|
||||
"""
|
||||
device_in_context = torch.tensor([]).device
|
||||
default_device = torch.get_default_device()
|
||||
# `get_default_device` was only introduced in torch>=2.3 - use cpu otherwise to align the behavior
|
||||
default_device = torch.get_default_device() if is_torch_greater_or_equal("2.3") else torch.device("cpu")
|
||||
# This case means no context manager was used -> we still check if the default that was potentially set is not cpu
|
||||
if device_in_context == default_device:
|
||||
if default_device != torch.device("cpu"):
|
||||
@@ -3532,7 +3533,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
|
||||
for key, value in state_dict.items():
|
||||
for pattern, replacement in reverse_key_mapping.items():
|
||||
replacement = replacement.lstrip("^") # strip off un-needed chars and patterns
|
||||
replacement = re.sub(r"\(.*?\)", "", pattern)
|
||||
replacement = re.sub(r"\(.*\)", "", replacement)
|
||||
key, n_replace = re.subn(pattern, replacement, key)
|
||||
# Early exit of the loop
|
||||
if n_replace > 0:
|
||||
|
||||
@@ -154,7 +154,11 @@ class OPTAttention(nn.Module):
|
||||
"""Input shape: Batch x Time x Channel"""
|
||||
bsz, tgt_len, _ = hidden_states.size()
|
||||
|
||||
# get query proj
|
||||
# Scaling is susceptible to floating-point arithmetic imprecisions
|
||||
# which can lead to different results (this varies from model
|
||||
# to model, e.g. whisper is one such case). We therefore keep the
|
||||
# original order of scaling to follow the original implementation
|
||||
# and enforce no scaling (1.0) in the attention call below.
|
||||
query_states = self.q_proj(hidden_states) * self.scaling
|
||||
query_states = query_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
|
||||
|
||||
@@ -187,7 +191,7 @@ class OPTAttention(nn.Module):
|
||||
value_states,
|
||||
attention_mask,
|
||||
dropout=0.0 if not self.training else self.dropout,
|
||||
scaling=self.scaling,
|
||||
scaling=1.0,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
@@ -310,7 +310,8 @@ class PaliGemmaProcessor(ProcessorMixin):
|
||||
return_data = {**inputs, "pixel_values": pixel_values}
|
||||
|
||||
if return_token_type_ids:
|
||||
labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100)
|
||||
labels = np.array(inputs["input_ids"])
|
||||
labels[np.array(inputs["token_type_ids"]) == 0] = -100
|
||||
return_data.update({"labels": labels})
|
||||
return BatchFeature(data=return_data, tensor_type=return_tensors)
|
||||
|
||||
|
||||
@@ -1797,7 +1797,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.model = Qwen2_5_VLModel(config)
|
||||
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
||||
self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
|
||||
|
||||
self.post_init()
|
||||
|
||||
|
||||
@@ -1673,7 +1673,7 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin):
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.model = Qwen2VLModel(config)
|
||||
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
||||
self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
|
||||
|
||||
self.post_init()
|
||||
|
||||
|
||||
@@ -696,11 +696,13 @@ def group_videos_by_shape(
|
||||
grouped_videos_index = {}
|
||||
for i, video in enumerate(videos):
|
||||
shape = video.shape[-2::]
|
||||
num_frames = video.shape[-4] # video format BTCHW
|
||||
shape = (num_frames, *shape)
|
||||
if shape not in grouped_videos:
|
||||
grouped_videos[shape] = []
|
||||
grouped_videos[shape].append(video)
|
||||
grouped_videos_index[i] = (shape, len(grouped_videos[shape]) - 1)
|
||||
# stack videos with the same shape
|
||||
# stack videos with the same size and number of frames
|
||||
grouped_videos = {shape: torch.stack(videos, dim=0) for shape, videos in grouped_videos.items()}
|
||||
return grouped_videos, grouped_videos_index
|
||||
|
||||
|
||||
@@ -62,6 +62,20 @@ class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 112)
|
||||
|
||||
@require_torch
def test_call_with_suffix(self):
    """Check that passing `suffix` makes the processor return `labels` aligned with `input_ids`.

    The processor is called twice with the same text/image/suffix: once with default
    outputs and once with `return_tensors="pt"`. In both cases the output must contain
    a "labels" entry whose first row has the same length as the first "input_ids" row.
    """
    prompt = "lower newer"
    target = "upper older longer string"
    images = self.prepare_image_inputs()
    processor = self.get_processor()

    # First pass uses the default output type, second pass requests PyTorch tensors.
    for extra_kwargs in ({}, {"return_tensors": "pt"}):
        outputs = processor(text=prompt, images=images, suffix=target, **extra_kwargs)
        self.assertIn("labels", outputs)
        first_labels = outputs["labels"][0]
        first_input_ids = outputs["input_ids"][0]
        self.assertEqual(len(first_labels), len(first_input_ids))
|
||||
|
||||
def test_text_with_image_tokens(self):
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
@@ -30,7 +30,7 @@ from transformers.testing_utils import (
|
||||
require_torchvision,
|
||||
require_vision,
|
||||
)
|
||||
from transformers.video_utils import make_batched_videos
|
||||
from transformers.video_utils import group_videos_by_shape, make_batched_videos, reorder_videos
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
@@ -43,9 +43,9 @@ if is_vision_available():
|
||||
from transformers.video_utils import VideoMetadata, load_video
|
||||
|
||||
|
||||
def get_random_video(height, width, return_torch=False):
|
||||
def get_random_video(height, width, num_frames=8, return_torch=False):
|
||||
random_frame = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
|
||||
video = np.array(([random_frame] * 8))
|
||||
video = np.array(([random_frame] * num_frames))
|
||||
if return_torch:
|
||||
# move channel first
|
||||
return torch.from_numpy(video).permute(0, 3, 1, 2)
|
||||
@@ -189,6 +189,53 @@ class BaseVideoProcessorTester(unittest.TestCase):
|
||||
rgb_video = video_processor.convert_to_rgb(torch.cat([video, video[:, :1]], dim=1))
|
||||
self.assertEqual(rgb_video.shape, (8, 3, 20, 20))
|
||||
|
||||
def test_group_and_reorder_videos(self):
    """Tests that videos can be grouped by frame size and number of frames.

    Each scenario groups a list of videos with `group_videos_by_shape`, checks how many
    distinct groups were produced, then passes the result through `reorder_videos` and
    checks the number of videos returned and the shape of the first one.

    NOTE(review): the expected lengths after reordering assume `reorder_videos` returns
    one entry per input video (the grouping index holds one entry per input) - confirm
    against `reorder_videos`.
    """
    video_1 = get_random_video(20, 20, num_frames=3, return_torch=True)
    video_2 = get_random_video(20, 20, num_frames=5, return_torch=True)

    # Group two videos of same size but different number of frames
    grouped_videos, grouped_videos_index = group_videos_by_shape([video_1, video_2])
    self.assertEqual(len(grouped_videos), 2)

    regrouped_videos = reorder_videos(grouped_videos, grouped_videos_index)
    # `assertEqual`, not `assertTrue`: the second argument of `assertTrue` is only the
    # failure message, so `assertTrue(len(x), 2)` never compares the length to 2.
    self.assertEqual(len(regrouped_videos), 2)
    self.assertEqual(video_1.shape, regrouped_videos[0].shape)

    # Group two videos of different size but same number of frames
    video_3 = get_random_video(15, 20, num_frames=3, return_torch=True)
    grouped_videos, grouped_videos_index = group_videos_by_shape([video_1, video_3])
    self.assertEqual(len(grouped_videos), 2)

    regrouped_videos = reorder_videos(grouped_videos, grouped_videos_index)
    self.assertEqual(len(regrouped_videos), 2)
    self.assertEqual(video_1.shape, regrouped_videos[0].shape)

    # Group all three videos where some have same size or same frame count
    # But since none have frames and sizes identical, we'll have 3 groups
    grouped_videos, grouped_videos_index = group_videos_by_shape([video_1, video_2, video_3])
    self.assertEqual(len(grouped_videos), 3)

    regrouped_videos = reorder_videos(grouped_videos, grouped_videos_index)
    self.assertEqual(len(regrouped_videos), 3)
    self.assertEqual(video_1.shape, regrouped_videos[0].shape)

    # Group if we had some videos with identical shapes
    grouped_videos, grouped_videos_index = group_videos_by_shape([video_1, video_1, video_3])
    self.assertEqual(len(grouped_videos), 2)

    regrouped_videos = reorder_videos(grouped_videos, grouped_videos_index)
    # Two groups, but three videos went in, so three videos are expected back.
    self.assertEqual(len(regrouped_videos), 3)
    self.assertEqual(video_1.shape, regrouped_videos[0].shape)

    # Group if we had all videos with identical shapes
    grouped_videos, grouped_videos_index = group_videos_by_shape([video_1, video_1, video_1])
    self.assertEqual(len(grouped_videos), 1)

    regrouped_videos = reorder_videos(grouped_videos, grouped_videos_index)
    # One group, but three videos went in, so three videos are expected back.
    self.assertEqual(len(regrouped_videos), 3)
    self.assertEqual(video_1.shape, regrouped_videos[0].shape)
|
||||
|
||||
|
||||
@require_vision
|
||||
@require_av
|
||||
|
||||
Reference in New Issue
Block a user