Uniformize model processors (#31368)

* add initial design for uniform processors + align model

* add uniform processors for altclip + chinese_clip

* add uniform processors for blip + blip2

* fix mutable default 👀

* add configuration test

* handle structured kwargs w defaults + add test

* protect torch-specific test

* fix style

* fix

* rebase

* update processor to generic kwargs + test

* fix style

* add sensible kwargs merge

* update test

* fix assertEqual

* move kwargs merging to processing common

* rework kwargs for type hinting

* just get Unpack from extensions

* run-slow[align]

* handle kwargs passed as nested dict

* add from_pretrained test for nested kwargs handling

* [run-slow]align

* update documentation + imports

* update audio inputs

* protect audio types, silly

* try removing imports

* make things simpler

* simplerer

* move out kwargs test to common mixin

* [run-slow]align

* skip tests for old processors

* [run-slow]align, clip

* !$#@!! protect imports, darn it

* [run-slow]align, clip

* [run-slow]align, clip

* update common processor testing

* add altclip

* add chinese_clip

* add pad_size

* [run-slow]align, clip, chinese_clip, altclip

* remove duplicated tests

* fix

* add blip, blip2, bridgetower

Added tests for bridgetower which override common. Also modified common
tests to force center cropping if existing

* fix

* update doc

* improve documentation for default values

* add model_max_length testing

This parameter depends on tokenizers received.

* Raise if kwargs are specified in two places

* fix

* removed copied from

* match defaults

* force padding

* fix tokenizer test

* clean defaults

* move tests to common

* add missing import

* fix

* adapt bridgetower tests to shortest edge

* uniformize donut processor + tests

* add wav2vec2

* extend common testing to audio processors

* add testing + bert version

* propagate common kwargs to different modalities

* BC order of arguments

* check py version

* revert kwargs merging

* add draft overlap test

* update

* fix blip2 and wav2vec due to updates

* fix copies

* ensure overlapping kwargs do not disappear

* replace .pop by .get to handle duplicated kwargs

* fix copies

* fix missing import

* add clearly wav2vec2_bert to uniformized models

* fix copies

* increase number of features

* fix style

* [run-slow] blip, blip2, bridgetower, donut, wav2vec2, wav2vec2_bert

* [run-slow] blip, blip_2, bridgetower, donut, wav2vec2, wav2vec2_bert

* fix concatenation

* [run-slow] blip, blip_2, bridgetower, donut, wav2vec2, wav2vec2_bert

* Update tests/test_processing_common.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* 🧹

* address comments

* clean up + tests

* [run-slow] instructblip, blip, blip_2, bridgetower, donut, wav2vec2, wav2vec2_bert

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
Pablo Montalvo
2024-10-02 10:41:08 +02:00
committed by GitHub
parent 2292be6c1b
commit 50290cf7a0
18 changed files with 769 additions and 273 deletions

View File

@@ -16,6 +16,7 @@
import inspect
import json
import random
import tempfile
from typing import Optional
@@ -31,11 +32,7 @@ from transformers.testing_utils import (
from transformers.utils import is_vision_available
try:
from typing import Unpack
except ImportError:
from typing_extensions import Unpack
global_rng = random.Random()
if is_vision_available():
from PIL import Image
@@ -48,6 +45,21 @@ def prepare_image_inputs():
return image_inputs
# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list
def floats_list(shape, scale=1.0, rng=None, name=None):
"""Creates a random float32 tensor"""
if rng is None:
rng = global_rng
values = []
for batch_idx in range(shape[0]):
values.append([])
for _ in range(shape[1]):
values[-1].append(rng.random() * scale)
return values
@require_torch
@require_vision
class ProcessorTesterMixin:
@@ -333,6 +345,135 @@ class ProcessorTesterMixin:
self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0)
self.assertEqual(inputs[self.text_input_name].shape[-1], 76)
# text + audio kwargs testing
@require_torch
def test_tokenizer_defaults_preserved_by_kwargs_audio(self):
if "feature_extractor" not in self.processor_class.attributes:
self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
feature_extractor = self.get_component("feature_extractor")
if hasattr(self, "get_tokenizer"):
tokenizer = self.get_tokenizer(max_length=117, padding="max_length")
elif hasattr(self, "get_component"):
tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
else:
self.assertTrue(False, "Processor doesn't have get_tokenizer or get_component defined")
if not tokenizer.pad_token:
tokenizer.pad_token = "[TEST_PAD]"
processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
raw_speech = floats_list((3, 1000))
inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt")
if "input_ids" in inputs:
self.assertEqual(len(inputs["input_ids"][0]), 117)
elif "labels" in inputs:
self.assertEqual(len(inputs["labels"][0]), 117)
@require_torch
def test_kwargs_overrides_default_tokenizer_kwargs_audio(self):
if "feature_extractor" not in self.processor_class.attributes:
self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
feature_extractor = self.get_component("feature_extractor")
if hasattr(self, "get_tokenizer"):
tokenizer = self.get_tokenizer(max_length=117)
elif hasattr(self, "get_component"):
tokenizer = self.get_component("tokenizer", max_length=117)
if not tokenizer.pad_token:
tokenizer.pad_token = "[TEST_PAD]"
processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
raw_speech = floats_list((3, 1000))
inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt", max_length=112, padding="max_length")
if "input_ids" in inputs:
self.assertEqual(len(inputs["input_ids"][0]), 112)
elif "labels" in inputs:
self.assertEqual(len(inputs["labels"][0]), 112)
@require_torch
def test_unstructured_kwargs_audio(self):
if "feature_extractor" not in self.processor_class.attributes:
self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
feature_extractor = self.get_component("feature_extractor")
if hasattr(self, "get_tokenizer"):
tokenizer = self.get_tokenizer(max_length=117)
elif hasattr(self, "get_component"):
tokenizer = self.get_component("tokenizer", max_length=117)
if not tokenizer.pad_token:
tokenizer.pad_token = "[TEST_PAD]"
processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
raw_speech = floats_list((3, 1000))
inputs = processor(
text=input_str,
audio=raw_speech,
return_tensors="pt",
padding="max_length",
max_length=76,
)
if "input_ids" in inputs:
self.assertEqual(len(inputs["input_ids"][0]), 76)
elif "labels" in inputs:
self.assertEqual(len(inputs["labels"][0]), 76)
@require_torch
def test_doubly_passed_kwargs_audio(self):
if "feature_extractor" not in self.processor_class.attributes:
self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
feature_extractor = self.get_component("feature_extractor")
if hasattr(self, "get_tokenizer"):
tokenizer = self.get_tokenizer()
elif hasattr(self, "get_component"):
tokenizer = self.get_component("tokenizer")
if not tokenizer.pad_token:
tokenizer.pad_token = "[TEST_PAD]"
processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor)
self.skip_processor_without_typed_kwargs(processor)
input_str = ["lower newer"]
raw_speech = floats_list((3, 1000))
with self.assertRaises(ValueError):
_ = processor(
text=input_str,
audio=raw_speech,
audio_kwargs={"padding": "max_length"},
padding="max_length",
)
@require_torch
@require_vision
def test_structured_kwargs_audio_nested(self):
if "feature_extractor" not in self.processor_class.attributes:
self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
feature_extractor = self.get_component("feature_extractor")
if hasattr(self, "get_tokenizer"):
tokenizer = self.get_tokenizer()
elif hasattr(self, "get_component"):
tokenizer = self.get_component("tokenizer")
if not tokenizer.pad_token:
tokenizer.pad_token = "[TEST_PAD]"
processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor)
self.skip_processor_without_typed_kwargs(processor)
input_str = ["lower newer"]
raw_speech = floats_list((3, 1000))
# Define the kwargs for each modality
all_kwargs = {
"common_kwargs": {"return_tensors": "pt"},
"text_kwargs": {"padding": "max_length", "max_length": 76},
"audio_kwargs": {"padding": "max_length", "max_length": 66},
}
inputs = processor(text=input_str, audio=raw_speech, **all_kwargs)
if "input_ids" in inputs:
self.assertEqual(len(inputs["input_ids"][0]), 76)
elif "labels" in inputs:
self.assertEqual(len(inputs["labels"][0]), 76)
# TODO: the same test, but for audio + text processors that have strong overlap in kwargs
# TODO (molbap) use the same structure of attribute kwargs for other tests to avoid duplication
def test_overlapping_text_kwargs_handling(self):