Uniformize model processors (#31368)
* add initial design for uniform processors + align model * add uniform processors for altclip + chinese_clip * add uniform processors for blip + blip2 * fix mutable default 👀 * add configuration test * handle structured kwargs w defaults + add test * protect torch-specific test * fix style * fix * rebase * update processor to generic kwargs + test * fix style * add sensible kwargs merge * update test * fix assertEqual * move kwargs merging to processing common * rework kwargs for type hinting * just get Unpack from extensions * run-slow[align] * handle kwargs passed as nested dict * add from_pretrained test for nested kwargs handling * [run-slow]align * update documentation + imports * update audio inputs * protect audio types, silly * try removing imports * make things simpler * simplerer * move out kwargs test to common mixin * [run-slow]align * skip tests for old processors * [run-slow]align, clip * !$#@!! protect imports, darn it * [run-slow]align, clip * [run-slow]align, clip * update common processor testing * add altclip * add chinese_clip * add pad_size * [run-slow]align, clip, chinese_clip, altclip * remove duplicated tests * fix * add blip, blip2, bridgetower Added tests for bridgetower which override common. Also modified common tests to force center cropping if existing * fix * update doc * improve documentation for default values * add model_max_length testing This parameter depends on tokenizers received. 
* Raise if kwargs are specified in two places * fix * removed copied from * match defaults * force padding * fix tokenizer test * clean defaults * move tests to common * add missing import * fix * adapt bridgetower tests to shortest edge * uniformize donut processor + tests * add wav2vec2 * extend common testing to audio processors * add testing + bert version * propagate common kwargs to different modalities * BC order of arguments * check py version * revert kwargs merging * add draft overlap test * update * fix blip2 and wav2vec due to updates * fix copies * ensure overlapping kwargs do not disappear * replace .pop by .get to handle duplicated kwargs * fix copies * fix missing import * add clearly wav2vec2_bert to uniformized models * fix copies * increase number of features * fix style * [run-slow] blip, blip2, bridgetower, donut, wav2vec2, wav2vec2_bert * [run-slow] blip, blip_2, bridgetower, donut, wav2vec2, wav2vec2_bert * fix concatenation * [run-slow] blip, blip_2, bridgetower, donut, wav2vec2, wav2vec2_bert * Update tests/test_processing_common.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * 🧹 * address comments * clean up + tests * [run-slow] instructblip, blip, blip_2, bridgetower, donut, wav2vec2, wav2vec2_bert --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
@@ -16,6 +16,7 @@
|
||||
|
||||
import inspect
|
||||
import json
|
||||
import random
|
||||
import tempfile
|
||||
from typing import Optional
|
||||
|
||||
@@ -31,11 +32,7 @@ from transformers.testing_utils import (
|
||||
from transformers.utils import is_vision_available
|
||||
|
||||
|
||||
try:
|
||||
from typing import Unpack
|
||||
except ImportError:
|
||||
from typing_extensions import Unpack
|
||||
|
||||
global_rng = random.Random()
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
@@ -48,6 +45,21 @@ def prepare_image_inputs():
|
||||
return image_inputs
|
||||
|
||||
|
||||
# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list
|
||||
def floats_list(shape, scale=1.0, rng=None, name=None):
    """Build a nested list of random floats.

    Args:
        shape: Indexable whose first two entries give the number of rows and the
            number of values per row.
        scale: Factor each random draw is multiplied by.
        rng: Object exposing a `random()` method; the module-level `global_rng`
            is used when this is `None`.
        name: Accepted but not used by this function.

    Returns:
        A list of `shape[0]` lists, each holding `shape[1]` values computed as
        `rng.random() * scale`.
    """
    generator = global_rng if rng is None else rng
    # `shape[1]` is looked up per row, so it is never touched when there are no rows.
    return [[generator.random() * scale for _ in range(shape[1])] for _ in range(shape[0])]
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
class ProcessorTesterMixin:
|
||||
@@ -333,6 +345,135 @@ class ProcessorTesterMixin:
|
||||
self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0)
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 76)
|
||||
|
||||
# text + audio kwargs testing
|
||||
@require_torch
|
||||
def test_tokenizer_defaults_preserved_by_kwargs_audio(self):
    # Tokenizer defaults given at construction time (`max_length=117`, `padding="max_length"`)
    # must still apply when the processor is called on text + audio without overriding them.
    if "feature_extractor" not in self.processor_class.attributes:
        self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
    feature_extractor = self.get_component("feature_extractor")

    if hasattr(self, "get_tokenizer"):
        tokenizer = self.get_tokenizer(max_length=117, padding="max_length")
    elif hasattr(self, "get_component"):
        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
    else:
        # `fail` reports the message as-is, unlike `assertTrue(False, ...)` which
        # prefixes it with "False is not true".
        self.fail("Processor doesn't have get_tokenizer or get_component defined")
    if not tokenizer.pad_token:
        # Presumably needed so that padding to `max_length` can work - TODO confirm.
        tokenizer.pad_token = "[TEST_PAD]"

    processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor)
    # Helper defined elsewhere; presumably skips processors without typed kwargs - verify.
    self.skip_processor_without_typed_kwargs(processor)

    input_str = "lower newer"
    raw_speech = floats_list((3, 1000))
    inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt")

    # The tokenized text is looked up under `input_ids` first, then under `labels`.
    if "input_ids" in inputs:
        self.assertEqual(len(inputs["input_ids"][0]), 117)
    elif "labels" in inputs:
        self.assertEqual(len(inputs["labels"][0]), 117)
|
||||
|
||||
@require_torch
|
||||
def test_kwargs_overrides_default_tokenizer_kwargs_audio(self):
    # Call-time `max_length=112` / `padding="max_length"` must take precedence over the
    # tokenizer's construction-time `max_length=117`.
    if "feature_extractor" not in self.processor_class.attributes:
        self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")

    audio_component = self.get_component("feature_extractor")
    construction_defaults = {"max_length": 117}
    if hasattr(self, "get_tokenizer"):
        text_component = self.get_tokenizer(**construction_defaults)
    elif hasattr(self, "get_component"):
        text_component = self.get_component("tokenizer", **construction_defaults)
    if not text_component.pad_token:
        # Presumably needed so that padding to `max_length` can work - TODO confirm.
        text_component.pad_token = "[TEST_PAD]"

    processor = self.processor_class(tokenizer=text_component, feature_extractor=audio_component)
    self.skip_processor_without_typed_kwargs(processor)

    prompt = "lower newer"
    waveforms = floats_list((3, 1000))
    encoded = processor(text=prompt, audio=waveforms, return_tensors="pt", max_length=112, padding="max_length")

    # Check the first key that is present: `input_ids` takes priority over `labels`.
    for key in ("input_ids", "labels"):
        if key in encoded:
            self.assertEqual(len(encoded[key][0]), 112)
            break
|
||||
|
||||
@require_torch
|
||||
def test_unstructured_kwargs_audio(self):
    # Flat (non-nested) kwargs passed at call time: `padding="max_length"` with
    # `max_length=76` is expected to yield a 76-token text encoding.
    if "feature_extractor" not in self.processor_class.attributes:
        self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")

    audio_component = self.get_component("feature_extractor")
    construction_defaults = {"max_length": 117}
    if hasattr(self, "get_tokenizer"):
        text_component = self.get_tokenizer(**construction_defaults)
    elif hasattr(self, "get_component"):
        text_component = self.get_component("tokenizer", **construction_defaults)
    if not text_component.pad_token:
        # Presumably needed so that padding to `max_length` can work - TODO confirm.
        text_component.pad_token = "[TEST_PAD]"

    processor = self.processor_class(tokenizer=text_component, feature_extractor=audio_component)
    self.skip_processor_without_typed_kwargs(processor)

    prompt = "lower newer"
    waveforms = floats_list((3, 1000))
    call_kwargs = {"return_tensors": "pt", "padding": "max_length", "max_length": 76}
    encoded = processor(text=prompt, audio=waveforms, **call_kwargs)

    # Check the first key that is present: `input_ids` takes priority over `labels`.
    for key in ("input_ids", "labels"):
        if key in encoded:
            self.assertEqual(len(encoded[key][0]), 76)
            break
|
||||
|
||||
@require_torch
|
||||
def test_doubly_passed_kwargs_audio(self):
    # Passing `padding` both inside `audio_kwargs` and as a flat kwarg in the same call
    # is expected to raise a ValueError.
    if "feature_extractor" not in self.processor_class.attributes:
        self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")

    audio_component = self.get_component("feature_extractor")
    if hasattr(self, "get_tokenizer"):
        text_component = self.get_tokenizer()
    elif hasattr(self, "get_component"):
        text_component = self.get_component("tokenizer")
    if not text_component.pad_token:
        text_component.pad_token = "[TEST_PAD]"

    processor = self.processor_class(tokenizer=text_component, feature_extractor=audio_component)
    self.skip_processor_without_typed_kwargs(processor)

    prompts = ["lower newer"]
    waveforms = floats_list((3, 1000))
    duplicated_kwargs = {"audio_kwargs": {"padding": "max_length"}, "padding": "max_length"}
    with self.assertRaises(ValueError):
        processor(text=prompts, audio=waveforms, **duplicated_kwargs)
|
||||
|
||||
# Only torch is gated here, like the other audio kwargs tests: this test feeds text and
# audio to the processor and never builds an image input.
@require_torch
def test_structured_kwargs_audio_nested(self):
    # Kwargs grouped per modality (`common_kwargs`, `text_kwargs`, `audio_kwargs`) are
    # passed as nested dicts; the text encoding is expected to follow `text_kwargs`
    # (76 tokens) even though `audio_kwargs` carries a different `max_length`.
    if "feature_extractor" not in self.processor_class.attributes:
        self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
    feature_extractor = self.get_component("feature_extractor")

    if hasattr(self, "get_tokenizer"):
        tokenizer = self.get_tokenizer()
    elif hasattr(self, "get_component"):
        tokenizer = self.get_component("tokenizer")
    if not tokenizer.pad_token:
        # Presumably needed so that padding to `max_length` can work - TODO confirm.
        tokenizer.pad_token = "[TEST_PAD]"

    processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor)
    # Helper defined elsewhere; presumably skips processors without typed kwargs - verify.
    self.skip_processor_without_typed_kwargs(processor)

    input_str = ["lower newer"]
    raw_speech = floats_list((3, 1000))

    # Define the kwargs for each modality
    all_kwargs = {
        "common_kwargs": {"return_tensors": "pt"},
        "text_kwargs": {"padding": "max_length", "max_length": 76},
        "audio_kwargs": {"padding": "max_length", "max_length": 66},
    }

    inputs = processor(text=input_str, audio=raw_speech, **all_kwargs)

    # The tokenized text is looked up under `input_ids` first, then under `labels`.
    if "input_ids" in inputs:
        self.assertEqual(len(inputs["input_ids"][0]), 76)
    elif "labels" in inputs:
        self.assertEqual(len(inputs["labels"][0]), 76)
|
||||
|
||||
# TODO: the same test, but for audio + text processors that have strong overlap in kwargs
|
||||
# TODO (molbap) use the same structure of attribute kwargs for other tests to avoid duplication
|
||||
def test_overlapping_text_kwargs_handling(self):
|
||||
|
||||
Reference in New Issue
Block a user