Uniformize model processors (#31368)

* add initial design for uniform processors + align model * add uniform processors for altclip + chinese_clip * add uniform processors for blip + blip2 * fix mutable default 👀 * add configuration test * handle structured kwargs w defaults + add test * protect torch-specific test * fix style * fix * rebase * update processor to generic kwargs + test * fix style * add sensible kwargs merge * update test * fix assertEqual * move kwargs merging to processing common * rework kwargs for type hinting * just get Unpack from extensions * run-slow[align] * handle kwargs passed as nested dict * add from_pretrained test for nested kwargs handling * [run-slow]align * update documentation + imports * update audio inputs * protect audio types, silly * try removing imports * make things simpler * simplerer * move out kwargs test to common mixin * [run-slow]align * skip tests for old processors * [run-slow]align, clip * !$#@!! protect imports, darn it * [run-slow]align, clip * [run-slow]align, clip * update common processor testing * add altclip * add chinese_clip * add pad_size * [run-slow]align, clip, chinese_clip, altclip * remove duplicated tests * fix * add blip, blip2, bridgetower Added tests for bridgetower which override common. Also modified common tests to force center cropping if existing * fix * update doc * improve documentation for default values * add model_max_length testing This parameter depends on tokenizers received. * Raise if kwargs are specified in two places * fix * removed copied from * match defaults * force padding * fix tokenizer test * clean defaults * move tests to common * add missing import * fix * adapt bridgetower tests to shortest edge * uniformize donut processor + tests * add wav2vec2 * extend common testing to audio processors * add testing + bert version * propagate common kwargs to different modalities * BC order of arguments * check py version * revert kwargs merging * add draft overlap test * update * fix blip2 and wav2vec due to updates * fix copies * ensure overlapping kwargs do not disappear * replace .pop by .get to handle duplicated kwargs * fix copies * fix missing import * add clearly wav2vec2_bert to uniformized models * fix copies * increase number of features * fix style * [run-slow] blip, blip2, bridgetower, donut, wav2vec2, wav2vec2_bert * [run-slow] blip, blip_2, bridgetower, donut, wav2vec2, wav2vec2_bert * fix concatenation * [run-slow] blip, blip_2, bridgetower, donut, wav2vec2, wav2vec2_bert * Update tests/test_processing_common.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * 🧹 * address comments * clean up + tests * [run-slow] instructblip, blip, blip_2, bridgetower, donut, wav2vec2, wav2vec2_bert --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
2024-10-02 10:41:08 +02:00
parent 2292be6c1b
commit 50290cf7a0
18 changed files with 769 additions and 273 deletions
--- a/tests/models/altclip/test_processor_altclip.py
+++ b/tests/models/altclip/test_processor_altclip.py
@@ -17,17 +17,12 @@
 import tempfile
 import unittest

-from transformers import XLMRobertaTokenizer, XLMRobertaTokenizerFast
+from transformers import AltCLIPProcessor, CLIPImageProcessor, XLMRobertaTokenizer, XLMRobertaTokenizerFast
 from transformers.testing_utils import require_vision
-from transformers.utils import is_vision_available

 from ...test_processing_common import ProcessorTesterMixin


-if is_vision_available():
-    from transformers import AltCLIPProcessor, CLIPImageProcessor
-
-
@require_vision
 class AltClipProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    processor_class = AltCLIPProcessor
--- a/tests/models/blip/test_processor_blip.py
+++ b/tests/models/blip/test_processor_blip.py
@@ -17,7 +17,7 @@ import unittest

 import pytest

-from transformers.testing_utils import require_vision
+from transformers.testing_utils import require_torch, require_vision
 from transformers.utils import is_vision_available

 from ...test_processing_common import ProcessorTesterMixin
@@ -139,3 +139,29 @@ class BlipProcessorTest(ProcessorTesterMixin, unittest.TestCase):

        # For now the processor supports only ['pixel_values', 'input_ids', 'attention_mask']
        self.assertListEqual(list(inputs.keys()), ["pixel_values", "input_ids", "attention_mask"])
+
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs_batched(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = ["lower newer", "upper older longer string"]
+        image_input = self.prepare_image_inputs() * 2
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            crop_size={"height": 214, "width": 214},
+            size={"height": 214, "width": 214},
+            padding="longest",
+            max_length=76,
+        )
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 24)
--- a/tests/models/blip_2/test_processor_blip_2.py
+++ b/tests/models/blip_2/test_processor_blip_2.py
@@ -17,7 +17,7 @@ import unittest

 import pytest

-from transformers.testing_utils import require_vision
+from transformers.testing_utils import require_torch, require_vision
 from transformers.utils import is_vision_available

 from ...test_processing_common import ProcessorTesterMixin
@@ -94,7 +94,7 @@ class Blip2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        encoded_tok = tokenizer(input_str, return_token_type_ids=False)

        for key in encoded_tok.keys():
-            self.assertListEqual(encoded_tok[key], encoded_processor[key])
+            self.assertListEqual(encoded_tok[key], encoded_processor[key][0])

    def test_processor(self):
        image_processor = self.get_image_processor()
@@ -107,7 +107,7 @@ class Blip2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):

        inputs = processor(text=input_str, images=image_input)

-        self.assertListEqual(list(inputs.keys()), ["pixel_values", "input_ids", "attention_mask"])
+        self.assertCountEqual(list(inputs.keys()), ["input_ids", "pixel_values", "attention_mask"])

        # test if it raises when no input is passed
        with pytest.raises(ValueError):
@@ -138,4 +138,31 @@ class Blip2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        inputs = processor(text=input_str, images=image_input)

        # For now the processor supports only ['pixel_values', 'input_ids', 'attention_mask']
-        self.assertListEqual(list(inputs.keys()), ["pixel_values", "input_ids", "attention_mask"])
+        self.assertCountEqual(list(inputs.keys()), ["input_ids", "pixel_values", "attention_mask"])
+
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs_batched(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+        if not tokenizer.pad_token:
+            tokenizer.pad_token = "[TEST_PAD]"
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = ["lower newer", "upper older longer string"]
+        image_input = self.prepare_image_inputs() * 2
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            crop_size={"height": 214, "width": 214},
+            size={"height": 214, "width": 214},
+            padding="longest",
+            max_length=76,
+        )
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 11)
--- a/tests/models/bridgetower/test_processing_bridgetower.py
+++ b/tests/models/bridgetower/test_processing_bridgetower.py
@@ -0,0 +1,218 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import shutil
+import tempfile
+import unittest
+
+import numpy as np
+
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_vision_available
+
+from ...test_processing_common import ProcessorTesterMixin
+
+
+if is_vision_available():
+    from PIL import Image
+
+    from transformers import (
+        AutoProcessor,
+        BridgeTowerImageProcessor,
+        BridgeTowerProcessor,
+        RobertaTokenizerFast,
+    )
+
+
+@require_vision
+class Blip2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    processor_class = BridgeTowerProcessor
+
+    def setUp(self):
+        self.tmpdirname = tempfile.mkdtemp()
+
+        image_processor = BridgeTowerImageProcessor()
+        tokenizer = RobertaTokenizerFast.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")
+
+        processor = BridgeTowerProcessor(image_processor, tokenizer)
+
+        processor.save_pretrained(self.tmpdirname)
+
+    def get_tokenizer(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
+
+    def get_image_processor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
+
+    def tearDown(self):
+        shutil.rmtree(self.tmpdirname)
+
+    def prepare_image_inputs(self):
+        """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
+        or a list of PyTorch tensors if one specifies torchify=True.
+        """
+
+        image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)]
+
+        image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs]
+
+        return image_inputs
+
+    # Some kwargs tests are overriden from common tests to handle shortest_edge
+    # and size_divisor behaviour
+
+    @require_torch
+    @require_vision
+    def test_image_processor_defaults_preserved_by_image_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component(
+            "image_processor",
+            crop_size={"shortest_edge": 234, "longest_edge": 234},
+        )
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input)
+        self.assertEqual(len(inputs["pixel_values"][0][0]), 234)
+
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested_from_dict(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "images_kwargs": {
+                "crop_size": {"shortest_edge": 214},
+            },
+            "text_kwargs": {"padding": "max_length", "max_length": 76},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    @require_torch
+    @require_vision
+    def test_kwargs_overrides_default_image_processor_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor", crop_size={"shortest_edge": 234})
+        tokenizer = self.get_component("tokenizer", max_length=117)
+        if not tokenizer.pad_token:
+            tokenizer.pad_token = "[TEST_PAD]"
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+        inputs = processor(text=input_str, images=image_input, crop_size={"shortest_edge": 224})
+        self.assertEqual(len(inputs["pixel_values"][0][0]), 224)
+
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs_batched(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+        if not tokenizer.pad_token:
+            tokenizer.pad_token = "[TEST_PAD]"
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = ["lower newer", "upper older longer string"]
+        image_input = self.prepare_image_inputs() * 2
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            crop_size={"shortest_edge": 214},
+            padding="longest",
+            max_length=76,
+        )
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 6)
+
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+        if not tokenizer.pad_token:
+            tokenizer.pad_token = "[TEST_PAD]"
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            crop_size={"shortest_edge": 214},
+            padding="max_length",
+            max_length=76,
+        )
+
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+        if not tokenizer.pad_token:
+            tokenizer.pad_token = "[TEST_PAD]"
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "images_kwargs": {"crop_size": {"shortest_edge": 214}},
+            "text_kwargs": {"padding": "max_length", "max_length": 76},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
--- a/tests/models/donut/test_processing_donut.py
+++ b/tests/models/donut/test_processing_donut.py
@@ -14,16 +14,32 @@
 # limitations under the License.


+import tempfile
 import unittest

-from transformers import DonutProcessor
+from transformers import DonutImageProcessor, DonutProcessor, XLMRobertaTokenizerFast
+from transformers.testing_utils import (
+    require_torch,
+    require_vision,
+)
+
+from ...test_processing_common import ProcessorTesterMixin


-class DonutProcessorTest(unittest.TestCase):
+class DonutProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    from_pretrained_id = "naver-clova-ix/donut-base"
+    processor_class = DonutProcessor

    def setUp(self):
        self.processor = DonutProcessor.from_pretrained(self.from_pretrained_id)
+        self.tmpdirname = tempfile.mkdtemp()
+
+        image_processor = DonutImageProcessor()
+        tokenizer = XLMRobertaTokenizerFast.from_pretrained(self.from_pretrained_id)
+
+        processor = DonutProcessor(image_processor, tokenizer)
+
+        processor.save_pretrained(self.tmpdirname)

    def test_token2json(self):
        expected_json = {
@@ -49,3 +65,30 @@ class DonutProcessorTest(unittest.TestCase):
        actual_json = self.processor.token2json(sequence)

        self.assertDictEqual(actual_json, expected_json)
+
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs_batched(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+        if not tokenizer.pad_token:
+            tokenizer.pad_token = "[TEST_PAD]"
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = ["lower newer", "upper older longer string"]
+        image_input = self.prepare_image_inputs() * 2
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            crop_size={"height": 214, "width": 214},
+            size={"height": 214, "width": 214},
+            padding="longest",
+            max_length=76,
+        )
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 7)
--- a/tests/models/wav2vec2/test_processor_wav2vec2.py
+++ b/tests/models/wav2vec2/test_processor_wav2vec2.py
@@ -18,14 +18,19 @@ import shutil
 import tempfile
 import unittest

+import numpy as np
+
 from transformers.models.wav2vec2 import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor
 from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES
 from transformers.utils import FEATURE_EXTRACTOR_NAME

+from ...test_processing_common import ProcessorTesterMixin
 from .test_feature_extraction_wav2vec2 import floats_list


-class Wav2Vec2ProcessorTest(unittest.TestCase):
+class Wav2Vec2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    processor_class = Wav2Vec2Processor
+
    def setUp(self):
        vocab = "<pad> <s> </s> <unk> | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ")
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
@@ -53,6 +58,9 @@ class Wav2Vec2ProcessorTest(unittest.TestCase):
        with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(feature_extractor_map) + "\n")

+        tokenizer = self.get_tokenizer()
+        tokenizer.save_pretrained(self.tmpdirname)
+
    def get_tokenizer(self, **kwargs_init):
        kwargs = self.add_kwargs_tokens_map.copy()
        kwargs.update(kwargs_init)
@@ -117,7 +125,6 @@ class Wav2Vec2ProcessorTest(unittest.TestCase):
        processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)

        input_str = "This is a test string"
-
        encoded_processor = processor(text=input_str)

        encoded_tok = tokenizer(input_str)
@@ -125,6 +132,22 @@ class Wav2Vec2ProcessorTest(unittest.TestCase):
        for key in encoded_tok.keys():
            self.assertListEqual(encoded_tok[key], encoded_processor[key])

+    def test_padding_argument_not_ignored(self):
+        # padding, or any other overlap arg between audio extractor and tokenizer
+        # should be passed to both text and audio and not ignored
+
+        feature_extractor = self.get_feature_extractor()
+        tokenizer = self.get_tokenizer()
+
+        processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        batch_duration_in_seconds = [1, 3, 2, 6]
+        input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds]
+
+        # padding = True should not raise an error and will if the audio processor popped its value to None
+        _ = processor(
+            input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt"
+        )
+
    def test_tokenizer_decode(self):
        feature_extractor = self.get_feature_extractor()
        tokenizer = self.get_tokenizer()
--- a/tests/models/wav2vec2_bert/test_processor_wav2vec2_bert.py
+++ b/tests/models/wav2vec2_bert/test_processor_wav2vec2_bert.py
@@ -18,17 +18,21 @@ import shutil
 import tempfile
 import unittest

+import numpy as np
+
 from transformers.models.seamless_m4t import SeamlessM4TFeatureExtractor
 from transformers.models.wav2vec2 import Wav2Vec2CTCTokenizer
 from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES
 from transformers.models.wav2vec2_bert import Wav2Vec2BertProcessor
 from transformers.utils import FEATURE_EXTRACTOR_NAME

+from ...test_processing_common import ProcessorTesterMixin
 from ..wav2vec2.test_feature_extraction_wav2vec2 import floats_list


-# Copied from tests.models.wav2vec2.test_processor_wav2vec2.Wav2Vec2ProcessorTest with Wav2Vec2FeatureExtractor->SeamlessM4TFeatureExtractor, Wav2Vec2Processor->Wav2Vec2BertProcessor
-class Wav2Vec2BertProcessorTest(unittest.TestCase):
+class Wav2Vec2BertProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    processor_class = Wav2Vec2BertProcessor
+
    def setUp(self):
        vocab = "<pad> <s> </s> <unk> | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ")
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
@@ -40,7 +44,7 @@ class Wav2Vec2BertProcessorTest(unittest.TestCase):
            "eos_token": "</s>",
        }
        feature_extractor_map = {
-            "feature_size": 1,
+            "feature_size": 80,
            "padding_value": 0.0,
            "sampling_rate": 16000,
            "return_attention_mask": False,
@@ -56,6 +60,9 @@ class Wav2Vec2BertProcessorTest(unittest.TestCase):
        with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(feature_extractor_map) + "\n")

+        tokenizer = self.get_tokenizer()
+        tokenizer.save_pretrained(self.tmpdirname)
+
    def get_tokenizer(self, **kwargs_init):
        kwargs = self.add_kwargs_tokens_map.copy()
        kwargs.update(kwargs_init)
@@ -122,7 +129,6 @@ class Wav2Vec2BertProcessorTest(unittest.TestCase):
        processor = Wav2Vec2BertProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)

        input_str = "This is a test string"
-
        encoded_processor = processor(text=input_str)

        encoded_tok = tokenizer(input_str)
@@ -130,6 +136,22 @@ class Wav2Vec2BertProcessorTest(unittest.TestCase):
        for key in encoded_tok.keys():
            self.assertListEqual(encoded_tok[key], encoded_processor[key])

+    def test_padding_argument_not_ignored(self):
+        # padding, or any other overlap arg between audio extractor and tokenizer
+        # should be passed to both text and audio and not ignored
+        feature_extractor = self.get_feature_extractor()
+        tokenizer = self.get_tokenizer()
+
+        processor = Wav2Vec2BertProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        batch_duration_in_seconds = [1, 3, 2, 6]
+        input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds]
+
+        # padding = True should not raise an error and will if the audio processor popped its value to None
+        # processor(input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt")
+        _ = processor(
+            input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt"
+        )
+
    def test_tokenizer_decode(self):
        feature_extractor = self.get_feature_extractor()
        tokenizer = self.get_tokenizer()
--- a/tests/test_processing_common.py
+++ b/tests/test_processing_common.py
@@ -16,6 +16,7 @@

 import inspect
 import json
+import random
 import tempfile
 from typing import Optional

@@ -31,11 +32,7 @@ from transformers.testing_utils import (
 from transformers.utils import is_vision_available


-try:
-    from typing import Unpack
-except ImportError:
-    from typing_extensions import Unpack
-
+global_rng = random.Random()

 if is_vision_available():
    from PIL import Image
@@ -48,6 +45,21 @@ def prepare_image_inputs():
    return image_inputs


+# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list
+def floats_list(shape, scale=1.0, rng=None, name=None):
+    """Creates a random float32 tensor"""
+    if rng is None:
+        rng = global_rng
+
+    values = []
+    for batch_idx in range(shape[0]):
+        values.append([])
+        for _ in range(shape[1]):
+            values[-1].append(rng.random() * scale)
+
+    return values
+
+
@require_torch
@require_vision
 class ProcessorTesterMixin:
@@ -333,6 +345,135 @@ class ProcessorTesterMixin:
        self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0)
        self.assertEqual(inputs[self.text_input_name].shape[-1], 76)

+    #  text + audio kwargs testing
+    @require_torch
+    def test_tokenizer_defaults_preserved_by_kwargs_audio(self):
+        if "feature_extractor" not in self.processor_class.attributes:
+            self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
+        feature_extractor = self.get_component("feature_extractor")
+        if hasattr(self, "get_tokenizer"):
+            tokenizer = self.get_tokenizer(max_length=117, padding="max_length")
+        elif hasattr(self, "get_component"):
+            tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+        else:
+            self.assertTrue(False, "Processor doesn't have get_tokenizer or get_component defined")
+        if not tokenizer.pad_token:
+            tokenizer.pad_token = "[TEST_PAD]"
+        processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        self.skip_processor_without_typed_kwargs(processor)
+        input_str = "lower newer"
+        raw_speech = floats_list((3, 1000))
+        inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt")
+        if "input_ids" in inputs:
+            self.assertEqual(len(inputs["input_ids"][0]), 117)
+        elif "labels" in inputs:
+            self.assertEqual(len(inputs["labels"][0]), 117)
+
+    @require_torch
+    def test_kwargs_overrides_default_tokenizer_kwargs_audio(self):
+        if "feature_extractor" not in self.processor_class.attributes:
+            self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
+        feature_extractor = self.get_component("feature_extractor")
+        if hasattr(self, "get_tokenizer"):
+            tokenizer = self.get_tokenizer(max_length=117)
+        elif hasattr(self, "get_component"):
+            tokenizer = self.get_component("tokenizer", max_length=117)
+        if not tokenizer.pad_token:
+            tokenizer.pad_token = "[TEST_PAD]"
+        processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        self.skip_processor_without_typed_kwargs(processor)
+        input_str = "lower newer"
+        raw_speech = floats_list((3, 1000))
+        inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt", max_length=112, padding="max_length")
+        if "input_ids" in inputs:
+            self.assertEqual(len(inputs["input_ids"][0]), 112)
+        elif "labels" in inputs:
+            self.assertEqual(len(inputs["labels"][0]), 112)
+
+    @require_torch
+    def test_unstructured_kwargs_audio(self):
+        if "feature_extractor" not in self.processor_class.attributes:
+            self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
+        feature_extractor = self.get_component("feature_extractor")
+        if hasattr(self, "get_tokenizer"):
+            tokenizer = self.get_tokenizer(max_length=117)
+        elif hasattr(self, "get_component"):
+            tokenizer = self.get_component("tokenizer", max_length=117)
+        if not tokenizer.pad_token:
+            tokenizer.pad_token = "[TEST_PAD]"
+        processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        raw_speech = floats_list((3, 1000))
+        inputs = processor(
+            text=input_str,
+            audio=raw_speech,
+            return_tensors="pt",
+            padding="max_length",
+            max_length=76,
+        )
+
+        if "input_ids" in inputs:
+            self.assertEqual(len(inputs["input_ids"][0]), 76)
+        elif "labels" in inputs:
+            self.assertEqual(len(inputs["labels"][0]), 76)
+
+    @require_torch
+    def test_doubly_passed_kwargs_audio(self):
+        if "feature_extractor" not in self.processor_class.attributes:
+            self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
+        feature_extractor = self.get_component("feature_extractor")
+        if hasattr(self, "get_tokenizer"):
+            tokenizer = self.get_tokenizer()
+        elif hasattr(self, "get_component"):
+            tokenizer = self.get_component("tokenizer")
+        if not tokenizer.pad_token:
+            tokenizer.pad_token = "[TEST_PAD]"
+        processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = ["lower newer"]
+        raw_speech = floats_list((3, 1000))
+        with self.assertRaises(ValueError):
+            _ = processor(
+                text=input_str,
+                audio=raw_speech,
+                audio_kwargs={"padding": "max_length"},
+                padding="max_length",
+            )
+
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_audio_nested(self):
+        if "feature_extractor" not in self.processor_class.attributes:
+            self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
+        feature_extractor = self.get_component("feature_extractor")
+        if hasattr(self, "get_tokenizer"):
+            tokenizer = self.get_tokenizer()
+        elif hasattr(self, "get_component"):
+            tokenizer = self.get_component("tokenizer")
+        if not tokenizer.pad_token:
+            tokenizer.pad_token = "[TEST_PAD]"
+        processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = ["lower newer"]
+        raw_speech = floats_list((3, 1000))
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "text_kwargs": {"padding": "max_length", "max_length": 76},
+            "audio_kwargs": {"padding": "max_length", "max_length": 66},
+        }
+
+        inputs = processor(text=input_str, audio=raw_speech, **all_kwargs)
+        if "input_ids" in inputs:
+            self.assertEqual(len(inputs["input_ids"][0]), 76)
+        elif "labels" in inputs:
+            self.assertEqual(len(inputs["labels"][0]), 76)
+
    # TODO: the same test, but for audio + text processors that have strong overlap in kwargs
    # TODO (molbap) use the same structure of attribute kwargs for other tests to avoid duplication
    def test_overlapping_text_kwargs_handling(self):