Remove script datasets in tests (#38940)

* remove trust_remote_code * again * Revert "Skip some tests for now (#38931)" This reverts commit 31d30b7224. * again * style * again * again * style * fix integration test * fix tests * style * fix * fix * fix the last ones * style * last one * fix last * fix --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-06-25 16:31:20 +02:00
parent 3c322c9cdf
commit 858f9b71a8
51 changed files with 154 additions and 293 deletions
--- a/tests/models/beit/test_image_processing_beit.py
+++ b/tests/models/beit/test_image_processing_beit.py
@@ -27,8 +27,6 @@ if is_torch_available():
    import torch

 if is_vision_available():
-    from PIL import Image
-
    from transformers import BeitImageProcessor

    if is_torchvision_available():
@@ -98,23 +96,14 @@ class BeitImageProcessingTester:


 def prepare_semantic_single_inputs():
-    dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
-
-    image = Image.open(dataset[0]["file"])
-    map = Image.open(dataset[1]["file"])
-
-    return image, map
+    ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+    example = ds[0]
+    return example["image"], example["map"]


 def prepare_semantic_batch_inputs():
-    ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
-
-    image1 = Image.open(ds[0]["file"])
-    map1 = Image.open(ds[1]["file"])
-    image2 = Image.open(ds[2]["file"])
-    map2 = Image.open(ds[3]["file"])
-
-    return [image1, image2], [map1, map2]
+    ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+    return list(ds["image"][:2]), list(ds["map"][:2])


@require_torch
@@ -157,7 +146,6 @@ class BeitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
            self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
            self.assertEqual(image_processor.do_reduce_labels, True)

-    @unittest.skip("temporary to avoid failing on circleci")
    def test_call_segmentation_maps(self):
        for image_processing_class in self.image_processor_list:
            # Initialize image_processing
@@ -265,7 +253,6 @@ class BeitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
            self.assertTrue(encoding["labels"].min().item() >= 0)
            self.assertTrue(encoding["labels"].max().item() <= 255)

-    @unittest.skip("temporary to avoid failing on circleci")
    def test_reduce_labels(self):
        for image_processing_class in self.image_processor_list:
            # Initialize image_processing
@@ -282,7 +269,6 @@ class BeitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
            self.assertTrue(encoding["labels"].min().item() >= 0)
            self.assertTrue(encoding["labels"].max().item() <= 255)

-    @unittest.skip("temporary to avoid failing on circleci")
    def test_slow_fast_equivalence(self):
        if not self.test_slow_image_processor or not self.test_fast_image_processor:
            self.skipTest(reason="Skipping slow/fast equivalence test")
--- a/tests/models/beit/test_modeling_beit.py
+++ b/tests/models/beit/test_modeling_beit.py
@@ -16,7 +16,6 @@
 import unittest

 from datasets import load_dataset
-from packaging import version

 from transformers import BeitConfig
 from transformers.testing_utils import (
@@ -53,7 +52,6 @@ if is_torch_available():


 if is_vision_available():
-    import PIL
    from PIL import Image

    from transformers import BeitImageProcessor
@@ -504,8 +502,8 @@ class BeitModelIntegrationTest(unittest.TestCase):

        image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False)

-        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
-        image = Image.open(ds[0]["file"])
+        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+        image = ds[0]["image"].convert("RGB")
        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)

        # forward pass
@@ -517,27 +515,14 @@ class BeitModelIntegrationTest(unittest.TestCase):
        expected_shape = torch.Size((1, 150, 160, 160))
        self.assertEqual(logits.shape, expected_shape)

-        is_pillow_less_than_9 = version.parse(PIL.__version__) < version.parse("9.0.0")
-
-        if is_pillow_less_than_9:
-            expected_slice = torch.tensor(
-                [
-                    [[-4.9225, -2.3954, -3.0522], [-2.8822, -1.0046, -1.7561], [-2.9549, -1.3228, -2.1347]],
-                    [[-5.8168, -3.4129, -4.0778], [-3.8651, -2.2214, -3.0277], [-3.8356, -2.4643, -3.3535]],
-                    [[-0.0078, 3.9952, 4.0754], [2.9856, 4.6944, 5.0035], [3.2413, 4.7813, 4.9969]],
-                ],
-                device=torch_device,
-            )
-        else:
-            expected_slice = torch.tensor(
-                [
-                    [[-4.8960, -2.3688, -3.0355], [-2.8478, -0.9836, -1.7418], [-2.9449, -1.3332, -2.1456]],
-                    [[-5.8081, -3.4124, -4.1006], [-3.8561, -2.2081, -3.0323], [-3.8365, -2.4601, -3.3669]],
-                    [[-0.0309, 3.9868, 4.0540], [2.9640, 4.6877, 4.9976], [3.2081, 4.7690, 4.9942]],
-                ],
-                device=torch_device,
-            )
-
+        expected_slice = torch.tensor(
+            [
+                [[-4.8963, -2.3696, -3.0359], [-2.8485, -0.9842, -1.7426], [-2.9453, -1.3338, -2.1463]],
+                [[-5.8099, -3.4140, -4.1025], [-3.8578, -2.2100, -3.0337], [-3.8383, -2.4615, -3.3681]],
+                [[-0.0314, 3.9864, 4.0536], [2.9637, 4.6879, 4.9976], [3.2074, 4.7690, 4.9946]],
+            ],
+            device=torch_device,
+        )
        torch.testing.assert_close(logits[0, :3, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)

    @slow
@@ -547,8 +532,8 @@ class BeitModelIntegrationTest(unittest.TestCase):

        image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False)

-        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
-        image = Image.open(ds[0]["file"])
+        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+        image = ds[0]["image"].convert("RGB")
        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)

        # forward pass
--- a/tests/models/data2vec/test_modeling_data2vec_audio.py
+++ b/tests/models/data2vec/test_modeling_data2vec_audio.py
@@ -669,7 +669,7 @@ class Data2VecAudioModelIntegrationTest(unittest.TestCase):
        return [x["array"] for x in speech_samples]

    def _load_superb(self, task, num_samples):
-        ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
+        ds = load_dataset("anton-l/superb_dummy", task, split="test")

        return ds[:num_samples]

--- a/tests/models/dpt/test_image_processing_dpt.py
+++ b/tests/models/dpt/test_image_processing_dpt.py
@@ -29,8 +29,6 @@ if is_torch_available():
    import torch

 if is_vision_available():
-    from PIL import Image
-
    from transformers import DPTImageProcessor

    if is_torchvision_available():
@@ -94,24 +92,15 @@ class DPTImageProcessingTester:

 # Copied from transformers.tests.models.beit.test_image_processing_beit.prepare_semantic_single_inputs
 def prepare_semantic_single_inputs():
-    dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
-
-    image = Image.open(dataset[0]["file"])
-    map = Image.open(dataset[1]["file"])
-
-    return image, map
+    ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+    example = ds[0]
+    return example["image"], example["map"]


 # Copied from transformers.tests.models.beit.test_image_processing_beit.prepare_semantic_batch_inputs
 def prepare_semantic_batch_inputs():
-    ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
-
-    image1 = Image.open(ds[0]["file"])
-    map1 = Image.open(ds[1]["file"])
-    image2 = Image.open(ds[2]["file"])
-    map2 = Image.open(ds[3]["file"])
-
-    return [image1, image2], [map1, map2]
+    ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+    return list(ds["image"][:2]), list(ds["map"][:2])


@require_torch
@@ -187,7 +176,6 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):

            self.assertEqual(list(pixel_values.shape), [1, 3, 512, 672])

-    @unittest.skip("temporary to avoid failing on circleci")
    # Copied from transformers.tests.models.beit.test_image_processing_beit.BeitImageProcessingTest.test_call_segmentation_maps
    def test_call_segmentation_maps(self):
        for image_processing_class in self.image_processor_list:
@@ -296,7 +284,6 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
            self.assertTrue(encoding["labels"].min().item() >= 0)
            self.assertTrue(encoding["labels"].max().item() <= 255)

-    @unittest.skip("temporary to avoid failing on circleci")
    def test_reduce_labels(self):
        for image_processing_class in self.image_processor_list:
            image_processor = image_processing_class(**self.image_processor_dict)
@@ -319,7 +306,6 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
            # Compare with non-reduced label to see if it's reduced by 1
            self.assertEqual(encoding["labels"][first_non_zero_coords].item(), first_non_zero_value - 1)

-    @unittest.skip("temporary to avoid failing on circleci")
    def test_slow_fast_equivalence(self):
        if not self.test_slow_image_processor or not self.test_fast_image_processor:
            self.skipTest(reason="Skipping slow/fast equivalence test")
@@ -341,7 +327,6 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
        )
        self.assertTrue(torch.allclose(image_encoding_slow.labels, image_encoding_fast.labels, atol=1e-1))

-    @unittest.skip("temporary to avoid failing on circleci")
    def test_slow_fast_equivalence_batched(self):
        if not self.test_slow_image_processor or not self.test_fast_image_processor:
            self.skipTest(reason="Skipping slow/fast equivalence test")
--- a/tests/models/granite_speech/test_modeling_granite_speech.py
+++ b/tests/models/granite_speech/test_modeling_granite_speech.py
@@ -391,7 +391,7 @@ class GraniteSpeechForConditionalGenerationIntegrationTest(unittest.TestCase):

        EXPECTED_DECODED_TEXT = [
            "systemKnowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant\nusercan you transcribe the speech into a written format?\nassistantmister quilter is the apostle of the middle classes and we are glad to welcome his gospel",
-            "systemKnowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant\nusercan you transcribe the speech into a written format?\nassistantnor is mister quilp's manner less interesting than his matter"
+            "systemKnowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant\nusercan you transcribe the speech into a written format?\nassistantnor is mister quilter's manner less interesting than his matter"
        ]  # fmt: skip

        self.assertEqual(
--- a/tests/models/hubert/test_modeling_hubert.py
+++ b/tests/models/hubert/test_modeling_hubert.py
@@ -767,7 +767,7 @@ class HubertModelIntegrationTest(unittest.TestCase):
    def _load_superb(self, task, num_samples):
        from datasets import load_dataset

-        ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
+        ds = load_dataset("anton-l/superb_dummy", task, split="test")

        return ds[:num_samples]

--- a/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py
@@ -123,13 +123,13 @@ class LayoutLMv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
    def test_layoutlmv2_integration_test(self):
        from datasets import load_dataset

-        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
+        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")

        for image_processing_class in self.image_processor_list:
            # with apply_OCR = True
            image_processing = image_processing_class()

-            image = Image.open(ds[0]["file"]).convert("RGB")
+            image = ds[0]["image"]

            encoding = image_processing(image, return_tensors="pt")

--- a/tests/models/layoutlmv2/test_processor_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_processor_layoutlmv2.py
@@ -28,8 +28,6 @@ from ...test_processing_common import ProcessorTesterMixin


 if is_pytesseract_available():
-    from PIL import Image
-
    from transformers import LayoutLMv2ImageProcessor


@@ -156,11 +154,11 @@ class LayoutLMv2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        from datasets import load_dataset

        # set up
-        datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
+        datasets = load_dataset("nielsr/funsd")
        processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")

        def preprocess_data(examples):
-            images = [Image.open(path).convert("RGB") for path in examples["image_path"]]
+            images = [image.convert("RGB") for image in examples["image"]]
            words = examples["words"]
            boxes = examples["bboxes"]
            word_labels = examples["ner_tags"]
@@ -192,12 +190,8 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
        # we verify our implementation on 2 document images from the DocVQA dataset
        from datasets import load_dataset

-        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
-
-        image_1 = Image.open(ds[0]["file"]).convert("RGB")
-        image_2 = Image.open(ds[1]["file"]).convert("RGB")
-
-        return image_1, image_2
+        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
+        return ds[0]["image"].convert("RGB"), ds[1]["image"].convert("RGB")

    @cached_property
    def get_tokenizers(self):
--- a/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py
+++ b/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py
@@ -22,8 +22,6 @@ from ...test_image_processing_common import ImageProcessingTestMixin, prepare_im


 if is_pytesseract_available():
-    from PIL import Image
-
    from transformers import LayoutLMv3ImageProcessor

    if is_torchvision_available():
@@ -103,17 +101,16 @@ class LayoutLMv3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
            image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42)
            self.assertEqual(image_processor.size, {"height": 42, "width": 42})

-    @unittest.skip("temporary to avoid failing on circleci")
    def test_LayoutLMv3_integration_test(self):
        from datasets import load_dataset

-        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
+        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")

        # with apply_OCR = True
        for image_processing_class in self.image_processor_list:
            image_processor = image_processing_class()

-            image = Image.open(ds[0]["file"]).convert("RGB")
+            image = ds[0]["image"].convert("RGB")

            encoding = image_processor(image, return_tensors="pt")

--- a/tests/models/layoutlmv3/test_processor_layoutlmv3.py
+++ b/tests/models/layoutlmv3/test_processor_layoutlmv3.py
@@ -28,8 +28,6 @@ from ...test_processing_common import ProcessorTesterMixin


 if is_pytesseract_available():
-    from PIL import Image
-
    from transformers import LayoutLMv3ImageProcessor


@@ -172,12 +170,8 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
        # we verify our implementation on 2 document images from the DocVQA dataset
        from datasets import load_dataset

-        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
-
-        image_1 = Image.open(ds[0]["file"]).convert("RGB")
-        image_2 = Image.open(ds[1]["file"]).convert("RGB")
-
-        return image_1, image_2
+        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
+        return ds[0]["image"].convert("RGB"), ds[1]["image"].convert("RGB")

    @cached_property
    def get_tokenizers(self):
--- a/tests/models/layoutxlm/test_processor_layoutxlm.py
+++ b/tests/models/layoutxlm/test_processor_layoutxlm.py
@@ -33,8 +33,6 @@ from ...test_processing_common import ProcessorTesterMixin


 if is_pytesseract_available():
-    from PIL import Image
-
    from transformers import LayoutLMv2ImageProcessor


@@ -162,11 +160,11 @@ class LayoutXLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        from datasets import load_dataset

        # set up
-        datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
+        datasets = load_dataset("nielsr/funsd")
        processor = LayoutXLMProcessor.from_pretrained("microsoft/layoutxlm-base", apply_ocr=False)

        def preprocess_data(examples):
-            images = [Image.open(path).convert("RGB") for path in examples["image_path"]]
+            images = [image.convert("RGB") for image in examples["image"]]
            words = examples["words"]
            boxes = examples["bboxes"]
            word_labels = examples["ner_tags"]
@@ -200,12 +198,8 @@ class LayoutXLMProcessorIntegrationTests(unittest.TestCase):
        # we verify our implementation on 2 document images from the DocVQA dataset
        from datasets import load_dataset

-        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
-
-        image_1 = Image.open(ds[0]["file"]).convert("RGB")
-        image_2 = Image.open(ds[1]["file"]).convert("RGB")
-
-        return image_1, image_2
+        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
+        return ds[0]["image"].convert("RGB"), ds[1]["image"].convert("RGB")

    @cached_property
    def get_tokenizers(self):
--- a/tests/models/mobilevit/test_image_processing_mobilevit.py
+++ b/tests/models/mobilevit/test_image_processing_mobilevit.py
@@ -27,8 +27,6 @@ if is_torch_available():
    import torch

 if is_vision_available():
-    from PIL import Image
-
    from transformers import MobileViTImageProcessor


@@ -86,23 +84,14 @@ class MobileViTImageProcessingTester:


 def prepare_semantic_single_inputs():
-    dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
-
-    image = Image.open(dataset[0]["file"])
-    map = Image.open(dataset[1]["file"])
-
-    return image, map
+    ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+    example = ds[0]
+    return example["image"], example["map"]


 def prepare_semantic_batch_inputs():
-    dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
-
-    image1 = Image.open(dataset[0]["file"])
-    map1 = Image.open(dataset[1]["file"])
-    image2 = Image.open(dataset[2]["file"])
-    map2 = Image.open(dataset[3]["file"])
-
-    return [image1, image2], [map1, map2]
+    ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+    return list(ds["image"][:2]), list(ds["map"][:2])


@require_torch
@@ -135,7 +124,6 @@ class MobileViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
        self.assertEqual(image_processor.size, {"shortest_edge": 42})
        self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})

-    @unittest.skip("temporary to avoid failing on circleci")
    def test_call_segmentation_maps(self):
        # Initialize image_processing
        image_processing = self.image_processing_class(**self.image_processor_dict)
--- a/tests/models/nougat/test_image_processing_nougat.py
+++ b/tests/models/nougat/test_image_processing_nougat.py
@@ -86,8 +86,12 @@ class NougatImageProcessingTester:
        return self.num_channels, self.size["height"], self.size["width"]

    def prepare_dummy_image(self):
+        revision = "ec57bf8c8b1653a209c13f6e9ee66b12df0fc2db"
        filepath = hf_hub_download(
-            repo_id="hf-internal-testing/fixtures_docvqa", filename="nougat_pdf.png", repo_type="dataset"
+            repo_id="hf-internal-testing/fixtures_docvqa",
+            filename="nougat_pdf.png",
+            repo_type="dataset",
+            revision=revision,
        )
        image = Image.open(filepath).convert("RGB")
        return image
@@ -136,7 +140,6 @@ class NougatImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
        image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42)
        self.assertEqual(image_processor.size, {"height": 42, "width": 42})

-    @unittest.skip("temporary to avoid failing on circleci")
    def test_expected_output(self):
        dummy_image = self.image_processor_tester.prepare_dummy_image()
        image_processor = self.image_processor
@@ -180,13 +183,16 @@ class NougatImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
        self.assertEqual((3, 100, 200), aligned_image.shape)

    def prepare_dummy_np_image(self):
+        revision = "ec57bf8c8b1653a209c13f6e9ee66b12df0fc2db"
        filepath = hf_hub_download(
-            repo_id="hf-internal-testing/fixtures_docvqa", filename="nougat_pdf.png", repo_type="dataset"
+            repo_id="hf-internal-testing/fixtures_docvqa",
+            filename="nougat_pdf.png",
+            repo_type="dataset",
+            revision=revision,
        )
        image = Image.open(filepath).convert("RGB")
        return np.array(image)

-    @unittest.skip("temporary to avoid failing on circleci")
    def test_crop_margin_equality_cv2_python(self):
        image = self.prepare_dummy_np_image()
        image_processor = self.image_processor
--- a/tests/models/perceiver/test_modeling_perceiver.py
+++ b/tests/models/perceiver/test_modeling_perceiver.py
@@ -842,11 +842,8 @@ def prepare_img():

 # Helper functions for optical flow integration test
 def prepare_optical_flow_images():
-    dataset = load_dataset("hf-internal-testing/fixtures_sintel", split="test", trust_remote_code=True)
-    image1 = Image.open(dataset[0]["file"]).convert("RGB")
-    image2 = Image.open(dataset[0]["file"]).convert("RGB")
-
-    return image1, image2
+    ds = load_dataset("hf-internal-testing/fixtures_sintel", split="test")
+    return list(ds["image"][:2])


 def normalize(img):
--- a/tests/models/segformer/test_image_processing_segformer.py
+++ b/tests/models/segformer/test_image_processing_segformer.py
@@ -27,8 +27,6 @@ if is_torch_available():
    import torch

 if is_vision_available():
-    from PIL import Image
-
    from transformers import SegformerImageProcessor


@@ -86,23 +84,14 @@ class SegformerImageProcessingTester:


 def prepare_semantic_single_inputs():
-    dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
-
-    image = Image.open(dataset[0]["file"])
-    map = Image.open(dataset[1]["file"])
-
-    return image, map
+    ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+    example = ds[0]
+    return example["image"], example["map"]


 def prepare_semantic_batch_inputs():
-    dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
-
-    image1 = Image.open(dataset[0]["file"])
-    map1 = Image.open(dataset[1]["file"])
-    image2 = Image.open(dataset[2]["file"])
-    map2 = Image.open(dataset[3]["file"])
-
-    return [image1, image2], [map1, map2]
+    ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+    return list(ds["image"][:2]), list(ds["map"][:2])


@require_torch
@@ -138,7 +127,6 @@ class SegformerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
        self.assertEqual(image_processor.size, {"height": 42, "width": 42})
        self.assertEqual(image_processor.do_reduce_labels, True)

-    @unittest.skip("temporary to avoid failing on circleci")
    def test_call_segmentation_maps(self):
        # Initialize image_processing
        image_processing = self.image_processing_class(**self.image_processor_dict)
@@ -245,7 +233,6 @@ class SegformerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
        self.assertTrue(encoding["labels"].min().item() >= 0)
        self.assertTrue(encoding["labels"].max().item() <= 255)

-    @unittest.skip("temporary to avoid failing on circleci")
    def test_reduce_labels(self):
        # Initialize image_processing
        image_processing = self.image_processing_class(**self.image_processor_dict)
--- a/tests/models/udop/test_modeling_udop.py
+++ b/tests/models/udop/test_modeling_udop.py
@@ -16,9 +16,9 @@ import copy
 import inspect
 import unittest

-from huggingface_hub import hf_hub_download
+from datasets import load_dataset

-from transformers import UdopConfig, is_torch_available, is_vision_available
+from transformers import UdopConfig, is_torch_available
 from transformers.testing_utils import (
    require_sentencepiece,
    require_tokenizers,
@@ -42,10 +42,6 @@ if is_torch_available():
    from transformers import UdopEncoderModel, UdopForConditionalGeneration, UdopModel, UdopProcessor


-if is_vision_available():
-    from PIL import Image
-
-
 class UdopModelTester:
    def __init__(
        self,
@@ -618,12 +614,8 @@ class UdopEncoderOnlyModelTest(ModelTesterMixin, unittest.TestCase):
 class UdopModelIntegrationTests(unittest.TestCase):
    @cached_property
    def image(self):
-        filepath = hf_hub_download(
-            repo_id="hf-internal-testing/fixtures_docvqa", filename="document_2.png", repo_type="dataset"
-        )
-        image = Image.open(filepath).convert("RGB")
-
-        return image
+        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
+        return ds[1]["image"]

    @cached_property
    def processor(self):
--- a/tests/models/udop/test_processor_udop.py
+++ b/tests/models/udop/test_processor_udop.py
@@ -41,8 +41,6 @@ if is_torch_available():


 if is_pytesseract_available():
-    from PIL import Image
-
    from transformers import LayoutLMv3ImageProcessor


@@ -184,11 +182,11 @@ class UdopProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        from datasets import load_dataset

        # set up
-        datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
+        datasets = load_dataset("nielsr/funsd")
        processor = UdopProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)

        def preprocess_data(examples):
-            images = [Image.open(path).convert("RGB") for path in examples["image_path"]]
+            images = [image.convert("RGB") for image in examples["image"]]
            words = examples["words"]
            boxes = examples["bboxes"]
            word_labels = examples["ner_tags"]
@@ -222,12 +220,8 @@ class UdopProcessorIntegrationTests(unittest.TestCase):
        # we verify our implementation on 2 document images from the DocVQA dataset
        from datasets import load_dataset

-        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
-
-        image_1 = Image.open(ds[0]["file"]).convert("RGB")
-        image_2 = Image.open(ds[1]["file"]).convert("RGB")
-
-        return image_1, image_2
+        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
+        return ds[0]["image"].convert("RGB"), ds[1]["image"].convert("RGB")

    @cached_property
    def get_tokenizers(self):
--- a/tests/models/unispeech/test_modeling_unispeech.py
+++ b/tests/models/unispeech/test_modeling_unispeech.py
@@ -566,7 +566,7 @@ class UniSpeechModelIntegrationTest(unittest.TestCase):
        return [x["array"] for x in speech_samples]

    def _load_superb(self, task, num_samples):
-        ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
+        ds = load_dataset("anton-l/superb_dummy", task, split="test")

        return ds[:num_samples]

--- a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py
+++ b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py
@@ -820,7 +820,7 @@ class UniSpeechSatModelIntegrationTest(unittest.TestCase):
        return [x["array"] for x in speech_samples]

    def _load_superb(self, task, num_samples):
-        ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
+        ds = load_dataset("anton-l/superb_dummy", task, split="test")

        return ds[:num_samples]

--- a/tests/models/upernet/test_modeling_upernet.py
+++ b/tests/models/upernet/test_modeling_upernet.py
@@ -15,7 +15,7 @@

 import unittest

-from huggingface_hub import hf_hub_download
+from datasets import load_dataset

 from transformers import ConvNextConfig, UperNetConfig
 from transformers.testing_utils import (
@@ -41,8 +41,6 @@ if is_torch_available():


 if is_vision_available():
-    from PIL import Image
-
    from transformers import AutoImageProcessor


@@ -277,11 +275,8 @@ class UperNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)

 # We will verify our results on an image of ADE20k
 def prepare_img():
-    filepath = hf_hub_download(
-        repo_id="hf-internal-testing/fixtures_ade20k", repo_type="dataset", filename="ADE_val_00000001.jpg"
-    )
-    image = Image.open(filepath).convert("RGB")
-    return image
+    ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+    return ds[0]["image"].convert("RGB")


@require_torch
@@ -302,7 +297,7 @@ class UperNetModelIntegrationTest(unittest.TestCase):
        self.assertEqual(outputs.logits.shape, expected_shape)

        expected_slice = torch.tensor(
-            [[-7.5958, -7.5958, -7.4302], [-7.5958, -7.5958, -7.4302], [-7.4797, -7.4797, -7.3068]]
+            [[-7.5969, -7.5969, -7.4313], [-7.5969, -7.5969, -7.4313], [-7.4808, -7.4808, -7.3080]]
        ).to(torch_device)
        torch.testing.assert_close(outputs.logits[0, 0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)

--- a/tests/models/vilt/test_modeling_vilt.py
+++ b/tests/models/vilt/test_modeling_vilt.py
@@ -637,9 +637,9 @@ class ViltModelIntegrationTest(unittest.TestCase):

        processor = self.default_processor

-        dataset = load_dataset("hf-internal-testing/fixtures_nlvr2", split="test", trust_remote_code=True)
-        image1 = Image.open(dataset[0]["file"]).convert("RGB")
-        image2 = Image.open(dataset[1]["file"]).convert("RGB")
+        dataset = load_dataset("hf-internal-testing/fixtures_nlvr2", split="train")
+        image1 = dataset[0]["image"]
+        image2 = dataset[1]["image"]

        text = (
            "The left image contains twice the number of dogs as the right image, and at least two dogs in total are"
--- a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py
+++ b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py
@@ -1149,8 +1149,8 @@ class TrOCRModelIntegrationTest(unittest.TestCase):
    def test_inference_handwritten(self):
        model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(torch_device)

-        dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test", trust_remote_code=True)
-        image = Image.open(dataset[0]["file"]).convert("RGB")
+        dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="train")
+        image = dataset[1]["image"].convert("RGB")

        processor = self.default_processor
        pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device)
@@ -1174,8 +1174,8 @@ class TrOCRModelIntegrationTest(unittest.TestCase):
    def test_inference_printed(self):
        model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed").to(torch_device)

-        dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test", trust_remote_code=True)
-        image = Image.open(dataset[1]["file"]).convert("RGB")
+        dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="train")
+        image = dataset[0]["image"].convert("RGB")

        processor = self.default_processor
        pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device)
--- a/tests/models/wav2vec2/test_modeling_wav2vec2.py
+++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py
@@ -97,9 +97,7 @@ def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout):
    try:
        _ = in_queue.get(timeout=timeout)

-        ds = load_dataset(
-            "mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True
-        )
+        ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True)
        sample = next(iter(ds))

        resampled_audio = torchaudio.functional.resample(
@@ -1470,7 +1468,7 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
        return [x["array"] for x in speech_samples]

    def _load_superb(self, task, num_samples):
-        ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
+        ds = load_dataset("anton-l/superb_dummy", task, split="test")

        return ds[:num_samples]

@@ -1836,9 +1834,7 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
    @require_pyctcdecode
    @require_torchaudio
    def test_wav2vec2_with_lm(self):
-        ds = load_dataset(
-            "mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True
-        )
+        ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True)
        sample = next(iter(ds))

        resampled_audio = torchaudio.functional.resample(
@@ -1862,9 +1858,7 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
    @require_pyctcdecode
    @require_torchaudio
    def test_wav2vec2_with_lm_pool(self):
-        ds = load_dataset(
-            "mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True
-        )
+        ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True)
        sample = next(iter(ds))

        resampled_audio = torchaudio.functional.resample(
@@ -1963,9 +1957,7 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
        LANG_MAP = {"it": "ita", "es": "spa", "fr": "fra", "en": "eng"}

        def run_model(lang):
-            ds = load_dataset(
-                "mozilla-foundation/common_voice_11_0", lang, split="test", streaming=True, trust_remote_code=True
-            )
+            ds = load_dataset("mozilla-foundation/common_voice_11_0", lang, split="test", streaming=True)
            sample = next(iter(ds))

            wav2vec2_lang = LANG_MAP[lang]
--- a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
+++ b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
@@ -463,9 +463,7 @@ class Wav2Vec2ProcessorWithLMTest(unittest.TestCase):
    def test_word_time_stamp_integration(self):
        import torch

-        ds = load_dataset(
-            "mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True, trust_remote_code=True
-        )
+        ds = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True)
        ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
        ds_iter = iter(ds)
        sample = next(ds_iter)
--- a/tests/models/wavlm/test_modeling_wavlm.py
+++ b/tests/models/wavlm/test_modeling_wavlm.py
@@ -473,7 +473,7 @@ class WavLMModelIntegrationTest(unittest.TestCase):
        return [x["array"] for x in speech_samples]

    def _load_superb(self, task, num_samples):
-        ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
+        ds = load_dataset("anton-l/superb_dummy", task, split="test")

        return ds[:num_samples]

--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -1645,9 +1645,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
        model.to(torch_device)

-        ds = load_dataset(
-            "facebook/multilingual_librispeech", "german", split="test", streaming=True, trust_remote_code=True
-        )
+        ds = load_dataset("facebook/multilingual_librispeech", "german", split="test", streaming=True)
        ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))

        input_speech = next(iter(ds))["audio"]["array"]
@@ -1714,11 +1712,10 @@ class WhisperModelIntegrationTests(unittest.TestCase):

        token = os.getenv("HF_HUB_READ_TOKEN", True)
        ds = load_dataset(
-            "mozilla-foundation/common_voice_6_1",
+            "hf-internal-testing/fixtures_common_voice",
            "ja",
            split="test",
            streaming=True,
-            trust_remote_code=True,
            token=token,
        )
        ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
@@ -1728,7 +1725,10 @@ class WhisperModelIntegrationTests(unittest.TestCase):
            torch_device
        )

-        EXPECTED_TRANSCRIPTS = ["木村さんに電話を貸してもらいました", " Kimura-san called me."]
+        EXPECTED_TRANSCRIPTS = [
+            "夏の時期の時期でした",
+            " It was the time of day and all of the pens left during the summer.",
+        ]

        generated_ids = model.generate(
            input_features.repeat(2, 1, 1),