[tests] remove flax-pt equivalence and cross tests (#36283)

2025-02-19 15:13:27 +00:00
parent fa8cdccd91
commit 99adc74462
39 changed files with 33 additions and 3103 deletions
--- a/tests/models/big_bird/test_modeling_big_bird.py
+++ b/tests/models/big_bird/test_modeling_big_bird.py
@@ -624,15 +624,6 @@ class BigBirdModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
    def test_training_gradient_checkpointing_use_reentrant_false(self):
        pass

-    # overwrite from common in order to skip the check on `attentions`
-    def check_pt_flax_outputs(self, fx_outputs, pt_outputs, model_class, tol=1e-5, name="outputs", attributes=None):
-        # `bigbird_block_sparse_attention` in `FlaxBigBird` returns `attention_probs = None`, while in PyTorch version,
-        # an effort was done to return `attention_probs` (yet to be verified).
-        if name.startswith("outputs.attentions"):
-            return
-        else:
-            super().check_pt_flax_outputs(fx_outputs, pt_outputs, model_class, tol, name, attributes)
-

@require_torch
@slow
--- a/tests/models/big_bird/test_modeling_flax_big_bird.py
+++ b/tests/models/big_bird/test_modeling_flax_big_bird.py
@@ -212,12 +212,3 @@ class FlaxBigBirdModelTest(FlaxModelTesterMixin, unittest.TestCase):
                self.assertEqual(len(outputs), len(jitted_outputs))
                for jitted_output, output in zip(jitted_outputs, outputs):
                    self.assertEqual(jitted_output.shape, output.shape)
-
-    # overwrite from common in order to skip the check on `attentions`
-    def check_pt_flax_outputs(self, fx_outputs, pt_outputs, model_class, tol=1e-5, name="outputs", attributes=None):
-        # `bigbird_block_sparse_attention` in `FlaxBigBird` returns `attention_probs = None`, while in PyTorch version,
-        # an effort was done to return `attention_probs` (yet to be verified).
-        if name.startswith("outputs.attentions"):
-            return
-        else:
-            super().check_pt_flax_outputs(fx_outputs, pt_outputs, model_class, tol, name, attributes)
--- a/tests/models/clip/test_modeling_clip.py
+++ b/tests/models/clip/test_modeling_clip.py
@@ -25,11 +25,8 @@ import requests
 from parameterized import parameterized
 from pytest import mark

-import transformers
 from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
 from transformers.testing_utils import (
-    is_flax_available,
-    is_pt_flax_cross_test,
    require_flash_attn,
    require_torch,
    require_torch_gpu,
@@ -82,15 +79,6 @@ if is_vision_available():
    from transformers import CLIPProcessor


-if is_flax_available():
-    import jax.numpy as jnp
-
-    from transformers.modeling_flax_pytorch_utils import (
-        convert_pytorch_state_dict_to_flax,
-        load_flax_weights_in_pytorch_model,
-    )
-
-
 class CLIPVisionModelTester:
    def __init__(
        self,
@@ -883,126 +871,6 @@ class CLIPModelTest(CLIPModelTesterMixin, PipelineTesterMixin, unittest.TestCase
            text_config = CLIPTextConfig.from_pretrained(tmp_dir_name)
            self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict())

-    # overwrite from common since FlaxCLIPModel returns nested output
-    # which is not supported in the common test
-    @is_pt_flax_cross_test
-    def test_equivalence_pt_to_flax(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                # load PyTorch class
-                pt_model = model_class(config).eval()
-                pt_model.to(torch_device)
-                # Flax models don't use the `use_cache` option and cache is not returned as a default.
-                # So we disable `use_cache` here for PyTorch model.
-                pt_model.config.use_cache = False
-
-                fx_model_class_name = "Flax" + model_class.__name__
-
-                if not hasattr(transformers, fx_model_class_name):
-                    self.skipTest(reason="No Flax model exists for this class")
-
-                fx_model_class = getattr(transformers, fx_model_class_name)
-
-                # load Flax class
-                fx_model = fx_model_class(config, dtype=jnp.float32)
-                # make sure only flax inputs are forward that actually exist in function args
-                fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys()
-
-                # prepare inputs
-                pt_inputs = self._prepare_for_class(inputs_dict, model_class)
-
-                # remove function args that don't exist in Flax
-                pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys}
-
-                fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
-                fx_model.params = fx_state
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                # convert inputs to Flax
-                fx_inputs = {k: np.array(v.to("cpu")) for k, v in pt_inputs.items() if torch.is_tensor(v)}
-                fx_outputs = fx_model(**fx_inputs).to_tuple()
-                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-                for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]):
-                    self.assert_almost_equals(fx_output, pt_output.numpy(force=True), 4e-2)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    pt_model.save_pretrained(tmpdirname)
-                    fx_model_loaded = fx_model_class.from_pretrained(tmpdirname, from_pt=True)
-
-                fx_outputs_loaded = fx_model_loaded(**fx_inputs).to_tuple()
-                self.assertEqual(
-                    len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch"
-                )
-                for fx_output_loaded, pt_output in zip(fx_outputs_loaded[:4], pt_outputs[:4]):
-                    self.assert_almost_equals(fx_output_loaded, pt_output.numpy(force=True), 4e-2)
-
-    # overwrite from common since FlaxCLIPModel returns nested output
-    # which is not supported in the common test
-    @is_pt_flax_cross_test
-    def test_equivalence_flax_to_pt(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                # load corresponding PyTorch class
-                pt_model = model_class(config).eval()
-
-                # So we disable `use_cache` here for PyTorch model.
-                pt_model.config.use_cache = False
-
-                fx_model_class_name = "Flax" + model_class.__name__
-
-                if not hasattr(transformers, fx_model_class_name):
-                    self.skipTest(reason="No Flax model exists for this class")
-
-                fx_model_class = getattr(transformers, fx_model_class_name)
-
-                # load Flax class
-                fx_model = fx_model_class(config, dtype=jnp.float32)
-                # make sure only flax inputs are forward that actually exist in function args
-                fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys()
-
-                pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
-                pt_model.to(torch_device)
-
-                # make sure weights are tied in PyTorch
-                pt_model.tie_weights()
-
-                # prepare inputs
-                pt_inputs = self._prepare_for_class(inputs_dict, model_class)
-
-                # remove function args that don't exist in Flax
-                pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys}
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                fx_inputs = {k: np.array(v.to("cpu")) for k, v in pt_inputs.items() if torch.is_tensor(v)}
-
-                fx_outputs = fx_model(**fx_inputs).to_tuple()
-                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-
-                for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]):
-                    self.assert_almost_equals(fx_output, pt_output.numpy(force=True), 4e-2)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    fx_model.save_pretrained(tmpdirname)
-                    pt_model_loaded = model_class.from_pretrained(tmpdirname, from_flax=True)
-                    pt_model_loaded.to(torch_device)
-
-                with torch.no_grad():
-                    pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
-
-                self.assertEqual(
-                    len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch"
-                )
-                for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs_loaded[:4]):
-                    self.assert_almost_equals(fx_output, pt_output.numpy(force=True), 4e-2)
-
    @slow
    def test_model_from_pretrained(self):
        model_name = "openai/clip-vit-base-patch32"
--- a/tests/models/clip/test_modeling_flax_clip.py
+++ b/tests/models/clip/test_modeling_flax_clip.py
@@ -4,21 +4,15 @@ import unittest

 import numpy as np

-import transformers
-from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig, is_flax_available, is_torch_available
-from transformers.testing_utils import is_pt_flax_cross_test, require_flax, slow
+from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig, is_flax_available
+from transformers.testing_utils import require_flax, slow

 from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask


 if is_flax_available():
    import jax
-    import jax.numpy as jnp

-    from transformers.modeling_flax_pytorch_utils import (
-        convert_pytorch_state_dict_to_flax,
-        load_flax_weights_in_pytorch_model,
-    )
    from transformers.models.clip.modeling_flax_clip import (
        FlaxCLIPModel,
        FlaxCLIPTextModel,
@@ -26,9 +20,6 @@ if is_flax_available():
        FlaxCLIPVisionModel,
    )

-if is_torch_available():
-    import torch
-

 class FlaxCLIPVisionModelTester:
    def __init__(
@@ -223,21 +214,6 @@ class FlaxCLIPVisionModelTest(FlaxModelTesterMixin, unittest.TestCase):
    def test_save_load_to_base(self):
        pass

-    # FlaxCLIPVisionModel does not have any base model
-    @is_pt_flax_cross_test
-    def test_save_load_from_base_pt(self):
-        pass
-
-    # FlaxCLIPVisionModel does not have any base model
-    @is_pt_flax_cross_test
-    def test_save_load_to_base_pt(self):
-        pass
-
-    # FlaxCLIPVisionModel does not have any base model
-    @is_pt_flax_cross_test
-    def test_save_load_bf16_to_base_pt(self):
-        pass
-
    @slow
    def test_model_from_pretrained(self):
        for model_class_name in self.all_model_classes:
@@ -333,21 +309,6 @@ class FlaxCLIPTextModelTest(FlaxModelTesterMixin, unittest.TestCase):
    def test_save_load_to_base(self):
        pass

-    # FlaxCLIPVisionModel does not have any base model
-    @is_pt_flax_cross_test
-    def test_save_load_from_base_pt(self):
-        pass
-
-    # FlaxCLIPVisionModel does not have any base model
-    @is_pt_flax_cross_test
-    def test_save_load_to_base_pt(self):
-        pass
-
-    # FlaxCLIPVisionModel does not have any base model
-    @is_pt_flax_cross_test
-    def test_save_load_bf16_to_base_pt(self):
-        pass
-
    @slow
    def test_model_from_pretrained(self):
        for model_class_name in self.all_model_classes:
@@ -472,92 +433,6 @@ class FlaxCLIPModelTest(FlaxModelTesterMixin, unittest.TestCase):
            outputs = model(input_ids=np.ones((1, 1)), pixel_values=np.ones((1, 3, 224, 224)))
            self.assertIsNotNone(outputs)

-    # overwrite from common since FlaxCLIPModel returns nested output
-    # which is not supported in the common test
-    @is_pt_flax_cross_test
-    def test_equivalence_pt_to_flax(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                # prepare inputs
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
-
-                # load corresponding PyTorch class
-                pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
-                pt_model_class = getattr(transformers, pt_model_class_name)
-
-                pt_model = pt_model_class(config).eval()
-                fx_model = model_class(config, dtype=jnp.float32)
-
-                fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
-                fx_model.params = fx_state
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-                for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]):
-                    self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    pt_model.save_pretrained(tmpdirname)
-                    fx_model_loaded = model_class.from_pretrained(tmpdirname, from_pt=True)
-
-                fx_outputs_loaded = fx_model_loaded(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(
-                    len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch"
-                )
-                for fx_output_loaded, pt_output in zip(fx_outputs_loaded[:4], pt_outputs[:4]):
-                    self.assert_almost_equals(fx_output_loaded, pt_output.numpy(), 4e-2)
-
-    # overwrite from common since FlaxCLIPModel returns nested output
-    # which is not supported in the common test
-    @is_pt_flax_cross_test
-    def test_equivalence_flax_to_pt(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                # prepare inputs
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
-
-                # load corresponding PyTorch class
-                pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
-                pt_model_class = getattr(transformers, pt_model_class_name)
-
-                pt_model = pt_model_class(config).eval()
-                fx_model = model_class(config, dtype=jnp.float32)
-
-                pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
-
-                # make sure weights are tied in PyTorch
-                pt_model.tie_weights()
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-                for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]):
-                    self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    fx_model.save_pretrained(tmpdirname)
-                    pt_model_loaded = pt_model_class.from_pretrained(tmpdirname, from_flax=True)
-
-                with torch.no_grad():
-                    pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
-
-                self.assertEqual(
-                    len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch"
-                )
-                for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs_loaded[:4]):
-                    self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2)
-
    # overwrite from common since FlaxCLIPModel returns nested output
    # which is not supported in the common test
    def test_from_pretrained_save_pretrained(self):
--- a/tests/models/clipseg/test_modeling_clipseg.py
+++ b/tests/models/clipseg/test_modeling_clipseg.py
@@ -22,11 +22,8 @@ import unittest
 import numpy as np
 import requests

-import transformers
 from transformers import CLIPSegConfig, CLIPSegProcessor, CLIPSegTextConfig, CLIPSegVisionConfig
 from transformers.testing_utils import (
-    is_flax_available,
-    is_pt_flax_cross_test,
    require_torch,
    require_vision,
    slow,
@@ -57,15 +54,6 @@ if is_vision_available():
    from PIL import Image


-if is_flax_available():
-    import jax.numpy as jnp
-
-    from transformers.modeling_flax_pytorch_utils import (
-        convert_pytorch_state_dict_to_flax,
-        load_flax_weights_in_pytorch_model,
-    )
-
-
 class CLIPSegVisionModelTester:
    def __init__(
        self,
@@ -635,123 +623,6 @@ class CLIPSegModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
            text_config = CLIPSegTextConfig.from_pretrained(tmp_dir_name)
            self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict())

-    # overwrite from common since FlaxCLIPSegModel returns nested output
-    # which is not supported in the common test
-    @is_pt_flax_cross_test
-    def test_equivalence_pt_to_flax(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                # load PyTorch class
-                pt_model = model_class(config).eval()
-                # Flax models don't use the `use_cache` option and cache is not returned as a default.
-                # So we disable `use_cache` here for PyTorch model.
-                pt_model.config.use_cache = False
-
-                fx_model_class_name = "Flax" + model_class.__name__
-
-                if not hasattr(transformers, fx_model_class_name):
-                    self.skipTest(reason="No Flax model exists for this class")
-
-                fx_model_class = getattr(transformers, fx_model_class_name)
-
-                # load Flax class
-                fx_model = fx_model_class(config, dtype=jnp.float32)
-                # make sure only flax inputs are forward that actually exist in function args
-                fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys()
-
-                # prepare inputs
-                pt_inputs = self._prepare_for_class(inputs_dict, model_class)
-
-                # remove function args that don't exist in Flax
-                pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys}
-
-                fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
-                fx_model.params = fx_state
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                # convert inputs to Flax
-                fx_inputs = {k: np.array(v.to("cpu")) for k, v in pt_inputs.items() if torch.is_tensor(v)}
-                fx_outputs = fx_model(**fx_inputs).to_tuple()
-                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-                for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]):
-                    self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    pt_model.save_pretrained(tmpdirname)
-                    fx_model_loaded = fx_model_class.from_pretrained(tmpdirname, from_pt=True)
-
-                fx_outputs_loaded = fx_model_loaded(**fx_inputs).to_tuple()
-                self.assertEqual(
-                    len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch"
-                )
-                for fx_output_loaded, pt_output in zip(fx_outputs_loaded[:4], pt_outputs[:4]):
-                    self.assert_almost_equals(fx_output_loaded, pt_output.numpy(), 4e-2)
-
-    # overwrite from common since FlaxCLIPSegModel returns nested output
-    # which is not supported in the common test
-    @is_pt_flax_cross_test
-    def test_equivalence_flax_to_pt(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                # load corresponding PyTorch class
-                pt_model = model_class(config).eval()
-
-                # So we disable `use_cache` here for PyTorch model.
-                pt_model.config.use_cache = False
-
-                fx_model_class_name = "Flax" + model_class.__name__
-
-                if not hasattr(transformers, fx_model_class_name):
-                    self.skipTest(reason="No Flax model exists for this class")
-
-                fx_model_class = getattr(transformers, fx_model_class_name)
-
-                # load Flax class
-                fx_model = fx_model_class(config, dtype=jnp.float32)
-                # make sure only flax inputs are forward that actually exist in function args
-                fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys()
-
-                pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
-
-                # make sure weights are tied in PyTorch
-                pt_model.tie_weights()
-
-                # prepare inputs
-                pt_inputs = self._prepare_for_class(inputs_dict, model_class)
-
-                # remove function args that don't exist in Flax
-                pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys}
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                fx_inputs = {k: np.array(v.to("cpu")) for k, v in pt_inputs.items() if torch.is_tensor(v)}
-
-                fx_outputs = fx_model(**fx_inputs).to_tuple()
-                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-
-                for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]):
-                    self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    fx_model.save_pretrained(tmpdirname)
-                    pt_model_loaded = model_class.from_pretrained(tmpdirname, from_flax=True)
-
-                with torch.no_grad():
-                    pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
-
-                self.assertEqual(
-                    len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch"
-                )
-                for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs_loaded[:4]):
-                    self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2)
-
    def test_training(self):
        if not self.model_tester.is_training:
            self.skipTest(reason="Training test is skipped as the model was not trained")
--- a/tests/models/data2vec/test_modeling_data2vec_audio.py
+++ b/tests/models/data2vec/test_modeling_data2vec_audio.py
@@ -22,7 +22,7 @@ from datasets import load_dataset

 from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask
 from transformers import Data2VecAudioConfig, is_torch_available
-from transformers.testing_utils import is_pt_flax_cross_test, require_soundfile, require_torch, slow, torch_device
+from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device

 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, _config_zero_init
@@ -442,16 +442,6 @@ class Data2VecAudioModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Tes
    def test_model_get_set_embeddings(self):
        pass

-    @is_pt_flax_cross_test
-    # non-robust architecture does not exist in Flax
-    def test_equivalence_flax_to_pt(self):
-        pass
-
-    @is_pt_flax_cross_test
-    # non-robust architecture does not exist in Flax
-    def test_equivalence_pt_to_flax(self):
-        pass
-
    def test_retain_grad_hidden_states_attentions(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.output_hidden_states = True
--- a/tests/models/encoder_decoder/test_modeling_flax_encoder_decoder.py
+++ b/tests/models/encoder_decoder/test_modeling_flax_encoder_decoder.py
@@ -19,8 +19,8 @@ import unittest

 import numpy as np

-from transformers import is_flax_available, is_torch_available
-from transformers.testing_utils import is_pt_flax_cross_test, require_flax, slow, torch_device
+from transformers import is_flax_available
+from transformers.testing_utils import require_flax, slow

 from ...test_modeling_flax_common import ids_tensor
 from ..bart.test_modeling_flax_bart import FlaxBartStandaloneDecoderModelTester
@@ -38,15 +38,6 @@ if is_flax_available():
        FlaxEncoderDecoderModel,
        FlaxGPT2LMHeadModel,
    )
-    from transformers.modeling_flax_pytorch_utils import (
-        convert_pytorch_state_dict_to_flax,
-        load_flax_weights_in_pytorch_model,
-    )
-
-if is_torch_available():
-    import torch
-
-    from transformers import EncoderDecoderModel


@require_flax
@@ -291,68 +282,6 @@ class FlaxEncoderDecoderMixin:
        generated_sequences = generated_output.sequences
        self.assertEqual(generated_sequences.shape, (input_ids.shape[0],) + (decoder_config.max_length,))

-    def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict):
-        pt_model.to(torch_device)
-        pt_model.eval()
-
-        # prepare inputs
-        flax_inputs = inputs_dict
-        pt_inputs = {k: torch.tensor(v.tolist()).to(torch_device) for k, v in flax_inputs.items()}
-
-        with torch.no_grad():
-            pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-        fx_outputs = fx_model(**inputs_dict).to_tuple()
-        self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-        for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-            self.assert_almost_equals(fx_output, pt_output.numpy(force=True), 1e-5)
-
-        # PT -> Flax
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            pt_model.save_pretrained(tmpdirname)
-            fx_model_loaded = FlaxEncoderDecoderModel.from_pretrained(tmpdirname, from_pt=True)
-
-        fx_outputs_loaded = fx_model_loaded(**inputs_dict).to_tuple()
-        self.assertEqual(len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-        for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs):
-            self.assert_almost_equals(fx_output_loaded, pt_output.numpy(force=True), 1e-5)
-
-        # Flax -> PT
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            fx_model.save_pretrained(tmpdirname)
-            pt_model_loaded = EncoderDecoderModel.from_pretrained(tmpdirname, from_flax=True)
-
-        pt_model_loaded.to(torch_device)
-        pt_model_loaded.eval()
-
-        with torch.no_grad():
-            pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
-
-        self.assertEqual(len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch")
-        for fx_output, pt_output_loaded in zip(fx_outputs, pt_outputs_loaded):
-            self.assert_almost_equals(fx_output, pt_output_loaded.numpy(force=True), 1e-5)
-
-    def check_equivalence_pt_to_flax(self, config, decoder_config, inputs_dict):
-        encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)
-
-        pt_model = EncoderDecoderModel(encoder_decoder_config)
-        fx_model = FlaxEncoderDecoderModel(encoder_decoder_config)
-
-        fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
-        fx_model.params = fx_state
-
-        self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict)
-
-    def check_equivalence_flax_to_pt(self, config, decoder_config, inputs_dict):
-        encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)
-
-        pt_model = EncoderDecoderModel(encoder_decoder_config)
-        fx_model = FlaxEncoderDecoderModel(encoder_decoder_config)
-
-        pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
-
-        self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict)
-
    def test_encoder_decoder_model_from_pretrained_configs(self):
        input_ids_dict = self.prepare_config_and_inputs()
        self.check_encoder_decoder_model_from_pretrained_configs(**input_ids_dict)
@@ -385,40 +314,6 @@ class FlaxEncoderDecoderMixin:
        diff = np.abs((a - b)).max()
        self.assertLessEqual(diff, tol, f"Difference between torch and flax is {diff} (>= {tol}).")

-    @is_pt_flax_cross_test
-    def test_pt_flax_equivalence(self):
-        config_inputs_dict = self.prepare_config_and_inputs()
-        config = config_inputs_dict.pop("config")
-        decoder_config = config_inputs_dict.pop("decoder_config")
-
-        inputs_dict = config_inputs_dict
-        # `encoder_hidden_states` is not used in model call/forward
-        del inputs_dict["encoder_hidden_states"]
-
-        # Avoid the case where a sequence has no place to attend (after combined with the causal attention mask)
-        batch_size = inputs_dict["decoder_attention_mask"].shape[0]
-        inputs_dict["decoder_attention_mask"] = np.concatenate(
-            [np.ones(shape=(batch_size, 1)), inputs_dict["decoder_attention_mask"][:, 1:]], axis=1
-        )
-
-        # Flax models don't use the `use_cache` option and cache is not returned as a default.
-        # So we disable `use_cache` here for PyTorch model.
-        decoder_config.use_cache = False
-
-        self.assertTrue(decoder_config.cross_attention_hidden_size is None)
-
-        # check without `enc_to_dec_proj` projection
-        decoder_config.hidden_size = config.hidden_size
-        self.assertTrue(config.hidden_size == decoder_config.hidden_size)
-        self.check_equivalence_pt_to_flax(config, decoder_config, inputs_dict)
-        self.check_equivalence_flax_to_pt(config, decoder_config, inputs_dict)
-
-        # check `enc_to_dec_proj` work as expected
-        decoder_config.hidden_size = decoder_config.hidden_size * 2
-        self.assertTrue(config.hidden_size != decoder_config.hidden_size)
-        self.check_equivalence_pt_to_flax(config, decoder_config, inputs_dict)
-        self.check_equivalence_flax_to_pt(config, decoder_config, inputs_dict)
-
    @slow
    def test_real_model_save_load_from_pretrained(self):
        model_2 = self.get_pretrained_model()
--- a/tests/models/gpt2/test_modeling_flax_gpt2.py
+++ b/tests/models/gpt2/test_modeling_flax_gpt2.py
@@ -13,14 +13,12 @@
 # limitations under the License.


-import tempfile
 import unittest

 import numpy as np

-import transformers
-from transformers import GPT2Config, GPT2Tokenizer, is_flax_available, is_torch_available
-from transformers.testing_utils import is_pt_flax_cross_test, require_flax, slow
+from transformers import GPT2Config, GPT2Tokenizer, is_flax_available
+from transformers.testing_utils import require_flax, slow

 from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask

@@ -29,15 +27,8 @@ if is_flax_available():
    import jax
    import jax.numpy as jnp

-    from transformers.modeling_flax_pytorch_utils import (
-        convert_pytorch_state_dict_to_flax,
-        load_flax_weights_in_pytorch_model,
-    )
    from transformers.models.gpt2.modeling_flax_gpt2 import FlaxGPT2LMHeadModel, FlaxGPT2Model

-if is_torch_available():
-    import torch
-

 class FlaxGPT2ModelTester:
    def __init__(
@@ -255,105 +246,6 @@ class FlaxGPT2ModelTest(FlaxModelTesterMixin, unittest.TestCase):

        self.assertListEqual(output_string, expected_string)

-    # overwrite from common since `attention_mask` in combination
-    # with `causal_mask` behaves slighly differently
-    @is_pt_flax_cross_test
-    def test_equivalence_pt_to_flax(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                # prepare inputs
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
-
-                # load corresponding PyTorch class
-                pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
-                pt_model_class = getattr(transformers, pt_model_class_name)
-
-                batch_size, seq_length = pt_inputs["input_ids"].shape
-                rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
-                for batch_idx, start_index in enumerate(rnd_start_indices):
-                    pt_inputs["attention_mask"][batch_idx, :start_index] = 0
-                    pt_inputs["attention_mask"][batch_idx, start_index:] = 1
-                    prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
-                    prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
-                pt_model = pt_model_class(config).eval()
-                fx_model = model_class(config, dtype=jnp.float32)
-
-                fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
-                fx_model.params = fx_state
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    pt_model.save_pretrained(tmpdirname)
-                    fx_model_loaded = model_class.from_pretrained(tmpdirname, from_pt=True)
-
-                fx_outputs_loaded = fx_model_loaded(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(
-                    len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch"
-                )
-                for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs):
-                    self.assert_almost_equals(fx_output_loaded[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-    # overwrite from common since `attention_mask` in combination
-    # with `causal_mask` behaves slighly differently
-    @is_pt_flax_cross_test
-    def test_equivalence_flax_to_pt(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                # prepare inputs
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
-
-                # load corresponding PyTorch class
-                pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
-                pt_model_class = getattr(transformers, pt_model_class_name)
-
-                pt_model = pt_model_class(config).eval()
-                fx_model = model_class(config, dtype=jnp.float32)
-
-                pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
-                batch_size, seq_length = pt_inputs["input_ids"].shape
-                rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
-                for batch_idx, start_index in enumerate(rnd_start_indices):
-                    pt_inputs["attention_mask"][batch_idx, :start_index] = 0
-                    pt_inputs["attention_mask"][batch_idx, start_index:] = 1
-                    prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
-                    prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
-
-                # make sure weights are tied in PyTorch
-                pt_model.tie_weights()
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    fx_model.save_pretrained(tmpdirname)
-                    pt_model_loaded = pt_model_class.from_pretrained(tmpdirname, from_flax=True)
-
-                with torch.no_grad():
-                    pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
-
-                self.assertEqual(
-                    len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch"
-                )
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs_loaded):
-                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
    @slow
    def test_model_from_pretrained(self):
        for model_class_name in self.all_model_classes:
--- a/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py
+++ b/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py
@@ -13,14 +13,12 @@
 # limitations under the License.


-import tempfile
 import unittest

 import numpy as np

-import transformers
-from transformers import GPT2Tokenizer, GPTNeoConfig, is_flax_available, is_torch_available
-from transformers.testing_utils import is_pt_flax_cross_test, require_flax, slow
+from transformers import GPT2Tokenizer, GPTNeoConfig, is_flax_available
+from transformers.testing_utils import require_flax, slow

 from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask

@@ -29,15 +27,8 @@ if is_flax_available():
    import jax
    import jax.numpy as jnp

-    from transformers.modeling_flax_pytorch_utils import (
-        convert_pytorch_state_dict_to_flax,
-        load_flax_weights_in_pytorch_model,
-    )
    from transformers.models.gpt_neo.modeling_flax_gpt_neo import FlaxGPTNeoForCausalLM, FlaxGPTNeoModel

-if is_torch_available():
-    import torch
-

 class FlaxGPTNeoModelTester:
    def __init__(
@@ -224,105 +215,6 @@ class FlaxGPTNeoModelTest(FlaxModelTesterMixin, unittest.TestCase):

        self.assertListEqual(output_string, expected_string)

-    # overwrite from common since `attention_mask` in combination
-    # with `causal_mask` behaves slighly differently
-    @is_pt_flax_cross_test
-    def test_equivalence_pt_to_flax(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                # prepare inputs
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
-
-                # load corresponding PyTorch class
-                pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
-                pt_model_class = getattr(transformers, pt_model_class_name)
-
-                batch_size, seq_length = pt_inputs["input_ids"].shape
-                rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
-                for batch_idx, start_index in enumerate(rnd_start_indices):
-                    pt_inputs["attention_mask"][batch_idx, :start_index] = 0
-                    pt_inputs["attention_mask"][batch_idx, start_index:] = 1
-                    prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
-                    prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
-                pt_model = pt_model_class(config).eval()
-                fx_model = model_class(config, dtype=jnp.float32)
-
-                fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
-                fx_model.params = fx_state
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    pt_model.save_pretrained(tmpdirname)
-                    fx_model_loaded = model_class.from_pretrained(tmpdirname, from_pt=True)
-
-                fx_outputs_loaded = fx_model_loaded(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(
-                    len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch"
-                )
-                for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs):
-                    self.assert_almost_equals(fx_output_loaded[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-    # overwrite from common since `attention_mask` in combination
-    # with `causal_mask` behaves slighly differently
-    @is_pt_flax_cross_test
-    def test_equivalence_flax_to_pt(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                # prepare inputs
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
-
-                # load corresponding PyTorch class
-                pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
-                pt_model_class = getattr(transformers, pt_model_class_name)
-
-                pt_model = pt_model_class(config).eval()
-                fx_model = model_class(config, dtype=jnp.float32)
-
-                pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
-                batch_size, seq_length = pt_inputs["input_ids"].shape
-                rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
-                for batch_idx, start_index in enumerate(rnd_start_indices):
-                    pt_inputs["attention_mask"][batch_idx, :start_index] = 0
-                    pt_inputs["attention_mask"][batch_idx, start_index:] = 1
-                    prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
-                    prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
-
-                # make sure weights are tied in PyTorch
-                pt_model.tie_weights()
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    fx_model.save_pretrained(tmpdirname)
-                    pt_model_loaded = pt_model_class.from_pretrained(tmpdirname, from_flax=True)
-
-                with torch.no_grad():
-                    pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
-
-                self.assertEqual(
-                    len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch"
-                )
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs_loaded):
-                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
    @slow
    def test_model_from_pretrained(self):
        for model_class_name in self.all_model_classes:
--- a/tests/models/gptj/test_modeling_flax_gptj.py
+++ b/tests/models/gptj/test_modeling_flax_gptj.py
@@ -13,14 +13,12 @@
 # limitations under the License.


-import tempfile
 import unittest

 import numpy as np

-import transformers
-from transformers import GPT2Tokenizer, GPTJConfig, is_flax_available, is_torch_available
-from transformers.testing_utils import is_pt_flax_cross_test, require_flax, tooslow
+from transformers import GPT2Tokenizer, GPTJConfig, is_flax_available
+from transformers.testing_utils import require_flax, tooslow

 from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask

@@ -29,15 +27,8 @@ if is_flax_available():
    import jax
    import jax.numpy as jnp

-    from transformers.modeling_flax_pytorch_utils import (
-        convert_pytorch_state_dict_to_flax,
-        load_flax_weights_in_pytorch_model,
-    )
    from transformers.models.gptj.modeling_flax_gptj import FlaxGPTJForCausalLM, FlaxGPTJModel

-if is_torch_available():
-    import torch
-

 class FlaxGPTJModelTester:
    def __init__(
@@ -221,105 +212,6 @@ class FlaxGPTJModelTest(FlaxModelTesterMixin, unittest.TestCase):

        self.assertListEqual(output_string, expected_string)

-    # overwrite from common since `attention_mask` in combination
-    # with `causal_mask` behaves slighly differently
-    @is_pt_flax_cross_test
-    def test_equivalence_pt_to_flax(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                # prepare inputs
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
-
-                # load corresponding PyTorch class
-                pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
-                pt_model_class = getattr(transformers, pt_model_class_name)
-
-                batch_size, seq_length = pt_inputs["input_ids"].shape
-                rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
-                for batch_idx, start_index in enumerate(rnd_start_indices):
-                    pt_inputs["attention_mask"][batch_idx, :start_index] = 0
-                    pt_inputs["attention_mask"][batch_idx, start_index:] = 1
-                    prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
-                    prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
-                pt_model = pt_model_class(config).eval()
-                fx_model = model_class(config, dtype=jnp.float32)
-
-                fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
-                fx_model.params = fx_state
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    pt_model.save_pretrained(tmpdirname)
-                    fx_model_loaded = model_class.from_pretrained(tmpdirname, from_pt=True)
-
-                fx_outputs_loaded = fx_model_loaded(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(
-                    len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch"
-                )
-                for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs):
-                    self.assert_almost_equals(fx_output_loaded[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-    # overwrite from common since `attention_mask` in combination
-    # with `causal_mask` behaves slighly differently
-    @is_pt_flax_cross_test
-    def test_equivalence_flax_to_pt(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                # prepare inputs
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
-
-                # load corresponding PyTorch class
-                pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
-                pt_model_class = getattr(transformers, pt_model_class_name)
-
-                pt_model = pt_model_class(config).eval()
-                fx_model = model_class(config, dtype=jnp.float32)
-
-                pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
-                batch_size, seq_length = pt_inputs["input_ids"].shape
-                rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
-                for batch_idx, start_index in enumerate(rnd_start_indices):
-                    pt_inputs["attention_mask"][batch_idx, :start_index] = 0
-                    pt_inputs["attention_mask"][batch_idx, start_index:] = 1
-                    prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
-                    prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
-
-                # make sure weights are tied in PyTorch
-                pt_model.tie_weights()
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    fx_model.save_pretrained(tmpdirname)
-                    pt_model_loaded = pt_model_class.from_pretrained(tmpdirname, from_flax=True)
-
-                with torch.no_grad():
-                    pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
-
-                self.assertEqual(
-                    len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch"
-                )
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs_loaded):
-                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
    @tooslow
    def test_model_from_pretrained(self):
        for model_class_name in self.all_model_classes:
--- a/tests/models/longt5/test_modeling_flax_longt5.py
+++ b/tests/models/longt5/test_modeling_flax_longt5.py
@@ -17,11 +17,9 @@ import unittest

 import numpy as np

-import transformers
 from transformers import is_flax_available
 from transformers.models.auto import get_values
 from transformers.testing_utils import (
-    is_pt_flax_cross_test,
    require_flax,
    require_sentencepiece,
    require_tokenizers,
@@ -46,7 +44,6 @@ if is_flax_available():
    from flax.traverse_util import flatten_dict

    from transformers import FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING, FLAX_MODEL_MAPPING, AutoTokenizer, LongT5Config
-    from transformers.modeling_flax_pytorch_utils import load_flax_weights_in_pytorch_model
    from transformers.models.longt5.modeling_flax_longt5 import (
        FlaxLongT5ForConditionalGeneration,
        FlaxLongT5Model,
@@ -467,95 +464,6 @@ class FlaxLongT5ModelTest(FlaxModelTesterMixin, unittest.TestCase):
                [self.model_tester.num_attention_heads, block_len, 3 * block_len],
            )

-    # overwrite since special base model prefix is used
-    @is_pt_flax_cross_test
-    def test_save_load_from_base_pt(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = FLAX_MODEL_MAPPING[config.__class__]
-
-        for model_class in self.all_model_classes:
-            if model_class == base_class:
-                continue
-
-            model = base_class(config)
-            base_params = flatten_dict(unfreeze(model.params))
-
-            # convert Flax model to PyTorch model
-            pt_model_class = getattr(transformers, base_class.__name__[4:])  # Skip the "Flax" at the beginning
-            pt_model = pt_model_class(config).eval()
-            pt_model = load_flax_weights_in_pytorch_model(pt_model, model.params)
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                # save pt model
-                pt_model.save_pretrained(tmpdirname)
-                head_model = model_class.from_pretrained(tmpdirname, from_pt=True)
-
-                base_param_from_head = flatten_dict(unfreeze(head_model.params))
-
-                for key in base_param_from_head.keys():
-                    max_diff = (base_params[key] - base_param_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    # overwrite since special base model prefix is used
-    @is_pt_flax_cross_test
-    def test_save_load_to_base_pt(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = FLAX_MODEL_MAPPING[config.__class__]
-
-        for model_class in self.all_model_classes:
-            if model_class == base_class:
-                continue
-
-            model = model_class(config)
-            base_params_from_head = flatten_dict(unfreeze(model.params))
-
-            # convert Flax model to PyTorch model
-            pt_model_class = getattr(transformers, model_class.__name__[4:])  # Skip the "Flax" at the beginning
-            pt_model = pt_model_class(config).eval()
-            pt_model = load_flax_weights_in_pytorch_model(pt_model, model.params)
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                pt_model.save_pretrained(tmpdirname)
-                base_model = base_class.from_pretrained(tmpdirname, from_pt=True)
-
-                base_params = flatten_dict(unfreeze(base_model.params))
-
-                for key in base_params_from_head.keys():
-                    max_diff = (base_params[key] - base_params_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    # overwrite since special base model prefix is used
-    @is_pt_flax_cross_test
-    def test_save_load_bf16_to_base_pt(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = FLAX_MODEL_MAPPING[config.__class__]
-
-        for model_class in self.all_model_classes:
-            if model_class == base_class:
-                continue
-
-            model = model_class(config)
-            model.params = model.to_bf16(model.params)
-            base_params_from_head = flatten_dict(unfreeze(model.params))
-
-            # convert Flax model to PyTorch model
-            pt_model_class = getattr(transformers, model_class.__name__[4:])  # Skip the "Flax" at the beginning
-            pt_model = pt_model_class(config).eval()
-            pt_model = load_flax_weights_in_pytorch_model(pt_model, model.params)
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                pt_model.save_pretrained(tmpdirname)
-                base_model = base_class.from_pretrained(tmpdirname, from_pt=True)
-
-                base_params = flatten_dict(unfreeze(base_model.params))
-
-                for key in base_params_from_head.keys():
-                    max_diff = (base_params[key] - base_params_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-

 class FlaxLongT5TGlobalModelTest(FlaxLongT5ModelTest):
    def setUp(self):
--- a/tests/models/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py
+++ b/tests/models/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py
@@ -19,7 +19,7 @@ import unittest
 import numpy as np

 from transformers import is_flax_available, is_torch_available
-from transformers.testing_utils import is_pt_flax_cross_test, require_flax, slow, torch_device
+from transformers.testing_utils import require_flax, slow

 from ...test_modeling_flax_common import floats_tensor, ids_tensor, random_attention_mask
 from ..bart.test_modeling_flax_bart import FlaxBartStandaloneDecoderModelTester
@@ -43,14 +43,8 @@ if is_flax_available():
        SpeechEncoderDecoderConfig,
    )
    from transformers.modeling_flax_outputs import FlaxBaseModelOutput
-    from transformers.modeling_flax_pytorch_utils import (
-        convert_pytorch_state_dict_to_flax,
-        load_flax_weights_in_pytorch_model,
-    )

 if is_torch_available():
-    import torch
-
    from transformers import SpeechEncoderDecoderModel


@@ -406,68 +400,6 @@ class FlaxEncoderDecoderMixin:
        for grad, grad_frozen in zip(grads, grads_frozen):
            self.assertTrue((grad == grad_frozen).all())

-    def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict):
-        pt_model.to(torch_device)
-        pt_model.eval()
-
-        # prepare inputs
-        flax_inputs = inputs_dict
-        pt_inputs = {k: torch.tensor(v.tolist()).to(torch_device) for k, v in flax_inputs.items()}
-
-        with torch.no_grad():
-            pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-        fx_outputs = fx_model(**inputs_dict).to_tuple()
-        self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-        for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-            self.assert_almost_equals(fx_output, pt_output.numpy(force=True), 1e-5)
-
-        # PT -> Flax
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            pt_model.save_pretrained(tmpdirname)
-            fx_model_loaded = FlaxSpeechEncoderDecoderModel.from_pretrained(tmpdirname, from_pt=True)
-
-        fx_outputs_loaded = fx_model_loaded(**inputs_dict).to_tuple()
-        self.assertEqual(len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-        for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs):
-            self.assert_almost_equals(fx_output_loaded, pt_output.numpy(force=True), 1e-5)
-
-        # Flax -> PT
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            fx_model.save_pretrained(tmpdirname)
-            pt_model_loaded = SpeechEncoderDecoderModel.from_pretrained(tmpdirname, from_flax=True)
-
-        pt_model_loaded.to(torch_device)
-        pt_model_loaded.eval()
-
-        with torch.no_grad():
-            pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
-
-        self.assertEqual(len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch")
-        for fx_output, pt_output_loaded in zip(fx_outputs, pt_outputs_loaded):
-            self.assert_almost_equals(fx_output, pt_output_loaded.numpy(force=True), 1e-5)
-
-    def check_equivalence_pt_to_flax(self, config, decoder_config, inputs_dict):
-        encoder_decoder_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)
-
-        pt_model = SpeechEncoderDecoderModel(encoder_decoder_config)
-        fx_model = FlaxSpeechEncoderDecoderModel(encoder_decoder_config)
-
-        fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
-        fx_model.params = fx_state
-
-        self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict)
-
-    def check_equivalence_flax_to_pt(self, config, decoder_config, inputs_dict):
-        encoder_decoder_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)
-
-        pt_model = SpeechEncoderDecoderModel(encoder_decoder_config)
-        fx_model = FlaxSpeechEncoderDecoderModel(encoder_decoder_config)
-
-        pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
-
-        self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict)
-
    def test_encoder_decoder_model_from_pretrained_configs(self):
        input_ids_dict = self.prepare_config_and_inputs()
        self.check_encoder_decoder_model_from_pretrained_configs(**input_ids_dict)
@@ -504,46 +436,6 @@ class FlaxEncoderDecoderMixin:
        diff = np.abs((a - b)).max()
        self.assertLessEqual(diff, tol, f"Difference between torch and flax is {diff} (>= {tol}).")

-    @is_pt_flax_cross_test
-    def test_pt_flax_equivalence(self):
-        config_inputs_dict = self.prepare_config_and_inputs()
-        config = config_inputs_dict.pop("config")
-        decoder_config = config_inputs_dict.pop("decoder_config")
-
-        inputs_dict = config_inputs_dict
-        # `encoder_hidden_states` is not used in model call/forward
-        del inputs_dict["encoder_hidden_states"]
-
-        # Avoid the case where a sequence has no place to attend (after combined with the causal attention mask)
-        batch_size = inputs_dict["decoder_attention_mask"].shape[0]
-        inputs_dict["decoder_attention_mask"] = np.concatenate(
-            [np.ones(shape=(batch_size, 1)), inputs_dict["decoder_attention_mask"][:, 1:]], axis=1
-        )
-
-        # Flax models don't use the `use_cache` option and cache is not returned as a default.
-        # So we disable `use_cache` here for PyTorch model.
-        decoder_config.use_cache = False
-
-        self.assertTrue(decoder_config.cross_attention_hidden_size is None)
-
-        # check without `enc_to_dec_proj` projection
-        decoder_config.hidden_size = config.hidden_size
-        self.assertTrue(config.hidden_size == decoder_config.hidden_size)
-        self.check_equivalence_pt_to_flax(config, decoder_config, inputs_dict)
-        self.check_equivalence_flax_to_pt(config, decoder_config, inputs_dict)
-
-        # check `enc_to_dec_proj` work as expected
-        decoder_config.hidden_size = decoder_config.hidden_size * 2
-        self.assertTrue(config.hidden_size != decoder_config.hidden_size)
-        self.check_equivalence_pt_to_flax(config, decoder_config, inputs_dict)
-        self.check_equivalence_flax_to_pt(config, decoder_config, inputs_dict)
-
-        # check `add_adapter` works as expected
-        config.add_adapter = True
-        self.assertTrue(config.add_adapter)
-        self.check_equivalence_pt_to_flax(config, decoder_config, inputs_dict)
-        self.check_equivalence_flax_to_pt(config, decoder_config, inputs_dict)
-
    @slow
    def test_real_model_save_load_from_pretrained(self):
        model_2 = self.get_pretrained_model()
@@ -625,71 +517,6 @@ class FlaxWav2Vec2GPT2ModelTest(FlaxEncoderDecoderMixin, unittest.TestCase):
            "encoder_hidden_states": encoder_hidden_states,
        }

-    @slow
-    def test_flaxwav2vec2gpt2_pt_flax_equivalence(self):
-        pt_model = SpeechEncoderDecoderModel.from_pretrained("jsnfly/wav2vec2-large-xlsr-53-german-gpt2")
-        fx_model = FlaxSpeechEncoderDecoderModel.from_pretrained(
-            "jsnfly/wav2vec2-large-xlsr-53-german-gpt2", from_pt=True
-        )
-
-        pt_model.to(torch_device)
-        pt_model.eval()
-
-        # prepare inputs
-        batch_size = 13
-        input_values = floats_tensor([batch_size, 512], scale=1.0)
-        attention_mask = random_attention_mask([batch_size, 512])
-        decoder_input_ids = ids_tensor([batch_size, 4], fx_model.config.decoder.vocab_size)
-        decoder_attention_mask = random_attention_mask([batch_size, 4])
-        inputs_dict = {
-            "inputs": input_values,
-            "attention_mask": attention_mask,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_attention_mask": decoder_attention_mask,
-        }
-
-        flax_inputs = inputs_dict
-        pt_inputs = {k: torch.tensor(v.tolist()) for k, v in flax_inputs.items()}
-
-        with torch.no_grad():
-            pt_outputs = pt_model(**pt_inputs)
-        pt_logits = pt_outputs.logits
-        pt_outputs = pt_outputs.to_tuple()
-
-        fx_outputs = fx_model(**inputs_dict)
-        fx_logits = fx_outputs.logits
-        fx_outputs = fx_outputs.to_tuple()
-
-        self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-        self.assert_almost_equals(fx_logits, pt_logits.numpy(), 4e-2)
-
-        # PT -> Flax
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            pt_model.save_pretrained(tmpdirname)
-            fx_model_loaded = FlaxSpeechEncoderDecoderModel.from_pretrained(tmpdirname, from_pt=True)
-
-        fx_outputs_loaded = fx_model_loaded(**inputs_dict)
-        fx_logits_loaded = fx_outputs_loaded.logits
-        fx_outputs_loaded = fx_outputs_loaded.to_tuple()
-        self.assertEqual(len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-        self.assert_almost_equals(fx_logits_loaded, pt_logits.numpy(), 4e-2)
-
-        # Flax -> PT
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            fx_model.save_pretrained(tmpdirname)
-            pt_model_loaded = SpeechEncoderDecoderModel.from_pretrained(tmpdirname, from_flax=True)
-
-        pt_model_loaded.to(torch_device)
-        pt_model_loaded.eval()
-
-        with torch.no_grad():
-            pt_outputs_loaded = pt_model_loaded(**pt_inputs)
-        pt_logits_loaded = pt_outputs_loaded.logits
-        pt_outputs_loaded = pt_outputs_loaded.to_tuple()
-
-        self.assertEqual(len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch")
-        self.assert_almost_equals(fx_logits, pt_logits_loaded.numpy(), 4e-2)
-

@require_flax
 class FlaxWav2Vec2BartModelTest(FlaxEncoderDecoderMixin, unittest.TestCase):
@@ -742,71 +569,6 @@ class FlaxWav2Vec2BartModelTest(FlaxEncoderDecoderMixin, unittest.TestCase):
            "encoder_hidden_states": encoder_hidden_states,
        }

-    @slow
-    def test_flaxwav2vec2bart_pt_flax_equivalence(self):
-        pt_model = SpeechEncoderDecoderModel.from_pretrained("patrickvonplaten/wav2vec2-2-bart-large")
-        fx_model = FlaxSpeechEncoderDecoderModel.from_pretrained(
-            "patrickvonplaten/wav2vec2-2-bart-large", from_pt=True
-        )
-
-        pt_model.to(torch_device)
-        pt_model.eval()
-
-        # prepare inputs
-        batch_size = 13
-        input_values = floats_tensor([batch_size, 512], scale=1.0)
-        attention_mask = random_attention_mask([batch_size, 512])
-        decoder_input_ids = ids_tensor([batch_size, 4], fx_model.config.decoder.vocab_size)
-        decoder_attention_mask = random_attention_mask([batch_size, 4])
-        inputs_dict = {
-            "inputs": input_values,
-            "attention_mask": attention_mask,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_attention_mask": decoder_attention_mask,
-        }
-
-        flax_inputs = inputs_dict
-        pt_inputs = {k: torch.tensor(v.tolist()) for k, v in flax_inputs.items()}
-
-        with torch.no_grad():
-            pt_outputs = pt_model(**pt_inputs)
-        pt_logits = pt_outputs.logits
-        pt_outputs = pt_outputs.to_tuple()
-
-        fx_outputs = fx_model(**inputs_dict)
-        fx_logits = fx_outputs.logits
-        fx_outputs = fx_outputs.to_tuple()
-
-        self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-        self.assert_almost_equals(fx_logits, pt_logits.numpy(), 4e-2)
-
-        # PT -> Flax
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            pt_model.save_pretrained(tmpdirname)
-            fx_model_loaded = FlaxSpeechEncoderDecoderModel.from_pretrained(tmpdirname, from_pt=True)
-
-        fx_outputs_loaded = fx_model_loaded(**inputs_dict)
-        fx_logits_loaded = fx_outputs_loaded.logits
-        fx_outputs_loaded = fx_outputs_loaded.to_tuple()
-        self.assertEqual(len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-        self.assert_almost_equals(fx_logits_loaded, pt_logits.numpy(), 4e-2)
-
-        # Flax -> PT
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            fx_model.save_pretrained(tmpdirname)
-            pt_model_loaded = SpeechEncoderDecoderModel.from_pretrained(tmpdirname, from_flax=True)
-
-        pt_model_loaded.to(torch_device)
-        pt_model_loaded.eval()
-
-        with torch.no_grad():
-            pt_outputs_loaded = pt_model_loaded(**pt_inputs)
-        pt_logits_loaded = pt_outputs_loaded.logits
-        pt_outputs_loaded = pt_outputs_loaded.to_tuple()
-
-        self.assertEqual(len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch")
-        self.assert_almost_equals(fx_logits, pt_logits_loaded.numpy(), 4e-2)
-

@require_flax
 class FlaxWav2Vec2BertModelTest(FlaxEncoderDecoderMixin, unittest.TestCase):
@@ -858,66 +620,3 @@ class FlaxWav2Vec2BertModelTest(FlaxEncoderDecoderMixin, unittest.TestCase):
            "decoder_attention_mask": decoder_attention_mask,
            "encoder_hidden_states": encoder_hidden_states,
        }
-
-    @slow
-    def test_flaxwav2vec2bert_pt_flax_equivalence(self):
-        pt_model = SpeechEncoderDecoderModel.from_pretrained("speech-seq2seq/wav2vec2-2-bert-large")
-        fx_model = FlaxSpeechEncoderDecoderModel.from_pretrained("speech-seq2seq/wav2vec2-2-bert-large", from_pt=True)
-
-        pt_model.to(torch_device)
-        pt_model.eval()
-
-        # prepare inputs
-        batch_size = 13
-        input_values = floats_tensor([batch_size, 512], fx_model.config.encoder.vocab_size)
-        attention_mask = random_attention_mask([batch_size, 512])
-        decoder_input_ids = ids_tensor([batch_size, 4], fx_model.config.decoder.vocab_size)
-        decoder_attention_mask = random_attention_mask([batch_size, 4])
-        inputs_dict = {
-            "inputs": input_values,
-            "attention_mask": attention_mask,
-            "decoder_input_ids": decoder_input_ids,
-            "decoder_attention_mask": decoder_attention_mask,
-        }
-
-        flax_inputs = inputs_dict
-        pt_inputs = {k: torch.tensor(v.tolist()) for k, v in flax_inputs.items()}
-
-        with torch.no_grad():
-            pt_outputs = pt_model(**pt_inputs)
-        pt_logits = pt_outputs.logits
-        pt_outputs = pt_outputs.to_tuple()
-
-        fx_outputs = fx_model(**inputs_dict)
-        fx_logits = fx_outputs.logits
-        fx_outputs = fx_outputs.to_tuple()
-
-        self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-        self.assert_almost_equals(fx_logits, pt_logits.numpy(), 4e-2)
-
-        # PT -> Flax
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            pt_model.save_pretrained(tmpdirname)
-            fx_model_loaded = FlaxSpeechEncoderDecoderModel.from_pretrained(tmpdirname, from_pt=True)
-
-        fx_outputs_loaded = fx_model_loaded(**inputs_dict)
-        fx_logits_loaded = fx_outputs_loaded.logits
-        fx_outputs_loaded = fx_outputs_loaded.to_tuple()
-        self.assertEqual(len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-        self.assert_almost_equals(fx_logits_loaded, pt_logits.numpy(), 4e-2)
-
-        # Flax -> PT
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            fx_model.save_pretrained(tmpdirname)
-            pt_model_loaded = SpeechEncoderDecoderModel.from_pretrained(tmpdirname, from_flax=True)
-
-        pt_model_loaded.to(torch_device)
-        pt_model_loaded.eval()
-
-        with torch.no_grad():
-            pt_outputs_loaded = pt_model_loaded(**pt_inputs)
-        pt_logits_loaded = pt_outputs_loaded.logits
-        pt_outputs_loaded = pt_outputs_loaded.to_tuple()
-
-        self.assertEqual(len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch")
-        self.assert_almost_equals(fx_logits, pt_logits_loaded.numpy(), 4e-2)
--- a/tests/models/t5/test_modeling_flax_t5.py
+++ b/tests/models/t5/test_modeling_flax_t5.py
@@ -17,15 +17,8 @@ import unittest

 import numpy as np

-import transformers
 from transformers import is_flax_available
-from transformers.testing_utils import (
-    is_pt_flax_cross_test,
-    require_flax,
-    require_sentencepiece,
-    require_tokenizers,
-    slow,
-)
+from transformers.testing_utils import require_flax, require_sentencepiece, require_tokenizers, slow

 from ...test_configuration_common import ConfigTester
 from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor
@@ -47,7 +40,6 @@ if is_flax_available():
    from flax.traverse_util import flatten_dict

    from transformers import FLAX_MODEL_MAPPING, ByT5Tokenizer, T5Config, T5Tokenizer
-    from transformers.modeling_flax_pytorch_utils import load_flax_weights_in_pytorch_model
    from transformers.models.t5.modeling_flax_t5 import (
        FlaxT5EncoderModel,
        FlaxT5ForConditionalGeneration,
@@ -373,95 +365,6 @@ class FlaxT5ModelTest(FlaxModelTesterMixin, unittest.TestCase):
                    max_diff = (base_params[key] - base_params_from_head[key]).sum().item()
                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")

-    # overwrite since special base model prefix is used
-    @is_pt_flax_cross_test
-    def test_save_load_from_base_pt(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = FLAX_MODEL_MAPPING[config.__class__]
-
-        for model_class in self.all_model_classes:
-            if model_class == base_class:
-                continue
-
-            model = base_class(config)
-            base_params = flatten_dict(unfreeze(model.params))
-
-            # convert Flax model to PyTorch model
-            pt_model_class = getattr(transformers, base_class.__name__[4:])  # Skip the "Flax" at the beginning
-            pt_model = pt_model_class(config).eval()
-            pt_model = load_flax_weights_in_pytorch_model(pt_model, model.params)
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                # save pt model
-                pt_model.save_pretrained(tmpdirname)
-                head_model = model_class.from_pretrained(tmpdirname, from_pt=True)
-
-                base_param_from_head = flatten_dict(unfreeze(head_model.params))
-
-                for key in base_param_from_head.keys():
-                    max_diff = (base_params[key] - base_param_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    # overwrite since special base model prefix is used
-    @is_pt_flax_cross_test
-    def test_save_load_to_base_pt(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = FLAX_MODEL_MAPPING[config.__class__]
-
-        for model_class in self.all_model_classes:
-            if model_class == base_class:
-                continue
-
-            model = model_class(config)
-            base_params_from_head = flatten_dict(unfreeze(model.params))
-
-            # convert Flax model to PyTorch model
-            pt_model_class = getattr(transformers, model_class.__name__[4:])  # Skip the "Flax" at the beginning
-            pt_model = pt_model_class(config).eval()
-            pt_model = load_flax_weights_in_pytorch_model(pt_model, model.params)
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                pt_model.save_pretrained(tmpdirname)
-                base_model = base_class.from_pretrained(tmpdirname, from_pt=True)
-
-                base_params = flatten_dict(unfreeze(base_model.params))
-
-                for key in base_params_from_head.keys():
-                    max_diff = (base_params[key] - base_params_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    # overwrite since special base model prefix is used
-    @is_pt_flax_cross_test
-    def test_save_load_bf16_to_base_pt(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = FLAX_MODEL_MAPPING[config.__class__]
-
-        for model_class in self.all_model_classes:
-            if model_class == base_class:
-                continue
-
-            model = model_class(config)
-            model.params = model.to_bf16(model.params)
-            base_params_from_head = flatten_dict(unfreeze(model.params))
-
-            # convert Flax model to PyTorch model
-            pt_model_class = getattr(transformers, model_class.__name__[4:])  # Skip the "Flax" at the beginning
-            pt_model = pt_model_class(config).eval()
-            pt_model = load_flax_weights_in_pytorch_model(pt_model, model.params)
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                pt_model.save_pretrained(tmpdirname)
-                base_model = base_class.from_pretrained(tmpdirname, from_pt=True)
-
-                base_params = flatten_dict(unfreeze(base_model.params))
-
-                for key in base_params_from_head.keys():
-                    max_diff = (base_params[key] - base_params_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-

 class FlaxT5EncoderOnlyModelTester:
    def __init__(
@@ -663,95 +566,6 @@ class FlaxT5EncoderOnlyModelTest(FlaxModelTesterMixin, unittest.TestCase):
                    max_diff = (base_params[key] - base_params_from_head[key]).sum().item()
                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")

-    # overwrite since special base model prefix is used
-    @is_pt_flax_cross_test
-    def test_save_load_from_base_pt(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = FLAX_MODEL_MAPPING[config.__class__]
-
-        for model_class in self.all_model_classes:
-            if model_class == base_class:
-                continue
-
-            model = base_class(config)
-            base_params = flatten_dict(unfreeze(model.params))
-
-            # convert Flax model to PyTorch model
-            pt_model_class = getattr(transformers, base_class.__name__[4:])  # Skip the "Flax" at the beginning
-            pt_model = pt_model_class(config).eval()
-            pt_model = load_flax_weights_in_pytorch_model(pt_model, model.params)
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                # save pt model
-                pt_model.save_pretrained(tmpdirname)
-                head_model = model_class.from_pretrained(tmpdirname, from_pt=True)
-
-                base_param_from_head = flatten_dict(unfreeze(head_model.params))
-
-                for key in base_param_from_head.keys():
-                    max_diff = (base_params[key] - base_param_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    # overwrite since special base model prefix is used
-    @is_pt_flax_cross_test
-    def test_save_load_to_base_pt(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = FLAX_MODEL_MAPPING[config.__class__]
-
-        for model_class in self.all_model_classes:
-            if model_class == base_class:
-                continue
-
-            model = model_class(config)
-            base_params_from_head = flatten_dict(unfreeze(model.params))
-
-            # convert Flax model to PyTorch model
-            pt_model_class = getattr(transformers, model_class.__name__[4:])  # Skip the "Flax" at the beginning
-            pt_model = pt_model_class(config).eval()
-            pt_model = load_flax_weights_in_pytorch_model(pt_model, model.params)
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                pt_model.save_pretrained(tmpdirname)
-                base_model = base_class.from_pretrained(tmpdirname, from_pt=True)
-
-                base_params = flatten_dict(unfreeze(base_model.params))
-
-                for key in base_params_from_head.keys():
-                    max_diff = (base_params[key] - base_params_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    # overwrite since special base model prefix is used
-    @is_pt_flax_cross_test
-    def test_save_load_bf16_to_base_pt(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = FLAX_MODEL_MAPPING[config.__class__]
-
-        for model_class in self.all_model_classes:
-            if model_class == base_class:
-                continue
-
-            model = model_class(config)
-            model.params = model.to_bf16(model.params)
-            base_params_from_head = flatten_dict(unfreeze(model.params))
-
-            # convert Flax model to PyTorch model
-            pt_model_class = getattr(transformers, model_class.__name__[4:])  # Skip the "Flax" at the beginning
-            pt_model = pt_model_class(config).eval()
-            pt_model = load_flax_weights_in_pytorch_model(pt_model, model.params)
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                pt_model.save_pretrained(tmpdirname)
-                base_model = base_class.from_pretrained(tmpdirname, from_pt=True)
-
-                base_params = flatten_dict(unfreeze(base_model.params))
-
-                for key in base_params_from_head.keys():
-                    max_diff = (base_params[key] - base_params_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-

@require_sentencepiece
@require_tokenizers
--- a/tests/models/vision_encoder_decoder/test_modeling_flax_vision_encoder_decoder.py
+++ b/tests/models/vision_encoder_decoder/test_modeling_flax_vision_encoder_decoder.py
@@ -19,8 +19,8 @@ import unittest

 import numpy as np

-from transformers import is_flax_available, is_torch_available, is_vision_available
-from transformers.testing_utils import is_pt_flax_cross_test, require_flax, require_vision, slow, torch_device
+from transformers import is_flax_available, is_vision_available
+from transformers.testing_utils import require_flax, require_vision, slow

 from ...test_modeling_flax_common import floats_tensor, ids_tensor
 from ..gpt2.test_modeling_flax_gpt2 import FlaxGPT2ModelTester
@@ -35,15 +35,7 @@ if is_flax_available():
        FlaxViTModel,
        VisionEncoderDecoderConfig,
    )
-    from transformers.modeling_flax_pytorch_utils import (
-        convert_pytorch_state_dict_to_flax,
-        load_flax_weights_in_pytorch_model,
-    )

-if is_torch_available():
-    import torch
-
-    from transformers import VisionEncoderDecoderModel

 if is_vision_available():
    from PIL import Image
@@ -235,68 +227,6 @@ class FlaxEncoderDecoderMixin:
        generated_sequences = generated_output.sequences
        self.assertEqual(generated_sequences.shape, (pixel_values.shape[0],) + (decoder_config.max_length,))

-    def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict):
-        pt_model.to(torch_device)
-        pt_model.eval()
-
-        # prepare inputs
-        flax_inputs = inputs_dict
-        pt_inputs = {k: torch.tensor(v.tolist()).to(torch_device) for k, v in flax_inputs.items()}
-
-        with torch.no_grad():
-            pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-        fx_outputs = fx_model(**inputs_dict).to_tuple()
-        self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-        for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-            self.assert_almost_equals(fx_output, pt_output.numpy(force=True), 1e-5)
-
-        # PT -> Flax
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            pt_model.save_pretrained(tmpdirname)
-            fx_model_loaded = FlaxVisionEncoderDecoderModel.from_pretrained(tmpdirname, from_pt=True)
-
-        fx_outputs_loaded = fx_model_loaded(**inputs_dict).to_tuple()
-        self.assertEqual(len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-        for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs):
-            self.assert_almost_equals(fx_output_loaded, pt_output.numpy(force=True), 1e-5)
-
-        # Flax -> PT
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            fx_model.save_pretrained(tmpdirname)
-            pt_model_loaded = VisionEncoderDecoderModel.from_pretrained(tmpdirname, from_flax=True)
-
-        pt_model_loaded.to(torch_device)
-        pt_model_loaded.eval()
-
-        with torch.no_grad():
-            pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
-
-        self.assertEqual(len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch")
-        for fx_output, pt_output_loaded in zip(fx_outputs, pt_outputs_loaded):
-            self.assert_almost_equals(fx_output, pt_output_loaded.numpy(force=True), 1e-5)
-
-    def check_equivalence_pt_to_flax(self, config, decoder_config, inputs_dict):
-        encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)
-
-        pt_model = VisionEncoderDecoderModel(encoder_decoder_config)
-        fx_model = FlaxVisionEncoderDecoderModel(encoder_decoder_config)
-
-        fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
-        fx_model.params = fx_state
-
-        self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict)
-
-    def check_equivalence_flax_to_pt(self, config, decoder_config, inputs_dict):
-        encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)
-
-        pt_model = VisionEncoderDecoderModel(encoder_decoder_config)
-        fx_model = FlaxVisionEncoderDecoderModel(encoder_decoder_config)
-
-        pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
-
-        self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict)
-
    def test_encoder_decoder_model_from_pretrained_configs(self):
        config_inputs_dict = self.prepare_config_and_inputs()
        self.check_encoder_decoder_model_from_pretrained_configs(**config_inputs_dict)
@@ -325,39 +255,6 @@ class FlaxEncoderDecoderMixin:
        diff = np.abs((a - b)).max()
        self.assertLessEqual(diff, tol, f"Difference between torch and flax is {diff} (>= {tol}).")

-    @is_pt_flax_cross_test
-    def test_pt_flax_equivalence(self):
-        config_inputs_dict = self.prepare_config_and_inputs()
-        config = config_inputs_dict.pop("config")
-        decoder_config = config_inputs_dict.pop("decoder_config")
-
-        inputs_dict = config_inputs_dict
-        # `encoder_hidden_states` is not used in model call/forward
-        del inputs_dict["encoder_hidden_states"]
-
-        # Avoid the case where a sequence has no place to attend (after combined with the causal attention mask)
-        batch_size = inputs_dict["decoder_attention_mask"].shape[0]
-        inputs_dict["decoder_attention_mask"] = np.concatenate(
-            [np.ones(shape=(batch_size, 1)), inputs_dict["decoder_attention_mask"][:, 1:]], axis=1
-        )
-
-        # Flax models don't use the `use_cache` option and cache is not returned as a default.
-        # So we disable `use_cache` here for PyTorch model.
-        decoder_config.use_cache = False
-
-        self.assertTrue(decoder_config.cross_attention_hidden_size is None)
-
-        # check without `enc_to_dec_proj` projection
-        self.assertTrue(config.hidden_size == decoder_config.hidden_size)
-        self.check_equivalence_pt_to_flax(config, decoder_config, inputs_dict)
-        self.check_equivalence_flax_to_pt(config, decoder_config, inputs_dict)
-
-        # check `enc_to_dec_proj` work as expected
-        decoder_config.hidden_size = decoder_config.hidden_size * 2
-        self.assertTrue(config.hidden_size != decoder_config.hidden_size)
-        self.check_equivalence_pt_to_flax(config, decoder_config, inputs_dict)
-        self.check_equivalence_flax_to_pt(config, decoder_config, inputs_dict)
-
    @slow
    def test_real_model_save_load_from_pretrained(self):
        model_2 = self.get_pretrained_model()
--- a/tests/models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py
+++ b/tests/models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py
@@ -20,15 +20,8 @@ import unittest

 import numpy as np

-from transformers.testing_utils import (
-    is_pt_flax_cross_test,
-    require_flax,
-    require_torch,
-    require_vision,
-    slow,
-    torch_device,
-)
-from transformers.utils import is_flax_available, is_torch_available, is_vision_available
+from transformers.testing_utils import require_flax, require_torch, require_vision, slow
+from transformers.utils import is_flax_available, is_vision_available

 from ...test_modeling_flax_common import floats_tensor, ids_tensor, random_attention_mask
 from ..bert.test_modeling_flax_bert import FlaxBertModelTester
@@ -45,17 +38,8 @@ if is_flax_available():
        VisionTextDualEncoderConfig,
        VisionTextDualEncoderProcessor,
    )
-    from transformers.modeling_flax_pytorch_utils import (
-        convert_pytorch_state_dict_to_flax,
-        load_flax_weights_in_pytorch_model,
-    )


-if is_torch_available():
-    import torch
-
-    from transformers import VisionTextDualEncoderModel
-
 if is_vision_available():
    from PIL import Image

@@ -154,68 +138,6 @@ class VisionTextDualEncoderMixin:
            (text_config.num_attention_heads, input_ids.shape[-1], input_ids.shape[-1]),
        )

-    def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict):
-        pt_model.to(torch_device)
-        pt_model.eval()
-
-        # prepare inputs
-        flax_inputs = inputs_dict
-        pt_inputs = {k: torch.tensor(v.tolist()).to(torch_device) for k, v in flax_inputs.items()}
-
-        with torch.no_grad():
-            pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-        fx_outputs = fx_model(**inputs_dict).to_tuple()
-        self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-        for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]):
-            self.assert_almost_equals(fx_output, pt_output.numpy(force=True), 4e-2)
-
-        # PT -> Flax
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            pt_model.save_pretrained(tmpdirname)
-            fx_model_loaded = FlaxVisionTextDualEncoderModel.from_pretrained(tmpdirname, from_pt=True)
-
-        fx_outputs_loaded = fx_model_loaded(**inputs_dict).to_tuple()
-        self.assertEqual(len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-        for fx_output_loaded, pt_output in zip(fx_outputs_loaded[:4], pt_outputs[:4]):
-            self.assert_almost_equals(fx_output_loaded, pt_output.numpy(force=True), 4e-2)
-
-        # Flax -> PT
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            fx_model.save_pretrained(tmpdirname)
-            pt_model_loaded = VisionTextDualEncoderModel.from_pretrained(tmpdirname, from_flax=True)
-
-        pt_model_loaded.to(torch_device)
-        pt_model_loaded.eval()
-
-        with torch.no_grad():
-            pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
-
-        self.assertEqual(len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch")
-        for fx_output, pt_output_loaded in zip(fx_outputs[:4], pt_outputs_loaded[:4]):
-            self.assert_almost_equals(fx_output, pt_output_loaded.numpy(force=True), 4e-2)
-
-    def check_equivalence_pt_to_flax(self, vision_config, text_config, inputs_dict):
-        config = VisionTextDualEncoderConfig.from_vision_text_configs(vision_config, text_config)
-
-        pt_model = VisionTextDualEncoderModel(config)
-        fx_model = FlaxVisionTextDualEncoderModel(config)
-
-        fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
-        fx_model.params = fx_state
-
-        self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict)
-
-    def check_equivalence_flax_to_pt(self, vision_config, text_config, inputs_dict):
-        config = VisionTextDualEncoderConfig.from_vision_text_configs(vision_config, text_config)
-
-        pt_model = VisionTextDualEncoderModel(config)
-        fx_model = FlaxVisionTextDualEncoderModel(config)
-
-        pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
-
-        self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict)
-
    def test_model_from_pretrained_configs(self):
        inputs_dict = self.prepare_config_and_inputs()
        self.check_model_from_pretrained_configs(**inputs_dict)
@@ -232,17 +154,6 @@ class VisionTextDualEncoderMixin:
        inputs_dict = self.prepare_config_and_inputs()
        self.check_vision_text_output_attention(**inputs_dict)

-    @is_pt_flax_cross_test
-    def test_pt_flax_equivalence(self):
-        config_inputs_dict = self.prepare_config_and_inputs()
-        vision_config = config_inputs_dict.pop("vision_config")
-        text_config = config_inputs_dict.pop("text_config")
-
-        inputs_dict = config_inputs_dict
-
-        self.check_equivalence_pt_to_flax(vision_config, text_config, inputs_dict)
-        self.check_equivalence_flax_to_pt(vision_config, text_config, inputs_dict)
-
    @slow
    def test_real_model_save_load_from_pretrained(self):
        model_2, inputs = self.get_pretrained_model_and_inputs()
--- a/tests/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py
+++ b/tests/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py
@@ -20,8 +20,8 @@ import unittest

 import numpy as np

-from transformers.testing_utils import is_pt_flax_cross_test, require_torch, require_vision, slow, torch_device
-from transformers.utils import is_flax_available, is_torch_available, is_vision_available
+from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.utils import is_torch_available, is_vision_available

 from ...test_modeling_common import floats_tensor, ids_tensor, random_attention_mask
 from ..bert.test_modeling_bert import BertModelTester
@@ -44,12 +44,6 @@ if is_torch_available():
        ViTModel,
    )

-if is_flax_available():
-    from transformers import FlaxVisionTextDualEncoderModel
-    from transformers.modeling_flax_pytorch_utils import (
-        convert_pytorch_state_dict_to_flax,
-        load_flax_weights_in_pytorch_model,
-    )

 if is_vision_available():
    from PIL import Image
@@ -172,69 +166,6 @@ class VisionTextDualEncoderMixin:
        diff = np.abs((a - b)).max()
        self.assertLessEqual(diff, tol, f"Difference between torch and flax is {diff} (>= {tol}).")

-    def check_pt_flax_equivalence(self, pt_model, fx_model, input_ids, attention_mask, pixel_values, **kwargs):
-        pt_model.to(torch_device)
-        pt_model.eval()
-
-        # prepare inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask, "pixel_values": pixel_values}
-        pt_inputs = inputs_dict
-        flax_inputs = {k: v.numpy(force=True) for k, v in pt_inputs.items()}
-
-        with torch.no_grad():
-            pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-        fx_outputs = fx_model(**flax_inputs).to_tuple()
-        self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-        for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]):
-            self.assert_almost_equals(fx_output, pt_output.numpy(force=True), 4e-2)
-
-        # PT -> Flax
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            pt_model.save_pretrained(tmpdirname)
-            fx_model_loaded = FlaxVisionTextDualEncoderModel.from_pretrained(tmpdirname, from_pt=True)
-
-        fx_outputs_loaded = fx_model_loaded(**flax_inputs).to_tuple()
-        self.assertEqual(len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-        for fx_output_loaded, pt_output in zip(fx_outputs_loaded[:4], pt_outputs[:4]):
-            self.assert_almost_equals(fx_output_loaded, pt_output.numpy(force=True), 4e-2)
-
-        # Flax -> PT
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            fx_model.save_pretrained(tmpdirname)
-            pt_model_loaded = VisionTextDualEncoderModel.from_pretrained(tmpdirname, from_flax=True)
-
-        pt_model_loaded.to(torch_device)
-        pt_model_loaded.eval()
-
-        with torch.no_grad():
-            pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
-
-        self.assertEqual(len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch")
-        for fx_output, pt_output_loaded in zip(fx_outputs[:4], pt_outputs_loaded[:4]):
-            self.assert_almost_equals(fx_output, pt_output_loaded.numpy(force=True), 4e-2)
-
-    def check_equivalence_pt_to_flax(self, vision_config, text_config, inputs_dict):
-        config = VisionTextDualEncoderConfig.from_vision_text_configs(vision_config, text_config)
-
-        pt_model = VisionTextDualEncoderModel(config)
-        fx_model = FlaxVisionTextDualEncoderModel(config)
-
-        fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
-        fx_model.params = fx_state
-
-        self.check_pt_flax_equivalence(pt_model, fx_model, **inputs_dict)
-
-    def check_equivalence_flax_to_pt(self, vision_config, text_config, inputs_dict):
-        config = VisionTextDualEncoderConfig.from_vision_text_configs(vision_config, text_config)
-
-        pt_model = VisionTextDualEncoderModel(config)
-        fx_model = FlaxVisionTextDualEncoderModel(config)
-
-        pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
-
-        self.check_pt_flax_equivalence(pt_model, fx_model, **inputs_dict)
-
    def test_vision_text_dual_encoder_model(self):
        inputs_dict = self.prepare_config_and_inputs()
        self.check_vision_text_dual_encoder_model(**inputs_dict)
@@ -255,17 +186,6 @@ class VisionTextDualEncoderMixin:
        inputs_dict = self.prepare_config_and_inputs()
        self.check_vision_text_output_attention(**inputs_dict)

-    @is_pt_flax_cross_test
-    def test_pt_flax_equivalence(self):
-        config_inputs_dict = self.prepare_config_and_inputs()
-        vision_config = config_inputs_dict.pop("vision_config")
-        text_config = config_inputs_dict.pop("text_config")
-
-        inputs_dict = config_inputs_dict
-
-        self.check_equivalence_pt_to_flax(vision_config, text_config, inputs_dict)
-        self.check_equivalence_flax_to_pt(vision_config, text_config, inputs_dict)
-
    @slow
    def test_real_model_save_load_from_pretrained(self):
        model_2, inputs = self.get_pretrained_model_and_inputs()
@@ -429,10 +349,6 @@ class DeiTRobertaModelTest(VisionTextDualEncoderMixin, unittest.TestCase):
            "text_choice_labels": choice_labels,
        }

-    @unittest.skip(reason="DeiT is not available in Flax")
-    def test_pt_flax_equivalence(self):
-        pass
-

@require_torch
 class CLIPVisionBertModelTest(VisionTextDualEncoderMixin, unittest.TestCase):
--- a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py
+++ b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py
@@ -24,9 +24,7 @@ from datasets import load_dataset
 from transformers import Wav2Vec2Config, is_flax_available
 from transformers.testing_utils import (
    CaptureLogger,
-    is_flaky,
    is_librosa_available,
-    is_pt_flax_cross_test,
    is_pyctcdecode_available,
    require_flax,
    require_librosa,
@@ -350,11 +348,6 @@ class FlaxWav2Vec2ModelTest(FlaxModelTesterMixin, unittest.TestCase):
            outputs = model(np.ones((1, 1024), dtype="f4"))
            self.assertIsNotNone(outputs)

-    @is_pt_flax_cross_test
-    @is_flaky()
-    def test_equivalence_pt_to_flax(self):
-        super().test_equivalence_pt_to_flax()
-

@require_flax
 class FlaxWav2Vec2UtilsTest(unittest.TestCase):
--- a/tests/models/wav2vec2/test_modeling_wav2vec2.py
+++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py
@@ -31,7 +31,6 @@ from transformers.testing_utils import (
    CaptureLogger,
    cleanup,
    is_flaky,
-    is_pt_flax_cross_test,
    is_pyctcdecode_available,
    is_torchaudio_available,
    require_flash_attn,
@@ -569,16 +568,6 @@ class Wav2Vec2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
    def test_model_get_set_embeddings(self):
        pass

-    @is_pt_flax_cross_test
-    @unittest.skip(reason="Non-rubst architecture does not exist in Flax")
-    def test_equivalence_flax_to_pt(self):
-        pass
-
-    @is_pt_flax_cross_test
-    @unittest.skip(reason="Non-rubst architecture does not exist in Flax")
-    def test_equivalence_pt_to_flax(self):
-        pass
-
    def test_retain_grad_hidden_states_attentions(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.output_hidden_states = True
--- a/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py
+++ b/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py
@@ -21,7 +21,6 @@ from datasets import load_dataset

 from transformers import Wav2Vec2BertConfig, is_torch_available
 from transformers.testing_utils import (
-    is_pt_flax_cross_test,
    require_torch,
    require_torch_accelerator,
    require_torch_fp16,
@@ -559,18 +558,6 @@ class Wav2Vec2BertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Test
    def test_model_get_set_embeddings(self):
        pass

-    # Ignore copy
-    @unittest.skip(reason="non-robust architecture does not exist in Flax")
-    @is_pt_flax_cross_test
-    def test_equivalence_flax_to_pt(self):
-        pass
-
-    # Ignore copy
-    @unittest.skip(reason="non-robust architecture does not exist in Flax")
-    @is_pt_flax_cross_test
-    def test_equivalence_pt_to_flax(self):
-        pass
-
    def test_retain_grad_hidden_states_attentions(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.output_hidden_states = True
--- a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py
+++ b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py
@@ -24,7 +24,6 @@ from datasets import load_dataset
 from transformers import Wav2Vec2ConformerConfig, is_torch_available
 from transformers.testing_utils import (
    is_flaky,
-    is_pt_flax_cross_test,
    require_torch,
    require_torch_accelerator,
    require_torch_fp16,
@@ -535,16 +534,6 @@ class Wav2Vec2ConformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest
    def test_model_get_set_embeddings(self):
        pass

-    @is_pt_flax_cross_test
-    @unittest.skip(reason="Non-robust architecture does not exist in Flax")
-    def test_equivalence_flax_to_pt(self):
-        pass
-
-    @is_pt_flax_cross_test
-    @unittest.skip(reason="Non-robust architecture does not exist in Flax")
-    def test_equivalence_pt_to_flax(self):
-        pass
-
    def test_retain_grad_hidden_states_attentions(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.output_hidden_states = True
--- a/tests/models/whisper/test_modeling_flax_whisper.py
+++ b/tests/models/whisper/test_modeling_flax_whisper.py
@@ -17,9 +17,8 @@ import inspect
 import tempfile
 import unittest

-import transformers
 from transformers import WhisperConfig, is_flax_available
-from transformers.testing_utils import is_pt_flax_cross_test, require_flax, slow
+from transformers.testing_utils import require_flax, slow
 from transformers.utils import cached_property
 from transformers.utils.import_utils import is_datasets_available

@@ -45,7 +44,6 @@ if is_flax_available():
        WhisperFeatureExtractor,
        WhisperProcessor,
    )
-    from transformers.modeling_flax_pytorch_utils import load_flax_weights_in_pytorch_model
    from transformers.models.whisper.modeling_flax_whisper import sinusoidal_embedding_init


@@ -245,99 +243,6 @@ class FlaxWhisperModelTest(FlaxModelTesterMixin, unittest.TestCase):
                for jitted_output, output in zip(jitted_outputs, outputs):
                    self.assertEqual(jitted_output.shape, output.shape)

-    def check_pt_flax_outputs(self, fx_outputs, pt_outputs, model_class, tol=5e-5, name="outputs", attributes=None):
-        # We override with a slightly higher tol value, as test recently became flaky
-        super().check_pt_flax_outputs(fx_outputs, pt_outputs, model_class, tol, name, attributes)
-
-    # overwrite because of `input_features`
-    @is_pt_flax_cross_test
-    def test_save_load_bf16_to_base_pt(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = make_partial_class(FLAX_MODEL_MAPPING[config.__class__], input_shape=self.init_shape)
-
-        for model_class in self.all_model_classes:
-            if model_class.__name__ == base_class.__name__:
-                continue
-
-            model = model_class(config)
-            model.params = model.to_bf16(model.params)
-            base_params_from_head = flatten_dict(unfreeze(model.params[model.base_model_prefix]))
-
-            # convert Flax model to PyTorch model
-            pt_model_class = getattr(transformers, model_class.__name__[4:])  # Skip the "Flax" at the beginning
-            pt_model = pt_model_class(config).eval()
-            pt_model = load_flax_weights_in_pytorch_model(pt_model, model.params)
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                pt_model.save_pretrained(tmpdirname)
-                base_model = base_class.from_pretrained(tmpdirname, from_pt=True)
-
-                base_params = flatten_dict(unfreeze(base_model.params))
-
-                for key in base_params_from_head.keys():
-                    max_diff = (base_params[key] - base_params_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    # overwrite because of `input_features`
-    @is_pt_flax_cross_test
-    def test_save_load_from_base_pt(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = make_partial_class(FLAX_MODEL_MAPPING[config.__class__], input_shape=self.init_shape)
-
-        for model_class in self.all_model_classes:
-            if model_class.__name__ == base_class.__name__:
-                continue
-
-            model = base_class(config)
-            base_params = flatten_dict(unfreeze(model.params))
-
-            # convert Flax model to PyTorch model
-            pt_model_class = getattr(transformers, base_class.__name__[4:])  # Skip the "Flax" at the beginning
-            pt_model = pt_model_class(config).eval()
-            pt_model = load_flax_weights_in_pytorch_model(pt_model, model.params)
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                # save pt model
-                pt_model.save_pretrained(tmpdirname)
-                head_model = model_class.from_pretrained(tmpdirname, from_pt=True)
-
-                base_param_from_head = flatten_dict(unfreeze(head_model.params[head_model.base_model_prefix]))
-
-                for key in base_param_from_head.keys():
-                    max_diff = (base_params[key] - base_param_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    # overwrite because of `input_features`
-    @is_pt_flax_cross_test
-    def test_save_load_to_base_pt(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = make_partial_class(FLAX_MODEL_MAPPING[config.__class__], input_shape=self.init_shape)
-
-        for model_class in self.all_model_classes:
-            if model_class.__name__ == base_class.__name__:
-                continue
-
-            model = model_class(config)
-            base_params_from_head = flatten_dict(unfreeze(model.params[model.base_model_prefix]))
-
-            # convert Flax model to PyTorch model
-            pt_model_class = getattr(transformers, model_class.__name__[4:])  # Skip the "Flax" at the beginning
-            pt_model = pt_model_class(config).eval()
-            pt_model = load_flax_weights_in_pytorch_model(pt_model, model.params)
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                pt_model.save_pretrained(tmpdirname)
-                base_model = base_class.from_pretrained(tmpdirname, from_pt=True)
-
-                base_params = flatten_dict(unfreeze(base_model.params))
-
-                for key in base_params_from_head.keys():
-                    max_diff = (base_params[key] - base_params_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
    # overwrite because of `input_features`
    def test_save_load_from_base(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
@@ -899,18 +804,3 @@ class WhisperEncoderModelTest(FlaxModelTesterMixin, unittest.TestCase):
    # WhisperEncoder does not have any base model
    def test_save_load_from_base(self):
        pass
-
-    # WhisperEncoder does not have any base model
-    @is_pt_flax_cross_test
-    def test_save_load_from_base_pt(self):
-        pass
-
-    # WhisperEncoder does not have any base model
-    @is_pt_flax_cross_test
-    def test_save_load_to_base_pt(self):
-        pass
-
-    # WhisperEncoder does not have any base model
-    @is_pt_flax_cross_test
-    def test_save_load_bf16_to_base_pt(self):
-        pass
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -32,7 +32,6 @@ import transformers
 from transformers import WhisperConfig
 from transformers.testing_utils import (
    is_flaky,
-    is_pt_flax_cross_test,
    require_flash_attn,
    require_non_xpu,
    require_torch,
@@ -44,7 +43,7 @@ from transformers.testing_utils import (
    slow,
    torch_device,
 )
-from transformers.utils import cached_property, is_flax_available, is_torch_available, is_torchaudio_available
+from transformers.utils import cached_property, is_torch_available, is_torchaudio_available
 from transformers.utils.import_utils import is_datasets_available

 from ...generation.test_utils import GenerationTesterMixin
@@ -155,15 +154,6 @@ if is_torchaudio_available():
    import torchaudio


-if is_flax_available():
-    import jax.numpy as jnp
-
-    from transformers.modeling_flax_pytorch_utils import (
-        convert_pytorch_state_dict_to_flax,
-        load_flax_weights_in_pytorch_model,
-    )
-
-
 def prepare_whisper_inputs_dict(
    config,
    input_features,
@@ -1069,161 +1059,6 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi

            self.assertTrue(models_equal)

-    def check_pt_flax_outputs(self, fx_outputs, pt_outputs, model_class, tol=5e-5, name="outputs", attributes=None):
-        # We override with a slightly higher tol value, as test recently became flaky
-        super().check_pt_flax_outputs(fx_outputs, pt_outputs, model_class, tol, name, attributes)
-
-    @is_pt_flax_cross_test
-    def test_equivalence_pt_to_flax(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        init_shape = (1,) + inputs_dict["input_features"].shape[1:]
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                fx_model_class_name = "Flax" + model_class.__name__
-
-                if not hasattr(transformers, fx_model_class_name):
-                    self.skipTest(reason="No Flax model exists for this class")
-
-                # Output all for aggressive testing
-                config.output_hidden_states = True
-                config.output_attentions = self.has_attentions
-
-                fx_model_class = getattr(transformers, fx_model_class_name)
-
-                # load PyTorch class
-                pt_model = model_class(config).eval()
-                # Flax models don't use the `use_cache` option and cache is not returned as a default.
-                # So we disable `use_cache` here for PyTorch model.
-                pt_model.config.use_cache = False
-
-                # load Flax class
-                fx_model = fx_model_class(config, input_shape=init_shape, dtype=jnp.float32)
-
-                # make sure only flax inputs are forward that actually exist in function args
-                fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys()
-
-                # prepare inputs
-                pt_inputs = self._prepare_for_class(inputs_dict, model_class)
-
-                # remove function args that don't exist in Flax
-                pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys}
-
-                # send pytorch inputs to the correct device
-                pt_inputs = {
-                    k: v.to(device=torch_device) if isinstance(v, torch.Tensor) else v for k, v in pt_inputs.items()
-                }
-
-                # convert inputs to Flax
-                fx_inputs = {k: np.array(v.to("cpu")) for k, v in pt_inputs.items() if torch.is_tensor(v)}
-
-                fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
-                fx_model.params = fx_state
-
-                # send pytorch model to the correct device
-                pt_model.to(torch_device)
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs)
-                fx_outputs = fx_model(**fx_inputs)
-
-                fx_keys = tuple([k for k, v in fx_outputs.items() if v is not None])
-                pt_keys = tuple([k for k, v in pt_outputs.items() if v is not None])
-
-                self.assertEqual(fx_keys, pt_keys)
-                self.check_pt_flax_outputs(fx_outputs, pt_outputs, model_class)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    pt_model.save_pretrained(tmpdirname)
-                    fx_model_loaded = fx_model_class.from_pretrained(tmpdirname, input_shape=init_shape, from_pt=True)
-
-                fx_outputs_loaded = fx_model_loaded(**fx_inputs)
-
-                fx_keys = tuple([k for k, v in fx_outputs_loaded.items() if v is not None])
-                pt_keys = tuple([k for k, v in pt_outputs.items() if v is not None])
-
-                self.assertEqual(fx_keys, pt_keys)
-                self.check_pt_flax_outputs(fx_outputs_loaded, pt_outputs, model_class)
-
-    @is_pt_flax_cross_test
-    def test_equivalence_flax_to_pt(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        init_shape = (1,) + inputs_dict["input_features"].shape[1:]
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                fx_model_class_name = "Flax" + model_class.__name__
-
-                if not hasattr(transformers, fx_model_class_name):
-                    self.skipTest(reason="No Flax model exists for this class")
-
-                # Output all for aggressive testing
-                config.output_hidden_states = True
-                config.output_attentions = self.has_attentions
-
-                fx_model_class = getattr(transformers, fx_model_class_name)
-
-                # load PyTorch class
-                pt_model = model_class(config).eval()
-                # Flax models don't use the `use_cache` option and cache is not returned as a default.
-                # So we disable `use_cache` here for PyTorch model.
-                pt_model.config.use_cache = False
-
-                # load Flax class
-                fx_model = fx_model_class(config, input_shape=init_shape, dtype=jnp.float32)
-
-                # make sure only flax inputs are forward that actually exist in function args
-                fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys()
-
-                # prepare inputs
-                pt_inputs = self._prepare_for_class(inputs_dict, model_class)
-
-                # remove function args that don't exist in Flax
-                pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys}
-
-                # send pytorch inputs to the correct device
-                pt_inputs = {
-                    k: v.to(device=torch_device) if isinstance(v, torch.Tensor) else v for k, v in pt_inputs.items()
-                }
-
-                # convert inputs to Flax
-                fx_inputs = {k: np.array(v.to("cpu")) for k, v in pt_inputs.items() if torch.is_tensor(v)}
-
-                pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
-
-                # make sure weights are tied in PyTorch
-                pt_model.tie_weights()
-
-                # send pytorch model to the correct device
-                pt_model.to(torch_device)
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs)
-                fx_outputs = fx_model(**fx_inputs)
-
-                fx_keys = tuple([k for k, v in fx_outputs.items() if v is not None])
-                pt_keys = tuple([k for k, v in pt_outputs.items() if v is not None])
-
-                self.assertEqual(fx_keys, pt_keys)
-                self.check_pt_flax_outputs(fx_outputs, pt_outputs, model_class)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    fx_model.save_pretrained(tmpdirname)
-                    pt_model_loaded = model_class.from_pretrained(tmpdirname, from_flax=True)
-
-                # send pytorch model to the correct device
-                pt_model_loaded.to(torch_device)
-                pt_model_loaded.eval()
-
-                with torch.no_grad():
-                    pt_outputs_loaded = pt_model_loaded(**pt_inputs)
-
-                fx_keys = tuple([k for k, v in fx_outputs.items() if v is not None])
-                pt_keys = tuple([k for k, v in pt_outputs_loaded.items() if v is not None])
-
-                self.assertEqual(fx_keys, pt_keys)
-                self.check_pt_flax_outputs(fx_outputs, pt_outputs_loaded, model_class)
-
    def test_mask_feature_prob(self):
        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.mask_feature_prob = 0.2
@@ -3622,157 +3457,6 @@ class WhisperEncoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.
    def test_resize_tokens_embeddings(self):
        pass

-    @is_pt_flax_cross_test
-    def test_equivalence_pt_to_flax(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        init_shape = (1,) + inputs_dict["input_features"].shape[1:]
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                fx_model_class_name = "Flax" + model_class.__name__
-
-                if not hasattr(transformers, fx_model_class_name):
-                    self.skipTest(reason="Flax model does not exist")
-
-                # Output all for aggressive testing
-                config.output_hidden_states = True
-                config.output_attentions = self.has_attentions
-
-                fx_model_class = getattr(transformers, fx_model_class_name)
-
-                # load PyTorch class
-                pt_model = model_class(config).eval()
-                # Flax models don't use the `use_cache` option and cache is not returned as a default.
-                # So we disable `use_cache` here for PyTorch model.
-                pt_model.config.use_cache = False
-
-                # load Flax class
-                fx_model = fx_model_class(config, input_shape=init_shape, dtype=jnp.float32)
-
-                # make sure only flax inputs are forward that actually exist in function args
-                fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys()
-
-                # prepare inputs
-                pt_inputs = self._prepare_for_class(inputs_dict, model_class)
-
-                # remove function args that don't exist in Flax
-                pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys}
-
-                # send pytorch inputs to the correct device
-                pt_inputs = {
-                    k: v.to(device=torch_device) if isinstance(v, torch.Tensor) else v for k, v in pt_inputs.items()
-                }
-
-                # convert inputs to Flax
-                fx_inputs = {k: np.array(v.to("cpu")) for k, v in pt_inputs.items() if torch.is_tensor(v)}
-
-                fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
-                fx_model.params = fx_state
-
-                # send pytorch model to the correct device
-                pt_model.to(torch_device)
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs)
-                fx_outputs = fx_model(**fx_inputs)
-
-                fx_keys = tuple([k for k, v in fx_outputs.items() if v is not None])
-                pt_keys = tuple([k for k, v in pt_outputs.items() if v is not None])
-
-                self.assertEqual(fx_keys, pt_keys)
-                self.check_pt_flax_outputs(fx_outputs, pt_outputs, model_class)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    pt_model.save_pretrained(tmpdirname)
-                    fx_model_loaded = fx_model_class.from_pretrained(tmpdirname, input_shape=init_shape, from_pt=True)
-
-                fx_outputs_loaded = fx_model_loaded(**fx_inputs)
-
-                fx_keys = tuple([k for k, v in fx_outputs_loaded.items() if v is not None])
-                pt_keys = tuple([k for k, v in pt_outputs.items() if v is not None])
-
-                self.assertEqual(fx_keys, pt_keys)
-                self.check_pt_flax_outputs(fx_outputs_loaded, pt_outputs, model_class)
-
-    @is_pt_flax_cross_test
-    def test_equivalence_flax_to_pt(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        init_shape = (1,) + inputs_dict["input_features"].shape[1:]
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                fx_model_class_name = "Flax" + model_class.__name__
-
-                if not hasattr(transformers, fx_model_class_name):
-                    self.skipTest("Flax model does not exist")
-
-                # Output all for aggressive testing
-                config.output_hidden_states = True
-                config.output_attentions = self.has_attentions
-
-                fx_model_class = getattr(transformers, fx_model_class_name)
-
-                # load PyTorch class
-                pt_model = model_class(config).eval()
-                # Flax models don't use the `use_cache` option and cache is not returned as a default.
-                # So we disable `use_cache` here for PyTorch model.
-                pt_model.config.use_cache = False
-
-                # load Flax class
-                fx_model = fx_model_class(config, input_shape=init_shape, dtype=jnp.float32)
-
-                # make sure only flax inputs are forward that actually exist in function args
-                fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys()
-
-                # prepare inputs
-                pt_inputs = self._prepare_for_class(inputs_dict, model_class)
-
-                # remove function args that don't exist in Flax
-                pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys}
-
-                # send pytorch inputs to the correct device
-                pt_inputs = {
-                    k: v.to(device=torch_device) if isinstance(v, torch.Tensor) else v for k, v in pt_inputs.items()
-                }
-
-                # convert inputs to Flax
-                fx_inputs = {k: np.array(v.to("cpu")) for k, v in pt_inputs.items() if torch.is_tensor(v)}
-
-                pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
-
-                # make sure weights are tied in PyTorch
-                pt_model.tie_weights()
-
-                # send pytorch model to the correct device
-                pt_model.to(torch_device)
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs)
-                fx_outputs = fx_model(**fx_inputs)
-
-                fx_keys = tuple([k for k, v in fx_outputs.items() if v is not None])
-                pt_keys = tuple([k for k, v in pt_outputs.items() if v is not None])
-
-                self.assertEqual(fx_keys, pt_keys)
-                self.check_pt_flax_outputs(fx_outputs, pt_outputs, model_class)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    fx_model.save_pretrained(tmpdirname)
-                    pt_model_loaded = model_class.from_pretrained(tmpdirname, from_flax=True)
-
-                # send pytorch model to the correct device
-                pt_model_loaded.to(torch_device)
-                pt_model_loaded.eval()
-
-                with torch.no_grad():
-                    pt_outputs_loaded = pt_model_loaded(**pt_inputs)
-
-                fx_keys = tuple([k for k, v in fx_outputs.items() if v is not None])
-                pt_keys = tuple([k for k, v in pt_outputs_loaded.items() if v is not None])
-
-                self.assertEqual(fx_keys, pt_keys)
-                self.check_pt_flax_outputs(fx_outputs, pt_outputs_loaded, model_class)
-

 class WhisperStandaloneDecoderModelTester:
    def __init__(
--- a/tests/models/xglm/test_modeling_flax_xglm.py
+++ b/tests/models/xglm/test_modeling_flax_xglm.py
@@ -14,12 +14,10 @@
 # limitations under the License.


-import tempfile
 import unittest

-import transformers
-from transformers import XGLMConfig, XGLMTokenizer, is_flax_available, is_torch_available
-from transformers.testing_utils import is_pt_flax_cross_test, require_flax, require_sentencepiece, slow
+from transformers import XGLMConfig, XGLMTokenizer, is_flax_available
+from transformers.testing_utils import require_flax, require_sentencepiece, slow

 from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask

@@ -29,17 +27,9 @@ if is_flax_available():
    import jax.numpy as jnp
    import numpy as np

-    from transformers.modeling_flax_pytorch_utils import (
-        convert_pytorch_state_dict_to_flax,
-        load_flax_weights_in_pytorch_model,
-    )
    from transformers.models.xglm.modeling_flax_xglm import FlaxXGLMForCausalLM, FlaxXGLMModel


-if is_torch_available():
-    import torch
-
-
@require_flax
 class FlaxXGLMModelTester:
    def __init__(
@@ -220,108 +210,6 @@ class FlaxXGLMModelTest(FlaxModelTesterMixin, unittest.TestCase):

        self.assertListEqual(output_string, expected_string)

-    # overwrite from common since `attention_mask` in combination
-    # with `causal_mask` behaves slighly differently
-    @is_pt_flax_cross_test
-    def test_equivalence_pt_to_flax(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                # prepare inputs
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
-
-                # load corresponding PyTorch class
-                pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
-                pt_model_class = getattr(transformers, pt_model_class_name)
-
-                batch_size, seq_length = pt_inputs["input_ids"].shape
-                rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
-                for batch_idx, start_index in enumerate(rnd_start_indices):
-                    pt_inputs["attention_mask"][batch_idx, :start_index] = 0
-                    pt_inputs["attention_mask"][batch_idx, start_index:] = 1
-                    prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
-                    prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
-                pt_model = pt_model_class(config).eval()
-                # Flax models don't use the `use_cache` option and cache is not returned as a default.
-                # So we disable `use_cache` here for PyTorch model.
-                pt_model.config.use_cache = False
-                fx_model = model_class(config, dtype=jnp.float32)
-                fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
-                fx_model.params = fx_state
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    pt_model.save_pretrained(tmpdirname)
-                    fx_model_loaded = model_class.from_pretrained(tmpdirname, from_pt=True)
-
-                fx_outputs_loaded = fx_model_loaded(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(
-                    len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch"
-                )
-                for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs):
-                    self.assert_almost_equals(fx_output_loaded[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-    # overwrite from common since `attention_mask` in combination
-    # with `causal_mask` behaves slighly differently
-    @is_pt_flax_cross_test
-    def test_equivalence_flax_to_pt(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                # prepare inputs
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
-
-                # load corresponding PyTorch class
-                pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
-                pt_model_class = getattr(transformers, pt_model_class_name)
-
-                pt_model = pt_model_class(config).eval()
-                pt_model.config.use_cache = False
-                fx_model = model_class(config, dtype=jnp.float32)
-
-                pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
-                batch_size, seq_length = pt_inputs["input_ids"].shape
-                rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
-                for batch_idx, start_index in enumerate(rnd_start_indices):
-                    pt_inputs["attention_mask"][batch_idx, :start_index] = 0
-                    pt_inputs["attention_mask"][batch_idx, start_index:] = 1
-                    prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
-                    prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
-
-                # make sure weights are tied in PyTorch
-                pt_model.tie_weights()
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    fx_model.save_pretrained(tmpdirname)
-                    pt_model_loaded = pt_model_class.from_pretrained(tmpdirname, from_flax=True)
-
-                with torch.no_grad():
-                    pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
-
-                self.assertEqual(
-                    len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch"
-                )
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs_loaded):
-                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
    @slow
    def test_model_from_pretrained(self):
        for model_class_name in self.all_model_classes: