Deprecate TF + JAX (#38758)

* Scatter deprecation warnings around
* Delete the tests
* Make logging work properly!
2025-06-11 17:28:06 +01:00
parent 337757cbd5
commit 9f563ada70
121 changed files with 27 additions and 50030 deletions
--- a/tests/models/roberta/test_modeling_flax_roberta.py
+++ b/tests/models/roberta/test_modeling_flax_roberta.py
@@ -1,159 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-
-from transformers import RobertaConfig, is_flax_available
-from transformers.testing_utils import require_flax, slow
-
-from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-
-
-if is_flax_available():
-    from transformers.models.roberta.modeling_flax_roberta import (
-        FlaxRobertaForCausalLM,
-        FlaxRobertaForMaskedLM,
-        FlaxRobertaForMultipleChoice,
-        FlaxRobertaForQuestionAnswering,
-        FlaxRobertaForSequenceClassification,
-        FlaxRobertaForTokenClassification,
-        FlaxRobertaModel,
-    )
-
-
-class FlaxRobertaModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_attention_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_choices=4,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_attention_mask = use_attention_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_choices = num_choices
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        attention_mask = None
-        if self.use_attention_mask:
-            attention_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        config = RobertaConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, input_ids, token_type_ids, attention_mask
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, token_type_ids, attention_mask = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask}
-        return config, inputs_dict
-
-    def prepare_config_and_inputs_for_decoder(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, input_ids, token_type_ids, attention_mask = config_and_inputs
-
-        config.is_decoder = True
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-
-@require_flax
-class FlaxRobertaModelTest(FlaxModelTesterMixin, unittest.TestCase):
-    test_head_masking = True
-
-    all_model_classes = (
-        (
-            FlaxRobertaModel,
-            FlaxRobertaForCausalLM,
-            FlaxRobertaForMaskedLM,
-            FlaxRobertaForSequenceClassification,
-            FlaxRobertaForTokenClassification,
-            FlaxRobertaForMultipleChoice,
-            FlaxRobertaForQuestionAnswering,
-        )
-        if is_flax_available()
-        else ()
-    )
-
-    def setUp(self):
-        self.model_tester = FlaxRobertaModelTester(self)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("FacebookAI/roberta-base", from_pt=True)
-            outputs = model(np.ones((1, 1)))
-            self.assertIsNotNone(outputs)
--- a/tests/models/roberta/test_modeling_tf_roberta.py
+++ b/tests/models/roberta/test_modeling_tf_roberta.py
@@ -1,700 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import RobertaConfig, is_tf_available
-from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import numpy
-    import tensorflow as tf
-
-    from transformers.models.roberta.modeling_tf_roberta import (
-        TFRobertaForCausalLM,
-        TFRobertaForMaskedLM,
-        TFRobertaForMultipleChoice,
-        TFRobertaForQuestionAnswering,
-        TFRobertaForSequenceClassification,
-        TFRobertaForTokenClassification,
-        TFRobertaModel,
-    )
-
-
-class TFRobertaModelTester:
-    def __init__(
-        self,
-        parent,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_input_mask = True
-        self.use_token_type_ids = True
-        self.use_labels = True
-        self.vocab_size = 99
-        self.hidden_size = 32
-        self.num_hidden_layers = 2
-        self.num_attention_heads = 4
-        self.intermediate_size = 37
-        self.hidden_act = "gelu"
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.scope = None
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = RobertaConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-        )
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def prepare_config_and_inputs_for_decoder(self):
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = self.prepare_config_and_inputs()
-
-        config.is_decoder = True
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def create_and_check_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFRobertaModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_causal_lm_base_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.is_decoder = True
-
-        model = TFRobertaModel(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_model_as_decoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TFRobertaModel(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-            "encoder_hidden_states": encoder_hidden_states,
-            "encoder_attention_mask": encoder_attention_mask,
-        }
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
-
-        # Also check the case where encoder outputs are not passed
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_causal_lm_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.is_decoder = True
-
-        model = TFRobertaForCausalLM(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        prediction_scores = model(inputs)["logits"]
-        self.parent.assertListEqual(
-            list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size]
-        )
-
-    def create_and_check_causal_lm_model_as_decoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TFRobertaForCausalLM(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-            "encoder_hidden_states": encoder_hidden_states,
-            "encoder_attention_mask": encoder_attention_mask,
-        }
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
-
-        prediction_scores = result["logits"]
-        self.parent.assertListEqual(
-            list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size]
-        )
-
-    def create_and_check_causal_lm_model_past(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TFRobertaForCausalLM(config=config)
-
-        # special to `RobertaEmbeddings` in `Roberta`:
-        #   - its `padding_idx` and its effect on `position_ids`
-        #     (TFRobertaEmbeddings.create_position_ids_from_input_ids)
-        #   - `1` here is `TFRobertaEmbeddings.padding_idx`
-        input_ids = tf.where(input_ids == 1, 2, input_ids)
-
-        # first forward pass
-        outputs = model(input_ids, use_cache=True)
-        outputs_use_cache_conf = model(input_ids)
-        outputs_no_past = model(input_ids, use_cache=False)
-
-        self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
-        self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
-
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        # append to next input_ids and attn_mask
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-
-        output_from_no_past = model(next_input_ids, output_hidden_states=True).hidden_states[0]
-        output_from_past = model(
-            next_tokens, past_key_values=past_key_values, output_hidden_states=True
-        ).hidden_states[0]
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
-
-    def create_and_check_causal_lm_model_past_with_attn_mask(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TFRobertaForCausalLM(config=config)
-
-        # special to `RobertaEmbeddings` in `Roberta`:
-        #   - its `padding_idx` and its effect on `position_ids`
-        #     (TFRobertaEmbeddings.create_position_ids_from_input_ids)
-        #   - `1` here is `TFRobertaEmbeddings.padding_idx`
-        # avoid `padding_idx` in the past
-        input_ids = tf.where(input_ids == 1, 2, input_ids)
-
-        # create attention mask
-        half_seq_length = self.seq_length // 2
-        attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
-        attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
-        attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attn_mask, use_cache=True)
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        past_key_values = outputs.past_key_values
-
-        # change a random masked slice from input_ids
-        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
-        random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
-        vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
-        condition = tf.transpose(
-            tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
-        )
-        input_ids = tf.where(condition, random_other_next_tokens, input_ids)
-        # avoid `padding_idx` in the past
-        input_ids = tf.where(input_ids == 1, 2, input_ids)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        attn_mask = tf.concat(
-            [attn_mask, tf.ones((attn_mask.shape[0], 1), dtype=tf.int32)],
-            axis=1,
-        )
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=attn_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens, past_key_values=past_key_values, attention_mask=attn_mask, output_hidden_states=True
-        ).hidden_states[0]
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
-
-    def create_and_check_causal_lm_model_past_large_inputs(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TFRobertaForCausalLM(config=config)
-
-        # special to `RobertaEmbeddings` in `Roberta`:
-        #   - its `padding_idx` and its effect on `position_ids`
-        #     (TFRobertaEmbeddings.create_position_ids_from_input_ids)
-        #   - `1` here is `TFRobertaEmbeddings.padding_idx`
-        # avoid `padding_idx` in the past
-        input_ids = tf.where(input_ids == 1, 2, input_ids)
-
-        input_ids = input_ids[:1, :]
-        input_mask = input_mask[:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=input_mask, use_cache=True)
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=next_attention_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens,
-            attention_mask=next_attention_mask,
-            past_key_values=past_key_values,
-            output_hidden_states=True,
-        ).hidden_states[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-    def create_and_check_decoder_model_past_large_inputs(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TFRobertaForCausalLM(config=config)
-
-        # special to `RobertaEmbeddings` in `Roberta`:
-        #   - its `padding_idx` and its effect on `position_ids`
-        #     (TFRobertaEmbeddings.create_position_ids_from_input_ids)
-        #   - `1` here is `TFRobertaEmbeddings.padding_idx`
-        # avoid `padding_idx` in the past
-        input_ids = tf.where(input_ids == 1, 2, input_ids)
-
-        input_ids = input_ids[:1, :]
-        input_mask = input_mask[:1, :]
-        encoder_hidden_states = encoder_hidden_states[:1, :, :]
-        encoder_attention_mask = encoder_attention_mask[:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(
-            input_ids,
-            attention_mask=input_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            use_cache=True,
-        )
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            past_key_values=past_key_values,
-            output_hidden_states=True,
-        ).hidden_states[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-    def create_and_check_for_masked_lm(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFRobertaForMaskedLM(config=config)
-        result = model([input_ids, input_mask, token_type_ids])
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TFRobertaForTokenClassification(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TFRobertaForQuestionAnswering(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def create_and_check_for_multiple_choice(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = TFRobertaForMultipleChoice(config=config)
-        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-        inputs = {
-            "input_ids": multiple_choice_inputs_ids,
-            "attention_mask": multiple_choice_input_mask,
-            "token_type_ids": multiple_choice_token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TFRobertaModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            TFRobertaModel,
-            TFRobertaForCausalLM,
-            TFRobertaForMaskedLM,
-            TFRobertaForSequenceClassification,
-            TFRobertaForTokenClassification,
-            TFRobertaForQuestionAnswering,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFRobertaModel,
-            "fill-mask": TFRobertaForMaskedLM,
-            "question-answering": TFRobertaForQuestionAnswering,
-            "text-classification": TFRobertaForSequenceClassification,
-            "text-generation": TFRobertaForCausalLM,
-            "token-classification": TFRobertaForTokenClassification,
-            "zero-shot": TFRobertaForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFRobertaModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        """Test the base model"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_causal_lm_base_model(self):
-        """Test the base model of the causal LM model
-
-        is_decoder=True, no cross_attention, no encoder outputs
-        """
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_base_model(*config_and_inputs)
-
-    def test_model_as_decoder(self):
-        """Test the base model as a decoder (of an encoder-decoder architecture)
-
-        is_decoder=True + cross_attention + pass encoder outputs
-        """
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_causal_lm(self):
-        """Test the causal LM model"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model(*config_and_inputs)
-
-    def test_causal_lm_model_as_decoder(self):
-        """Test the causal LM model as a decoder"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_causal_lm_model_as_decoder(*config_and_inputs)
-
-    def test_causal_lm_model_past(self):
-        """Test causal LM model with `past_key_values`"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model_past(*config_and_inputs)
-
-    def test_causal_lm_model_past_with_attn_mask(self):
-        """Test the causal LM model with `past_key_values` and `attention_mask`"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model_past_with_attn_mask(*config_and_inputs)
-
-    def test_causal_lm_model_past_with_large_inputs(self):
-        """Test the causal LM model with `past_key_values` and a longer decoder sequence length"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model_past_large_inputs(*config_and_inputs)
-
-    def test_decoder_model_past_with_large_inputs(self):
-        """Similar to `test_causal_lm_model_past_with_large_inputs` but with cross-attention"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "FacebookAI/roberta-base"
-        model = TFRobertaModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-@require_tf
-@require_sentencepiece
-@require_tokenizers
-class TFRobertaModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_masked_lm(self):
-        model = TFRobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base")
-
-        input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        output = model(input_ids)[0]
-        expected_shape = [1, 11, 50265]
-        self.assertEqual(list(output.numpy().shape), expected_shape)
-        # compare the actual values for a slice.
-        expected_slice = tf.constant(
-            [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]]
-        )
-        self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4))
-
-    @slow
-    def test_inference_no_head(self):
-        model = TFRobertaModel.from_pretrained("FacebookAI/roberta-base")
-
-        input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        output = model(input_ids)[0]
-        # compare the actual values for a slice.
-        expected_slice = tf.constant(
-            [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]]
-        )
-        self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4))
-
-    @slow
-    def test_inference_classification_head(self):
-        model = TFRobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-large-mnli")
-
-        input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        output = model(input_ids)[0]
-        expected_shape = [1, 3]
-        self.assertEqual(list(output.numpy().shape), expected_shape)
-        expected_tensor = tf.constant([[-0.9469, 0.3913, 0.5118]])
-        self.assertTrue(numpy.allclose(output.numpy(), expected_tensor.numpy(), atol=1e-4))