[cleanup] Hoist ModelTester objects to top level (#4939)

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-06-16 17:33:43 +05:30
parent 0c55a384f8
commit c852036b4a
25 changed files with 4721 additions and 5212 deletions
--- a/tests/test_modeling_t5.py
+++ b/tests/test_modeling_t5.py
@@ -30,6 +30,268 @@ if is_torch_available():
    from transformers.tokenization_t5 import T5Tokenizer


+class T5ModelTester:
+    def __init__(self, parent):
+        self.parent = parent
+        self.batch_size = 13
+        self.encoder_seq_length = 7
+        self.decoder_seq_length = 9
+        self.is_training = True
+        self.use_attention_mask = True
+        self.use_labels = True
+        self.vocab_size = 99
+        self.n_positions = 14
+        self.hidden_size = 32
+        self.num_hidden_layers = 5
+        self.num_attention_heads = 4
+        self.d_ff = 37
+        self.relative_attention_num_buckets = 8
+        self.dropout_rate = 0.1
+        self.initializer_factor = 0.002
+        self.eos_token_id = 1
+        self.pad_token_id = 0
+        self.decoder_start_token_id = 0
+        self.scope = None
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
+        decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
+
+        attention_mask = None
+        decoder_attention_mask = None
+        if self.use_attention_mask:
+            attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
+            decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
+
+        lm_labels = None
+        if self.use_labels:
+            lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
+
+        config = T5Config(
+            vocab_size=self.vocab_size,
+            n_positions=self.n_positions,
+            d_model=self.hidden_size,
+            d_ff=self.d_ff,
+            d_kv=self.hidden_size // self.num_attention_heads,
+            num_layers=self.num_hidden_layers,
+            num_heads=self.num_attention_heads,
+            relative_attention_num_buckets=self.relative_attention_num_buckets,
+            dropout_rate=self.dropout_rate,
+            initializer_factor=self.initializer_factor,
+            eos_token_id=self.eos_token_id,
+            bos_token_id=self.pad_token_id,
+            pad_token_id=self.pad_token_id,
+            decoder_start_token_id=self.decoder_start_token_id,
+        )
+
+        return (
+            config,
+            input_ids,
+            decoder_input_ids,
+            attention_mask,
+            decoder_attention_mask,
+            lm_labels,
+        )
+
+    def check_loss_output(self, result):
+        self.parent.assertListEqual(list(result["loss"].size()), [])
+
+    def check_prepare_lm_labels_via_shift_left(
+        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+    ):
+        model = T5Model(config=config)
+        model.to(torch_device)
+        model.eval()
+
+        # make sure that lm_labels are correctly padded from the right
+        lm_labels.masked_fill_((lm_labels == self.decoder_start_token_id), self.eos_token_id)
+
+        # add casaul pad token mask
+        triangular_mask = torch.tril(lm_labels.new_ones(lm_labels.shape)).logical_not()
+        lm_labels.masked_fill_(triangular_mask, self.pad_token_id)
+        decoder_input_ids = model._shift_right(lm_labels)
+
+        for i, (decoder_input_ids_slice, lm_labels_slice) in enumerate(zip(decoder_input_ids, lm_labels)):
+            # first item
+            self.parent.assertEqual(decoder_input_ids_slice[0].item(), self.decoder_start_token_id)
+            if i < decoder_input_ids_slice.shape[-1]:
+                if i < decoder_input_ids.shape[-1] - 1:
+                    # items before diagonal
+                    self.parent.assertListEqual(
+                        decoder_input_ids_slice[1 : i + 1].tolist(), lm_labels_slice[:i].tolist()
+                    )
+                # pad items after diagonal
+                if i < decoder_input_ids.shape[-1] - 2:
+                    self.parent.assertListEqual(
+                        decoder_input_ids_slice[i + 2 :].tolist(), lm_labels_slice[i + 1 : -1].tolist()
+                    )
+            else:
+                # all items after square
+                self.parent.assertListEqual(decoder_input_ids_slice[1:].tolist(), lm_labels_slice[:-1].tolist())
+
+    def create_and_check_t5_model(
+        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+    ):
+        model = T5Model(config=config)
+        model.to(torch_device)
+        model.eval()
+        decoder_output, decoder_past, encoder_output = model(
+            input_ids=input_ids,
+            decoder_input_ids=decoder_input_ids,
+            attention_mask=attention_mask,
+            decoder_attention_mask=decoder_attention_mask,
+        )
+        decoder_output, decoder_past, encoder_output = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+
+        result = {
+            "encoder_output": encoder_output,
+            "decoder_output": decoder_output,
+            "decoder_past": decoder_past,
+        }
+        self.parent.assertListEqual(
+            list(result["encoder_output"].size()), [self.batch_size, self.encoder_seq_length, self.hidden_size]
+        )
+        self.parent.assertListEqual(
+            list(result["decoder_output"].size()), [self.batch_size, self.decoder_seq_length, self.hidden_size]
+        )
+        self.parent.assertEqual(len(decoder_past), 2)
+        # decoder_past[0] should correspond to encoder output
+        self.parent.assertTrue(torch.all(decoder_past[0][0] == encoder_output))
+        # There should be `num_layers` key value embeddings stored in decoder_past[1]
+        self.parent.assertEqual(len(decoder_past[1]), config.num_layers)
+        # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past[1] tuple
+        self.parent.assertEqual(len(decoder_past[1][0]), 4)
+
+    def create_and_check_t5_with_lm_head(
+        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+    ):
+        model = T5ForConditionalGeneration(config=config)
+        model.to(torch_device)
+        model.eval()
+        outputs = model(
+            input_ids=input_ids,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            labels=lm_labels,
+        )
+        loss, prediction_scores, _, _ = outputs
+        self.parent.assertEqual(len(outputs), 4)
+        result = {
+            "loss": loss,
+            "prediction_scores": prediction_scores,
+        }
+        self.parent.assertListEqual(
+            list(result["prediction_scores"].size()), [self.batch_size, self.decoder_seq_length, self.vocab_size]
+        )
+        self.check_loss_output(result)
+
+    def create_and_check_t5_decoder_model_past(
+        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+    ):
+        model = T5Model(config=config).get_decoder()
+        model.to(torch_device)
+        model.eval()
+
+        # first forward pass
+        output, past_key_value_states = model(input_ids, use_cache=True)
+
+        # create hypothetical next token and extent to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+
+        # append to next input_ids and
+        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+
+        output_from_no_past = model(next_input_ids)[0]
+        output_from_past = model(next_tokens, past_key_value_states=past_key_value_states)[0]
+
+        # select random slice
+        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
+        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
+
+        # test that outputs are equal for slice
+        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
+    def create_and_check_t5_decoder_model_attention_mask_past(
+        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+    ):
+        model = T5Model(config=config).get_decoder()
+        model.to(torch_device)
+        model.eval()
+
+        # create attention mask
+        attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
+
+        half_seq_length = input_ids.shape[-1] // 2
+        attn_mask[:, half_seq_length:] = 0
+
+        # first forward pass
+        output, past_key_value_states = model(input_ids, attention_mask=attn_mask, use_cache=True)
+
+        # create hypothetical next token and extent to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+
+        # change a random masked slice from input_ids
+        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
+        random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
+        input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
+
+        # append to next input_ids and attn_mask
+        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+        attn_mask = torch.cat(
+            [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], dim=1,
+        )
+
+        # get two different outputs
+        output_from_no_past = model(next_input_ids, attention_mask=attn_mask)[0]
+        output_from_past = model(next_tokens, past_key_value_states=past_key_value_states, attention_mask=attn_mask)[0]
+
+        # select random slice
+        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
+        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
+
+        # test that outputs are equal for slice
+        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
+    def create_t5_and_check_t5_generate_with_past_key_value_states(
+        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+    ):
+        model = T5ForConditionalGeneration(config=config)
+        model.to(torch_device)
+        model.eval()
+        torch.manual_seed(0)
+        output_without_past_cache = model.generate(
+            input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False
+        )
+        torch.manual_seed(0)
+        output_with_past_cache = model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True)
+        self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache))
+
+    def create_and_check_t5_model_fp16_forward(
+        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+    ):
+        model = T5Model(config=config)
+        model.to(torch_device)
+        model.half()
+        model.eval()
+        output = model(input_ids, decoder_input_ids=input_ids, attention_mask=attention_mask)[0]
+        self.parent.assertFalse(torch.isnan(output).any().item())
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,) = config_and_inputs
+
+        inputs_dict = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "decoder_input_ids": decoder_input_ids,
+            "decoder_attention_mask": decoder_attention_mask,
+            "use_cache": False,
+        }
+        return config, inputs_dict
+
+
@require_torch
 class T5ModelTest(ModelTesterMixin, unittest.TestCase):

@@ -40,302 +302,8 @@ class T5ModelTest(ModelTesterMixin, unittest.TestCase):
    test_resize_embeddings = False
    is_encoder_decoder = True

-    class T5ModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            encoder_seq_length=7,
-            decoder_seq_length=9,
-            is_training=True,
-            use_attention_mask=True,
-            use_labels=True,
-            vocab_size=99,
-            n_positions=14,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            d_ff=37,
-            relative_attention_num_buckets=8,
-            dropout_rate=0.1,
-            initializer_factor=0.002,
-            eos_token_id=1,
-            pad_token_id=0,
-            decoder_start_token_id=0,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.encoder_seq_length = encoder_seq_length
-            self.decoder_seq_length = decoder_seq_length
-            self.is_training = is_training
-            self.use_attention_mask = use_attention_mask
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.n_positions = n_positions
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.d_ff = d_ff
-            self.relative_attention_num_buckets = relative_attention_num_buckets
-            self.dropout_rate = dropout_rate
-            self.initializer_factor = initializer_factor
-            self.scope = scope
-            self.eos_token_id = eos_token_id
-            self.pad_token_id = pad_token_id
-            self.decoder_start_token_id = decoder_start_token_id
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
-            decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
-
-            attention_mask = None
-            decoder_attention_mask = None
-            if self.use_attention_mask:
-                attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
-                decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
-
-            lm_labels = None
-            if self.use_labels:
-                lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
-
-            config = T5Config(
-                vocab_size=self.vocab_size,
-                n_positions=self.n_positions,
-                d_model=self.hidden_size,
-                d_ff=self.d_ff,
-                d_kv=self.hidden_size // self.num_attention_heads,
-                num_layers=self.num_hidden_layers,
-                num_heads=self.num_attention_heads,
-                relative_attention_num_buckets=self.relative_attention_num_buckets,
-                dropout_rate=self.dropout_rate,
-                initializer_factor=self.initializer_factor,
-                eos_token_id=self.eos_token_id,
-                bos_token_id=self.pad_token_id,
-                pad_token_id=self.pad_token_id,
-                decoder_start_token_id=self.decoder_start_token_id,
-            )
-
-            return (
-                config,
-                input_ids,
-                decoder_input_ids,
-                attention_mask,
-                decoder_attention_mask,
-                lm_labels,
-            )
-
-        def check_loss_output(self, result):
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-
-        def check_prepare_lm_labels_via_shift_left(
-            self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
-        ):
-            model = T5Model(config=config)
-            model.to(torch_device)
-            model.eval()
-
-            # make sure that lm_labels are correctly padded from the right
-            lm_labels.masked_fill_((lm_labels == self.decoder_start_token_id), self.eos_token_id)
-
-            # add casaul pad token mask
-            triangular_mask = torch.tril(lm_labels.new_ones(lm_labels.shape)).logical_not()
-            lm_labels.masked_fill_(triangular_mask, self.pad_token_id)
-            decoder_input_ids = model._shift_right(lm_labels)
-
-            for i, (decoder_input_ids_slice, lm_labels_slice) in enumerate(zip(decoder_input_ids, lm_labels)):
-                # first item
-                self.parent.assertEqual(decoder_input_ids_slice[0].item(), self.decoder_start_token_id)
-                if i < decoder_input_ids_slice.shape[-1]:
-                    if i < decoder_input_ids.shape[-1] - 1:
-                        # items before diagonal
-                        self.parent.assertListEqual(
-                            decoder_input_ids_slice[1 : i + 1].tolist(), lm_labels_slice[:i].tolist()
-                        )
-                    # pad items after diagonal
-                    if i < decoder_input_ids.shape[-1] - 2:
-                        self.parent.assertListEqual(
-                            decoder_input_ids_slice[i + 2 :].tolist(), lm_labels_slice[i + 1 : -1].tolist()
-                        )
-                else:
-                    # all items after square
-                    self.parent.assertListEqual(decoder_input_ids_slice[1:].tolist(), lm_labels_slice[:-1].tolist())
-
-        def create_and_check_t5_model(
-            self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
-        ):
-            model = T5Model(config=config)
-            model.to(torch_device)
-            model.eval()
-            decoder_output, decoder_past, encoder_output = model(
-                input_ids=input_ids,
-                decoder_input_ids=decoder_input_ids,
-                attention_mask=attention_mask,
-                decoder_attention_mask=decoder_attention_mask,
-            )
-            decoder_output, decoder_past, encoder_output = model(
-                input_ids=input_ids, decoder_input_ids=decoder_input_ids
-            )
-
-            result = {
-                "encoder_output": encoder_output,
-                "decoder_output": decoder_output,
-                "decoder_past": decoder_past,
-            }
-            self.parent.assertListEqual(
-                list(result["encoder_output"].size()), [self.batch_size, self.encoder_seq_length, self.hidden_size]
-            )
-            self.parent.assertListEqual(
-                list(result["decoder_output"].size()), [self.batch_size, self.decoder_seq_length, self.hidden_size]
-            )
-            self.parent.assertEqual(len(decoder_past), 2)
-            # decoder_past[0] should correspond to encoder output
-            self.parent.assertTrue(torch.all(decoder_past[0][0] == encoder_output))
-            # There should be `num_layers` key value embeddings stored in decoder_past[1]
-            self.parent.assertEqual(len(decoder_past[1]), config.num_layers)
-            # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past[1] tuple
-            self.parent.assertEqual(len(decoder_past[1][0]), 4)
-
-        def create_and_check_t5_with_lm_head(
-            self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
-        ):
-            model = T5ForConditionalGeneration(config=config)
-            model.to(torch_device)
-            model.eval()
-            outputs = model(
-                input_ids=input_ids,
-                decoder_input_ids=decoder_input_ids,
-                decoder_attention_mask=decoder_attention_mask,
-                labels=lm_labels,
-            )
-            loss, prediction_scores, _, _ = outputs
-            self.parent.assertEqual(len(outputs), 4)
-            result = {
-                "loss": loss,
-                "prediction_scores": prediction_scores,
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].size()), [self.batch_size, self.decoder_seq_length, self.vocab_size]
-            )
-            self.check_loss_output(result)
-
-        def create_and_check_t5_decoder_model_past(
-            self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
-        ):
-            model = T5Model(config=config).get_decoder()
-            model.to(torch_device)
-            model.eval()
-
-            # first forward pass
-            output, past_key_value_states = model(input_ids, use_cache=True)
-
-            # create hypothetical next token and extent to next_input_ids
-            next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-            # append to next input_ids and
-            next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
-
-            output_from_no_past = model(next_input_ids)[0]
-            output_from_past = model(next_tokens, past_key_value_states=past_key_value_states)[0]
-
-            # select random slice
-            random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
-            output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
-            output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
-
-            # test that outputs are equal for slice
-            self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
-
-        def create_and_check_t5_decoder_model_attention_mask_past(
-            self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
-        ):
-            model = T5Model(config=config).get_decoder()
-            model.to(torch_device)
-            model.eval()
-
-            # create attention mask
-            attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
-
-            half_seq_length = input_ids.shape[-1] // 2
-            attn_mask[:, half_seq_length:] = 0
-
-            # first forward pass
-            output, past_key_value_states = model(input_ids, attention_mask=attn_mask, use_cache=True)
-
-            # create hypothetical next token and extent to next_input_ids
-            next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-            # change a random masked slice from input_ids
-            random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
-            random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
-            input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
-
-            # append to next input_ids and attn_mask
-            next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
-            attn_mask = torch.cat(
-                [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], dim=1,
-            )
-
-            # get two different outputs
-            output_from_no_past = model(next_input_ids, attention_mask=attn_mask)[0]
-            output_from_past = model(
-                next_tokens, past_key_value_states=past_key_value_states, attention_mask=attn_mask
-            )[0]
-
-            # select random slice
-            random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
-            output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
-            output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
-
-            # test that outputs are equal for slice
-            self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
-
-        def create_t5_and_check_t5_generate_with_past_key_value_states(
-            self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
-        ):
-            model = T5ForConditionalGeneration(config=config)
-            model.to(torch_device)
-            model.eval()
-            torch.manual_seed(0)
-            output_without_past_cache = model.generate(
-                input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False
-            )
-            torch.manual_seed(0)
-            output_with_past_cache = model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True)
-            self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache))
-
-        def create_and_check_t5_model_fp16_forward(
-            self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
-        ):
-            model = T5Model(config=config)
-            model.to(torch_device)
-            model.half()
-            model.eval()
-            output = model(input_ids, decoder_input_ids=input_ids, attention_mask=attention_mask)[0]
-            self.parent.assertFalse(torch.isnan(output).any().item())
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids,
-                decoder_input_ids,
-                attention_mask,
-                decoder_attention_mask,
-                lm_labels,
-            ) = config_and_inputs
-
-            inputs_dict = {
-                "input_ids": input_ids,
-                "attention_mask": attention_mask,
-                "decoder_input_ids": decoder_input_ids,
-                "decoder_attention_mask": decoder_attention_mask,
-                "use_cache": False,
-            }
-            return config, inputs_dict
-
    def setUp(self):
-        self.model_tester = T5ModelTest.T5ModelTester(self)
+        self.model_tester = T5ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)

    def test_config(self):