Remove n_ctx from configs (#14165)

* Remove n_ctx from configs * Fix GPTJ and OpenAIGPT, both are acceptable breaking changes as there are no configs such that it breaks * Remove unecessary n_positions from TFOpenAIGPT
2021-10-29 11:50:25 +02:00
parent be236361f1
commit 5b45422b58
22 changed files with 30 additions and 56 deletions
--- a/examples/research_projects/distillation/training_configs/distilgpt2.json
+++ b/examples/research_projects/distillation/training_configs/distilgpt2.json
@@ -1,7 +1,6 @@
 {
 	"initializer_range": 0.02,
 	"layer_norm_epsilon": 0.00001,
-	"n_ctx": 1024,
 	"n_embd": 768,
 	"n_head": 12,
 	"n_layer": 6,
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@@ -653,7 +653,7 @@ class CLIPTextTransformer(nn.Module):
        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.final_layer_norm(last_hidden_state)

-        # text_embeds.shape = [batch_size, n_ctx, transformer.width]
+        # text_embeds.shape = [batch_size, sequence_length, transformer.width]
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)]

--- a/src/transformers/models/clip/modeling_flax_clip.py
+++ b/src/transformers/models/clip/modeling_flax_clip.py
@@ -521,7 +521,7 @@ class FlaxCLIPTextTransformer(nn.Module):
        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.final_layer_norm(last_hidden_state)

-        # text_embeds.shape = [batch_size, n_ctx, transformer.width]
+        # text_embeds.shape = [batch_size, sequence_length, transformer.width]
        # take features from the EOS embedding (eos_token_id is the highest number in each sequence)
        pooled_output = last_hidden_state[jnp.arange(last_hidden_state.shape[0]), input_ids.argmax(axis=-1)]

--- a/src/transformers/models/ctrl/configuration_ctrl.py
+++ b/src/transformers/models/ctrl/configuration_ctrl.py
@@ -41,8 +41,6 @@ class CTRLConfig(PretrainedConfig):
        n_positions (:obj:`int`, `optional`, defaults to 256):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
-        n_ctx (:obj:`int`, `optional`, defaults to 256):
-            Dimensionality of the causal mask (usually same as n_positions).
        n_embd (:obj:`int`, `optional`, defaults to 1280):
            Dimensionality of the embeddings and hidden states.
        dff (:obj:`int`, `optional`, defaults to 8192):
@@ -92,7 +90,6 @@ class CTRLConfig(PretrainedConfig):
        self,
        vocab_size=246534,
        n_positions=256,
-        n_ctx=256,
        n_embd=1280,
        dff=8192,
        n_layer=48,
@@ -111,7 +108,6 @@ class CTRLConfig(PretrainedConfig):
        **kwargs
    ):
        self.vocab_size = vocab_size
-        self.n_ctx = n_ctx
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
--- a/src/transformers/models/gpt2/configuration_gpt2.py
+++ b/src/transformers/models/gpt2/configuration_gpt2.py
@@ -54,8 +54,6 @@ class GPT2Config(PretrainedConfig):
        n_positions (:obj:`int`, `optional`, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
-        n_ctx (:obj:`int`, `optional`, defaults to 1024):
-            Dimensionality of the causal mask (usually same as n_positions).
        n_embd (:obj:`int`, `optional`, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        n_layer (:obj:`int`, `optional`, defaults to 12):
@@ -144,7 +142,6 @@ class GPT2Config(PretrainedConfig):
        self,
        vocab_size=50257,
        n_positions=1024,
-        n_ctx=1024,
        n_embd=768,
        n_layer=12,
        n_head=12,
@@ -169,7 +166,6 @@ class GPT2Config(PretrainedConfig):
        **kwargs,
    ):
        self.vocab_size = vocab_size
-        self.n_ctx = n_ctx
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
--- a/src/transformers/models/gpt2/modeling_tf_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py
@@ -66,13 +66,12 @@ TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [


 class TFAttention(tf.keras.layers.Layer):
-    def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
+    def __init__(self, nx, config, scale=False, **kwargs):
        super().__init__(**kwargs)

        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
        # [switch nx => n_state from Block to Attention to keep identical to TF implementation]
        assert n_state % config.n_head == 0
-        self.n_ctx = n_ctx
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale
@@ -185,12 +184,12 @@ class TFMLP(tf.keras.layers.Layer):


 class TFBlock(tf.keras.layers.Layer):
-    def __init__(self, n_ctx, config, scale=False, **kwargs):
+    def __init__(self, config, scale=False, **kwargs):
        super().__init__(**kwargs)
        nx = config.n_embd
        inner_dim = config.n_inner if config.n_inner is not None else 4 * nx
        self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
-        self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
+        self.attn = TFAttention(nx, config, scale, name="attn")
        self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
        self.mlp = TFMLP(inner_dim, config, name="mlp")

@@ -233,7 +232,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
            config.vocab_size, config.hidden_size, initializer_range=config.initializer_range, name="wte"
        )
        self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
-        self.h = [TFBlock(config.n_ctx, config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]
+        self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]
        self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f")

    def build(self, input_shape):
--- a/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py
+++ b/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py
@@ -33,7 +33,7 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_du
        num_layers=config_json["n_layer"],
        num_heads=config_json["n_head"],
        attention_types=config_json["attention_types"],
-        max_position_embeddings=config_json["n_ctx"],
+        max_position_embeddings=config_json["n_positions"],
        resid_dropout=config_json["res_dropout"],
        embed_dropout=config_json["embed_dropout"],
        attention_dropout=config_json["attn_dropout"],
--- a/src/transformers/models/gptj/configuration_gptj.py
+++ b/src/transformers/models/gptj/configuration_gptj.py
@@ -42,8 +42,6 @@ class GPTJConfig(PretrainedConfig):
        n_positions (:obj:`int`, `optional`, defaults to 2048):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
-        n_ctx (:obj:`int`, `optional`, defaults to 2048):
-            Dimensionality of the causal mask (usually same as n_positions).
        n_embd (:obj:`int`, `optional`, defaults to 4096):
            Dimensionality of the embeddings and hidden states.
        n_layer (:obj:`int`, `optional`, defaults to 28):
@@ -96,7 +94,6 @@ class GPTJConfig(PretrainedConfig):
        self,
        vocab_size=50400,
        n_positions=2048,
-        n_ctx=2048,
        n_embd=4096,
        n_layer=28,
        n_head=16,
@@ -115,7 +112,6 @@ class GPTJConfig(PretrainedConfig):
        **kwargs
    ):
        self.vocab_size = vocab_size
-        self.n_ctx = n_ctx
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
--- a/src/transformers/models/gptj/modeling_gptj.py
+++ b/src/transformers/models/gptj/modeling_gptj.py
@@ -99,7 +99,7 @@ class GPTJAttention(nn.Module):

    def _split_heads(self, tensor, num_attention_heads, attn_head_size, rotary):
        """
-        Splits n_ctx dim into attn_head_size and num_attention_heads
+        Splits hidden dim into attn_head_size and num_attention_heads
        """
        new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size)
        tensor = tensor.view(*new_shape)
@@ -114,7 +114,7 @@ class GPTJAttention(nn.Module):

    def _merge_heads(self, tensor, num_attention_heads, attn_head_size):
        """
-        Merges attn_head_size dim and num_attn_heads dim into n_ctx
+        Merges attn_head_size dim and num_attn_heads dim into hidden dim
        """
        if len(tensor.shape) == 5:
            tensor = tensor.permute(0, 1, 3, 2, 4).contiguous()
@@ -377,7 +377,7 @@ GPTJ_INPUTS_DOCSTRING = r"""
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, n_ctx)`, `optional`):
+        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_dim)`, `optional`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
@@ -444,7 +444,6 @@ class GPTJModel(GPTJPreTrainedModel):
        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([GPTJBlock(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
-        self.rotary_dim = min(config.rotary_dim, config.n_ctx // config.num_attention_heads)
        self.init_weights()

        # Model parallel
@@ -854,7 +853,7 @@ class GPTJForSequenceClassification(GPTJPreTrainedModel):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = GPTJModel(config)
-        self.score = nn.Linear(config.n_ctx, self.num_labels, bias=False)
+        self.score = nn.Linear(config.n_positions, self.num_labels, bias=False)

        self.init_weights()

--- a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py
+++ b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py
@@ -88,7 +88,6 @@ def convert_megatron_checkpoint(args, input_state_dict, config):

        config.vocab_size = ds_args.padded_vocab_size
        config.n_positions = ds_args.max_position_embeddings
-        config.n_ctx = ds_args.seq_length
        config.n_embd = ds_args.hidden_size
        config.n_layer = ds_args.num_layers
        config.n_head = ds_args.num_attention_heads
@@ -121,10 +120,10 @@ def convert_megatron_checkpoint(args, input_state_dict, config):
    # The position embeddings.
    pos_embeddings = embeddings["position_embeddings"]["weight"]
    # Read the causal mask dimension (seqlen). [max_sequence_length, hidden_size]
-    n_ctx = pos_embeddings.size(0)
+    n_positions = pos_embeddings.size(0)
    assert (
-        n_ctx == config.n_ctx
-    ), f"pos_embeddings.max_sequence_length={n_ctx} and config.n_ctx={config.n_ctx} don't match"
+        n_positions == config.n_positions
+    ), f"pos_embeddings.max_sequence_length={n_positions} and config.n_positions={config.n_positions} don't match"
    # Store the position embeddings.
    output_state_dict["transformer.wpe.weight"] = pos_embeddings

@@ -173,7 +172,9 @@ def convert_megatron_checkpoint(args, input_state_dict, config):
        ) and weight_or_bias == "weight":

            # Insert a tensor of 1x1xDxD bias.
-            causal_mask = torch.tril(torch.ones((n_ctx, n_ctx), dtype=torch.float16)).view(1, 1, n_ctx, n_ctx)
+            causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.float16)).view(
+                1, 1, n_positions, n_positions
+            )
            output_state_dict[layer_name + ".attn.bias"] = causal_mask

            # Insert a "dummy" tensor for masked_bias.
@@ -274,7 +275,6 @@ def main():
        config = GPT2Config(
            vocab_size=50257,
            n_positions=1024,
-            n_ctx=1024,
            n_embd=1024,
            n_layer=24,
            n_head=16,
--- a/src/transformers/models/openai/configuration_openai.py
+++ b/src/transformers/models/openai/configuration_openai.py
@@ -42,8 +42,6 @@ class OpenAIGPTConfig(PretrainedConfig):
        n_positions (:obj:`int`, `optional`, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
-        n_ctx (:obj:`int`, `optional`, defaults to 512):
-            Dimensionality of the causal mask (usually same as n_positions).
        n_embd (:obj:`int`, `optional`, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        n_layer (:obj:`int`, `optional`, defaults to 12):
@@ -126,7 +124,6 @@ class OpenAIGPTConfig(PretrainedConfig):
        self,
        vocab_size=40478,
        n_positions=512,
-        n_ctx=512,
        n_embd=768,
        n_layer=12,
        n_head=12,
@@ -145,7 +142,6 @@ class OpenAIGPTConfig(PretrainedConfig):
        **kwargs
    ):
        self.vocab_size = vocab_size
-        self.n_ctx = n_ctx
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
--- a/src/transformers/models/openai/modeling_openai.py
+++ b/src/transformers/models/openai/modeling_openai.py
@@ -143,12 +143,14 @@ ACT_FNS = {"relu": nn.ReLU, "silu": silu, "gelu": gelu_new, "swish": silu}


 class Attention(nn.Module):
-    def __init__(self, nx, n_ctx, config, scale=False):
+    def __init__(self, nx, n_positions, config, scale=False):
        super().__init__()
        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
        # [switch nx => n_state from Block to Attention to keep identical to TF implementation]
        assert n_state % config.n_head == 0
-        self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
+        self.register_buffer(
+            "bias", torch.tril(torch.ones(n_positions, n_positions)).view(1, 1, n_positions, n_positions)
+        )
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale
@@ -246,10 +248,10 @@ class MLP(nn.Module):


 class Block(nn.Module):
-    def __init__(self, n_ctx, config, scale=False):
+    def __init__(self, n_positions, config, scale=False):
        super().__init__()
        nx = config.n_embd
-        self.attn = Attention(nx, n_ctx, config, scale)
+        self.attn = Attention(nx, n_positions, config, scale)
        self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
        self.mlp = MLP(4 * nx, config)
        self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
@@ -413,7 +415,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
        self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd)
        self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
        self.drop = nn.Dropout(config.embd_pdrop)
-        self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
+        self.h = nn.ModuleList([Block(config.n_positions, config, scale=True) for _ in range(config.n_layer)])

        self.register_buffer("position_ids", torch.arange(config.n_positions))
        self.init_weights()
--- a/src/transformers/models/openai/modeling_tf_openai.py
+++ b/src/transformers/models/openai/modeling_tf_openai.py
@@ -58,7 +58,7 @@ TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [


 class TFAttention(tf.keras.layers.Layer):
-    def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
+    def __init__(self, nx, config, scale=False, **kwargs):
        super().__init__(**kwargs)

        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
@@ -66,7 +66,6 @@ class TFAttention(tf.keras.layers.Layer):
        assert (
            n_state % config.n_head == 0
        ), f"Hidden dimension {n_state} not dividable by number of heads {config.n_head}"
-        self.n_ctx = n_ctx
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale
@@ -169,10 +168,10 @@ class TFMLP(tf.keras.layers.Layer):


 class TFBlock(tf.keras.layers.Layer):
-    def __init__(self, n_ctx, config, scale=False, **kwargs):
+    def __init__(self, config, scale=False, **kwargs):
        super().__init__(**kwargs)
        nx = config.n_embd
-        self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
+        self.attn = TFAttention(nx, config, scale, name="attn")
        self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
        self.mlp = TFMLP(4 * nx, config, name="mlp")
        self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
@@ -210,7 +209,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
            config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="tokens_embed"
        )
        self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
-        self.h = [TFBlock(config.n_ctx, config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]
+        self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]

    def build(self, input_shape):
        with tf.name_scope("positions_embed"):
--- a/tests/test_modeling_ctrl.py
+++ b/tests/test_modeling_ctrl.py
@@ -114,7 +114,6 @@ class CTRLModelTester:
            # hidden_dropout_prob=self.hidden_dropout_prob,
            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            n_positions=self.max_position_embeddings,
-            n_ctx=self.max_position_embeddings,
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range,
            pad_token_id=self.pad_token_id,
--- a/tests/test_modeling_flax_gpt2.py
+++ b/tests/test_modeling_flax_gpt2.py
@@ -95,7 +95,6 @@ class FlaxGPT2ModelTester:
            n_layer=self.num_hidden_layers,
            n_head=self.num_attention_heads,
            n_positions=self.max_position_embeddings,
-            n_ctx=self.max_position_embeddings,
            use_cache=False,
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
--- a/tests/test_modeling_gpt2.py
+++ b/tests/test_modeling_gpt2.py
@@ -155,7 +155,6 @@ class GPT2ModelTester:
            resid_pdrop=self.hidden_dropout_prob,
            attn_pdrop=self.attention_probs_dropout_prob,
            n_positions=self.max_position_embeddings,
-            n_ctx=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
            use_cache=True,
--- a/tests/test_modeling_gptj.py
+++ b/tests/test_modeling_gptj.py
@@ -142,7 +142,6 @@ class GPTJModelTester:
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            n_positions=self.max_position_embeddings,
-            n_ctx=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
            use_cache=True,
--- a/tests/test_modeling_openai.py
+++ b/tests/test_modeling_openai.py
@@ -90,7 +90,6 @@ class OpenAIGPTModelTester:
            # hidden_dropout_prob=self.hidden_dropout_prob,
            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            n_positions=self.max_position_embeddings,
-            n_ctx=self.max_position_embeddings,
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range
            pad_token_id=self.pad_token_id,
--- a/tests/test_modeling_tf_ctrl.py
+++ b/tests/test_modeling_tf_ctrl.py
@@ -97,7 +97,6 @@ class TFCTRLModelTester(object):
            # hidden_dropout_prob=self.hidden_dropout_prob,
            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            n_positions=self.max_position_embeddings,
-            n_ctx=self.max_position_embeddings,
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range,
            pad_token_id=self.pad_token_id,
--- a/tests/test_modeling_tf_gpt2.py
+++ b/tests/test_modeling_tf_gpt2.py
@@ -100,7 +100,6 @@ class TFGPT2ModelTester:
            # hidden_dropout_prob=self.hidden_dropout_prob,
            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            n_positions=self.max_position_embeddings,
-            n_ctx=self.max_position_embeddings,
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range
            bos_token_id=self.bos_token_id,
--- a/tests/test_modeling_tf_openai.py
+++ b/tests/test_modeling_tf_openai.py
@@ -98,7 +98,6 @@ class TFOpenAIGPTModelTester:
            # hidden_dropout_prob=self.hidden_dropout_prob,
            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            n_positions=self.max_position_embeddings,
-            n_ctx=self.max_position_embeddings,
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range,
            pad_token_id=self.pad_token_id,
--- a/tests/test_trainer.py
+++ b/tests/test_trainer.py
@@ -490,7 +490,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        _ = trainer.predict(eval_dataset)

    def test_evaluation_with_keys_to_drop(self):
-        config = GPT2Config(vocab_size=100, n_positions=128, n_ctx=128, n_embd=32, n_layer=3, n_head=4)
+        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
        tiny_gpt2 = GPT2LMHeadModel(config)
        x = torch.randint(0, 100, (128,))
        eval_dataset = RepeatDataset(x)
@@ -531,7 +531,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        self.assertEqual(train_output.global_step, 10)

    def test_logging_inf_nan_filter(self):
-        config = GPT2Config(vocab_size=100, n_positions=128, n_ctx=128, n_embd=32, n_layer=3, n_head=4)
+        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
        tiny_gpt2 = GPT2LMHeadModel(config)
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)