From 5b45422b58da508dab8b18852609a279517b944d Mon Sep 17 00:00:00 2001
From: Thomas Wang <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 29 Oct 2021 11:50:25 +0200
Subject: [PATCH] Remove n_ctx from configs (#14165)

* Remove n_ctx from configs

* Fix GPTJ and OpenAIGPT; both are acceptable breaking changes, as no existing configs are broken by them

* Remove unnecessary n_positions from TFOpenAIGPT
---
 .../distillation/training_configs/distilgpt2.json    |  1 -
 src/transformers/models/clip/modeling_clip.py        |  2 +-
 src/transformers/models/clip/modeling_flax_clip.py   |  2 +-
 src/transformers/models/ctrl/configuration_ctrl.py   |  4 ----
 src/transformers/models/gpt2/configuration_gpt2.py   |  4 ----
 src/transformers/models/gpt2/modeling_tf_gpt2.py     |  9 ++++-----
 .../gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py    |  2 +-
 src/transformers/models/gptj/configuration_gptj.py   |  4 ----
 src/transformers/models/gptj/modeling_gptj.py        |  9 ++++-----
 .../convert_megatron_gpt2_checkpoint.py              | 12 ++++++------
 .../models/openai/configuration_openai.py            |  4 ----
 src/transformers/models/openai/modeling_openai.py    | 12 +++++++-----
 src/transformers/models/openai/modeling_tf_openai.py |  9 ++++-----
 tests/test_modeling_ctrl.py                          |  1 -
 tests/test_modeling_flax_gpt2.py                     |  1 -
 tests/test_modeling_gpt2.py                          |  1 -
 tests/test_modeling_gptj.py                          |  1 -
 tests/test_modeling_openai.py                        |  1 -
 tests/test_modeling_tf_ctrl.py                       |  1 -
 tests/test_modeling_tf_gpt2.py                       |  1 -
 tests/test_modeling_tf_openai.py                     |  1 -
 tests/test_trainer.py                                |  4 ++--
 22 files changed, 30 insertions(+), 56 deletions(-)

diff --git a/examples/research_projects/distillation/training_configs/distilgpt2.json b/examples/research_projects/distillation/training_configs/distilgpt2.json
index 8616e8e60f..9820ac93b8 100644
--- a/examples/research_projects/distillation/training_configs/distilgpt2.json
+++ b/examples/research_projects/distillation/training_configs/distilgpt2.json
@@ -1,7 +1,6 @@
 {
 	"initializer_range": 0.02,
 	"layer_norm_epsilon": 0.00001,
-	"n_ctx": 1024,
 	"n_embd": 768,
 	"n_head": 12,
 	"n_layer": 6,
diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py
index 4cd748c6f4..dfd8596fd1 100755
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@@ -653,7 +653,7 @@ class CLIPTextTransformer(nn.Module):
         last_hidden_state = encoder_outputs[0]
         last_hidden_state = self.final_layer_norm(last_hidden_state)
 
-        # text_embeds.shape = [batch_size, n_ctx, transformer.width]
+        # text_embeds.shape = [batch_size, sequence_length, transformer.width]
         # take features from the eot embedding (eot_token is the highest number in each sequence)
         pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)]
 
diff --git a/src/transformers/models/clip/modeling_flax_clip.py b/src/transformers/models/clip/modeling_flax_clip.py
index b38142369a..dbb23c25f7 100644
--- a/src/transformers/models/clip/modeling_flax_clip.py
+++ b/src/transformers/models/clip/modeling_flax_clip.py
@@ -521,7 +521,7 @@ class FlaxCLIPTextTransformer(nn.Module):
         last_hidden_state = encoder_outputs[0]
         last_hidden_state = self.final_layer_norm(last_hidden_state)
 
-        # text_embeds.shape = [batch_size, n_ctx, transformer.width]
+        # text_embeds.shape = [batch_size, sequence_length, transformer.width]
         # take features from the EOS embedding (eos_token_id is the highest number in each sequence)
         pooled_output = last_hidden_state[jnp.arange(last_hidden_state.shape[0]), input_ids.argmax(axis=-1)]
 
diff --git a/src/transformers/models/ctrl/configuration_ctrl.py b/src/transformers/models/ctrl/configuration_ctrl.py
index bd045586b7..2db3f778f8 100644
--- a/src/transformers/models/ctrl/configuration_ctrl.py
+++ b/src/transformers/models/ctrl/configuration_ctrl.py
@@ -41,8 +41,6 @@ class CTRLConfig(PretrainedConfig):
         n_positions (:obj:`int`, `optional`, defaults to 256):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        n_ctx (:obj:`int`, `optional`, defaults to 256):
-            Dimensionality of the causal mask (usually same as n_positions).
         n_embd (:obj:`int`, `optional`, defaults to 1280):
             Dimensionality of the embeddings and hidden states.
         dff (:obj:`int`, `optional`, defaults to 8192):
@@ -92,7 +90,6 @@ class CTRLConfig(PretrainedConfig):
         self,
         vocab_size=246534,
         n_positions=256,
-        n_ctx=256,
         n_embd=1280,
         dff=8192,
         n_layer=48,
@@ -111,7 +108,6 @@ class CTRLConfig(PretrainedConfig):
         **kwargs
     ):
         self.vocab_size = vocab_size
-        self.n_ctx = n_ctx
         self.n_positions = n_positions
         self.n_embd = n_embd
         self.n_layer = n_layer
diff --git a/src/transformers/models/gpt2/configuration_gpt2.py b/src/transformers/models/gpt2/configuration_gpt2.py
index f527cd8238..be4f8df0a8 100644
--- a/src/transformers/models/gpt2/configuration_gpt2.py
+++ b/src/transformers/models/gpt2/configuration_gpt2.py
@@ -54,8 +54,6 @@ class GPT2Config(PretrainedConfig):
         n_positions (:obj:`int`, `optional`, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        n_ctx (:obj:`int`, `optional`, defaults to 1024):
-            Dimensionality of the causal mask (usually same as n_positions).
         n_embd (:obj:`int`, `optional`, defaults to 768):
             Dimensionality of the embeddings and hidden states.
         n_layer (:obj:`int`, `optional`, defaults to 12):
@@ -144,7 +142,6 @@ class GPT2Config(PretrainedConfig):
         self,
         vocab_size=50257,
         n_positions=1024,
-        n_ctx=1024,
         n_embd=768,
         n_layer=12,
         n_head=12,
@@ -169,7 +166,6 @@ class GPT2Config(PretrainedConfig):
         **kwargs,
     ):
         self.vocab_size = vocab_size
-        self.n_ctx = n_ctx
         self.n_positions = n_positions
         self.n_embd = n_embd
         self.n_layer = n_layer
diff --git a/src/transformers/models/gpt2/modeling_tf_gpt2.py b/src/transformers/models/gpt2/modeling_tf_gpt2.py
index 609446e8c4..9546936099 100644
--- a/src/transformers/models/gpt2/modeling_tf_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py
@@ -66,13 +66,12 @@ TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
 
 
 class TFAttention(tf.keras.layers.Layer):
-    def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
+    def __init__(self, nx, config, scale=False, **kwargs):
         super().__init__(**kwargs)
 
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
         # [switch nx => n_state from Block to Attention to keep identical to TF implementation]
         assert n_state % config.n_head == 0
-        self.n_ctx = n_ctx
         self.n_head = config.n_head
         self.split_size = n_state
         self.scale = scale
@@ -185,12 +184,12 @@ class TFMLP(tf.keras.layers.Layer):
 
 
 class TFBlock(tf.keras.layers.Layer):
-    def __init__(self, n_ctx, config, scale=False, **kwargs):
+    def __init__(self, config, scale=False, **kwargs):
         super().__init__(**kwargs)
         nx = config.n_embd
         inner_dim = config.n_inner if config.n_inner is not None else 4 * nx
         self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
-        self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
+        self.attn = TFAttention(nx, config, scale, name="attn")
         self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
         self.mlp = TFMLP(inner_dim, config, name="mlp")
 
@@ -233,7 +232,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
             config.vocab_size, config.hidden_size, initializer_range=config.initializer_range, name="wte"
         )
         self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
-        self.h = [TFBlock(config.n_ctx, config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]
+        self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]
         self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f")
 
     def build(self, input_shape):
diff --git a/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py b/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py
index 1c630fb2d8..7ee1c17477 100644
--- a/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py
+++ b/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py
@@ -33,7 +33,7 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_du
         num_layers=config_json["n_layer"],
         num_heads=config_json["n_head"],
         attention_types=config_json["attention_types"],
-        max_position_embeddings=config_json["n_ctx"],
+        max_position_embeddings=config_json["n_positions"],
         resid_dropout=config_json["res_dropout"],
         embed_dropout=config_json["embed_dropout"],
         attention_dropout=config_json["attn_dropout"],
diff --git a/src/transformers/models/gptj/configuration_gptj.py b/src/transformers/models/gptj/configuration_gptj.py
index 61dfd4e663..40408cb19f 100644
--- a/src/transformers/models/gptj/configuration_gptj.py
+++ b/src/transformers/models/gptj/configuration_gptj.py
@@ -42,8 +42,6 @@ class GPTJConfig(PretrainedConfig):
         n_positions (:obj:`int`, `optional`, defaults to 2048):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        n_ctx (:obj:`int`, `optional`, defaults to 2048):
-            Dimensionality of the causal mask (usually same as n_positions).
         n_embd (:obj:`int`, `optional`, defaults to 4096):
             Dimensionality of the embeddings and hidden states.
         n_layer (:obj:`int`, `optional`, defaults to 28):
@@ -96,7 +94,6 @@ class GPTJConfig(PretrainedConfig):
         self,
         vocab_size=50400,
         n_positions=2048,
-        n_ctx=2048,
         n_embd=4096,
         n_layer=28,
         n_head=16,
@@ -115,7 +112,6 @@ class GPTJConfig(PretrainedConfig):
         **kwargs
     ):
         self.vocab_size = vocab_size
-        self.n_ctx = n_ctx
         self.n_positions = n_positions
         self.n_embd = n_embd
         self.n_layer = n_layer
diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py
index e81819b7f8..15d16627ad 100755
--- a/src/transformers/models/gptj/modeling_gptj.py
+++ b/src/transformers/models/gptj/modeling_gptj.py
@@ -99,7 +99,7 @@ class GPTJAttention(nn.Module):
 
     def _split_heads(self, tensor, num_attention_heads, attn_head_size, rotary):
         """
-        Splits n_ctx dim into attn_head_size and num_attention_heads
+        Splits hidden dim into attn_head_size and num_attention_heads
         """
         new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size)
         tensor = tensor.view(*new_shape)
@@ -114,7 +114,7 @@ class GPTJAttention(nn.Module):
 
     def _merge_heads(self, tensor, num_attention_heads, attn_head_size):
         """
-        Merges attn_head_size dim and num_attn_heads dim into n_ctx
+        Merges attn_head_size dim and num_attn_heads dim into hidden dim
         """
         if len(tensor.shape) == 5:
             tensor = tensor.permute(0, 1, 3, 2, 4).contiguous()
@@ -377,7 +377,7 @@ GPTJ_INPUTS_DOCSTRING = r"""
             - 1 indicates the head is **not masked**,
             - 0 indicates the head is **masked**.
 
-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, n_ctx)`, `optional`):
+        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_dim)`, `optional`):
             Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
             This is useful if you want more control over how to convert `input_ids` indices into associated vectors
             than the model's internal embedding lookup matrix.
@@ -444,7 +444,6 @@ class GPTJModel(GPTJPreTrainedModel):
         self.drop = nn.Dropout(config.embd_pdrop)
         self.h = nn.ModuleList([GPTJBlock(config) for _ in range(config.n_layer)])
         self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
-        self.rotary_dim = min(config.rotary_dim, config.n_ctx // config.num_attention_heads)
         self.init_weights()
 
         # Model parallel
@@ -854,7 +853,7 @@ class GPTJForSequenceClassification(GPTJPreTrainedModel):
         super().__init__(config)
         self.num_labels = config.num_labels
         self.transformer = GPTJModel(config)
-        self.score = nn.Linear(config.n_ctx, self.num_labels, bias=False)
+        self.score = nn.Linear(config.n_positions, self.num_labels, bias=False)
 
         self.init_weights()
 
diff --git a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py
index e5c114b436..cacb22b28e 100644
--- a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py
+++ b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py
@@ -88,7 +88,6 @@ def convert_megatron_checkpoint(args, input_state_dict, config):
 
         config.vocab_size = ds_args.padded_vocab_size
         config.n_positions = ds_args.max_position_embeddings
-        config.n_ctx = ds_args.seq_length
         config.n_embd = ds_args.hidden_size
         config.n_layer = ds_args.num_layers
         config.n_head = ds_args.num_attention_heads
@@ -121,10 +120,10 @@ def convert_megatron_checkpoint(args, input_state_dict, config):
     # The position embeddings.
     pos_embeddings = embeddings["position_embeddings"]["weight"]
     # Read the causal mask dimension (seqlen). [max_sequence_length, hidden_size]
-    n_ctx = pos_embeddings.size(0)
+    n_positions = pos_embeddings.size(0)
     assert (
-        n_ctx == config.n_ctx
-    ), f"pos_embeddings.max_sequence_length={n_ctx} and config.n_ctx={config.n_ctx} don't match"
+        n_positions == config.n_positions
+    ), f"pos_embeddings.max_sequence_length={n_positions} and config.n_positions={config.n_positions} don't match"
     # Store the position embeddings.
     output_state_dict["transformer.wpe.weight"] = pos_embeddings
 
@@ -173,7 +172,9 @@ def convert_megatron_checkpoint(args, input_state_dict, config):
         ) and weight_or_bias == "weight":
 
             # Insert a tensor of 1x1xDxD bias.
-            causal_mask = torch.tril(torch.ones((n_ctx, n_ctx), dtype=torch.float16)).view(1, 1, n_ctx, n_ctx)
+            causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.float16)).view(
+                1, 1, n_positions, n_positions
+            )
             output_state_dict[layer_name + ".attn.bias"] = causal_mask
 
             # Insert a "dummy" tensor for masked_bias.
@@ -274,7 +275,6 @@ def main():
         config = GPT2Config(
             vocab_size=50257,
             n_positions=1024,
-            n_ctx=1024,
             n_embd=1024,
             n_layer=24,
             n_head=16,
diff --git a/src/transformers/models/openai/configuration_openai.py b/src/transformers/models/openai/configuration_openai.py
index f23fe74201..5ba2a80078 100644
--- a/src/transformers/models/openai/configuration_openai.py
+++ b/src/transformers/models/openai/configuration_openai.py
@@ -42,8 +42,6 @@ class OpenAIGPTConfig(PretrainedConfig):
         n_positions (:obj:`int`, `optional`, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        n_ctx (:obj:`int`, `optional`, defaults to 512):
-            Dimensionality of the causal mask (usually same as n_positions).
         n_embd (:obj:`int`, `optional`, defaults to 768):
             Dimensionality of the embeddings and hidden states.
         n_layer (:obj:`int`, `optional`, defaults to 12):
@@ -126,7 +124,6 @@ class OpenAIGPTConfig(PretrainedConfig):
         self,
         vocab_size=40478,
         n_positions=512,
-        n_ctx=512,
         n_embd=768,
         n_layer=12,
         n_head=12,
@@ -145,7 +142,6 @@ class OpenAIGPTConfig(PretrainedConfig):
         **kwargs
     ):
         self.vocab_size = vocab_size
-        self.n_ctx = n_ctx
         self.n_positions = n_positions
         self.n_embd = n_embd
         self.n_layer = n_layer
diff --git a/src/transformers/models/openai/modeling_openai.py b/src/transformers/models/openai/modeling_openai.py
index 0ce9344dac..c3f36a8889 100644
--- a/src/transformers/models/openai/modeling_openai.py
+++ b/src/transformers/models/openai/modeling_openai.py
@@ -143,12 +143,14 @@ ACT_FNS = {"relu": nn.ReLU, "silu": silu, "gelu": gelu_new, "swish": silu}
 
 
 class Attention(nn.Module):
-    def __init__(self, nx, n_ctx, config, scale=False):
+    def __init__(self, nx, n_positions, config, scale=False):
         super().__init__()
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
         # [switch nx => n_state from Block to Attention to keep identical to TF implementation]
         assert n_state % config.n_head == 0
-        self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
+        self.register_buffer(
+            "bias", torch.tril(torch.ones(n_positions, n_positions)).view(1, 1, n_positions, n_positions)
+        )
         self.n_head = config.n_head
         self.split_size = n_state
         self.scale = scale
@@ -246,10 +248,10 @@ class MLP(nn.Module):
 
 
 class Block(nn.Module):
-    def __init__(self, n_ctx, config, scale=False):
+    def __init__(self, n_positions, config, scale=False):
         super().__init__()
         nx = config.n_embd
-        self.attn = Attention(nx, n_ctx, config, scale)
+        self.attn = Attention(nx, n_positions, config, scale)
         self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
         self.mlp = MLP(4 * nx, config)
         self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
@@ -413,7 +415,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd)
         self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)
-        self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
+        self.h = nn.ModuleList([Block(config.n_positions, config, scale=True) for _ in range(config.n_layer)])
 
         self.register_buffer("position_ids", torch.arange(config.n_positions))
         self.init_weights()
diff --git a/src/transformers/models/openai/modeling_tf_openai.py b/src/transformers/models/openai/modeling_tf_openai.py
index e4d5b80209..221f9c63cb 100644
--- a/src/transformers/models/openai/modeling_tf_openai.py
+++ b/src/transformers/models/openai/modeling_tf_openai.py
@@ -58,7 +58,7 @@ TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
 
 
 class TFAttention(tf.keras.layers.Layer):
-    def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
+    def __init__(self, nx, config, scale=False, **kwargs):
         super().__init__(**kwargs)
 
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
@@ -66,7 +66,6 @@ class TFAttention(tf.keras.layers.Layer):
         assert (
             n_state % config.n_head == 0
         ), f"Hidden dimension {n_state} not dividable by number of heads {config.n_head}"
-        self.n_ctx = n_ctx
         self.n_head = config.n_head
         self.split_size = n_state
         self.scale = scale
@@ -169,10 +168,10 @@ class TFMLP(tf.keras.layers.Layer):
 
 
 class TFBlock(tf.keras.layers.Layer):
-    def __init__(self, n_ctx, config, scale=False, **kwargs):
+    def __init__(self, config, scale=False, **kwargs):
         super().__init__(**kwargs)
         nx = config.n_embd
-        self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
+        self.attn = TFAttention(nx, config, scale, name="attn")
         self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
         self.mlp = TFMLP(4 * nx, config, name="mlp")
         self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
@@ -210,7 +209,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
             config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="tokens_embed"
         )
         self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
-        self.h = [TFBlock(config.n_ctx, config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]
+        self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]
 
     def build(self, input_shape):
         with tf.name_scope("positions_embed"):
diff --git a/tests/test_modeling_ctrl.py b/tests/test_modeling_ctrl.py
index c35e3cc025..15736ad8e3 100644
--- a/tests/test_modeling_ctrl.py
+++ b/tests/test_modeling_ctrl.py
@@ -114,7 +114,6 @@ class CTRLModelTester:
             # hidden_dropout_prob=self.hidden_dropout_prob,
             # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
             n_positions=self.max_position_embeddings,
-            n_ctx=self.max_position_embeddings,
             # type_vocab_size=self.type_vocab_size,
             # initializer_range=self.initializer_range,
             pad_token_id=self.pad_token_id,
diff --git a/tests/test_modeling_flax_gpt2.py b/tests/test_modeling_flax_gpt2.py
index 3b2e43680e..3eed483a6f 100644
--- a/tests/test_modeling_flax_gpt2.py
+++ b/tests/test_modeling_flax_gpt2.py
@@ -95,7 +95,6 @@ class FlaxGPT2ModelTester:
             n_layer=self.num_hidden_layers,
             n_head=self.num_attention_heads,
             n_positions=self.max_position_embeddings,
-            n_ctx=self.max_position_embeddings,
             use_cache=False,
             bos_token_id=self.bos_token_id,
             eos_token_id=self.eos_token_id,
diff --git a/tests/test_modeling_gpt2.py b/tests/test_modeling_gpt2.py
index 462c6456d2..ef51c815e4 100644
--- a/tests/test_modeling_gpt2.py
+++ b/tests/test_modeling_gpt2.py
@@ -155,7 +155,6 @@ class GPT2ModelTester:
             resid_pdrop=self.hidden_dropout_prob,
             attn_pdrop=self.attention_probs_dropout_prob,
             n_positions=self.max_position_embeddings,
-            n_ctx=self.max_position_embeddings,
             type_vocab_size=self.type_vocab_size,
             initializer_range=self.initializer_range,
             use_cache=True,
diff --git a/tests/test_modeling_gptj.py b/tests/test_modeling_gptj.py
index e94aac990b..e0ef8a905e 100644
--- a/tests/test_modeling_gptj.py
+++ b/tests/test_modeling_gptj.py
@@ -142,7 +142,6 @@ class GPTJModelTester:
             hidden_dropout_prob=self.hidden_dropout_prob,
             attention_probs_dropout_prob=self.attention_probs_dropout_prob,
             n_positions=self.max_position_embeddings,
-            n_ctx=self.max_position_embeddings,
             type_vocab_size=self.type_vocab_size,
             initializer_range=self.initializer_range,
             use_cache=True,
diff --git a/tests/test_modeling_openai.py b/tests/test_modeling_openai.py
index 08ee51df3f..584fac66b0 100644
--- a/tests/test_modeling_openai.py
+++ b/tests/test_modeling_openai.py
@@ -90,7 +90,6 @@ class OpenAIGPTModelTester:
             # hidden_dropout_prob=self.hidden_dropout_prob,
             # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
             n_positions=self.max_position_embeddings,
-            n_ctx=self.max_position_embeddings,
             # type_vocab_size=self.type_vocab_size,
             # initializer_range=self.initializer_range
             pad_token_id=self.pad_token_id,
diff --git a/tests/test_modeling_tf_ctrl.py b/tests/test_modeling_tf_ctrl.py
index e9531552bd..6e4d73cc57 100644
--- a/tests/test_modeling_tf_ctrl.py
+++ b/tests/test_modeling_tf_ctrl.py
@@ -97,7 +97,6 @@ class TFCTRLModelTester(object):
             # hidden_dropout_prob=self.hidden_dropout_prob,
             # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
             n_positions=self.max_position_embeddings,
-            n_ctx=self.max_position_embeddings,
             # type_vocab_size=self.type_vocab_size,
             # initializer_range=self.initializer_range,
             pad_token_id=self.pad_token_id,
diff --git a/tests/test_modeling_tf_gpt2.py b/tests/test_modeling_tf_gpt2.py
index 8e13f0fdc1..609c4731b0 100644
--- a/tests/test_modeling_tf_gpt2.py
+++ b/tests/test_modeling_tf_gpt2.py
@@ -100,7 +100,6 @@ class TFGPT2ModelTester:
             # hidden_dropout_prob=self.hidden_dropout_prob,
             # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
             n_positions=self.max_position_embeddings,
-            n_ctx=self.max_position_embeddings,
             # type_vocab_size=self.type_vocab_size,
             # initializer_range=self.initializer_range
             bos_token_id=self.bos_token_id,
diff --git a/tests/test_modeling_tf_openai.py b/tests/test_modeling_tf_openai.py
index 4dc684adb7..c6d3a09895 100644
--- a/tests/test_modeling_tf_openai.py
+++ b/tests/test_modeling_tf_openai.py
@@ -98,7 +98,6 @@ class TFOpenAIGPTModelTester:
             # hidden_dropout_prob=self.hidden_dropout_prob,
             # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
             n_positions=self.max_position_embeddings,
-            n_ctx=self.max_position_embeddings,
             # type_vocab_size=self.type_vocab_size,
             # initializer_range=self.initializer_range,
             pad_token_id=self.pad_token_id,
diff --git a/tests/test_trainer.py b/tests/test_trainer.py
index 72fda10c8f..b1e6e0f5a9 100644
--- a/tests/test_trainer.py
+++ b/tests/test_trainer.py
@@ -490,7 +490,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         _ = trainer.predict(eval_dataset)
 
     def test_evaluation_with_keys_to_drop(self):
-        config = GPT2Config(vocab_size=100, n_positions=128, n_ctx=128, n_embd=32, n_layer=3, n_head=4)
+        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
         tiny_gpt2 = GPT2LMHeadModel(config)
         x = torch.randint(0, 100, (128,))
         eval_dataset = RepeatDataset(x)
@@ -531,7 +531,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         self.assertEqual(train_output.global_step, 10)
 
     def test_logging_inf_nan_filter(self):
-        config = GPT2Config(vocab_size=100, n_positions=128, n_ctx=128, n_embd=32, n_layer=3, n_head=4)
+        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
         tiny_gpt2 = GPT2LMHeadModel(config)
         x = torch.randint(0, 100, (128,))
         train_dataset = RepeatDataset(x)