From 5b45422b58da508dab8b18852609a279517b944d Mon Sep 17 00:00:00 2001 From: Thomas Wang <24695242+thomasw21@users.noreply.github.com> Date: Fri, 29 Oct 2021 11:50:25 +0200 Subject: [PATCH] Remove n_ctx from configs (#14165) * Remove n_ctx from configs * Fix GPTJ and OpenAIGPT, both are acceptable breaking changes as there are no configs such that it breaks * Remove unnecessary n_positions from TFOpenAIGPT --- .../distillation/training_configs/distilgpt2.json | 1 - src/transformers/models/clip/modeling_clip.py | 2 +- src/transformers/models/clip/modeling_flax_clip.py | 2 +- src/transformers/models/ctrl/configuration_ctrl.py | 4 ---- src/transformers/models/gpt2/configuration_gpt2.py | 4 ---- src/transformers/models/gpt2/modeling_tf_gpt2.py | 9 ++++----- .../gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py | 2 +- src/transformers/models/gptj/configuration_gptj.py | 4 ---- src/transformers/models/gptj/modeling_gptj.py | 9 ++++----- .../convert_megatron_gpt2_checkpoint.py | 12 ++++++------ .../models/openai/configuration_openai.py | 4 ---- src/transformers/models/openai/modeling_openai.py | 12 +++++++----- src/transformers/models/openai/modeling_tf_openai.py | 9 ++++----- tests/test_modeling_ctrl.py | 1 - tests/test_modeling_flax_gpt2.py | 1 - tests/test_modeling_gpt2.py | 1 - tests/test_modeling_gptj.py | 1 - tests/test_modeling_openai.py | 1 - tests/test_modeling_tf_ctrl.py | 1 - tests/test_modeling_tf_gpt2.py | 1 - tests/test_modeling_tf_openai.py | 1 - tests/test_trainer.py | 4 ++-- 22 files changed, 30 insertions(+), 56 deletions(-) diff --git a/examples/research_projects/distillation/training_configs/distilgpt2.json b/examples/research_projects/distillation/training_configs/distilgpt2.json index 8616e8e60f..9820ac93b8 100644 --- a/examples/research_projects/distillation/training_configs/distilgpt2.json +++ b/examples/research_projects/distillation/training_configs/distilgpt2.json @@ -1,7 +1,6 @@ { "initializer_range": 0.02, "layer_norm_epsilon": 0.00001, - "n_ctx": 
1024, "n_embd": 768, "n_head": 12, "n_layer": 6, diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index 4cd748c6f4..dfd8596fd1 100755 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -653,7 +653,7 @@ class CLIPTextTransformer(nn.Module): last_hidden_state = encoder_outputs[0] last_hidden_state = self.final_layer_norm(last_hidden_state) - # text_embeds.shape = [batch_size, n_ctx, transformer.width] + # text_embeds.shape = [batch_size, sequence_length, transformer.width] # take features from the eot embedding (eot_token is the highest number in each sequence) pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)] diff --git a/src/transformers/models/clip/modeling_flax_clip.py b/src/transformers/models/clip/modeling_flax_clip.py index b38142369a..dbb23c25f7 100644 --- a/src/transformers/models/clip/modeling_flax_clip.py +++ b/src/transformers/models/clip/modeling_flax_clip.py @@ -521,7 +521,7 @@ class FlaxCLIPTextTransformer(nn.Module): last_hidden_state = encoder_outputs[0] last_hidden_state = self.final_layer_norm(last_hidden_state) - # text_embeds.shape = [batch_size, n_ctx, transformer.width] + # text_embeds.shape = [batch_size, sequence_length, transformer.width] # take features from the EOS embedding (eos_token_id is the highest number in each sequence) pooled_output = last_hidden_state[jnp.arange(last_hidden_state.shape[0]), input_ids.argmax(axis=-1)] diff --git a/src/transformers/models/ctrl/configuration_ctrl.py b/src/transformers/models/ctrl/configuration_ctrl.py index bd045586b7..2db3f778f8 100644 --- a/src/transformers/models/ctrl/configuration_ctrl.py +++ b/src/transformers/models/ctrl/configuration_ctrl.py @@ -41,8 +41,6 @@ class CTRLConfig(PretrainedConfig): n_positions (:obj:`int`, `optional`, defaults to 256): The maximum sequence length that this model might ever be used with. 
Typically set this to something large just in case (e.g., 512 or 1024 or 2048). - n_ctx (:obj:`int`, `optional`, defaults to 256): - Dimensionality of the causal mask (usually same as n_positions). n_embd (:obj:`int`, `optional`, defaults to 1280): Dimensionality of the embeddings and hidden states. dff (:obj:`int`, `optional`, defaults to 8192): @@ -92,7 +90,6 @@ class CTRLConfig(PretrainedConfig): self, vocab_size=246534, n_positions=256, - n_ctx=256, n_embd=1280, dff=8192, n_layer=48, @@ -111,7 +108,6 @@ class CTRLConfig(PretrainedConfig): **kwargs ): self.vocab_size = vocab_size - self.n_ctx = n_ctx self.n_positions = n_positions self.n_embd = n_embd self.n_layer = n_layer diff --git a/src/transformers/models/gpt2/configuration_gpt2.py b/src/transformers/models/gpt2/configuration_gpt2.py index f527cd8238..be4f8df0a8 100644 --- a/src/transformers/models/gpt2/configuration_gpt2.py +++ b/src/transformers/models/gpt2/configuration_gpt2.py @@ -54,8 +54,6 @@ class GPT2Config(PretrainedConfig): n_positions (:obj:`int`, `optional`, defaults to 1024): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). - n_ctx (:obj:`int`, `optional`, defaults to 1024): - Dimensionality of the causal mask (usually same as n_positions). n_embd (:obj:`int`, `optional`, defaults to 768): Dimensionality of the embeddings and hidden states. 
n_layer (:obj:`int`, `optional`, defaults to 12): @@ -144,7 +142,6 @@ class GPT2Config(PretrainedConfig): self, vocab_size=50257, n_positions=1024, - n_ctx=1024, n_embd=768, n_layer=12, n_head=12, @@ -169,7 +166,6 @@ class GPT2Config(PretrainedConfig): **kwargs, ): self.vocab_size = vocab_size - self.n_ctx = n_ctx self.n_positions = n_positions self.n_embd = n_embd self.n_layer = n_layer diff --git a/src/transformers/models/gpt2/modeling_tf_gpt2.py b/src/transformers/models/gpt2/modeling_tf_gpt2.py index 609446e8c4..9546936099 100644 --- a/src/transformers/models/gpt2/modeling_tf_gpt2.py +++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py @@ -66,13 +66,12 @@ TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ class TFAttention(tf.keras.layers.Layer): - def __init__(self, nx, n_ctx, config, scale=False, **kwargs): + def __init__(self, nx, config, scale=False, **kwargs): super().__init__(**kwargs) n_state = nx # in Attention: n_state=768 (nx=n_embd) # [switch nx => n_state from Block to Attention to keep identical to TF implementation] assert n_state % config.n_head == 0 - self.n_ctx = n_ctx self.n_head = config.n_head self.split_size = n_state self.scale = scale @@ -185,12 +184,12 @@ class TFMLP(tf.keras.layers.Layer): class TFBlock(tf.keras.layers.Layer): - def __init__(self, n_ctx, config, scale=False, **kwargs): + def __init__(self, config, scale=False, **kwargs): super().__init__(**kwargs) nx = config.n_embd inner_dim = config.n_inner if config.n_inner is not None else 4 * nx self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") - self.attn = TFAttention(nx, n_ctx, config, scale, name="attn") + self.attn = TFAttention(nx, config, scale, name="attn") self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2") self.mlp = TFMLP(inner_dim, config, name="mlp") @@ -233,7 +232,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): config.vocab_size, config.hidden_size, 
initializer_range=config.initializer_range, name="wte" ) self.drop = tf.keras.layers.Dropout(config.embd_pdrop) - self.h = [TFBlock(config.n_ctx, config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)] + self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)] self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f") def build(self, input_shape): diff --git a/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py b/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py index 1c630fb2d8..7ee1c17477 100644 --- a/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py +++ b/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py @@ -33,7 +33,7 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_du num_layers=config_json["n_layer"], num_heads=config_json["n_head"], attention_types=config_json["attention_types"], - max_position_embeddings=config_json["n_ctx"], + max_position_embeddings=config_json["n_positions"], resid_dropout=config_json["res_dropout"], embed_dropout=config_json["embed_dropout"], attention_dropout=config_json["attn_dropout"], diff --git a/src/transformers/models/gptj/configuration_gptj.py b/src/transformers/models/gptj/configuration_gptj.py index 61dfd4e663..40408cb19f 100644 --- a/src/transformers/models/gptj/configuration_gptj.py +++ b/src/transformers/models/gptj/configuration_gptj.py @@ -42,8 +42,6 @@ class GPTJConfig(PretrainedConfig): n_positions (:obj:`int`, `optional`, defaults to 2048): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). - n_ctx (:obj:`int`, `optional`, defaults to 2048): - Dimensionality of the causal mask (usually same as n_positions). n_embd (:obj:`int`, `optional`, defaults to 4096): Dimensionality of the embeddings and hidden states. 
n_layer (:obj:`int`, `optional`, defaults to 28): @@ -96,7 +94,6 @@ class GPTJConfig(PretrainedConfig): self, vocab_size=50400, n_positions=2048, - n_ctx=2048, n_embd=4096, n_layer=28, n_head=16, @@ -115,7 +112,6 @@ class GPTJConfig(PretrainedConfig): **kwargs ): self.vocab_size = vocab_size - self.n_ctx = n_ctx self.n_positions = n_positions self.n_embd = n_embd self.n_layer = n_layer diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index e81819b7f8..15d16627ad 100755 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -99,7 +99,7 @@ class GPTJAttention(nn.Module): def _split_heads(self, tensor, num_attention_heads, attn_head_size, rotary): """ - Splits n_ctx dim into attn_head_size and num_attention_heads + Splits hidden dim into attn_head_size and num_attention_heads """ new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size) tensor = tensor.view(*new_shape) @@ -114,7 +114,7 @@ class GPTJAttention(nn.Module): def _merge_heads(self, tensor, num_attention_heads, attn_head_size): """ - Merges attn_head_size dim and num_attn_heads dim into n_ctx + Merges attn_head_size dim and num_attn_heads dim into hidden dim """ if len(tensor.shape) == 5: tensor = tensor.permute(0, 1, 3, 2, 4).contiguous() @@ -377,7 +377,7 @@ GPTJ_INPUTS_DOCSTRING = r""" - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. - inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, n_ctx)`, `optional`): + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_dim)`, `optional`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. 
@@ -444,7 +444,6 @@ class GPTJModel(GPTJPreTrainedModel): self.drop = nn.Dropout(config.embd_pdrop) self.h = nn.ModuleList([GPTJBlock(config) for _ in range(config.n_layer)]) self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) - self.rotary_dim = min(config.rotary_dim, config.n_ctx // config.num_attention_heads) self.init_weights() # Model parallel @@ -854,7 +853,7 @@ class GPTJForSequenceClassification(GPTJPreTrainedModel): super().__init__(config) self.num_labels = config.num_labels self.transformer = GPTJModel(config) - self.score = nn.Linear(config.n_ctx, self.num_labels, bias=False) + self.score = nn.Linear(config.n_positions, self.num_labels, bias=False) self.init_weights() diff --git a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py index e5c114b436..cacb22b28e 100644 --- a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py +++ b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py @@ -88,7 +88,6 @@ def convert_megatron_checkpoint(args, input_state_dict, config): config.vocab_size = ds_args.padded_vocab_size config.n_positions = ds_args.max_position_embeddings - config.n_ctx = ds_args.seq_length config.n_embd = ds_args.hidden_size config.n_layer = ds_args.num_layers config.n_head = ds_args.num_attention_heads @@ -121,10 +120,10 @@ def convert_megatron_checkpoint(args, input_state_dict, config): # The position embeddings. pos_embeddings = embeddings["position_embeddings"]["weight"] # Read the causal mask dimension (seqlen). 
[max_sequence_length, hidden_size] - n_ctx = pos_embeddings.size(0) + n_positions = pos_embeddings.size(0) assert ( - n_ctx == config.n_ctx - ), f"pos_embeddings.max_sequence_length={n_ctx} and config.n_ctx={config.n_ctx} don't match" + n_positions == config.n_positions + ), f"pos_embeddings.max_sequence_length={n_positions} and config.n_positions={config.n_positions} don't match" # Store the position embeddings. output_state_dict["transformer.wpe.weight"] = pos_embeddings @@ -173,7 +172,9 @@ def convert_megatron_checkpoint(args, input_state_dict, config): ) and weight_or_bias == "weight": # Insert a tensor of 1x1xDxD bias. - causal_mask = torch.tril(torch.ones((n_ctx, n_ctx), dtype=torch.float16)).view(1, 1, n_ctx, n_ctx) + causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.float16)).view( + 1, 1, n_positions, n_positions + ) output_state_dict[layer_name + ".attn.bias"] = causal_mask # Insert a "dummy" tensor for masked_bias. @@ -274,7 +275,6 @@ def main(): config = GPT2Config( vocab_size=50257, n_positions=1024, - n_ctx=1024, n_embd=1024, n_layer=24, n_head=16, diff --git a/src/transformers/models/openai/configuration_openai.py b/src/transformers/models/openai/configuration_openai.py index f23fe74201..5ba2a80078 100644 --- a/src/transformers/models/openai/configuration_openai.py +++ b/src/transformers/models/openai/configuration_openai.py @@ -42,8 +42,6 @@ class OpenAIGPTConfig(PretrainedConfig): n_positions (:obj:`int`, `optional`, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). - n_ctx (:obj:`int`, `optional`, defaults to 512): - Dimensionality of the causal mask (usually same as n_positions). n_embd (:obj:`int`, `optional`, defaults to 768): Dimensionality of the embeddings and hidden states. 
n_layer (:obj:`int`, `optional`, defaults to 12): @@ -126,7 +124,6 @@ class OpenAIGPTConfig(PretrainedConfig): self, vocab_size=40478, n_positions=512, - n_ctx=512, n_embd=768, n_layer=12, n_head=12, @@ -145,7 +142,6 @@ class OpenAIGPTConfig(PretrainedConfig): **kwargs ): self.vocab_size = vocab_size - self.n_ctx = n_ctx self.n_positions = n_positions self.n_embd = n_embd self.n_layer = n_layer diff --git a/src/transformers/models/openai/modeling_openai.py b/src/transformers/models/openai/modeling_openai.py index 0ce9344dac..c3f36a8889 100644 --- a/src/transformers/models/openai/modeling_openai.py +++ b/src/transformers/models/openai/modeling_openai.py @@ -143,12 +143,14 @@ ACT_FNS = {"relu": nn.ReLU, "silu": silu, "gelu": gelu_new, "swish": silu} class Attention(nn.Module): - def __init__(self, nx, n_ctx, config, scale=False): + def __init__(self, nx, n_positions, config, scale=False): super().__init__() n_state = nx # in Attention: n_state=768 (nx=n_embd) # [switch nx => n_state from Block to Attention to keep identical to TF implementation] assert n_state % config.n_head == 0 - self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx)) + self.register_buffer( + "bias", torch.tril(torch.ones(n_positions, n_positions)).view(1, 1, n_positions, n_positions) + ) self.n_head = config.n_head self.split_size = n_state self.scale = scale @@ -246,10 +248,10 @@ class MLP(nn.Module): class Block(nn.Module): - def __init__(self, n_ctx, config, scale=False): + def __init__(self, n_positions, config, scale=False): super().__init__() nx = config.n_embd - self.attn = Attention(nx, n_ctx, config, scale) + self.attn = Attention(nx, n_positions, config, scale) self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) self.mlp = MLP(4 * nx, config) self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) @@ -413,7 +415,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd) 
self.positions_embed = nn.Embedding(config.n_positions, config.n_embd) self.drop = nn.Dropout(config.embd_pdrop) - self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)]) + self.h = nn.ModuleList([Block(config.n_positions, config, scale=True) for _ in range(config.n_layer)]) self.register_buffer("position_ids", torch.arange(config.n_positions)) self.init_weights() diff --git a/src/transformers/models/openai/modeling_tf_openai.py b/src/transformers/models/openai/modeling_tf_openai.py index e4d5b80209..221f9c63cb 100644 --- a/src/transformers/models/openai/modeling_tf_openai.py +++ b/src/transformers/models/openai/modeling_tf_openai.py @@ -58,7 +58,7 @@ TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [ class TFAttention(tf.keras.layers.Layer): - def __init__(self, nx, n_ctx, config, scale=False, **kwargs): + def __init__(self, nx, config, scale=False, **kwargs): super().__init__(**kwargs) n_state = nx # in Attention: n_state=768 (nx=n_embd) @@ -66,7 +66,6 @@ class TFAttention(tf.keras.layers.Layer): assert ( n_state % config.n_head == 0 ), f"Hidden dimension {n_state} not dividable by number of heads {config.n_head}" - self.n_ctx = n_ctx self.n_head = config.n_head self.split_size = n_state self.scale = scale @@ -169,10 +168,10 @@ class TFMLP(tf.keras.layers.Layer): class TFBlock(tf.keras.layers.Layer): - def __init__(self, n_ctx, config, scale=False, **kwargs): + def __init__(self, config, scale=False, **kwargs): super().__init__(**kwargs) nx = config.n_embd - self.attn = TFAttention(nx, n_ctx, config, scale, name="attn") + self.attn = TFAttention(nx, config, scale, name="attn") self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") self.mlp = TFMLP(4 * nx, config, name="mlp") self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2") @@ -210,7 +209,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): config.vocab_size, config.n_embd, 
initializer_range=config.initializer_range, name="tokens_embed" ) self.drop = tf.keras.layers.Dropout(config.embd_pdrop) - self.h = [TFBlock(config.n_ctx, config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)] + self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)] def build(self, input_shape): with tf.name_scope("positions_embed"): diff --git a/tests/test_modeling_ctrl.py b/tests/test_modeling_ctrl.py index c35e3cc025..15736ad8e3 100644 --- a/tests/test_modeling_ctrl.py +++ b/tests/test_modeling_ctrl.py @@ -114,7 +114,6 @@ class CTRLModelTester: # hidden_dropout_prob=self.hidden_dropout_prob, # attention_probs_dropout_prob=self.attention_probs_dropout_prob, n_positions=self.max_position_embeddings, - n_ctx=self.max_position_embeddings, # type_vocab_size=self.type_vocab_size, # initializer_range=self.initializer_range, pad_token_id=self.pad_token_id, diff --git a/tests/test_modeling_flax_gpt2.py b/tests/test_modeling_flax_gpt2.py index 3b2e43680e..3eed483a6f 100644 --- a/tests/test_modeling_flax_gpt2.py +++ b/tests/test_modeling_flax_gpt2.py @@ -95,7 +95,6 @@ class FlaxGPT2ModelTester: n_layer=self.num_hidden_layers, n_head=self.num_attention_heads, n_positions=self.max_position_embeddings, - n_ctx=self.max_position_embeddings, use_cache=False, bos_token_id=self.bos_token_id, eos_token_id=self.eos_token_id, diff --git a/tests/test_modeling_gpt2.py b/tests/test_modeling_gpt2.py index 462c6456d2..ef51c815e4 100644 --- a/tests/test_modeling_gpt2.py +++ b/tests/test_modeling_gpt2.py @@ -155,7 +155,6 @@ class GPT2ModelTester: resid_pdrop=self.hidden_dropout_prob, attn_pdrop=self.attention_probs_dropout_prob, n_positions=self.max_position_embeddings, - n_ctx=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range, use_cache=True, diff --git a/tests/test_modeling_gptj.py b/tests/test_modeling_gptj.py index e94aac990b..e0ef8a905e 100644 --- a/tests/test_modeling_gptj.py +++ 
b/tests/test_modeling_gptj.py @@ -142,7 +142,6 @@ class GPTJModelTester: hidden_dropout_prob=self.hidden_dropout_prob, attention_probs_dropout_prob=self.attention_probs_dropout_prob, n_positions=self.max_position_embeddings, - n_ctx=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range, use_cache=True, diff --git a/tests/test_modeling_openai.py b/tests/test_modeling_openai.py index 08ee51df3f..584fac66b0 100644 --- a/tests/test_modeling_openai.py +++ b/tests/test_modeling_openai.py @@ -90,7 +90,6 @@ class OpenAIGPTModelTester: # hidden_dropout_prob=self.hidden_dropout_prob, # attention_probs_dropout_prob=self.attention_probs_dropout_prob, n_positions=self.max_position_embeddings, - n_ctx=self.max_position_embeddings, # type_vocab_size=self.type_vocab_size, # initializer_range=self.initializer_range pad_token_id=self.pad_token_id, diff --git a/tests/test_modeling_tf_ctrl.py b/tests/test_modeling_tf_ctrl.py index e9531552bd..6e4d73cc57 100644 --- a/tests/test_modeling_tf_ctrl.py +++ b/tests/test_modeling_tf_ctrl.py @@ -97,7 +97,6 @@ class TFCTRLModelTester(object): # hidden_dropout_prob=self.hidden_dropout_prob, # attention_probs_dropout_prob=self.attention_probs_dropout_prob, n_positions=self.max_position_embeddings, - n_ctx=self.max_position_embeddings, # type_vocab_size=self.type_vocab_size, # initializer_range=self.initializer_range, pad_token_id=self.pad_token_id, diff --git a/tests/test_modeling_tf_gpt2.py b/tests/test_modeling_tf_gpt2.py index 8e13f0fdc1..609c4731b0 100644 --- a/tests/test_modeling_tf_gpt2.py +++ b/tests/test_modeling_tf_gpt2.py @@ -100,7 +100,6 @@ class TFGPT2ModelTester: # hidden_dropout_prob=self.hidden_dropout_prob, # attention_probs_dropout_prob=self.attention_probs_dropout_prob, n_positions=self.max_position_embeddings, - n_ctx=self.max_position_embeddings, # type_vocab_size=self.type_vocab_size, # initializer_range=self.initializer_range bos_token_id=self.bos_token_id, diff --git 
a/tests/test_modeling_tf_openai.py b/tests/test_modeling_tf_openai.py index 4dc684adb7..c6d3a09895 100644 --- a/tests/test_modeling_tf_openai.py +++ b/tests/test_modeling_tf_openai.py @@ -98,7 +98,6 @@ class TFOpenAIGPTModelTester: # hidden_dropout_prob=self.hidden_dropout_prob, # attention_probs_dropout_prob=self.attention_probs_dropout_prob, n_positions=self.max_position_embeddings, - n_ctx=self.max_position_embeddings, # type_vocab_size=self.type_vocab_size, # initializer_range=self.initializer_range, pad_token_id=self.pad_token_id, diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 72fda10c8f..b1e6e0f5a9 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -490,7 +490,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): _ = trainer.predict(eval_dataset) def test_evaluation_with_keys_to_drop(self): - config = GPT2Config(vocab_size=100, n_positions=128, n_ctx=128, n_embd=32, n_layer=3, n_head=4) + config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) tiny_gpt2 = GPT2LMHeadModel(config) x = torch.randint(0, 100, (128,)) eval_dataset = RepeatDataset(x) @@ -531,7 +531,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): self.assertEqual(train_output.global_step, 10) def test_logging_inf_nan_filter(self): - config = GPT2Config(vocab_size=100, n_positions=128, n_ctx=128, n_embd=32, n_layer=3, n_head=4) + config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) tiny_gpt2 = GPT2LMHeadModel(config) x = torch.randint(0, 100, (128,)) train_dataset = RepeatDataset(x)