Remove n_ctx from configs (#14165)
* Remove n_ctx from configs
* Fix GPTJ and OpenAIGPT; both are acceptable breaking changes, as no existing configs are broken by them
* Remove unnecessary n_positions from TFOpenAIGPT
This commit is contained in:
@@ -1,7 +1,6 @@
|
||||
{
|
||||
"initializer_range": 0.02,
|
||||
"layer_norm_epsilon": 0.00001,
|
||||
"n_ctx": 1024,
|
||||
"n_embd": 768,
|
||||
"n_head": 12,
|
||||
"n_layer": 6,
|
||||
|
||||
@@ -653,7 +653,7 @@ class CLIPTextTransformer(nn.Module):
|
||||
last_hidden_state = encoder_outputs[0]
|
||||
last_hidden_state = self.final_layer_norm(last_hidden_state)
|
||||
|
||||
# text_embeds.shape = [batch_size, n_ctx, transformer.width]
|
||||
# text_embeds.shape = [batch_size, sequence_length, transformer.width]
|
||||
# take features from the eot embedding (eot_token is the highest number in each sequence)
|
||||
pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)]
|
||||
|
||||
|
||||
@@ -521,7 +521,7 @@ class FlaxCLIPTextTransformer(nn.Module):
|
||||
last_hidden_state = encoder_outputs[0]
|
||||
last_hidden_state = self.final_layer_norm(last_hidden_state)
|
||||
|
||||
# text_embeds.shape = [batch_size, n_ctx, transformer.width]
|
||||
# text_embeds.shape = [batch_size, sequence_length, transformer.width]
|
||||
# take features from the EOS embedding (eos_token_id is the highest number in each sequence)
|
||||
pooled_output = last_hidden_state[jnp.arange(last_hidden_state.shape[0]), input_ids.argmax(axis=-1)]
|
||||
|
||||
|
||||
@@ -41,8 +41,6 @@ class CTRLConfig(PretrainedConfig):
|
||||
n_positions (:obj:`int`, `optional`, defaults to 256):
|
||||
The maximum sequence length that this model might ever be used with. Typically set this to something large
|
||||
just in case (e.g., 512 or 1024 or 2048).
|
||||
n_ctx (:obj:`int`, `optional`, defaults to 256):
|
||||
Dimensionality of the causal mask (usually same as n_positions).
|
||||
n_embd (:obj:`int`, `optional`, defaults to 1280):
|
||||
Dimensionality of the embeddings and hidden states.
|
||||
dff (:obj:`int`, `optional`, defaults to 8192):
|
||||
@@ -92,7 +90,6 @@ class CTRLConfig(PretrainedConfig):
|
||||
self,
|
||||
vocab_size=246534,
|
||||
n_positions=256,
|
||||
n_ctx=256,
|
||||
n_embd=1280,
|
||||
dff=8192,
|
||||
n_layer=48,
|
||||
@@ -111,7 +108,6 @@ class CTRLConfig(PretrainedConfig):
|
||||
**kwargs
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.n_ctx = n_ctx
|
||||
self.n_positions = n_positions
|
||||
self.n_embd = n_embd
|
||||
self.n_layer = n_layer
|
||||
|
||||
@@ -54,8 +54,6 @@ class GPT2Config(PretrainedConfig):
|
||||
n_positions (:obj:`int`, `optional`, defaults to 1024):
|
||||
The maximum sequence length that this model might ever be used with. Typically set this to something large
|
||||
just in case (e.g., 512 or 1024 or 2048).
|
||||
n_ctx (:obj:`int`, `optional`, defaults to 1024):
|
||||
Dimensionality of the causal mask (usually same as n_positions).
|
||||
n_embd (:obj:`int`, `optional`, defaults to 768):
|
||||
Dimensionality of the embeddings and hidden states.
|
||||
n_layer (:obj:`int`, `optional`, defaults to 12):
|
||||
@@ -144,7 +142,6 @@ class GPT2Config(PretrainedConfig):
|
||||
self,
|
||||
vocab_size=50257,
|
||||
n_positions=1024,
|
||||
n_ctx=1024,
|
||||
n_embd=768,
|
||||
n_layer=12,
|
||||
n_head=12,
|
||||
@@ -169,7 +166,6 @@ class GPT2Config(PretrainedConfig):
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.n_ctx = n_ctx
|
||||
self.n_positions = n_positions
|
||||
self.n_embd = n_embd
|
||||
self.n_layer = n_layer
|
||||
|
||||
@@ -66,13 +66,12 @@ TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
||||
|
||||
|
||||
class TFAttention(tf.keras.layers.Layer):
|
||||
def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
|
||||
def __init__(self, nx, config, scale=False, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
n_state = nx # in Attention: n_state=768 (nx=n_embd)
|
||||
# [switch nx => n_state from Block to Attention to keep identical to TF implementation]
|
||||
assert n_state % config.n_head == 0
|
||||
self.n_ctx = n_ctx
|
||||
self.n_head = config.n_head
|
||||
self.split_size = n_state
|
||||
self.scale = scale
|
||||
@@ -185,12 +184,12 @@ class TFMLP(tf.keras.layers.Layer):
|
||||
|
||||
|
||||
class TFBlock(tf.keras.layers.Layer):
|
||||
def __init__(self, n_ctx, config, scale=False, **kwargs):
|
||||
def __init__(self, config, scale=False, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
nx = config.n_embd
|
||||
inner_dim = config.n_inner if config.n_inner is not None else 4 * nx
|
||||
self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
|
||||
self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
|
||||
self.attn = TFAttention(nx, config, scale, name="attn")
|
||||
self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
|
||||
self.mlp = TFMLP(inner_dim, config, name="mlp")
|
||||
|
||||
@@ -233,7 +232,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
|
||||
config.vocab_size, config.hidden_size, initializer_range=config.initializer_range, name="wte"
|
||||
)
|
||||
self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
|
||||
self.h = [TFBlock(config.n_ctx, config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]
|
||||
self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]
|
||||
self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f")
|
||||
|
||||
def build(self, input_shape):
|
||||
|
||||
@@ -33,7 +33,7 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_du
|
||||
num_layers=config_json["n_layer"],
|
||||
num_heads=config_json["n_head"],
|
||||
attention_types=config_json["attention_types"],
|
||||
max_position_embeddings=config_json["n_ctx"],
|
||||
max_position_embeddings=config_json["n_positions"],
|
||||
resid_dropout=config_json["res_dropout"],
|
||||
embed_dropout=config_json["embed_dropout"],
|
||||
attention_dropout=config_json["attn_dropout"],
|
||||
|
||||
@@ -42,8 +42,6 @@ class GPTJConfig(PretrainedConfig):
|
||||
n_positions (:obj:`int`, `optional`, defaults to 2048):
|
||||
The maximum sequence length that this model might ever be used with. Typically set this to something large
|
||||
just in case (e.g., 512 or 1024 or 2048).
|
||||
n_ctx (:obj:`int`, `optional`, defaults to 2048):
|
||||
Dimensionality of the causal mask (usually same as n_positions).
|
||||
n_embd (:obj:`int`, `optional`, defaults to 4096):
|
||||
Dimensionality of the embeddings and hidden states.
|
||||
n_layer (:obj:`int`, `optional`, defaults to 28):
|
||||
@@ -96,7 +94,6 @@ class GPTJConfig(PretrainedConfig):
|
||||
self,
|
||||
vocab_size=50400,
|
||||
n_positions=2048,
|
||||
n_ctx=2048,
|
||||
n_embd=4096,
|
||||
n_layer=28,
|
||||
n_head=16,
|
||||
@@ -115,7 +112,6 @@ class GPTJConfig(PretrainedConfig):
|
||||
**kwargs
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.n_ctx = n_ctx
|
||||
self.n_positions = n_positions
|
||||
self.n_embd = n_embd
|
||||
self.n_layer = n_layer
|
||||
|
||||
@@ -99,7 +99,7 @@ class GPTJAttention(nn.Module):
|
||||
|
||||
def _split_heads(self, tensor, num_attention_heads, attn_head_size, rotary):
|
||||
"""
|
||||
Splits n_ctx dim into attn_head_size and num_attention_heads
|
||||
Splits hidden dim into attn_head_size and num_attention_heads
|
||||
"""
|
||||
new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size)
|
||||
tensor = tensor.view(*new_shape)
|
||||
@@ -114,7 +114,7 @@ class GPTJAttention(nn.Module):
|
||||
|
||||
def _merge_heads(self, tensor, num_attention_heads, attn_head_size):
|
||||
"""
|
||||
Merges attn_head_size dim and num_attn_heads dim into n_ctx
|
||||
Merges attn_head_size dim and num_attn_heads dim into hidden dim
|
||||
"""
|
||||
if len(tensor.shape) == 5:
|
||||
tensor = tensor.permute(0, 1, 3, 2, 4).contiguous()
|
||||
@@ -377,7 +377,7 @@ GPTJ_INPUTS_DOCSTRING = r"""
|
||||
- 1 indicates the head is **not masked**,
|
||||
- 0 indicates the head is **masked**.
|
||||
|
||||
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, n_ctx)`, `optional`):
|
||||
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_dim)`, `optional`):
|
||||
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
|
||||
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||
than the model's internal embedding lookup matrix.
|
||||
@@ -444,7 +444,6 @@ class GPTJModel(GPTJPreTrainedModel):
|
||||
self.drop = nn.Dropout(config.embd_pdrop)
|
||||
self.h = nn.ModuleList([GPTJBlock(config) for _ in range(config.n_layer)])
|
||||
self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
|
||||
self.rotary_dim = min(config.rotary_dim, config.n_ctx // config.num_attention_heads)
|
||||
self.init_weights()
|
||||
|
||||
# Model parallel
|
||||
@@ -854,7 +853,7 @@ class GPTJForSequenceClassification(GPTJPreTrainedModel):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
self.transformer = GPTJModel(config)
|
||||
self.score = nn.Linear(config.n_ctx, self.num_labels, bias=False)
|
||||
self.score = nn.Linear(config.n_positions, self.num_labels, bias=False)
|
||||
|
||||
self.init_weights()
|
||||
|
||||
|
||||
@@ -88,7 +88,6 @@ def convert_megatron_checkpoint(args, input_state_dict, config):
|
||||
|
||||
config.vocab_size = ds_args.padded_vocab_size
|
||||
config.n_positions = ds_args.max_position_embeddings
|
||||
config.n_ctx = ds_args.seq_length
|
||||
config.n_embd = ds_args.hidden_size
|
||||
config.n_layer = ds_args.num_layers
|
||||
config.n_head = ds_args.num_attention_heads
|
||||
@@ -121,10 +120,10 @@ def convert_megatron_checkpoint(args, input_state_dict, config):
|
||||
# The position embeddings.
|
||||
pos_embeddings = embeddings["position_embeddings"]["weight"]
|
||||
# Read the causal mask dimension (seqlen). [max_sequence_length, hidden_size]
|
||||
n_ctx = pos_embeddings.size(0)
|
||||
n_positions = pos_embeddings.size(0)
|
||||
assert (
|
||||
n_ctx == config.n_ctx
|
||||
), f"pos_embeddings.max_sequence_length={n_ctx} and config.n_ctx={config.n_ctx} don't match"
|
||||
n_positions == config.n_positions
|
||||
), f"pos_embeddings.max_sequence_length={n_positions} and config.n_positions={config.n_positions} don't match"
|
||||
# Store the position embeddings.
|
||||
output_state_dict["transformer.wpe.weight"] = pos_embeddings
|
||||
|
||||
@@ -173,7 +172,9 @@ def convert_megatron_checkpoint(args, input_state_dict, config):
|
||||
) and weight_or_bias == "weight":
|
||||
|
||||
# Insert a tensor of 1x1xDxD bias.
|
||||
causal_mask = torch.tril(torch.ones((n_ctx, n_ctx), dtype=torch.float16)).view(1, 1, n_ctx, n_ctx)
|
||||
causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.float16)).view(
|
||||
1, 1, n_positions, n_positions
|
||||
)
|
||||
output_state_dict[layer_name + ".attn.bias"] = causal_mask
|
||||
|
||||
# Insert a "dummy" tensor for masked_bias.
|
||||
@@ -274,7 +275,6 @@ def main():
|
||||
config = GPT2Config(
|
||||
vocab_size=50257,
|
||||
n_positions=1024,
|
||||
n_ctx=1024,
|
||||
n_embd=1024,
|
||||
n_layer=24,
|
||||
n_head=16,
|
||||
|
||||
@@ -42,8 +42,6 @@ class OpenAIGPTConfig(PretrainedConfig):
|
||||
n_positions (:obj:`int`, `optional`, defaults to 512):
|
||||
The maximum sequence length that this model might ever be used with. Typically set this to something large
|
||||
just in case (e.g., 512 or 1024 or 2048).
|
||||
n_ctx (:obj:`int`, `optional`, defaults to 512):
|
||||
Dimensionality of the causal mask (usually same as n_positions).
|
||||
n_embd (:obj:`int`, `optional`, defaults to 768):
|
||||
Dimensionality of the embeddings and hidden states.
|
||||
n_layer (:obj:`int`, `optional`, defaults to 12):
|
||||
@@ -126,7 +124,6 @@ class OpenAIGPTConfig(PretrainedConfig):
|
||||
self,
|
||||
vocab_size=40478,
|
||||
n_positions=512,
|
||||
n_ctx=512,
|
||||
n_embd=768,
|
||||
n_layer=12,
|
||||
n_head=12,
|
||||
@@ -145,7 +142,6 @@ class OpenAIGPTConfig(PretrainedConfig):
|
||||
**kwargs
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.n_ctx = n_ctx
|
||||
self.n_positions = n_positions
|
||||
self.n_embd = n_embd
|
||||
self.n_layer = n_layer
|
||||
|
||||
@@ -143,12 +143,14 @@ ACT_FNS = {"relu": nn.ReLU, "silu": silu, "gelu": gelu_new, "swish": silu}
|
||||
|
||||
|
||||
class Attention(nn.Module):
|
||||
def __init__(self, nx, n_ctx, config, scale=False):
|
||||
def __init__(self, nx, n_positions, config, scale=False):
|
||||
super().__init__()
|
||||
n_state = nx # in Attention: n_state=768 (nx=n_embd)
|
||||
# [switch nx => n_state from Block to Attention to keep identical to TF implementation]
|
||||
assert n_state % config.n_head == 0
|
||||
self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
|
||||
self.register_buffer(
|
||||
"bias", torch.tril(torch.ones(n_positions, n_positions)).view(1, 1, n_positions, n_positions)
|
||||
)
|
||||
self.n_head = config.n_head
|
||||
self.split_size = n_state
|
||||
self.scale = scale
|
||||
@@ -246,10 +248,10 @@ class MLP(nn.Module):
|
||||
|
||||
|
||||
class Block(nn.Module):
|
||||
def __init__(self, n_ctx, config, scale=False):
|
||||
def __init__(self, n_positions, config, scale=False):
|
||||
super().__init__()
|
||||
nx = config.n_embd
|
||||
self.attn = Attention(nx, n_ctx, config, scale)
|
||||
self.attn = Attention(nx, n_positions, config, scale)
|
||||
self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
|
||||
self.mlp = MLP(4 * nx, config)
|
||||
self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
|
||||
@@ -413,7 +415,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
||||
self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd)
|
||||
self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
|
||||
self.drop = nn.Dropout(config.embd_pdrop)
|
||||
self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
|
||||
self.h = nn.ModuleList([Block(config.n_positions, config, scale=True) for _ in range(config.n_layer)])
|
||||
|
||||
self.register_buffer("position_ids", torch.arange(config.n_positions))
|
||||
self.init_weights()
|
||||
|
||||
@@ -58,7 +58,7 @@ TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
||||
|
||||
|
||||
class TFAttention(tf.keras.layers.Layer):
|
||||
def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
|
||||
def __init__(self, nx, config, scale=False, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
n_state = nx # in Attention: n_state=768 (nx=n_embd)
|
||||
@@ -66,7 +66,6 @@ class TFAttention(tf.keras.layers.Layer):
|
||||
assert (
|
||||
n_state % config.n_head == 0
|
||||
), f"Hidden dimension {n_state} not dividable by number of heads {config.n_head}"
|
||||
self.n_ctx = n_ctx
|
||||
self.n_head = config.n_head
|
||||
self.split_size = n_state
|
||||
self.scale = scale
|
||||
@@ -169,10 +168,10 @@ class TFMLP(tf.keras.layers.Layer):
|
||||
|
||||
|
||||
class TFBlock(tf.keras.layers.Layer):
|
||||
def __init__(self, n_ctx, config, scale=False, **kwargs):
|
||||
def __init__(self, config, scale=False, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
nx = config.n_embd
|
||||
self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
|
||||
self.attn = TFAttention(nx, config, scale, name="attn")
|
||||
self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
|
||||
self.mlp = TFMLP(4 * nx, config, name="mlp")
|
||||
self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
|
||||
@@ -210,7 +209,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
|
||||
config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="tokens_embed"
|
||||
)
|
||||
self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
|
||||
self.h = [TFBlock(config.n_ctx, config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]
|
||||
self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]
|
||||
|
||||
def build(self, input_shape):
|
||||
with tf.name_scope("positions_embed"):
|
||||
|
||||
@@ -114,7 +114,6 @@ class CTRLModelTester:
|
||||
# hidden_dropout_prob=self.hidden_dropout_prob,
|
||||
# attention_probs_dropout_prob=self.attention_probs_dropout_prob,
|
||||
n_positions=self.max_position_embeddings,
|
||||
n_ctx=self.max_position_embeddings,
|
||||
# type_vocab_size=self.type_vocab_size,
|
||||
# initializer_range=self.initializer_range,
|
||||
pad_token_id=self.pad_token_id,
|
||||
|
||||
@@ -95,7 +95,6 @@ class FlaxGPT2ModelTester:
|
||||
n_layer=self.num_hidden_layers,
|
||||
n_head=self.num_attention_heads,
|
||||
n_positions=self.max_position_embeddings,
|
||||
n_ctx=self.max_position_embeddings,
|
||||
use_cache=False,
|
||||
bos_token_id=self.bos_token_id,
|
||||
eos_token_id=self.eos_token_id,
|
||||
|
||||
@@ -155,7 +155,6 @@ class GPT2ModelTester:
|
||||
resid_pdrop=self.hidden_dropout_prob,
|
||||
attn_pdrop=self.attention_probs_dropout_prob,
|
||||
n_positions=self.max_position_embeddings,
|
||||
n_ctx=self.max_position_embeddings,
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
initializer_range=self.initializer_range,
|
||||
use_cache=True,
|
||||
|
||||
@@ -142,7 +142,6 @@ class GPTJModelTester:
|
||||
hidden_dropout_prob=self.hidden_dropout_prob,
|
||||
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
|
||||
n_positions=self.max_position_embeddings,
|
||||
n_ctx=self.max_position_embeddings,
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
initializer_range=self.initializer_range,
|
||||
use_cache=True,
|
||||
|
||||
@@ -90,7 +90,6 @@ class OpenAIGPTModelTester:
|
||||
# hidden_dropout_prob=self.hidden_dropout_prob,
|
||||
# attention_probs_dropout_prob=self.attention_probs_dropout_prob,
|
||||
n_positions=self.max_position_embeddings,
|
||||
n_ctx=self.max_position_embeddings,
|
||||
# type_vocab_size=self.type_vocab_size,
|
||||
# initializer_range=self.initializer_range
|
||||
pad_token_id=self.pad_token_id,
|
||||
|
||||
@@ -97,7 +97,6 @@ class TFCTRLModelTester(object):
|
||||
# hidden_dropout_prob=self.hidden_dropout_prob,
|
||||
# attention_probs_dropout_prob=self.attention_probs_dropout_prob,
|
||||
n_positions=self.max_position_embeddings,
|
||||
n_ctx=self.max_position_embeddings,
|
||||
# type_vocab_size=self.type_vocab_size,
|
||||
# initializer_range=self.initializer_range,
|
||||
pad_token_id=self.pad_token_id,
|
||||
|
||||
@@ -100,7 +100,6 @@ class TFGPT2ModelTester:
|
||||
# hidden_dropout_prob=self.hidden_dropout_prob,
|
||||
# attention_probs_dropout_prob=self.attention_probs_dropout_prob,
|
||||
n_positions=self.max_position_embeddings,
|
||||
n_ctx=self.max_position_embeddings,
|
||||
# type_vocab_size=self.type_vocab_size,
|
||||
# initializer_range=self.initializer_range
|
||||
bos_token_id=self.bos_token_id,
|
||||
|
||||
@@ -98,7 +98,6 @@ class TFOpenAIGPTModelTester:
|
||||
# hidden_dropout_prob=self.hidden_dropout_prob,
|
||||
# attention_probs_dropout_prob=self.attention_probs_dropout_prob,
|
||||
n_positions=self.max_position_embeddings,
|
||||
n_ctx=self.max_position_embeddings,
|
||||
# type_vocab_size=self.type_vocab_size,
|
||||
# initializer_range=self.initializer_range,
|
||||
pad_token_id=self.pad_token_id,
|
||||
|
||||
@@ -490,7 +490,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
_ = trainer.predict(eval_dataset)
|
||||
|
||||
def test_evaluation_with_keys_to_drop(self):
|
||||
config = GPT2Config(vocab_size=100, n_positions=128, n_ctx=128, n_embd=32, n_layer=3, n_head=4)
|
||||
config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
|
||||
tiny_gpt2 = GPT2LMHeadModel(config)
|
||||
x = torch.randint(0, 100, (128,))
|
||||
eval_dataset = RepeatDataset(x)
|
||||
@@ -531,7 +531,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
self.assertEqual(train_output.global_step, 10)
|
||||
|
||||
def test_logging_inf_nan_filter(self):
|
||||
config = GPT2Config(vocab_size=100, n_positions=128, n_ctx=128, n_embd=32, n_layer=3, n_head=4)
|
||||
config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
|
||||
tiny_gpt2 = GPT2LMHeadModel(config)
|
||||
x = torch.randint(0, 100, (128,))
|
||||
train_dataset = RepeatDataset(x)
|
||||
|
||||
Reference in New Issue
Block a user