From 5705333441f44858d9e656a8370b6c4c2921455b Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 26 Sep 2019 10:06:20 +0200 Subject: [PATCH] add initialization for everybody --- examples/run_tf_glue.py | 49 +++++++++------- .../modeling_tf_distilbert.py | 57 ++++++++++++++----- pytorch_transformers/modeling_tf_gpt2.py | 22 ++++--- pytorch_transformers/modeling_tf_openai.py | 22 ++++--- pytorch_transformers/modeling_tf_roberta.py | 15 +++-- .../modeling_tf_transfo_xl.py | 49 +++++++++++----- pytorch_transformers/modeling_tf_utils.py | 18 +++--- pytorch_transformers/modeling_tf_xlm.py | 32 +++++++---- pytorch_transformers/modeling_tf_xlnet.py | 30 ++++++---- 9 files changed, 195 insertions(+), 99 deletions(-) diff --git a/examples/run_tf_glue.py b/examples/run_tf_glue.py index dfbcfd5b79..1301d13e08 100644 --- a/examples/run_tf_glue.py +++ b/examples/run_tf_glue.py @@ -1,37 +1,48 @@ import tensorflow as tf import tensorflow_datasets -from transformers import * +from pytorch_transformers import * # Load dataset, tokenizer, model from pretrained model/vocabulary tokenizer = BertTokenizer.from_pretrained('bert-base-cased') -dataset = tensorflow_datasets.load('glue/mrpc') model = TFBertForSequenceClassification.from_pretrained('bert-base-cased') +data = tensorflow_datasets.load('glue/mrpc') # Prepare dataset for GLUE as a tf.data.Dataset instance -train_dataset = glue_convert_examples_to_features(dataset['train'], tokenizer, task='mrpc') -valid_dataset = glue_convert_examples_to_features(dataset['validation'], tokenizer, task='mrpc') -train_dataset = train_dataset.shuffle(100).batch(32).repeat(3) +train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, 'mrpc') +valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, 'mrpc') +train_dataset = train_dataset.shuffle(100).batch(32).repeat(2) valid_dataset = valid_dataset.batch(64) # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule -learning_rate = tf.keras.optimizers.schedules.PolynomialDecay(2e-5, 345, end_learning_rate=0) -optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08, clipnorm=1.0) +optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - -model.compile(optimizer=optimizer, loss=loss, metrics=['sparse_categorical_accuracy']) +metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') +model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) # Train and evaluate using tf.keras.Model.fit() -model.fit(train_dataset, epochs=3, steps_per_epoch=115, - validation_data=valid_dataset, validation_steps=7) +history = model.fit(train_dataset, epochs=2, steps_per_epoch=115, + validation_data=valid_dataset, validation_steps=7) -# Save the TensorFlow model and load it in PyTorch +>>> Train for 115 steps, validate for 7 steps +>>> Epoch 1/2 +>>> 115/115 [==============================] - 53s 459ms/step - loss: 0.6033 - accuracy: 0.6712 - val_loss: 0.4964 - val_accuracy: 0.7647 +>>> Epoch 2/2 +>>> 115/115 [==============================] - 33s 289ms/step - loss: 0.4141 - accuracy: 0.8160 - val_loss: 0.3914 - val_accuracy: 0.8382 + +# Load the TensorFlow model in PyTorch for inspection model.save_pretrained('./save/') pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True) -# Quickly inspect a few predictions - MRPC is a paraphrasing task -inputs = tokenizer.encode_plus("The company is doing great", - "The company has good results", - add_special_tokens=True, - return_tensors='pt') -pred = pytorch_model(**inputs) -print("Paraphrase" if pred.argmax().item() == 0 else "Not paraphrase") +# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task +sentence_0 = "This research was consistent with his findings." +sentence_1 = "His findings were compatible with this research." +sentence_2 = "His findings were not compatible with this research." +inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt') +inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt') + +pred_1 = pytorch_model(**inputs_1)[0].argmax().item() +pred_2 = pytorch_model(**inputs_2)[0].argmax().item() +print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0") +print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0") +>>> sentence_1 is a paraphrase of sentence_0 +>>> sentence_2 is not a paraphrase of sentence_0 \ No newline at end of file diff --git a/pytorch_transformers/modeling_tf_distilbert.py b/pytorch_transformers/modeling_tf_distilbert.py index 7ca8d8c815..2ec73fc43d 100644 --- a/pytorch_transformers/modeling_tf_distilbert.py +++ b/pytorch_transformers/modeling_tf_distilbert.py @@ -29,7 +29,7 @@ import numpy as np import tensorflow as tf from .configuration_distilbert import DistilBertConfig -from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list +from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list, get_initializer from .file_utils import add_start_docstrings from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model @@ -79,8 +79,15 @@ class TFEmbeddings(tf.keras.layers.Layer): super(TFEmbeddings, self).__init__(**kwargs) self.vocab_size = config.vocab_size self.dim = config.dim - self.word_embeddings = TFSharedEmbeddings(config.vocab_size, config.dim, name='word_embeddings') # padding_idx=0) - self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, config.dim, name='position_embeddings') + self.initializer_range = config.initializer_range + self.word_embeddings = TFSharedEmbeddings(config.vocab_size, + config.dim, + initializer_range=config.initializer_range, + name='word_embeddings') # padding_idx=0) + self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, + config.dim, + embeddings_initializer=get_initializer(config.initializer_range), + name='position_embeddings') if config.sinusoidal_pos_embds: raise NotImplementedError @@ -95,8 +102,7 @@ class TFEmbeddings(tf.keras.layers.Layer): self.word_embeddings = self.add_weight( "weight", shape=[self.vocab_size, self.dim], - initializer=tf.random_normal_initializer( - mean=0., stddev=self.dim**-0.5)) + initializer=get_initializer(self.initializer_range)) super(TFEmbeddings, self).build(input_shape) def call(self, inputs, mode="embedding", training=False): @@ -178,10 +184,18 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer): assert self.dim % self.n_heads == 0 - self.q_lin = tf.keras.layers.Dense(config.dim, name="q_lin") - self.k_lin = tf.keras.layers.Dense(config.dim, name="k_lin") - self.v_lin = tf.keras.layers.Dense(config.dim, name="v_lin") - self.out_lin = tf.keras.layers.Dense(config.dim, name="out_lin") + self.q_lin = tf.keras.layers.Dense(config.dim, + kernel_initializer=get_initializer(config.initializer_range), + name="q_lin") + self.k_lin = tf.keras.layers.Dense(config.dim, + kernel_initializer=get_initializer(config.initializer_range), + name="k_lin") + self.v_lin = tf.keras.layers.Dense(config.dim, + kernel_initializer=get_initializer(config.initializer_range), + name="v_lin") + self.out_lin = tf.keras.layers.Dense(config.dim, + kernel_initializer=get_initializer(config.initializer_range), + name="out_lin") self.pruned_heads = set() @@ -254,8 +268,12 @@ class TFFFN(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFFFN, self).__init__(**kwargs) self.dropout = tf.keras.layers.Dropout(config.dropout) - self.lin1 = tf.keras.layers.Dense(config.hidden_dim, name="lin1") - self.lin2 = tf.keras.layers.Dense(config.dim, name="lin2") + self.lin1 = tf.keras.layers.Dense(config.hidden_dim, + kernel_initializer=get_initializer(config.initializer_range), + name="lin1") + self.lin2 = tf.keras.layers.Dense(config.dim, + kernel_initializer=get_initializer(config.initializer_range), + name="lin2") assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation) self.activation = tf.keras.layers.Activation(gelu) if config.activation=='gelu' else tf.keras.activations.relu @@ -596,7 +614,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): self.vocab_size = config.vocab_size self.distilbert = TFDistilBertMainLayer(config, name="distilbert") - self.vocab_transform = tf.keras.layers.Dense(config.dim, name="vocab_transform") + self.vocab_transform = tf.keras.layers.Dense(config.dim, + kernel_initializer=get_initializer(config.initializer_range), + name="vocab_transform") self.act = tf.keras.layers.Activation(gelu) self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm") self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector") @@ -647,8 +667,13 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel): self.num_labels = config.num_labels self.distilbert = TFDistilBertMainLayer(config, name="distilbert") - self.pre_classifier = tf.keras.layers.Dense(config.dim, activation='relu', name="pre_classifier") - self.classifier = tf.keras.layers.Dense(config.num_labels, name="classifier") + self.pre_classifier = tf.keras.layers.Dense(config.dim, + kernel_initializer=get_initializer(config.initializer_range), + activation='relu', + name="pre_classifier") + self.classifier = tf.keras.layers.Dense(config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="classifier") self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout) def call(self, inputs, **kwargs): @@ -700,7 +725,9 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel): super(TFDistilBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs) self.distilbert = TFDistilBertMainLayer(config, name="distilbert") - self.qa_outputs = tf.keras.layers.Dense(config.num_labels, name='qa_outputs') + self.qa_outputs = tf.keras.layers.Dense(config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name='qa_outputs') assert config.num_labels == 2 self.dropout = tf.keras.layers.Dropout(config.qa_dropout) diff --git a/pytorch_transformers/modeling_tf_gpt2.py b/pytorch_transformers/modeling_tf_gpt2.py index cb63b95794..77d78e7a93 100644 --- a/pytorch_transformers/modeling_tf_gpt2.py +++ b/pytorch_transformers/modeling_tf_gpt2.py @@ -29,7 +29,7 @@ import numpy as np import tensorflow as tf from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings, - TFSequenceSummary, shape_list) + TFSequenceSummary, shape_list, get_initializer) from .configuration_gpt2 import GPT2Config from .file_utils import add_start_docstrings from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model @@ -76,8 +76,8 @@ class TFAttention(tf.keras.layers.Layer): self.split_size = n_state self.scale = scale - self.c_attn = TFConv1D(n_state * 3, nx, name='c_attn') - self.c_proj = TFConv1D(n_state, nx, name='c_proj') + self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn') + self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj') self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop) self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop) self.pruned_heads = set() @@ -166,8 +166,8 @@ class TFMLP(tf.keras.layers.Layer): def __init__(self, n_state, config, **kwargs): super(TFMLP, self).__init__(**kwargs) nx = config.n_embd - self.c_fc = TFConv1D(n_state, nx, name='c_fc') - self.c_proj = TFConv1D(nx, n_state, name='c_proj') + self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_fc') + self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name='c_proj') self.act = gelu self.dropout = tf.keras.layers.Dropout(config.resid_pdrop) @@ -212,8 +212,14 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): self.vocab_size = config.vocab_size self.n_embd = config.n_embd - self.wte = TFSharedEmbeddings(config.vocab_size, config.hidden_size, name='wte') - self.wpe = tf.keras.layers.Embedding(config.n_positions, config.n_embd, name='wpe') + self.wte = TFSharedEmbeddings(config.vocab_size, + config.hidden_size, + initializer_range=config.initializer_range, + name='wte') + self.wpe = tf.keras.layers.Embedding(config.n_positions, + config.n_embd, + embeddings_initializer=get_initializer(config.initializer_range), + name='wpe') self.drop = tf.keras.layers.Dropout(config.embd_pdrop) self.h = [TFBlock(config.n_ctx, config, @@ -557,7 +563,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): def __init__(self, config, *inputs, **kwargs): super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs) self.transformer = TFGPT2MainLayer(config, name='transformer') - self.multiple_choice_head = TFSequenceSummary(config, name='multiple_choice_head') + self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head') def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, mc_token_ids=None, training=False): if isinstance(inputs, (tuple, list)): diff --git a/pytorch_transformers/modeling_tf_openai.py b/pytorch_transformers/modeling_tf_openai.py index 18b07e0637..0704e574a4 100644 --- a/pytorch_transformers/modeling_tf_openai.py +++ b/pytorch_transformers/modeling_tf_openai.py @@ -29,7 +29,7 @@ import numpy as np import tensorflow as tf from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings, - TFSequenceSummary, shape_list) + TFSequenceSummary, shape_list, get_initializer) from .configuration_openai import OpenAIGPTConfig from .file_utils import add_start_docstrings from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model @@ -83,8 +83,8 @@ class TFAttention(tf.keras.layers.Layer): self.split_size = n_state self.scale = scale - self.c_attn = TFConv1D(n_state * 3, nx, name='c_attn') - self.c_proj = TFConv1D(n_state, nx, name='c_proj') + self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn') + self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj') self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop) self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop) self.pruned_heads = set() @@ -168,8 +168,8 @@ class TFMLP(tf.keras.layers.Layer): def __init__(self, n_state, config, **kwargs): super(TFMLP, self).__init__(**kwargs) nx = config.n_embd - self.c_fc = TFConv1D(n_state, nx, name='c_fc') - self.c_proj = TFConv1D(nx, n_state, name='c_proj') + self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_fc') + self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name='c_proj') self.act = gelu self.dropout = tf.keras.layers.Dropout(config.resid_pdrop) @@ -212,8 +212,14 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): self.vocab_size = config.vocab_size self.n_embd = config.n_embd - self.tokens_embed = TFSharedEmbeddings(config.vocab_size, config.n_embd, name='tokens_embed') - self.positions_embed = tf.keras.layers.Embedding(config.n_positions, config.n_embd, name='positions_embed') + self.tokens_embed = TFSharedEmbeddings(config.vocab_size, + config.n_embd, + initializer_range=config.initializer_range, + name='tokens_embed') + self.positions_embed = tf.keras.layers.Embedding(config.n_positions, + config.n_embd, + embeddings_initializer=get_initializer(config.initializer_range), + name='positions_embed') self.drop = tf.keras.layers.Dropout(config.embd_pdrop) self.h = [TFBlock(config.n_ctx, config, @@ -522,7 +528,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs) self.transformer = TFOpenAIGPTMainLayer(config, name='transformer') - self.multiple_choice_head = TFSequenceSummary(config, name='multiple_choice_head') + self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head') def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, mc_token_ids=None, training=False): if isinstance(inputs, (tuple, list)): diff --git a/pytorch_transformers/modeling_tf_roberta.py b/pytorch_transformers/modeling_tf_roberta.py index 488ea566dc..862540cf10 100644 --- a/pytorch_transformers/modeling_tf_roberta.py +++ b/pytorch_transformers/modeling_tf_roberta.py @@ -24,7 +24,7 @@ import numpy as np import tensorflow as tf from .configuration_roberta import RobertaConfig -from .modeling_tf_utils import TFPreTrainedModel +from .modeling_tf_utils import TFPreTrainedModel, get_initializer from .file_utils import add_start_docstrings from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model @@ -232,7 +232,9 @@ class TFRobertaLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super(TFRobertaLMHead, self).__init__(**kwargs) self.vocab_size = config.vocab_size - self.dense = tf.keras.layers.Dense(config.hidden_size, name='dense') + self.dense = tf.keras.layers.Dense(config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name='dense') self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm') self.act = tf.keras.layers.Activation(gelu) @@ -315,9 +317,14 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFRobertaClassificationHead, self).__init__(config, **kwargs) - self.dense = tf.keras.layers.Dense(config.hidden_size, activation='tanh', name="dense") + self.dense = tf.keras.layers.Dense(config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation='tanh', + name="dense") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.out_proj = tf.keras.layers.Dense(config.num_labels, name="out_proj") + self.out_proj = tf.keras.layers.Dense(config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="out_proj") def call(self, features, training=False): x = features[:, 0, :] # take token (equiv. to [CLS]) diff --git a/pytorch_transformers/modeling_tf_transfo_xl.py b/pytorch_transformers/modeling_tf_transfo_xl.py index bd2854ac0a..377599d9e5 100644 --- a/pytorch_transformers/modeling_tf_transfo_xl.py +++ b/pytorch_transformers/modeling_tf_transfo_xl.py @@ -30,7 +30,7 @@ import numpy as np import tensorflow as tf from .configuration_transfo_xl import TransfoXLConfig -from .modeling_tf_utils import TFPreTrainedModel, TFConv1D, TFSequenceSummary, shape_list +from .modeling_tf_utils import TFPreTrainedModel, TFConv1D, TFSequenceSummary, shape_list, get_initializer from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask from .file_utils import add_start_docstrings from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model @@ -66,16 +66,21 @@ class TFPositionalEmbedding(tf.keras.layers.Layer): class TFPositionwiseFF(tf.keras.layers.Layer): - def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, **kwargs): + def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs): super(TFPositionwiseFF, self).__init__(**kwargs) self.d_model = d_model self.d_inner = d_inner self.dropout = dropout - self.layer_1 = tf.keras.layers.Dense(d_inner, activation=tf.nn.relu, name='CoreNet_._0') + self.layer_1 = tf.keras.layers.Dense(d_inner, + kernel_initializer=get_initializer(init_std), + activation=tf.nn.relu, + name='CoreNet_._0') self.drop_1 = tf.keras.layers.Dropout(dropout) - self.layer_2 = tf.keras.layers.Dense(d_model, name='CoreNet_._3') + self.layer_2 = tf.keras.layers.Dense(d_model, + kernel_initializer=get_initializer(init_std), + name='CoreNet_._3') self.drop_2 = tf.keras.layers.Dropout(dropout) self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name='layer_norm') @@ -110,7 +115,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False, r_r_bias=None, r_w_bias=None, output_attentions=False, - layer_norm_epsilon=1e-5, **kwargs): + layer_norm_epsilon=1e-5, init_std=0.02, **kwargs): super(TFRelPartialLearnableMultiHeadAttn, self).__init__(**kwargs) self.output_attentions = output_attentions @@ -119,11 +124,17 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): self.d_head = d_head self.dropout = dropout - self.qkv_net = tf.keras.layers.Dense(3 * n_head * d_head, use_bias=False, name='qkv_net') + self.qkv_net = tf.keras.layers.Dense(3 * n_head * d_head, + kernel_initializer=get_initializer(init_std), + use_bias=False, + name='qkv_net') self.drop = tf.keras.layers.Dropout(dropout) self.dropatt = tf.keras.layers.Dropout(dropatt) - self.o_net = tf.keras.layers.Dense(d_model, use_bias=False, name='o_net') + self.o_net = tf.keras.layers.Dense(d_model, + kernel_initializer=get_initializer(init_std), + use_bias=False, + name='o_net') self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name='layer_norm') @@ -138,14 +149,19 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): self.r_r_bias = None self.r_w_bias = None - self.r_net = tf.keras.layers.Dense(self.n_head * self.d_head, use_bias=False, name='r_net') + self.r_net = tf.keras.layers.Dense(self.n_head * self.d_head, + kernel_initializer=get_initializer(init_std), + use_bias=False, + name='r_net') def build(self, input_shape): if self.r_r_bias is None or self.r_w_bias is None: # Biases are not shared self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head), + initializer='zeros', trainable=True, name='r_r_bias') self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head), + initializer='zeros', trainable=True, name='r_w_bias') super(TFRelPartialLearnableMultiHeadAttn, self).build(input_shape) @@ -249,17 +265,18 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer): r_r_bias=None, output_attentions=False, layer_norm_epsilon=1e-5, + init_std=0.02, **kwargs): super(TFRelPartialLearnableDecoderLayer, self).__init__(**kwargs) self.dec_attn = TFRelPartialLearnableMultiHeadAttn(n_head, d_model, d_head, dropout, tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len, dropatt=dropatt, pre_lnorm=pre_lnorm, - r_w_bias=r_w_bias, r_r_bias=r_r_bias, + r_w_bias=r_w_bias, r_r_bias=r_r_bias, init_std=init_std, output_attentions=output_attentions, layer_norm_epsilon=layer_norm_epsilon, name='dec_attn') self.pos_ff = TFPositionwiseFF(d_model, d_inner, dropout, - pre_lnorm=pre_lnorm, + pre_lnorm=pre_lnorm, init_std=init_std, layer_norm_epsilon=layer_norm_epsilon, name='pos_ff') @@ -275,12 +292,13 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer): class TFAdaptiveEmbedding(tf.keras.layers.Layer): - def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs): super(TFAdaptiveEmbedding, self).__init__(**kwargs) self.n_token = n_token self.d_embed = d_embed + self.init_std = init_std self.cutoffs = cutoffs + [n_token] self.div_val = div_val @@ -298,12 +316,16 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer): for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] d_emb_i = d_embed // (div_val ** i) - self.emb_layers.append(tf.keras.layers.Embedding(r_idx-l_idx, d_emb_i, name='emb_layers_._{}'.format(i))) + self.emb_layers.append(tf.keras.layers.Embedding(r_idx-l_idx, + d_emb_i, + embeddings_initializer=get_initializer(init_std), + name='emb_layers_._{}'.format(i))) def build(self, input_shape): for i in range(len(self.cutoffs)): d_emb_i = self.d_embed // (self.div_val ** i) self.emb_projs.append(self.add_weight(shape=(d_emb_i, self.d_proj), + initializer=get_initializer(self.init_std), trainable=True, name='emb_projs_._{}'.format(i))) super(TFAdaptiveEmbedding, self).build(input_shape) @@ -349,7 +371,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): self.untie_r = config.untie_r self.word_emb = TFAdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs, - div_val=config.div_val, name='word_emb') + div_val=config.div_val, init_std=config.init_std, name='word_emb') self.drop = tf.keras.layers.Dropout(config.dropout) @@ -374,6 +396,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): r_r_bias=None if self.untie_r else self.r_r_bias, output_attentions=self.output_attentions, layer_norm_epsilon=config.layer_norm_epsilon, + init_std=config.init_std, name='layers_._{}'.format(i)) ) else: # learnable embeddings and absolute embeddings diff --git a/pytorch_transformers/modeling_tf_utils.py b/pytorch_transformers/modeling_tf_utils.py index 00e2d41a96..e341427c36 100644 --- a/pytorch_transformers/modeling_tf_utils.py +++ b/pytorch_transformers/modeling_tf_utils.py @@ -277,20 +277,20 @@ class TFPreTrainedModel(tf.keras.Model): return model class TFConv1D(tf.keras.layers.Layer): - def __init__(self, nf, nx, *inputs, **kwargs): + def __init__(self, nf, nx, *inputs, initializer_range=0.02, **kwargs): """ TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2) Basically works like a Linear layer but the weights are transposed """ super(TFConv1D, self).__init__(*inputs, **kwargs) self.nf = nf self.nx = nx + self.initializer_range = initializer_range def build(self, input_shape): self.weight = self.add_weight( "weight", shape=[self.nx, self.nf], - initializer=tf.random_normal_initializer( - mean=0., stddev=0.02)) + initializer=get_initializer(self.initializer_range)) self.bias = self.add_weight( "bias", shape=[1, self.nf], @@ -314,19 +314,17 @@ class TFSharedEmbeddings(tf.keras.layers.Layer): super(TFSharedEmbeddings, self).__init__(**kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size - self.initializer_range = initializer_range + self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range def build(self, input_shape): """Build shared word embedding layer Shared weights logic adapted from https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ - initializer_range = self.hidden_size**-0.5 if self.initializer_range is None else self.initializer_range self.weight = self.add_weight( "weight", shape=[self.vocab_size, self.hidden_size], - initializer=tf.random_normal_initializer( - mean=0., stddev=initializer_range)) + initializer=get_initializer(self.initializer_range)) super(TFSharedEmbeddings, self).build(input_shape) def call(self, inputs, mode="embedding"): @@ -385,7 +383,7 @@ class TFSequenceSummary(tf.keras.layers.Layer): summary_first_dropout: Add a dropout before the projection and activation summary_last_dropout: Add a dropout after the projection and activation """ - def __init__(self, config, **kwargs): + def __init__(self, config, initializer_range=0.02, **kwargs): super(TFSequenceSummary, self).__init__(**kwargs) self.summary_type = config.summary_type if hasattr(config, 'summary_use_proj') else 'last' @@ -401,7 +399,9 @@ class TFSequenceSummary(tf.keras.layers.Layer): num_classes = config.num_labels else: num_classes = config.hidden_size - self.summary = tf.keras.layers.Dense(num_classes, name='summary') + self.summary = tf.keras.layers.Dense(num_classes, + kernel_initializer=get_initializer(initializer_range), + name='summary') self.activation = None if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh': diff --git a/pytorch_transformers/modeling_tf_xlm.py b/pytorch_transformers/modeling_tf_xlm.py index 1cd2f355be..cc404b6f46 100644 --- a/pytorch_transformers/modeling_tf_xlm.py +++ b/pytorch_transformers/modeling_tf_xlm.py @@ -25,7 +25,7 @@ import numpy as np import tensorflow as tf from .configuration_xlm import XLMConfig -from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list +from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, get_initializer from .file_utils import add_start_docstrings from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model @@ -119,10 +119,10 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): self.n_heads = n_heads assert self.dim % self.n_heads == 0 - self.q_lin = tf.keras.layers.Dense(dim, name='q_lin') - self.k_lin = tf.keras.layers.Dense(dim, name='k_lin') - self.v_lin = tf.keras.layers.Dense(dim, name='v_lin') - self.out_lin = tf.keras.layers.Dense(dim, name='out_lin') + self.q_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='q_lin') + self.k_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='k_lin') + self.v_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='v_lin') + self.out_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='out_lin') self.dropout = tf.keras.layers.Dropout(config.attention_dropout) self.pruned_heads = set() @@ -199,8 +199,8 @@ class TFTransformerFFN(tf.keras.layers.Layer): def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs): super(TFTransformerFFN, self).__init__(**kwargs) - self.lin1 = tf.keras.layers.Dense(dim_hidden, name='lin1') - self.lin2 = tf.keras.layers.Dense(out_dim, name='lin2') + self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name='lin1') + self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name='lin2') self.act = tf.keras.layers.Activation(gelu) if config.gelu_activation else tf.keras.activations.relu self.dropout = tf.keras.layers.Dropout(config.dropout) @@ -249,13 +249,19 @@ class TFXLMMainLayer(tf.keras.layers.Layer): self.dropout = tf.keras.layers.Dropout(config.dropout) self.attention_dropout = tf.keras.layers.Dropout(config.attention_dropout) - self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, self.dim, name='position_embeddings') + self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, + self.dim, + embeddings_initializer=get_initializer(config.embed_init_std), + name='position_embeddings') if config.sinusoidal_embeddings: raise NotImplementedError # create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight) if config.n_langs > 1 and config.use_lang_emb: - self.lang_embeddings = tf.keras.layers.Embedding(self.n_langs, self.dim, name='lang_embeddings') - self.embeddings = TFSharedEmbeddings(self.n_words, self.dim, name='embeddings') # padding_idx=self.pad_index) + self.lang_embeddings = tf.keras.layers.Embedding(self.n_langs, + self.dim, + embeddings_initializer=get_initializer(config.embed_init_std), + name='lang_embeddings') + self.embeddings = TFSharedEmbeddings(self.n_words, self.dim, initializer_range=config.embed_init_std, name='embeddings') # padding_idx=self.pad_index) self.layer_norm_emb = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm_emb') # transformer layers @@ -676,7 +682,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel): self.num_labels = config.num_labels self.transformer = TFXLMMainLayer(config, name='transformer') - self.sequence_summary = TFSequenceSummary(config, name='sequence_summary') + self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name='sequence_summary') def call(self, inputs, **kwargs): transformer_outputs = self.transformer(inputs, **kwargs) @@ -721,7 +727,9 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super(TFXLMForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs) self.transformer = TFXLMMainLayer(config, name='transformer') - self.qa_outputs = tf.keras.layers.Dense(config.num_labels, name='qa_outputs') + self.qa_outputs = tf.keras.layers.Dense(config.num_labels, + kernel_initializer=get_initializer(config.init_std), + name='qa_outputs') def call(self, inputs, **kwargs): transformer_outputs = self.transformer(inputs, **kwargs) diff --git a/pytorch_transformers/modeling_tf_xlnet.py b/pytorch_transformers/modeling_tf_xlnet.py index a6ffb5b681..fa9a045fd8 100644 --- a/pytorch_transformers/modeling_tf_xlnet.py +++ b/pytorch_transformers/modeling_tf_xlnet.py @@ -28,7 +28,7 @@ import numpy as np import tensorflow as tf from .configuration_xlnet import XLNetConfig -from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list +from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, get_initializer from .file_utils import add_start_docstrings from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model @@ -87,7 +87,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): self.dropout = tf.keras.layers.Dropout(config.dropout) def build(self, input_shape): - initializer = tf.random_normal_initializer(mean=0., stddev=self.initializer_range) + initializer = get_initializer(self.initializer_range) self.q = self.add_weight(shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name='q') @@ -104,13 +104,13 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): initializer=initializer, trainable=True, name='r') self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head), - initializer=initializer, + initializer='zeros', trainable=True, name='r_r_bias') self.r_s_bias = self.add_weight(shape=(self.n_head, self.d_head), - initializer=initializer, + initializer='zeros', trainable=True, name='r_s_bias') self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head), - initializer=initializer, + initializer='zeros', trainable=True, name='r_w_bias') self.seg_embed = self.add_weight(shape=(2, self.n_head, self.d_head), initializer=initializer, @@ -294,8 +294,12 @@ class TFXLNetFeedForward(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFXLNetFeedForward, self).__init__(**kwargs) self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm') - self.layer_1 = tf.keras.layers.Dense(config.d_inner, name='layer_1') - self.layer_2 = tf.keras.layers.Dense(config.d_model, name='layer_2') + self.layer_1 = tf.keras.layers.Dense(config.d_inner, + kernel_initializer=get_initializer(config.initializer_range), + name='layer_1') + self.layer_2 = tf.keras.layers.Dense(config.d_model, + kernel_initializer=get_initializer(config.initializer_range), + name='layer_2') self.dropout = tf.keras.layers.Dropout(config.dropout) if isinstance(config.ff_activation, str) or \ (sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)): @@ -375,7 +379,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): self.dropout = tf.keras.layers.Dropout(config.dropout) def build(self, input_shape): - initializer = tf.random_normal_initializer(mean=0., stddev=self.initializer_range) + initializer = get_initializer(self.initializer_range) self.mask_emb = self.add_weight(shape=(1, 1, self.d_model), initializer=initializer, trainable=True, name='mask_emb') @@ -900,8 +904,10 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): self.num_labels = config.num_labels self.transformer = TFXLNetMainLayer(config, name='transformer') - self.sequence_summary = TFSequenceSummary(config, name='sequence_summary') - self.logits_proj = tf.keras.layers.Dense(config.num_labels, name='logits_proj') + self.sequence_summary = TFSequenceSummary(config, initializer_range=config.initializer_range, name='sequence_summary') + self.logits_proj = tf.keras.layers.Dense(config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name='logits_proj') def call(self, inputs, **kwargs): transformer_outputs = self.transformer(inputs, **kwargs) @@ -949,7 +955,9 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super(TFXLNetForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs) self.transformer = TFXLNetMainLayer(config, name='transformer') - self.qa_outputs = tf.keras.layers.Dense(config.num_labels, name='qa_outputs') + self.qa_outputs = tf.keras.layers.Dense(config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name='qa_outputs') def call(self, inputs, **kwargs): transformer_outputs = self.transformer(inputs, **kwargs)