From b97af8cce90aa2d147ca3fd9543ca372d3cb2ae3 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 13 Sep 2019 16:43:49 +0200 Subject: [PATCH] skip finetuned checkpoints --- pytorch_transformers/__init__.py | 5 + .../convert_pytorch_checkpoint_to_tf2.py | 14 ++- .../modeling_tf_transfo_xl_utilities.py | 106 +----------------- 3 files changed, 17 insertions(+), 108 deletions(-) diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py index 2f7bdb3de4..40c4bab7e1 100644 --- a/pytorch_transformers/__init__.py +++ b/pytorch_transformers/__init__.py @@ -113,6 +113,11 @@ if _tf_available: load_gpt2_pt_weights_in_tf2, TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_tf_transfo_xl import (TFTransfoXLPreTrainedModel, TFTransfoXLMainLayer, + TFTransfoXLModel, TFTransfoXLLMHeadModel, + load_transfo_xl_pt_weights_in_tf2, + TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer, TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, diff --git a/pytorch_transformers/convert_pytorch_checkpoint_to_tf2.py b/pytorch_transformers/convert_pytorch_checkpoint_to_tf2.py index d586fecbc5..ee26ac0a89 100644 --- a/pytorch_transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/pytorch_transformers/convert_pytorch_checkpoint_to_tf2.py @@ -27,7 +27,8 @@ from pytorch_transformers import is_torch_available, cached_path from pytorch_transformers import (BertConfig, TFBertForPreTraining, load_bert_pt_weights_in_tf2, GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2, XLNetConfig, TFXLNetLMHeadModel, load_xlnet_pt_weights_in_tf2, - XLMConfig, TFXLMWithLMHeadModel, load_xlm_pt_weights_in_tf2,) + XLMConfig, TFXLMWithLMHeadModel, load_xlm_pt_weights_in_tf2, + TransfoXLConfig, TFTransfoXLLMHeadModel, load_transfo_xl_pt_weights_in_tf2,) if is_torch_available(): import torch @@ -35,12 +36,15 @@ if is_torch_available(): from pytorch_transformers import (BertForPreTraining, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, - XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,) + XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, + TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,) else: (BertForPreTraining, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, - XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,) = ( + XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, + TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,) = ( + None, None, None, None, None, None, None, None, None, None, None, None, @@ -55,6 +59,7 @@ MODEL_CLASSES = { 'gpt2': (GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP), 'xlnet': (XLNetConfig, TFXLNetLMHeadModel, load_xlnet_pt_weights_in_tf2, XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP), 'xlm': (XLMConfig, TFXLMWithLMHeadModel, load_xlm_pt_weights_in_tf2, XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP), + 'transfo-xl': (TransfoXLConfig, TFTransfoXLLMHeadModel, load_transfo_xl_pt_weights_in_tf2, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP), } def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False): @@ -118,6 +123,9 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, compare_with print("-" * 100) print(" Converting checkpoint {}/{}: {}".format(i, len(aws_config_map), shortcut_name)) print("-" * 100) + if 'finetuned' in shortcut_name: + print(" Skipping fintenued checkpoint ") + continue config_file = cached_path(aws_config_map[shortcut_name], force_download=True) model_file = cached_path(aws_model_maps[shortcut_name], force_download=True) diff --git a/pytorch_transformers/modeling_tf_transfo_xl_utilities.py b/pytorch_transformers/modeling_tf_transfo_xl_utilities.py index d313ba177e..d7666a650e 100644 --- a/pytorch_transformers/modeling_tf_transfo_xl_utilities.py +++ b/pytorch_transformers/modeling_tf_transfo_xl_utilities.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Utilities for PyTorch Transformer XL model. - Directly adapted from https://github.com/kimiyoung/transformer-xl. +""" A TF 2.0 Adaptive Softmax for Transformer XL model. """ from collections import defaultdict @@ -174,106 +173,3 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): self.add_metric(loss, name=self.name, aggregation='mean' if return_mean else '') return out - - -def mul_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs, - params, tie_projs, - initializer=None, proj_initializer=None, - div_val=1, perms=None, proj_same_dim=True, - scope='adaptive_softmax', - **kwargs): - def _logit(x, W, b, proj): - y = x - if x.shape.ndims == 3: - if proj is not None: - y = tf.einsum('ibd,ed->ibe', y, proj) - return tf.einsum('ibd,nd->ibn', y, W) + b - else: - if proj is not None: - y = tf.einsum('id,ed->ie', y, proj) - return tf.einsum('id,nd->in', y, W) + b - - params_W, params_projs = params[0], params[1] - - with tf.variable_scope(scope): - if len(cutoffs) == 0: - softmax_b = tf.get_variable('bias', [n_token], - initializer=tf.zeros_initializer()) - output = _logit(hidden, params_W, softmax_b, params_projs) - nll = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, - logits=output) - nll = tf.reduce_mean(nll) - else: - total_loss, total_cnt = 0, 0 - cutoff_ends = [0] + cutoffs + [n_token] - for i in range(len(cutoff_ends) - 1): - with tf.variable_scope('cutoff_{}'.format(i)): - l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1] - - cur_d_embed = d_embed // (div_val ** i) - - if div_val == 1: - cur_W = params_W[l_idx: r_idx] - else: - cur_W = params_W[i] - cur_b = tf.get_variable('b', [r_idx - l_idx], - initializer=tf.zeros_initializer()) - if tie_projs[i]: - if div_val == 1: - cur_proj = params_projs - else: - cur_proj = params_projs[i] - else: - if (div_val == 1 or not proj_same_dim) and d_proj == cur_d_embed: - cur_proj = None - else: - cur_proj = tf.get_variable('proj', [cur_d_embed, d_proj], - initializer=proj_initializer) - - if i == 0: - cluster_W = tf.get_variable('cluster_W', [len(cutoffs), d_embed], - initializer=tf.zeros_initializer()) - cluster_b = tf.get_variable('cluster_b', [len(cutoffs)], - initializer=tf.zeros_initializer()) - cur_W = tf.concat([cur_W, cluster_W], 0) - cur_b = tf.concat([cur_b, cluster_b], 0) - - head_logit = _logit(hidden, cur_W, cur_b, cur_proj) - - head_target = kwargs.get("head_target") - head_nll = tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=head_target, - logits=head_logit) - - masked_loss = head_nll * perms[i] - total_loss += tf.reduce_sum(masked_loss) - total_cnt += tf.reduce_sum(perms[i]) - - # head_logprob = tf.nn.log_softmax(head_logit) - - # final_logprob = head_logprob * perms[i][:, :, None] - # final_target = tf.one_hot(target, tf.shape(head_logprob)[2]) - # total_loss -= tf.einsum('ibn,ibn->', final_logprob, final_target) - # total_cnt += tf.reduce_sum(perms[i]) - else: - cur_head_nll = tf.einsum('ib,ibk->k', head_nll, perms[i]) - - cur_hidden = tf.einsum('ibd,ibk->kd', hidden, perms[i]) - tail_logit = _logit(cur_hidden, cur_W, cur_b, cur_proj) - - tail_target = tf.einsum('ib,ibk->k', tf.to_float(target - l_idx), - perms[i]) - tail_nll = tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=tf.to_int32(tail_target), - logits=tail_logit) - - sum_nll = cur_head_nll + tail_nll - mask = tf.reduce_sum(perms[i], [0, 1]) - - masked_loss = sum_nll * mask - total_loss += tf.reduce_sum(masked_loss) - total_cnt += tf.reduce_sum(mask) - - nll = total_loss / total_cnt - - return nll \ No newline at end of file